├── .gitignore ├── CyclingERD.sql ├── README.md ├── airflow ├── .env.example ├── Dockerfile ├── README.md ├── dags │ ├── init_0_ingestion_to_s3_dag.py │ ├── init_1_spark_emr_dag.py │ ├── init_2_s3_to_redshifht_dag.py │ ├── init_3_web_scraping_dag.py │ ├── proc_0_ingestion_to_s3_dag.py │ ├── proc_1_spark_emr_dag.py │ ├── proc_2_s3_to_redshifht_dag.py │ └── scripts │ │ ├── init-data-transformation.py │ │ └── journey-data-transformation.py ├── docker-compose.yaml ├── logs │ └── scheduler │ │ └── latest └── requirements.txt ├── images ├── CyclingERD.png ├── batch-on-aws.png ├── dags │ ├── init_0.png │ ├── init_1.png │ ├── init_2.png │ ├── init_3.png │ ├── inits.png │ ├── proc_0.png │ ├── proc_1.png │ └── proc_2.png ├── final-dashboard.png └── redshift-metabase.png ├── metabase └── README.md ├── notebook ├── data-exploration │ ├── Exploration.ipynb │ └── Scraping.ipynb └── data-transformation │ ├── experiment.ipynb │ ├── init-data-transformation.ipynb │ └── journey-data-transformation.ipynb ├── services.md └── terraform ├── main.tf ├── services.md └── variables.tf /.gitignore: -------------------------------------------------------------------------------- 1 | log/* 2 | 3 | *.log 4 | 5 | .terraform* 6 | 7 | terraform.tfstate* 8 | 9 | __pycache__ 10 | 11 | airflow/.env 12 | 13 | 14 | .ipynb_checkpoints -------------------------------------------------------------------------------- /CyclingERD.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS "fact_journey"; 2 | DROP TABLE IF EXISTS "dim_station"; 3 | DROP TABLE IF EXISTS "dim_weather"; 4 | DROP TABLE IF EXISTS "dim_datetime"; 5 | 6 | CREATE TABLE "fact_journey" ( 7 | "rental_id" int PRIMARY KEY, 8 | "bike_id" int, 9 | "end_date" timestamp, 10 | "end_station" int, 11 | "start_date" timestamp, 12 | "start_station" int, 13 | "weather_date" date 14 | ); 15 | 16 | CREATE TABLE "dim_station" ( 17 | "station_id" int PRIMARY KEY, 18 | "station_name" varchar, 19 | "longitude" double precision, 20 | "latitude" double precision, 21 | "easting" double precision, 22 | "northing" double precision 23 | ); 24 | 25 | CREATE TABLE "dim_weather" ( 26 | "weather_date" date PRIMARY KEY, 27 | "feelslike" double precision, 28 | "feelslikemax" double precision, 29 | "feelslikemin" double precision, 30 | "humidity" double precision, 31 | "moonphase" double precision, 32 | "precip" double precision, 33 | "pressure" double precision, 34 | "solarenergy" double precision, 35 | "solarradiation" double precision, 36 | "sunrise" varchar, 37 | "sunset" varchar, 38 | "temp" double precision, 39 | "tempmax" double precision, 40 | "tempmin" double precision, 41 | "tzoffset" double precision, 42 | "uvindex" double precision, 43 | "visibility" double precision, 44 | "winddir" double precision, 45 | "windgust" double precision, 46 | "windspeed" double precision 47 | ); 48 | 49 | CREATE TABLE "dim_datetime" ( 50 | "datetime_id" timestamp PRIMARY KEY, 51 | "second" int, 52 | "minute" int, 53 | "hour" int, 54 | "day" int, 55 | "month" int, 56 | "week_day" int, 57 | "year" int 58 | ); 59 | 60 | ALTER TABLE "fact_journey" ADD FOREIGN KEY ("start_date") REFERENCES "dim_datetime" ("datetime_id"); 61 | 62 | ALTER TABLE "fact_journey" ADD FOREIGN KEY ("end_date") REFERENCES "dim_datetime" ("datetime_id"); 63 | 64 | ALTER TABLE "fact_journey" ADD FOREIGN KEY ("start_station") REFERENCES "dim_station" ("station_id"); 65 | 66 | ALTER TABLE "fact_journey" ADD FOREIGN KEY ("end_station") REFERENCES "dim_station" ("station_id"); 67 | 68 
| ALTER TABLE "fact_journey" ADD FOREIGN KEY ("weather_date") REFERENCES "dim_weather" ("weather_date");
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Batch processing on AWS
2 | This project shows one way to perform batch processing using mainly AWS services and a few open-source tools.
3 | 
4 | ## Table of contents
5 | - [Overview](#overview)
6 | - [The Goal](#the-goal)
7 | - [The dataset](#the-dataset)
8 | - [Data modeling](#data-modeling)
9 | - [Tools](#tools)
10 | - [Scalability](#scalability)
11 | - [Running the project](#running-the-project)
12 |   * [1. Requirements](#1-requirements)
13 |   * [2. Clone the repository](#2-clone-the-repository)
14 |   * [3. Run Terraform](#3-run-terraform)
15 |   * [4. Create the Data Warehouse](#4-create-the-data-warehouse)
16 |   * [5. Run Airflow](#5-run-airflow)
17 |   * [6. Run the Airflow DAGs](#6-run-the-airflow-dags)
18 |   * [7. Visualise data on Metabase](#7-visualise-data-on-metabase)
19 | - [Project limitations](#project-limitations)
20 |   * [Manual DAGs triggering](#manual-dags-triggering)
21 | 
22 | 
23 | ## Overview
24 | 
25 | The current work aims to answer business questions concerning bicycle rentals in the city of London from January 2021 to January 2022. To do so, we are going to build a data pipeline which collects data from multiple sources, applies transformations and displays the preprocessed data on a dashboard.
26 | 
27 | The following diagram illustrates the high-level structure of the pipeline, where data flows from different sources to the final visualisation tool.
28 | 
29 | ![The ELT](/images/batch-on-aws.png "High-level structure of the batch pipeline on AWS")
30 | 
31 | 
32 | ## The Goal
33 | The end goal of this project is to preprocess the data on the AWS platform and get useful insights from it. We can learn more from the data by answering the following business questions on the final dashboard.
34 | 
35 | - Which hour of the day has the most active rentals on average?
36 | 
37 | - Which area has the most active bike rentals in London?
38 | 
39 | - Which day of the week is the most active in general?
40 | 
41 | - What is the overall trend for daily rentals over the year?
42 | 
43 | ## The dataset
44 | We are going to process 3 datasets throughout this project.
45 | 
46 | 1. The __Cycling journey__ dataset covers January 2021 to January 2022. It is spread across multiple files on the [Transport for London (TFL)](https://cycling.data.tfl.gov.uk/) website. We will scrape the web page to extract all the relevant links, then download each file. This dataset contains the main features of every cycling journey, including the start/end location of each journey, the timestamps for both departure and arrival, etc.
47 | 
48 | 2. The __Stations__ dataset encompasses the details of every station involved in a journey. This dataset is quite outdated, as it does not include stations added after 2016. To solve this issue, we will add to this old dataset all the new stations we encounter in the journey data. The stations file was found on the [What Do They Know](https://www.whatdotheyknow.com) forum and can be downloaded directly from [here](https://www.whatdotheyknow.com/request/664717/response/1572474/attach/3/Cycle%20hire%20docking%20stations.csv.txt).
49 | 
50 | 3. The __Weather__ dataset includes daily weather data for the city of London from January 2021 to January 2022.
It was originally retrieved from the [Visual Crossing](https://www.visualcrossing.com/) website and is available to download from [this link](https://drive.google.com/file/d/13LWAH93xxEvOukCnPhrfXH7rZZq_-mss/view?usp=sharing).
51 | 
52 | In total, the cycling journey dataset contains 10,925,928 entries, the stations dataset 808 and the weather dataset 396.
53 | 
54 | ## Data modeling
55 | We are going to build a **Star Schema**, comprising one fact table and multiple dimension tables, for our Data Warehouse.
56 | 
57 | The Entity Relationship Diagram (ERD) for the final Data Warehouse is represented in the following image:
58 | ![The ERD](/images/CyclingERD.png "ERD edited from dbdiagram.io")
59 | 
60 | In the transformation phase, several columns will be removed from both the weather and journey data. We will also add a dimension table, `dim_datetime`, which will contain the reference for all datetime-related columns.
61 | 
62 | The resulting schema will facilitate exploration of the whole dataset in order to answer the relevant business questions about it.
63 | 
64 | ## Tools
65 | 1. **Terraform**: an open-source tool which provides `Infrastructure as Code (IaC)`. It allows us to build and maintain our AWS infrastructure, including `Redshift`, `S3` and an `EC2 instance`. We will not include our `EMR clusters` in Terraform, as they will be created and terminated from `Airflow` when we need them.
66 | 
67 | 2. **Apache Airflow**: an open-source tool to programmatically author, schedule and monitor workflows. The majority of data tasks in the project will be orchestrated and monitored on Airflow.
68 | 
69 | 3. **Selenium** and **BeautifulSoup**: packages which help us perform web scraping. BeautifulSoup cannot scrape a webpage that loads data lazily; this is where Selenium comes into the picture, as it can wait for specific content to load on the page before further processing.
70 | 
71 | 4. **AWS Simple Storage Service (S3)**: provides large-scale storage for us to create a __Data Lake__. We will store all the raw data in this location. The preprocessed data will also be stored in S3 before being loaded into Redshift.
72 | 
73 | 5. **Apache Spark**: open-source software that can efficiently process Big Data on a distributed or parallel system. We will use PySpark (Spark with Python) to transform the raw data and prepare it for the Data Warehouse on Redshift.
74 | 
75 | 6. **AWS Elastic MapReduce (EMR)**: a managed cluster platform for running big data tools such as Spark and Hadoop. We will employ AWS EMR to run our Spark jobs during the transformation phase.
76 | 
77 | 7. **AWS Redshift**: a fully managed and highly scalable data warehouse solution offered by Amazon. We will build our Data Warehouse on Redshift and make the data available to visualisation tools from there.
78 | 
79 | 8. **Metabase**: another open-source tool that allows easy visualisation and analytics of structured data. We will build a dashboard with Metabase to better visualise our data stored in Redshift.
80 | 
81 | 9. **Docker**: a platform which containerises software, allowing it to behave the same way across multiple environments. In this project, we will run Airflow and Metabase on Docker.
82 | 
83 | ## Scalability
84 | It is always good practice to consider scalability scenarios when building a data pipeline, as a significant increase in data volume is to be expected in the future.
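In this project, the main scaling knob is the EMR job flow that the Spark DAGs spin up on demand. As a rough illustration of the horizontal and vertical options described just below, here is a hedged sketch of a scaled-up job-flow configuration in the same style as the `JOB_FLOW_OVERRIDES` dictionaries used by the DAGs (the instance types and counts are illustrative assumptions, not the project's defaults):

```python
# A hedged sketch (not the project's actual config): scaling the EMR job flow
# used by the Spark DAGs. Instance types and counts here are illustrative only.
SCALED_JOB_FLOW_OVERRIDES = {
    'Name': 'JourneyDataTransformer-scaled',
    'ReleaseLabel': 'emr-5.34.0',
    'Applications': [{'Name': 'Spark'}, {'Name': 'Hadoop'}],
    'Instances': {
        'InstanceGroups': [
            {
                'Name': 'Primary node',
                'Market': 'SPOT',
                'InstanceRole': 'MASTER',
                'InstanceType': 'm5.2xlarge',  # vertical: a larger master node
                'InstanceCount': 1,
            },
            {
                'Name': 'Core nodes',
                'Market': 'SPOT',
                'InstanceRole': 'CORE',
                'InstanceType': 'm5.2xlarge',  # vertical: more CPU/RAM per node
                'InstanceCount': 10,           # horizontal: more nodes than the default 2
            },
        ],
        'KeepJobFlowAliveWhenNoSteps': False,
        'TerminationProtected': False,
    },
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}
```

Because `KeepJobFlowAliveWhenNoSteps` stays `False`, the bigger cluster still only exists for the duration of the Spark step, so scaling up mainly affects the per-run cost rather than leaving a permanently larger footprint.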
85 | 
86 | For instance, if the volume of the data increases 500x or even as much as 1000x, that should not break our pipeline.
87 | 
88 | We would need to scale our EMR cluster nodes either horizontally, or both vertically and horizontally:
89 | 
90 | - Horizontal scaling refers to adding more cluster nodes to process the high-volume data.
91 | 
92 | - Vertical and horizontal scaling means that we increase the performance of the existing nodes and then also add new nodes to the cluster.
93 | 
94 | 
95 | ## Running the project
96 | ### 1. Requirements
97 | In order to run the project smoothly, a few requirements should be met:
98 | - An AWS account with sufficient permissions to access and work on S3, Redshift, and EMR.
99 | To do so:
100 |   * Go to [IAM](https://console.aws.amazon.com/iam/home) in the AWS console.
101 |   * Create a new user.
102 |   * Add permissions to that new user: `AmazonS3FullAccess`, `AmazonRedshiftFullAccess`, `AdministratorAccess`, `AmazonEMRFullAccessPolicy_v2`, `AmazonEMRServicePolicy_v2`, `AmazonEC2FullAccess`.
103 |   * In the "Security credentials" tab, create an access key and download the `.csv` file.
104 | 
105 | - It is also necessary to have the AWS account preconfigured (i.e. having `~/.aws/credentials` and `~/.aws/config` available in your local environment). [This AWS Doc](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/setup-credentials.html) shows the essential steps to set up a local environment with AWS.
106 | 
107 | 
108 | - Docker and Docker Compose, preinstalled in your local environment. Otherwise, they can be installed from [Get Docker](https://docs.docker.com/get-docker/).
109 | 
110 | - Terraform, preinstalled in your local environment. If not, please install it by following the instructions given on the [official download page](https://www.terraform.io/downloads).
111 | 
112 | 
113 | ### 2. Clone the repository
114 | ```bash
115 | git clone https://github.com/HoracioSoldman/batch-processing-on-aws.git
116 | ```
117 | 
118 | ### 3. Run Terraform
119 | We are going to use Terraform to build our AWS infrastructure.
120 | 
121 | From the project root folder, move to the `./terraform` directory:
122 | ```bash
123 | cd terraform
124 | ```
125 | Then run the Terraform commands one by one.
126 | 
127 | - Initialization
128 | ```bash
129 | terraform init
130 | ```
131 | 
132 | - Planning
133 | ```bash
134 | terraform plan
135 | ```
136 | - Applying
137 | ```bash
138 | terraform apply
139 | ```
140 | 
141 | ### 4. Create the Data Warehouse
142 | 
143 | - Go to the [AWS Redshift](https://console.aws.amazon.com/redshiftv2/home) cluster which was freshly created by Terraform.
144 | 
145 | - Connect to your database, then go to `Query Data`.
146 | 
147 | - Manually copy the content of [CyclingERD.sql](/CyclingERD.sql) into the query field and `RUN` the command. This will create the tables and attach constraints to them.
148 | 
149 | 
150 | ### 5. Run Airflow
151 | 
152 | - From the project root folder, move to the `./airflow` directory:
153 | ```bash
154 | cd airflow
155 | ```
156 | - Create environment variables in the `.env` file for our future Docker containers:
157 | ```bash
158 | cp .env.example .env
159 | ```
160 | 
161 | - Fill in the content of the `.env` file.
162 | The value for `AIRFLOW_UID` is obtained from the following command:
163 | ```bash
164 | echo -e "AIRFLOW_UID=$(id -u)"
165 | ```
166 | Then the value for `AIRFLOW_GID` can be left as `0`.
167 | 
168 | - Build our extended Airflow Docker image:
169 | ```bash
170 | docker build -t airflow-img .
171 | ```
172 | If you would prefer another tag, replace `airflow-img` with whatever you like. Then make sure that you also change the image tag in [docker-compose.yaml](/airflow/docker-compose.yaml) at line `48`, i.e. `image: <your-tag>:latest`.
173 | 
174 | This process might take up to 15 minutes or even more, depending on your internet speed. At this stage, Docker also installs several packages defined in [requirements.txt](/airflow/requirements.txt).
175 | 
176 | - Run docker-compose to launch Airflow
177 | 
178 | Initialise Airflow:
179 | ```bash
180 | docker-compose up airflow-init
181 | ```
182 | 
183 | Launch Airflow:
184 | ```bash
185 | docker-compose up
186 | ```
187 | This last command launches the internal `Airflow Postgres` database, the `Airflow Scheduler` and the `Airflow Webserver`, which would otherwise have to be launched separately if we did not use Docker.
188 | 
189 | ### 6. Run the Airflow DAGs
190 | 
191 | Once Airflow is up and running, we can proceed to the most exciting part of the project.
192 | 
193 | The initialisation DAGs (`init_?_*_dag`) are interdependent. In essence, each DAG waits for the successful run of its predecessor before starting its own tasks.
194 | For instance, `init_1_spark_emr_dag` will not start until `init_0_ingestion_to_s3_dag` has completed successfully.
195 | In order to trigger these DAGs, please enable all 4 of them _SIMULTANEOUSLY_.
196 | 
197 | The processor DAGs (`proc_?_*_dag`), on the other hand, need to be started individually.
198 | __It is necessary to wait for the 4 initialisation DAGs to complete before starting the processor ones__.
199 | To run these last 3 DAGs, please enable `proc_0_ingestion_to_s3_dag` and wait for it to finish its tasks before enabling the next DAG: `proc_1_spark_emr_dag`.
200 | Likewise, it is necessary to wait until `proc_1_spark_emr_dag` has finished before enabling the last DAG: `proc_2_s3_to_redshift_dag`.
201 | 
202 | 
203 | The following screenshot shows a successful run of the first DAG.
204 | 
205 | ![Ingestion DAG (init_0_ingestion_to_s3_dag)](/images/dags/init_0.png "Ingestion DAG in the Graph view")
206 | 
207 | 
208 | After all the DAG runs have completed, we can move on to Metabase to visualise the data.
209 | 
210 | ### 7. Visualise data on Metabase
211 | 
212 | Again, we will install and run Metabase in a Docker container:
213 | ```bash
214 | docker run -d -p 3033:3000 --name metabase metabase/metabase
215 | ```
216 | 
217 | On its very first execution, the above command downloads the latest Docker image available for Metabase before exposing the application on port `3033`.
218 | 
219 | Once the command finishes, Metabase should be available at [http://localhost:3033](http://localhost:3033).
220 | 
221 | We can now connect our Redshift database to this platform and visualise the data in multiple charts.
222 | 
223 | The following screenshot displays part of our final dashboard, which clearly shows some useful insights about bicycle rides across different dimensions.
224 | 
225 | ![Final Dashboard](/images/final-dashboard.png "The final dashboard on Metabase")
226 | 
--------------------------------------------------------------------------------
/airflow/.env.example:
--------------------------------------------------------------------------------
1 | # Custom
2 | AIRFLOW_CONN_AWS_DEFAULT="aws://:@"
3 | AIRFLOW_UID=
4 | AIRFLOW_GID=
5 | AWS_DEFAULT_REGION=""
6 | AWS_PROFILE=
7 | S3_BUCKET=
8 | 
9 | AIRFLOW_CONN_REDSHIFT_DEFAULT=
10 | AIRFLOW_CONN_EMR_DEFAULT="aws://:@"
11 | 
12 | # Postgres
13 | POSTGRES_USER=airflow
14 | POSTGRES_PASSWORD=airflow
15 | POSTGRES_DB=airflow
16 | 
17 | # Airflow
18 | AIRFLOW__CORE__EXECUTOR=LocalExecutor
19 | AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC=10
20 | 
21 | AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB}
22 | AIRFLOW_CONN_METADATA_DB=postgres+psycopg2://airflow:airflow@postgres:5432/airflow
23 | AIRFLOW_VAR__METADATA_DB_SCHEMA=airflow
24 | 
25 | _AIRFLOW_WWW_USER_CREATE=True
26 | _AIRFLOW_WWW_USER_USERNAME=airflow
27 | _AIRFLOW_WWW_USER_PASSWORD=airflow
28 | 
--------------------------------------------------------------------------------
/airflow/Dockerfile:
--------------------------------------------------------------------------------
1 | # First-time build can take up to 10 mins.
2 | 
3 | FROM apache/airflow:2.2.3
4 | 
5 | ENV AIRFLOW_HOME=/opt/airflow
6 | 
7 | USER root
8 | 
9 | RUN apt-get update -qq \
10 |     && apt-get install firefox-esr -y -qq \
11 |     && apt-get install wget -y -qq
12 | 
13 | COPY requirements.txt .
14 | 
15 | RUN pip install -q -r requirements.txt
16 | 
17 | # workaround to fix selenium module not found
18 | RUN curl -sSLf "https://files.pythonhosted.org/packages/58/76/705b5c776f783d1ba7c630347463d4ae323282bbd859a8e9420c7ff79581/selenium-4.1.0-py3-none-any.whl" > ~/selenium-4.1.0-py3-none-any.whl \
19 |     && chmod +x ~/selenium-4.1.0-py3-none-any.whl \
20 |     && sudo pip install -q ~/selenium-4.1.0-py3-none-any.whl webdriver_manager
21 | 
22 | WORKDIR $AIRFLOW_HOME
23 | 
24 | USER $AIRFLOW_UID
--------------------------------------------------------------------------------
/airflow/README.md:
--------------------------------------------------------------------------------
1 | ## Running DAGs on Airflow
2 | In this project, we are running Airflow in a Docker container.
3 | 
4 | ### Requirements
5 | In order to run Airflow and the pipeline in this project, you need to have:
6 | 
7 | * [Docker](https://www.docker.com) and [docker compose](https://docs.docker.com/compose/install) installed. This can be checked by running
8 | ```bash
9 | docker -v
10 | ```
11 | * An [AWS Account](https://aws.amazon.com/account/) which has access to an [S3 bucket](https://aws.amazon.com/s3/)
12 | * An **AWS_ACCESS_KEY_ID** and an **AWS_SECRET_ACCESS_KEY** associated with the [AWS Account](https://aws.amazon.com/account/).
13 | 
14 | ### Set ENVIRONMENT VARIABLES
15 | Before building the Airflow Docker image, it is necessary to set ENVIRONMENT VARIABLES in a `.env` file.
16 | 
17 | To do so, rename the `.env.example` file located in this directory to `.env`, then add the correct values for your own environment.
18 | 
19 | The `AIRFLOW_UID` value can be obtained and appended to the file with the following command:
20 | ```bash
21 | echo -e "AIRFLOW_UID=$(id -u)" >> .env
22 | ```
23 | Then `AIRFLOW_GID` can be set to `0`.
24 | 
25 | ### Dockerfile and docker-compose.yaml
26 | In the Dockerfile, we download several packages such as:
27 | 1.
[firefox esr](https://www.mozilla.org/en-US/firefox/enterprise/) for web scraping
28 | 
29 | 2. [selenium](https://pypi.org/project/selenium/) for web scraping
30 | 
31 | 3. [webdriver_manager](https://pypi.org/project/webdriver-manager/) for web scraping
32 | 
33 | 4. [apache-airflow-providers-amazon](https://airflow.apache.org/docs/apache-airflow-providers-amazon/stable/index.html) to communicate and work with AWS
34 | 
35 | 5. [pyarrow](https://pypi.org/project/pyarrow/) to convert `.csv` to `.parquet` files
36 | 
37 | 6. [bs4](https://pypi.org/project/beautifulsoup4/) for web scraping
38 | 
39 | Also, the `docker-compose.yaml` is a modified version of the original [docker-compose.yaml](https://airflow.apache.org/docs/apache-airflow/stable/docker-compose.yaml).
40 | 
41 | 
42 | ### Run Airflow
43 | 
44 | 1. Build the Docker image with the current Dockerfile:
45 | ```bash
46 | docker build -t airflow-img .
47 | ```
48 | This command should be run only once, or again after editing the content of the Dockerfile.
49 | 
50 | 2. Initialize Airflow:
51 | ```bash
52 | docker-compose up airflow-init
53 | ```
54 | This command should terminate with `exit code 0` if everything went well.
55 | 
56 | 3. Launch Airflow:
57 | ```bash
58 | docker-compose up
59 | ```
60 | 
61 | 4. Visit [http://localhost:8080](http://localhost:8080) to access the Airflow GUI.
62 | 
63 | 5. In order to stop the Airflow containers, run:
64 | ```bash
65 | docker-compose down
66 | ```
67 | 
68 | ### Note:
69 | It is highly recommended to manually trigger the `web_scraping_dag` prior to enabling the `s3_ingestion_dag`.
70 | 
71 | In order to do this, open the `web_scraping_dag` on Airflow and click on the **Play** button on the right, then select **Trigger DAG**. All the scraping tasks must be completed before starting the ingestion with the other DAG.
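If you prefer a script over the UI, the same manual trigger can be sent through Airflow's stable REST API, which is enabled by the basic-auth backend configured in `docker-compose.yaml`. A minimal sketch, assuming the webserver on `localhost:8080`, the default `airflow`/`airflow` credentials from `.env.example`, and the `init_3__web_scraping_dag` dag_id declared in `dags/init_3_web_scraping_dag.py`:

```python
# Minimal sketch: trigger the scraping DAG via the Airflow 2.x stable REST API.
# Assumes the webserver from docker-compose on localhost:8080 and the default
# airflow/airflow credentials from .env.example.
import requests

AIRFLOW_API = "http://localhost:8080/api/v1"
DAG_ID = "init_3__web_scraping_dag"  # dag_id as declared in the DAG file

response = requests.post(
    f"{AIRFLOW_API}/dags/{DAG_ID}/dagRuns",
    auth=("airflow", "airflow"),
    json={"conf": {}},  # empty conf -> a plain manual run
)
response.raise_for_status()
print("Triggered run:", response.json()["dag_run_id"])
```

Note that the DAG still has to be unpaused first (via the toggle in the UI, or a `PATCH /dags/{dag_id}` request with `{"is_paused": false}`) before the triggered run will actually be scheduled.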
72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /airflow/dags/init_0_ingestion_to_s3_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import logging 4 | import json 5 | import pandas as pd 6 | 7 | from airflow import DAG 8 | from airflow.utils.dates import days_ago 9 | from airflow.operators.dummy import DummyOperator 10 | from airflow.operators.bash import BashOperator 11 | from airflow.operators.python import PythonOperator 12 | from airflow.utils.task_group import TaskGroup 13 | from airflow.providers.amazon.aws.transfers.local_to_s3 import LocalFilesystemToS3Operator 14 | 15 | 16 | path_to_local_home = os.environ.get("AIRFLOW_HOME", "/opt/airflow") 17 | S3_DESTINATION = "raw/cycling-extras" 18 | S3_BUCKET = os.environ.get("S3_BUCKET", "s3_no_bucket") 19 | S3_SCRIPT_DESTINATION = "utils/scripts" 20 | download_links= [ 21 | { 22 | 'name': 'stations', 23 | 'link': 'https://www.whatdotheyknow.com/request/664717/response/1572474/attach/3/Cycle%20hire%20docking%20stations.csv.txt', 24 | 'output': 'stations.csv' 25 | }, 26 | { 27 | 'name': 'weather', 28 | 'link': '--no-check-certificate "https://docs.google.com/uc?export=download&id=13LWAH93xxEvOukCnPhrfXH7rZZq_-mss"', 29 | 'output': 'weather.json' 30 | }, 31 | { 32 | 'name': 'journey', 33 | 'link': 'https://cycling.data.tfl.gov.uk/usage-stats/246JourneyDataExtract30Dec2020-05Jan2021.csv', 34 | 'output': 'journey.csv' 35 | } 36 | ] 37 | local_scripts = [ 'init-data-transformation.py', 'journey-data-transformation.py' ] 38 | 39 | 40 | # extract days value from the weather data 41 | def preprocess_data(filepath): 42 | 43 | filename= filepath.split('/')[-1] 44 | 45 | if filename != 'weather.json': 46 | print(f'No preprocessing needed for {filename}') 47 | return 48 | 49 | with open(filepath, 'r') as f: 50 | weather = json.load(f) 51 | 52 | daily_weather= weather['days'] 53 | 54 | with open(filepath, 'w') as f: 55 | json.dump(daily_weather, f) 56 | 57 | 58 | default_args = { 59 | "owner": "airflow", 60 | "start_date": days_ago(1), 61 | "depends_on_past": False, 62 | "retries": 1, 63 | } 64 | 65 | # NOTE: DAG declaration - using a Context Manager (an implicit way) 66 | with DAG( 67 | dag_id="init_0_ingestion_to_s3_dag", 68 | description=""" 69 | This dag ingests extra files for the cycling journey including: the docking stations, 70 | the weather data and an example file for cycling journey. 
71 | """, 72 | schedule_interval="@once", 73 | default_args=default_args, 74 | catchup=False, 75 | max_active_runs=3, 76 | tags=['weather', 'stations', 'docking stations', 'london', '2021', 'journey'], 77 | ) as dag: 78 | 79 | start = DummyOperator(task_id="start") 80 | 81 | 82 | with TaskGroup(f"Download_files", tooltip="Download - Preprocess") as download_section: 83 | 84 | for index, item in enumerate(download_links): 85 | download_task = BashOperator( 86 | task_id=f"download_{item['name']}_task", 87 | bash_command=f"wget {item['link']} -O {path_to_local_home}/{item['output']}" 88 | ) 89 | 90 | if item['output'] == 'weather.json': 91 | preprocessing_task = PythonOperator( 92 | task_id=f"extract_daily_weather_data", 93 | python_callable=preprocess_data, 94 | provide_context=True, 95 | op_kwargs={ 96 | "filepath": f"{path_to_local_home}/{item['output']}" 97 | } 98 | ) 99 | 100 | download_task >> preprocessing_task 101 | 102 | 103 | 104 | with TaskGroup("upload_files_to_s3") as upload_section: 105 | 106 | for index, item in enumerate(download_links): 107 | 108 | upload_to_s3_task = LocalFilesystemToS3Operator( 109 | task_id=f"upload_{item['name']}_to_s3_task", 110 | filename=item['output'], 111 | dest_key=f"{S3_DESTINATION}/{item['output']}", 112 | dest_bucket=S3_BUCKET, 113 | ) 114 | 115 | 116 | cleanup = BashOperator( 117 | task_id="cleanup_local_storage", 118 | bash_command=f"rm {path_to_local_home}/*.json {path_to_local_home}/*.csv " 119 | ) 120 | 121 | # upload scripts 122 | with TaskGroup("upload_scripts_to_s3") as upload_scripts_section: 123 | for index, item in enumerate(local_scripts): 124 | upload_scripts_to_s3_task = LocalFilesystemToS3Operator( 125 | task_id=f"upload_scritps_{index}_to_s3_task", 126 | filename=f"dags/scripts/{item}", 127 | dest_key=f"{S3_SCRIPT_DESTINATION}/{item}", 128 | dest_bucket=S3_BUCKET, 129 | ) 130 | 131 | end = DummyOperator(task_id="end") 132 | 133 | start >> download_section >> upload_section >> cleanup >> end 134 | start >> upload_scripts_section >> end 135 | 136 | -------------------------------------------------------------------------------- /airflow/dags/init_1_spark_emr_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.utils.dates import days_ago 3 | from airflow.operators.dummy import DummyOperator 4 | 5 | from airflow.providers.amazon.aws.operators.emr_add_steps import EmrAddStepsOperator 6 | from airflow.providers.amazon.aws.operators.emr_create_job_flow import EmrCreateJobFlowOperator 7 | from airflow.providers.amazon.aws.operators.emr_terminate_job_flow import EmrTerminateJobFlowOperator 8 | from airflow.providers.amazon.aws.sensors.emr_step import EmrStepSensor 9 | from airflow.sensors.external_task import ExternalTaskSensor 10 | 11 | 12 | SPARK_STEPS = [ 13 | { 14 | "Name": "One-time data transformation", 15 | "ActionOnFailure": "CANCEL_AND_WAIT", 16 | "HadoopJarStep": { 17 | "Jar": "command-runner.jar", 18 | "Args": [ 19 | "spark-submit", 20 | "--deploy-mode", 21 | "client", 22 | "s3://hrc-de-data/utils/scripts/init-data-transformation.py", 23 | ], 24 | }, 25 | } 26 | ] 27 | 28 | JOB_FLOW_OVERRIDES = { 29 | 'Name': 'ExtrasDataTransformer', 30 | 'ReleaseLabel': 'emr-5.34.0', 31 | 'Applications': [{'Name': 'Spark'}, {'Name': 'Hadoop'}], 32 | 'LogUri': 's3n://hrc-de-data/emr/logs', 33 | 'Instances': { 34 | 'InstanceGroups': [ 35 | { 36 | 'Name': 'Primary node', 37 | 'Market': 'SPOT', 38 | 'InstanceRole': 'MASTER', 39 | 'InstanceType': 'm5.xlarge', 40 | 
'InstanceCount': 1, 41 | } 42 | ], 43 | 'KeepJobFlowAliveWhenNoSteps': False, 44 | 'TerminationProtected': False, 45 | }, 46 | 'Steps': SPARK_STEPS, 47 | 'JobFlowRole': 'EMR_EC2_DefaultRole', 48 | 'ServiceRole': 'EMR_DefaultRole', 49 | } 50 | 51 | 52 | 53 | default_args = { 54 | "owner": "airflow", 55 | "start_date": days_ago(1), 56 | "depends_on_past": False, 57 | "retries": 1, 58 | } 59 | 60 | with DAG( 61 | dag_id="init_1_spark_emr_dag", 62 | description=""" 63 | This dag perform a manually triggered and one-time-running spark jobs which processes extra files in s3. 64 | """, 65 | schedule_interval="@once", 66 | default_args=default_args, 67 | catchup=False, 68 | max_active_runs=1, 69 | tags=['spark', 'emr', 'weather', 'stations', 'docking stations', 'london', '2021', 'journey'], 70 | ) as dag: 71 | 72 | 73 | external_task_sensor = ExternalTaskSensor( 74 | task_id='sensor_for_init_0_ingestion_dag', 75 | poke_interval=30, 76 | soft_fail=False, 77 | retries=2, 78 | allowed_states=['success'], 79 | failed_states=['failed', 'skipped'], 80 | external_task_id='end', 81 | external_dag_id='init_0_ingestion_to_s3_dag', 82 | ) 83 | 84 | start = DummyOperator(task_id="start") 85 | 86 | cluster_creator = EmrCreateJobFlowOperator( 87 | task_id='create_job_flow', 88 | job_flow_overrides=JOB_FLOW_OVERRIDES, 89 | 90 | ) 91 | 92 | step_adder = EmrAddStepsOperator( 93 | task_id='add_steps', 94 | job_flow_id=cluster_creator.output, 95 | steps=SPARK_STEPS, 96 | 97 | ) 98 | 99 | step_checker = EmrStepSensor( 100 | task_id='watch_step', 101 | job_flow_id=cluster_creator.output, 102 | step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}", 103 | 104 | ) 105 | 106 | cluster_remover = EmrTerminateJobFlowOperator( 107 | task_id='remove_cluster', job_flow_id=cluster_creator.output, 108 | 109 | ) 110 | 111 | 112 | end = DummyOperator(task_id="end") 113 | 114 | external_task_sensor >> start >> cluster_creator >> step_adder >> step_checker >> cluster_remover >> end -------------------------------------------------------------------------------- /airflow/dags/init_2_s3_to_redshifht_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from airflow import DAG 4 | from airflow.utils.dates import days_ago 5 | from airflow.operators.dummy import DummyOperator 6 | from airflow.utils.task_group import TaskGroup 7 | from airflow.providers.amazon.aws.transfers.s3_to_redshift import S3ToRedshiftOperator 8 | from airflow.sensors.external_task import ExternalTaskSensor 9 | 10 | 11 | S3_BUCKET = os.environ.get("S3_BUCKET", "s3_no_bucket") 12 | S3_KEY_EXTRAS = f"processed/cycling-dimension" 13 | S3_KEY_JOURNEY = f"processed/test" 14 | 15 | s3_objects= [ 16 | # we will only load the processed weather dimensional data in this dag, 17 | # as the stations and datetime data still receives updates from proc_2_spark_emr_dag.py. 18 | { 19 | 'type': 'weather', 20 | 'key': S3_KEY_EXTRAS, 21 | 'filename': 'weather/', 22 | 'table': 'dim_weather', 23 | 'file_type': 'parquet', 24 | 'upsert_key': 'weather_date' 25 | } 26 | ] 27 | 28 | 29 | default_args = { 30 | "owner": "airflow", 31 | "start_date": days_ago(1), 32 | "depends_on_past": False, 33 | "retries": 1, 34 | } 35 | 36 | # NOTE: DAG declaration - using a Context Manager (an implicit way) 37 | with DAG( 38 | dag_id="init_2_s3_to_redshifht_dag", 39 | description=""" 40 | This dag transfers extra files for dimensions from S3 to Redshift. 
41 | """, 42 | schedule_interval="@once", 43 | default_args=default_args, 44 | catchup=False, 45 | max_active_runs=3, 46 | tags=['weather', 'stations', '2021', 's3 to redshift'], 47 | ) as dag: 48 | 49 | 50 | external_task_sensor = ExternalTaskSensor( 51 | task_id='sensor_for_init_1_spark_dag', 52 | poke_interval=30, 53 | soft_fail=False, 54 | retries=2, 55 | allowed_states=['success'], 56 | failed_states=['failed', 'skipped'], 57 | external_task_id='end', 58 | external_dag_id='init_1_spark_emr_dag', 59 | ) 60 | 61 | start = DummyOperator(task_id="start") 62 | 63 | with TaskGroup("load_files_to_redshift") as transfer_section: 64 | for item in s3_objects: 65 | transfer_task = S3ToRedshiftOperator( 66 | s3_bucket=S3_BUCKET, 67 | s3_key=f"{item['key']}/{item['filename']}", 68 | schema="PUBLIC", 69 | table=item['table'], 70 | copy_options=[item['file_type']], 71 | method='UPSERT', 72 | upsert_keys= [item['upsert_key']], 73 | task_id=f"transfer_{item['type']}_s3_to_redshift", 74 | ) 75 | 76 | end = DummyOperator(task_id="end") 77 | 78 | external_task_sensor >> start >> transfer_section >> end 79 | -------------------------------------------------------------------------------- /airflow/dags/init_3_web_scraping_dag.py: -------------------------------------------------------------------------------- 1 | # imports 2 | from airflow import DAG 3 | from airflow.operators.python import PythonOperator 4 | from airflow.utils.dates import datetime 5 | from airflow.utils.dates import days_ago 6 | 7 | from bs4 import BeautifulSoup 8 | 9 | # selenium will be used to scrap dynamic content of the webpage, our data source of our data 10 | from selenium import webdriver 11 | from webdriver_manager.firefox import GeckoDriverManager 12 | from selenium.webdriver.firefox.options import Options as FirefoxOptions 13 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 14 | 15 | from selenium.webdriver.common.by import By 16 | from selenium.webdriver.support.ui import WebDriverWait 17 | from selenium.webdriver.support import expected_conditions as EC 18 | from airflow.sensors.external_task import ExternalTaskSensor 19 | 20 | import json 21 | 22 | 23 | url= "https://cycling.data.tfl.gov.uk" 24 | dictionary_file= "links_dictionary.json" 25 | 26 | def contents_downloader(**kwargs): 27 | cap = DesiredCapabilities().FIREFOX 28 | cap["marionette"] = False 29 | 30 | options = FirefoxOptions() 31 | options = webdriver.FirefoxOptions() 32 | options.log.level = "TRACE" 33 | options.add_argument('--no-sandbox') 34 | options.add_argument('--headless') 35 | options.add_argument('--disable-gpu') 36 | 37 | browser = webdriver.Firefox(capabilities=cap, executable_path=GeckoDriverManager().install(), options=options) 38 | browser.get(url) 39 | 40 | # wait until at least a single element of the table exists 41 | wait = WebDriverWait(browser, 20) 42 | wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/table/tbody/tr[1]/td[1]'))) 43 | content= browser.page_source 44 | 45 | kwargs['ti'].xcom_push(key='html_content', value=content) 46 | 47 | 48 | def links_extractor(**kwargs): 49 | task_instance= kwargs['ti'] 50 | html_element= task_instance.xcom_pull(key='html_content', task_ids='download_contents_task') 51 | 52 | bsoup= BeautifulSoup(html_element, "html.parser") 53 | 54 | table= bsoup.find('table') 55 | tbody= table.find('tbody') 56 | folder_name= "usage-stats/" 57 | capture_files= False 58 | years= [2021, 2022] 59 | filetype= 'csv' 60 | extracted_files= {} 61 | 62 | for row in 
tbody.find_all('tr'): 63 | columns= row.find_all('td') 64 | 65 | if capture_files == False: 66 | col_values= [col.text.strip() for col in columns] 67 | 68 | if col_values[0] == folder_name: 69 | capture_files= True 70 | continue 71 | 72 | else: 73 | col= columns[0] 74 | filename= col.text.strip() 75 | filename_without_extension= filename.split('.')[-2] 76 | year_in_filename= filename_without_extension[-4:] 77 | 78 | if not year_in_filename.isdigit() or not int(year_in_filename) in years: 79 | continue 80 | 81 | # extract the date (e.g 257JourneyDataExtract17Mar2021-23Mar2021.csv --> 23Mar2021) 82 | 83 | filename_last_date= filename_without_extension.split('-')[-1] 84 | extracted_files[filename_last_date]= col.a['href'] 85 | 86 | kwargs['ti'].xcom_push(key="dictionary", value=extracted_files) 87 | 88 | 89 | def dico_exporter(**kwargs): 90 | task_instance= kwargs['ti'] 91 | links_dictionary= task_instance.xcom_pull(key="dictionary", task_ids="extract_links_task") 92 | 93 | # serialize json 94 | links_json_object = json.dumps(links_dictionary, indent = 4) 95 | 96 | # save into a dico file 97 | with open(dictionary_file, 'w', encoding='utf-8') as f: 98 | f.write(links_json_object) 99 | 100 | 101 | 102 | ''' 103 | TODO: We need to manually trigger this dag for the very first time in order 104 | for the s3_ingestion_dag to have links dictionary to work with. 105 | After the first run, this dag will run every Tuesday at 11:50pm, 106 | only 5 minutes before the ingestion dag runs. 107 | ''' 108 | default_args = { 109 | "owner": "airflow", 110 | "start_date": days_ago(1), 111 | "depends_on_past": False, 112 | "retries": 1 113 | } 114 | 115 | with DAG( 116 | dag_id="init_3__web_scraping_dag", 117 | schedule_interval="@once", 118 | default_args=default_args, 119 | catchup=True, 120 | max_active_runs=1, 121 | tags=['web', 'scraping', 'links', 'source'], 122 | ) as dag: 123 | 124 | 125 | external_task_sensor = ExternalTaskSensor( 126 | task_id='sensor_for_init_2_s3_to_redshift_dag', 127 | poke_interval=30, 128 | soft_fail=False, 129 | retries=2, 130 | allowed_states=['success'], 131 | failed_states=['failed', 'skipped'], 132 | external_task_id='end', 133 | external_dag_id='init_2_s3_to_redshifht_dag', 134 | ) 135 | 136 | download_web_contents_task = PythonOperator( 137 | task_id="download_contents_task", 138 | provide_context=True, 139 | python_callable=contents_downloader 140 | ) 141 | 142 | extract_links_task = PythonOperator( 143 | task_id="extract_links_task", 144 | provide_context=True, 145 | python_callable=links_extractor, 146 | 147 | ) 148 | 149 | export_links_task = PythonOperator( 150 | task_id="exporter_links_task", 151 | provide_context=True, 152 | python_callable=dico_exporter 153 | ) 154 | 155 | 156 | external_task_sensor >> download_web_contents_task >> extract_links_task >> export_links_task -------------------------------------------------------------------------------- /airflow/dags/proc_0_ingestion_to_s3_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import logging 4 | 5 | from airflow import DAG 6 | from airflow.operators.bash import BashOperator 7 | from airflow.operators.python import PythonOperator 8 | from airflow.providers.amazon.aws.transfers.local_to_s3 import LocalFilesystemToS3Operator 9 | from airflow.utils.dates import datetime 10 | 11 | 12 | import json 13 | 14 | # https://cycling.data.tfl.gov.uk/usage-stats/cycling-load.json 15 | 16 | path_to_local_home = os.environ.get("AIRFLOW_HOME", "/opt/airflow/") 17 | 
S3_DESTINATION= 'raw/cycling-journey/{{logical_date.strftime(\'%b%Y\')}}' 18 | S3_BUCKET = os.environ.get("S3_BUCKET", "s3_no_bucket") 19 | 20 | dictionary_file= "links_dictionary.json" 21 | 22 | 23 | def get_file_link(exec_date, s3_destination_folder, **kwargs): 24 | links= {} 25 | with open(dictionary_file) as dico_file: 26 | links= json.load(dico_file) 27 | 28 | file_link= links[exec_date] 29 | filename= file_link.split('/')[-1] 30 | 31 | kwargs['ti'].xcom_push(key="remote_file_link", value=file_link) 32 | kwargs['ti'].xcom_push(key="filename", value=filename) 33 | kwargs['ti'].xcom_push(key="local_file_link", value=f"{path_to_local_home}/{filename}") 34 | kwargs['ti'].xcom_push(key="s3_filepath_destination", value=f"{s3_destination_folder}/{filename}") 35 | 36 | 37 | download_cmd= "curl -sSLf $link > $destination" 38 | 39 | 40 | default_args = { 41 | "owner": "airflow", 42 | "start_date": datetime(2021, 1, 1), 43 | "depends_on_past": True, # the previous task instance needs to have succeeded for the current one to run 44 | "retries": 1, 45 | } 46 | 47 | with DAG( 48 | dag_id="proc_1_ingestion_to_s3_dag", 49 | schedule_interval="55 23 * * 2", # run this dag every Tuesday at 11:55pm 50 | max_active_runs=3, 51 | catchup=True, 52 | tags=['s3', 'aws', 'ingestion', 'cycling'], 53 | default_args=default_args 54 | ) as dag: 55 | 56 | get_file_link_task = PythonOperator( 57 | task_id="get_file_link_task", 58 | provide_context=True, 59 | python_callable=get_file_link, 60 | op_kwargs={ 61 | "exec_date": "{{execution_date.strftime('%d%b%Y')}}", 62 | "s3_destination_folder": S3_DESTINATION 63 | } 64 | ) 65 | 66 | 67 | download_dataset_task = BashOperator( 68 | task_id="download_dataset_task", 69 | bash_command=download_cmd, 70 | env={ 71 | "link": "{{ti.xcom_pull(key='remote_file_link')}}", 72 | "destination": "{{ti.xcom_pull(key='local_file_link')}}" 73 | } 74 | ) 75 | 76 | 77 | upload_to_s3_task = LocalFilesystemToS3Operator( 78 | task_id="upload_to_s3", 79 | filename="{{ti.xcom_pull(key='filename')}}", 80 | dest_key="{{ti.xcom_pull(key='s3_filepath_destination')}}", 81 | dest_bucket=S3_BUCKET, 82 | ) 83 | 84 | cleanup_local_storage_task = BashOperator( 85 | task_id="cleanup_local_storage_task", 86 | bash_command="rm {{ti.xcom_pull(key='local_file_link')}}" 87 | ) 88 | 89 | get_file_link_task >> download_dataset_task >> upload_to_s3_task >> cleanup_local_storage_task -------------------------------------------------------------------------------- /airflow/dags/proc_1_spark_emr_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from airflow import DAG 4 | from airflow.utils.dates import days_ago 5 | from airflow.operators.dummy import DummyOperator 6 | from airflow.operators.bash import BashOperator 7 | from airflow.operators.python import PythonOperator 8 | from airflow.utils.task_group import TaskGroup 9 | 10 | from airflow import DAG 11 | from airflow.providers.amazon.aws.operators.emr_add_steps import EmrAddStepsOperator 12 | from airflow.providers.amazon.aws.operators.emr_create_job_flow import EmrCreateJobFlowOperator 13 | from airflow.providers.amazon.aws.operators.emr_terminate_job_flow import EmrTerminateJobFlowOperator 14 | 15 | from airflow.providers.amazon.aws.sensors.emr_step import EmrStepSensor 16 | 17 | BUCKET_NAME = os.environ.get("S3_BUCKET", "s3_no_bucket") 18 | local_scripts = "dags/scripts" 19 | s3_script = "utils/scripts/" 20 | 21 | 22 | SPARK_STEPS = [ 23 | { 24 | "Name": "Journey data transformation", 25 | 
"ActionOnFailure": "CANCEL_AND_WAIT", 26 | "HadoopJarStep": { 27 | "Jar": "command-runner.jar", 28 | "Args": [ 29 | "spark-submit", 30 | "--deploy-mode", 31 | "client", 32 | "s3://hrc-de-data/utils/scripts/journey-data-transformation.py", 33 | ], 34 | }, 35 | } 36 | ] 37 | 38 | JOB_FLOW_OVERRIDES = { 39 | 'Name': 'ExtrasDataTransformer', 40 | 'ReleaseLabel': 'emr-5.34.0', 41 | 'Applications': [{'Name': 'Spark'}, {'Name': 'Hadoop'}], 42 | 'LogUri': 's3n://hrc-de-data/emr/logs', 43 | 'Instances': { 44 | 'InstanceGroups': [ 45 | { 46 | 'Name': 'Primary node', 47 | 'Market': 'SPOT', 48 | 'InstanceRole': 'MASTER', 49 | 'InstanceType': 'm5.xlarge', 50 | 'InstanceCount': 1, 51 | }, 52 | { 53 | "Name": "Core node", 54 | "Market": "SPOT", 55 | "InstanceRole": "CORE", 56 | "InstanceType": "m5.xlarge", 57 | "InstanceCount": 2, 58 | }, 59 | ], 60 | 'KeepJobFlowAliveWhenNoSteps': False, 61 | 'TerminationProtected': False, 62 | }, 63 | 'Steps': SPARK_STEPS, 64 | 'JobFlowRole': 'EMR_EC2_DefaultRole', 65 | 'ServiceRole': 'EMR_DefaultRole', 66 | } 67 | 68 | 69 | 70 | 71 | default_args = { 72 | "owner": "airflow", 73 | "start_date": days_ago(1), 74 | "depends_on_past": False, 75 | "retries": 1, 76 | } 77 | 78 | with DAG( 79 | dag_id="proc_2_spark_emr_dag", 80 | description=""" 81 | This dag perform a manually triggered spark jobs which processes extra files in s3. 82 | """, 83 | schedule_interval="@once", 84 | default_args=default_args, 85 | catchup=False, 86 | max_active_runs=1, 87 | tags=['spark', 'emr', 'weather', 'stations', 'docking stations', 'london', '2021', 'journey'], 88 | ) as dag: 89 | 90 | start = DummyOperator(task_id="start") 91 | 92 | cluster_creator = EmrCreateJobFlowOperator( 93 | task_id='create_job_flow', 94 | job_flow_overrides=JOB_FLOW_OVERRIDES, 95 | aws_conn_id='aws_default' 96 | ) 97 | 98 | step_adder = EmrAddStepsOperator( 99 | task_id='add_steps', 100 | job_flow_id=cluster_creator.output, 101 | steps=SPARK_STEPS, 102 | params={ 103 | "BUCKET": BUCKET_NAME, 104 | "s3_script": s3_script 105 | }, 106 | aws_conn_id='aws_default' 107 | ) 108 | 109 | step_checker = EmrStepSensor( 110 | task_id='watch_step', 111 | job_flow_id=cluster_creator.output, 112 | step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}", 113 | aws_conn_id='aws_default' 114 | ) 115 | 116 | cluster_remover = EmrTerminateJobFlowOperator( 117 | task_id='remove_cluster', job_flow_id=cluster_creator.output, 118 | aws_conn_id='aws_default' 119 | ) 120 | 121 | 122 | end = DummyOperator(task_id="end") 123 | 124 | start >> cluster_creator >> step_adder >> step_checker >> cluster_remover >> end -------------------------------------------------------------------------------- /airflow/dags/proc_2_s3_to_redshifht_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from airflow import DAG 4 | from airflow.utils.dates import days_ago 5 | from airflow.operators.dummy import DummyOperator 6 | from airflow.utils.task_group import TaskGroup 7 | from airflow.providers.amazon.aws.transfers.s3_to_redshift import S3ToRedshiftOperator 8 | 9 | S3_BUCKET = os.environ.get("S3_BUCKET", "s3_no_bucket") 10 | S3_KEY_DIMS = f"processed/cycling-dimension" 11 | S3_KEY_JOURNEY = f"processed/cycling-fact" 12 | 13 | s3_objects= [ 14 | { 15 | 'type': 'stations', 16 | 'key': S3_KEY_DIMS, 17 | 'filename': 'stations/', 18 | 'table': 'dim_station', 19 | 'file_type': 'parquet', 20 | 'upsert_key': 'station_id' 21 | }, 22 | { 23 | 'type': 'datetime', 24 | 'key': 
S3_KEY_DIMS, 25 | 'filename': 'datetime/', 26 | 'table': 'dim_datetime', 27 | 'file_type': 'parquet', 28 | 'upsert_key': 'datetime_id' 29 | }, 30 | { 31 | 'type': 'journey', 32 | 'key': S3_KEY_JOURNEY, 33 | 'filename': 'journey/', 34 | 'table': 'fact_journey', 35 | 'file_type': 'parquet', 36 | 'upsert_key': 'rental_id' 37 | } 38 | 39 | ] 40 | 41 | 42 | default_args = { 43 | "owner": "airflow", 44 | "start_date": days_ago(1), 45 | "depends_on_past": False, 46 | "retries": 1, 47 | } 48 | 49 | # NOTE: DAG declaration - using a Context Manager (an implicit way) 50 | with DAG( 51 | dag_id="proc_3_s3_to_redshifht_dag", 52 | description=""" 53 | This dag transfers extra files for dimensions from S3 to Redshift. 54 | """, 55 | schedule_interval="@once", 56 | default_args=default_args, 57 | catchup=False, 58 | max_active_runs=3, 59 | tags=['weather', 'stations', '2021', 's3 to redshift'], 60 | ) as dag: 61 | 62 | start = DummyOperator(task_id="start") 63 | 64 | with TaskGroup("load_files_to_redshift") as transfer_section: 65 | for item in s3_objects: 66 | transfer_task = S3ToRedshiftOperator( 67 | s3_bucket=S3_BUCKET, 68 | s3_key=f"{item['key']}/{item['filename']}", 69 | schema="PUBLIC", 70 | table=item['table'], 71 | copy_options=[item['file_type']], 72 | method='UPSERT', 73 | upsert_keys= [item['upsert_key']], 74 | task_id=f"transfer_{item['type']}_s3_to_redshift", 75 | ) 76 | 77 | end = DummyOperator(task_id="end") 78 | 79 | start >> transfer_section >> end 80 | -------------------------------------------------------------------------------- /airflow/dags/scripts/init-data-transformation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # ## One time data transformation 5 | # In this notebook, we are going to transform the stations and weather data in such a way that they will be conformed to the redshift schema for their corresponding tables. 6 | # 7 | # The preprocessed data will be saved back to S3 before getting loaded to Redshift. 8 | 9 | import pyspark 10 | import os 11 | 12 | pyspark.__version__ 13 | 14 | from pyspark.sql import SparkSession 15 | 16 | spark = SparkSession.builder\ 17 | .master('local[*]')\ 18 | .appName('data-transformer')\ 19 | .getOrCreate() 20 | 21 | sc = spark.sparkContext 22 | 23 | df_stations = spark.read.csv("s3a://hrc-de-data/raw/cycling-extras/stations.csv", inferSchema=True, header=True) 24 | df_stations.take(2) 25 | 26 | df_stations.printSchema() 27 | 28 | 29 | from pyspark.sql import functions as F, types as T 30 | 31 | # rename columns 32 | stations= df_stations.withColumnRenamed('Station.Id', 'station_id')\ 33 | .withColumnRenamed('StationName', 'station_name')\ 34 | .withColumnRenamed('easting', 'easting')\ 35 | .withColumnRenamed('northing', 'northing') 36 | 37 | stations.show(5) 38 | 39 | 40 | # count missing values in each column 41 | stations.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in stations.columns]).show() 42 | 43 | stations.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/stations/', mode='overwrite') 44 | 45 | 46 | # ### 2. 
Weather data 47 | 48 | df_weather = spark.read.json("s3a://hrc-de-data/raw/cycling-extras/weather.json") 49 | 50 | df_weather.take(2) 51 | 52 | df_weather.printSchema() 53 | 54 | # drop some columns that we won't need 55 | weather= df_weather.drop('cloudcover', 'conditions', 'datetimeEpoch', 'description', 'dew', 'icon', 56 | 'precipcover', 'preciptype', 'source', 'stations', 'sunriseEpoch', 'sunsetEpoch') 57 | 58 | 59 | # transform datetime 60 | weather= weather.withColumnRenamed('datetime', 'weather_date') 61 | weather= weather.withColumn('weather_date', weather.weather_date.cast(T.DateType())) 62 | 63 | weather.printSchema() 64 | print(len(weather.columns), 'columns') 65 | 66 | 67 | # count missing values in each column 68 | cols= weather.columns 69 | cols.remove('weather_date') 70 | 71 | missing_values= weather.select([F.count(F.when(F.col(c).isNull() | F.isnan(c), c)).alias(c) for c in cols]) 72 | 73 | missing_values.show() 74 | 75 | 76 | perc_missing_values= weather.select([(F.count(F.when(F.isnan(c) | F.col(c).isNull(), c))/F.count(F.lit(1))).alias(c) for c in cols]) 77 | perc_missing_values.show() 78 | 79 | 80 | # drop columns where missing values are more than 70% 81 | 82 | weather= weather.drop('precipprob', 'snow', 'snowdepth') 83 | 84 | if 'severerisk' in weather.columns: 85 | weather= weather.drop('severerisk') 86 | 87 | 88 | weather.columns 89 | 90 | weather= weather.repartition(10) 91 | 92 | weather.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/weather/', mode='overwrite') 93 | -------------------------------------------------------------------------------- /airflow/dags/scripts/journey-data-transformation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # ## Transformation for rental journey data 5 | # This notebook is responsible for transforming journey data by performing the following tasks: 6 | # 7 | # 1. Renaming columns (removing spaces and lowercasing) 8 | # 9 | # 2. Convert data types from string to timestamps 10 | # 11 | # 3. Attach weather dates 12 | # 13 | # 4. Drop unnecessary columns 14 | # 15 | # 5. 
Update extra files for dimension tables 16 | 17 | import pyspark 18 | import os 19 | 20 | from pyspark.sql import SparkSession 21 | 22 | spark = SparkSession.builder\ 23 | .master('local[*]') \ 24 | .appName('journey-and-stations-data-transformer')\ 25 | .getOrCreate() 26 | 27 | # get journey data 28 | df_journey = spark.read.csv("s3a://hrc-de-data/raw/cycling-journey/*/*", inferSchema=True, header=True) 29 | 30 | df_journey.take(2) 31 | 32 | df_journey.printSchema() 33 | 34 | from pyspark.sql.functions import * 35 | from pyspark.sql.types import * 36 | 37 | # rename columns 38 | df_journey= df_journey.withColumnRenamed('Rental Id', 'rental_id').withColumnRenamed('Bike Id', 'bike_id').withColumnRenamed('Start Date', 'start_date').withColumnRenamed('End Date', 'end_date').withColumnRenamed('StartStation Id', 'start_station').withColumnRenamed('EndStation Id', 'end_station') 39 | 40 | # convert data types 41 | df_journey= df_journey.withColumn('start_date', to_timestamp(col('start_date'), 'dd/MM/yyy HH:mm')) 42 | 43 | df_journey= df_journey.withColumn('end_date', to_timestamp(col('end_date'), 'dd/MM/yyy HH:mm')) 44 | 45 | # add weather_date column 46 | df_journey= df_journey.withColumn('weather_date', to_date(col("start_date"), 'dd/MM/yyy HH:mm')) 47 | 48 | 49 | df_journey.show(5) 50 | df_journey.printSchema() 51 | 52 | 53 | # ### Stations data 54 | # We are going to update the stations data (previously saved by another process) with some additional stations that are not present in the original stations data but are seen in some journey. 55 | 56 | # read previously saved stations data from parquet 57 | df_processed_stations= spark.read.parquet('s3a://hrc-de-data/processed/cycling-dimension/stations/') 58 | 59 | # create temporary table for both stations and journey 60 | df_journey.createOrReplaceTempView('journey') 61 | df_processed_stations.createOrReplaceTempView('station') 62 | 63 | 64 | # we keep all the stations which are not found in the temp view station table 65 | additional_stations= spark.sql(''' 66 | with station_ids as ( 67 | select 68 | station_id 69 | from 70 | station 71 | ) 72 | 73 | select 74 | distinct(start_station) as station_id, 75 | `StartStation Name` as station_name 76 | from 77 | journey 78 | where 79 | start_station not in (table station_ids) 80 | 81 | union 82 | 83 | select 84 | distinct(end_station) as station_id, 85 | `EndStation Name` as station_name 86 | from 87 | journey 88 | where 89 | end_station not in (table station_ids) 90 | ''') 91 | additional_stations.show() 92 | 93 | 94 | # add columns to the additional stations to avoid errors when merging it to the previous one (df_processed_stations) 95 | additional_stations= additional_stations.withColumn('longitude', lit(0).cast(DoubleType())).withColumn('latitude', lit(0).cast(DoubleType())).withColumn('easting', lit(0).cast(DoubleType())).withColumn('northing', lit(0).cast(DoubleType())) 96 | 97 | additional_stations.show(5) 98 | additional_stations.printSchema() 99 | 100 | 101 | # remove duplicate values 102 | additional_stations= additional_stations.dropDuplicates(['station_id']) 103 | 104 | 105 | # save additional stations data into parquet files in s3 106 | additional_stations.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/stations/', mode='append') 107 | 108 | 109 | # drop other unnecessary journey columns 110 | df_journey= df_journey.drop('StartStation Name', 'EndStation Name', 'Duration') 111 | 112 | 113 | # ### Datetime 114 | # We are going to create/update datetime data from the start and 
end date of each journey. 115 | 116 | # extract datetime values from the start and the end date 117 | df_datetime_from_start= ( 118 | df_journey.select( 119 | col('start_date').alias('datetime_id'), 120 | year(col('start_date')).alias('year'), 121 | dayofweek(col('start_date')).alias('week_day'), 122 | month(col('start_date')).alias('month'), 123 | dayofmonth(col('start_date')).alias('day'), 124 | hour(col('start_date')).alias('hour'), 125 | minute(col('start_date')).alias('minute'), 126 | second(col('start_date')).alias('second'), 127 | ) 128 | ) 129 | df_datetime_from_end= ( 130 | df_journey.select( 131 | col('end_date').alias('datetime_id'), 132 | year(col('end_date')).alias('year'), 133 | dayofweek(col('end_date')).alias('week_day'), 134 | month(col('end_date')).alias('month'), 135 | dayofmonth(col('end_date')).alias('day'), 136 | hour(col('end_date')).alias('hour'), 137 | minute(col('end_date')).alias('minute'), 138 | second(col('end_date')).alias('second'), 139 | ) 140 | ) 141 | 142 | df_datetime_from_start.show(3) 143 | df_datetime_from_end.show(3) 144 | 145 | 146 | # combine the dataframes 147 | df_datetime= df_datetime_from_start.union(df_datetime_from_end) 148 | 149 | # remove duplicate entries 150 | df_datetime= df_datetime.dropDuplicates(['datetime_id']) 151 | 152 | df_datetime.show(10) 153 | 154 | 155 | # save datetime data into parquet files in s3 156 | df_datetime.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/datetime/', mode='append') 157 | 158 | 159 | # finally, save journey data into parquet files in s3 160 | df_journey.write.parquet('s3a://hrc-de-data/processed/cycling-fact/journey/', mode='append') 161 | -------------------------------------------------------------------------------- /airflow/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.2.3 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 31 | # 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 
33 | # Default: airflow 34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 35 | # Default: airflow 36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 37 | # Default: '' 38 | # 39 | # Feel free to modify this file to suit your needs. 40 | --- 41 | version: '3' 42 | x-airflow-common: 43 | &airflow-common 44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 46 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 47 | # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.2.3} 48 | image: airflow-img:latest # can be replaced by any airflow image that was built from the Dockerfile 49 | env_file: 50 | - .env 51 | # build: . 52 | environment: 53 | &airflow-common-env 54 | AIRFLOW__CORE__EXECUTOR: LocalExecutor 55 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 56 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 57 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 58 | AIRFLOW__CORE__FERNET_KEY: '' 59 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 60 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 61 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' 62 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 63 | volumes: 64 | - ./dags:/opt/airflow/dags 65 | - ./logs:/opt/airflow/logs 66 | - ./plugins:/opt/airflow/plugins 67 | - ~/.aws:/home/airflow/.aws 68 | user: "${AIRFLOW_UID:-50000}:0" 69 | depends_on: 70 | &airflow-common-depends-on 71 | postgres: 72 | condition: service_healthy 73 | 74 | services: 75 | postgres: 76 | image: postgres:13 77 | environment: 78 | POSTGRES_USER: airflow 79 | POSTGRES_PASSWORD: airflow 80 | POSTGRES_DB: airflow 81 | volumes: 82 | - postgres-db-volume:/var/lib/postgresql/data 83 | healthcheck: 84 | test: ["CMD", "pg_isready", "-U", "airflow"] 85 | interval: 5s 86 | retries: 5 87 | restart: always 88 | 89 | airflow-webserver: 90 | <<: *airflow-common 91 | command: webserver 92 | ports: 93 | - 8080:8080 94 | healthcheck: 95 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 96 | interval: 10s 97 | timeout: 10s 98 | retries: 5 99 | restart: always 100 | depends_on: 101 | <<: *airflow-common-depends-on 102 | airflow-init: 103 | condition: service_completed_successfully 104 | 105 | airflow-scheduler: 106 | <<: *airflow-common 107 | command: scheduler 108 | healthcheck: 109 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"'] 110 | interval: 10s 111 | timeout: 10s 112 | retries: 5 113 | restart: always 114 | depends_on: 115 | <<: *airflow-common-depends-on 116 | airflow-init: 117 | condition: service_completed_successfully 118 | 119 | 120 | airflow-init: 121 | <<: *airflow-common 122 | entrypoint: /bin/bash 123 | # yamllint disable rule:line-length 124 | command: 125 | - -c 126 | - | 127 | function ver() { 128 | printf "%04d%04d%04d%04d" $${1//./ } 129 | } 130 | airflow_version=$$(gosu airflow airflow version) 131 | airflow_version_comparable=$$(ver $${airflow_version}) 132 | min_airflow_version=2.2.0 133 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 134 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 135 | echo 136 | echo -e "\033[1;31mERROR!!!: Too old Airflow version 
$${airflow_version}!\e[0m" 137 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 138 | echo 139 | exit 1 140 | fi 141 | if [[ -z "${AIRFLOW_UID}" ]]; then 142 | echo 143 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 144 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 145 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 146 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 147 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user" 148 | echo 149 | fi 150 | one_meg=1048576 151 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 152 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 153 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 154 | warning_resources="false" 155 | if (( mem_available < 4000 )) ; then 156 | echo 157 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 158 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 159 | echo 160 | warning_resources="true" 161 | fi 162 | if (( cpus_available < 2 )); then 163 | echo 164 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 165 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 166 | echo 167 | warning_resources="true" 168 | fi 169 | if (( disk_available < one_meg * 10 )); then 170 | echo 171 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 172 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 173 | echo 174 | warning_resources="true" 175 | fi 176 | if [[ $${warning_resources} == "true" ]]; then 177 | echo 178 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 179 | echo "Please follow the instructions to increase amount of resources available:" 180 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin" 181 | echo 182 | fi 183 | mkdir -p /sources/logs /sources/dags /sources/plugins 184 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 185 | exec /entrypoint airflow version 186 | # yamllint enable rule:line-length 187 | environment: 188 | <<: *airflow-common-env 189 | _AIRFLOW_DB_UPGRADE: 'true' 190 | _AIRFLOW_WWW_USER_CREATE: 'true' 191 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 192 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 193 | user: "0:0" 194 | volumes: 195 | - .:/sources 196 | 197 | airflow-cli: 198 | <<: *airflow-common 199 | profiles: 200 | - debug 201 | environment: 202 | <<: *airflow-common-env 203 | CONNECTION_CHECK_MAX_COUNT: "0" 204 | # Workaround for entrypoint issue. 
See: https://github.com/apache/airflow/issues/16252 205 | command: 206 | - bash 207 | - -c 208 | - airflow 209 | 210 | volumes: 211 | postgres-db-volume: 212 | -------------------------------------------------------------------------------- /airflow/logs/scheduler/latest: -------------------------------------------------------------------------------- 1 | /opt/airflow/logs/scheduler/2022-03-11 -------------------------------------------------------------------------------- /airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow-providers-amazon 2 | bs4 3 | pandas -------------------------------------------------------------------------------- /images/CyclingERD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/CyclingERD.png -------------------------------------------------------------------------------- /images/batch-on-aws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/batch-on-aws.png -------------------------------------------------------------------------------- /images/dags/init_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/init_0.png -------------------------------------------------------------------------------- /images/dags/init_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/init_1.png -------------------------------------------------------------------------------- /images/dags/init_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/init_2.png -------------------------------------------------------------------------------- /images/dags/init_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/init_3.png -------------------------------------------------------------------------------- /images/dags/inits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/inits.png -------------------------------------------------------------------------------- /images/dags/proc_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/proc_0.png -------------------------------------------------------------------------------- /images/dags/proc_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/proc_1.png 
-------------------------------------------------------------------------------- /images/dags/proc_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/dags/proc_2.png -------------------------------------------------------------------------------- /images/final-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/final-dashboard.png -------------------------------------------------------------------------------- /images/redshift-metabase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/0859de8ace1e5da1962da677e14ecfe079a22048/images/redshift-metabase.png -------------------------------------------------------------------------------- /metabase/README.md: -------------------------------------------------------------------------------- 1 | ## Running Metabase 2 | In this project, we are using Metabase inside a Docker container. The [official documentation](https://www.metabase.com/docs/latest/operations-guide/running-metabase-on-docker.html) clearly describes a simple way to install Metabase in a container. 3 | 4 | It is as simple as running: 5 | 6 | ```bash 7 | docker run -d -p 3033:3000 --name metabase metabase/metabase 8 | ``` 9 | 10 | On its first run, the above command downloads the latest available Metabase Docker image before exposing the application on port `3033`. 11 | 12 | In Metabase, we can set up a connection to the Redshift database as follows: 13 | 14 | ![Metabase - Redshift connection](/images/redshift-metabase.png "Connecting Metabase to Redshift") 15 | 16 | Note: replace the database credentials with the values relevant to your environment. 17 | 18 | -------------------------------------------------------------------------------- /notebook/data-exploration/Exploration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Exploration of the dataset\n", 8 | "The TFL website contains **Santander cycling data** structured in different directories.\n", 9 | "In this notebook, we are going to read a single file as an example of the cycling journey data.\n", 10 | "\n", 11 | "Additionally, we will read the **docking stations data**, which was found outside the main TFL website. \n", 12 | "The stations data contains the list of departure and destination stations mentioned in each cycling journey.\n", 13 | "\n", 14 | "Our third dataset consists of the **historical weather data** in London for the year 2021. The data is recorded daily, with 36 weather attributes per day. It was originally retrieved from the www.visualcrossing.com website, then stored in Google Drive for easy access."
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# import packages\n", 24 | "import pandas as pd\n", 25 | "import json" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Cycling journey data" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "--2022-02-21 09:24:34-- https://cycling.data.tfl.gov.uk/usage-stats/252JourneyDataExtract10Feb2021-16Feb2021.csv\n", 45 | "Resolving cycling.data.tfl.gov.uk (cycling.data.tfl.gov.uk)... 54.230.115.31, 54.230.115.80, 54.230.115.35, ...\n", 46 | "Connecting to cycling.data.tfl.gov.uk (cycling.data.tfl.gov.uk)|54.230.115.31|:443... connected.\n", 47 | "HTTP request sent, awaiting response... 200 OK\n", 48 | "Length: 11036049 (11M) [text/csv]\n", 49 | "Saving to: ‘journey10Feb2021-16Feb2021.csv’\n", 50 | "\n", 51 | "journey10Feb2021-16 100%[===================>] 10.52M 2.59MB/s in 4.1s \n", 52 | "\n", 53 | "2022-02-21 09:24:40 (2.59 MB/s) - ‘journey10Feb2021-16Feb2021.csv’ saved [11036049/11036049]\n", 54 | "\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "# download an example file\n", 60 | "!wget https://cycling.data.tfl.gov.uk/usage-stats/252JourneyDataExtract10Feb2021-16Feb2021.csv -O journey10Feb2021-16Feb2021.csv" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 12, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/html": [ 71 | "
\n", 72 | "\n", 85 | "\n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | "
Rental IdDurationBike IdEnd DateEndStation IdEndStation NameStart DateStartStation IdStartStation Name
010540128533601749715/02/2021 20:55785Aquatic Centre, Queen Elizabeth Olympic Park15/02/2021 19:59785Aquatic Centre, Queen Elizabeth Olympic Park
11053222261020467710/02/2021 08:03194Hop Exchange, The Borough10/02/2021 07:4614Belgrove Street , King's Cross
21053518464801804612/02/2021 15:2627Bouverie Street, Temple12/02/2021 15:18196Union Street, The Borough
31053242291801978510/02/2021 10:46195Milroy Walk, South Bank10/02/2021 10:43196Union Street, The Borough
41053506967201424312/02/2021 14:17274Warwick Road, Olympia12/02/2021 14:05219Bramham Gardens, Earl's Court
\n", 163 | "
" 164 | ], 165 | "text/plain": [ 166 | " Rental Id Duration Bike Id End Date EndStation Id \\\n", 167 | "0 105401285 3360 17497 15/02/2021 20:55 785 \n", 168 | "1 105322226 1020 4677 10/02/2021 08:03 194 \n", 169 | "2 105351846 480 18046 12/02/2021 15:26 27 \n", 170 | "3 105324229 180 19785 10/02/2021 10:46 195 \n", 171 | "4 105350696 720 14243 12/02/2021 14:17 274 \n", 172 | "\n", 173 | " EndStation Name Start Date \\\n", 174 | "0 Aquatic Centre, Queen Elizabeth Olympic Park 15/02/2021 19:59 \n", 175 | "1 Hop Exchange, The Borough 10/02/2021 07:46 \n", 176 | "2 Bouverie Street, Temple 12/02/2021 15:18 \n", 177 | "3 Milroy Walk, South Bank 10/02/2021 10:43 \n", 178 | "4 Warwick Road, Olympia 12/02/2021 14:05 \n", 179 | "\n", 180 | " StartStation Id StartStation Name \n", 181 | "0 785 Aquatic Centre, Queen Elizabeth Olympic Park \n", 182 | "1 14 Belgrove Street , King's Cross \n", 183 | "2 196 Union Street, The Borough \n", 184 | "3 196 Union Street, The Borough \n", 185 | "4 219 Bramham Gardens, Earl's Court " 186 | ] 187 | }, 188 | "execution_count": 12, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "df= pd.read_csv('journey10Feb2021-16Feb2021.csv')\n", 195 | "df.head()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 13, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "data": { 205 | "text/plain": [ 206 | "(89405, 9)" 207 | ] 208 | }, 209 | "execution_count": 13, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "df.shape" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 14, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "CREATE TABLE \"journey_staging\" (\n", 228 | "\"Rental Id\" INTEGER,\n", 229 | " \"Duration\" INTEGER,\n", 230 | " \"Bike Id\" INTEGER,\n", 231 | " \"End Date\" TEXT,\n", 232 | " \"EndStation Id\" INTEGER,\n", 233 | " \"EndStation Name\" TEXT,\n", 234 | " \"Start Date\" TEXT,\n", 235 | " \"StartStation Id\" INTEGER,\n", 236 | " \"StartStation Name\" TEXT,\n", 237 | " CONSTRAINT journey_staging_pk PRIMARY KEY (\"Rental Id\")\n", 238 | ")\n" 239 | ] 240 | }, 241 | { 242 | "name": "stderr", 243 | "output_type": "stream", 244 | "text": [ 245 | "/tmp/ipykernel_234257/265231394.py:2: UserWarning: The spaces in these column names will not be changed. In pandas versions < 0.14, spaces were converted to underscores.\n", 246 | " journey_table= pd.io.sql.get_schema(frame=df, name='journey_staging', keys='Rental Id')\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "# infer a sql table schema for journey data\n", 252 | "journey_table= pd.io.sql.get_schema(frame=df, name='journey_staging', keys='Rental Id')\n", 253 | "print(journey_table)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "### Docking stations" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 15, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "--2022-02-21 09:41:08-- https://www.whatdotheyknow.com/request/664717/response/1572474/attach/3/Cycle%20hire%20docking%20stations.csv.txt\n", 273 | "Resolving www.whatdotheyknow.com (www.whatdotheyknow.com)... 46.43.39.108\n", 274 | "Connecting to www.whatdotheyknow.com (www.whatdotheyknow.com)|46.43.39.108|:443... 
connected.\n", 275 | "HTTP request sent, awaiting response... 200 OK\n", 276 | "Length: unspecified [text/plain]\n", 277 | "Saving to: ‘stations.csv’\n", 278 | "\n", 279 | "stations.csv [ <=> ] 57.09K 97.3KB/s in 0.6s \n", 280 | "\n", 281 | "2022-02-21 09:41:11 (97.3 KB/s) - ‘stations.csv’ saved [58461]\n", 282 | "\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "!wget https://www.whatdotheyknow.com/request/664717/response/1572474/attach/3/Cycle%20hire%20docking%20stations.csv.txt -O stations.csv" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 16, 293 | "metadata": {}, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "text/html": [ 298 | "
\n", 299 | "\n", 312 | "\n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | "
Station.IdStationNamelongitudelatitudeEastingNorthing
01River Street, Clerkenwell-0.10997151.5292531202.520182832.020
12Phillimore Gardens, Kensington-0.19757451.4996525207.070179391.860
23Christopher Street, Liverpool Street-0.08460651.5213532984.810182001.530
34St. Chad's Street, King's Cross-0.12097451.5301530436.760182911.990
45Sedding Street, Sloane Square-0.15687651.4931528051.649178742.097
\n", 372 | "
" 373 | ], 374 | "text/plain": [ 375 | " Station.Id StationName longitude latitude \\\n", 376 | "0 1 River Street, Clerkenwell -0.109971 51.5292 \n", 377 | "1 2 Phillimore Gardens, Kensington -0.197574 51.4996 \n", 378 | "2 3 Christopher Street, Liverpool Street -0.084606 51.5213 \n", 379 | "3 4 St. Chad's Street, King's Cross -0.120974 51.5301 \n", 380 | "4 5 Sedding Street, Sloane Square -0.156876 51.4931 \n", 381 | "\n", 382 | " Easting Northing \n", 383 | "0 531202.520 182832.020 \n", 384 | "1 525207.070 179391.860 \n", 385 | "2 532984.810 182001.530 \n", 386 | "3 530436.760 182911.990 \n", 387 | "4 528051.649 178742.097 " 388 | ] 389 | }, 390 | "execution_count": 16, 391 | "metadata": {}, 392 | "output_type": "execute_result" 393 | } 394 | ], 395 | "source": [ 396 | "df_stations= pd.read_csv('stations.csv')\n", 397 | "df_stations.head()" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 19, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "CREATE TABLE \"stations_staging\" (\n", 410 | "\"Station.Id\" INTEGER,\n", 411 | " \"StationName\" TEXT,\n", 412 | " \"longitude\" REAL,\n", 413 | " \"latitude\" REAL,\n", 414 | " \"Easting\" REAL,\n", 415 | " \"Northing\" REAL,\n", 416 | " CONSTRAINT stations_staging_pk PRIMARY KEY (\"Station.Id\")\n", 417 | ")\n" 418 | ] 419 | } 420 | ], 421 | "source": [ 422 | "# infer a sql table schema for stations data\n", 423 | "stations_table= pd.io.sql.get_schema(frame=df_stations, name='stations_staging', keys='Station.Id')\n", 424 | "print(stations_table)" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "### Historical weather data in 2021" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 20, 437 | "metadata": {}, 438 | "outputs": [ 439 | { 440 | "name": "stdout", 441 | "output_type": "stream", 442 | "text": [ 443 | "--2022-02-21 09:42:22-- https://docs.google.com/uc?export=download&id=1Aa2mP5CwLele94GkJWqvpCmlm6GXeu8c\n", 444 | "Resolving docs.google.com (docs.google.com)... 216.58.223.78, 2a00:1450:401a:804::200e\n", 445 | "Connecting to docs.google.com (docs.google.com)|216.58.223.78|:443... connected.\n", 446 | "HTTP request sent, awaiting response... 303 See Other\n", 447 | "Location: https://doc-0s-2g-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/oh5pmqielfjkamj3j5h4htd9undhtio4/1645425675000/00305885236840532660/*/1Aa2mP5CwLele94GkJWqvpCmlm6GXeu8c?e=download [following]\n", 448 | "Warning: wildcards not supported in HTTP.\n", 449 | "--2022-02-21 09:42:25-- https://doc-0s-2g-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/oh5pmqielfjkamj3j5h4htd9undhtio4/1645425675000/00305885236840532660/*/1Aa2mP5CwLele94GkJWqvpCmlm6GXeu8c?e=download\n", 450 | "Resolving doc-0s-2g-docs.googleusercontent.com (doc-0s-2g-docs.googleusercontent.com)... 172.217.170.161, 2a00:1450:401a:800::2001\n", 451 | "Connecting to doc-0s-2g-docs.googleusercontent.com (doc-0s-2g-docs.googleusercontent.com)|172.217.170.161|:443... connected.\n", 452 | "HTTP request sent, awaiting response... 
200 OK\n", 453 | "Length: 379443 (371K) [application/json]\n", 454 | "Saving to: ‘weather-2021.json’\n", 455 | "\n", 456 | "weather-2021.json 100%[===================>] 370.55K 507KB/s in 0.7s \n", 457 | "\n", 458 | "2022-02-21 09:42:27 (507 KB/s) - ‘weather-2021.json’ saved [379443/379443]\n", 459 | "\n" 460 | ] 461 | } 462 | ], 463 | "source": [ 464 | "!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=13LWAH93xxEvOukCnPhrfXH7rZZq_-mss' -O weather-2021.json" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 21, 470 | "metadata": {}, 471 | "outputs": [ 472 | { 473 | "name": "stdout", 474 | "output_type": "stream", 475 | "text": [ 476 | "{\n", 477 | " \"latitude\" : 51.5064,\n", 478 | " \"longitude\" : -0.12721,\n", 479 | " \"resolvedAddress\" : \"London, England, United Kingdom\",\n", 480 | " \"address\" : \"London,UK\",\n", 481 | " \"timezone\" : \"Europe/London\",\n", 482 | " \"tzoffset\" : 0.0,\n", 483 | " \"name\" : \"London,UK\",\n", 484 | " \"days\" : [ {\n", 485 | " \"datetime\" : \"2021-01-01\",\n", 486 | " \"datetimeEpoch\" : 1609459200,\n", 487 | " \"tempmax\" : 5.0,\n", 488 | " \"tempmin\" : -0.5,\n", 489 | " \"temp\" : 2.1,\n", 490 | " \"feelslikemax\" : 2.9,\n", 491 | " \"feelslikemin\" : -3.6,\n", 492 | " \"feelslike\" : -0.2,\n", 493 | " \"dew\" : 0.8,\n", 494 | " \"humidity\" : 91.03,\n", 495 | " \"precip\" : 0.22,\n" 496 | ] 497 | } 498 | ], 499 | "source": [ 500 | "!head -n 20 weather-2021.json" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 22, 506 | "metadata": {}, 507 | "outputs": [ 508 | { 509 | "data": { 510 | "text/html": [ 511 | "
\n", 512 | "\n", 525 | "\n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | "
datetimedatetimeEpochtempmaxtempmintempfeelslikemaxfeelslikeminfeelslikedewhumidity...sunriseEpochsunsetsunsetEpochmoonphaseconditionsdescriptioniconstationssourcetzoffset
02021-01-0116094592005.0-0.52.12.9-3.6-0.20.891.03...160948837416:02:2216095169420.53RainClear conditions throughout the day with late ...rain[03769099999, 03680099999, D5621, 03672099999,...obsNaN
12021-01-0216095456005.11.53.83.1-1.51.51.082.51...160957476516:03:2816096034080.56RainClear conditions throughout the day with rain.rain[03680099999, D5621, 03672099999, 03781099999,...obsNaN
22021-01-0316096320006.01.13.85.6-2.50.91.786.02...160966115416:04:3616096898760.60RainClear conditions throughout the day with rain.rain[03680099999, D5621, 03672099999, 03781099999,...obsNaN
32021-01-0416097184005.63.54.34.1-0.70.51.481.43...160974753816:05:4616097763460.65RainClear conditions throughout the day with rain.rain[03680099999, D5621, 03672099999, 03781099999,...obsNaN
42021-01-0516098048004.62.53.70.8-1.8-0.41.082.39...160983392016:06:5916098628190.70RainClear conditions throughout the day with rain.rain[03680099999, D5621, 03672099999, 03781099999,...obsNaN
\n", 675 | "

5 rows × 36 columns

\n", 676 | "
" 677 | ], 678 | "text/plain": [ 679 | " datetime datetimeEpoch tempmax tempmin temp feelslikemax \\\n", 680 | "0 2021-01-01 1609459200 5.0 -0.5 2.1 2.9 \n", 681 | "1 2021-01-02 1609545600 5.1 1.5 3.8 3.1 \n", 682 | "2 2021-01-03 1609632000 6.0 1.1 3.8 5.6 \n", 683 | "3 2021-01-04 1609718400 5.6 3.5 4.3 4.1 \n", 684 | "4 2021-01-05 1609804800 4.6 2.5 3.7 0.8 \n", 685 | "\n", 686 | " feelslikemin feelslike dew humidity ... sunriseEpoch sunset \\\n", 687 | "0 -3.6 -0.2 0.8 91.03 ... 1609488374 16:02:22 \n", 688 | "1 -1.5 1.5 1.0 82.51 ... 1609574765 16:03:28 \n", 689 | "2 -2.5 0.9 1.7 86.02 ... 1609661154 16:04:36 \n", 690 | "3 -0.7 0.5 1.4 81.43 ... 1609747538 16:05:46 \n", 691 | "4 -1.8 -0.4 1.0 82.39 ... 1609833920 16:06:59 \n", 692 | "\n", 693 | " sunsetEpoch moonphase conditions \\\n", 694 | "0 1609516942 0.53 Rain \n", 695 | "1 1609603408 0.56 Rain \n", 696 | "2 1609689876 0.60 Rain \n", 697 | "3 1609776346 0.65 Rain \n", 698 | "4 1609862819 0.70 Rain \n", 699 | "\n", 700 | " description icon \\\n", 701 | "0 Clear conditions throughout the day with late ... rain \n", 702 | "1 Clear conditions throughout the day with rain. rain \n", 703 | "2 Clear conditions throughout the day with rain. rain \n", 704 | "3 Clear conditions throughout the day with rain. rain \n", 705 | "4 Clear conditions throughout the day with rain. rain \n", 706 | "\n", 707 | " stations source tzoffset \n", 708 | "0 [03769099999, 03680099999, D5621, 03672099999,... obs NaN \n", 709 | "1 [03680099999, D5621, 03672099999, 03781099999,... obs NaN \n", 710 | "2 [03680099999, D5621, 03672099999, 03781099999,... obs NaN \n", 711 | "3 [03680099999, D5621, 03672099999, 03781099999,... obs NaN \n", 712 | "4 [03680099999, D5621, 03672099999, 03781099999,... obs NaN \n", 713 | "\n", 714 | "[5 rows x 36 columns]" 715 | ] 716 | }, 717 | "execution_count": 22, 718 | "metadata": {}, 719 | "output_type": "execute_result" 720 | } 721 | ], 722 | "source": [ 723 | "# we will only extract the day items\n", 724 | "with open('weather-2021.json', 'r') as f:\n", 725 | " weather = json.load(f)\n", 726 | "\n", 727 | "df_weather = pd.DataFrame.from_dict(weather[\"days\"])\n", 728 | "df_weather.head()" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": 24, 734 | "metadata": {}, 735 | "outputs": [ 736 | { 737 | "name": "stdout", 738 | "output_type": "stream", 739 | "text": [ 740 | "\n", 741 | "RangeIndex: 365 entries, 0 to 364\n", 742 | "Data columns (total 36 columns):\n", 743 | " # Column Non-Null Count Dtype \n", 744 | "--- ------ -------------- ----- \n", 745 | " 0 datetime 365 non-null object \n", 746 | " 1 datetimeEpoch 365 non-null int64 \n", 747 | " 2 tempmax 365 non-null float64\n", 748 | " 3 tempmin 365 non-null float64\n", 749 | " 4 temp 365 non-null float64\n", 750 | " 5 feelslikemax 365 non-null float64\n", 751 | " 6 feelslikemin 365 non-null float64\n", 752 | " 7 feelslike 365 non-null float64\n", 753 | " 8 dew 365 non-null float64\n", 754 | " 9 humidity 365 non-null float64\n", 755 | " 10 precip 365 non-null float64\n", 756 | " 11 precipprob 0 non-null object \n", 757 | " 12 precipcover 365 non-null float64\n", 758 | " 13 preciptype 0 non-null object \n", 759 | " 14 snow 0 non-null object \n", 760 | " 15 snowdepth 9 non-null float64\n", 761 | " 16 windgust 139 non-null float64\n", 762 | " 17 windspeed 365 non-null float64\n", 763 | " 18 winddir 365 non-null float64\n", 764 | " 19 pressure 364 non-null float64\n", 765 | " 20 cloudcover 365 non-null float64\n", 766 | " 21 visibility 365 non-null float64\n", 
767 | " 22 solarradiation 365 non-null float64\n", 768 | " 23 solarenergy 365 non-null float64\n", 769 | " 24 uvindex 365 non-null float64\n", 770 | " 25 sunrise 365 non-null object \n", 771 | " 26 sunriseEpoch 365 non-null int64 \n", 772 | " 27 sunset 365 non-null object \n", 773 | " 28 sunsetEpoch 365 non-null int64 \n", 774 | " 29 moonphase 365 non-null float64\n", 775 | " 30 conditions 365 non-null object \n", 776 | " 31 description 365 non-null object \n", 777 | " 32 icon 365 non-null object \n", 778 | " 33 stations 365 non-null object \n", 779 | " 34 source 365 non-null object \n", 780 | " 35 tzoffset 217 non-null float64\n", 781 | "dtypes: float64(22), int64(3), object(11)\n", 782 | "memory usage: 102.8+ KB\n" 783 | ] 784 | } 785 | ], 786 | "source": [ 787 | "df_weather.info()" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 25, 793 | "metadata": {}, 794 | "outputs": [ 795 | { 796 | "name": "stdout", 797 | "output_type": "stream", 798 | "text": [ 799 | "Columns: Index(['datetime', 'datetimeEpoch', 'tempmax', 'tempmin', 'temp',\n", 800 | " 'feelslikemax', 'feelslikemin', 'feelslike', 'dew', 'humidity',\n", 801 | " 'precip', 'precipprob', 'precipcover', 'preciptype', 'snow',\n", 802 | " 'snowdepth', 'windgust', 'windspeed', 'winddir', 'pressure',\n", 803 | " 'cloudcover', 'visibility', 'solarradiation', 'solarenergy', 'uvindex',\n", 804 | " 'sunrise', 'sunriseEpoch', 'sunset', 'sunsetEpoch', 'moonphase',\n", 805 | " 'conditions', 'description', 'icon', 'stations', 'source', 'tzoffset'],\n", 806 | " dtype='object') \n", 807 | "Shape: (365, 36)\n" 808 | ] 809 | } 810 | ], 811 | "source": [ 812 | "print('Columns: ', df_weather.columns, '\\nShape: ', df_weather.shape)" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": 26, 818 | "metadata": {}, 819 | "outputs": [ 820 | { 821 | "name": "stdout", 822 | "output_type": "stream", 823 | "text": [ 824 | "CREATE TABLE \"weather_staging\" (\n", 825 | "\"datetime\" TEXT,\n", 826 | " \"datetimeEpoch\" INTEGER,\n", 827 | " \"tempmax\" REAL,\n", 828 | " \"tempmin\" REAL,\n", 829 | " \"temp\" REAL,\n", 830 | " \"feelslikemax\" REAL,\n", 831 | " \"feelslikemin\" REAL,\n", 832 | " \"feelslike\" REAL,\n", 833 | " \"dew\" REAL,\n", 834 | " \"humidity\" REAL,\n", 835 | " \"precip\" REAL,\n", 836 | " \"precipprob\" TEXT,\n", 837 | " \"precipcover\" REAL,\n", 838 | " \"preciptype\" TEXT,\n", 839 | " \"snow\" TEXT,\n", 840 | " \"snowdepth\" REAL,\n", 841 | " \"windgust\" REAL,\n", 842 | " \"windspeed\" REAL,\n", 843 | " \"winddir\" REAL,\n", 844 | " \"pressure\" REAL,\n", 845 | " \"cloudcover\" REAL,\n", 846 | " \"visibility\" REAL,\n", 847 | " \"solarradiation\" REAL,\n", 848 | " \"solarenergy\" REAL,\n", 849 | " \"uvindex\" REAL,\n", 850 | " \"sunrise\" TEXT,\n", 851 | " \"sunriseEpoch\" INTEGER,\n", 852 | " \"sunset\" TEXT,\n", 853 | " \"sunsetEpoch\" INTEGER,\n", 854 | " \"moonphase\" REAL,\n", 855 | " \"conditions\" TEXT,\n", 856 | " \"description\" TEXT,\n", 857 | " \"icon\" TEXT,\n", 858 | " \"stations\" TEXT,\n", 859 | " \"source\" TEXT,\n", 860 | " \"tzoffset\" REAL,\n", 861 | " CONSTRAINT weather_staging_pk PRIMARY KEY (\"datetime\")\n", 862 | ")\n" 863 | ] 864 | } 865 | ], 866 | "source": [ 867 | "# infer a sql table schema for weather data\n", 868 | "weather_table= pd.io.sql.get_schema(frame=df_weather, name='weather_staging', keys='datetime')\n", 869 | "print(weather_table)" 870 | ] 871 | } 872 | ], 873 | "metadata": { 874 | "interpreter": { 875 | "hash": 
"916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" 876 | }, 877 | "kernelspec": { 878 | "display_name": "Python 3.8.10 64-bit", 879 | "language": "python", 880 | "name": "python3" 881 | }, 882 | "language_info": { 883 | "codemirror_mode": { 884 | "name": "ipython", 885 | "version": 3 886 | }, 887 | "file_extension": ".py", 888 | "mimetype": "text/x-python", 889 | "name": "python", 890 | "nbconvert_exporter": "python", 891 | "pygments_lexer": "ipython3", 892 | "version": "3.8.10" 893 | }, 894 | "orig_nbformat": 4 895 | }, 896 | "nbformat": 4, 897 | "nbformat_minor": 2 898 | } 899 | -------------------------------------------------------------------------------- /notebook/data-exploration/Scraping.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Web Scrapping\n", 8 | "Scraping the TFL website: (https://cycling.data.tfl.gov.uk) in order to get the links of the files we are interested in." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 7, 14 | "metadata": {}, 15 | "outputs": [ 16 | { 17 | "name": "stdout", 18 | "output_type": "stream", 19 | "text": [ 20 | "/usr/lib/python3/dist-packages/secretstorage/dhcrypto.py:15: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", 21 | " from cryptography.utils import int_from_bytes\n", 22 | "/usr/lib/python3/dist-packages/secretstorage/util.py:19: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", 23 | " from cryptography.utils import int_from_bytes\n", 24 | "Defaulting to user installation because normal site-packages is not writeable\n", 25 | "Requirement already satisfied: selenium in /home/hrc/.local/lib/python3.8/site-packages (4.1.0)\n", 26 | "Requirement already satisfied: trio-websocket~=0.9 in /home/hrc/.local/lib/python3.8/site-packages (from selenium) (0.9.2)\n", 27 | "Requirement already satisfied: trio~=0.17 in /home/hrc/.local/lib/python3.8/site-packages (from selenium) (0.19.0)\n", 28 | "Requirement already satisfied: urllib3[secure]~=1.26 in /home/hrc/.local/lib/python3.8/site-packages (from selenium) (1.26.7)\n", 29 | "Requirement already satisfied: async-generator>=1.9 in /home/hrc/.local/lib/python3.8/site-packages (from trio~=0.17->selenium) (1.10)\n", 30 | "Requirement already satisfied: sniffio in /home/hrc/.local/lib/python3.8/site-packages (from trio~=0.17->selenium) (1.2.0)\n", 31 | "Requirement already satisfied: outcome in /home/hrc/.local/lib/python3.8/site-packages (from trio~=0.17->selenium) (1.1.0)\n", 32 | "Requirement already satisfied: idna in /home/hrc/.local/lib/python3.8/site-packages (from trio~=0.17->selenium) (3.3)\n", 33 | "Requirement already satisfied: attrs>=19.2.0 in /home/hrc/.local/lib/python3.8/site-packages (from trio~=0.17->selenium) (20.3.0)\n", 34 | "Requirement already satisfied: sortedcontainers in /home/hrc/.local/lib/python3.8/site-packages (from trio~=0.17->selenium) (2.4.0)\n", 35 | "Requirement already satisfied: wsproto>=0.14 in /home/hrc/.local/lib/python3.8/site-packages (from trio-websocket~=0.9->selenium) (1.0.0)\n", 36 | "Requirement already satisfied: cryptography>=1.3.4 in /home/hrc/.local/lib/python3.8/site-packages (from urllib3[secure]~=1.26->selenium) (36.0.1)\n", 37 | "Requirement already satisfied: certifi in /home/hrc/.local/lib/python3.8/site-packages (from urllib3[secure]~=1.26->selenium) (2021.10.8)\n", 38 | "Requirement 
already satisfied: pyOpenSSL>=0.14 in /home/hrc/.local/lib/python3.8/site-packages (from urllib3[secure]~=1.26->selenium) (22.0.0)\n", 39 | "Requirement already satisfied: cffi>=1.12 in /home/hrc/.local/lib/python3.8/site-packages (from cryptography>=1.3.4->urllib3[secure]~=1.26->selenium) (1.15.0)\n", 40 | "Requirement already satisfied: h11<1,>=0.9.0 in /home/hrc/.local/lib/python3.8/site-packages (from wsproto>=0.14->trio-websocket~=0.9->selenium) (0.12.0)\n", 41 | "Requirement already satisfied: pycparser in /home/hrc/.local/lib/python3.8/site-packages (from cffi>=1.12->cryptography>=1.3.4->urllib3[secure]~=1.26->selenium) (2.21)\n", 42 | "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.3 is available.\n", 43 | "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n", 44 | "/usr/lib/python3/dist-packages/secretstorage/dhcrypto.py:15: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", 45 | " from cryptography.utils import int_from_bytes\n", 46 | "/usr/lib/python3/dist-packages/secretstorage/util.py:19: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", 47 | " from cryptography.utils import int_from_bytes\n", 48 | "Defaulting to user installation because normal site-packages is not writeable\n", 49 | "Requirement already satisfied: bs4 in /home/hrc/.local/lib/python3.8/site-packages (0.0.1)\n", 50 | "Requirement already satisfied: beautifulsoup4 in /home/hrc/.local/lib/python3.8/site-packages (from bs4) (4.10.0)\n", 51 | "Requirement already satisfied: soupsieve>1.2 in /home/hrc/.local/lib/python3.8/site-packages (from beautifulsoup4->bs4) (2.3.1)\n", 52 | "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.3 is available.\n", 53 | "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "# install dependencies\n", 59 | "!pip install selenium\n", 60 | "!pip install bs4" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 8, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# imports\n", 70 | "from bs4 import BeautifulSoup\n", 71 | "\n", 72 | "# selenium will be used to scrap dynamic content of the webpage source of our data\n", 73 | "from selenium import webdriver\n", 74 | "from webdriver_manager.firefox import GeckoDriverManager\n", 75 | "from selenium.webdriver.firefox.options import Options as FirefoxOptions\n", 76 | "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n", 77 | "\n", 78 | "from selenium.webdriver.common.by import By\n", 79 | "from selenium.webdriver.support.ui import WebDriverWait\n", 80 | "from selenium.webdriver.support import expected_conditions as EC" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 9, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stderr", 90 | "output_type": "stream", 91 | "text": [ 92 | "\n", 93 | "\n", 94 | "====== WebDriver manager ======\n", 95 | "Current firefox version is 96.0\n", 96 | "Get LATEST geckodriver version for 96.0 firefox\n", 97 | "Getting latest mozilla release info for v0.30.0\n", 98 | "Trying to download new driver from https://github.com/mozilla/geckodriver/releases/download/v0.30.0/geckodriver-v0.30.0-linux64.tar.gz\n", 99 | "Driver has been saved in cache [/home/hrc/.wdm/drivers/geckodriver/linux64/v0.30.0]\n", 100 | 
"/tmp/ipykernel_373988/1338734867.py:11: DeprecationWarning: executable_path has been deprecated, please pass in a Service object\n", 101 | " browser = webdriver.Firefox(capabilities=cap, executable_path=GeckoDriverManager().install(), options=options)\n", 102 | "/tmp/ipykernel_373988/1338734867.py:11: DeprecationWarning: capabilities and desired_capabilities have been deprecated, please pass in a Service object\n", 103 | " browser = webdriver.Firefox(capabilities=cap, executable_path=GeckoDriverManager().install(), options=options)\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "# get the webpage contents in html format\n", 109 | "\n", 110 | "url= \"https://cycling.data.tfl.gov.uk\"\n", 111 | "\n", 112 | "cap = DesiredCapabilities().FIREFOX\n", 113 | "cap[\"marionette\"] = False\n", 114 | "\n", 115 | "options = FirefoxOptions()\n", 116 | "options.add_argument(\"--headless\")\n", 117 | "\n", 118 | "browser = webdriver.Firefox(capabilities=cap, executable_path=GeckoDriverManager().install(), options=options)\n", 119 | "browser.get(url)\n", 120 | "\n", 121 | "# wait until at least a single element of the table exists\n", 122 | "wait = WebDriverWait(browser, 20)\n", 123 | "html = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/table/tbody/tr[1]/td[1]')))\n", 124 | "\n", 125 | "html_element= browser.page_source\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 10, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# scrap the html contents\n", 135 | "bsoup= BeautifulSoup(html_element, \"html.parser\")" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 11, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "Display 2 items in the dictionary\n", 148 | "{'05Jan2021': 'https://cycling.data.tfl.gov.uk/usage-stats/246JourneyDataExtract30Dec2020-05Jan2021.csv', '12Jan2021': 'https://cycling.data.tfl.gov.uk/usage-stats/247JourneyDataExtract06Jan2021-12Jan2021.csv'}\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "# find the relevant files with their links\n", 154 | "table= bsoup.find('table')\n", 155 | "tbody= table.find('tbody')\n", 156 | "folder_name= \"usage-stats/\"\n", 157 | "capture_files= False\n", 158 | "year= 2021\n", 159 | "filetype= 'csv'\n", 160 | "extracted_files= {}\n", 161 | "\n", 162 | "for row in tbody.find_all('tr'):\n", 163 | " columns= row.find_all('td')\n", 164 | "\n", 165 | " if capture_files == False:\n", 166 | " col_values= [col.text.strip() for col in columns]\n", 167 | "\n", 168 | " if col_values[0] == folder_name:\n", 169 | " capture_files= True\n", 170 | " continue\n", 171 | "\n", 172 | " else:\n", 173 | " col= columns[0]\n", 174 | " filename= col.text.strip()\n", 175 | " \n", 176 | " if not filename.endswith(f'{year}.{filetype}'):\n", 177 | " continue\n", 178 | " \n", 179 | " # extract the date\n", 180 | " filename_without_extension= filename.replace(f'.{filetype}', '') \n", 181 | " filename_last_date= filename_without_extension.split('-')[-1]\n", 182 | " extracted_files[filename_last_date]= col.a['href']\n", 183 | "\n", 184 | "print('Display 2 items in the dictionary')\n", 185 | "print(dict(list(extracted_files.items())[0:2]))" 186 | ] 187 | } 188 | ], 189 | "metadata": { 190 | "interpreter": { 191 | "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" 192 | }, 193 | "kernelspec": { 194 | "display_name": "Python 3.8.10 64-bit", 195 | "language": "python", 196 | "name": 
"python3" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 3 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython3", 208 | "version": "3.8.10" 209 | }, 210 | "orig_nbformat": 4 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 2 214 | } 215 | -------------------------------------------------------------------------------- /notebook/data-transformation/experiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d20b3aba", 6 | "metadata": {}, 7 | "source": [ 8 | "## One time data transformation\n", 9 | "In this notebook, we are going to transform the stations and weather data in such a way that they will be conformed to the redshift schema for their corresponding tables.\n", 10 | "\n", 11 | "The preprocessed data will be saved back to S3 before getting loaded to Redshift." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "id": "d18f0fe2", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import pyspark\n", 22 | "import os" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "fc380d3d", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "'3.2.1'" 35 | ] 36 | }, 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "pyspark.__version__" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "id": "42c52993", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from pyspark.sql import SparkSession" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "id": "3f382bf9", 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stderr", 64 | "output_type": "stream", 65 | "text": [ 66 | "WARNING: An illegal reflective access operation has occurred\n", 67 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/hrc/Documents/de-aws/data-venv/lib/python3.8/site-packages/pyspark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 68 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 69 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 70 | "WARNING: All illegal access operations will be denied in a future release\n", 71 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 72 | "Setting default log level to \"WARN\".\n", 73 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 74 | "22/03/05 23:56:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "spark = SparkSession.builder \\\n", 80 | " .master('local[*]') \\\n", 81 | " .appName('data-transformer') \\\n", 82 | " .config(\"spark.hadoop.fs.s3a.access.key\", os.environ.get('AWS_ACCESS_KEY'))\\\n", 83 | " .config(\"spark.hadoop.fs.s3a.secret.key\", os.environ.get('AWS_SECRET_ACCESS_KEY'))\\\n", 84 | " .getOrCreate()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 5, 90 | "id": "388ae2f2", 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "'3.3.1'" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "sc = spark.sparkContext\n", 106 | "sc._jvm.org.apache.hadoop.util.VersionInfo.getVersion()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "50ba85e8", 112 | "metadata": {}, 113 | "source": [ 114 | "### 1. Stations data" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "id": "ae6e918b", 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stderr", 125 | "output_type": "stream", 126 | "text": [ 127 | "22/03/01 18:05:51 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n", 128 | " \r" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "df_stations = spark.read.csv(\"s3a://hrc-de-data/raw/cycling-extras/stations.csv\", inferSchema=True, header=True)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 7, 139 | "id": "0ceaf5ce", 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "[Row(Station.Id=1, StationName='River Street, Clerkenwell', longitude=-0.109971, latitude=51.5292, Easting=531202.52, Northing=182832.02),\n", 146 | " Row(Station.Id=2, StationName='Phillimore Gardens, Kensington', longitude=-0.197574, latitude=51.4996, Easting=525207.07, Northing=179391.86)]" 147 | ] 148 | }, 149 | "execution_count": 7, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "df_stations.take(2)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 8, 161 | "id": "4533a36e", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "root\n", 169 | " |-- Station.Id: integer (nullable = true)\n", 170 | " |-- StationName: string (nullable = true)\n", 171 | " |-- longitude: double (nullable = true)\n", 172 | " |-- latitude: double (nullable = true)\n", 173 | " |-- Easting: double (nullable = true)\n", 174 | " |-- Northing: double (nullable = true)\n", 175 | "\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "df_stations.printSchema()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 11, 186 | "id": "766eb9a2", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "from pyspark.sql import functions as F, types as T" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 10, 196 | "id": "6cbf6baf", 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "# rename columns\n", 201 | "stations= df_stations.withColumnRenamed('Station.Id', 'station_id') \\\n", 202 | " .withColumnRenamed('StationName', 'station_name') \\\n", 203 | " .withColumnRenamed('easting', 'easting') \\\n", 204 | " .withColumnRenamed('northing', 'northing') " 205 | ] 206 | }, 207 | { 
208 | "cell_type": "code", 209 | "execution_count": 11, 210 | "id": "7d9360c5", 211 | "metadata": { 212 | "scrolled": false 213 | }, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "+----------+--------------------+----------+--------+----------+----------+\n", 220 | "|station_id| station_name| longitude|latitude| easting| northing|\n", 221 | "+----------+--------------------+----------+--------+----------+----------+\n", 222 | "| 1|River Street, Cle...| -0.109971| 51.5292| 531202.52| 182832.02|\n", 223 | "| 2|Phillimore Garden...| -0.197574| 51.4996| 525207.07| 179391.86|\n", 224 | "| 3|Christopher Stree...|-0.0846057| 51.5213| 532984.81| 182001.53|\n", 225 | "| 4|St. Chad's Street...| -0.120974| 51.5301| 530436.76| 182911.99|\n", 226 | "| 5|Sedding Street, S...| -0.156876| 51.4931|528051.649|178742.097|\n", 227 | "+----------+--------------------+----------+--------+----------+----------+\n", 228 | "only showing top 5 rows\n", 229 | "\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "stations.show(5)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 12, 240 | "id": "88468f5d", 241 | "metadata": { 242 | "scrolled": true 243 | }, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "+----------+------------+---------+--------+-------+--------+\n", 250 | "|station_id|station_name|longitude|latitude|easting|northing|\n", 251 | "+----------+------------+---------+--------+-------+--------+\n", 252 | "| 0| 0| 0| 0| 0| 0|\n", 253 | "+----------+------------+---------+--------+-------+--------+\n", 254 | "\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "# count missing values in each column\n", 260 | "stations.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in stations.columns]\n", 261 | " ).show()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 17, 267 | "id": "d0ee0f23", 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stderr", 272 | "output_type": "stream", 273 | "text": [ 274 | " \r" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "stations.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/stations/', mode='overwrite')" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "id": "ef3f5de0", 285 | "metadata": {}, 286 | "source": [ 287 | "### 2. Weather data" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 6, 293 | "id": "c41101c2", 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stderr", 298 | "output_type": "stream", 299 | "text": [ 300 | "22/03/05 23:57:03 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n", 301 | " \r" 302 | ] 303 | } 304 | ], 305 | "source": [ 306 | "df_weather = spark.read.json(\"s3a://hrc-de-data/raw/cycling-extras/weather.json\")" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 7, 312 | "id": "3203cde3", 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "name": "stderr", 317 | "output_type": "stream", 318 | "text": [ 319 | "22/03/05 23:57:19 WARN package: Truncated the string representation of a plan since it was too large. 
This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n", 320 | " \r" 321 | ] 322 | }, 323 | { 324 | "data": { 325 | "text/plain": [ 326 | "[Row(cloudcover=0.5, conditions='Rain', datetime='2021-01-01', datetimeEpoch=1609459200, description='Clear conditions throughout the day with late afternoon rain.', dew=0.8, feelslike=-0.2, feelslikemax=2.9, feelslikemin=-3.6, humidity=91.03, icon='rain', moonphase=0.53, precip=0.22, precipcover=4.17, precipprob=None, preciptype=None, pressure=1011.6, severerisk=None, snow=None, snowdepth=None, solarenergy=0.8, solarradiation=29.4, source='obs', stations=['03769099999', '03680099999', 'D5621', '03672099999', '03781099999', '03772099999', '03770099999'], sunrise='08:06:14', sunriseEpoch=1609488374, sunset='16:02:22', sunsetEpoch=1609516942, temp=2.1, tempmax=5.0, tempmin=-0.5, tzoffset=None, uvindex=0.0, visibility=2.6, winddir=304.0, windgust=None, windspeed=6.6),\n", 327 | " Row(cloudcover=0.5, conditions='Rain', datetime='2021-01-02', datetimeEpoch=1609545600, description='Clear conditions throughout the day with rain.', dew=1.0, feelslike=1.5, feelslikemax=3.1, feelslikemin=-1.5, humidity=82.51, icon='rain', moonphase=0.56, precip=0.6, precipcover=8.33, precipprob=None, preciptype=None, pressure=1015.9, severerisk=None, snow=None, snowdepth=None, solarenergy=1.3, solarradiation=43.9, source='obs', stations=['03680099999', 'D5621', '03672099999', '03781099999', '03772099999', '03770099999'], sunrise='08:06:05', sunriseEpoch=1609574765, sunset='16:03:28', sunsetEpoch=1609603408, temp=3.8, tempmax=5.1, tempmin=1.5, tzoffset=None, uvindex=1.0, visibility=15.1, winddir=299.0, windgust=None, windspeed=7.8)]" 328 | ] 329 | }, 330 | "execution_count": 7, 331 | "metadata": {}, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [ 336 | "df_weather.take(2)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 8, 342 | "id": "5474298d", 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "root\n", 350 | " |-- cloudcover: double (nullable = true)\n", 351 | " |-- conditions: string (nullable = true)\n", 352 | " |-- datetime: string (nullable = true)\n", 353 | " |-- datetimeEpoch: long (nullable = true)\n", 354 | " |-- description: string (nullable = true)\n", 355 | " |-- dew: double (nullable = true)\n", 356 | " |-- feelslike: double (nullable = true)\n", 357 | " |-- feelslikemax: double (nullable = true)\n", 358 | " |-- feelslikemin: double (nullable = true)\n", 359 | " |-- humidity: double (nullable = true)\n", 360 | " |-- icon: string (nullable = true)\n", 361 | " |-- moonphase: double (nullable = true)\n", 362 | " |-- precip: double (nullable = true)\n", 363 | " |-- precipcover: double (nullable = true)\n", 364 | " |-- precipprob: double (nullable = true)\n", 365 | " |-- preciptype: array (nullable = true)\n", 366 | " | |-- element: string (containsNull = true)\n", 367 | " |-- pressure: double (nullable = true)\n", 368 | " |-- severerisk: double (nullable = true)\n", 369 | " |-- snow: double (nullable = true)\n", 370 | " |-- snowdepth: double (nullable = true)\n", 371 | " |-- solarenergy: double (nullable = true)\n", 372 | " |-- solarradiation: double (nullable = true)\n", 373 | " |-- source: string (nullable = true)\n", 374 | " |-- stations: array (nullable = true)\n", 375 | " | |-- element: string (containsNull = true)\n", 376 | " |-- sunrise: string (nullable = true)\n", 377 | " |-- sunriseEpoch: long (nullable = 
true)\n", 378 | " |-- sunset: string (nullable = true)\n", 379 | " |-- sunsetEpoch: long (nullable = true)\n", 380 | " |-- temp: double (nullable = true)\n", 381 | " |-- tempmax: double (nullable = true)\n", 382 | " |-- tempmin: double (nullable = true)\n", 383 | " |-- tzoffset: double (nullable = true)\n", 384 | " |-- uvindex: double (nullable = true)\n", 385 | " |-- visibility: double (nullable = true)\n", 386 | " |-- winddir: double (nullable = true)\n", 387 | " |-- windgust: double (nullable = true)\n", 388 | " |-- windspeed: double (nullable = true)\n", 389 | "\n" 390 | ] 391 | } 392 | ], 393 | "source": [ 394 | "df_weather.printSchema()" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 9, 400 | "id": "2e952690", 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "# drop some columns that we won't need\n", 405 | "weather= df_weather.drop('cloudcover', 'conditions', 'datetimeEpoch', 'description', 'dew', 'icon', \n", 406 | " 'precipcover', 'preciptype', 'source', 'stations', 'sunriseEpoch', 'sunsetEpoch')" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 12, 412 | "id": "22b9368f", 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "# transform datetime\n", 417 | "weather= weather.withColumnRenamed('datetime', 'weather_date') \n", 418 | "weather= weather.withColumn('weather_date', weather.weather_date.cast(T.DateType()))" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 13, 424 | "id": "b7d8f370", 425 | "metadata": { 426 | "scrolled": true 427 | }, 428 | "outputs": [ 429 | { 430 | "name": "stdout", 431 | "output_type": "stream", 432 | "text": [ 433 | "root\n", 434 | " |-- weather_date: date (nullable = true)\n", 435 | " |-- feelslike: double (nullable = true)\n", 436 | " |-- feelslikemax: double (nullable = true)\n", 437 | " |-- feelslikemin: double (nullable = true)\n", 438 | " |-- humidity: double (nullable = true)\n", 439 | " |-- moonphase: double (nullable = true)\n", 440 | " |-- precip: double (nullable = true)\n", 441 | " |-- precipprob: double (nullable = true)\n", 442 | " |-- pressure: double (nullable = true)\n", 443 | " |-- severerisk: double (nullable = true)\n", 444 | " |-- snow: double (nullable = true)\n", 445 | " |-- snowdepth: double (nullable = true)\n", 446 | " |-- solarenergy: double (nullable = true)\n", 447 | " |-- solarradiation: double (nullable = true)\n", 448 | " |-- sunrise: string (nullable = true)\n", 449 | " |-- sunset: string (nullable = true)\n", 450 | " |-- temp: double (nullable = true)\n", 451 | " |-- tempmax: double (nullable = true)\n", 452 | " |-- tempmin: double (nullable = true)\n", 453 | " |-- tzoffset: double (nullable = true)\n", 454 | " |-- uvindex: double (nullable = true)\n", 455 | " |-- visibility: double (nullable = true)\n", 456 | " |-- winddir: double (nullable = true)\n", 457 | " |-- windgust: double (nullable = true)\n", 458 | " |-- windspeed: double (nullable = true)\n", 459 | "\n", 460 | "25 columns\n" 461 | ] 462 | } 463 | ], 464 | "source": [ 465 | "weather.printSchema()\n", 466 | "print(len(weather.columns), 'columns')" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "id": "e065026c", 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 18, 480 | "id": "ac976766", 481 | "metadata": {}, 482 | "outputs": [ 483 | { 484 | "name": "stderr", 485 | "output_type": "stream", 486 | "text": [ 487 | "\r", 488 | 
"[Stage 11:> (0 + 1) / 1]\r" 489 | ] 490 | }, 491 | { 492 | "name": "stdout", 493 | "output_type": "stream", 494 | "text": [ 495 | "+------------+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+--------+--------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 496 | "|weather_date|feelslike|feelslikemax|feelslikemin|humidity|moonphase|precip|precipprob|pressure|severerisk|snow|snowdepth|solarenergy|solarradiation| sunrise| sunset|temp|tempmax|tempmin|tzoffset|uvindex|visibility|winddir|windgust|windspeed|\n", 497 | "+------------+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+--------+--------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 498 | "| 2021-01-01| -0.2| 2.9| -3.6| 91.03| 0.53| 0.22| null| 1011.6| null|null| null| 0.8| 29.4|08:06:14|16:02:22| 2.1| 5.0| -0.5| null| 0.0| 2.6| 304.0| null| 6.6|\n", 499 | "| 2021-01-02| 1.5| 3.1| -1.5| 82.51| 0.56| 0.6| null| 1015.9| null|null| null| 1.3| 43.9|08:06:05|16:03:28| 3.8| 5.1| 1.5| null| 1.0| 15.1| 299.0| null| 7.8|\n", 500 | "+------------+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+--------+--------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 501 | "only showing top 2 rows\n", 502 | "\n" 503 | ] 504 | }, 505 | { 506 | "name": "stderr", 507 | "output_type": "stream", 508 | "text": [ 509 | "\r", 510 | " \r" 511 | ] 512 | } 513 | ], 514 | "source": [ 515 | "weather.show(2)" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 27, 521 | "id": "1d53f1c4", 522 | "metadata": {}, 523 | "outputs": [ 524 | { 525 | "name": "stderr", 526 | "output_type": "stream", 527 | "text": [ 528 | "\r", 529 | "[Stage 27:> (0 + 1) / 1]\r" 530 | ] 531 | }, 532 | { 533 | "name": "stdout", 534 | "output_type": "stream", 535 | "text": [ 536 | "+----------------+\n", 537 | "|missing_tzoffset|\n", 538 | "+----------------+\n", 539 | "| 179|\n", 540 | "+----------------+\n", 541 | "\n" 542 | ] 543 | }, 544 | { 545 | "name": "stderr", 546 | "output_type": "stream", 547 | "text": [ 548 | "\r", 549 | " \r" 550 | ] 551 | } 552 | ], 553 | "source": [ 554 | "# count missing values in windgust\n", 555 | "missing_windgust= (\n", 556 | " weather.select(\n", 557 | " F.count(F.when(F.col('tzoffset').isNull() | F.isnan(F.col('tzoffset')), ''))\n", 558 | " .alias('missing_tzoffset'))\n", 559 | ")\n", 560 | "missing_windgust.show()" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 25, 566 | "id": "e116dc28", 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [ 570 | "# count missing values in each column\n", 571 | "cols= weather.columns\n", 572 | "cols.remove('weather_date')\n", 573 | "missing_values= weather.select([F.count(F.when(F.col(c).isNull() | F.isnan(c), c)).alias(c) for c in cols])" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 26, 579 | "id": "e279bebd", 580 | "metadata": { 581 | "scrolled": true 582 | }, 583 | "outputs": [ 584 | { 585 | "name": "stderr", 586 | "output_type": "stream", 587 | "text": [ 588 | "\r", 589 | "[Stage 24:> (0 + 1) / 1]\r" 590 | ] 591 | }, 592 | { 593 | "name": "stdout", 594 | "output_type": "stream", 595 | "text": [ 596 | 
"+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 597 | "|feelslike|feelslikemax|feelslikemin|humidity|moonphase|precip|precipprob|pressure|severerisk|snow|snowdepth|solarenergy|solarradiation|sunrise|sunset|temp|tempmax|tempmin|tzoffset|uvindex|visibility|winddir|windgust|windspeed|\n", 598 | "+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 599 | "| 0| 0| 0| 0| 0| 0| 374| 1| 374| 374| 365| 0| 0| 0| 0| 0| 0| 0| 179| 0| 0| 0| 229| 0|\n", 600 | "+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 601 | "\n" 602 | ] 603 | }, 604 | { 605 | "name": "stderr", 606 | "output_type": "stream", 607 | "text": [ 608 | "\r", 609 | " \r" 610 | ] 611 | } 612 | ], 613 | "source": [ 614 | "missing_values.show()" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 32, 620 | "id": "49d42f95", 621 | "metadata": { 622 | "scrolled": true 623 | }, 624 | "outputs": [ 625 | { 626 | "name": "stderr", 627 | "output_type": "stream", 628 | "text": [ 629 | "\r", 630 | "[Stage 33:> (0 + 1) / 1]\r" 631 | ] 632 | }, 633 | { 634 | "name": "stdout", 635 | "output_type": "stream", 636 | "text": [ 637 | "+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 638 | "|feelslike|feelslikemax|feelslikemin|humidity|moonphase|precip|precipprob|pressure|severerisk|snow|snowdepth|solarenergy|solarradiation|sunrise|sunset|temp|tempmax|tempmin|tzoffset|uvindex|visibility|winddir|windgust|windspeed|\n", 639 | "+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 640 | "| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.94| 0.0| 0.94|0.94| 0.92| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.45| 0.0| 0.0| 0.0| 0.58| 0.0|\n", 641 | "+---------+------------+------------+--------+---------+------+----------+--------+----------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 642 | "\n" 643 | ] 644 | }, 645 | { 646 | "name": "stderr", 647 | "output_type": "stream", 648 | "text": [ 649 | "\r", 650 | " \r" 651 | ] 652 | } 653 | ], 654 | "source": [ 655 | "perc_missing_values= (\n", 656 | " weather.select([\n", 657 | " F.round(F.count(F.when(F.isnan(c) | F.col(c).isNull(), c))/F.count(F.lit(1)),2)\n", 658 | " .alias(c) for c in cols\n", 659 | " ])\n", 660 | ")\n", 661 | "perc_missing_values.show()" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 28, 667 | "id": "61e4c2fd", 668 | "metadata": {}, 669 | "outputs": [ 670 | { 671 | "data": { 672 | "text/plain": [ 673 | "['weather_date',\n", 674 | " 'feelslike',\n", 675 | " 'feelslikemax',\n", 676 | " 'feelslikemin',\n", 677 | " 'humidity',\n", 678 | " 'moonphase',\n", 679 | " 'precip',\n", 680 | " 
'pressure',\n", 681 | " 'solarenergy',\n", 682 | " 'solarradiation',\n", 683 | " 'sunrise',\n", 684 | " 'sunset',\n", 685 | " 'temp',\n", 686 | " 'tempmax',\n", 687 | " 'tempmin',\n", 688 | " 'tzoffset',\n", 689 | " 'uvindex',\n", 690 | " 'visibility',\n", 691 | " 'winddir',\n", 692 | " 'windgust',\n", 693 | " 'windspeed']" 694 | ] 695 | }, 696 | "execution_count": 28, 697 | "metadata": {}, 698 | "output_type": "execute_result" 699 | } 700 | ], 701 | "source": [ 702 | "# drop columns where missing values are more than 70%\n", 703 | "\n", 704 | "weather= weather.drop('precipprob', 'snow', 'snowdepth')\n", 705 | "\n", 706 | "if 'severerisk' in weather.columns:\n", 707 | " weather= weather.drop('severerisk')\n", 708 | "\n", 709 | "weather.columns" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 29, 715 | "id": "c1714e54", 716 | "metadata": {}, 717 | "outputs": [ 718 | { 719 | "name": "stderr", 720 | "output_type": "stream", 721 | "text": [ 722 | " \r" 723 | ] 724 | } 725 | ], 726 | "source": [ 727 | "weather= weather.repartition(10)\n", 728 | "\n", 729 | "weather.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/weather/', mode='overwrite')" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": null, 735 | "id": "2bbb74b9", 736 | "metadata": {}, 737 | "outputs": [], 738 | "source": [] 739 | } 740 | ], 741 | "metadata": { 742 | "kernelspec": { 743 | "display_name": "Python 3 (ipykernel)", 744 | "language": "python", 745 | "name": "python3" 746 | }, 747 | "language_info": { 748 | "codemirror_mode": { 749 | "name": "ipython", 750 | "version": 3 751 | }, 752 | "file_extension": ".py", 753 | "mimetype": "text/x-python", 754 | "name": "python", 755 | "nbconvert_exporter": "python", 756 | "pygments_lexer": "ipython3", 757 | "version": "3.8.10" 758 | } 759 | }, 760 | "nbformat": 4, 761 | "nbformat_minor": 5 762 | } 763 | -------------------------------------------------------------------------------- /notebook/data-transformation/init-data-transformation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d20b3aba", 6 | "metadata": {}, 7 | "source": [ 8 | "## One time data transformation\n", 9 | "In this notebook, we are going to transform the stations and weather data in such a way that they will be conformed to the redshift schema for their corresponding tables.\n", 10 | "\n", 11 | "The preprocessed data will be saved back to S3 before getting loaded to Redshift." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "id": "d18f0fe2", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import pyspark\n", 22 | "import os" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "fc380d3d", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "'3.2.1'" 35 | ] 36 | }, 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "pyspark.__version__" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "id": "42c52993", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from pyspark.sql import SparkSession" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "id": "3f382bf9", 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stderr", 64 | "output_type": "stream", 65 | "text": [ 66 | "WARNING: An illegal reflective access operation has occurred\n", 67 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/hrc/anaconda3/lib/python3.9/site-packages/pyspark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 68 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 69 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 70 | "WARNING: All illegal access operations will be denied in a future release\n", 71 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 72 | "Setting default log level to \"WARN\".\n", 73 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 74 | "22/03/01 18:05:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "spark = SparkSession.builder \\\n", 80 | " .master('local[*]') \\\n", 81 | " .appName('data-transformer') \\\n", 82 | " .config(\"spark.hadoop.fs.s3a.access.key\", os.environ.get('AWS_ACCESS_KEY'))\\\n", 83 | " .config(\"spark.hadoop.fs.s3a.secret.key\", os.environ.get('AWS_SECRET_ACCESS_KEY'))\\\n", 84 | " .getOrCreate()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 5, 90 | "id": "388ae2f2", 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "'3.3.1'" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "sc = spark.sparkContext\n", 106 | "sc._jvm.org.apache.hadoop.util.VersionInfo.getVersion()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "50ba85e8", 112 | "metadata": {}, 113 | "source": [ 114 | "### 1. 
Stations data" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "id": "ae6e918b", 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stderr", 125 | "output_type": "stream", 126 | "text": [ 127 | "22/03/01 18:05:51 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n", 128 | " \r" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "df_stations = spark.read.csv(\"s3a://hrc-de-data/raw/cycling-extras/stations.csv\", inferSchema=True, header=True)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 7, 139 | "id": "0ceaf5ce", 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "[Row(Station.Id=1, StationName='River Street, Clerkenwell', longitude=-0.109971, latitude=51.5292, Easting=531202.52, Northing=182832.02),\n", 146 | " Row(Station.Id=2, StationName='Phillimore Gardens, Kensington', longitude=-0.197574, latitude=51.4996, Easting=525207.07, Northing=179391.86)]" 147 | ] 148 | }, 149 | "execution_count": 7, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "df_stations.take(2)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 8, 161 | "id": "4533a36e", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "root\n", 169 | " |-- Station.Id: integer (nullable = true)\n", 170 | " |-- StationName: string (nullable = true)\n", 171 | " |-- longitude: double (nullable = true)\n", 172 | " |-- latitude: double (nullable = true)\n", 173 | " |-- Easting: double (nullable = true)\n", 174 | " |-- Northing: double (nullable = true)\n", 175 | "\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "df_stations.printSchema()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 9, 186 | "id": "766eb9a2", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "from pyspark.sql import functions as F, types as T" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 10, 196 | "id": "6cbf6baf", 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "# rename columns\n", 201 | "stations= df_stations.withColumnRenamed('Station.Id', 'station_id') \\\n", 202 | " .withColumnRenamed('StationName', 'station_name') \\\n", 203 | " .withColumnRenamed('easting', 'easting') \\\n", 204 | " .withColumnRenamed('northing', 'northing') " 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 11, 210 | "id": "7d9360c5", 211 | "metadata": { 212 | "scrolled": false 213 | }, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "+----------+--------------------+----------+--------+----------+----------+\n", 220 | "|station_id| station_name| longitude|latitude| easting| northing|\n", 221 | "+----------+--------------------+----------+--------+----------+----------+\n", 222 | "| 1|River Street, Cle...| -0.109971| 51.5292| 531202.52| 182832.02|\n", 223 | "| 2|Phillimore Garden...| -0.197574| 51.4996| 525207.07| 179391.86|\n", 224 | "| 3|Christopher Stree...|-0.0846057| 51.5213| 532984.81| 182001.53|\n", 225 | "| 4|St. 
Chad's Street...| -0.120974| 51.5301| 530436.76| 182911.99|\n", 226 | "| 5|Sedding Street, S...| -0.156876| 51.4931|528051.649|178742.097|\n", 227 | "+----------+--------------------+----------+--------+----------+----------+\n", 228 | "only showing top 5 rows\n", 229 | "\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "stations.show(5)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 12, 240 | "id": "88468f5d", 241 | "metadata": { 242 | "scrolled": true 243 | }, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "+----------+------------+---------+--------+-------+--------+\n", 250 | "|station_id|station_name|longitude|latitude|easting|northing|\n", 251 | "+----------+------------+---------+--------+-------+--------+\n", 252 | "| 0| 0| 0| 0| 0| 0|\n", 253 | "+----------+------------+---------+--------+-------+--------+\n", 254 | "\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "# count missing values in each column\n", 260 | "stations.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in stations.columns]\n", 261 | " ).show()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 17, 267 | "id": "d0ee0f23", 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stderr", 272 | "output_type": "stream", 273 | "text": [ 274 | " \r" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "stations.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/stations/', mode='overwrite')" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "id": "ef3f5de0", 285 | "metadata": {}, 286 | "source": [ 287 | "### 2. Weather data" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 19, 293 | "id": "c41101c2", 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stderr", 298 | "output_type": "stream", 299 | "text": [ 300 | " \r" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "df_weather = spark.read.json(\"s3a://hrc-de-data/raw/cycling-extras/weather.json\")" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 20, 311 | "id": "3203cde3", 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "name": "stderr", 316 | "output_type": "stream", 317 | "text": [ 318 | "22/03/01 18:13:58 WARN package: Truncated the string representation of a plan since it was too large. 
This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" 319 | ] 320 | }, 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "[Row(cloudcover=0.5, conditions='Rain', datetime='2021-01-01', datetimeEpoch=1609459200, description='Clear conditions throughout the day with late afternoon rain.', dew=0.8, feelslike=-0.2, feelslikemax=2.9, feelslikemin=-3.6, humidity=91.03, icon='rain', moonphase=0.53, precip=0.22, precipcover=4.17, precipprob=None, preciptype=None, pressure=1011.6, snow=None, snowdepth=None, solarenergy=0.8, solarradiation=29.4, source='obs', stations=['03769099999', '03680099999', 'D5621', '03672099999', '03781099999', '03772099999', '03770099999'], sunrise='08:06:14', sunriseEpoch=1609488374, sunset='16:02:22', sunsetEpoch=1609516942, temp=2.1, tempmax=5.0, tempmin=-0.5, tzoffset=None, uvindex=0.0, visibility=4.1, winddir=304.0, windgust=None, windspeed=10.6),\n", 325 | " Row(cloudcover=0.5, conditions='Rain', datetime='2021-01-02', datetimeEpoch=1609545600, description='Clear conditions throughout the day with rain.', dew=1.0, feelslike=1.5, feelslikemax=3.1, feelslikemin=-1.5, humidity=82.51, icon='rain', moonphase=0.56, precip=0.6, precipcover=8.33, precipprob=None, preciptype=None, pressure=1015.9, snow=None, snowdepth=None, solarenergy=1.3, solarradiation=43.9, source='obs', stations=['03680099999', 'D5621', '03672099999', '03781099999', '03772099999', '03770099999'], sunrise='08:06:05', sunriseEpoch=1609574765, sunset='16:03:28', sunsetEpoch=1609603408, temp=3.8, tempmax=5.1, tempmin=1.5, tzoffset=None, uvindex=1.0, visibility=24.4, winddir=299.0, windgust=None, windspeed=12.5)]" 326 | ] 327 | }, 328 | "execution_count": 20, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "df_weather.take(2)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 21, 340 | "id": "5474298d", 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "root\n", 348 | " |-- cloudcover: double (nullable = true)\n", 349 | " |-- conditions: string (nullable = true)\n", 350 | " |-- datetime: string (nullable = true)\n", 351 | " |-- datetimeEpoch: long (nullable = true)\n", 352 | " |-- description: string (nullable = true)\n", 353 | " |-- dew: double (nullable = true)\n", 354 | " |-- feelslike: double (nullable = true)\n", 355 | " |-- feelslikemax: double (nullable = true)\n", 356 | " |-- feelslikemin: double (nullable = true)\n", 357 | " |-- humidity: double (nullable = true)\n", 358 | " |-- icon: string (nullable = true)\n", 359 | " |-- moonphase: double (nullable = true)\n", 360 | " |-- precip: double (nullable = true)\n", 361 | " |-- precipcover: double (nullable = true)\n", 362 | " |-- precipprob: string (nullable = true)\n", 363 | " |-- preciptype: string (nullable = true)\n", 364 | " |-- pressure: double (nullable = true)\n", 365 | " |-- snow: string (nullable = true)\n", 366 | " |-- snowdepth: double (nullable = true)\n", 367 | " |-- solarenergy: double (nullable = true)\n", 368 | " |-- solarradiation: double (nullable = true)\n", 369 | " |-- source: string (nullable = true)\n", 370 | " |-- stations: array (nullable = true)\n", 371 | " | |-- element: string (containsNull = true)\n", 372 | " |-- sunrise: string (nullable = true)\n", 373 | " |-- sunriseEpoch: long (nullable = true)\n", 374 | " |-- sunset: string (nullable = true)\n", 375 | " |-- sunsetEpoch: long (nullable = true)\n", 376 | " |-- temp: double (nullable = 
true)\n", 377 | " |-- tempmax: double (nullable = true)\n", 378 | " |-- tempmin: double (nullable = true)\n", 379 | " |-- tzoffset: double (nullable = true)\n", 380 | " |-- uvindex: double (nullable = true)\n", 381 | " |-- visibility: double (nullable = true)\n", 382 | " |-- winddir: double (nullable = true)\n", 383 | " |-- windgust: double (nullable = true)\n", 384 | " |-- windspeed: double (nullable = true)\n", 385 | "\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "df_weather.printSchema()" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 22, 396 | "id": "2e952690", 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "# drop some columns that we won't need\n", 401 | "weather= df_weather.drop('cloudcover', 'conditions', 'datetimeEpoch', 'description', 'dew', 'icon', \n", 402 | " 'precipcover', 'preciptype', 'source', 'stations', 'sunriseEpoch', 'sunsetEpoch')" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 23, 408 | "id": "22b9368f", 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "# transform datetime\n", 413 | "weather= weather.withColumnRenamed('datetime', 'weather_date') \n", 414 | "weather= weather.withColumn('weather_date', weather.weather_date.cast(T.DateType()))" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 24, 420 | "id": "b7d8f370", 421 | "metadata": { 422 | "scrolled": true 423 | }, 424 | "outputs": [ 425 | { 426 | "name": "stdout", 427 | "output_type": "stream", 428 | "text": [ 429 | "root\n", 430 | " |-- weather_date: date (nullable = true)\n", 431 | " |-- feelslike: double (nullable = true)\n", 432 | " |-- feelslikemax: double (nullable = true)\n", 433 | " |-- feelslikemin: double (nullable = true)\n", 434 | " |-- humidity: double (nullable = true)\n", 435 | " |-- moonphase: double (nullable = true)\n", 436 | " |-- precip: double (nullable = true)\n", 437 | " |-- precipprob: string (nullable = true)\n", 438 | " |-- preciptype: string (nullable = true)\n", 439 | " |-- pressure: double (nullable = true)\n", 440 | " |-- snow: string (nullable = true)\n", 441 | " |-- snowdepth: double (nullable = true)\n", 442 | " |-- solarenergy: double (nullable = true)\n", 443 | " |-- solarradiation: double (nullable = true)\n", 444 | " |-- sunrise: string (nullable = true)\n", 445 | " |-- sunset: string (nullable = true)\n", 446 | " |-- temp: double (nullable = true)\n", 447 | " |-- tempmax: double (nullable = true)\n", 448 | " |-- tempmin: double (nullable = true)\n", 449 | " |-- tzoffset: double (nullable = true)\n", 450 | " |-- uvindex: double (nullable = true)\n", 451 | " |-- visibility: double (nullable = true)\n", 452 | " |-- winddir: double (nullable = true)\n", 453 | " |-- windgust: double (nullable = true)\n", 454 | " |-- windspeed: double (nullable = true)\n", 455 | "\n", 456 | "25 columns\n" 457 | ] 458 | } 459 | ], 460 | "source": [ 461 | "weather.printSchema()\n", 462 | "print(len(weather.columns), 'columns')" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 25, 468 | "id": "e116dc28", 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "# count missing values in each column\n", 473 | "cols= weather.columns\n", 474 | "cols.remove('weather_date')\n", 475 | "missing_values= weather.select([F.count(F.when(F.col(c).isNull() | F.isnan(c), c)).alias(c) for c in cols])" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 26, 481 | "id": "e279bebd", 482 | "metadata": { 483 | "scrolled": true 
484 | }, 485 | "outputs": [ 486 | { 487 | "name": "stdout", 488 | "output_type": "stream", 489 | "text": [ 490 | "+---------+------------+------------+--------+---------+------+----------+----------+--------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 491 | "|feelslike|feelslikemax|feelslikemin|humidity|moonphase|precip|precipprob|preciptype|pressure|snow|snowdepth|solarenergy|solarradiation|sunrise|sunset|temp|tempmax|tempmin|tzoffset|uvindex|visibility|winddir|windgust|windspeed|\n", 492 | "+---------+------------+------------+--------+---------+------+----------+----------+--------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 493 | "| 0| 0| 0| 0| 0| 0| 365| 365| 1| 365| 356| 0| 0| 0| 0| 0| 0| 0| 148| 0| 0| 0| 226| 0|\n", 494 | "+---------+------------+------------+--------+---------+------+----------+----------+--------+----+---------+-----------+--------------+-------+------+----+-------+-------+--------+-------+----------+-------+--------+---------+\n", 495 | "\n" 496 | ] 497 | } 498 | ], 499 | "source": [ 500 | "missing_values.show()" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 27, 506 | "id": "49d42f95", 507 | "metadata": { 508 | "scrolled": true 509 | }, 510 | "outputs": [ 511 | { 512 | "name": "stdout", 513 | "output_type": "stream", 514 | "text": [ 515 | "+---------+------------+------------+--------+---------+------+----------+----------+--------------------+----+------------------+-----------+--------------+-------+------+----+-------+-------+------------------+-------+----------+-------+------------------+---------+\n", 516 | "|feelslike|feelslikemax|feelslikemin|humidity|moonphase|precip|precipprob|preciptype| pressure|snow| snowdepth|solarenergy|solarradiation|sunrise|sunset|temp|tempmax|tempmin| tzoffset|uvindex|visibility|winddir| windgust|windspeed|\n", 517 | "+---------+------------+------------+--------+---------+------+----------+----------+--------------------+----+------------------+-----------+--------------+-------+------+----+-------+-------+------------------+-------+----------+-------+------------------+---------+\n", 518 | "| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 1.0| 1.0|0.002739726027397...| 1.0|0.9753424657534246| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0|0.4054794520547945| 0.0| 0.0| 0.0|0.6191780821917808| 0.0|\n", 519 | "+---------+------------+------------+--------+---------+------+----------+----------+--------------------+----+------------------+-----------+--------------+-------+------+----+-------+-------+------------------+-------+----------+-------+------------------+---------+\n", 520 | "\n" 521 | ] 522 | } 523 | ], 524 | "source": [ 525 | "perc_missing_values= weather.select([(F.count(F.when(F.isnan(c) | F.col(c).isNull(), c))/F.count(F.lit(1))).alias(c) for c in cols])\n", 526 | "perc_missing_values.show()" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 28, 532 | "id": "61e4c2fd", 533 | "metadata": {}, 534 | "outputs": [ 535 | { 536 | "data": { 537 | "text/plain": [ 538 | "['weather_date',\n", 539 | " 'feelslike',\n", 540 | " 'feelslikemax',\n", 541 | " 'feelslikemin',\n", 542 | " 'humidity',\n", 543 | " 'moonphase',\n", 544 | " 'precip',\n", 545 | " 'pressure',\n", 546 | " 'solarenergy',\n", 547 | " 'solarradiation',\n", 548 | " 'sunrise',\n", 549 | " 'sunset',\n", 550 | " 'temp',\n", 551 | " 'tempmax',\n", 552 | " 
'tempmin',\n", 553 | " 'tzoffset',\n", 554 | " 'uvindex',\n", 555 | " 'visibility',\n", 556 | " 'winddir',\n", 557 | " 'windgust',\n", 558 | " 'windspeed']" 559 | ] 560 | }, 561 | "execution_count": 28, 562 | "metadata": {}, 563 | "output_type": "execute_result" 564 | } 565 | ], 566 | "source": [ 567 | "# drop columns where missing values are more than 70%\n", 568 | "\n", 569 | "weather= weather.drop('precipprob', 'snow', 'snowdepth')\n", 570 | "\n", 571 | "if 'severerisk' in weather.columns:\n", 572 | " weather= weather.drop('severerisk')\n", 573 | "\n", 574 | "weather.columns" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 29, 580 | "id": "c1714e54", 581 | "metadata": {}, 582 | "outputs": [ 583 | { 584 | "name": "stderr", 585 | "output_type": "stream", 586 | "text": [ 587 | " \r" 588 | ] 589 | } 590 | ], 591 | "source": [ 592 | "weather= weather.repartition(10)\n", 593 | "\n", 594 | "weather.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/weather/', mode='overwrite')" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": null, 600 | "id": "2bbb74b9", 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [] 604 | } 605 | ], 606 | "metadata": { 607 | "kernelspec": { 608 | "display_name": "Python 3 (ipykernel)", 609 | "language": "python", 610 | "name": "python3" 611 | }, 612 | "language_info": { 613 | "codemirror_mode": { 614 | "name": "ipython", 615 | "version": 3 616 | }, 617 | "file_extension": ".py", 618 | "mimetype": "text/x-python", 619 | "name": "python", 620 | "nbconvert_exporter": "python", 621 | "pygments_lexer": "ipython3", 622 | "version": "3.9.7" 623 | } 624 | }, 625 | "nbformat": 4, 626 | "nbformat_minor": 5 627 | } 628 | -------------------------------------------------------------------------------- /notebook/data-transformation/journey-data-transformation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "deda5766", 6 | "metadata": {}, 7 | "source": [ 8 | "## Transformation for rental journey data \n", 9 | "This notebook is responsible for transforming journey data by performing the following tasks:\n", 10 | "\n", 11 | " 1. Renaming columns (removing spaces and lowercasing)\n", 12 | "\n", 13 | " 2. Convert data types from string to timestamps\n", 14 | " \n", 15 | " 3. Attach weather dates\n", 16 | " \n", 17 | " 4. Drop unnecessary columns\n", 18 | " \n", 19 | " 5. 
Update extra files for dimension tables" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "id": "763a90f8", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import pyspark\n", 30 | "import os" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "id": "25914d7c", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from pyspark.sql import SparkSession" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "id": "a9382918", 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stderr", 51 | "output_type": "stream", 52 | "text": [ 53 | "WARNING: An illegal reflective access operation has occurred\n", 54 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/hrc/anaconda3/lib/python3.9/site-packages/pyspark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 55 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 56 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 57 | "WARNING: All illegal access operations will be denied in a future release\n", 58 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 59 | "Setting default log level to \"WARN\".\n", 60 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 61 | "22/03/01 21:00:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "spark = SparkSession.builder \\\n", 67 | " .master('local[*]') \\\n", 68 | " .appName('journey-and-stations-data-transformer') \\\n", 69 | " .config(\"spark.hadoop.fs.s3a.access.key\", os.environ.get('AWS_ACCESS_KEY'))\\\n", 70 | " .config(\"spark.hadoop.fs.s3a.secret.key\", os.environ.get('AWS_SECRET_ACCESS_KEY'))\\\n", 71 | " .getOrCreate()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "id": "30406f25", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stderr", 82 | "output_type": "stream", 83 | "text": [ 84 | " \r" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "# get journey data\n", 90 | "df_journey = spark.read.csv(\"s3a://hrc-de-data/raw/cycling-journey/*/*\", inferSchema=True, header=True)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 6, 96 | "id": "56579ae0", 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "[Row(Rental Id=109096951, Duration=540, Bike Id=13318, End Date='15/06/2021 20:19', EndStation Id=661, EndStation Name='All Saints Church, Portobello', Start Date='15/06/2021 20:10', StartStation Id=105, StartStation Name='Westbourne Grove, Bayswater'),\n", 103 | " Row(Rental Id=108982015, Duration=780, Bike Id=18991, End Date='13/06/2021 13:03', EndStation Id=312, EndStation Name=\"Grove End Road, St. 
John's Wood\", Start Date='13/06/2021 12:50', StartStation Id=106, StartStation Name='Woodstock Street, Mayfair')]" 104 | ] 105 | }, 106 | "execution_count": 6, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "df_journey.take(2)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "id": "9ef2dc88", 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "root\n", 126 | " |-- Rental Id: integer (nullable = true)\n", 127 | " |-- Duration: integer (nullable = true)\n", 128 | " |-- Bike Id: integer (nullable = true)\n", 129 | " |-- End Date: string (nullable = true)\n", 130 | " |-- EndStation Id: integer (nullable = true)\n", 131 | " |-- EndStation Name: string (nullable = true)\n", 132 | " |-- Start Date: string (nullable = true)\n", 133 | " |-- StartStation Id: integer (nullable = true)\n", 134 | " |-- StartStation Name: string (nullable = true)\n", 135 | "\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "df_journey.printSchema()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 8, 146 | "id": "aea050fa", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "from pyspark.sql.functions import *\n", 151 | "from pyspark.sql.types import *" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 9, 157 | "id": "78224fd1", 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "# rename columns\n", 162 | "df_journey= df_journey.withColumnRenamed('Rental Id', 'rental_id')\\\n", 163 | ".withColumnRenamed('Bike Id', 'bike_id')\\\n", 164 | ".withColumnRenamed('Start Date', 'start_date')\\\n", 165 | ".withColumnRenamed('End Date', 'end_date')\\\n", 166 | ".withColumnRenamed('StartStation Id', 'start_station')\\\n", 167 | ".withColumnRenamed('EndStation Id', 'end_station')" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 10, 173 | "id": "b1ad54a8", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "# convert data types\n", 178 | "df_journey= df_journey.withColumn('start_date', to_timestamp(col('start_date'), 'dd/MM/yyy HH:mm'))\n", 179 | "\n", 180 | "df_journey= df_journey.withColumn('end_date', to_timestamp(col('end_date'), 'dd/MM/yyy HH:mm'))" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 11, 186 | "id": "2989d91b", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "# add weather_date column\n", 191 | "df_journey= df_journey.withColumn('weather_date', to_date(col(\"start_date\"), 'dd/MM/yyy HH:mm'))" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 12, 197 | "id": "1e64f016", 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "+---------+--------+-------+-------------------+-----------+--------------------+-------------------+-------------+--------------------+------------+\n", 205 | "|rental_id|Duration|bike_id| end_date|end_station| EndStation Name| start_date|start_station| StartStation Name|weather_date|\n", 206 | "+---------+--------+-------+-------------------+-----------+--------------------+-------------------+-------------+--------------------+------------+\n", 207 | "|109096951| 540| 13318|2021-06-15 20:19:00| 661|All Saints Church...|2021-06-15 20:10:00| 105|Westbourne Grove,...| 2021-06-15|\n", 208 | "|108982015| 780| 18991|2021-06-13 13:03:00| 312|Grove End Road, 
S...|2021-06-13 12:50:00| 106|Woodstock Street,...| 2021-06-13|\n", 209 | "|108839141| 840| 16736|2021-06-10 15:28:00| 333|Palace Gardens Te...|2021-06-10 15:14:00| 106|Woodstock Street,...| 2021-06-10|\n", 210 | "|108816591| 1380| 913|2021-06-09 22:37:00| 51|Finsbury Library ...|2021-06-09 22:14:00| 123|St. John Street, ...| 2021-06-09|\n", 211 | "|108919084| 1200| 6682|2021-06-12 11:29:00| 732|Duke Street Hill,...|2021-06-12 11:09:00| 123|St. John Street, ...| 2021-06-12|\n", 212 | "+---------+--------+-------+-------------------+-----------+--------------------+-------------------+-------------+--------------------+------------+\n", 213 | "only showing top 5 rows\n", 214 | "\n", 215 | "root\n", 216 | " |-- rental_id: integer (nullable = true)\n", 217 | " |-- Duration: integer (nullable = true)\n", 218 | " |-- bike_id: integer (nullable = true)\n", 219 | " |-- end_date: timestamp (nullable = true)\n", 220 | " |-- end_station: integer (nullable = true)\n", 221 | " |-- EndStation Name: string (nullable = true)\n", 222 | " |-- start_date: timestamp (nullable = true)\n", 223 | " |-- start_station: integer (nullable = true)\n", 224 | " |-- StartStation Name: string (nullable = true)\n", 225 | " |-- weather_date: date (nullable = true)\n", 226 | "\n" 227 | ] 228 | } 229 | ], 230 | "source": [ 231 | "df_journey.show(5)\n", 232 | "df_journey.printSchema()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "id": "71328ed9", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | " " 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "id": "7b5e4e8d", 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "id": "784a2702", 256 | "metadata": {}, 257 | "source": [ 258 | "### Stations data\n", 259 | "We are going to update the stations data (previously saved by another process) with some additional stations that are not present in the original stations data but are seen in some journey." 
260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "id": "f8da8a97", 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "# read previously saved stations data from parquet\n", 270 | "df_processed_stations= spark.read.parquet('s3a://hrc-de-data/processed/cycling-dimension/stations/')" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "id": "52c496c2", 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "# create temporary table for both stations and journey\n", 281 | "df_journey.createOrReplaceTempView('journey')\n", 282 | "df_processed_stations.createOrReplaceTempView('station')" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "id": "44f574f0", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "# we keep all the stations which are not found in the temp view station table\n", 293 | "additional_stations= spark.sql('''\n", 294 | "with station_ids as (\n", 295 | " select \n", 296 | " station_id\n", 297 | " from\n", 298 | " station\n", 299 | ")\n", 300 | "\n", 301 | "select \n", 302 | " distinct(start_station) as station_id, \n", 303 | " `StartStation Name` as station_name \n", 304 | "from \n", 305 | " journey\n", 306 | "where \n", 307 | " start_station not in (table station_ids)\n", 308 | "\n", 309 | "union\n", 310 | "\n", 311 | "select \n", 312 | " distinct(end_station) as station_id, \n", 313 | " `EndStation Name` as station_name \n", 314 | "from \n", 315 | " journey\n", 316 | "where \n", 317 | " end_station not in (table station_ids)\n", 318 | "''')\n", 319 | "additional_stations.show()" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "id": "c95d0712", 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "# add columns to the additional stations to avoid errors when merging it to the previous one (df_processed_stations)\n", 330 | "additional_stations= additional_stations.withColumn('longitude', lit(0).cast(DoubleType()))\\\n", 331 | ".withColumn('latitude', lit(0).cast(DoubleType()))\\\n", 332 | ".withColumn('easting', lit(0).cast(DoubleType()))\\\n", 333 | ".withColumn('northing', lit(0).cast(DoubleType()))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "id": "ccd5a72c", 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "additional_stations.show(5)\n", 344 | "additional_stations.printSchema()" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "id": "68e2adc5", 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "# remove duplicate values\n", 355 | "additional_stations= additional_stations.dropDuplicates(['station_id'])" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "id": "9361ec15", 362 | "metadata": { 363 | "scrolled": false 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "# save additional stations data into parquet files in s3\n", 368 | "additional_stations.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/stations/', mode='append')" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "id": "bd2a8529", 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "# drop other unnecessary journey columns\n", 379 | "df_journey= df_journey.drop('StartStation Name', 'EndStation Name', 'Duration')" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | 
"id": "e70a0ba5", 385 | "metadata": {}, 386 | "source": [ 387 | "### Datetime\n", 388 | "We are going to create/update datetime data from the start and end date of each journey." 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 13, 394 | "id": "1c813157", 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "name": "stdout", 399 | "output_type": "stream", 400 | "text": [ 401 | "+-------------------+----+--------+-----+---+----+------+------+\n", 402 | "| datetime_id|year|week_day|month|day|hour|minute|second|\n", 403 | "+-------------------+----+--------+-----+---+----+------+------+\n", 404 | "|2021-06-15 20:10:00|2021| 3| 6| 15| 20| 10| 0|\n", 405 | "|2021-06-13 12:50:00|2021| 1| 6| 13| 12| 50| 0|\n", 406 | "|2021-06-10 15:14:00|2021| 5| 6| 10| 15| 14| 0|\n", 407 | "+-------------------+----+--------+-----+---+----+------+------+\n", 408 | "only showing top 3 rows\n", 409 | "\n", 410 | "+-------------------+----+--------+-----+---+----+------+------+\n", 411 | "| datetime_id|year|week_day|month|day|hour|minute|second|\n", 412 | "+-------------------+----+--------+-----+---+----+------+------+\n", 413 | "|2021-06-15 20:19:00|2021| 3| 6| 15| 20| 19| 0|\n", 414 | "|2021-06-13 13:03:00|2021| 1| 6| 13| 13| 3| 0|\n", 415 | "|2021-06-10 15:28:00|2021| 5| 6| 10| 15| 28| 0|\n", 416 | "+-------------------+----+--------+-----+---+----+------+------+\n", 417 | "only showing top 3 rows\n", 418 | "\n" 419 | ] 420 | } 421 | ], 422 | "source": [ 423 | "# extract datetime values from the start and the end date\n", 424 | "df_datetime_from_start= (\n", 425 | " df_journey.select(\n", 426 | " col('start_date').alias('datetime_id'), \n", 427 | " year(col('start_date')).alias('year'),\n", 428 | " dayofweek(col('start_date')).alias('week_day'),\n", 429 | " month(col('start_date')).alias('month'), \n", 430 | " dayofmonth(col('start_date')).alias('day'),\n", 431 | " hour(col('start_date')).alias('hour'),\n", 432 | " minute(col('start_date')).alias('minute'),\n", 433 | " second(col('start_date')).alias('second'),\n", 434 | " )\n", 435 | ")\n", 436 | "df_datetime_from_end= (\n", 437 | " df_journey.select(\n", 438 | " col('end_date').alias('datetime_id'), \n", 439 | " year(col('end_date')).alias('year'), \n", 440 | " dayofweek(col('end_date')).alias('week_day'),\n", 441 | " month(col('end_date')).alias('month'), \n", 442 | " dayofmonth(col('end_date')).alias('day'),\n", 443 | " hour(col('end_date')).alias('hour'),\n", 444 | " minute(col('end_date')).alias('minute'),\n", 445 | " second(col('end_date')).alias('second'),\n", 446 | " )\n", 447 | ")\n", 448 | "\n", 449 | "df_datetime_from_start.show(3)\n", 450 | "df_datetime_from_end.show(3)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 14, 456 | "id": "057c1e45", 457 | "metadata": {}, 458 | "outputs": [ 459 | { 460 | "name": "stdout", 461 | "output_type": "stream", 462 | "text": [ 463 | "+-------------------+----+--------+-----+---+----+------+------+\n", 464 | "| datetime_id|year|week_day|month|day|hour|minute|second|\n", 465 | "+-------------------+----+--------+-----+---+----+------+------+\n", 466 | "|2021-06-15 20:10:00|2021| 3| 6| 15| 20| 10| 0|\n", 467 | "|2021-06-13 12:50:00|2021| 1| 6| 13| 12| 50| 0|\n", 468 | "|2021-06-10 15:14:00|2021| 5| 6| 10| 15| 14| 0|\n", 469 | "|2021-06-09 22:14:00|2021| 4| 6| 9| 22| 14| 0|\n", 470 | "|2021-06-12 11:09:00|2021| 7| 6| 12| 11| 9| 0|\n", 471 | "|2021-06-10 22:33:00|2021| 5| 6| 10| 22| 33| 0|\n", 472 | "|2021-06-13 14:48:00|2021| 1| 6| 13| 14| 48| 0|\n", 473 | 
"|2021-06-14 18:06:00|2021| 2| 6| 14| 18| 6| 0|\n", 474 | "|2021-06-14 18:06:00|2021| 2| 6| 14| 18| 6| 0|\n", 475 | "|2021-06-09 16:06:00|2021| 4| 6| 9| 16| 6| 0|\n", 476 | "+-------------------+----+--------+-----+---+----+------+------+\n", 477 | "only showing top 10 rows\n", 478 | "\n" 479 | ] 480 | } 481 | ], 482 | "source": [ 483 | "# combine the dataframes\n", 484 | "df_datetime= df_datetime_from_start.union(df_datetime_from_end)\n", 485 | "\n", 486 | "# remove duplicate entries\n", 487 | "df_datetime= df_datetime.dropDuplicates(['datetime_id'])\n", 488 | "\n", 489 | "df_datetime.show(10)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 15, 495 | "id": "38b8b2ac", 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "name": "stderr", 500 | "output_type": "stream", 501 | "text": [ 502 | " \r" 503 | ] 504 | } 505 | ], 506 | "source": [ 507 | "# save datetime data into parquet files in s3\n", 508 | "df_datetime.write.parquet('s3a://hrc-de-data/processed/cycling-dimension/datetime/', mode='overwrite')" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 26, 514 | "id": "028eb71d", 515 | "metadata": { 516 | "scrolled": true 517 | }, 518 | "outputs": [ 519 | { 520 | "name": "stderr", 521 | "output_type": "stream", 522 | "text": [ 523 | " \r" 524 | ] 525 | } 526 | ], 527 | "source": [ 528 | "# finally, save journey data into parquet files in s3\n", 529 | "df_journey.write.parquet('s3a://hrc-de-data/processed/cycling-fact/journey/', mode='append')" 530 | ] 531 | } 532 | ], 533 | "metadata": { 534 | "kernelspec": { 535 | "display_name": "Python 3 (ipykernel)", 536 | "language": "python", 537 | "name": "python3" 538 | }, 539 | "language_info": { 540 | "codemirror_mode": { 541 | "name": "ipython", 542 | "version": 3 543 | }, 544 | "file_extension": ".py", 545 | "mimetype": "text/x-python", 546 | "name": "python", 547 | "nbconvert_exporter": "python", 548 | "pygments_lexer": "ipython3", 549 | "version": "3.9.7" 550 | } 551 | }, 552 | "nbformat": 4, 553 | "nbformat_minor": 5 554 | } 555 | -------------------------------------------------------------------------------- /services.md: -------------------------------------------------------------------------------- 1 | ### Creating an EC2 default role 2 | 3 | ```bash 4 | aws emr create-default-roles --region 5 | ``` 6 | 7 | In case the role was already created before but become invalid, checkout [this link](https://aws.amazon.com/premiumsupport/knowledge-center/emr-default-role-invalid/) -------------------------------------------------------------------------------- /terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "~> 3.27" 6 | } 7 | } 8 | 9 | required_version = ">= 0.14.9" 10 | } 11 | 12 | provider "aws" { 13 | profile = "default" 14 | region = "${var.region}" 15 | } 16 | 17 | resource "aws_instance" "de-ec2" { 18 | ami = "${var.ec2_ami}" 19 | instance_type = "t2.micro" 20 | ebs_block_device { 21 | device_name = "/dev/sda1" 22 | volume_size = 10 23 | } 24 | 25 | tags = { 26 | Name = "EC2forDEprojects" 27 | } 28 | } 29 | 30 | resource "aws_s3_bucket" "de-s3" { 31 | bucket = "${var.s3_bucket_name}" 32 | acl = "private" 33 | 34 | tags = { 35 | Name = "S3forDEprojects " 36 | Environment = "Dev" 37 | } 38 | } 39 | 40 | 41 | resource "aws_redshift_cluster" "de-redshift" { 42 | cluster_identifier = "${var.cluster_id}" 43 | database_name = "dev" 44 | 
master_username = "${var.db_credentials_uname}" 45 | master_password = "${var.db_credentials_pwd}" 46 | node_type = "${var.node_type}" 47 | cluster_type = "${var.cluster_type}" 48 | publicly_accessible = false 49 | } -------------------------------------------------------------------------------- /terraform/services.md: -------------------------------------------------------------------------------- 1 | ### Running Terraform 2 | 3 | 4 | #### 1. Initialization 5 | ```bash 6 | terraform init 7 | ``` 8 | 9 | #### 2. Planning 10 | ```bash 11 | terraform plan 12 | ``` 13 | 14 | 15 | #### 3. Applying 16 | ```bash 17 | terraform apply 18 | ``` 19 | -------------------------------------------------------------------------------- /terraform/variables.tf: -------------------------------------------------------------------------------- 1 | variable "region" { 2 | default= "eu-west-2" 3 | } 4 | 5 | 6 | variable "ec2_ami" { 7 | default= "ami-0015a39e4b7c0966f" 8 | } 9 | 10 | variable "s3_bucket_name" { 11 | default= "hrc-de-data" 12 | } 13 | 14 | variable "cluster_id" { 15 | default= "redshift-cluster-0" 16 | } 17 | 18 | variable "node_type" { 19 | default= "dc2.large" 20 | } 21 | 22 | variable "cluster_type" { 23 | default= "single-node" 24 | } 25 | 26 | variable "db_credentials_uname" { 27 | default= "awsusr" 28 | } 29 | 30 | variable "db_credentials_pwd" { 31 | default= "Mustbe8charsAndInside.EnvFile" 32 | } 33 | 34 | --------------------------------------------------------------------------------