├── Week 4 ├── dbt_files │ ├── macros │ │ ├── .gitkeep │ │ ├── macros_properties.yml │ │ └── get_payment_type_description.sql │ ├── seeds │ │ ├── .gitkeep │ │ └── taxi_zone_lookup.csv │ ├── tests │ │ └── .gitkeep │ ├── analyses │ │ └── .gitkeep │ ├── snapshots │ │ └── .gitkeep │ ├── .gitignore │ ├── packages.yml │ ├── models │ │ ├── core │ │ │ ├── dim_zones.sql │ │ │ ├── dm_monthly_zone_revenue.sql │ │ │ ├── schema.yml │ │ │ └── fact_trips.sql │ │ └── staging │ │ │ ├── stg_green_tripdata.sql │ │ │ ├── stg_yellow_tripdata.sql │ │ │ └── schema.yml │ ├── README.md │ └── dbt_project.yml └── web_to_gcs.py ├── Week 1 ├── Dockerfile ├── docker-compose.yaml ├── variables.tf ├── main.tf ├── ingest_data.py └── Homework Answers ├── Week 2 ├── Dockerfile ├── docker_deploy.py ├── Homework Answers ├── etl_gcs_to_bq.py ├── etl_web_to_gcs.py ├── ingest_data_flow.py └── parameterized_flow.py ├── README.md ├── Week 5 ├── Code │ ├── 02 download_data.sh │ ├── 09_Spark_SQL.py │ ├── 12_Spark_SQL_BQ.py │ ├── 05 Spark Join and GroupBy.ipynb │ ├── 04 Spark SQL.ipynb │ ├── 06 RDDs.ipynb │ └── 03 Taxi Schema.ipynb └── Data Engineering Zoomcamp Week 5.ipynb └── LICENSE /Week 4/dbt_files/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Week 4/dbt_files/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Week 4/dbt_files/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Week 4/dbt_files/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Week 4/dbt_files/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Week 4/dbt_files/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /Week 4/dbt_files/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 0.8.0 -------------------------------------------------------------------------------- /Week 1/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.1 2 | 3 | RUN pip install pandas sqlalchemy psycopg2 4 | 5 | WORKDIR /app 6 | COPY ingest_data.py ingest_data.py 7 | 8 | ENTRYPOINT ["python", "ingest_data.py"] 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /Week 4/dbt_files/models/core/dim_zones.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | 4 | select 5 | locationid, 6 | borough, 7 | zone, 8 | replace(service_zone,'Boro','Green') as service_zone 9 | from {{ ref('taxi_zone_lookup') }} -------------------------------------------------------------------------------- /Week 2/Dockerfile: -------------------------------------------------------------------------------- 1 
| FROM prefecthq/prefect:2.7.7-python3.9 2 | 3 | COPY requirements.txt . 4 | 5 | RUN pip install -r requirements.txt --trusted-host pypi.python.org --no-cache-dir 6 | 7 | COPY parameterized_flow.py /opt/prefect/flows/parameterized_flow.py 8 | COPY data /opt/prefect/data 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DE-Zoomcamp 2 | 3 | This repo contains the following materials for the Data Engineering Zoomcamp organized by DataTalksClub: 4 | - Notes and Code Files from the videos 5 | - Installation instructions/files for the tools used 6 | - Deep dives on the key topics 7 | - Homework Answers 8 | 9 | Please reach out to me if you would like me to add anything else. 10 | -------------------------------------------------------------------------------- /Week 2/docker_deploy.py: -------------------------------------------------------------------------------- 1 | from prefect.deployments import Deployment 2 | from prefect.infrastructure.docker import DockerContainer 3 | from parameterized_flow import etl_parent_flow 4 | 5 | docker_block = DockerContainer.load("zoom") 6 | 7 | docker_dep = Deployment.build_from_flow(flow=etl_parent_flow, name='docker-flow', infrastructure=docker_block) 8 | 9 | if __name__ == '__main__': 10 | docker_dep.apply() 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /Week 4/dbt_files/macros/macros_properties.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | macros: 4 | - name: get_payment_type_description 5 | description: > 6 | This macro receives a payment_type and returns the corresponding description. 7 | arguments: 8 | - name: payment_type 9 | type: int 10 | description: > 11 | payment_type value. 12 | Must be one of the accepted values; otherwise the macro will return null. -------------------------------------------------------------------------------- /Week 4/dbt_files/macros/get_payment_type_description.sql: -------------------------------------------------------------------------------- 1 | {# 2 | This macro returns the description of the payment_type 3 | #} 4 | 5 | {% macro get_payment_type_description(payment_type) -%} 6 | 7 | case {{ payment_type }} 8 | when 1 then 'Credit card' 9 | when 2 then 'Cash' 10 | when 3 then 'No charge' 11 | when 4 then 'Dispute' 12 | when 5 then 'Unknown' 13 | when 6 then 'Voided trip' 14 | end 15 | 16 | {%- endmacro %} -------------------------------------------------------------------------------- /Week 4/dbt_files/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project!
2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [dbt community](https://community.getdbt.com/) to learn from other analytics engineers 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /Week 5/Code/02 download_data.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | TAXI_TYPE=$1 # "yellow", "green" 4 | YEAR=$2 # 2020, 2021 5 | 6 | URL_PREFIX="https://github.com/DataTalksClub/nyc-tlc-data/releases/download" 7 | 8 | for MONTH in {1..12}; do 9 | FMONTH=`printf "%02d" ${MONTH}` 10 | 11 | URL="${URL_PREFIX}/${TAXI_TYPE}/${TAXI_TYPE}_tripdata_${YEAR}-${FMONTH}.csv.gz" 12 | 13 | LOCAL_PREFIX="D:/data/raw/${TAXI_TYPE}/${YEAR}/${FMONTH}" 14 | LOCAL_FILE="${TAXI_TYPE}_tripdata_${YEAR}_${FMONTH}.csv.gz" 15 | LOCAL_PATH="${LOCAL_PREFIX}/${LOCAL_FILE}" 16 | 17 | echo "downloading ${URL} to ${LOCAL_PATH}" 18 | mkdir -p ${LOCAL_PREFIX} 19 | curl -L -o ${LOCAL_PATH} ${URL} 20 | # wget ${URL} -O ${LOCAL_PATH} 21 | 22 | done 23 | -------------------------------------------------------------------------------- /Week 1/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | pgdatabase: 3 | image: postgres:13 4 | environment: 5 | - POSTGRES_USER=root 6 | - POSTGRES_PASSWORD=root 7 | - POSTGRES_DB=ny_taxi 8 | volumes: 9 | - "./ny_taxi_postgres_data:/var/lib/postgresql/data:rw" 10 | ports: 11 | - "5432:5432" 12 | pgadmin: 13 | image: dpage/pgadmin4 14 | environment: 15 | - PGADMIN_DEFAULT_EMAIL=admin@admin.com 16 | - PGADMIN_DEFAULT_PASSWORD=root 17 | volumes: 18 | - "pgadmin_conn_data:/var/lib/pgadmin:rw" 19 | ports: 20 | - "8080:80" 21 | 22 | volumes: 23 | pgadmin_conn_data: 24 | driver: local 25 | driver_opts: 26 | type: none 27 | o: bind 28 | device: ./pgadmin_conn_data 29 | 30 | -------------------------------------------------------------------------------- /Week 1/variables.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | data_lake_bucket = "dtc_data_lake" 3 | } 4 | 5 | variable "project" { 6 | description = "Your Project ID here" 7 | } 8 | 9 | variable "region" { 10 | description = "Region for GCP resources. Choose as per your location: https://cloud.google.com/about/locations" 11 | default = "europe-west6" 12 | type = string 13 | } 14 | 15 | variable "storage_class" { 16 | description = "Storage class type for your bucket. Check official docs for more info."
17 | default = "STANDARD" 18 | } 19 | 20 | variable "BQ_DATASET" { 21 | description = "BigQuery Dataset that raw data (from GCS) will be written to" 22 | type = string 23 | default = "trips_data_all" 24 | } 25 | 26 | variable "TABLE_NAME" { 27 | description = "BigQuery Table" 28 | type = string 29 | default = "ny_trips" 30 | } -------------------------------------------------------------------------------- /Week 2/Homework Answers: -------------------------------------------------------------------------------- 1 | Prefect 2 | 3 | --1) 4 | 5 | Update etl_web_to_gcs.py to change the year to 2020 and the month to 1 6 | 7 | Then run - python etl_web_to_gcs.py 8 | 9 | Answer - 447,770 10 | 11 | --2) 12 | 13 | prefect deployment build ./etl_web_to_gcs.py:etl_web_to_gcs -n "ETL Job 2" --cron "0 5 1 * *" -a 14 | 15 | Answer - 0 5 1 * * 16 | 17 | --3) 18 | 19 | Update the "parameterized_flow.py" file to change the color and months in the main function 20 | 21 | Update the column name lpep_pickup_datetime to tpep_pickup_datetime. Similarly, update lpep_dropoff_datetime to tpep_dropoff_datetime 22 | 23 | Create a folder "yellow" in the "data" folder in the working directory 24 | 25 | Run the command - python parameterized_flow.py 26 | 27 | This should load the parquet data files for Yellow taxi data for Feb. 2019 and March 2019 into GCS 28 | 29 | Update the "etl_gcs_to_bq.py" code to make the relevant changes and run the command - python etl_gcs_to_bq.py 30 | 31 | Answer - 14,851,920 32 | 33 | --4) 34 | 35 | 36 | --5) 37 | 38 | 39 | --6) 40 | 41 | Answer - 8 42 | -------------------------------------------------------------------------------- /Week 4/dbt_files/models/core/dm_monthly_zone_revenue.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | with trips_data as ( 4 | select * from {{ ref('fact_trips') }} 5 | ) 6 | select 7 | -- Revenue grouping 8 | pickup_zone as revenue_zone, 9 | date_trunc(pickup_datetime, month) as revenue_month, 10 | 11 | service_type, 12 | 13 | -- Revenue calculation 14 | sum(fare_amount) as revenue_monthly_fare, 15 | sum(extra) as revenue_monthly_extra, 16 | sum(mta_tax) as revenue_monthly_mta_tax, 17 | sum(tip_amount) as revenue_monthly_tip_amount, 18 | sum(tolls_amount) as revenue_monthly_tolls_amount, 19 | sum(ehail_fee) as revenue_monthly_ehail_fee, 20 | sum(improvement_surcharge) as revenue_monthly_improvement_surcharge, 21 | sum(total_amount) as revenue_monthly_total_amount, 22 | sum(congestion_surcharge) as revenue_monthly_congestion_surcharge, 23 | 24 | -- Additional calculations 25 | count(tripid) as total_monthly_trips, 26 | avg(passenger_count) as avg_monthly_passenger_count, 27 | avg(trip_distance) as avg_monthly_trip_distance 28 | 29 | from trips_data 30 | group by 1,2,3 -------------------------------------------------------------------------------- /Week 4/dbt_files/models/core/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: dim_zones 5 | description: > 6 | List of unique zones identified by locationid. 7 | Includes the service zone they correspond to (Green or Yellow). 8 | - name: fact_trips 9 | description: > 10 | Taxi trips corresponding to both service zones (Green and Yellow). 11 | The table contains records where both pickup and dropoff locations are valid and known zones. 12 | Each record corresponds to a trip uniquely identified by tripid.
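# Note: the tests declared in this file can be run on their own with dbt's node
# selection, assuming a dbt version that supports the --select flag, e.g.:
#   dbt test --select dim_zones fact_trips dm_monthly_zone_revenue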
13 | 14 | - name: dm_monthly_zone_revenue 15 | description: > 16 | Aggregated table of all taxi trips corresponding to both service zones (Green and Yellow) per pickup zone, month and service. 17 | The table contains monthly sums of the fare elements used to calculate the monthly revenue. 18 | The table also contains monthly indicators such as the number of trips and the average trip distance. 19 | columns: 20 | - name: revenue_monthly_total_amount 21 | description: Monthly sum of the total_amount of the fare charged for the trip per pickup zone, month and service. 22 | tests: 23 | - not_null: 24 | severity: error -------------------------------------------------------------------------------- /Week 1/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.0" 3 | backend "local" {} # Can change from "local" to "gcs" (for google) or "s3" (for aws), if you would like to preserve your tf-state online 4 | required_providers { 5 | google = { 6 | source = "hashicorp/google" 7 | } 8 | } 9 | } 10 | 11 | provider "google" { 12 | project = var.project 13 | region = var.region 14 | // credentials = file(var.credentials) # Use this if you do not want to set env-var GOOGLE_APPLICATION_CREDENTIALS 15 | } 16 | 17 | # Data Lake Bucket 18 | # Ref: https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket 19 | resource "google_storage_bucket" "data-lake-bucket" { 20 | name = "${local.data_lake_bucket}_${var.project}" # Concatenating DL bucket & Project name for unique naming 21 | location = var.region 22 | 23 | # Optional, but recommended settings: 24 | storage_class = var.storage_class 25 | uniform_bucket_level_access = true 26 | 27 | versioning { 28 | enabled = true 29 | } 30 | 31 | lifecycle_rule { 32 | action { 33 | type = "Delete" 34 | } 35 | condition { 36 | age = 30 // days 37 | } 38 | } 39 | 40 | force_destroy = true 41 | } 42 | 43 | # DWH 44 | # Ref: https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/bigquery_dataset 45 | resource "google_bigquery_dataset" "dataset" { 46 | dataset_id = var.BQ_DATASET 47 | project = var.project 48 | location = var.region 49 | } -------------------------------------------------------------------------------- /Week 4/dbt_files/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'ny_taxi_rides' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'default' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `model-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these!
15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | target-path: "target" # directory which will store compiled SQL files 23 | clean-targets: # directories to be removed by `dbt clean` 24 | - "target" 25 | - "dbt_packages" 26 | 27 | 28 | # Configuring models 29 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 30 | 31 | # In this example config, we tell dbt to build all models in the example/ directory 32 | # as tables. These settings can be overridden in the individual model files 33 | # using the `{{ config(...) }}` macro. 34 | models: 35 | ny_taxi_rides: 36 | # Applies to all files under models/example/ 37 | # example: 38 | # materialized: view 39 | 40 | vars: 41 | payment_type_values: [1, 2, 3, 4, 5, 6] 42 | 43 | seeds: 44 | ny_taxi_rides: 45 | taxi_zone_lookup: 46 | +column_types: 47 | locationid: numeric -------------------------------------------------------------------------------- /Week 4/dbt_files/models/core/fact_trips.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | with green_data as ( 4 | select *, 5 | 'Green' as service_type 6 | from {{ ref('stg_green_tripdata') }} 7 | ), 8 | 9 | yellow_data as ( 10 | select *, 11 | 'Yellow' as service_type 12 | from {{ ref('stg_yellow_tripdata') }} 13 | ), 14 | 15 | trips_unioned as ( 16 | select * from green_data 17 | union all 18 | select * from yellow_data 19 | ), 20 | 21 | dim_zones as ( 22 | select * from {{ ref('dim_zones') }} 23 | where borough != 'Unknown' 24 | ) 25 | select 26 | trips_unioned.tripid, 27 | trips_unioned.vendorid, 28 | trips_unioned.service_type, 29 | trips_unioned.ratecodeid, 30 | trips_unioned.pickup_locationid, 31 | pickup_zone.borough as pickup_borough, 32 | pickup_zone.zone as pickup_zone, 33 | trips_unioned.dropoff_locationid, 34 | dropoff_zone.borough as dropoff_borough, 35 | dropoff_zone.zone as dropoff_zone, 36 | trips_unioned.pickup_datetime, 37 | trips_unioned.dropoff_datetime, 38 | trips_unioned.store_and_fwd_flag, 39 | trips_unioned.passenger_count, 40 | trips_unioned.trip_distance, 41 | trips_unioned.trip_type, 42 | trips_unioned.fare_amount, 43 | trips_unioned.extra, 44 | trips_unioned.mta_tax, 45 | trips_unioned.tip_amount, 46 | trips_unioned.tolls_amount, 47 | trips_unioned.ehail_fee, 48 | trips_unioned.improvement_surcharge, 49 | trips_unioned.total_amount, 50 | trips_unioned.payment_type, 51 | trips_unioned.payment_type_description, 52 | trips_unioned.congestion_surcharge 53 | from trips_unioned 54 | inner join dim_zones as pickup_zone 55 | on trips_unioned.pickup_locationid = pickup_zone.locationid 56 | inner join dim_zones as dropoff_zone 57 | on trips_unioned.dropoff_locationid = dropoff_zone.locationid -------------------------------------------------------------------------------- /Week 4/dbt_files/models/staging/stg_green_tripdata.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | with tripdata as 4 | ( 5 | select *, 6 | row_number() over(partition by cast(vendorid as integer), lpep_pickup_datetime) as rn 7 | from {{ source('staging','green_trips') }} 8 | where vendorid is not null 9 | ) 10 | select 11 | -- identifiers 12 | {{ dbt_utils.surrogate_key(['vendorid', 'lpep_pickup_datetime']) }} as tripid, 13 | cast(vendorid as integer) as vendorid, 14 | cast(ratecodeid as 
integer) as ratecodeid, 15 | cast(pulocationid as integer) as pickup_locationid, 16 | cast(dolocationid as integer) as dropoff_locationid, 17 | 18 | -- timestamps 19 | cast(lpep_pickup_datetime as timestamp) as pickup_datetime, 20 | cast(lpep_dropoff_datetime as timestamp) as dropoff_datetime, 21 | 22 | -- trip info 23 | store_and_fwd_flag, 24 | cast(passenger_count as integer) as passenger_count, 25 | cast(trip_distance as numeric) as trip_distance, 26 | cast(trip_type as integer) as trip_type, 27 | 28 | -- payment info 29 | cast(fare_amount as numeric) as fare_amount, 30 | cast(extra as numeric) as extra, 31 | cast(mta_tax as numeric) as mta_tax, 32 | cast(tip_amount as numeric) as tip_amount, 33 | cast(tolls_amount as numeric) as tolls_amount, 34 | cast(ehail_fee as numeric) as ehail_fee, 35 | cast(improvement_surcharge as numeric) as improvement_surcharge, 36 | cast(total_amount as numeric) as total_amount, 37 | cast(payment_type as integer) as payment_type, 38 | {{ get_payment_type_description('payment_type') }} as payment_type_description, 39 | cast(congestion_surcharge as numeric) as congestion_surcharge 40 | from tripdata 41 | where rn = 1 42 | 43 | 44 | -- dbt build --select stg_green_tripdata --vars '{is_test_run: false}' 45 | {% if var('is_test_run', default=true) %} 46 | 47 | limit 100 48 | 49 | {% endif %} -------------------------------------------------------------------------------- /Week 4/dbt_files/models/staging/stg_yellow_tripdata.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | with tripdata as 4 | ( 5 | select *, 6 | row_number() over(partition by cast(vendorid as integer), tpep_pickup_datetime) as rn 7 | from {{ source('staging','yellow_trips') }} 8 | where vendorid is not null 9 | ) 10 | select 11 | -- identifiers 12 | {{ dbt_utils.surrogate_key(['vendorid', 'tpep_pickup_datetime']) }} as tripid, 13 | cast(vendorid as integer) as vendorid, 14 | cast(ratecodeid as integer) as ratecodeid, 15 | cast(pulocationid as integer) as pickup_locationid, 16 | cast(dolocationid as integer) as dropoff_locationid, 17 | 18 | -- timestamps 19 | cast(tpep_pickup_datetime as timestamp) as pickup_datetime, 20 | cast(tpep_dropoff_datetime as timestamp) as dropoff_datetime, 21 | 22 | -- trip info 23 | store_and_fwd_flag, 24 | cast(passenger_count as integer) as passenger_count, 25 | cast(trip_distance as numeric) as trip_distance, 26 | -- yellow cabs are always street-hail 27 | 1 as trip_type, 28 | 29 | -- payment info 30 | cast(fare_amount as numeric) as fare_amount, 31 | cast(extra as numeric) as extra, 32 | cast(mta_tax as numeric) as mta_tax, 33 | cast(tip_amount as numeric) as tip_amount, 34 | cast(tolls_amount as numeric) as tolls_amount, 35 | cast(0 as numeric) as ehail_fee, -- ehail fees do not apply to yellow taxis 36 | cast(improvement_surcharge as numeric) as improvement_surcharge, 37 | cast(total_amount as numeric) as total_amount, 38 | cast(payment_type as integer) as payment_type, 39 | {{ get_payment_type_description('payment_type') }} as payment_type_description, 40 | cast(congestion_surcharge as numeric) as congestion_surcharge 41 | from tripdata 42 | where rn = 1 43 | 44 | -- dbt build --select stg_yellow_tripdata --vars '{is_test_run: false}' 45 | {% if var('is_test_run', default=true) %} 46 | 47 | limit 100 48 | 49 | {% endif %} -------------------------------------------------------------------------------- /Week 2/etl_gcs_to_bq.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | from prefect
import flow, task 4 | from prefect_gcp.cloud_storage import GcsBucket 5 | from prefect_gcp import GcpCredentials 6 | 7 | @task(retries=3) 8 | def extract_from_gcs(color: str, year: int, month: int) -> Path: 9 | """Download trip data from GCS""" 10 | gcs_path = f"data/{color}/{color}_tripdata_{year}-{month:02}.parquet" 11 | gcs_block = GcsBucket.load("zoom-gcs") 12 | gcs_block.get_directory(from_path=gcs_path, local_path=f"./data/") 13 | return Path(f"./data/{gcs_path}") 14 | 15 | @task(log_prints=True) 16 | def transform(path: Path) -> pd.DataFrame: 17 | """Data Cleaning Example""" 18 | df = pd.read_parquet(path) 19 | # print(f"Pre: missing passenger count: {df['passenger_count'].isna().sum()}") 20 | # df["passenger_count"].fillna(0, inplace=True) 21 | # print(f"Post: missing passenger count: {df['passenger_count'].isna().sum()}") 22 | return df 23 | 24 | @task() 25 | def write_bq(df: pd.DataFrame) -> None: 26 | """Writing data into BigQuery""" 27 | gcp_credentials_block = GcpCredentials.load("zoom-gcp-creds") 28 | df.to_gbq(destination_table="yellow_trips.rides", 29 | project_id = "composed-sun-375018", 30 | credentials = gcp_credentials_block.get_credentials_from_service_account(), 31 | chunksize=100000, 32 | if_exists="append") 33 | 34 | @flow(log_prints=True) 35 | def etl_gcs_to_bq(color: str = "yellow", year: int = 2019, months: list[int] = [2,3]): 36 | """Main ETL Flow to load data to Big Query data warehouse""" 37 | for month in months: 38 | path = extract_from_gcs(color, year, month) 39 | df = transform(path) 40 | print(f"Row count: {len(df)}") 41 | write_bq(df) 42 | 43 | if __name__ == '__main__': 44 | color = "yellow" 45 | year = 2019 46 | months = [2,3] 47 | etl_gcs_to_bq(color, year, months) -------------------------------------------------------------------------------- /Week 2/etl_web_to_gcs.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | from prefect import flow, task 4 | from prefect_gcp.cloud_storage import GcsBucket 5 | 6 | 7 | @task(retries=3) 8 | def fetch(dataset_url: str) -> pd.DataFrame: 9 | """Read taxi data from web into a pandas DataFrame""" 10 | df = pd.read_csv(dataset_url) 11 | return df 12 | 13 | @task(log_prints=True) 14 | def clean(df: pd.DataFrame) -> pd.DataFrame: 15 | """Fix Data Type issues""" 16 | df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime']) 17 | df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime']) 18 | print(df.head(2)) 19 | print(f"Columns: {df.dtypes}") 20 | print(f"Rows: {len(df)}") 21 | return df 22 | 23 | @task() 24 | def write_local(df: pd.DataFrame, color: str, dataset_file: str) -> Path: 25 | """Write DataFrame out locally as a parquet file""" 26 | 27 | # Create a folder data/green in the working directory before running this code 28 | path = Path(f"data/{color}/{dataset_file}.parquet") 29 | df.to_parquet(path, compression="gzip") 30 | # Checking to see if the slashes are forward.
The default is backslashes on Windows 31 | print(path.as_posix()) 32 | return path 33 | 34 | 35 | @task() 36 | def write_gcs(path: Path) -> None: 37 | """Upload local parquet file to GCS""" 38 | gcs_block = GcsBucket.load("zoom-gcs") 39 | gcs_block.upload_from_path(from_path=path, to_path=path.as_posix()) # Using as_posix() to convert the slashes to forward 40 | return 41 | 42 | 43 | @flow() 44 | def etl_web_to_gcs() -> None: 45 | """The Main ETL function""" 46 | color = "green" 47 | year = 2020 48 | month = 1 49 | dataset_file = f"{color}_tripdata_{year}-{month:02}" 50 | dataset_url = f"https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{color}/{dataset_file}.csv.gz" 51 | 52 | df = fetch(dataset_url) 53 | df_clean = clean(df) 54 | path = write_local(df_clean, color, dataset_file) 55 | write_gcs(path) 56 | 57 | 58 | if __name__ == '__main__': 59 | etl_web_to_gcs() 60 | -------------------------------------------------------------------------------- /Week 1/ingest_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from time import time 3 | from sqlalchemy import create_engine 4 | import argparse 5 | 6 | 7 | def main(params): 8 | user = params.user 9 | password = params.password 10 | host = params.host 11 | port = params.port 12 | db = params.db 13 | table_name = params.table_name[0] 14 | table_name1 = params.table_name[1] 15 | url = params.url[0] 16 | url1 = params.url[1] 17 | 18 | engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{db}") 19 | 20 | df_iter = pd.read_csv(url, iterator = True, chunksize = 100000) 21 | df = next(df_iter) 22 | 23 | df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime) 24 | df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime) 25 | 26 | df.head(n=0).to_sql(name = table_name, con = engine, if_exists='replace') 27 | df.to_sql(name = table_name, con = engine, if_exists='append') 28 | 29 | for df in df_iter: 30 | t_start = time() 31 | 32 | df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime) 33 | df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime) 34 | 35 | df.to_sql(name = table_name, con = engine, if_exists='append') 36 | 37 | t_end = time() 38 | 39 | print(f'Inserted a new chunk.
Time taken (in s) - {round(t_end - t_start, 2)}') 40 | 41 | taxi_zone = pd.read_csv(url1) 42 | 43 | taxi_zone.to_sql(name = table_name1, con = engine, if_exists='replace') 44 | 45 | if __name__ == '__main__': 46 | parser = argparse.ArgumentParser(description = "Ingest CSV data to Postgres") 47 | 48 | # user 49 | # password 50 | # host 51 | # port 52 | # database name 53 | # table name 54 | # url of the csv 55 | 56 | parser.add_argument('--user', help="user name for postgres") 57 | parser.add_argument('--password', help="password for postgres") 58 | parser.add_argument('--host', help="host for postgres") 59 | parser.add_argument('--port', help="port for postgres") 60 | parser.add_argument('--db', help="database name for postgres") 61 | parser.add_argument('--table_name', nargs = 2, help="name of the table where we will write the results to") 62 | parser.add_argument('--url', nargs = 2, help="url of the CSV") 63 | 64 | args = parser.parse_args() 65 | 66 | main(args) -------------------------------------------------------------------------------- /Week 2/ingest_data_flow.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | import pandas as pd 3 | from sqlalchemy import create_engine 4 | from prefect import flow, task 5 | from prefect.tasks import task_input_hash 6 | from datetime import timedelta 7 | from prefect_sqlalchemy import SqlAlchemyConnector 8 | 9 | 10 | @task(log_prints=True, tags=["extract"], cache_key_fn=task_input_hash, cache_expiration=timedelta(days=1)) 11 | def extract_data(url: str): 12 | df_iter = pd.read_csv(url, iterator=True, chunksize=100000) 13 | 14 | df = next(df_iter) 15 | 16 | df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime) 17 | df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime) 18 | 19 | return df 20 | 21 | @task(log_prints=True) 22 | def transform_data(df): 23 | print(f"pre: missing passenger count: {df['passenger_count'].isin([0]).sum()}") 24 | df = df[df['passenger_count'] != 0] 25 | print(f"post: missing passenger count: {df['passenger_count'].isin([0]).sum()}") 26 | return df 27 | 28 | @task(log_prints=True, retries=3) 29 | def load_data(table_name, df): 30 | 31 | # engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{db}") 32 | # df.head(n=0).to_sql(name = table_name, con = engine, if_exists='replace') 33 | # df.to_sql(name = table_name, con = engine, if_exists='append') 34 | 35 | connection_block = SqlAlchemyConnector.load("postgres-connector") 36 | with connection_block.get_connection(begin=False) as engine: 37 | df.head(n=0).to_sql(name=table_name, con=engine, if_exists='replace') 38 | df.to_sql(name=table_name, con=engine, if_exists='append') 39 | 40 | @flow(name="Subflow", log_prints=True) 41 | def log_subflow(table_name: str): 42 | print(f"Logging Subflow for: {table_name}") 43 | 44 | @flow(name="Ingest Data") 45 | def main_flow(table_name: str = "green_taxi_trips"): 46 | # user = "root" 47 | # password = "root" 48 | # host = "localhost" 49 | # port = "5432" 50 | # db = "ny_taxi" 51 | # table_name = "green_taxi_trips" 52 | 53 | csv_url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-01.csv.gz" 54 | log_subflow(table_name) 55 | raw_data = extract_data(csv_url) 56 | data = transform_data(raw_data) 57 | # load_data(user, password, host, port, db, table_name, data) 58 | load_data(table_name, data) 59 | 60 | if __name__ == '__main__': 61 | main_flow(table_name = "green_trips") 62 | 63 | 64 | 65 | 66 | 
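# The SqlAlchemyConnector block named "postgres-connector" loaded in load_data()
# must be registered with Prefect before this flow runs. A minimal sketch of one
# way to create it, assuming the root/root ny_taxi Postgres from the Week 1
# docker-compose file (the block may equally have been created via the Prefect UI):
#
#     from prefect_sqlalchemy import SqlAlchemyConnector, ConnectionComponents, SyncDriver
#
#     connector = SqlAlchemyConnector(
#         connection_info=ConnectionComponents(
#             driver=SyncDriver.POSTGRESQL_PSYCOPG2,
#             username="root",
#             password="root",
#             host="localhost",
#             port=5432,
#             database="ny_taxi",
#         )
#     )
#     connector.save("postgres-connector", overwrite=True)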
67 | -------------------------------------------------------------------------------- /Week 4/web_to_gcs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import pandas as pd 4 | import pyarrow as pa 5 | import pyarrow.parquet as pq 6 | import gzip 7 | from google.cloud import storage 8 | 9 | # Set Google Cloud Storage bucket name 10 | BUCKET_NAME = "bucket_name" # Enter your bucket name here 11 | 12 | # Set local directory for downloaded files 13 | LOCAL_DIR = "data" 14 | 15 | # Create the local directory if it doesn't exist 16 | if not os.path.exists(LOCAL_DIR): 17 | os.makedirs(LOCAL_DIR) 18 | 19 | # Define function to upload file to Google Cloud Storage 20 | def upload_to_gcs(bucket_name, blob_name, file_path): 21 | storage_client = storage.Client() 22 | bucket = storage_client.get_bucket(bucket_name) 23 | blob = bucket.blob(blob_name) 24 | blob.upload_from_filename(file_path) 25 | print(f"File {file_path} uploaded to gs://{bucket_name}/{blob_name}") 26 | 27 | 28 | # Define function to download a file from GitHub to local disk and then call the upload_to_gcs function 29 | def web_to_gcs(service, year): 30 | 31 | for month in range(1, 13): 32 | base_url = f"https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{service}/" 33 | file_name = f"{service}_tripdata_{year}-{month:02d}.csv.gz" 34 | url = base_url + file_name 35 | file_path = os.path.join(LOCAL_DIR, file_name) 36 | r = requests.get(url, allow_redirects=True) 37 | with open(file_path, "wb") as f_out: f_out.write(r.content) 38 | 39 | # Check if the downloaded file is a valid gzip file 40 | try: 41 | with gzip.open(file_path) as f: 42 | pass 43 | except Exception as e: 44 | print(f"Error: {e}. {file_name} may be corrupted.") 45 | os.remove(file_path) 46 | continue 47 | 48 | # Convert CSV file to Parquet 49 | csv = pd.read_csv(file_path) 50 | parquet_file_name = file_name.replace(".csv.gz", ".parquet") 51 | parquet_file_path = os.path.join(LOCAL_DIR, parquet_file_name) 52 | table = pa.Table.from_pandas(csv) 53 | pq.write_table(table, parquet_file_path) 54 | 55 | # Upload Parquet file to Google Cloud Storage 56 | upload_to_gcs(BUCKET_NAME, f"{service}/{parquet_file_name}", parquet_file_path) 57 | 58 | # Delete local CSV and Parquet files 59 | os.remove(file_path) 60 | os.remove(parquet_file_path) 61 | 62 | 63 | web_to_gcs("fhv", 2019) 64 | web_to_gcs("green", 2019) 65 | web_to_gcs("green", 2020) 66 | web_to_gcs("yellow", 2019) 67 | web_to_gcs("yellow", 2020) 68 | 69 | -------------------------------------------------------------------------------- /Week 2/parameterized_flow.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | from prefect import flow, task 4 | from prefect.tasks import task_input_hash 5 | from datetime import timedelta 6 | from prefect_gcp.cloud_storage import GcsBucket 7 | 8 | 9 | @task(log_prints=True, retries=3, cache_key_fn=task_input_hash, cache_expiration=timedelta(days=1)) 10 | def fetch(dataset_url: str) -> pd.DataFrame: 11 | """Read taxi data from web into a pandas DataFrame""" 12 | df = pd.read_csv(dataset_url) 13 | return df 14 | 15 | @task(log_prints=True) 16 | def clean(df: pd.DataFrame) -> pd.DataFrame: 17 | """Fix Data Type issues""" 18 | df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime']) 19 | df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime']) 20 | print(df.head(2)) 21 | print(f"Columns: {df.dtypes}") 22 | print(f"Rows:
{len(df)}") 23 | return df 24 | 25 | @task(log_prints=True) 26 | def write_local(df: pd.DataFrame, color: str, dataset_file: str) -> Path: 27 | """Write DataFrame out locally as a parquet file""" 28 | 29 | # Create a folder data/green in the working directory before running this code 30 | path = Path(f"data/{color}/{dataset_file}.parquet") 31 | df.to_parquet(path, compression="gzip") 32 | # Checking to see if the slashes are forward. Default is backwards 33 | print(path.as_posix()) 34 | return path 35 | 36 | 37 | @task(log_prints=True) 38 | def write_gcs(path: Path) -> None: 39 | """Upload local parquet file to GCS""" 40 | gcs_block = GcsBucket.load("zoom-gcs") 41 | gcs_block.upload_from_path(from_path=path, to_path=path.as_posix()) # Using as_posix() to convert the slashes to forward 42 | return 43 | 44 | 45 | @flow(log_prints=True) 46 | def etl_web_to_gcs(year: int, month: int, color: str) -> None: 47 | """The Main ETL function""" 48 | dataset_file = f"{color}_tripdata_{year}-{month:02}" 49 | dataset_url = f"https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{color}/{dataset_file}.csv.gz" 50 | 51 | df = fetch(dataset_url) 52 | df_clean = clean(df) 53 | path = write_local(df_clean, color, dataset_file) 54 | write_gcs(path) 55 | 56 | @flow(log_prints=True) 57 | def etl_parent_flow(months: list[int] = [1,2], year: int = 2019, color: str = "green"): 58 | for month in months: 59 | etl_web_to_gcs(year, month, color) 60 | 61 | if __name__ == '__main__': 62 | color = "yellow" 63 | months = [2,3] 64 | year = 2019 65 | etl_parent_flow(months, year, color) 66 | -------------------------------------------------------------------------------- /Week 5/Code/09_Spark_SQL.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import argparse 5 | 6 | import pyspark 7 | from pyspark.sql import SparkSession 8 | from pyspark.sql import functions as F 9 | 10 | 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument('--input_green', required=True) 14 | parser.add_argument('--input_yellow', required=True) 15 | parser.add_argument('--output', required=True) 16 | 17 | args = parser.parse_args() 18 | 19 | input_green = args.input_green 20 | input_yellow = args.input_yellow 21 | output = args.output 22 | 23 | 24 | spark = SparkSession.builder \ 25 | .appName('test') \ 26 | .getOrCreate() 27 | 28 | df_green = spark.read.parquet(input_green) 29 | 30 | df_green = df_green \ 31 | .withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime') \ 32 | .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime') 33 | 34 | df_yellow = spark.read.parquet(input_yellow) 35 | 36 | 37 | df_yellow = df_yellow \ 38 | .withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime') \ 39 | .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime') 40 | 41 | 42 | common_colums = [ 43 | 'VendorID', 44 | 'pickup_datetime', 45 | 'dropoff_datetime', 46 | 'store_and_fwd_flag', 47 | 'RatecodeID', 48 | 'PULocationID', 49 | 'DOLocationID', 50 | 'passenger_count', 51 | 'trip_distance', 52 | 'fare_amount', 53 | 'extra', 54 | 'mta_tax', 55 | 'tip_amount', 56 | 'tolls_amount', 57 | 'improvement_surcharge', 58 | 'total_amount', 59 | 'payment_type', 60 | 'congestion_surcharge' 61 | ] 62 | 63 | 64 | 65 | df_green_sel = df_green \ 66 | .select(common_colums) \ 67 | .withColumn('service_type', F.lit('green')) 68 | 69 | df_yellow_sel = df_yellow \ 70 | .select(common_colums) \ 71 | .withColumn('service_type', F.lit('yellow')) 72 | 73 | 74 
| df_trips_data = df_green_sel.unionAll(df_yellow_sel) 75 | 76 | df_trips_data.createOrReplaceTempView('trips_data') 77 | 78 | 79 | df_result = spark.sql(""" 80 | SELECT 81 | -- Revenue grouping 82 | PULocationID AS revenue_zone, 83 | date_trunc('month', pickup_datetime) AS revenue_month, 84 | service_type, 85 | -- Revenue calculation 86 | SUM(fare_amount) AS revenue_monthly_fare, 87 | SUM(extra) AS revenue_monthly_extra, 88 | SUM(mta_tax) AS revenue_monthly_mta_tax, 89 | SUM(tip_amount) AS revenue_monthly_tip_amount, 90 | SUM(tolls_amount) AS revenue_monthly_tolls_amount, 91 | SUM(improvement_surcharge) AS revenue_monthly_improvement_surcharge, 92 | SUM(total_amount) AS revenue_monthly_total_amount, 93 | SUM(congestion_surcharge) AS revenue_monthly_congestion_surcharge, 94 | -- Additional calculations 95 | AVG(passenger_count) AS avg_monthly_passenger_count, 96 | AVG(trip_distance) AS avg_monthly_trip_distance 97 | FROM 98 | trips_data 99 | GROUP BY 100 | 1, 2, 3 101 | """) 102 | 103 | 104 | df_result.coalesce(1) \ 105 | .write.parquet(output, mode='overwrite') -------------------------------------------------------------------------------- /Week 5/Code/12_Spark_SQL_BQ.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import argparse 5 | 6 | import pyspark 7 | from pyspark.sql import SparkSession 8 | from pyspark.sql import functions as F 9 | 10 | 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument('--input_green', required=True) 14 | parser.add_argument('--input_yellow', required=True) 15 | parser.add_argument('--output', required=True) 16 | 17 | args = parser.parse_args() 18 | 19 | input_green = args.input_green 20 | input_yellow = args.input_yellow 21 | output = args.output 22 | 23 | 24 | spark = SparkSession.builder \ 25 | .appName('test') \ 26 | .getOrCreate() 27 | 28 | spark.conf.set('temporaryGcsBucket', 'dataproc-temp-europe-west6-828225226997-fckhkym8') 29 | 30 | df_green = spark.read.parquet(input_green) 31 | 32 | df_green = df_green \ 33 | .withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime') \ 34 | .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime') 35 | 36 | df_yellow = spark.read.parquet(input_yellow) 37 | 38 | 39 | df_yellow = df_yellow \ 40 | .withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime') \ 41 | .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime') 42 | 43 | 44 | common_columns = [ 45 | 'VendorID', 46 | 'pickup_datetime', 47 | 'dropoff_datetime', 48 | 'store_and_fwd_flag', 49 | 'RatecodeID', 50 | 'PULocationID', 51 | 'DOLocationID', 52 | 'passenger_count', 53 | 'trip_distance', 54 | 'fare_amount', 55 | 'extra', 56 | 'mta_tax', 57 | 'tip_amount', 58 | 'tolls_amount', 59 | 'improvement_surcharge', 60 | 'total_amount', 61 | 'payment_type', 62 | 'congestion_surcharge' 63 | ] 64 | 65 | 66 | 67 | df_green_sel = df_green \ 68 | .select(common_columns) \ 69 | .withColumn('service_type', F.lit('green')) 70 | 71 | df_yellow_sel = df_yellow \ 72 | .select(common_columns) \ 73 | .withColumn('service_type', F.lit('yellow')) 74 | 75 | 76 | df_trips_data = df_green_sel.unionAll(df_yellow_sel) 77 | 78 | df_trips_data.createOrReplaceTempView('trips_data') 79 | 80 | 81 | df_result = spark.sql(""" 82 | SELECT 83 | -- Revenue grouping 84 | PULocationID AS revenue_zone, 85 | date_trunc('month', pickup_datetime) AS revenue_month, 86 | service_type, 87 | -- Revenue calculation 88 | SUM(fare_amount) AS revenue_monthly_fare, 89 | SUM(extra) AS
revenue_monthly_extra, 90 | SUM(mta_tax) AS revenue_monthly_mta_tax, 91 | SUM(tip_amount) AS revenue_monthly_tip_amount, 92 | SUM(tolls_amount) AS revenue_monthly_tolls_amount, 93 | SUM(improvement_surcharge) AS revenue_monthly_improvement_surcharge, 94 | SUM(total_amount) AS revenue_monthly_total_amount, 95 | SUM(congestion_surcharge) AS revenue_monthly_congestion_surcharge, 96 | -- Additional calculations 97 | AVG(passenger_count) AS avg_monthly_passenger_count, 98 | AVG(trip_distance) AS avg_monthly_trip_distance 99 | FROM 100 | trips_data 101 | GROUP BY 102 | 1, 2, 3 103 | """) 104 | 105 | 106 | df_result.write.format('bigquery') \ 107 | .option('table', output) \ 108 | .save() -------------------------------------------------------------------------------- /Week 1/Homework Answers: -------------------------------------------------------------------------------- 1 | Docker & SQL 2 | 3 | --1) 4 | 5 | docker build --help 6 | 7 | --2) 8 | 9 | winpty docker run -it python:3.9 bash 10 | pip list 11 | 12 | --3) 13 | 14 | SELECT COUNT(*) 15 | FROM GREEN_TAXI_DATA 16 | WHERE DATE(LPEP_PICKUP_DATETIME) = '2019-01-15' AND DATE(LPEP_DROPOFF_DATETIME) = '2019-01-15'; 17 | 18 | --4) 19 | 20 | SELECT DISTINCT DATE(LPEP_PICKUP_DATETIME) 21 | FROM GREEN_TAXI_DATA 22 | WHERE TRIP_DISTANCE = (SELECT MAX(TRIP_DISTANCE) FROM GREEN_TAXI_DATA); 23 | 24 | --5) 25 | 26 | SELECT PASSENGER_COUNT, COUNT(*) 27 | FROM GREEN_TAXI_DATA 28 | WHERE PASSENGER_COUNT IN (2,3) AND DATE(LPEP_PICKUP_DATETIME) = '2019-01-01' 29 | GROUP BY 1; 30 | 31 | --6) 32 | 33 | WITH LOC AS 34 | ( 35 | SELECT DISTINCT "LocationID" FROM TAXI_ZONE 36 | WHERE "Zone" = 'Astoria' 37 | ), TIP AS 38 | ( 39 | SELECT MAX(TIP_AMOUNT) FROM GREEN_TAXI_DATA 40 | WHERE "PULocationID" = (SELECT * FROM LOC) 41 | ), LOC_ID AS 42 | ( 43 | SELECT DISTINCT "DOLocationID" 44 | FROM GREEN_TAXI_DATA 45 | WHERE "PULocationID" = (SELECT * FROM LOC) AND TIP_AMOUNT = (SELECT * FROM TIP) 46 | ) 47 | 48 | SELECT DISTINCT "Zone" FROM TAXI_ZONE 49 | WHERE "LocationID" = (SELECT * FROM LOC_ID); 50 | 51 | 52 | Terraform 53 | 54 | terraform apply 55 | 56 | Terraform used the selected providers to generate the following execution
Resource actions are indicated with the following symbols: 58 | + create 59 | 60 | Terraform will perform the following actions: 61 | 62 | # google_bigquery_dataset.dataset will be created 63 | + resource "google_bigquery_dataset" "dataset" { 64 | + creation_time = (known after apply) 65 | + dataset_id = "trips_data_all" 66 | + delete_contents_on_destroy = false 67 | + etag = (known after apply) 68 | + id = (known after apply) 69 | + labels = (known after apply) 70 | + last_modified_time = (known after apply) 71 | + location = "europe-west6" 72 | + project = "composed-sun-375018" 73 | + self_link = (known after apply) 74 | 75 | + access { 76 | + domain = (known after apply) 77 | + group_by_email = (known after apply) 78 | + role = (known after apply) 79 | + special_group = (known after apply) 80 | + user_by_email = (known after apply) 81 | 82 | + dataset { 83 | + target_types = (known after apply) 84 | 85 | + dataset { 86 | + dataset_id = (known after apply) 87 | + project_id = (known after apply) 88 | } 89 | } 90 | 91 | + routine { 92 | + dataset_id = (known after apply) 93 | + project_id = (known after apply) 94 | + routine_id = (known after apply) 95 | } 96 | 97 | + view { 98 | + dataset_id = (known after apply) 99 | + project_id = (known after apply) 100 | + table_id = (known after apply) 101 | } 102 | } 103 | } 104 | 105 | # google_storage_bucket.data-lake-bucket will be created 106 | + resource "google_storage_bucket" "data-lake-bucket" { 107 | + force_destroy = true 108 | + id = (known after apply) 109 | + location = "EUROPE-WEST6" 110 | + name = "dtc_data_lake_composed-sun-375018" 111 | + project = (known after apply) 112 | + public_access_prevention = (known after apply) 113 | + self_link = (known after apply) 114 | + storage_class = "STANDARD" 115 | + uniform_bucket_level_access = true 116 | + url = (known after apply) 117 | 118 | + lifecycle_rule { 119 | + action { 120 | + type = "Delete" 121 | } 122 | 123 | + condition { 124 | + age = 30 125 | + matches_prefix = [] 126 | + matches_storage_class = [] 127 | + matches_suffix = [] 128 | + with_state = (known after apply) 129 | } 130 | } 131 | 132 | + versioning { 133 | + enabled = true 134 | } 135 | 136 | + website { 137 | + main_page_suffix = (known after apply) 138 | + not_found_page = (known after apply) 139 | } 140 | } 141 | 142 | Plan: 2 to add, 0 to change, 0 to destroy. 143 | 144 | Do you want to perform these actions? 145 | Terraform will perform the actions described above. 146 | Only 'yes' will be accepted to approve. 147 | 148 | Enter a value: yes 149 | 150 | google_bigquery_dataset.dataset: Creating... 151 | google_storage_bucket.data-lake-bucket: Creating... 152 | google_storage_bucket.data-lake-bucket: Creation complete after 2s [id=dtc_data_lake_composed-sun-375018] 153 | google_bigquery_dataset.dataset: Creation complete after 3s [id=projects/composed-sun-375018/datasets/trips_data_all] 154 | 155 | Apply complete! Resources: 2 added, 0 changed, 0 destroyed. 
156 | -------------------------------------------------------------------------------- /Week 4/dbt_files/models/staging/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: staging 5 | 6 | database: composed-sun-375018 7 | 8 | schema: trips_data_all 9 | 10 | # loaded_at_field: record_loaded_at 11 | tables: 12 | - name: green_trips 13 | - name: yellow_trips 14 | # freshness: 15 | # error_after: {count: 6, period: hour} 16 | 17 | models: 18 | - name: stg_green_tripdata 19 | description: > 20 | Trips made by green taxis, also known as boro taxis and street-hail liveries. 21 | Green taxis may respond to street hails, but only in the areas indicated in green on the 22 | map (i.e. above W 110 St/E 96th St in Manhattan and in the boroughs). 23 | The records were collected and provided to the NYC Taxi and Limousine Commission (TLC) by 24 | technology service providers. 25 | columns: 26 | - name: tripid 27 | description: Primary key for this table, generated with a concatenation of vendorid+pickup_datetime 28 | tests: 29 | - unique: 30 | severity: warn 31 | - not_null: 32 | severity: warn 33 | - name: VendorID 34 | description: > 35 | A code indicating the TPEP provider that provided the record. 36 | 1= Creative Mobile Technologies, LLC; 37 | 2= VeriFone Inc. 38 | - name: pickup_datetime 39 | description: The date and time when the meter was engaged. 40 | - name: dropoff_datetime 41 | description: The date and time when the meter was disengaged. 42 | - name: Passenger_count 43 | description: The number of passengers in the vehicle. This is a driver-entered value. 44 | - name: Trip_distance 45 | description: The elapsed trip distance in miles reported by the taximeter. 46 | - name: Pickup_locationid 47 | description: locationid where the meter was engaged. 48 | tests: 49 | - relationships: 50 | to: ref('taxi_zone_lookup') 51 | field: locationid 52 | severity: warn 53 | - name: dropoff_locationid 54 | description: locationid where the meter was disengaged. 55 | tests: 56 | - relationships: 57 | to: ref('taxi_zone_lookup') 58 | field: locationid 59 | - name: RateCodeID 60 | description: > 61 | The final rate code in effect at the end of the trip. 62 | 1= Standard rate 63 | 2=JFK 64 | 3=Newark 65 | 4=Nassau or Westchester 66 | 5=Negotiated fare 67 | 6=Group ride 68 | - name: Store_and_fwd_flag 69 | description: > 70 | This flag indicates whether the trip record was held in vehicle 71 | memory before sending to the vendor, aka “store and forward,” 72 | because the vehicle did not have a connection to the server. 73 | Y= store and forward trip 74 | N= not a store and forward trip 75 | - name: Dropoff_longitude 76 | description: Longitude where the meter was disengaged. 77 | - name: Dropoff_latitude 78 | description: Latitude where the meter was disengaged. 79 | - name: Payment_type 80 | description: > 81 | A numeric code signifying how the passenger paid for the trip. 82 | tests: 83 | - accepted_values: 84 | values: "{{ var('payment_type_values') }}" 85 | severity: warn 86 | quote: false 87 | - name: payment_type_description 88 | description: Description of the payment_type code 89 | - name: Fare_amount 90 | description: > 91 | The time-and-distance fare calculated by the meter. 92 | Extra Miscellaneous extras and surcharges. Currently, this only includes 93 | the $0.50 and $1 rush hour and overnight charges. 94 | MTA_tax $0.50 MTA tax that is automatically triggered based on the metered 95 | rate in use.
96 | - name: Improvement_surcharge 97 | description: > 98 | $0.30 improvement surcharge assessed trips at the flag drop. The 99 | improvement surcharge began being levied in 2015. 100 | - name: Tip_amount 101 | description: > 102 | Tip amount. This field is automatically populated for credit card 103 | tips. Cash tips are not included. 104 | - name: Tolls_amount 105 | description: Total amount of all tolls paid in trip. 106 | - name: Total_amount 107 | description: The total amount charged to passengers. Does not include cash tips. 108 | 109 | - name: stg_yellow_tripdata 110 | description: > 111 | Trips made by New York City's iconic yellow taxis. 112 | Yellow taxis are the only vehicles permitted to respond to a street hail from a passenger in all five 113 | boroughs. They may also be hailed using an e-hail app like Curb or Arro. 114 | The records were collected and provided to the NYC Taxi and Limousine Commission (TLC) by 115 | technology service providers. 116 | columns: 117 | - name: tripid 118 | description: Primary key for this table, generated with a concatenation of vendorid+pickup_datetime 119 | tests: 120 | - unique: 121 | severity: warn 122 | - not_null: 123 | severity: warn 124 | - name: VendorID 125 | description: > 126 | A code indicating the TPEP provider that provided the record. 127 | 1= Creative Mobile Technologies, LLC; 128 | 2= VeriFone Inc. 129 | - name: pickup_datetime 130 | description: The date and time when the meter was engaged. 131 | - name: dropoff_datetime 132 | description: The date and time when the meter was disengaged. 133 | - name: Passenger_count 134 | description: The number of passengers in the vehicle. This is a driver-entered value. 135 | - name: Trip_distance 136 | description: The elapsed trip distance in miles reported by the taximeter. 137 | - name: Pickup_locationid 138 | description: locationid where the meter was engaged. 139 | tests: 140 | - relationships: 141 | to: ref('taxi_zone_lookup') 142 | field: locationid 143 | severity: warn 144 | - name: dropoff_locationid 145 | description: locationid where the meter was disengaged. 146 | tests: 147 | - relationships: 148 | to: ref('taxi_zone_lookup') 149 | field: locationid 150 | severity: warn 151 | - name: RateCodeID 152 | description: > 153 | The final rate code in effect at the end of the trip. 154 | 1= Standard rate 155 | 2=JFK 156 | 3=Newark 157 | 4=Nassau or Westchester 158 | 5=Negotiated fare 159 | 6=Group ride 160 | - name: Store_and_fwd_flag 161 | description: > 162 | This flag indicates whether the trip record was held in vehicle 163 | memory before sending to the vendor, aka “store and forward,” 164 | because the vehicle did not have a connection to the server. 165 | Y= store and forward trip 166 | N= not a store and forward trip 167 | - name: Dropoff_longitude 168 | description: Longitude where the meter was disengaged. 169 | - name: Dropoff_latitude 170 | description: Latitude where the meter was disengaged. 171 | - name: Payment_type 172 | description: > 173 | A numeric code signifying how the passenger paid for the trip. 174 | tests: 175 | - accepted_values: 176 | values: "{{ var('payment_type_values') }}" 177 | severity: warn 178 | quote: false 179 | - name: payment_type_description 180 | description: Description of the payment_type code 181 | - name: Fare_amount 182 | description: > 183 | The time-and-distance fare calculated by the meter. 184 | Extra Miscellaneous extras and surcharges. Currently, this only includes 185 | the $0.50 and $1 rush hour and overnight charges.
186 | MTA_tax $0.50 MTA tax that is automatically triggered based on the metered 187 | rate in use. 188 | - name: Improvement_surcharge 189 | description: > 190 | $0.30 improvement surcharge assessed trips at the flag drop. The 191 | improvement surcharge began being levied in 2015. 192 | - name: Tip_amount 193 | description: > 194 | Tip amount. This field is automatically populated for credit card 195 | tips. Cash tips are not included. 196 | - name: Tolls_amount 197 | description: Total amount of all tolls paid in trip. 198 | - name: Total_amount 199 | description: The total amount charged to passengers. Does not include cash tips. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Week 5/Code/05 Spark Join and GroupBy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5e43dfad", 6 | "metadata": {}, 7 | "source": [ 8 | "## Section I\n", 9 | "\n", 10 | "Group By" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "b5d349d0", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pyspark\n", 21 | "from pyspark.sql import SparkSession\n", 22 | "\n", 23 | "spark = SparkSession.builder \\\n", 24 | " .master(\"local[*]\") \\\n", 25 | " .appName('test') \\\n", 26 | " .getOrCreate()" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "id": "44762dd8", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "df_green = spark.read.parquet('D:/data/pq/green/*/*')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "id": "4f81ad58", 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stderr", 47 | "output_type": "stream", 48 | "text": [ 49 | "C:\\Users\\balaj\\anaconda3\\lib\\site-packages\\pyspark\\sql\\dataframe.py:138: FutureWarning: Deprecated in 2.0, use createOrReplaceTempView instead.\n", 50 | " warnings.warn(\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "df_green.registerTempTable('green')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "id": "1800625a", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "df_green_revenue = spark.sql(\"\"\"\n", 66 | "SELECT \n", 67 | " date_trunc('hour', lpep_pickup_datetime) AS hour, \n", 68 | " PULocationID AS zone,\n", 69 | "\n", 70 | " SUM(total_amount) AS amount,\n", 71 | " COUNT(1) AS number_records\n", 72 | "FROM\n", 73 | " green\n", 74 | "WHERE\n", 75 | " lpep_pickup_datetime >= '2020-01-01 00:00:00'\n", 76 | "GROUP BY\n", 77 | " 1, 2\n", 78 | "ORDER BY \n", 79 | " 1, 2\n", 80 | "\"\"\")" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 7, 86 | "id": "371d744a", 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "+-------------------+----+------------------+--------------+\n", 94 | "| hour|zone| amount|number_records|\n", 95 | "+-------------------+----+------------------+--------------+\n", 96 | "|2020-01-01 00:00:00| 7| 769.7299999999997| 45|\n", 97 | "|2020-01-01 00:00:00| 17|195.03000000000006| 9|\n", 98 | "|2020-01-01 00:00:00| 18| 7.8| 1|\n", 99 | "|2020-01-01 00:00:00| 22| 15.8| 1|\n", 100 | "|2020-01-01 00:00:00| 24| 87.6| 3|\n", 101 | "|2020-01-01 00:00:00| 25| 531.0000000000001| 26|\n", 102 | "|2020-01-01 00:00:00| 29| 61.3| 1|\n", 103 | "|2020-01-01 00:00:00| 32| 68.94999999999999| 2|\n", 104 | "|2020-01-01 00:00:00| 33|317.27000000000004| 11|\n", 105 | "|2020-01-01 00:00:00| 35| 129.96| 5|\n", 106 | "|2020-01-01 00:00:00| 36| 295.34| 11|\n", 107 | "|2020-01-01 00:00:00| 37| 175.67| 6|\n", 108 | "|2020-01-01 00:00:00| 38| 98.78999999999999| 2|\n", 109 | "|2020-01-01 00:00:00| 40| 168.98| 8|\n", 110 | "|2020-01-01 00:00:00| 41|1363.9599999999987| 84|\n", 111 | "|2020-01-01 00:00:00| 42| 799.7599999999996| 52|\n", 112 | "|2020-01-01 00:00:00| 43| 107.52| 6|\n", 113 | "|2020-01-01 00:00:00| 47| 13.3| 1|\n", 114 | "|2020-01-01 00:00:00| 49| 266.7600000000001| 14|\n", 115 | "|2020-01-01 00:00:00| 51| 17.8| 2|\n", 116 | 
"+-------------------+----+------------------+--------------+\n", 117 | "only showing top 20 rows\n", 118 | "\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "df_green_revenue.show()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 13, 129 | "id": "f8f868b0", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "df_green_revenue \\\n", 134 | " .repartition(20) \\\n", 135 | " .write.parquet('D:data/report/revenue/green', mode='overwrite')" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 14, 141 | "id": "8ca7e6b4", 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "df_yellow = spark.read.parquet('D:/data/pq/yellow/*/*')\n", 146 | "df_yellow.registerTempTable('yellow')" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 15, 152 | "id": "ee29e4f4", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "df_yellow_revenue = spark.sql(\"\"\"\n", 157 | "SELECT \n", 158 | " date_trunc('hour', tpep_pickup_datetime) AS hour, \n", 159 | " PULocationID AS zone,\n", 160 | "\n", 161 | " SUM(total_amount) AS amount,\n", 162 | " COUNT(1) AS number_records\n", 163 | "FROM\n", 164 | " yellow\n", 165 | "WHERE\n", 166 | " tpep_pickup_datetime >= '2020-01-01 00:00:00'\n", 167 | "GROUP BY\n", 168 | " 1, 2\n", 169 | "\"\"\")" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 16, 175 | "id": "2148f001", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "df_yellow_revenue \\\n", 180 | " .repartition(20) \\\n", 181 | " .write.parquet('D:/data/report/revenue/yellow', mode='overwrite')" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "id": "15ba0334", 187 | "metadata": {}, 188 | "source": [ 189 | "## Section II\n", 190 | "\n", 191 | "Joins" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "id": "9b2b3060", 197 | "metadata": {}, 198 | "source": [ 199 | "Type 1 - Tables of equal/similar size" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 17, 205 | "id": "8ffae439", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "df_green_revenue = spark.read.parquet('D:/data/report/revenue/green')\n", 210 | "df_yellow_revenue = spark.read.parquet('D:/data/report/revenue/yellow')" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 18, 216 | "id": "4ff23cd0", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "df_green_revenue_tmp = df_green_revenue \\\n", 221 | " .withColumnRenamed('amount', 'green_amount') \\\n", 222 | " .withColumnRenamed('number_records', 'green_number_records')\n", 223 | "\n", 224 | "df_yellow_revenue_tmp = df_yellow_revenue \\\n", 225 | " .withColumnRenamed('amount', 'yellow_amount') \\\n", 226 | " .withColumnRenamed('number_records', 'yellow_number_records')" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 20, 232 | "id": "98ac3e6a", 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "df_join = df_green_revenue_tmp.join(df_yellow_revenue_tmp, on=['hour', 'zone'], how='outer')" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 21, 242 | "id": "a0fadc51", 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "df_join.write.parquet('D:/data/report/revenue/total', mode='overwrite')" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 22, 252 | "id": "31cb79f0", 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 
256 | "df_join = spark.read.parquet('D:/data/report/revenue/total')" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 23, 262 | "id": "c2edcaec", 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "data": { 267 | "text/plain": [ 268 | "DataFrame[hour: timestamp, zone: int, green_amount: double, green_number_records: bigint, yellow_amount: double, yellow_number_records: bigint]" 269 | ] 270 | }, 271 | "execution_count": 23, 272 | "metadata": {}, 273 | "output_type": "execute_result" 274 | } 275 | ], 276 | "source": [ 277 | "df_join" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "id": "08900d84", 283 | "metadata": {}, 284 | "source": [ 285 | "Type 2 - Big table and a smaller table" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 48, 291 | "id": "0924acf1", 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stderr", 296 | "output_type": "stream", 297 | "text": [ 298 | "find: '/I': No such file or directory\n", 299 | "find: '/N': No such file or directory\n", 300 | "find: 'SoundMixer.exe': No such file or directory\n", 301 | " % Total % Received % Xferd Average Speed Time Time Time Current\n", 302 | " Dload Upload Total Spent Left Speed\n", 303 | "\n", 304 | " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\n", 305 | " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\n", 306 | "\n", 307 | " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\n", 308 | "100 12322 100 12322 0 0 16245 0 --:--:-- --:--:-- --:--:-- 56522\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "import os\n", 314 | "\n", 315 | "directory = \"C:/users/balaj/zones\"\n", 316 | "if not os.path.exists(directory):\n", 317 | " os.makedirs(directory)\n", 318 | " \n", 319 | "!curl -L -o \"C:/users/balaj/zones/taxi_zone_lookup.csv\" \"https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv\"" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 60, 325 | "id": "c9eb3b30", 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "df_temp = spark.read.csv(\"C:/users/balaj/zones/taxi_zone_lookup.csv\", header=True)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 62, 335 | "id": "af494ba3", 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "df_temp.write.parquet(\"C:/users/balaj/zones/taxi_zone_lookup\")" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 63, 345 | "id": "d76d0c0f", 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "df_zones = spark.read.parquet('C:/users/balaj/zones/taxi_zone_lookup/')" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 64, 355 | "id": "7715c3e0", 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "df_result = df_join.join(df_zones, df_join.zone == df_zones.LocationID)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 65, 365 | "id": "bc8a653f", 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "df_result.drop('LocationID', 'zone').write.parquet('tmp/revenue-zones')" 370 | ] 371 | } 372 | ], 373 | "metadata": { 374 | "kernelspec": { 375 | "display_name": "Python 3 (ipykernel)", 376 | "language": "python", 377 | "name": "python3" 378 | }, 379 | "language_info": { 380 | "codemirror_mode": { 381 | "name": "ipython", 382 | "version": 3 383 | }, 384 | "file_extension": ".py", 385 | "mimetype": "text/x-python", 386 | "name": "python", 387 | "nbconvert_exporter": "python", 388 | 
"pygments_lexer": "ipython3", 389 | "version": "3.9.15" 390 | } 391 | }, 392 | "nbformat": 4, 393 | "nbformat_minor": 5 394 | } 395 | -------------------------------------------------------------------------------- /Week 4/dbt_files/seeds/taxi_zone_lookup.csv: -------------------------------------------------------------------------------- 1 | "LocationID","Borough","Zone","service_zone" 2 | 1,"EWR","Newark Airport","EWR" 3 | 2,"Queens","Jamaica Bay","Boro Zone" 4 | 3,"Bronx","Allerton/Pelham Gardens","Boro Zone" 5 | 4,"Manhattan","Alphabet City","Yellow Zone" 6 | 5,"Staten Island","Arden Heights","Boro Zone" 7 | 6,"Staten Island","Arrochar/Fort Wadsworth","Boro Zone" 8 | 7,"Queens","Astoria","Boro Zone" 9 | 8,"Queens","Astoria Park","Boro Zone" 10 | 9,"Queens","Auburndale","Boro Zone" 11 | 10,"Queens","Baisley Park","Boro Zone" 12 | 11,"Brooklyn","Bath Beach","Boro Zone" 13 | 12,"Manhattan","Battery Park","Yellow Zone" 14 | 13,"Manhattan","Battery Park City","Yellow Zone" 15 | 14,"Brooklyn","Bay Ridge","Boro Zone" 16 | 15,"Queens","Bay Terrace/Fort Totten","Boro Zone" 17 | 16,"Queens","Bayside","Boro Zone" 18 | 17,"Brooklyn","Bedford","Boro Zone" 19 | 18,"Bronx","Bedford Park","Boro Zone" 20 | 19,"Queens","Bellerose","Boro Zone" 21 | 20,"Bronx","Belmont","Boro Zone" 22 | 21,"Brooklyn","Bensonhurst East","Boro Zone" 23 | 22,"Brooklyn","Bensonhurst West","Boro Zone" 24 | 23,"Staten Island","Bloomfield/Emerson Hill","Boro Zone" 25 | 24,"Manhattan","Bloomingdale","Yellow Zone" 26 | 25,"Brooklyn","Boerum Hill","Boro Zone" 27 | 26,"Brooklyn","Borough Park","Boro Zone" 28 | 27,"Queens","Breezy Point/Fort Tilden/Riis Beach","Boro Zone" 29 | 28,"Queens","Briarwood/Jamaica Hills","Boro Zone" 30 | 29,"Brooklyn","Brighton Beach","Boro Zone" 31 | 30,"Queens","Broad Channel","Boro Zone" 32 | 31,"Bronx","Bronx Park","Boro Zone" 33 | 32,"Bronx","Bronxdale","Boro Zone" 34 | 33,"Brooklyn","Brooklyn Heights","Boro Zone" 35 | 34,"Brooklyn","Brooklyn Navy Yard","Boro Zone" 36 | 35,"Brooklyn","Brownsville","Boro Zone" 37 | 36,"Brooklyn","Bushwick North","Boro Zone" 38 | 37,"Brooklyn","Bushwick South","Boro Zone" 39 | 38,"Queens","Cambria Heights","Boro Zone" 40 | 39,"Brooklyn","Canarsie","Boro Zone" 41 | 40,"Brooklyn","Carroll Gardens","Boro Zone" 42 | 41,"Manhattan","Central Harlem","Boro Zone" 43 | 42,"Manhattan","Central Harlem North","Boro Zone" 44 | 43,"Manhattan","Central Park","Yellow Zone" 45 | 44,"Staten Island","Charleston/Tottenville","Boro Zone" 46 | 45,"Manhattan","Chinatown","Yellow Zone" 47 | 46,"Bronx","City Island","Boro Zone" 48 | 47,"Bronx","Claremont/Bathgate","Boro Zone" 49 | 48,"Manhattan","Clinton East","Yellow Zone" 50 | 49,"Brooklyn","Clinton Hill","Boro Zone" 51 | 50,"Manhattan","Clinton West","Yellow Zone" 52 | 51,"Bronx","Co-Op City","Boro Zone" 53 | 52,"Brooklyn","Cobble Hill","Boro Zone" 54 | 53,"Queens","College Point","Boro Zone" 55 | 54,"Brooklyn","Columbia Street","Boro Zone" 56 | 55,"Brooklyn","Coney Island","Boro Zone" 57 | 56,"Queens","Corona","Boro Zone" 58 | 57,"Queens","Corona","Boro Zone" 59 | 58,"Bronx","Country Club","Boro Zone" 60 | 59,"Bronx","Crotona Park","Boro Zone" 61 | 60,"Bronx","Crotona Park East","Boro Zone" 62 | 61,"Brooklyn","Crown Heights North","Boro Zone" 63 | 62,"Brooklyn","Crown Heights South","Boro Zone" 64 | 63,"Brooklyn","Cypress Hills","Boro Zone" 65 | 64,"Queens","Douglaston","Boro Zone" 66 | 65,"Brooklyn","Downtown Brooklyn/MetroTech","Boro Zone" 67 | 66,"Brooklyn","DUMBO/Vinegar Hill","Boro Zone" 68 | 67,"Brooklyn","Dyker Heights","Boro Zone" 
69 | 68,"Manhattan","East Chelsea","Yellow Zone" 70 | 69,"Bronx","East Concourse/Concourse Village","Boro Zone" 71 | 70,"Queens","East Elmhurst","Boro Zone" 72 | 71,"Brooklyn","East Flatbush/Farragut","Boro Zone" 73 | 72,"Brooklyn","East Flatbush/Remsen Village","Boro Zone" 74 | 73,"Queens","East Flushing","Boro Zone" 75 | 74,"Manhattan","East Harlem North","Boro Zone" 76 | 75,"Manhattan","East Harlem South","Boro Zone" 77 | 76,"Brooklyn","East New York","Boro Zone" 78 | 77,"Brooklyn","East New York/Pennsylvania Avenue","Boro Zone" 79 | 78,"Bronx","East Tremont","Boro Zone" 80 | 79,"Manhattan","East Village","Yellow Zone" 81 | 80,"Brooklyn","East Williamsburg","Boro Zone" 82 | 81,"Bronx","Eastchester","Boro Zone" 83 | 82,"Queens","Elmhurst","Boro Zone" 84 | 83,"Queens","Elmhurst/Maspeth","Boro Zone" 85 | 84,"Staten Island","Eltingville/Annadale/Prince's Bay","Boro Zone" 86 | 85,"Brooklyn","Erasmus","Boro Zone" 87 | 86,"Queens","Far Rockaway","Boro Zone" 88 | 87,"Manhattan","Financial District North","Yellow Zone" 89 | 88,"Manhattan","Financial District South","Yellow Zone" 90 | 89,"Brooklyn","Flatbush/Ditmas Park","Boro Zone" 91 | 90,"Manhattan","Flatiron","Yellow Zone" 92 | 91,"Brooklyn","Flatlands","Boro Zone" 93 | 92,"Queens","Flushing","Boro Zone" 94 | 93,"Queens","Flushing Meadows-Corona Park","Boro Zone" 95 | 94,"Bronx","Fordham South","Boro Zone" 96 | 95,"Queens","Forest Hills","Boro Zone" 97 | 96,"Queens","Forest Park/Highland Park","Boro Zone" 98 | 97,"Brooklyn","Fort Greene","Boro Zone" 99 | 98,"Queens","Fresh Meadows","Boro Zone" 100 | 99,"Staten Island","Freshkills Park","Boro Zone" 101 | 100,"Manhattan","Garment District","Yellow Zone" 102 | 101,"Queens","Glen Oaks","Boro Zone" 103 | 102,"Queens","Glendale","Boro Zone" 104 | 103,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone" 105 | 104,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone" 106 | 105,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone" 107 | 106,"Brooklyn","Gowanus","Boro Zone" 108 | 107,"Manhattan","Gramercy","Yellow Zone" 109 | 108,"Brooklyn","Gravesend","Boro Zone" 110 | 109,"Staten Island","Great Kills","Boro Zone" 111 | 110,"Staten Island","Great Kills Park","Boro Zone" 112 | 111,"Brooklyn","Green-Wood Cemetery","Boro Zone" 113 | 112,"Brooklyn","Greenpoint","Boro Zone" 114 | 113,"Manhattan","Greenwich Village North","Yellow Zone" 115 | 114,"Manhattan","Greenwich Village South","Yellow Zone" 116 | 115,"Staten Island","Grymes Hill/Clifton","Boro Zone" 117 | 116,"Manhattan","Hamilton Heights","Boro Zone" 118 | 117,"Queens","Hammels/Arverne","Boro Zone" 119 | 118,"Staten Island","Heartland Village/Todt Hill","Boro Zone" 120 | 119,"Bronx","Highbridge","Boro Zone" 121 | 120,"Manhattan","Highbridge Park","Boro Zone" 122 | 121,"Queens","Hillcrest/Pomonok","Boro Zone" 123 | 122,"Queens","Hollis","Boro Zone" 124 | 123,"Brooklyn","Homecrest","Boro Zone" 125 | 124,"Queens","Howard Beach","Boro Zone" 126 | 125,"Manhattan","Hudson Sq","Yellow Zone" 127 | 126,"Bronx","Hunts Point","Boro Zone" 128 | 127,"Manhattan","Inwood","Boro Zone" 129 | 128,"Manhattan","Inwood Hill Park","Boro Zone" 130 | 129,"Queens","Jackson Heights","Boro Zone" 131 | 130,"Queens","Jamaica","Boro Zone" 132 | 131,"Queens","Jamaica Estates","Boro Zone" 133 | 132,"Queens","JFK Airport","Airports" 134 | 133,"Brooklyn","Kensington","Boro Zone" 135 | 134,"Queens","Kew Gardens","Boro Zone" 136 | 135,"Queens","Kew Gardens Hills","Boro Zone" 137 | 136,"Bronx","Kingsbridge Heights","Boro Zone" 
138 | 137,"Manhattan","Kips Bay","Yellow Zone" 139 | 138,"Queens","LaGuardia Airport","Airports" 140 | 139,"Queens","Laurelton","Boro Zone" 141 | 140,"Manhattan","Lenox Hill East","Yellow Zone" 142 | 141,"Manhattan","Lenox Hill West","Yellow Zone" 143 | 142,"Manhattan","Lincoln Square East","Yellow Zone" 144 | 143,"Manhattan","Lincoln Square West","Yellow Zone" 145 | 144,"Manhattan","Little Italy/NoLiTa","Yellow Zone" 146 | 145,"Queens","Long Island City/Hunters Point","Boro Zone" 147 | 146,"Queens","Long Island City/Queens Plaza","Boro Zone" 148 | 147,"Bronx","Longwood","Boro Zone" 149 | 148,"Manhattan","Lower East Side","Yellow Zone" 150 | 149,"Brooklyn","Madison","Boro Zone" 151 | 150,"Brooklyn","Manhattan Beach","Boro Zone" 152 | 151,"Manhattan","Manhattan Valley","Yellow Zone" 153 | 152,"Manhattan","Manhattanville","Boro Zone" 154 | 153,"Manhattan","Marble Hill","Boro Zone" 155 | 154,"Brooklyn","Marine Park/Floyd Bennett Field","Boro Zone" 156 | 155,"Brooklyn","Marine Park/Mill Basin","Boro Zone" 157 | 156,"Staten Island","Mariners Harbor","Boro Zone" 158 | 157,"Queens","Maspeth","Boro Zone" 159 | 158,"Manhattan","Meatpacking/West Village West","Yellow Zone" 160 | 159,"Bronx","Melrose South","Boro Zone" 161 | 160,"Queens","Middle Village","Boro Zone" 162 | 161,"Manhattan","Midtown Center","Yellow Zone" 163 | 162,"Manhattan","Midtown East","Yellow Zone" 164 | 163,"Manhattan","Midtown North","Yellow Zone" 165 | 164,"Manhattan","Midtown South","Yellow Zone" 166 | 165,"Brooklyn","Midwood","Boro Zone" 167 | 166,"Manhattan","Morningside Heights","Boro Zone" 168 | 167,"Bronx","Morrisania/Melrose","Boro Zone" 169 | 168,"Bronx","Mott Haven/Port Morris","Boro Zone" 170 | 169,"Bronx","Mount Hope","Boro Zone" 171 | 170,"Manhattan","Murray Hill","Yellow Zone" 172 | 171,"Queens","Murray Hill-Queens","Boro Zone" 173 | 172,"Staten Island","New Dorp/Midland Beach","Boro Zone" 174 | 173,"Queens","North Corona","Boro Zone" 175 | 174,"Bronx","Norwood","Boro Zone" 176 | 175,"Queens","Oakland Gardens","Boro Zone" 177 | 176,"Staten Island","Oakwood","Boro Zone" 178 | 177,"Brooklyn","Ocean Hill","Boro Zone" 179 | 178,"Brooklyn","Ocean Parkway South","Boro Zone" 180 | 179,"Queens","Old Astoria","Boro Zone" 181 | 180,"Queens","Ozone Park","Boro Zone" 182 | 181,"Brooklyn","Park Slope","Boro Zone" 183 | 182,"Bronx","Parkchester","Boro Zone" 184 | 183,"Bronx","Pelham Bay","Boro Zone" 185 | 184,"Bronx","Pelham Bay Park","Boro Zone" 186 | 185,"Bronx","Pelham Parkway","Boro Zone" 187 | 186,"Manhattan","Penn Station/Madison Sq West","Yellow Zone" 188 | 187,"Staten Island","Port Richmond","Boro Zone" 189 | 188,"Brooklyn","Prospect-Lefferts Gardens","Boro Zone" 190 | 189,"Brooklyn","Prospect Heights","Boro Zone" 191 | 190,"Brooklyn","Prospect Park","Boro Zone" 192 | 191,"Queens","Queens Village","Boro Zone" 193 | 192,"Queens","Queensboro Hill","Boro Zone" 194 | 193,"Queens","Queensbridge/Ravenswood","Boro Zone" 195 | 194,"Manhattan","Randalls Island","Yellow Zone" 196 | 195,"Brooklyn","Red Hook","Boro Zone" 197 | 196,"Queens","Rego Park","Boro Zone" 198 | 197,"Queens","Richmond Hill","Boro Zone" 199 | 198,"Queens","Ridgewood","Boro Zone" 200 | 199,"Bronx","Rikers Island","Boro Zone" 201 | 200,"Bronx","Riverdale/North Riverdale/Fieldston","Boro Zone" 202 | 201,"Queens","Rockaway Park","Boro Zone" 203 | 202,"Manhattan","Roosevelt Island","Boro Zone" 204 | 203,"Queens","Rosedale","Boro Zone" 205 | 204,"Staten Island","Rossville/Woodrow","Boro Zone" 206 | 205,"Queens","Saint Albans","Boro Zone" 207 | 206,"Staten 
Island","Saint George/New Brighton","Boro Zone" 208 | 207,"Queens","Saint Michaels Cemetery/Woodside","Boro Zone" 209 | 208,"Bronx","Schuylerville/Edgewater Park","Boro Zone" 210 | 209,"Manhattan","Seaport","Yellow Zone" 211 | 210,"Brooklyn","Sheepshead Bay","Boro Zone" 212 | 211,"Manhattan","SoHo","Yellow Zone" 213 | 212,"Bronx","Soundview/Bruckner","Boro Zone" 214 | 213,"Bronx","Soundview/Castle Hill","Boro Zone" 215 | 214,"Staten Island","South Beach/Dongan Hills","Boro Zone" 216 | 215,"Queens","South Jamaica","Boro Zone" 217 | 216,"Queens","South Ozone Park","Boro Zone" 218 | 217,"Brooklyn","South Williamsburg","Boro Zone" 219 | 218,"Queens","Springfield Gardens North","Boro Zone" 220 | 219,"Queens","Springfield Gardens South","Boro Zone" 221 | 220,"Bronx","Spuyten Duyvil/Kingsbridge","Boro Zone" 222 | 221,"Staten Island","Stapleton","Boro Zone" 223 | 222,"Brooklyn","Starrett City","Boro Zone" 224 | 223,"Queens","Steinway","Boro Zone" 225 | 224,"Manhattan","Stuy Town/Peter Cooper Village","Yellow Zone" 226 | 225,"Brooklyn","Stuyvesant Heights","Boro Zone" 227 | 226,"Queens","Sunnyside","Boro Zone" 228 | 227,"Brooklyn","Sunset Park East","Boro Zone" 229 | 228,"Brooklyn","Sunset Park West","Boro Zone" 230 | 229,"Manhattan","Sutton Place/Turtle Bay North","Yellow Zone" 231 | 230,"Manhattan","Times Sq/Theatre District","Yellow Zone" 232 | 231,"Manhattan","TriBeCa/Civic Center","Yellow Zone" 233 | 232,"Manhattan","Two Bridges/Seward Park","Yellow Zone" 234 | 233,"Manhattan","UN/Turtle Bay South","Yellow Zone" 235 | 234,"Manhattan","Union Sq","Yellow Zone" 236 | 235,"Bronx","University Heights/Morris Heights","Boro Zone" 237 | 236,"Manhattan","Upper East Side North","Yellow Zone" 238 | 237,"Manhattan","Upper East Side South","Yellow Zone" 239 | 238,"Manhattan","Upper West Side North","Yellow Zone" 240 | 239,"Manhattan","Upper West Side South","Yellow Zone" 241 | 240,"Bronx","Van Cortlandt Park","Boro Zone" 242 | 241,"Bronx","Van Cortlandt Village","Boro Zone" 243 | 242,"Bronx","Van Nest/Morris Park","Boro Zone" 244 | 243,"Manhattan","Washington Heights North","Boro Zone" 245 | 244,"Manhattan","Washington Heights South","Boro Zone" 246 | 245,"Staten Island","West Brighton","Boro Zone" 247 | 246,"Manhattan","West Chelsea/Hudson Yards","Yellow Zone" 248 | 247,"Bronx","West Concourse","Boro Zone" 249 | 248,"Bronx","West Farms/Bronx River","Boro Zone" 250 | 249,"Manhattan","West Village","Yellow Zone" 251 | 250,"Bronx","Westchester Village/Unionport","Boro Zone" 252 | 251,"Staten Island","Westerleigh","Boro Zone" 253 | 252,"Queens","Whitestone","Boro Zone" 254 | 253,"Queens","Willets Point","Boro Zone" 255 | 254,"Bronx","Williamsbridge/Olinville","Boro Zone" 256 | 255,"Brooklyn","Williamsburg (North Side)","Boro Zone" 257 | 256,"Brooklyn","Williamsburg (South Side)","Boro Zone" 258 | 257,"Brooklyn","Windsor Terrace","Boro Zone" 259 | 258,"Queens","Woodhaven","Boro Zone" 260 | 259,"Bronx","Woodlawn/Wakefield","Boro Zone" 261 | 260,"Queens","Woodside","Boro Zone" 262 | 261,"Manhattan","World Trade Center","Yellow Zone" 263 | 262,"Manhattan","Yorkville East","Yellow Zone" 264 | 263,"Manhattan","Yorkville West","Yellow Zone" 265 | 264,"Unknown","NV","N/A" 266 | 265,"Unknown","NA","N/A" 267 | -------------------------------------------------------------------------------- /Week 5/Code/04 Spark SQL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "1a8cf1a8", 7 | "metadata": {}, 
8 | "outputs": [], 9 | "source": [ 10 | "import pyspark\n", 11 | "from pyspark.sql import SparkSession\n", 12 | "\n", 13 | "spark = SparkSession.builder \\\n", 14 | " .master(\"local[*]\") \\\n", 15 | " .appName('test') \\\n", 16 | " .getOrCreate()" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 9, 22 | "id": "17ff4c2a", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "df_green = spark.read.parquet('D:/data/pq/green/*/*')" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 10, 32 | "id": "7568f742", 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "root\n", 40 | " |-- VendorID: integer (nullable = true)\n", 41 | " |-- lpep_pickup_datetime: timestamp (nullable = true)\n", 42 | " |-- lpep_dropoff_datetime: timestamp (nullable = true)\n", 43 | " |-- store_and_fwd_flag: string (nullable = true)\n", 44 | " |-- RatecodeID: integer (nullable = true)\n", 45 | " |-- PULocationID: integer (nullable = true)\n", 46 | " |-- DOLocationID: integer (nullable = true)\n", 47 | " |-- passenger_count: integer (nullable = true)\n", 48 | " |-- trip_distance: double (nullable = true)\n", 49 | " |-- fare_amount: double (nullable = true)\n", 50 | " |-- extra: double (nullable = true)\n", 51 | " |-- mta_tax: double (nullable = true)\n", 52 | " |-- tip_amount: double (nullable = true)\n", 53 | " |-- tolls_amount: double (nullable = true)\n", 54 | " |-- ehail_fee: string (nullable = true)\n", 55 | " |-- improvement_surcharge: double (nullable = true)\n", 56 | " |-- total_amount: double (nullable = true)\n", 57 | " |-- payment_type: integer (nullable = true)\n", 58 | " |-- trip_type: integer (nullable = true)\n", 59 | " |-- congestion_surcharge: double (nullable = true)\n", 60 | "\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "df_green.printSchema()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 11, 71 | "id": "ca99c5db", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# Changing the column names for the date columns\n", 76 | "\n", 77 | "df_green = df_green \\\n", 78 | " .withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime') \\\n", 79 | " .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime')" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 12, 85 | "id": "15991733", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "root\n", 93 | " |-- VendorID: integer (nullable = true)\n", 94 | " |-- pickup_datetime: timestamp (nullable = true)\n", 95 | " |-- dropoff_datetime: timestamp (nullable = true)\n", 96 | " |-- store_and_fwd_flag: string (nullable = true)\n", 97 | " |-- RatecodeID: integer (nullable = true)\n", 98 | " |-- PULocationID: integer (nullable = true)\n", 99 | " |-- DOLocationID: integer (nullable = true)\n", 100 | " |-- passenger_count: integer (nullable = true)\n", 101 | " |-- trip_distance: double (nullable = true)\n", 102 | " |-- fare_amount: double (nullable = true)\n", 103 | " |-- extra: double (nullable = true)\n", 104 | " |-- mta_tax: double (nullable = true)\n", 105 | " |-- tip_amount: double (nullable = true)\n", 106 | " |-- tolls_amount: double (nullable = true)\n", 107 | " |-- ehail_fee: string (nullable = true)\n", 108 | " |-- improvement_surcharge: double (nullable = true)\n", 109 | " |-- total_amount: double (nullable = true)\n", 110 | " |-- payment_type: integer (nullable = true)\n", 111 | " |-- trip_type: integer 
(nullable = true)\n", 112 | " |-- congestion_surcharge: double (nullable = true)\n", 113 | "\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "# checking the schema to see if the column names are changed\n", 119 | "\n", 120 | "df_green.printSchema()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 13, 126 | "id": "2a5dba50", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "df_yellow = spark.read.parquet('D:/data/pq/yellow/*/*')" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 14, 136 | "id": "faf1413f", 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "root\n", 144 | " |-- VendorID: integer (nullable = true)\n", 145 | " |-- tpep_pickup_datetime: timestamp (nullable = true)\n", 146 | " |-- tpep_dropoff_datetime: timestamp (nullable = true)\n", 147 | " |-- passenger_count: integer (nullable = true)\n", 148 | " |-- trip_distance: double (nullable = true)\n", 149 | " |-- RatecodeID: integer (nullable = true)\n", 150 | " |-- store_and_fwd_flag: string (nullable = true)\n", 151 | " |-- PULocationID: integer (nullable = true)\n", 152 | " |-- DOLocationID: integer (nullable = true)\n", 153 | " |-- payment_type: integer (nullable = true)\n", 154 | " |-- fare_amount: double (nullable = true)\n", 155 | " |-- extra: double (nullable = true)\n", 156 | " |-- mta_tax: double (nullable = true)\n", 157 | " |-- tip_amount: double (nullable = true)\n", 158 | " |-- tolls_amount: double (nullable = true)\n", 159 | " |-- improvement_surcharge: double (nullable = true)\n", 160 | " |-- total_amount: double (nullable = true)\n", 161 | " |-- congestion_surcharge: double (nullable = true)\n", 162 | "\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "df_yellow.printSchema()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 15, 173 | "id": "a78482c4", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "# Changing the column names for the date columns\n", 178 | "\n", 179 | "df_yellow = df_yellow \\\n", 180 | " .withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime') \\\n", 181 | " .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime')" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 16, 187 | "id": "52a5bf4f", 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "root\n", 195 | " |-- VendorID: integer (nullable = true)\n", 196 | " |-- pickup_datetime: timestamp (nullable = true)\n", 197 | " |-- dropoff_datetime: timestamp (nullable = true)\n", 198 | " |-- passenger_count: integer (nullable = true)\n", 199 | " |-- trip_distance: double (nullable = true)\n", 200 | " |-- RatecodeID: integer (nullable = true)\n", 201 | " |-- store_and_fwd_flag: string (nullable = true)\n", 202 | " |-- PULocationID: integer (nullable = true)\n", 203 | " |-- DOLocationID: integer (nullable = true)\n", 204 | " |-- payment_type: integer (nullable = true)\n", 205 | " |-- fare_amount: double (nullable = true)\n", 206 | " |-- extra: double (nullable = true)\n", 207 | " |-- mta_tax: double (nullable = true)\n", 208 | " |-- tip_amount: double (nullable = true)\n", 209 | " |-- tolls_amount: double (nullable = true)\n", 210 | " |-- improvement_surcharge: double (nullable = true)\n", 211 | " |-- total_amount: double (nullable = true)\n", 212 | " |-- congestion_surcharge: double (nullable = true)\n", 213 | "\n" 214 | ] 215 | } 216 | ], 
217 | "source": [ 218 | "# checking the schema to see if the column names are changed\n", 219 | "\n", 220 | "df_yellow.printSchema()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 18, 226 | "id": "aeb8195e", 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/plain": [ 232 | "['VendorID',\n", 233 | " 'pickup_datetime',\n", 234 | " 'dropoff_datetime',\n", 235 | " 'store_and_fwd_flag',\n", 236 | " 'RatecodeID',\n", 237 | " 'PULocationID',\n", 238 | " 'DOLocationID',\n", 239 | " 'passenger_count',\n", 240 | " 'trip_distance',\n", 241 | " 'fare_amount',\n", 242 | " 'extra',\n", 243 | " 'mta_tax',\n", 244 | " 'tip_amount',\n", 245 | " 'tolls_amount',\n", 246 | " 'improvement_surcharge',\n", 247 | " 'total_amount',\n", 248 | " 'payment_type',\n", 249 | " 'congestion_surcharge']" 250 | ] 251 | }, 252 | "execution_count": 18, 253 | "metadata": {}, 254 | "output_type": "execute_result" 255 | } 256 | ], 257 | "source": [ 258 | "# Preserving the row order and choosing only the columns common in both the taxi datasets\n", 259 | "\n", 260 | "common_columns = []\n", 261 | "\n", 262 | "cols = set(df_yellow.columns)\n", 263 | "\n", 264 | "for col in df_green.columns:\n", 265 | " if col in cols:\n", 266 | " common_columns.append(col)\n", 267 | "\n", 268 | "common_columns" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 22, 274 | "id": "933bee9c", 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "from pyspark.sql import functions as F" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 25, 284 | "id": "c6eed3e5", 285 | "metadata": { 286 | "scrolled": true 287 | }, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "['VendorID',\n", 293 | " 'pickup_datetime',\n", 294 | " 'dropoff_datetime',\n", 295 | " 'store_and_fwd_flag',\n", 296 | " 'RatecodeID',\n", 297 | " 'PULocationID',\n", 298 | " 'DOLocationID',\n", 299 | " 'passenger_count',\n", 300 | " 'trip_distance',\n", 301 | " 'fare_amount',\n", 302 | " 'extra',\n", 303 | " 'mta_tax',\n", 304 | " 'tip_amount',\n", 305 | " 'tolls_amount',\n", 306 | " 'improvement_surcharge',\n", 307 | " 'total_amount',\n", 308 | " 'payment_type',\n", 309 | " 'congestion_surcharge',\n", 310 | " 'service_type']" 311 | ] 312 | }, 313 | "execution_count": 25, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "# Adding an additional column to differentiate the taxi color\n", 320 | "\n", 321 | "df_green_sel = df_green.select(common_columns).withColumn(\"service_type\", F.lit('green'))\n", 322 | "\n", 323 | "df_green_sel.columns" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 26, 329 | "id": "0cc38d27", 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "data": { 334 | "text/plain": [ 335 | "['VendorID',\n", 336 | " 'pickup_datetime',\n", 337 | " 'dropoff_datetime',\n", 338 | " 'store_and_fwd_flag',\n", 339 | " 'RatecodeID',\n", 340 | " 'PULocationID',\n", 341 | " 'DOLocationID',\n", 342 | " 'passenger_count',\n", 343 | " 'trip_distance',\n", 344 | " 'fare_amount',\n", 345 | " 'extra',\n", 346 | " 'mta_tax',\n", 347 | " 'tip_amount',\n", 348 | " 'tolls_amount',\n", 349 | " 'improvement_surcharge',\n", 350 | " 'total_amount',\n", 351 | " 'payment_type',\n", 352 | " 'congestion_surcharge',\n", 353 | " 'service_type']" 354 | ] 355 | }, 356 | "execution_count": 26, 357 | "metadata": {}, 358 | "output_type": "execute_result" 359 | } 360 | ], 361 | "source": [ 
362 | "# Adding an additional column to differentiate the taxi color\n", 363 | "\n", 364 | "df_yellow_sel = df_yellow.select(common_columns).withColumn(\"service_type\", F.lit('yellow'))\n", 365 | "\n", 366 | "df_yellow_sel.columns" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 27, 372 | "id": "f0c117d3", 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "# Combining the two datasets\n", 377 | "\n", 378 | "df_trips_data = df_green_sel.unionAll(df_yellow_sel)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 45, 384 | "id": "e415c4c0", 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "+------------+--------+\n", 392 | "|service_type| count|\n", 393 | "+------------+--------+\n", 394 | "| yellow|39649199|\n", 395 | "| green| 2304517|\n", 396 | "+------------+--------+\n", 397 | "\n" 398 | ] 399 | } 400 | ], 401 | "source": [ 402 | "df_trips_data.groupBy('service_type').count().orderBy('count', ascending = False).show()" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 48, 408 | "id": "e242ddd9", 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "name": "stderr", 413 | "output_type": "stream", 414 | "text": [ 415 | "C:\\Users\\balaj\\anaconda3\\lib\\site-packages\\pyspark\\sql\\dataframe.py:138: FutureWarning: Deprecated in 2.0, use createOrReplaceTempView instead.\n", 416 | " warnings.warn(\n" 417 | ] 418 | } 419 | ], 420 | "source": [ 421 | "# Registering the spark dataframe as a temporary table to be used for Spark SQL\n", 422 | "\n", 423 | "df_trips_data.registerTempTable('trips_data')" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 57, 429 | "id": "7bbd40ae", 430 | "metadata": {}, 431 | "outputs": [ 432 | { 433 | "name": "stdout", 434 | "output_type": "stream", 435 | "text": [ 436 | "+------------+--------+\n", 437 | "|service_type| count|\n", 438 | "+------------+--------+\n", 439 | "| yellow|39649199|\n", 440 | "| green| 2304517|\n", 441 | "+------------+--------+\n", 442 | "\n" 443 | ] 444 | } 445 | ], 446 | "source": [ 447 | "# Sample SQL query\n", 448 | "\n", 449 | "spark.sql(\"\"\"\n", 450 | "SELECT \n", 451 | " service_type, count(1) as count\n", 452 | "FROM\n", 453 | " trips_data\n", 454 | "GROUP BY 1\n", 455 | "ORDER BY 1 desc\n", 456 | "\"\"\"\n", 457 | ").show()" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 58, 463 | "id": "f8350e5e", 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "# Saving a result from a SQL query\n", 468 | "\n", 469 | "df_result = spark.sql(\"\"\"\n", 470 | "SELECT \n", 471 | " -- Reveneue grouping \n", 472 | " PULocationID AS revenue_zone,\n", 473 | " date_trunc('month', pickup_datetime) AS revenue_month, \n", 474 | " service_type, \n", 475 | "\n", 476 | " -- Revenue calculation \n", 477 | " SUM(fare_amount) AS revenue_monthly_fare,\n", 478 | " SUM(extra) AS revenue_monthly_extra,\n", 479 | " SUM(mta_tax) AS revenue_monthly_mta_tax,\n", 480 | " SUM(tip_amount) AS revenue_monthly_tip_amount,\n", 481 | " SUM(tolls_amount) AS revenue_monthly_tolls_amount,\n", 482 | " SUM(improvement_surcharge) AS revenue_monthly_improvement_surcharge,\n", 483 | " SUM(total_amount) AS revenue_monthly_total_amount,\n", 484 | " SUM(congestion_surcharge) AS revenue_monthly_congestion_surcharge,\n", 485 | "\n", 486 | " -- Additional calculations\n", 487 | " AVG(passenger_count) AS avg_montly_passenger_count,\n", 488 | 
" AVG(trip_distance) AS avg_montly_trip_distance\n", 489 | "FROM\n", 490 | " trips_data\n", 491 | "GROUP BY\n", 492 | " 1, 2, 3\n", 493 | "\"\"\")\n", 494 | "\n", 495 | "df_result.coalesce(1).write.parquet('D:/data/report/revenue/', mode='overwrite')" 496 | ] 497 | } 498 | ], 499 | "metadata": { 500 | "kernelspec": { 501 | "display_name": "Python 3 (ipykernel)", 502 | "language": "python", 503 | "name": "python3" 504 | }, 505 | "language_info": { 506 | "codemirror_mode": { 507 | "name": "ipython", 508 | "version": 3 509 | }, 510 | "file_extension": ".py", 511 | "mimetype": "text/x-python", 512 | "name": "python", 513 | "nbconvert_exporter": "python", 514 | "pygments_lexer": "ipython3", 515 | "version": "3.9.15" 516 | } 517 | }, 518 | "nbformat": 4, 519 | "nbformat_minor": 5 520 | } 521 | -------------------------------------------------------------------------------- /Week 5/Code/06 RDDs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b40d2766", 6 | "metadata": {}, 7 | "source": [ 8 | "## Section I\n", 9 | "\n", 10 | "Operations on Spark RDDs" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "722b5468", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pyspark\n", 21 | "from pyspark.sql import SparkSession\n", 22 | "\n", 23 | "spark = SparkSession.builder \\\n", 24 | " .master(\"local[*]\") \\\n", 25 | " .appName('test') \\\n", 26 | " .getOrCreate()" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "id": "7f88bc7b", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "df_green = spark.read.parquet('D:/data/pq/green/*/*')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "9e4913cf", 42 | "metadata": {}, 43 | "source": [ 44 | "#### We will be Implementing the below SQL code using RDDs\n", 45 | "\n", 46 | "SELECT \n", 47 | " date_trunc('hour', lpep_pickup_datetime) AS hour, \n", 48 | " PULocationID AS zone, \n", 49 | " \n", 50 | " SUM(total_amount) AS amount,\n", 51 | " COUNT(1) AS number_records \n", 52 | "FROM\n", 53 | " green \\\n", 54 | "WHERE \n", 55 | " lpep_pickup_datetime >= '2020-01-01 00:00:00' \\\n", 56 | "GROUP BY\n", 57 | " 1, 2;" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "id": "34a91beb", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# STEP 1\n", 68 | "\n", 69 | "rdd = df_green \\\n", 70 | " .select('lpep_pickup_datetime', 'PULocationID', 'total_amount') \\\n", 71 | " .rdd" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "id": "1f594261", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "[Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 23, 13, 10, 15), PULocationID=74, total_amount=44.97),\n", 84 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 20, 15, 9), PULocationID=67, total_amount=33.45),\n", 85 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 15, 20, 23, 41), PULocationID=260, total_amount=8.3),\n", 86 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 5, 16, 32, 26), PULocationID=82, total_amount=8.3),\n", 87 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 29, 19, 22, 42), PULocationID=166, total_amount=12.74)]" 88 | ] 89 | }, 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "rdd.take(5)" 97 | ] 98 | }, 99 | { 100 | "cell_type": 
"code", 101 | "execution_count": 6, 102 | "id": "46b797f9", 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "[Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 23, 13, 10, 15), PULocationID=74, total_amount=44.97),\n", 109 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 20, 15, 9), PULocationID=67, total_amount=33.45),\n", 110 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 15, 20, 23, 41), PULocationID=260, total_amount=8.3),\n", 111 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 5, 16, 32, 26), PULocationID=82, total_amount=8.3),\n", 112 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 29, 19, 22, 42), PULocationID=166, total_amount=12.74)]" 113 | ] 114 | }, 115 | "execution_count": 6, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "# STEP 2\n", 122 | "\n", 123 | "from datetime import datetime\n", 124 | "\n", 125 | "start = datetime(year=2020, month=1, day=1)\n", 126 | "\n", 127 | "def filter_outliers(row):\n", 128 | " return row.lpep_pickup_datetime >= start\n", 129 | "\n", 130 | "rdd.filter(filter_outliers).take(5)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 7, 136 | "id": "c2a85921", 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 23, 13, 10, 15), PULocationID=74, total_amount=44.97)" 143 | ] 144 | }, 145 | "execution_count": 7, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "# Used this row for testing purpose\n", 152 | "\n", 153 | "rows = rdd.take(10)\n", 154 | "row = rows[0]\n", 155 | "\n", 156 | "row" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 8, 162 | "id": "2cb29aae", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "# STEP 3\n", 167 | "\n", 168 | "def prepare_for_grouping(row): \n", 169 | " hour = row.lpep_pickup_datetime.replace(minute=0, second=0, microsecond=0)\n", 170 | " zone = row.PULocationID\n", 171 | " key = (hour, zone)\n", 172 | " \n", 173 | " amount = row.total_amount\n", 174 | " count = 1\n", 175 | " value = (amount, count)\n", 176 | "\n", 177 | " return (key, value)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 9, 183 | "id": "91dcfda7", 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "[((datetime.datetime(2020, 1, 23, 13, 0), 74), (44.97, 1)),\n", 190 | " ((datetime.datetime(2020, 1, 20, 15, 0), 67), (33.45, 1)),\n", 191 | " ((datetime.datetime(2020, 1, 15, 20, 0), 260), (8.3, 1)),\n", 192 | " ((datetime.datetime(2020, 1, 5, 16, 0), 82), (8.3, 1)),\n", 193 | " ((datetime.datetime(2020, 1, 29, 19, 0), 166), (12.74, 1))]" 194 | ] 195 | }, 196 | "execution_count": 9, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "rdd.filter(filter_outliers).map(prepare_for_grouping).take(5)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 10, 208 | "id": "308bf061", 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "[((datetime.datetime(2020, 1, 20, 15, 0), 67), (79.5, 3)),\n", 215 | " ((datetime.datetime(2020, 1, 16, 8, 0), 41), (736.1399999999994, 54)),\n", 216 | " ((datetime.datetime(2020, 1, 20, 15, 0), 75), (609.0, 47)),\n", 217 | " ((datetime.datetime(2020, 1, 17, 21, 0), 74), (594.87, 39)),\n", 218 | " 
205 | {
206 | "cell_type": "code",
207 | "execution_count": 10,
208 | "id": "308bf061",
209 | "metadata": {},
210 | "outputs": [
211 | {
212 | "data": {
213 | "text/plain": [
214 | "[((datetime.datetime(2020, 1, 20, 15, 0), 67), (79.5, 3)),\n",
215 | " ((datetime.datetime(2020, 1, 16, 8, 0), 41), (736.1399999999994, 54)),\n",
216 | " ((datetime.datetime(2020, 1, 20, 15, 0), 75), (609.0, 47)),\n",
217 | " ((datetime.datetime(2020, 1, 17, 21, 0), 74), (594.87, 39)),\n",
218 | " ((datetime.datetime(2020, 1, 3, 9, 0), 61), (142.21, 9))]"
219 | ]
220 | },
221 | "execution_count": 10,
222 | "metadata": {},
223 | "output_type": "execute_result"
224 | }
225 | ],
226 | "source": [
227 | "# STEP 4\n",
228 | "\n",
229 | "def calculate_revenue(left_value, right_value):\n",
230 | " left_amount, left_count = left_value\n",
231 | " right_amount, right_count = right_value\n",
232 | " \n",
233 | " output_amount = left_amount + right_amount\n",
234 | " output_count = left_count + right_count\n",
235 | " \n",
236 | " return (output_amount, output_count)\n",
237 | "\n",
238 | "rdd.filter(filter_outliers).map(prepare_for_grouping).reduceByKey(calculate_revenue).take(5)"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 11,
244 | "id": "6d2fa321",
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "data": {
249 | "text/plain": [
250 | "[RevenueRow(hour=datetime.datetime(2020, 1, 20, 15, 0), zone=67, revenue=79.5, count=3),\n",
251 | " RevenueRow(hour=datetime.datetime(2020, 1, 16, 8, 0), zone=41, revenue=736.1399999999994, count=54),\n",
252 | " RevenueRow(hour=datetime.datetime(2020, 1, 20, 15, 0), zone=75, revenue=609.0, count=47),\n",
253 | " RevenueRow(hour=datetime.datetime(2020, 1, 17, 21, 0), zone=74, revenue=594.87, count=39),\n",
254 | " RevenueRow(hour=datetime.datetime(2020, 1, 3, 9, 0), zone=61, revenue=142.21, count=9)]"
255 | ]
256 | },
257 | "execution_count": 11,
258 | "metadata": {},
259 | "output_type": "execute_result"
260 | }
261 | ],
262 | "source": [
263 | "# STEP 5\n",
264 | "\n",
265 | "from collections import namedtuple\n",
266 | "\n",
267 | "RevenueRow = namedtuple('RevenueRow', ['hour', 'zone', 'revenue', 'count'])\n",
268 | "\n",
269 | "def unwrap(row):\n",
270 | " return RevenueRow(\n",
271 | " hour=row[0][0], \n",
272 | " zone=row[0][1],\n",
273 | " revenue=row[1][0],\n",
274 | " count=row[1][1]\n",
275 | " )\n",
276 | "\n",
277 | "rdd.filter(filter_outliers).map(prepare_for_grouping).reduceByKey(calculate_revenue).map(unwrap).take(5)"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": 18,
283 | "id": "c7623de8",
284 | "metadata": {},
285 | "outputs": [
286 | {
287 | "name": "stdout",
288 | "output_type": "stream",
289 | "text": [
290 | "+-------------------+----+-----------------+-----+\n",
291 | "| hour|zone| revenue|count|\n",
292 | "+-------------------+----+-----------------+-----+\n",
293 | "|2020-01-20 15:00:00| 67| 79.5| 3|\n",
294 | "|2020-01-16 08:00:00| 41|736.1399999999994| 54|\n",
295 | "|2020-01-20 15:00:00| 75| 609.0| 47|\n",
296 | "|2020-01-17 21:00:00| 74| 594.87| 39|\n",
297 | "|2020-01-03 09:00:00| 61| 142.21| 9|\n",
298 | "+-------------------+----+-----------------+-----+\n",
299 | "only showing top 5 rows\n",
300 | "\n"
301 | ]
302 | }
303 | ],
304 | "source": [
305 | "# If you don't specify the schema in toDF(), it takes a bit longer to run because Spark has to sample the RDD to infer the schema\n",
306 | "\n",
307 | "rdd.filter(filter_outliers).map(prepare_for_grouping).reduceByKey(calculate_revenue).map(unwrap).toDF().show(5)"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 19,
313 | "id": "a800b5cb",
314 | "metadata": {},
315 | "outputs": [
316 | {
317 | "name": "stdout",
318 | "output_type": "stream",
319 | "text": [
320 | "+-------------------+----+-----------------+-----+\n",
321 | "| hour|zone| revenue|count|\n",
322 | "+-------------------+----+-----------------+-----+\n",
323 | "|2020-01-20 15:00:00| 67| 79.5| 3|\n",
324 | "|2020-01-16 
08:00:00| 41|736.1399999999994| 54|\n", 325 | "|2020-01-20 15:00:00| 75| 609.0| 47|\n", 326 | "|2020-01-17 21:00:00| 74| 594.87| 39|\n", 327 | "|2020-01-03 09:00:00| 61| 142.21| 9|\n", 328 | "+-------------------+----+-----------------+-----+\n", 329 | "only showing top 5 rows\n", 330 | "\n" 331 | ] 332 | } 333 | ], 334 | "source": [ 335 | "# STEP 6\n", 336 | "\n", 337 | "from pyspark.sql import types\n", 338 | "\n", 339 | "result_schema = types.StructType([\n", 340 | " types.StructField('hour', types.TimestampType(), True),\n", 341 | " types.StructField('zone', types.IntegerType(), True),\n", 342 | " types.StructField('revenue', types.DoubleType(), True),\n", 343 | " types.StructField('count', types.IntegerType(), True)\n", 344 | "])\n", 345 | "\n", 346 | "rdd.filter(filter_outliers).map(prepare_for_grouping).reduceByKey(calculate_revenue).map(unwrap).toDF(result_schema).show(5)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 20, 352 | "id": "d3d64007", 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "df_result = rdd \\\n", 357 | " .filter(filter_outliers) \\\n", 358 | " .map(prepare_for_grouping) \\\n", 359 | " .reduceByKey(calculate_revenue) \\\n", 360 | " .map(unwrap) \\\n", 361 | " .toDF(result_schema) " 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 15, 367 | "id": "72399664", 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "df_result.write.parquet('D:/tmp/green-revenue')" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "id": "ffa1c598", 377 | "metadata": {}, 378 | "source": [ 379 | "## Section II\n", 380 | "\n", 381 | "Spark RDD mapPartitions" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 21, 387 | "id": "c3f3c1fe", 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "columns = ['VendorID', 'lpep_pickup_datetime', 'PULocationID', 'DOLocationID', 'trip_distance']\n", 392 | "\n", 393 | "duration_rdd = df_green \\\n", 394 | " .select(columns) \\\n", 395 | " .rdd" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 23, 401 | "id": "436aca6e", 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/plain": [ 407 | "[Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2020, 1, 23, 13, 10, 15), PULocationID=74, DOLocationID=130, trip_distance=12.77),\n", 408 | " Row(VendorID=None, lpep_pickup_datetime=datetime.datetime(2020, 1, 20, 15, 9), PULocationID=67, DOLocationID=39, trip_distance=8.0),\n", 409 | " Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2020, 1, 15, 20, 23, 41), PULocationID=260, DOLocationID=157, trip_distance=1.27),\n", 410 | " Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2020, 1, 5, 16, 32, 26), PULocationID=82, DOLocationID=83, trip_distance=1.25),\n", 411 | " Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2020, 1, 29, 19, 22, 42), PULocationID=166, DOLocationID=42, trip_distance=1.84)]" 412 | ] 413 | }, 414 | "execution_count": 23, 415 | "metadata": {}, 416 | "output_type": "execute_result" 417 | } 418 | ], 419 | "source": [ 420 | "duration_rdd.take(5)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 29, 426 | "id": "cbdfa98f", 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "import pandas as pd\n", 431 | "\n", 432 | "#model = ...\n", 433 | "\n", 434 | "def model_predict(df):\n", 435 | "# y_pred = model.predict(df)\n", 436 | " y_pred = df.trip_distance * 5\n", 437 | " return y_pred\n", 438 | 
"\n", 439 | "\n", 440 | "def apply_model_in_batch(rows):\n", 441 | " df = pd.DataFrame(rows, columns=columns)\n", 442 | " predictions = model_predict(df)\n", 443 | " df['predicted_duration'] = predictions\n", 444 | "\n", 445 | " for row in df.itertuples():\n", 446 | " yield row" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 31, 452 | "id": "b00263cf", 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "data": { 457 | "text/plain": [ 458 | "[Pandas(Index=0, VendorID=2.0, lpep_pickup_datetime=Timestamp('2020-01-23 13:10:15'), PULocationID=74, DOLocationID=130, trip_distance=12.77, predicted_duration=63.849999999999994),\n", 459 | " Pandas(Index=1, VendorID=nan, lpep_pickup_datetime=Timestamp('2020-01-20 15:09:00'), PULocationID=67, DOLocationID=39, trip_distance=8.0, predicted_duration=40.0),\n", 460 | " Pandas(Index=2, VendorID=2.0, lpep_pickup_datetime=Timestamp('2020-01-15 20:23:41'), PULocationID=260, DOLocationID=157, trip_distance=1.27, predicted_duration=6.35),\n", 461 | " Pandas(Index=3, VendorID=2.0, lpep_pickup_datetime=Timestamp('2020-01-05 16:32:26'), PULocationID=82, DOLocationID=83, trip_distance=1.25, predicted_duration=6.25),\n", 462 | " Pandas(Index=4, VendorID=2.0, lpep_pickup_datetime=Timestamp('2020-01-29 19:22:42'), PULocationID=166, DOLocationID=42, trip_distance=1.84, predicted_duration=9.200000000000001)]" 463 | ] 464 | }, 465 | "execution_count": 31, 466 | "metadata": {}, 467 | "output_type": "execute_result" 468 | } 469 | ], 470 | "source": [ 471 | "duration_rdd.mapPartitions(apply_model_in_batch).take(5)" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 32, 477 | "id": "cf0ed02e", 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "df_predicts = duration_rdd \\\n", 482 | " .mapPartitions(apply_model_in_batch)\\\n", 483 | " .toDF() \\\n", 484 | " .drop('Index')" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 33, 490 | "id": "a8a6821d", 491 | "metadata": {}, 492 | "outputs": [ 493 | { 494 | "name": "stdout", 495 | "output_type": "stream", 496 | "text": [ 497 | "+------------------+\n", 498 | "|predicted_duration|\n", 499 | "+------------------+\n", 500 | "|63.849999999999994|\n", 501 | "| 40.0|\n", 502 | "| 6.35|\n", 503 | "| 6.25|\n", 504 | "| 9.200000000000001|\n", 505 | "| 3.8|\n", 506 | "|16.599999999999998|\n", 507 | "| 11.05|\n", 508 | "| 4.5|\n", 509 | "| 30.5|\n", 510 | "| 8.7|\n", 511 | "|5.8999999999999995|\n", 512 | "| 11.0|\n", 513 | "| 15.2|\n", 514 | "| 4.25|\n", 515 | "|25.299999999999997|\n", 516 | "|7.8500000000000005|\n", 517 | "| 34.0|\n", 518 | "| 5.300000000000001|\n", 519 | "| 6.15|\n", 520 | "+------------------+\n", 521 | "only showing top 20 rows\n", 522 | "\n" 523 | ] 524 | } 525 | ], 526 | "source": [ 527 | "df_predicts.select('predicted_duration').show()" 528 | ] 529 | } 530 | ], 531 | "metadata": { 532 | "kernelspec": { 533 | "display_name": "Python 3 (ipykernel)", 534 | "language": "python", 535 | "name": "python3" 536 | }, 537 | "language_info": { 538 | "codemirror_mode": { 539 | "name": "ipython", 540 | "version": 3 541 | }, 542 | "file_extension": ".py", 543 | "mimetype": "text/x-python", 544 | "name": "python", 545 | "nbconvert_exporter": "python", 546 | "pygments_lexer": "ipython3", 547 | "version": "3.9.15" 548 | } 549 | }, 550 | "nbformat": 4, 551 | "nbformat_minor": 5 552 | } 553 | -------------------------------------------------------------------------------- /Week 5/Code/03 Taxi Schema.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "60993230", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pyspark\n", 11 | "from pyspark.sql import SparkSession" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "id": "03f3af5b", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "spark = SparkSession.builder \\\n", 22 | " .master(\"local[*]\") \\\n", 23 | " .appName('test') \\\n", 24 | " .getOrCreate()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "ef0f2f45", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# import pandas as pd\n", 35 | "# from pyspark.sql import types\n", 36 | "\n", 37 | "# green_schema = types.StructType([\n", 38 | "# types.StructField(\"VendorID\", types.IntegerType(), True),\n", 39 | "# types.StructField(\"lpep_pickup_datetime\", types.TimestampType(), True),\n", 40 | "# types.StructField(\"lpep_dropoff_datetime\", types.TimestampType(), True),\n", 41 | "# types.StructField(\"store_and_fwd_flag\", types.StringType(), True),\n", 42 | "# types.StructField(\"RatecodeID\", types.IntegerType(), True),\n", 43 | "# types.StructField(\"PULocationID\", types.IntegerType(), True),\n", 44 | "# types.StructField(\"DOLocationID\", types.IntegerType(), True),\n", 45 | "# types.StructField(\"passenger_count\", types.IntegerType(), True),\n", 46 | "# types.StructField(\"trip_distance\", types.DoubleType(), True),\n", 47 | "# types.StructField(\"fare_amount\", types.DoubleType(), True),\n", 48 | "# types.StructField(\"extra\", types.DoubleType(), True),\n", 49 | "# types.StructField(\"mta_tax\", types.DoubleType(), True),\n", 50 | "# types.StructField(\"tip_amount\", types.DoubleType(), True),\n", 51 | "# types.StructField(\"tolls_amount\", types.DoubleType(), True),\n", 52 | "# types.StructField(\"ehail_fee\", types.DoubleType(), True),\n", 53 | "# types.StructField(\"improvement_surcharge\", types.DoubleType(), True),\n", 54 | "# types.StructField(\"total_amount\", types.DoubleType(), True),\n", 55 | "# types.StructField(\"payment_type\", types.IntegerType(), True),\n", 56 | "# types.StructField(\"trip_type\", types.IntegerType(), True),\n", 57 | "# types.StructField(\"congestion_surcharge\", types.DoubleType(), True)\n", 58 | "# ])\n", 59 | "\n", 60 | "# yellow_schema = types.StructType([\n", 61 | "# types.StructField(\"VendorID\", types.IntegerType(), True),\n", 62 | "# types.StructField(\"tpep_pickup_datetime\", types.TimestampType(), True),\n", 63 | "# types.StructField(\"tpep_dropoff_datetime\", types.TimestampType(), True),\n", 64 | "# types.StructField(\"passenger_count\", types.IntegerType(), True),\n", 65 | "# types.StructField(\"trip_distance\", types.DoubleType(), True),\n", 66 | "# types.StructField(\"RatecodeID\", types.IntegerType(), True),\n", 67 | "# types.StructField(\"store_and_fwd_flag\", types.StringType(), True),\n", 68 | "# types.StructField(\"PULocationID\", types.IntegerType(), True),\n", 69 | "# types.StructField(\"DOLocationID\", types.IntegerType(), True),\n", 70 | "# types.StructField(\"payment_type\", types.IntegerType(), True),\n", 71 | "# types.StructField(\"fare_amount\", types.DoubleType(), True),\n", 72 | "# types.StructField(\"extra\", types.DoubleType(), True),\n", 73 | "# types.StructField(\"mta_tax\", types.DoubleType(), True),\n", 74 | "# types.StructField(\"tip_amount\", types.DoubleType(), True),\n", 
75 | "# types.StructField(\"tolls_amount\", types.DoubleType(), True),\n", 76 | "# types.StructField(\"improvement_surcharge\", types.DoubleType(), True),\n", 77 | "# types.StructField(\"total_amount\", types.DoubleType(), True),\n", 78 | "# types.StructField(\"congestion_surcharge\", types.DoubleType(), True)\n", 79 | "# ])" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "id": "1f3183f7", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "processing data for 2020/1\n", 93 | "processing data for 2020/2\n", 94 | "processing data for 2020/3\n", 95 | "processing data for 2020/4\n", 96 | "processing data for 2020/5\n", 97 | "processing data for 2020/6\n", 98 | "processing data for 2020/7\n", 99 | "processing data for 2020/8\n", 100 | "processing data for 2020/9\n", 101 | "processing data for 2020/10\n", 102 | "processing data for 2020/11\n", 103 | "processing data for 2020/12\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "year = 2020\n", 109 | "\n", 110 | "for month in range(1, 13):\n", 111 | " print(f'processing data for {year}/{month}')\n", 112 | "\n", 113 | " input_path = f'D:/data/raw/green/{year}/{month:02d}/'\n", 114 | " output_path = f'D:/data/pq/green/{year}/{month:02d}/'\n", 115 | "\n", 116 | " df_green = spark.read \\\n", 117 | " .option(\"header\", \"true\") \\\n", 118 | " .option(\"inferSchema\", \"true\") \\ # .schema(green_schema)\n", 119 | " .csv(input_path)\n", 120 | "\n", 121 | " df_green \\\n", 122 | " .repartition(4) \\\n", 123 | " .write.parquet(output_path)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 5, 129 | "id": "0ee5b906", 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "processing data for 2021/1\n", 137 | "processing data for 2021/2\n", 138 | "processing data for 2021/3\n", 139 | "processing data for 2021/4\n", 140 | "processing data for 2021/5\n", 141 | "processing data for 2021/6\n", 142 | "processing data for 2021/7\n", 143 | "processing data for 2021/8\n" 144 | ] 145 | }, 146 | { 147 | "ename": "AnalysisException", 148 | "evalue": "Path does not exist: file:/D:/data/raw/green/2021/08", 149 | "output_type": "error", 150 | "traceback": [ 151 | "\u001b[1;31m--------------------\u001b[0m", 152 | "\u001b[1;31mAnalysisException\u001b[0mTraceback (most recent call last)", 153 | "Cell \u001b[1;32mIn[5], line 9\u001b[0m\n\u001b[0;32m 6\u001b[0m input_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mD:/data/raw/green/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00myear\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmonth\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m02d\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m 7\u001b[0m output_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mD:/data/pq/green/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00myear\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmonth\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m02d\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m----> 9\u001b[0m df_green \u001b[38;5;241m=\u001b[39m \u001b[43mspark\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m 
\u001b[49m\u001b[43m\\\u001b[49m\n\u001b[0;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moption\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mheader\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtrue\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m\\\u001b[49m\n\u001b[0;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moption\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43minferSchema\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtrue\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m\\\u001b[49m\n\u001b[0;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcsv\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 14\u001b[0m df_green \\\n\u001b[0;32m 15\u001b[0m \u001b[38;5;241m.\u001b[39mrepartition(\u001b[38;5;241m4\u001b[39m) \\\n\u001b[0;32m 16\u001b[0m \u001b[38;5;241m.\u001b[39mwrite\u001b[38;5;241m.\u001b[39mparquet(output_path)\n", 154 | "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pyspark\\sql\\readwriter.py:410\u001b[0m, in \u001b[0;36mDataFrameReader.csv\u001b[1;34m(self, path, schema, sep, encoding, quote, escape, comment, header, inferSchema, ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, dateFormat, timestampFormat, maxColumns, maxCharsPerColumn, maxMalformedLogPerPartition, mode, columnNameOfCorruptRecord, multiLine, charToEscapeQuoteEscaping, samplingRatio, enforceSchema, emptyValue, locale, lineSep, pathGlobFilter, recursiveFileLookup, modifiedBefore, modifiedAfter, unescapedQuoteHandling)\u001b[0m\n\u001b[0;32m 408\u001b[0m path \u001b[38;5;241m=\u001b[39m [path]\n\u001b[0;32m 409\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(path) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlist\u001b[39m:\n\u001b[1;32m--> 410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_df(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jreader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcsv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_spark\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jvm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPythonUtils\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtoSeq\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 411\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(path, RDD):\n\u001b[0;32m 412\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfunc\u001b[39m(iterator):\n", 155 | "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py:1321\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 1315\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[0;32m 1316\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[0;32m 1317\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[0;32m 1318\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[0;32m 1320\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[1;32m-> 1321\u001b[0m return_value \u001b[38;5;241m=\u001b[39m \u001b[43mget_return_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1322\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1324\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[0;32m 1325\u001b[0m temp_arg\u001b[38;5;241m.\u001b[39m_detach()\n", 156 | "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pyspark\\sql\\utils.py:117\u001b[0m, in \u001b[0;36mcapture_sql_exception..deco\u001b[1;34m(*a, **kw)\u001b[0m\n\u001b[0;32m 113\u001b[0m converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n\u001b[0;32m 114\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(converted, UnknownException):\n\u001b[0;32m 115\u001b[0m \u001b[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001b[39;00m\n\u001b[0;32m 116\u001b[0m \u001b[38;5;66;03m# JVM exception message.\u001b[39;00m\n\u001b[1;32m--> 117\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m converted \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[0;32m 118\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 119\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n", 157 | "\u001b[1;31mAnalysisException\u001b[0m: Path does not exist: file:/D:/data/raw/green/2021/08" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "year = 2021\n", 163 | "\n", 164 | "for month in range(1, 13):\n", 165 | " print(f'processing data for {year}/{month}')\n", 166 | "\n", 167 | " input_path = f'D:/data/raw/green/{year}/{month:02d}/'\n", 168 | " output_path = f'D:/data/pq/green/{year}/{month:02d}/'\n", 169 | "\n", 170 | " df_green = spark.read \\\n", 171 | " .option(\"header\", \"true\") \\\n", 172 | " .option(\"inferSchema\", \"true\") \\\n", 173 | " .csv(input_path)  # alternative: .schema(green_schema)\n", 174 | "\n", 175 | " df_green \\\n", 176 | " .repartition(4) \\\n", 177 | " .write.parquet(output_path)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 6, 183 | "id": "1fa4ae14", 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "processing data for 2020/1\n", 191 | "processing data for 2020/2\n", 192 | "processing data for 2020/3\n", 193 | "processing data for 2020/4\n", 194 | "processing data for 2020/5\n", 195 | "processing data for 2020/6\n", 196 | "processing data for 2020/7\n", 197 | "processing data for 2020/8\n", 198 | "processing data for 2020/9\n", 199 | "processing data for 2020/10\n", 200 | "processing data
for 2020/11\n", 201 | "processing data for 2020/12\n" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "year = 2020\n", 207 | "\n", 208 | "for month in range(1, 13):\n", 209 | " print(f'processing data for {year}/{month}')\n", 210 | "\n", 211 | " input_path = f'D:/data/raw/yellow/{year}/{month:02d}/'\n", 212 | " output_path = f'D:/data/pq/yellow/{year}/{month:02d}/'\n", 213 | "\n", 214 | " df_green = spark.read \\\n", 215 | " .option(\"header\", \"true\") \\\n", 216 | " .option(\"inferSchema\", \"true\") \\\n", 217 | " .csv(input_path)  # alternative: .schema(yellow_schema)\n", 218 | "\n", 219 | " df_green \\\n", 220 | " .repartition(4) \\\n", 221 | " .write.parquet(output_path)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 7, 227 | "id": "738bbef7", 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "processing data for 2021/1\n", 235 | "processing data for 2021/2\n", 236 | "processing data for 2021/3\n", 237 | "processing data for 2021/4\n", 238 | "processing data for 2021/5\n", 239 | "processing data for 2021/6\n", 240 | "processing data for 2021/7\n", 241 | "processing data for 2021/8\n" 242 | ] 243 | }, 244 | { 245 | "ename": "AnalysisException", 246 | "evalue": "Path does not exist: file:/D:/data/raw/yellow/2021/08", 247 | "output_type": "error", 248 | "traceback": [ 249 | "\u001b[1;31m--------------------\u001b[0m", 250 | "\u001b[1;31mAnalysisException\u001b[0mTraceback (most recent call last)", 251 | "Cell \u001b[1;32mIn[7], line 9\u001b[0m\n\u001b[0;32m 6\u001b[0m input_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mD:/data/raw/yellow/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00myear\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmonth\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m02d\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m 7\u001b[0m output_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mD:/data/pq/yellow/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00myear\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmonth\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m02d\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m----> 9\u001b[0m df_green \u001b[38;5;241m=\u001b[39m \u001b[43mspark\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m \u001b[49m\u001b[43m\\\u001b[49m\n\u001b[0;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moption\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mheader\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtrue\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m\\\u001b[49m\n\u001b[0;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moption\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43minferSchema\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtrue\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m
\u001b[49m\u001b[43m\\\u001b[49m\n\u001b[0;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcsv\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 14\u001b[0m df_green \\\n\u001b[0;32m 15\u001b[0m \u001b[38;5;241m.\u001b[39mrepartition(\u001b[38;5;241m4\u001b[39m) \\\n\u001b[0;32m 16\u001b[0m \u001b[38;5;241m.\u001b[39mwrite\u001b[38;5;241m.\u001b[39mparquet(output_path)\n", 252 | "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pyspark\\sql\\readwriter.py:410\u001b[0m, in \u001b[0;36mDataFrameReader.csv\u001b[1;34m(self, path, schema, sep, encoding, quote, escape, comment, header, inferSchema, ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, dateFormat, timestampFormat, maxColumns, maxCharsPerColumn, maxMalformedLogPerPartition, mode, columnNameOfCorruptRecord, multiLine, charToEscapeQuoteEscaping, samplingRatio, enforceSchema, emptyValue, locale, lineSep, pathGlobFilter, recursiveFileLookup, modifiedBefore, modifiedAfter, unescapedQuoteHandling)\u001b[0m\n\u001b[0;32m 408\u001b[0m path \u001b[38;5;241m=\u001b[39m [path]\n\u001b[0;32m 409\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(path) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlist\u001b[39m:\n\u001b[1;32m--> 410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_df(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jreader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcsv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_spark\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jvm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPythonUtils\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtoSeq\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 411\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(path, RDD):\n\u001b[0;32m 412\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfunc\u001b[39m(iterator):\n", 253 | "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py:1321\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 1315\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[0;32m 1316\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[0;32m 1317\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[0;32m 1318\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[0;32m 1320\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[1;32m-> 1321\u001b[0m return_value \u001b[38;5;241m=\u001b[39m \u001b[43mget_return_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1322\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1324\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[0;32m 1325\u001b[0m temp_arg\u001b[38;5;241m.\u001b[39m_detach()\n", 254 | "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pyspark\\sql\\utils.py:117\u001b[0m, in \u001b[0;36mcapture_sql_exception..deco\u001b[1;34m(*a, **kw)\u001b[0m\n\u001b[0;32m 113\u001b[0m converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n\u001b[0;32m 114\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(converted, UnknownException):\n\u001b[0;32m 115\u001b[0m \u001b[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001b[39;00m\n\u001b[0;32m 116\u001b[0m \u001b[38;5;66;03m# JVM exception message.\u001b[39;00m\n\u001b[1;32m--> 117\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m converted \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[0;32m 118\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 119\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n", 255 | "\u001b[1;31mAnalysisException\u001b[0m: Path does not exist: file:/D:/data/raw/yellow/2021/08" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "year = 2021\n", 261 | "\n", 262 | "for month in range(1, 13):\n", 263 | " print(f'processing data for {year}/{month}')\n", 264 | "\n", 265 | " input_path = f'D:/data/raw/yellow/{year}/{month:02d}/'\n", 266 | " output_path = f'D:/data/pq/yellow/{year}/{month:02d}/'\n", 267 | "\n", 268 | " df_green = spark.read \\\n", 269 | " .option(\"header\", \"true\") \\\n", 270 | " .option(\"inferSchema\", \"true\") \\\n", 271 | " .csv(input_path)  # alternative: .schema(yellow_schema)\n", 272 | "\n", 273 | " df_green \\\n", 274 | " .repartition(4) \\\n", 275 | " .write.parquet(output_path)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 8, 281 | "id": "36675614", 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "df = spark.read.parquet('D:/data/pq/yellow/2021/01/')" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 9, 291 | "id": "fda63bac", 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "root\n", 299 | " |-- VendorID: integer (nullable = true)\n", 300 | " |-- tpep_pickup_datetime: timestamp (nullable = true)\n", 301 | " |-- tpep_dropoff_datetime: timestamp (nullable = true)\n", 302 | " |-- passenger_count: integer (nullable = true)\n", 303 | " |-- trip_distance: double (nullable = true)\n", 304 | " |-- RatecodeID: integer (nullable = true)\n", 305 | " |-- store_and_fwd_flag: string (nullable = true)\n", 306 | " |-- PULocationID: integer (nullable = true)\n", 307 | " |-- DOLocationID: integer (nullable = true)\n", 308 | " |-- payment_type: integer (nullable = true)\n", 309 | " |-- fare_amount: double (nullable = true)\n", 310 | " |-- extra: double (nullable = true)\n", 311 | " |-- mta_tax: double (nullable = true)\n", 312 | " |-- tip_amount: double (nullable = true)\n", 313 | " |-- tolls_amount: double (nullable = true)\n", 314 | " |-- improvement_surcharge: double (nullable = true)\n", 315 | " |-- total_amount: double (nullable = true)\n", 316 | " |-- congestion_surcharge: double (nullable = true)\n", 317 | "\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "df.printSchema()" 323 | ] 324 | } 325 | ],
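Both 2021 loops above stop with `AnalysisException: Path does not exist` at 2021/08, because the raw taxi data on disk only runs through July 2021. A small guard lets the loop cover whatever months are actually present instead of crashing — a sketch assuming the same local `D:/data/raw/...` layout and an already-created `spark` session:

```python
import os

# Sketch only: skip months that were never downloaded instead of crashing.
year = 2021

for month in range(1, 13):
    input_path = f'D:/data/raw/yellow/{year}/{month:02d}/'
    output_path = f'D:/data/pq/yellow/{year}/{month:02d}/'

    # The raw data is read from the local filesystem, so a plain
    # existence check is enough to detect a missing month
    if not os.path.exists(input_path):
        print(f'no raw data for {year}/{month}, skipping')
        continue

    print(f'processing data for {year}/{month}')

    df_yellow = spark.read \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .csv(input_path)

    df_yellow.repartition(4).write.parquet(output_path)
```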
326 | "metadata": { 327 | "kernelspec": { 328 | "display_name": "Python 3 (ipykernel)", 329 | "language": "python", 330 | "name": "python3" 331 | }, 332 | "language_info": { 333 | "codemirror_mode": { 334 | "name": "ipython", 335 | "version": 3 336 | }, 337 | "file_extension": ".py", 338 | "mimetype": "text/x-python", 339 | "name": "python", 340 | "nbconvert_exporter": "python", 341 | "pygments_lexer": "ipython3", 342 | "version": "3.9.15" 343 | } 344 | }, 345 | "nbformat": 4, 346 | "nbformat_minor": 5 347 | } 348 | -------------------------------------------------------------------------------- /Week 5/Data Engineering Zoomcamp Week 5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "26d71c85", 6 | "metadata": {}, 7 | "source": [ 8 | "# Data Engineering Zoom Camp - Detailed Week 5 Notes" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "f84dce6a", 14 | "metadata": {}, 15 | "source": [ 16 | "# 1) Batch Processing\n", 17 | "\n", 18 | "There are two key approaches to processing data:\n", 19 | "\n", 20 | "- Batch processing\n", 21 | "- Stream processing (sometimes called real-time processing)\n", 22 | "\n", 23 | "In some circles, you’ll hear the first talked about as being the old way of doing things and the second as the more modern approach. The same sort of language is used when comparing monolithic apps to microservices or on-premise solutions to the cloud.\n", 24 | "\n", 25 | "In reality, things aren’t quite that simple in this case or in those other cases mentioned. Stream processing isn’t so much a replacement for batch processing as it is a different approach, and it’s not without its challenges.\n", 26 | "\n", 27 | "### What is Batch Processing?\n", 28 | "\n", 29 | "Batch processing is a term used to describe collecting, modifying, or exporting multiple data records at a regular cadence with downtime in between batches. Because large amounts of data can be processed all at once in these batches, it can be a very efficient approach and is best suited for handling frequent, repetitive tasks. It is the most common form of data processing and fits many businesses' data needs.\n", 30 | "\n", 31 | "Many businesses face increasingly complicated and diverse data challenges due to the sheer magnitude of data available. Batch processing has increased in sophistication, and is also often used in conjunction with other processing techniques for modern analysis. While batch processing used to be by far the most common and widely used method of data processing, recently real-time or near real-time stream processing has proven to be a worthy competitor. As traditional batch systems run overnight to process data accumulated during the day, there is naturally a delta between the real world and what the data actually describes. 
Advanced Batch Processing partially solves this issue, but even the most advanced systems cannot compete with stream processing for real-time continuous data.\n", 32 | "\n", 33 | "\n", 34 | "### Process Flow\n", 35 | "\n", 36 | "The general flow of batch processing can be broken down into the following steps:\n", 37 | "\n", 38 | "- `Data acquisition`: This involves obtaining data from various sources, such as databases, flat files, or web services.\n", 39 | "\n", 40 | "- `Data preparation`: This involves cleaning, filtering, and transforming data to make it ready for processing.\n", 41 | "\n", 42 | "- `Batch scheduling`: This involves scheduling batch processing jobs using a batch scheduling tool, which automates the execution of the jobs.\n", 43 | "\n", 44 | "- `Batch processing`: This involves executing the batch processing jobs, which can include tasks such as data integration, data transformation, and data analysis.\n", 45 | "\n", 46 | "- `Error handling and recovery`: This involves detecting and handling errors that may occur during batch processing, such as missing or invalid data.\n", 47 | "\n", 48 | "- `Reporting and analysis`: This involves generating reports and analyzing the processed data using business intelligence and analytics tools.\n", 49 | "\n", 50 | "- `Archiving and storage`: This involves archiving and storing the processed data for future use or reference.\n", 51 | "\n", 52 | "\n", 53 | "### Tech Stack\n", 54 | "\n", 55 | "Batch processing is a technique that can be used with a wide range of technologies and tools, depending on the specific requirements and constraints of the application. Some of the common technologies used for batch processing are:\n", 56 | "\n", 57 | "`Batch scheduling tools`: These tools are used to schedule and automate batch processing jobs. Some popular batch scheduling tools include Control-M, IBM Tivoli Workload Scheduler, and Autosys.\n", 58 | "\n", 59 | "`Data integration tools`: These tools are used to extract, transform, and load (ETL) data from various sources into a target system. Some popular data integration tools include Informatica, Talend, and SSIS.\n", 60 | "\n", 61 | "`Scripting languages`: Scripting languages like Python, Perl, and shell scripts are often used to write the code for batch processing tasks such as data transformation, file handling, and error handling.\n", 62 | "\n", 63 | "`Relational database management systems (RDBMS)`: RDBMS such as Oracle, MySQL, and SQL Server are commonly used for storing and processing large volumes of data.\n", 64 | "\n", 65 | "`Big data technologies`: Big data technologies like Apache Hadoop, Spark, and Hive are used for processing large volumes of unstructured or semi-structured data.\n", 66 | "\n", 67 | "`Workflow automation tools`: Workflow automation tools like Apache Airflow, Luigi, and Azkaban are used for automating the workflow of batch processing jobs.\n", 68 | "\n", 69 | "`Business intelligence and analytics tools`: Business intelligence and analytics tools like Tableau, QlikView, and Power BI are used for analyzing and visualizing the processed data.\n", 70 | "\n", 71 | "### Advantages\n", 72 | "\n", 73 | "1) Efficiency\n", 74 | "\n", 75 | "Batch processing allows a company to process data when computing or other resources are available. For example, a common schedule is to process data overnight when the database and servers aren't being used by employees. 
If data isn't frequently updated, one can simply change the batch processing schedule to make it less frequent as well.\n", 76 | "\n", 77 | "2) Simplicity\n", 78 | "\n", 79 | "Compared to stream processing, batch processing is usually less complex and doesn't require special hardware or system support for incoming data. Batch processing systems typically require less maintenance than stream processing.\n", 80 | "\n", 81 | "3) Processing Speed\n", 82 | "\n", 83 | "Because batch processing allows companies to process large amounts of data at once, it shortens overall processing time and delivers data that companies can use in a timely fashion.\n", 84 | "\n", 85 | "\n", 86 | "### Disadvantages\n", 87 | "\n", 88 | "1) Processing delays\n", 89 | "\n", 90 | "Batch processing can cause delays in processing large volumes of data or transactions, which may impact the overall performance of the system.\n", 91 | "\n", 92 | "2) Limited real-time processing\n", 93 | "\n", 94 | "Batch processing is limited to processing data or transactions in a batch mode, which may not be suitable for applications that require real-time processing.\n", 95 | "\n", 96 | "3) Security\n", 97 | "\n", 98 | "Batch processing may pose security risks, as large volumes of data or transactions are processed at once, making it easier for cyber attackers to access sensitive information.\n", 99 | "\n", 100 | "\n", 101 | "### Applications\n", 102 | "\n", 103 | "Batch processing is a widely used technique for processing large volumes of data or transactions in various industries. Here are some real-world examples of batch processing:\n", 104 | "\n", 105 | "`Banking and Finance`: In banking and finance, batch processing is used to process large volumes of financial transactions, such as clearing and settlement of trades, reconciling account balances, and generating financial reports. These tasks are typically run overnight, and the results are made available to users the following morning.\n", 106 | "\n", 107 | "`Retail and E-commerce`: In retail and e-commerce, batch processing is used to update inventory levels, process customer orders, and generate reports. For example, at the end of the day, a retailer may run a batch process to update the inventory levels in their system based on the sales that were made during the day.\n", 108 | "\n", 109 | "`Healthcare`: In healthcare, batch processing is used for tasks such as claims processing, billing, and patient record updates. For example, a health insurer may run a batch process at the end of the day to process claims submitted by healthcare providers during the day.\n", 110 | "\n", 111 | "`Manufacturing`: In manufacturing, batch processing is used to manage production runs of batches of products. For example, a food manufacturer may run a batch process to produce a specific quantity of a product, with each batch consisting of a set number of units.\n", 112 | "\n", 113 | "`Marketing`: In marketing, batch processing is used to manage large volumes of customer data, such as contact information and purchase history. 
For example, a company may run a batch process to update their marketing database with the latest customer information, allowing them to target specific customers with personalized marketing campaigns.\n", 114 | "\n", 115 | "Overall, batch processing is a common technique used in various industries to process large volumes of data or transactions efficiently and effectively.\n", 116 | "\n", 117 | "\n", 118 | "### Advanced Batch Processing\n", 119 | "\n", 120 | "Traditionally, batch processing was usually configured to run sequentially. Each job was processed one after another on a single machine. The need for more sophistication led to the rise of concurrent and parallel batch processing.\n", 121 | "\n", 122 | "Concurrent Batch Processing\n", 123 | "\n", 124 | "Concurrent batch processing typically refers to jobs that run batches partially overlapping in time. This overlap allows for a piece of the data to always be analyzed at a given time. Concurrent batch processing gives the illusion of parallelism without requiring more than a single CPU core. Due to this concurrent \"multi-threading\" behavior, the architecture for concurrent batch processing must be designed with fault tolerance in mind. As batches are not run one after another, a single batch failure could cause a domino effect on other batches should the architecture be configured poorly.\n", 125 | "\n", 126 | "Parallel Batch Processing\n", 127 | "\n", 128 | "Parallel batch processing takes a similar approach to concurrent batch processing; however, instead of overlapping parts of batches over time, entire batches are scheduled in parallel. By taking advantage of the relative cheapness of multicore machines in the modern age, parallel batch processing can multitask effectively.\n", 129 | "\n", 130 | "Modern Batch Processing\n", 131 | "\n", 132 | "Modern-day batch processing methods often use a combination of both concurrent and parallel batch processing, also called parallel concurrent batch processing. This approach balances how each CPU core handles multiple tasks against how each worker system handles a single task; when properly configured, it is a state-of-the-art solution. Institutions that require greater stability and security, such as the financial sector, most commonly use parallel concurrent batch processing. For the most important data, multiple redundant batches are often run so that even if one batch fails, the others can cover for the failure.\n", 133 | "\n", 134 | "As mentioned earlier, live data streaming has traditionally been a challenge for batch processing. While attempts have been made to use concurrent and parallel batch processing methods to analyze \"microbatches\" stacked on top of each other on extremely powerful machines, the use case for complex architectures like this is niche. For the majority of live data cases, stream processing is still preferred. The main business use case for batch processing in this application is when such large quantities of data need to be analyzed that stream data processing is not a viable option." 
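The concurrent-versus-parallel distinction above maps neatly onto Python's standard library: a thread pool interleaves batches in time (concurrency), while a process pool runs whole batches simultaneously on separate CPU cores (parallelism). A toy sketch follows — `process_batch` and the batch list are hypothetical stand-ins, not part of any framework named above:

```python
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def process_batch(batch_id):
    # Hypothetical batch job: in practice this would load, transform,
    # and write one batch of records.
    return batch_id, sum(range(1_000_000))

if __name__ == '__main__':
    batches = range(8)

    # Concurrent: batches overlap in time, interleaved by the scheduler
    # on however many cores happen to be available.
    with ThreadPoolExecutor(max_workers=4) as pool:
        concurrent_results = list(pool.map(process_batch, batches))

    # Parallel: whole batches run at the same time in separate processes,
    # each on its own CPU core.
    with ProcessPoolExecutor(max_workers=4) as pool:
        parallel_results = list(pool.map(process_batch, batches))
```

For CPU-bound work like this, only the process pool yields a real speedup in CPython; the thread pool merely interleaves the batches, which is exactly the "illusion of parallelism" the notes describe.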
135 | ] 136 | }, 137 | { 138 | "attachments": { 139 | "image.png": { 140 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAwsAAAFyCAIAAAA1dtHNAAAgAElEQVR4nOydeXxTVfr/n5s9adI2aQoplKVQqUBFLEwtWhEXYGYAGRAVxQFkHEAdBkfg64LIIII6wCiDiqBTAcVxQfwxqAg4LlgVKhRBFoFKKRS6pU2XNHvu/f1x2sPlZmnaJmmW5/3ij+Tm3KXJ4XM+5znPOYdpMFsAQRAEQRAE4SHq6gdAEARBEASJONAhIQiCIAiCCEGHhCAIgiAIIgQdEoIgCIIgiBB0SAiCIAiCIELQISEIgiAIgghBh4QgCIIgCCJEEkihhtkDQv0cSIyRtPG050GsSEiAeNYfrDxI4ND6g9UGoXhtlfyDMSQEQRAEQRAh6JAQBEEQBEGEoENCEARBEAQRgg4JQRAEQRBECDokBEEQBEEQIeiQEARBEARBhKBDQhAEQRAEEYIOCUEQBEEQRAg6JARBEARBECHokBAEQRAEQYSgQ0IQBEEQBBGCDglBEARBEEQIOiQEQRAEQRAh6JAQBEEQBEGEoENCEARBEAQRgg4JQRAEQRBECDokBEEQBEEQIZKufoCYQr+gAADsxwqbdhf4KaadsVys7+k2XjRtXhKuR0MQBEEQIZqxs+TZ+QBgXDOrq58l4ogjh6QZ2/Lz+7cvXkvKs3JlfbMBwHpoj8tY7utEZc5oz1ukPvUfxeB8a/FeWv8UQ26WGDJclaUd+TOQgFHlTVDdOEnaIxMAWLPJ3VjbpnmNUjzrGIIgEYhEn54waqq0zyB6xFl2ovnr9/w0K6FGnp1PWy4/SPTpymFjAMDdUGPZv7MDBaKROHJIydOXkRcSQ4af4E3SlIWJk+aT17Q11Yyf69X9BIJicD7wzBMSBiT69NQn35UYMnjHMgBAmTM6aeoT5Q8M4Bcmhti/941wsI5FNe218vKsXPWt0ySGviK1FgCcl0qcZScatq32WphU71hqtKIUiT5dO3sV+a/KR5kzOnHSfFdlacWCkV3yYAHiMparbpgoy8wBAHFSqmf9pH9d7bqHu+D5QkMcOSSK6oaJfhySevT04N7OXVch1qU5SoqDe1nED9QeuesqHOeOkYPixBSJIYM0KnyodY7e8BLWsSilXVaelNcvKJD2HnjFQUOGMme0evR0894tnj6JVG9XZSk6pC5Eok/vtnS7WJdG3roqS121Fzm7VaJPF6mTxbq0K+tAhFK77hHDqi8ZmTJx8qMCtdSMnUXskbV4byzVtHh0SCK1Vr+gwOt4hHbGcs8WtJNcmpcb3Asi/tGMnUXkxna8sGblvYJP5Vkx+HNgHYtS2mXlVXkTdHPWMDIlALBmk6PsOGe3AoBEny7tPVCk1iZOmi9KSMLsxgiE2iNr8d76zc8IwtVk6K2LHq0duIzlTZ9uTJw037MN1YyfCwDuuooYG+iPO4fEOayMTKnIzpfo0z1HVVQ3TKRluuLpkCBAsg4BwLRxkeen9lNF/Lc05wxBwkx7rXzytCVElyyF22vXzxcUTvnLK2JdmnrMTFdlafRGQ2MS7YzlxB55/nAEl7Hc1yBppNGwbbVy2Bhp74HKnNGqvAkkXKRfUED+wPqty7v6AYNM3Dkk+5lDsj6DRWpt8oxnBW6XBJA4h9V+5pDnaHGHSX3qP5KUnq7ai54iKNGnJ894Vj5gOOkvuusqmr/5IFr+q0Q7ZAaHODGl5e34ueoxM8lrkhCQtmYfADR8uMqyf6d2xnLFkJvpW3qFhFFT6ZCH58+nGTtLMWy0NK0/ja5zDqvz/Mn6d1fwjRq9kdtUpRk/l9YH1mwi4ybyrFzN+LmyvtnkOvQ4vYJnHSNHbEe/MW1eop2xXH719fQ5nedPmjY9LXCKqrwJiRPn8cs07lgnz8pVDLnZa9VFOk+7rLz/VtZ+qqh62WRfIyBI10I63qzZ5NUeeSmfNyHprkUAULFgJGkjSI4a0aX2qgoAaH73oLT3QGKvyX9tXwNhmrGzVDdMpIXddRVNn7wuqE7GNbNITUu6a5Fl/055Vq4iOwbH1whx55A4u9Xy/Q71mJmCMJJEn07qcfPX74v1PYN4R0lKT69jzCJ1Mqln9IhYl5Y4ab5swDBskDqD23iRvEiZ92rV0om+iqnHzOT/LlRuKORTeVZu0l2LaElxUip5oV9QQDOjXZWlYp2B/HzSPoOo8xbcAgAYmVKWmZP6xDuVi269XPcMGQCQdNcisc7Arw9k3ETaZ5AiO9/zONtUR5XLs46RIwqAHuuKBH+atPdA/WNvXpwzhB7hD9/QMinzXiPpTb6+QCSctNnKuozl1qJdqvzJIrU2acpC7GhFCPKsXNLhsZ8+GOAp4qRU8t9ZM3ZW4uRHBeOt7VIVfreHQP5rS9OzPGuI4fk9gsJiXVry9GWOc8f43ouOtUkMGSkPrZVlXsfIlLE3vkaIxxUjTZuXsGYTI1Mmz3iWHkye8axIrWXNprCN4ovUWlfludp1D1+Y1uvCtF7Vz97prqsAAMXgfBz66QxNn25gzSYAkGXm9FhXpJ3hPfDb8OGq+i1LLYXbyVtL4fb6LUvJP34x9ZiZYp3BUVJsLd5rLd5LkkWSpiwk9shSuP3CtF4VC0aWPzDAWrwXAJQ5o+n4iKPkcOPHayvmjyA/8YVpvcjtSPdL8DxincF+5hCtD+Y9m8hxZc5o5/mT9VuWkuONH68lxwNJXCAqad6ziTxDxfwRzvMnAUCk1vLrmPaBFYxMyTms9C7Vz97pKClGexRS+Fbef0nayjrKjvspZv5yK3nBn0yOdC1kmRgAsB8rbO+5JMvedryQiA852C5VkfYeaC3eW/3snaRk/ZalnMMKAJpxsyX6dEFhiaGvpXA7LUzvSHKM+DRsW02URJU/mYhM7I2vEeIuhkQgYSTSmNlPFUn06SROaPl+R9iewVVZWvnkGPqWHydX3TAR4+QdxmUsN721OHnaErEujWRmJIy6x3n+pOX7HfxvlQSENWNnqfInA4Dj7BGv37mrsrTm+fsEKWsJN98NAI6SYn6H3rhmVs8NR0VqrfrWaaTL5dndr10/Xz5ohFiXxqg0go9sxwr5nTDT5iWqGyaK1FpXZSk/EtawbXXCzXeLdWmMTNHmV+Guq+AncbuM5Y071qXMew0A5Nn55O+lsxMa3nuBfgP2U0VVSyemrdkXFVNsopSmTzeQn5hYeevB3b66Z7SVdZw+5OeC9lNFJIeSDMogkYCv/0GqvAk0IE0QhGrAR4Jau1RFsEYa+Q+ePH0ZI1Nqxs0R1Dd+CAoAjGtmpb912ld1omNtEKPja4R4jCEBgGnzEhKwSb5vMQAkz3iWkSnDGUDyistYToy5INSJtBfL/p2X5uWa92wiwSQShU6evixtzT5V3oR2Xcp5qcRz4klLRoiHn2bN9QDgqVN8OIctwFuTq3XmCp4lPYVM1m8IAHAOK5ryMEOsPBEiYuXT3zrdfdkOzxCyrN+15AXbVOf/mu66ylA8Kt
JhfOVsJN21KHn6Mv4/z1CN7dDeAO8SuCY07S4gqii/+nrBR55Tl/xXJ3pTRh6zE5viNIYEAE2fvJ48fZksM0eVN4EEkBq3v9zVDwXuxloAwJl0QcG0eYlp8xJV3gSaqCgxZOjmrHGbqgR9tXZB1o0FAMWw0TTZliBSJ3uWJ1OWxPqe9mOFdEZ35NA6SwBb1i7Asn8nmQdAgknEyssyc9RjZvLnBPj33EgkQ8dSBbhqLx/vQKS2M6rCmutFam0gQWg/aGevoglSJDMkJrtYceyQdheQlDeSo4pTZGMV0ghJ9Okp816VZeYwMmXyfYv9ZHAHTpsTHlV5E7QPrKA6gmteI15p08o7y04EWHk62ewhQYfuLiXS6PjH+cNnvbZeCPyCkaAqdH3Ixo/XqkdPF6m1iZMfjeptCXwRvw4JAMx7NpERWfK6qx8HCSEuY3nV0okkTyhYuTWNH6/1OupBunTyrFxivt11FfYTPzjOHgEAeXY+ncyPIHz8WHlazdqsumT8N/AxFyTUWA/tIQnXsgHDOn+1SFAViT49cfKjAOA8f7Jh22q2qS55+jKvC+jEAHHtkJp2F8j6XcuoNJylKUICSGR5HjLdAAk6JLzcSSlxN9S0XI03396TxDv/RiaIVS+bTLtWTbsL0tbsiyiH5Kq9KDFkSAwZXtdQRcKMVyt/uZXtN8TPuTTHjqQzIpGAy1hOVs2QXzWs8//FgqIqJBmgwzaajK9xDqtp09Pk7mS/Nv4akjFDnGZqU2rXzzeumRXgQl5hgGgiClyIINJA494dw7J/J7GwZIkaX0hSegKAu64ywm0HzQblL34BABJ9Og7ZdBUkSZ+2efw5HJ6TtCma3z1IXtBp/0gkYD24GwAYmbLNZR3apPOqcnl9pl8OdOD0y/uvFe2i2Zy16x4hkpg8LdZ2vIl3hxRcSC2hyzT7R2LI0C8ooHon0ad3X7aD1N1wLjoQe2jGzkqaslDQkPC/XtvRbzzPotOFAsF2rBAAZJk5nostybNyyXpIZMKIWGfgP4l2xnKxzhD4jcJA0+4CYhmVOaPJRCrN2FkpD63tvuIzXA+pq/C08s1fvwd+W1lV3gSy77qrsrQzExGQoGPavIT8lGRZh6QpCzt8qfaqiiI7n69RZHcaIHNXP93Q3rvT8TV3XQU/rEDWkAQAsS5NvyAiRmOCRVyPsnUAMi3T83j9lqVNuwtcleekvQfKMnN6bb3gqiwli8T7gnNYlTmjlTmjyX8eup6y7XhhhAz5RSny7HxlzujESfNZs4lOmKcDFo6SYv6aDtZDe5KmPsHIlKr8yfJBIwBArEu7MK2X/1vUb36G7AGiHjNTdcNEV2Wpu7FW2iOTkSnEurT6LUvtp4os3+8g2STdV3xmP32QkSvJdjeh+aM7Rc3z95GdNclEKnLQXVeBa2qHFM3YWSKNrvnr9/jxAJKH5Gnl6ViGLDPH8Pwewb4x2hnLE0bdAwCcw1q3seMNMBIiap6/j+xSTFbep+okUie3SxM6oCpkQTgyWZXKYNOnGzsQhaLz15o+eV3wUcO21aoRd0gMGTE21oYOKZg07lhHZxm0OcprO1boLDuhHj2d1lrOYRUsG4h0APuxQmJfBClHXr9el7G8bsMC8qvRXc/avIXLWF69bDLJqBWptbLMy3ehm7Q37S6QGDISRt0jUmvJfBPWbGr8eK20z6BIm9RG/pykuxbJMq8DAM5hs/9ywLR5CdnaCQkR7bLyAFC77hH9ggJp74HS3gO7PfORu66CiAztXHEOa92GBV4DSBJDhtcJU2125JCg4DKWVywYqZ2xXDl8LNEZvjqxZpOj7Lin7fCkvapCot2K7Hxarzx3dQwQOr7mqw9ft3Fh6hPvMDJl8rQljpLDEZ5dECBMg9nSZqGG2QPC8CihQ5U7zlL0KXkBAKF+7Th7xGUsD7w8iETipFRpn0G2n77yWoa8iC6SNp72PBjOiiTRpyuHjSFLhriNF9tczUGelSvrm93eCavkLiKNjsw28jydFAAAd0NN1PWrSL6wYFne8OBZf6JdhTzRjJ2lGT/XM0rnv6dEF08SHHeUFNeue8Sz9vqfSR6rDonWnwisNlQTCB2YJB+IqpDfnf7nJcuQRqMKBRGvrZJ/4sIhdRs8zZxQDwDq5mQAiLrX1cejL++yyx0S0knkWbndnvkIAMx7NoV/rfl4cEiE9lp5gipvgjQ9SzZgWEvabPHepk9ex/QjSiQ7pPAgcEgIdMghxcUom/uNV9R//gt5AQDR9/qGSExeQWIGsjsh/4hEn66d+Rx0NKMTCRyXsbwDeYeW/TsBWjYWTJz8KM1odF4qsR8rjPNQAYIEi7iIIQEHKT+YAKB2hBYgCl8zYfiOggzGkKKItDX7xDqDu67SVXuRs1vFiSlkZWfoogASxFMMqfNI9OnJM55VZOfT3YpidewscDCGhDEkTzCG5AOmxXMQqxGNrxEkdDgvlYh1BrJuJD3Y4YxOJMy4jOWkFdTOWC7rNyRYS8YjCBIfDgmu9BnR+BpBQgbN5aRLzDvOHsElJ6KOLon2IZFJ/Zal0LoDEtJh4sYhIQjiF7RECBIz4H/noIBraiMIgiAIgghBh4QgCIIgCCIEHRKCIAiCIIgQdEgIgiAIgiBC0CEhCIIgCIIIQYeEIAiCIAgiBB0SgiAIgiCIEHRICIIgCIIgQtAhIQiCIAiCCEGHhCAIgiAIIgQdEoIgCIIgiBB0SAiCIAiCIELQISEIgiAIgghBh4QgCIIgCCIEHRKCIAiCIIgQdEgIgiAIgiBC0CEhCIIgCIIIQYeEIAiCIAgiBB0SgiAIgiCIEHRICIIgCIIgQtAhIQiCIAiCCEGHhCAIgiAIIgQdEoIgCIIgiBB0SAiCIAiCIELQISEIgiAIgghBh4QgCIIgCCIEHRKCIAiCIIgQdEgIgiAIgiBC0CEhCIIgCIIIQYeEIAiCIAgiBB0SgiAIgiCIEHRICIIgCIIgQiRd/QAIgiAIgvhDM3YWeWE9tMdlLA/uxSX69NQn37Ud/ca0eUlwrxztoENCEARBkAhFv6BAkZ3PyJTkbfL0ZbbjhTUr7w3iLZTDxkgMGa49m4J4zdgAHRKChBbN2FnqMTMbPlxl2b8z8LO0M5azzQ0N21aH7sEQBIlw0tbskxgyrMV7mz553X6qCHjBpCAiz84HAOuhPUG/crSDDikI+G8CsamLOnptvUBfcw6r/cyhzvTY5Nn5EkOGf3uU8tBa6+EvaBlV3gT1mJmuylKsNggSt6Q+9R+JIcO8ZxN/8Ktpd4FnSXlWLgAQCwUAqrwJ4qRU8DEkRz51nDtGy0v06azZ5DKWy7NyZX2z3Q017erOxTBx5JA0Y2clT1/mrqu4NC+Xf1yiT+++4jORWlu77uGOVQs/TSA2dVEH6aJZCrc7zh4BAMWw0YrB+SkPra1dP79jF5T2yHTXVfgpIM/KVeVPJrcjWPbvVF53u/nLrR27I4IEAnbeIhl5Vq5icL6jpNhPblDPDUftpw+KE1NkmTlk6E2iT9cvKJD2HkgKJE9fZincTrSLXxgAOIe1ctGtxD+Jd
QZ3XWXqU/9RDM4nJ0rTs7BiQFzNZSOBRLEujdhtinb2KpFaCwAdds1+mkDL/p2Wwu11Gxd27MpI+JH1uxYAzF9ubdpd0LS7oGblvazZJMu8jl9GlTch5aG1qrwJgnM1Y2eRf/yDYp3Bv0Mi1xF0DWvXz6c9PII8K1czdha9qUSf7nkv+hgpD60V1HMkPKSt2ddzw9FASpLGTKJPD/UjEZKmLEyaclmISOdNNeKO8NwdaS+a8XMBoP7dFX7KiNRaRXY+a7fUb1na+NFLANBt6XaJoW/9lqUXpvWq37IUAGjXixR2VZ67MK1X48drGZlSM24O/Uhi6MvZrRXzR1TMH8E5rFgxCHEUQyKBREamUN86jbY9SVMWyq8axppNrLleUFg5bAwAeMYb6Uc0SinWGZznT4K36CUAeI09kIbNMwSqypugvO52/oALEmYkhr6cw8r/BRmZgnPYyGtV3oTkaUtE6mTWXK/Kn6y6cZJxzSx6XKxLI8WSpy8jIUl5Vi4jU7oqz3m9lypvQsq818hrMrRHztIvKFDmjL4wrRcAdF+2Q6TW2o5+ox4zk5Qk1idh1D0keVMxbDQdBEyaslAzbjZ5WlX+5PotS73G5JHQIdYZfP3cAjTj5ihzRpP6EwbUo6fbTx+kbzFOGeFIe2TyhYgkJJHX1c/eaT9V1NKIFO2iTYx+QYFYl0YHQ0hnj2QXCQo3f/1e4qT5Yn1Pz48IVPHinDhySCSQyDmsNB4g0adrxs22Fu1S5v7OVXuRliTNDJ07oJuzpvyBAeS1dsZy2jKxZtPFOUNIE+hurKU1mHNYa164n9RsflOnnbFcPWZm7bqHtQ+sIFErzfi5dMjPV9OLhBmRWuuuq6Rv9QsKGJmSqIxEn659YIWrsrR62WSXsTxtzT5pj0wAkGfl6uasYc31RLnIj05ESnHNSOB14wRY9u+07N/Zc8NRd11l5ZNj6HFpj0xXZSl9HrHOIOs3pGL+CAAwrPoyYdQ9rspzpI71WFckSelJSqryJiROmk+D6r22XpBn56NDCicSfTojU3pmfnhN75D1G0J/ZT6asbNk/a41f7lVEEQkH9EftF29OHlWrkittR8r5BcLcOBYnpWrypuA43Hhhy9EjpLDzksl0h6ZEkMG+U1ptJuWkQ8Y7igpppWBdPZIbSSFGz5cRT4ijSBnafL8CAC81uH4JI4ckkittZ8+6Cw7kThpvkSf7jKW6xcUuOsqzV9uVeVPdl0sIcWSpixMnDTfUVJcu+4R0grSKxCLYy3eS7wLCY/L+mYDgHzAcMv3O0wLRpJsJ834uaQS85s6YtiT7lrUuP3lpt0FpB0lkuer6UXCj8SQwTms5HcX6wyMTGnes4m0DSnzXgWAqqUTgcQLdQbbsUIA0M1ezTls5LcDAIk+nQ6rSfsMAh/JlS2306eTmsk/SKOS5Hmc50+SmwIA57Cx5npqp/hdPe0DKxwlxaTZ085YDgCCFhEJNcSX0K+dZH4wcqUgvSPlobWq/MnkSK+tFziHlfTBvIYAac9KN2cNMesuY3m7enHpb50mb5OnL6OJKbTzRrJPSC8OACT6dMOqL8nsBJrU4qoslRgyVCPuqFgwMixfJAIAIFIn09fk/3Xamn20QRFEu4kJdlV+RU8R69Kox5IY+pJcbFoYAKyHv/D8iISUUDoI8eKQ6K9uPbQncdJ8Mv4qMfStXHRrwqip0DoLgESV+A2SWGewnzlEPkoYdY/teCEN7ZAqRdKbTG8tJs7d3VADrd4crmzqpD0yWbOp5vn7yImkDCnvq+lFwgzJ8nGeP+lurCXxABIWIp9Kew+0HStU5U1Q3ThJkZ3vPH/SuGaWPCuXzDehEkOileQ13y1pZyxXDLmZ3ouszyZoUwGAPzBHnsf+ywH6qcBOidTJRDFVeRNEaq3l+5c1Y2epbpgoy8wx79mEAaQwQ9TAce4YeUsyP2zHCmn8TzXijoZtq2vXzzd/ubXbMx/RgB/4DgHSnlXz1++7KkuJPWpXL678gQHdl+2Q9h5IXRTwOm+sqZrcnShY0l2LGJmSJLXoFxSI1Mn8yGjov0KkBdZskhgy6O9CoO0RXGmAoLWv7qppmYerypsg1qVZi/d6LawcPtZVWUquLNal8ZNM+GNzSLw4JPqru4zlzvMnlcPHitTJTZ9udBnLSS+f1BXNuDmMTGna9DQ5i7RVREH4wsGHNIG0EpM5lmRURZCDQtwSbUcZlYbe12vTG9IvBPEK6Vo17XqTpBB1e+YjmrWmypvAyJTKnNHyAcNdlaUN771A/EeLieHNm+WbGL5bclWWOvU96b1ahj88ViLhD8wJJvEKunfkXu7GgwCgvO52ACCzNZ0Vv/KNHRI2JPp02q33n97hOfzqKwQo6Fl1oBcHHg0k8DpvjrNHVPmTiXABgGLoLY6SYvupIu2M5dLeA0lFkujTZX2zvY4JIiGi/t0V3Z75KHnaEhJ3lOjTE0ZNZWRKOtzBN0DQ6suVw8Y0bFutnbFcdcNEAHCWnaCFSQF5Vq525nMidXLdhgVeryMIKcU58eKQ+L+6/ZcD6jEzHSXFZOiEPxNN1m8IP27JVzEyf9Kz1RHrDHzhIA0eaTsFTZ0gY5f24Xw1vUj4kfTMhFbbaj9V5Dx/UjH0FvIRaUI8c59JF99Rcpi8TbprEbS2bQK3RCbHCe/YuhIJPcIfmOM/D7QafRqi4MefiOGmYyVIl8A3xJ5pIvz0DsHwq58QoKBn1YFeHHi0gnw5atpdkDx9GYlXaWcsZ2SK2nWPAID86uuJQJHmlnPYap6/L2jfFNIW9lNFjR+vVY+enjhpfuKkFpNNAz/Ef1MDRMo7SoplmTm9tl5wVZaa925JnDTf9vM+WliZM5pMB2HNproNC3xdRxBSinPixSHxu1BNn25wVZbSXjt/IEyk1vIrB2mBiE4xMoXnZfmdeHrEaw6KZ5eR9vx8Nb1I+JGk9OT7XTImmzRlYcO21dZDe5KnL0sYNZX8TElTFkpSe9Wun+82XgSApLsWNXy4KnnGswpeTIjUHwCQZ+X6iuiQeqUZO0uk0dl+3mc/VcS37ILnEWYe8O7lLDuhzBlN1m2S6NM14+Zgam2YERhiwY8liP8JlgjxFQL07Fl1oBfn2QoK5MhdV8HIlS0hqGOFLcEqQ19Gpkx94h1X5TnL9ztwx67w07BtdcO21V6Xf/Ta3apaOpGEtIn7of/9iVmvXfewOClVkNfveR3BeoFxThw5JNrzdhnLaZ0QCBBrNskyc1R5E9ymKu3M5ySGvlTFnJdKJIYM0liq8iYk3Hpfzcp7PZNI+CElvggKuowt971YAgC+mt4Qfh2IDySGDNvxy79mw7bV6tHTSe6Iy1huKdyuyp9M+mGcw0pyxZo+3aAcPlaVP1mVP9l2vNB2rFA+YDgRMuuhPZrxc5U5o6U9Mn2luJJp/MnTl7FmE9tUZz9VJEjT5j+PYKyEH39q2LZaOWwMeQwAYM0m894tQf9+ED8QNeCPa1yRJnJlegf/VwbfIUDPnlV7e3H01iScQBDIEWuul6T0TJ7xLOew0RE6Rqak+UxIF9Ku
[... base64 PNG data for the attached image omitted ...]" 141 | } 142 | }, 143 | "cell_type": "markdown", 144 | "id": "502fdd00", 145 | "metadata": {}, 146 | "source": [ 147 | "# 2) Apache Spark\n", 148 | "\n", 149 | "Apache Spark is an open-source, distributed processing system used for big data workloads. It uses in-memory caching and optimized query execution to run fast analytic queries against data of any size. It provides development APIs in Java, Scala, Python, and R, and supports code reuse across multiple workloads - batch processing, interactive queries, real-time analytics, machine learning, and graph processing. It is used by organizations across many industries, including FINRA, Yelp, Zillow, DataXu, Urban Institute, and CrowdStrike, and has become one of the most popular distributed processing frameworks for big data, with 365,000 meetup members as of 2017.\n", 150 | "\n", 151 | "Spark is designed to be fast, scalable, and easy to use, and it provides a range of features that make it well-suited for big data processing, machine learning, and real-time stream processing. Some of the key features of Spark include:\n", 152 | "\n", 153 | "`In-memory processing`: Spark processes data in memory, which allows for faster processing than traditional disk-based systems.\n", 154 | "\n", 155 | "`Distributed computing`: Spark is designed to run on a cluster of machines, allowing it to process large amounts of data in parallel across multiple nodes.\n", 156 | "\n", 157 | "`Data processing APIs`: Spark provides a range of APIs for processing data, including SQL, streaming, machine learning, and graph processing APIs.\n", 158 | "\n", 159 | "`Fault tolerance`: Spark is designed to be fault-tolerant, meaning that it can recover from failures in the cluster without losing data.\n", 160 | "\n", 161 | "`Community support`: Spark has a large and active community of users and developers who contribute to the development and maintenance of the system.\n", 162 | "\n", 163 | "\n", 164 | "### When to use Spark?\n", 165 | "\n", 166 | "Spark is a versatile tool for processing big data and can be used in a wide range of applications. Here are some scenarios where Spark is particularly well-suited:\n", 167 | "\n", 168 | "1) Processing large volumes of data \n", 169 | " \n", 170 | "Spark is designed to process large volumes of data quickly and efficiently. 
If you have large datasets that are too big to fit into memory on a single machine, Spark can distribute the processing across a cluster of machines, allowing you to process the data faster.\n", 171 | "\n", 172 | "2) Real-time stream processing \n", 173 | "\n", 174 | "Spark Streaming is a component of Spark that lets you process real-time data streams. If you need to process data in real time, Spark Streaming provides a scalable and fault-tolerant platform for doing so.\n", 175 | "\n", 176 | "3) Machine learning \n", 177 | "\n", 178 | "Spark's machine learning library (MLlib) provides a range of algorithms for building machine learning models. If you need to train models on large datasets, Spark can distribute the data and computation across a cluster, allowing you to train models faster.\n", 179 | "\n", 180 | "4) Graph processing \n", 181 | "\n", 182 | "Spark provides a graph processing API (GraphX) that allows you to process large-scale graphs. If you need to perform graph analysis on large datasets, Spark can distribute the computation across a cluster, allowing you to process the graph faster.\n", 183 | "\n", 184 | "5) Ad-hoc data analysis \n", 185 | "\n", 186 | "Spark provides a SQL API (Spark SQL) that allows you to run SQL queries on large datasets. If you need to perform ad-hoc analysis on large datasets, Spark SQL can help you do so quickly and efficiently.\n", 187 | "\n", 188 | "Overall, Spark is well-suited for applications that involve processing large volumes of data, real-time stream processing, machine learning, graph processing, and ad-hoc data analysis. If you have data processing needs in any of these areas, Spark may be a good choice for your application. (Two short, hedged PySpark sketches of these use cases are appended after this notebook dump.)\n", 189 | "\n", 190 | "![image.png](attachment:image.png)\n", 191 | "\n", 192 | "### Spark and PySpark Installation\n", 193 | "\n", 194 | "Follow the instructions here - \n", 195 | "\n", 196 | "https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/week_5_batch_processing/setup/windows.md\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "id": "fd6315b9", 202 | "metadata": {}, 203 | "source": [ 204 | "# 3) Remaining Notes\n", 205 | "\n", 206 | "The remaining notes can be found as individual code files, along with explanations, here - https://github.com/Balajirvp/DE-Zoomcamp/tree/main/Week%205/Code" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "id": "761e0215", 212 | "metadata": {}, 213 | "source": [ 214 | "# 4) References\n", 215 | "\n", 216 | "https://dataengineering.wiki/Concepts/Batch+Data+Processing \\\n", 217 | "https://www.montecarlodata.com/blog-stream-vs-batch-processing/" 218 | ] 219 | } 220 | ], 221 | "metadata": { 222 | "kernelspec": { 223 | "display_name": "Python 3 (ipykernel)", 224 | "language": "python", 225 | "name": "python3" 226 | }, 227 | "language_info": { 228 | "codemirror_mode": { 229 | "name": "ipython", 230 | "version": 3 231 | }, 232 | "file_extension": ".py", 233 | "mimetype": "text/x-python", 234 | "name": "python", 235 | "nbconvert_exporter": "python", 236 | "pygments_lexer": "ipython3", 237 | "version": "3.9.15" 238 | } 239 | }, 240 | "nbformat": 4, 241 | "nbformat_minor": 5 242 | } 243 | --------------------------------------------------------------------------------
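The use cases above are easier to see in code. Below is a minimal PySpark sketch of use cases 1 and 5 - a small batch job plus an ad-hoc Spark SQL query. The file path, app name, and column names are illustrative assumptions, not values taken from the Week 5 notebook or code files.

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = (
    SparkSession.builder
    .master("local[*]")              # run locally, using all available cores
    .appName("week5-batch-sketch")   # hypothetical app name
    .getOrCreate()
)

# Use case 1: Spark reads the file in partitions and spreads the work
# across executor cores, so the dataset never has to fit in one process.
df = spark.read.parquet("data/green_tripdata_2021-01.parquet")  # hypothetical path

# DataFrame API: a small transformation pipeline.
trips = (
    df.withColumn("pickup_date", F.to_date("lpep_pickup_datetime"))  # assumed column
      .filter(F.col("trip_distance") > 0)                            # assumed column
)

# Use case 5: register a temp view and run ad-hoc SQL against it.
trips.createOrReplaceTempView("trips")
revenue_per_day = spark.sql("""
    SELECT pickup_date,
           COUNT(1)          AS n_trips,
           SUM(total_amount) AS revenue   -- assumed column
    FROM trips
    GROUP BY pickup_date
    ORDER BY pickup_date
""")

revenue_per_day.show(5)
spark.stop()
```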
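And a minimal Structured Streaming sketch for use case 2. The socket source (fed by e.g. `nc -lk 9999`) is a stand-in for a real stream such as Kafka; everything here is illustrative.

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[*]").appName("streaming-sketch").getOrCreate()

# Read a stream of text lines from a local socket.
lines = (
    spark.readStream.format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

# Running word count: the classic minimal streaming aggregation.
counts = (
    lines.select(F.explode(F.split(F.col("value"), " ")).alias("word"))
         .groupBy("word")
         .count()
)

# Continuously write the updated result to the console.
query = (
    counts.writeStream.outputMode("complete")
    .format("console")
    .start()
)
query.awaitTermination()
```

Structured Streaming reuses the DataFrame API, which is why the batch and streaming sketches above look almost identical - the same transformations run over an unbounded source instead of a static file.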