├── Week 4 ├── dbt_files │ ├── macros │ │ ├── .gitkeep │ │ ├── macros_properties.yml │ │ └── get_payment_type_description.sql │ ├── seeds │ │ ├── .gitkeep │ │ └── taxi_zone_lookup.csv │ ├── tests │ │ └── .gitkeep │ ├── analyses │ │ └── .gitkeep │ ├── snapshots │ │ └── .gitkeep │ ├── .gitignore │ ├── packages.yml │ ├── models │ │ ├── core │ │ │ ├── dim_zones.sql │ │ │ ├── dm_monthly_zone_revenue.sql │ │ │ ├── schema.yml │ │ │ └── fact_trips.sql │ │ └── staging │ │ │ ├── stg_green_tripdata.sql │ │ │ ├── stg_yellow_tripdata.sql │ │ │ └── schema.yml │ ├── README.md │ └── dbt_project.yml └── web_to_gcs.py ├── Week 1 ├── Dockerfile ├── docker-compose.yaml ├── variables.tf ├── main.tf ├── ingest_data.py └── Homework Answers ├── Week 2 ├── Dockerfile ├── docker_deploy.py ├── Homework Answers ├── etl_gcs_to_bq.py ├── etl_web_to_gcs.py ├── ingest_data_flow.py └── parameterized_flow.py ├── README.md ├── Week 5 ├── Code │ ├── 02 download_data.sh │ ├── 09_Spark_SQL.py │ ├── 12_Spark_SQL_BQ.py │ ├── 05 Spark Join and GroupBy.ipynb │ ├── 04 Spark SQL.ipynb │ ├── 06 RDDs.ipynb │ └── 03 Taxi Schema.ipynb └── Data Engineering Zoomcamp Week 5.ipynb └── LICENSE /Week 4/dbt_files/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Week 4/dbt_files/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Week 4/dbt_files/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Week 4/dbt_files/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Week 4/dbt_files/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Week 4/dbt_files/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /Week 4/dbt_files/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 0.8.0 -------------------------------------------------------------------------------- /Week 1/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.1 2 | 3 | RUN pip install pandas sqlalchemy psycopg2 4 | 5 | WORKDIR /app 6 | COPY ingest_data.py ingest_data.py 7 | 8 | ENTRYPOINT ["python", "ingest_data.py"] 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /Week 4/dbt_files/models/core/dim_zones.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | 4 | select 5 | locationid, 6 | borough, 7 | zone, 8 | replace(service_zone,'Boro','Green') as service_zone 9 | from {{ ref('taxi_zone_lookup') }} -------------------------------------------------------------------------------- /Week 2/Dockerfile: -------------------------------------------------------------------------------- 1 
| FROM prefecthq/prefect:2.7.7-python3.9 2 | 3 | COPY requirements.txt . 4 | 5 | RUN pip install -r requirements.txt --trusted-host pypi.python.org --no-cache-dir 6 | 7 | COPY parameterized_flow.py /opt/prefect/flows/parameterized_flow.py 8 | COPY data /opt/prefect/data 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DE-Zoomcamp 2 | 3 | This repo contains the following materials for the Data Engineering Zoomcamp organized by DataTalksClub: 4 | - Notes and Code Files from the videos 5 | - Installation instructions/files for the tools used 6 | - Deep dives on the key topics 7 | - Homework Answers 8 | 9 | Please reach out to me if you would like me to add anything else. 10 | -------------------------------------------------------------------------------- /Week 2/docker_deploy.py: -------------------------------------------------------------------------------- 1 | from prefect.deployments import Deployment 2 | from prefect.infrastructure.docker import DockerContainer 3 | from parameterized_flow import etl_parent_flow 4 | 5 | docker_block = DockerContainer.load("zoom") 6 | 7 | docker_dep = Deployment.build_from_flow(flow=etl_parent_flow, name='docker-flow', infrastructure=docker_block) 8 | 9 | if __name__ == '__main__': 10 | docker_dep.apply() 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /Week 4/dbt_files/macros/macros_properties.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | macros: 4 | - name: get_payment_type_description 5 | description: > 6 | This macro receives a payment_type and returns the corresponding description. 7 | arguments: 8 | - name: payment_type 9 | type: int 10 | description: > 11 | payment_type value. 12 | Must be one of the accepted values; otherwise the macro will return null. -------------------------------------------------------------------------------- /Week 4/dbt_files/macros/get_payment_type_description.sql: -------------------------------------------------------------------------------- 1 | {# 2 | This macro returns the description of the payment_type 3 | #} 4 | 5 | {% macro get_payment_type_description(payment_type) -%} 6 | 7 | case {{ payment_type }} 8 | when 1 then 'Credit card' 9 | when 2 then 'Cash' 10 | when 3 then 'No charge' 11 | when 4 then 'Dispute' 12 | when 5 then 'Unknown' 13 | when 6 then 'Voided trip' 14 | end 15 | 16 | {%- endmacro %} -------------------------------------------------------------------------------- /Week 4/dbt_files/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project!
2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [dbt community](https://community.getdbt.com/) to learn from other analytics engineers 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /Week 5/Code/02 download_data.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | TAXI_TYPE=$1 # "yellow", "green" 4 | YEAR=$2 # 2020, 2021 5 | 6 | URL_PREFIX="https://github.com/DataTalksClub/nyc-tlc-data/releases/download" 7 | 8 | for MONTH in {1..12}; do 9 | FMONTH=`printf "%02d" ${MONTH}` 10 | 11 | URL="${URL_PREFIX}/${TAXI_TYPE}/${TAXI_TYPE}_tripdata_${YEAR}-${FMONTH}.csv.gz" 12 | 13 | LOCAL_PREFIX="D:/data/raw/${TAXI_TYPE}/${YEAR}/${FMONTH}" 14 | LOCAL_FILE="${TAXI_TYPE}_tripdata_${YEAR}_${FMONTH}.csv.gz" 15 | LOCAL_PATH="${LOCAL_PREFIX}/${LOCAL_FILE}" 16 | 17 | echo "downloading ${URL} to ${LOCAL_PATH}" 18 | mkdir -p ${LOCAL_PREFIX} 19 | curl -L -o ${LOCAL_PATH} ${URL} 20 | # wget ${URL} -O ${LOCAL_PATH} 21 | 22 | done 23 | -------------------------------------------------------------------------------- /Week 1/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | pgdatabase: 3 | image: postgres:13 4 | environment: 5 | - POSTGRES_USER=root 6 | - POSTGRES_PASSWORD=root 7 | - POSTGRES_DB=ny_taxi 8 | volumes: 9 | - "./ny_taxi_postgres_data:/var/lib/postgresql/data:rw" 10 | ports: 11 | - "5432:5432" 12 | pgadmin: 13 | image: dpage/pgadmin4 14 | environment: 15 | - PGADMIN_DEFAULT_EMAIL=admin@admin.com 16 | - PGADMIN_DEFAULT_PASSWORD=root 17 | volumes: 18 | - "pgadmin_conn_data:/var/lib/pgadmin:rw" 19 | ports: 20 | - "8080:80" 21 | 22 | volumes: 23 | pgadmin_conn_data: 24 | driver: local 25 | driver_opts: 26 | type: none 27 | o: bind 28 | device: ./pgadmin_conn_data 29 | 30 | -------------------------------------------------------------------------------- /Week 1/variables.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | data_lake_bucket = "dtc_data_lake" 3 | } 4 | 5 | variable "project" { 6 | description = "Your Project ID here" 7 | } 8 | 9 | variable "region" { 10 | description = "Region for GCP resources. Choose as per your location: https://cloud.google.com/about/locations" 11 | default = "europe-west6" 12 | type = string 13 | } 14 | 15 | variable "storage_class" { 16 | description = "Storage class type for your bucket. Check official docs for more info."
17 | default = "STANDARD" 18 | } 19 | 20 | variable "BQ_DATASET" { 21 | description = "BigQuery Dataset that raw data (from GCS) will be written to" 22 | type = string 23 | default = "trips_data_all" 24 | } 25 | 26 | variable "TABLE_NAME" { 27 | description = "BigQuery Table" 28 | type = string 29 | default = "ny_trips" 30 | } -------------------------------------------------------------------------------- /Week 2/Homework Answers: -------------------------------------------------------------------------------- 1 | Prefect 2 | 3 | --1) 4 | 5 | Update etl_web_to_gcs.py to change the year to 2020 and the month to 1 6 | 7 | Then run - python etl_web_to_gcs.py 8 | 9 | Answer - 447,770 10 | 11 | --2) 12 | 13 | prefect deployment build ./etl_web_to_gcs.py:etl_web_to_gcs -n "ETL Job 2" --cron "0 5 1 * *" -a 14 | 15 | Answer - 0 5 1 * * 16 | 17 | --3) 18 | 19 | Update the "parameterized_flow.py" file to change the color and months in the main function 20 | 21 | Update the column name lpep_pickup_datetime to tpep_pickup_datetime. Similarly, update lpep_dropoff_datetime to tpep_dropoff_datetime 22 | 23 | Create a folder "yellow" in the "data" folder in the working directory 24 | 25 | Run the command - python parameterized_flow.py 26 | 27 | This should load the parquet data files for Yellow taxi data for Feb. 2019 and March 2019 into GCS 28 | 29 | Update the "etl_gcs_to_bq.py" code to make the relevant changes and run the command - python etl_gcs_to_bq.py 30 | 31 | Answer - 14,851,920 32 | 33 | --4) 34 | 35 | 36 | --5) 37 | 38 | 39 | --6) 40 | 41 | Answer - 8 42 | -------------------------------------------------------------------------------- /Week 4/dbt_files/models/core/dm_monthly_zone_revenue.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | with trips_data as ( 4 | select * from {{ ref('fact_trips') }} 5 | ) 6 | select 7 | -- Revenue grouping 8 | pickup_zone as revenue_zone, 9 | date_trunc(pickup_datetime, month) as revenue_month, 10 | 11 | service_type, 12 | 13 | -- Revenue calculation 14 | sum(fare_amount) as revenue_monthly_fare, 15 | sum(extra) as revenue_monthly_extra, 16 | sum(mta_tax) as revenue_monthly_mta_tax, 17 | sum(tip_amount) as revenue_monthly_tip_amount, 18 | sum(tolls_amount) as revenue_monthly_tolls_amount, 19 | sum(ehail_fee) as revenue_monthly_ehail_fee, 20 | sum(improvement_surcharge) as revenue_monthly_improvement_surcharge, 21 | sum(total_amount) as revenue_monthly_total_amount, 22 | sum(congestion_surcharge) as revenue_monthly_congestion_surcharge, 23 | 24 | -- Additional calculations 25 | count(tripid) as total_monthly_trips, 26 | avg(passenger_count) as avg_monthly_passenger_count, 27 | avg(trip_distance) as avg_monthly_trip_distance 28 | 29 | from trips_data 30 | group by 1,2,3 -------------------------------------------------------------------------------- /Week 4/dbt_files/models/core/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: dim_zones 5 | description: > 6 | List of unique zones identified by locationid. 7 | Includes the service zone they correspond to (Green or Yellow). 8 | - name: fact_trips 9 | description: > 10 | Taxi trips corresponding to both service zones (Green and Yellow). 11 | The table contains records where both pickup and dropoff locations are valid and known zones. 12 | Each record corresponds to a trip uniquely identified by tripid.
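# Note: the tests declared in this file can be run on their own with dbt's node
# selection, assuming a dbt version that supports the --select flag, e.g.:
#   dbt test --select dim_zones fact_trips dm_monthly_zone_revenue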
13 | 14 | - name: dm_monthly_zone_revenue 15 | description: > 16 | Aggregated table of all taxi trips corresponding to both service zones (Green and Yellow) per pickup zone, month and service. 17 | The table contains monthly sums of the fare elements used to calculate the monthly revenue. 18 | The table also contains monthly indicators such as the number of trips and the average trip distance. 19 | columns: 20 | - name: revenue_monthly_total_amount 21 | description: Monthly sum of the total_amount of the fare charged for the trip per pickup zone, month and service. 22 | tests: 23 | - not_null: 24 | severity: error -------------------------------------------------------------------------------- /Week 1/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.0" 3 | backend "local" {} # Can change from "local" to "gcs" (for google) or "s3" (for aws), if you would like to preserve your tf-state online 4 | required_providers { 5 | google = { 6 | source = "hashicorp/google" 7 | } 8 | } 9 | } 10 | 11 | provider "google" { 12 | project = var.project 13 | region = var.region 14 | // credentials = file(var.credentials) # Use this if you do not want to set env-var GOOGLE_APPLICATION_CREDENTIALS 15 | } 16 | 17 | # Data Lake Bucket 18 | # Ref: https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket 19 | resource "google_storage_bucket" "data-lake-bucket" { 20 | name = "${local.data_lake_bucket}_${var.project}" # Concatenating DL bucket & Project name for unique naming 21 | location = var.region 22 | 23 | # Optional, but recommended settings: 24 | storage_class = var.storage_class 25 | uniform_bucket_level_access = true 26 | 27 | versioning { 28 | enabled = true 29 | } 30 | 31 | lifecycle_rule { 32 | action { 33 | type = "Delete" 34 | } 35 | condition { 36 | age = 30 // days 37 | } 38 | } 39 | 40 | force_destroy = true 41 | } 42 | 43 | # DWH 44 | # Ref: https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/bigquery_dataset 45 | resource "google_bigquery_dataset" "dataset" { 46 | dataset_id = var.BQ_DATASET 47 | project = var.project 48 | location = var.region 49 | } -------------------------------------------------------------------------------- /Week 4/dbt_files/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'ny_taxi_rides' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'default' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `model-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these!
15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | target-path: "target" # directory which will store compiled SQL files 23 | clean-targets: # directories to be removed by `dbt clean` 24 | - "target" 25 | - "dbt_packages" 26 | 27 | 28 | # Configuring models 29 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 30 | 31 | # In this example config, we tell dbt to build all models in the example/ directory 32 | # as tables. These settings can be overridden in the individual model files 33 | # using the `{{ config(...) }}` macro. 34 | models: 35 | ny_taxi_rides: 36 | # Applies to all files under models/example/ 37 | # example: 38 | # materialized: view 39 | 40 | vars: 41 | payment_type_values: [1, 2, 3, 4, 5, 6] 42 | 43 | seeds: 44 | ny_taxi_rides: 45 | taxi_zone_lookup: 46 | +column_types: 47 | locationid: numeric -------------------------------------------------------------------------------- /Week 4/dbt_files/models/core/fact_trips.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | with green_data as ( 4 | select *, 5 | 'Green' as service_type 6 | from {{ ref('stg_green_tripdata') }} 7 | ), 8 | 9 | yellow_data as ( 10 | select *, 11 | 'Yellow' as service_type 12 | from {{ ref('stg_yellow_tripdata') }} 13 | ), 14 | 15 | trips_unioned as ( 16 | select * from green_data 17 | union all 18 | select * from yellow_data 19 | ), 20 | 21 | dim_zones as ( 22 | select * from {{ ref('dim_zones') }} 23 | where borough != 'Unknown' 24 | ) 25 | select 26 | trips_unioned.tripid, 27 | trips_unioned.vendorid, 28 | trips_unioned.service_type, 29 | trips_unioned.ratecodeid, 30 | trips_unioned.pickup_locationid, 31 | pickup_zone.borough as pickup_borough, 32 | pickup_zone.zone as pickup_zone, 33 | trips_unioned.dropoff_locationid, 34 | dropoff_zone.borough as dropoff_borough, 35 | dropoff_zone.zone as dropoff_zone, 36 | trips_unioned.pickup_datetime, 37 | trips_unioned.dropoff_datetime, 38 | trips_unioned.store_and_fwd_flag, 39 | trips_unioned.passenger_count, 40 | trips_unioned.trip_distance, 41 | trips_unioned.trip_type, 42 | trips_unioned.fare_amount, 43 | trips_unioned.extra, 44 | trips_unioned.mta_tax, 45 | trips_unioned.tip_amount, 46 | trips_unioned.tolls_amount, 47 | trips_unioned.ehail_fee, 48 | trips_unioned.improvement_surcharge, 49 | trips_unioned.total_amount, 50 | trips_unioned.payment_type, 51 | trips_unioned.payment_type_description, 52 | trips_unioned.congestion_surcharge 53 | from trips_unioned 54 | inner join dim_zones as pickup_zone 55 | on trips_unioned.pickup_locationid = pickup_zone.locationid 56 | inner join dim_zones as dropoff_zone 57 | on trips_unioned.dropoff_locationid = dropoff_zone.locationid -------------------------------------------------------------------------------- /Week 4/dbt_files/models/staging/stg_green_tripdata.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | with tripdata as 4 | ( 5 | select *, 6 | row_number() over(partition by cast(vendorid as integer), lpep_pickup_datetime) as rn 7 | from {{ source('staging','green_trips') }} 8 | where vendorid is not null 9 | ) 10 | select 11 | -- identifiers 12 | {{ dbt_utils.surrogate_key(['vendorid', 'lpep_pickup_datetime']) }} as tripid, 13 | cast(vendorid as integer) as vendorid, 14 | cast(ratecodeid as 
integer) as ratecodeid, 15 | cast(pulocationid as integer) as pickup_locationid, 16 | cast(dolocationid as integer) as dropoff_locationid, 17 | 18 | -- timestamps 19 | cast(lpep_pickup_datetime as timestamp) as pickup_datetime, 20 | cast(lpep_dropoff_datetime as timestamp) as dropoff_datetime, 21 | 22 | -- trip info 23 | store_and_fwd_flag, 24 | cast(passenger_count as integer) as passenger_count, 25 | cast(trip_distance as numeric) as trip_distance, 26 | cast(trip_type as integer) as trip_type, 27 | 28 | -- payment info 29 | cast(fare_amount as numeric) as fare_amount, 30 | cast(extra as numeric) as extra, 31 | cast(mta_tax as numeric) as mta_tax, 32 | cast(tip_amount as numeric) as tip_amount, 33 | cast(tolls_amount as numeric) as tolls_amount, 34 | cast(ehail_fee as numeric) as ehail_fee, 35 | cast(improvement_surcharge as numeric) as improvement_surcharge, 36 | cast(total_amount as numeric) as total_amount, 37 | cast(payment_type as integer) as payment_type, 38 | {{ get_payment_type_description('payment_type') }} as payment_type_description, 39 | cast(congestion_surcharge as numeric) as congestion_surcharge 40 | from tripdata 41 | where rn = 1 42 | 43 | 44 | -- dbt build --select stg_green_tripdata --vars '{is_test_run: false}' 45 | {% if var('is_test_run', default=true) %} 46 | 47 | limit 100 48 | 49 | {% endif %} -------------------------------------------------------------------------------- /Week 4/dbt_files/models/staging/stg_yellow_tripdata.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | with tripdata as 4 | ( 5 | select *, 6 | row_number() over(partition by cast(vendorid as integer), tpep_pickup_datetime) as rn 7 | from {{ source('staging','yellow_trips') }} 8 | where vendorid is not null 9 | ) 10 | select 11 | -- identifiers 12 | {{ dbt_utils.surrogate_key(['vendorid', 'tpep_pickup_datetime']) }} as tripid, 13 | cast(vendorid as integer) as vendorid, 14 | cast(ratecodeid as integer) as ratecodeid, 15 | cast(pulocationid as integer) as pickup_locationid, 16 | cast(dolocationid as integer) as dropoff_locationid, 17 | 18 | -- timestamps 19 | cast(tpep_pickup_datetime as timestamp) as pickup_datetime, 20 | cast(tpep_dropoff_datetime as timestamp) as dropoff_datetime, 21 | 22 | -- trip info 23 | store_and_fwd_flag, 24 | cast(passenger_count as integer) as passenger_count, 25 | cast(trip_distance as numeric) as trip_distance, 26 | -- yellow cabs are always street-hail 27 | 1 as trip_type, 28 | 29 | -- payment info 30 | cast(fare_amount as numeric) as fare_amount, 31 | cast(extra as numeric) as extra, 32 | cast(mta_tax as numeric) as mta_tax, 33 | cast(tip_amount as numeric) as tip_amount, 34 | cast(tolls_amount as numeric) as tolls_amount, 35 | cast(0 as numeric) as ehail_fee, -- ehail fees do not apply to yellow taxis 36 | cast(improvement_surcharge as numeric) as improvement_surcharge, 37 | cast(total_amount as numeric) as total_amount, 38 | cast(payment_type as integer) as payment_type, 39 | {{ get_payment_type_description('payment_type') }} as payment_type_description, 40 | cast(congestion_surcharge as numeric) as congestion_surcharge 41 | from tripdata 42 | where rn = 1 43 | 44 | -- dbt build --select stg_yellow_tripdata --vars '{is_test_run: false}' 45 | {% if var('is_test_run', default=true) %} 46 | 47 | limit 100 48 | 49 | {% endif %} -------------------------------------------------------------------------------- /Week 2/etl_gcs_to_bq.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | from prefect
import flow, task 4 | from prefect_gcp.cloud_storage import GcsBucket 5 | from prefect_gcp import GcpCredentials 6 | 7 | @task(retries=3) 8 | def extract_from_gcs(color: str, year: int, month: int) -> Path: 9 | """Download trip data from GCS""" 10 | gcs_path = f"data/{color}/{color}_tripdata_{year}-{month:02}.parquet" 11 | gcs_block = GcsBucket.load("zoom-gcs") 12 | gcs_block.get_directory(from_path=gcs_path, local_path=f"./data/") 13 | return Path(f"./data/{gcs_path}") 14 | 15 | @task(log_prints=True) 16 | def transform(path: Path) -> pd.DataFrame: 17 | """Data Cleaning Example""" 18 | df = pd.read_parquet(path) 19 | # print(f"Pre: missing passenger count: {df['passenger_count'].isna().sum()}") 20 | # df["passenger_count"].fillna(0, inplace=True) 21 | # print(f"Post: missing passenger count: {df['passenger_count'].isna().sum()}") 22 | return df 23 | 24 | @task() 25 | def write_bq(df: pd.DataFrame) -> None: 26 | """Writing data into BigQuery""" 27 | gcp_credentials_block = GcpCredentials.load("zoom-gcp-creds") 28 | df.to_gbq(destination_table="yellow_trips.rides", 29 | project_id = "composed-sun-375018", 30 | credentials = gcp_credentials_block.get_credentials_from_service_account(), 31 | chunksize=100000, 32 | if_exists="append") 33 | 34 | @flow(log_prints=True) 35 | def etl_gcs_to_bq(color: str = "yellow", year: int = 2019, months: list[int] = [2,3]): 36 | """Main ETL Flow to load data to Big Query data warehouse""" 37 | for month in months: 38 | path = extract_from_gcs(color, year, month) 39 | df = transform(path) 40 | print(f"Row count: {len(df)}") 41 | write_bq(df) 42 | 43 | if __name__ == '__main__': 44 | color = "yellow" 45 | year = 2019 46 | months = [2,3] 47 | etl_gcs_to_bq(color, year, months) -------------------------------------------------------------------------------- /Week 2/etl_web_to_gcs.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | from prefect import flow, task 4 | from prefect_gcp.cloud_storage import GcsBucket 5 | 6 | 7 | @task(retries=3) 8 | def fetch(dataset_url: str) -> pd.DataFrame: 9 | """Read taxi data from web into a pandas DataFrame""" 10 | df = pd.read_csv(dataset_url) 11 | return df 12 | 13 | @task(log_prints=True) 14 | def clean(df: pd.DataFrame) -> pd.DataFrame: 15 | """Fix Data Type issues""" 16 | df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime']) 17 | df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime']) 18 | print(df.head(2)) 19 | print(f"Columns: {df.dtypes}") 20 | print(f"Rows: {len(df)}") 21 | return df 22 | 23 | @task() 24 | def write_local(df: pd.DataFrame, color: str, dataset_file: str) -> Path: 25 | """Write DataFrame out locally as a parquet file""" 26 | 27 | # Create a folder data/green in the working directory before running this code 28 | path = Path(f"data/{color}/{dataset_file}.parquet") 29 | df.to_parquet(path, compression="gzip") 30 | # Checking to see if the slashes are forward.
The default is backslashes on Windows 31 | print(path.as_posix()) 32 | return path 33 | 34 | 35 | @task() 36 | def write_gcs(path: Path) -> None: 37 | """Upload local parquet file to GCS""" 38 | gcs_block = GcsBucket.load("zoom-gcs") 39 | gcs_block.upload_from_path(from_path=path, to_path=path.as_posix()) # Using as_posix() to convert the slashes to forward 40 | return 41 | 42 | 43 | @flow() 44 | def etl_web_to_gcs() -> None: 45 | """The Main ETL function""" 46 | color = "green" 47 | year = 2020 48 | month = 1 49 | dataset_file = f"{color}_tripdata_{year}-{month:02}" 50 | dataset_url = f"https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{color}/{dataset_file}.csv.gz" 51 | 52 | df = fetch(dataset_url) 53 | df_clean = clean(df) 54 | path = write_local(df_clean, color, dataset_file) 55 | write_gcs(path) 56 | 57 | 58 | if __name__ == '__main__': 59 | etl_web_to_gcs() 60 | -------------------------------------------------------------------------------- /Week 1/ingest_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from time import time 3 | from sqlalchemy import create_engine 4 | import argparse 5 | 6 | 7 | def main(params): 8 | user = params.user 9 | password = params.password 10 | host = params.host 11 | port = params.port 12 | db = params.db 13 | table_name = params.table_name[0] 14 | table_name1 = params.table_name[1] 15 | url = params.url[0] 16 | url1 = params.url[1] 17 | 18 | engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{db}") 19 | 20 | df_iter = pd.read_csv(url, iterator = True, chunksize = 100000) 21 | df = next(df_iter) 22 | 23 | df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime) 24 | df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime) 25 | 26 | df.head(n=0).to_sql(name = table_name, con = engine, if_exists='replace') 27 | df.to_sql(name = table_name, con = engine, if_exists='append') 28 | 29 | for df in df_iter: 30 | t_start = time() 31 | 32 | df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime) 33 | df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime) 34 | 35 | df.to_sql(name = table_name, con = engine, if_exists='append') 36 | 37 | t_end = time() 38 | 39 | print(f'Inserted a new chunk.
Time taken (in s) - {round(t_end - t_start, 2)}') 40 | 41 | taxi_zone = pd.read_csv(url1) 42 | 43 | taxi_zone.to_sql(name = table_name1, con = engine, if_exists='replace') 44 | 45 | if __name__ == '__main__': 46 | parser = argparse.ArgumentParser(description = "Ingest CSV data to Postgres") 47 | 48 | # user 49 | # password 50 | # host 51 | # port 52 | # database name 53 | # table name 54 | # url of the csv 55 | 56 | parser.add_argument('--user', help="user name for postgres") 57 | parser.add_argument('--password', help="password for postgres") 58 | parser.add_argument('--host', help="host for postgres") 59 | parser.add_argument('--port', help="port for postgres") 60 | parser.add_argument('--db', help="database name for postgres") 61 | parser.add_argument('--table_name', nargs = 2, help="name of the table where we will write the results to") 62 | parser.add_argument('--url', nargs = 2, help="url of the CSV") 63 | 64 | args = parser.parse_args() 65 | 66 | main(args) -------------------------------------------------------------------------------- /Week 2/ingest_data_flow.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | import pandas as pd 3 | from sqlalchemy import create_engine 4 | from prefect import flow, task 5 | from prefect.tasks import task_input_hash 6 | from datetime import timedelta 7 | from prefect_sqlalchemy import SqlAlchemyConnector 8 | 9 | 10 | @task(log_prints=True, tags=["extract"], cache_key_fn=task_input_hash, cache_expiration=timedelta(days=1)) 11 | def extract_data(url: str): 12 | df_iter = pd.read_csv(url, iterator=True, chunksize=100000) 13 | 14 | df = next(df_iter) 15 | 16 | df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime) 17 | df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime) 18 | 19 | return df 20 | 21 | @task(log_prints=True) 22 | def transform_data(df): 23 | print(f"pre: missing passenger count: {df['passenger_count'].isin([0]).sum()}") 24 | df = df[df['passenger_count'] != 0] 25 | print(f"post: missing passenger count: {df['passenger_count'].isin([0]).sum()}") 26 | return df 27 | 28 | @task(log_prints=True, retries=3) 29 | def load_data(table_name, df): 30 | 31 | # engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{db}") 32 | # df.head(n=0).to_sql(name = table_name, con = engine, if_exists='replace') 33 | # df.to_sql(name = table_name, con = engine, if_exists='append') 34 | 35 | connection_block = SqlAlchemyConnector.load("postgres-connector") 36 | with connection_block.get_connection(begin=False) as engine: 37 | df.head(n=0).to_sql(name=table_name, con=engine, if_exists='replace') 38 | df.to_sql(name=table_name, con=engine, if_exists='append') 39 | 40 | @flow(name="Subflow", log_prints=True) 41 | def log_subflow(table_name: str): 42 | print(f"Logging Subflow for: {table_name}") 43 | 44 | @flow(name="Ingest Data") 45 | def main_flow(table_name: str = "green_taxi_trips"): 46 | # user = "root" 47 | # password = "root" 48 | # host = "localhost" 49 | # port = "5432" 50 | # db = "ny_taxi" 51 | # table_name = "green_taxi_trips" 52 | 53 | csv_url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-01.csv.gz" 54 | log_subflow(table_name) 55 | raw_data = extract_data(csv_url) 56 | data = transform_data(raw_data) 57 | # load_data(user, password, host, port, db, table_name, data) 58 | load_data(table_name, data) 59 | 60 | if __name__ == '__main__': 61 | main_flow(table_name = "green_trips") 62 | 63 | 64 | 65 | 66 | 
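# The SqlAlchemyConnector block named "postgres-connector" loaded in load_data()
# must be registered with Prefect before this flow runs. A minimal sketch of one
# way to create it, assuming the root/root ny_taxi Postgres from the Week 1
# docker-compose file (the block may equally have been created via the Prefect UI):
#
#     from prefect_sqlalchemy import SqlAlchemyConnector, ConnectionComponents, SyncDriver
#
#     connector = SqlAlchemyConnector(
#         connection_info=ConnectionComponents(
#             driver=SyncDriver.POSTGRESQL_PSYCOPG2,
#             username="root",
#             password="root",
#             host="localhost",
#             port=5432,
#             database="ny_taxi",
#         )
#     )
#     connector.save("postgres-connector", overwrite=True)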
67 | -------------------------------------------------------------------------------- /Week 4/web_to_gcs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import pandas as pd 4 | import pyarrow as pa 5 | import pyarrow.parquet as pq 6 | import gzip 7 | from google.cloud import storage 8 | 9 | # Set Google Cloud Storage bucket name 10 | BUCKET_NAME = "bucket_name" # Enter your bucket name here 11 | 12 | # Set local directory for downloaded files 13 | LOCAL_DIR = "data" 14 | 15 | # Create the local directory if it doesn't exist 16 | if not os.path.exists(LOCAL_DIR): 17 | os.makedirs(LOCAL_DIR) 18 | 19 | # Define function to upload file to Google Cloud Storage 20 | def upload_to_gcs(bucket_name, blob_name, file_path): 21 | storage_client = storage.Client() 22 | bucket = storage_client.get_bucket(bucket_name) 23 | blob = bucket.blob(blob_name) 24 | blob.upload_from_filename(file_path) 25 | print(f"File {file_path} uploaded to gs://{bucket_name}/{blob_name}") 26 | 27 | 28 | # Define function to download a file from GitHub to local disk and then call the upload_to_gcs function 29 | def web_to_gcs(service, year): 30 | 31 | for month in range(1, 13): 32 | base_url = f"https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{service}/" 33 | file_name = f"{service}_tripdata_{year}-{month:02d}.csv.gz" 34 | url = base_url + file_name 35 | file_path = os.path.join(LOCAL_DIR, file_name) 36 | r = requests.get(url, allow_redirects=True) 37 | with open(file_path, "wb") as f_out: f_out.write(r.content) 38 | 39 | # Check if the downloaded file is a valid gzip file 40 | try: 41 | with gzip.open(file_path) as f: 42 | pass 43 | except Exception as e: 44 | print(f"Error: {e}. {file_name} may be corrupted.") 45 | os.remove(file_path) 46 | continue 47 | 48 | # Convert CSV file to Parquet 49 | csv = pd.read_csv(file_path) 50 | parquet_file_name = file_name.replace(".csv.gz", ".parquet") 51 | parquet_file_path = os.path.join(LOCAL_DIR, parquet_file_name) 52 | table = pa.Table.from_pandas(csv) 53 | pq.write_table(table, parquet_file_path) 54 | 55 | # Upload Parquet file to Google Cloud Storage 56 | upload_to_gcs(BUCKET_NAME, f"{service}/{parquet_file_name}", parquet_file_path) 57 | 58 | # Delete local CSV and Parquet files 59 | os.remove(file_path) 60 | os.remove(parquet_file_path) 61 | 62 | 63 | web_to_gcs("fhv", 2019) 64 | web_to_gcs("green", 2019) 65 | web_to_gcs("green", 2020) 66 | web_to_gcs("yellow", 2019) 67 | web_to_gcs("yellow", 2020) 68 | 69 | -------------------------------------------------------------------------------- /Week 2/parameterized_flow.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | from prefect import flow, task 4 | from prefect.tasks import task_input_hash 5 | from datetime import timedelta 6 | from prefect_gcp.cloud_storage import GcsBucket 7 | 8 | 9 | @task(log_prints=True, retries=3, cache_key_fn=task_input_hash, cache_expiration=timedelta(days=1)) 10 | def fetch(dataset_url: str) -> pd.DataFrame: 11 | """Read taxi data from web into a pandas DataFrame""" 12 | df = pd.read_csv(dataset_url) 13 | return df 14 | 15 | @task(log_prints=True) 16 | def clean(df: pd.DataFrame) -> pd.DataFrame: 17 | """Fix Data Type issues""" 18 | df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime']) 19 | df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime']) 20 | print(df.head(2)) 21 | print(f"Columns: {df.dtypes}") 22 | print(f"Rows:
{len(df)}") 23 | return df 24 | 25 | @task(log_prints=True) 26 | def write_local(df: pd.DataFrame, color: str, dataset_file: str) -> Path: 27 | """Write DataFrame out locally as a parquet file""" 28 | 29 | # Create a folder data/green in the working directory before running this code 30 | path = Path(f"data/{color}/{dataset_file}.parquet") 31 | df.to_parquet(path, compression="gzip") 32 | # Checking to see if the slashes are forward. Default is backwards 33 | print(path.as_posix()) 34 | return path 35 | 36 | 37 | @task(log_prints=True) 38 | def write_gcs(path: Path) -> None: 39 | """Upload local parquet file to GCS""" 40 | gcs_block = GcsBucket.load("zoom-gcs") 41 | gcs_block.upload_from_path(from_path=path, to_path=path.as_posix()) # Using as_posix() to convert the slashes to forward 42 | return 43 | 44 | 45 | @flow(log_prints=True) 46 | def etl_web_to_gcs(year: int, month: int, color: str) -> None: 47 | """The Main ETL function""" 48 | dataset_file = f"{color}_tripdata_{year}-{month:02}" 49 | dataset_url = f"https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{color}/{dataset_file}.csv.gz" 50 | 51 | df = fetch(dataset_url) 52 | df_clean = clean(df) 53 | path = write_local(df_clean, color, dataset_file) 54 | write_gcs(path) 55 | 56 | @flow(log_prints=True) 57 | def etl_parent_flow(months: list[int] = [1,2], year: int = 2019, color: str = "green"): 58 | for month in months: 59 | etl_web_to_gcs(year, month, color) 60 | 61 | if __name__ == '__main__': 62 | color = "yellow" 63 | months = [2,3] 64 | year = 2019 65 | etl_parent_flow(months, year, color) 66 | -------------------------------------------------------------------------------- /Week 5/Code/09_Spark_SQL.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import argparse 5 | 6 | import pyspark 7 | from pyspark.sql import SparkSession 8 | from pyspark.sql import functions as F 9 | 10 | 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument('--input_green', required=True) 14 | parser.add_argument('--input_yellow', required=True) 15 | parser.add_argument('--output', required=True) 16 | 17 | args = parser.parse_args() 18 | 19 | input_green = args.input_green 20 | input_yellow = args.input_yellow 21 | output = args.output 22 | 23 | 24 | spark = SparkSession.builder \ 25 | .appName('test') \ 26 | .getOrCreate() 27 | 28 | df_green = spark.read.parquet(input_green) 29 | 30 | df_green = df_green \ 31 | .withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime') \ 32 | .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime') 33 | 34 | df_yellow = spark.read.parquet(input_yellow) 35 | 36 | 37 | df_yellow = df_yellow \ 38 | .withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime') \ 39 | .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime') 40 | 41 | 42 | common_colums = [ 43 | 'VendorID', 44 | 'pickup_datetime', 45 | 'dropoff_datetime', 46 | 'store_and_fwd_flag', 47 | 'RatecodeID', 48 | 'PULocationID', 49 | 'DOLocationID', 50 | 'passenger_count', 51 | 'trip_distance', 52 | 'fare_amount', 53 | 'extra', 54 | 'mta_tax', 55 | 'tip_amount', 56 | 'tolls_amount', 57 | 'improvement_surcharge', 58 | 'total_amount', 59 | 'payment_type', 60 | 'congestion_surcharge' 61 | ] 62 | 63 | 64 | 65 | df_green_sel = df_green \ 66 | .select(common_colums) \ 67 | .withColumn('service_type', F.lit('green')) 68 | 69 | df_yellow_sel = df_yellow \ 70 | .select(common_colums) \ 71 | .withColumn('service_type', F.lit('yellow')) 72 | 73 | 74 
| df_trips_data = df_green_sel.unionAll(df_yellow_sel) 75 | 76 | df_trips_data.createOrReplaceTempView('trips_data') 77 | 78 | 79 | df_result = spark.sql(""" 80 | SELECT 81 | -- Revenue grouping 82 | PULocationID AS revenue_zone, 83 | date_trunc('month', pickup_datetime) AS revenue_month, 84 | service_type, 85 | -- Revenue calculation 86 | SUM(fare_amount) AS revenue_monthly_fare, 87 | SUM(extra) AS revenue_monthly_extra, 88 | SUM(mta_tax) AS revenue_monthly_mta_tax, 89 | SUM(tip_amount) AS revenue_monthly_tip_amount, 90 | SUM(tolls_amount) AS revenue_monthly_tolls_amount, 91 | SUM(improvement_surcharge) AS revenue_monthly_improvement_surcharge, 92 | SUM(total_amount) AS revenue_monthly_total_amount, 93 | SUM(congestion_surcharge) AS revenue_monthly_congestion_surcharge, 94 | -- Additional calculations 95 | AVG(passenger_count) AS avg_monthly_passenger_count, 96 | AVG(trip_distance) AS avg_monthly_trip_distance 97 | FROM 98 | trips_data 99 | GROUP BY 100 | 1, 2, 3 101 | """) 102 | 103 | 104 | df_result.coalesce(1) \ 105 | .write.parquet(output, mode='overwrite') -------------------------------------------------------------------------------- /Week 5/Code/12_Spark_SQL_BQ.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import argparse 5 | 6 | import pyspark 7 | from pyspark.sql import SparkSession 8 | from pyspark.sql import functions as F 9 | 10 | 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument('--input_green', required=True) 14 | parser.add_argument('--input_yellow', required=True) 15 | parser.add_argument('--output', required=True) 16 | 17 | args = parser.parse_args() 18 | 19 | input_green = args.input_green 20 | input_yellow = args.input_yellow 21 | output = args.output 22 | 23 | 24 | spark = SparkSession.builder \ 25 | .appName('test') \ 26 | .getOrCreate() 27 | 28 | spark.conf.set('temporaryGcsBucket', 'dataproc-temp-europe-west6-828225226997-fckhkym8') 29 | 30 | df_green = spark.read.parquet(input_green) 31 | 32 | df_green = df_green \ 33 | .withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime') \ 34 | .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime') 35 | 36 | df_yellow = spark.read.parquet(input_yellow) 37 | 38 | 39 | df_yellow = df_yellow \ 40 | .withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime') \ 41 | .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime') 42 | 43 | 44 | common_columns = [ 45 | 'VendorID', 46 | 'pickup_datetime', 47 | 'dropoff_datetime', 48 | 'store_and_fwd_flag', 49 | 'RatecodeID', 50 | 'PULocationID', 51 | 'DOLocationID', 52 | 'passenger_count', 53 | 'trip_distance', 54 | 'fare_amount', 55 | 'extra', 56 | 'mta_tax', 57 | 'tip_amount', 58 | 'tolls_amount', 59 | 'improvement_surcharge', 60 | 'total_amount', 61 | 'payment_type', 62 | 'congestion_surcharge' 63 | ] 64 | 65 | 66 | 67 | df_green_sel = df_green \ 68 | .select(common_columns) \ 69 | .withColumn('service_type', F.lit('green')) 70 | 71 | df_yellow_sel = df_yellow \ 72 | .select(common_columns) \ 73 | .withColumn('service_type', F.lit('yellow')) 74 | 75 | 76 | df_trips_data = df_green_sel.unionAll(df_yellow_sel) 77 | 78 | df_trips_data.createOrReplaceTempView('trips_data') 79 | 80 | 81 | df_result = spark.sql(""" 82 | SELECT 83 | -- Revenue grouping 84 | PULocationID AS revenue_zone, 85 | date_trunc('month', pickup_datetime) AS revenue_month, 86 | service_type, 87 | -- Revenue calculation 88 | SUM(fare_amount) AS revenue_monthly_fare, 89 | SUM(extra) AS
revenue_monthly_extra, 90 | SUM(mta_tax) AS revenue_monthly_mta_tax, 91 | SUM(tip_amount) AS revenue_monthly_tip_amount, 92 | SUM(tolls_amount) AS revenue_monthly_tolls_amount, 93 | SUM(improvement_surcharge) AS revenue_monthly_improvement_surcharge, 94 | SUM(total_amount) AS revenue_monthly_total_amount, 95 | SUM(congestion_surcharge) AS revenue_monthly_congestion_surcharge, 96 | -- Additional calculations 97 | AVG(passenger_count) AS avg_monthly_passenger_count, 98 | AVG(trip_distance) AS avg_monthly_trip_distance 99 | FROM 100 | trips_data 101 | GROUP BY 102 | 1, 2, 3 103 | """) 104 | 105 | 106 | df_result.write.format('bigquery') \ 107 | .option('table', output) \ 108 | .save() -------------------------------------------------------------------------------- /Week 1/Homework Answers: -------------------------------------------------------------------------------- 1 | Docker & SQL 2 | 3 | --1) 4 | 5 | docker build --help 6 | 7 | --2) 8 | 9 | winpty docker run -it python:3.9 bash 10 | pip list 11 | 12 | --3) 13 | 14 | SELECT COUNT(*) 15 | FROM GREEN_TAXI_DATA 16 | WHERE DATE(LPEP_PICKUP_DATETIME) = '2019-01-15' AND DATE(LPEP_DROPOFF_DATETIME) = '2019-01-15'; 17 | 18 | --4) 19 | 20 | SELECT DISTINCT DATE(LPEP_PICKUP_DATETIME) 21 | FROM GREEN_TAXI_DATA 22 | WHERE TRIP_DISTANCE = (SELECT MAX(TRIP_DISTANCE) FROM GREEN_TAXI_DATA); 23 | 24 | --5) 25 | 26 | SELECT PASSENGER_COUNT, COUNT(*) 27 | FROM GREEN_TAXI_DATA 28 | WHERE PASSENGER_COUNT IN (2,3) AND DATE(LPEP_PICKUP_DATETIME) = '2019-01-01' 29 | GROUP BY 1; 30 | 31 | --6) 32 | 33 | WITH LOC AS 34 | ( 35 | SELECT DISTINCT "LocationID" FROM TAXI_ZONE 36 | WHERE "Zone" = 'Astoria' 37 | ), TIP AS 38 | ( 39 | SELECT MAX(TIP_AMOUNT) FROM GREEN_TAXI_DATA 40 | WHERE "PULocationID" = (SELECT * FROM LOC) 41 | ), LOC_ID AS 42 | ( 43 | SELECT DISTINCT "DOLocationID" 44 | FROM GREEN_TAXI_DATA 45 | WHERE "PULocationID" = (SELECT * FROM LOC) AND TIP_AMOUNT = (SELECT * FROM TIP) 46 | ) 47 | 48 | SELECT DISTINCT "Zone" FROM TAXI_ZONE 49 | WHERE "LocationID" = (SELECT * FROM LOC_ID); 50 | 51 | 52 | Terraform 53 | 54 | terraform apply 55 | 56 | Terraform used the selected providers to generate the following execution
Resource actions are indicated with the following symbols: 58 | + create 59 | 60 | Terraform will perform the following actions: 61 | 62 | # google_bigquery_dataset.dataset will be created 63 | + resource "google_bigquery_dataset" "dataset" { 64 | + creation_time = (known after apply) 65 | + dataset_id = "trips_data_all" 66 | + delete_contents_on_destroy = false 67 | + etag = (known after apply) 68 | + id = (known after apply) 69 | + labels = (known after apply) 70 | + last_modified_time = (known after apply) 71 | + location = "europe-west6" 72 | + project = "composed-sun-375018" 73 | + self_link = (known after apply) 74 | 75 | + access { 76 | + domain = (known after apply) 77 | + group_by_email = (known after apply) 78 | + role = (known after apply) 79 | + special_group = (known after apply) 80 | + user_by_email = (known after apply) 81 | 82 | + dataset { 83 | + target_types = (known after apply) 84 | 85 | + dataset { 86 | + dataset_id = (known after apply) 87 | + project_id = (known after apply) 88 | } 89 | } 90 | 91 | + routine { 92 | + dataset_id = (known after apply) 93 | + project_id = (known after apply) 94 | + routine_id = (known after apply) 95 | } 96 | 97 | + view { 98 | + dataset_id = (known after apply) 99 | + project_id = (known after apply) 100 | + table_id = (known after apply) 101 | } 102 | } 103 | } 104 | 105 | # google_storage_bucket.data-lake-bucket will be created 106 | + resource "google_storage_bucket" "data-lake-bucket" { 107 | + force_destroy = true 108 | + id = (known after apply) 109 | + location = "EUROPE-WEST6" 110 | + name = "dtc_data_lake_composed-sun-375018" 111 | + project = (known after apply) 112 | + public_access_prevention = (known after apply) 113 | + self_link = (known after apply) 114 | + storage_class = "STANDARD" 115 | + uniform_bucket_level_access = true 116 | + url = (known after apply) 117 | 118 | + lifecycle_rule { 119 | + action { 120 | + type = "Delete" 121 | } 122 | 123 | + condition { 124 | + age = 30 125 | + matches_prefix = [] 126 | + matches_storage_class = [] 127 | + matches_suffix = [] 128 | + with_state = (known after apply) 129 | } 130 | } 131 | 132 | + versioning { 133 | + enabled = true 134 | } 135 | 136 | + website { 137 | + main_page_suffix = (known after apply) 138 | + not_found_page = (known after apply) 139 | } 140 | } 141 | 142 | Plan: 2 to add, 0 to change, 0 to destroy. 143 | 144 | Do you want to perform these actions? 145 | Terraform will perform the actions described above. 146 | Only 'yes' will be accepted to approve. 147 | 148 | Enter a value: yes 149 | 150 | google_bigquery_dataset.dataset: Creating... 151 | google_storage_bucket.data-lake-bucket: Creating... 152 | google_storage_bucket.data-lake-bucket: Creation complete after 2s [id=dtc_data_lake_composed-sun-375018] 153 | google_bigquery_dataset.dataset: Creation complete after 3s [id=projects/composed-sun-375018/datasets/trips_data_all] 154 | 155 | Apply complete! Resources: 2 added, 0 changed, 0 destroyed. 
156 | -------------------------------------------------------------------------------- /Week 4/dbt_files/models/staging/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: staging 5 | 6 | database: composed-sun-375018 7 | 8 | schema: trips_data_all 9 | 10 | # loaded_at_field: record_loaded_at 11 | tables: 12 | - name: green_trips 13 | - name: yellow_trips 14 | # freshness: 15 | # error_after: {count: 6, period: hour} 16 | 17 | models: 18 | - name: stg_green_tripdata 19 | description: > 20 | Trips made by green taxis, also known as boro taxis and street-hail liveries. 21 | Green taxis may respond to street hails, but only in the areas indicated in green on the 22 | map (i.e. above W 110 St/E 96th St in Manhattan and in the boroughs). 23 | The records were collected and provided to the NYC Taxi and Limousine Commission (TLC) by 24 | technology service providers. 25 | columns: 26 | - name: tripid 27 | description: Primary key for this table, generated with a concatenation of vendorid+pickup_datetime 28 | tests: 29 | - unique: 30 | severity: warn 31 | - not_null: 32 | severity: warn 33 | - name: VendorID 34 | description: > 35 | A code indicating the TPEP provider that provided the record. 36 | 1= Creative Mobile Technologies, LLC; 37 | 2= VeriFone Inc. 38 | - name: pickup_datetime 39 | description: The date and time when the meter was engaged. 40 | - name: dropoff_datetime 41 | description: The date and time when the meter was disengaged. 42 | - name: Passenger_count 43 | description: The number of passengers in the vehicle. This is a driver-entered value. 44 | - name: Trip_distance 45 | description: The elapsed trip distance in miles reported by the taximeter. 46 | - name: Pickup_locationid 47 | description: locationid where the meter was engaged. 48 | tests: 49 | - relationships: 50 | to: ref('taxi_zone_lookup') 51 | field: locationid 52 | severity: warn 53 | - name: dropoff_locationid 54 | description: locationid where the meter was disengaged. 55 | tests: 56 | - relationships: 57 | to: ref('taxi_zone_lookup') 58 | field: locationid 59 | - name: RateCodeID 60 | description: > 61 | The final rate code in effect at the end of the trip. 62 | 1= Standard rate 63 | 2=JFK 64 | 3=Newark 65 | 4=Nassau or Westchester 66 | 5=Negotiated fare 67 | 6=Group ride 68 | - name: Store_and_fwd_flag 69 | description: > 70 | This flag indicates whether the trip record was held in vehicle 71 | memory before sending to the vendor, aka “store and forward,” 72 | because the vehicle did not have a connection to the server. 73 | Y= store and forward trip 74 | N= not a store and forward trip 75 | - name: Dropoff_longitude 76 | description: Longitude where the meter was disengaged. 77 | - name: Dropoff_latitude 78 | description: Latitude where the meter was disengaged. 79 | - name: Payment_type 80 | description: > 81 | A numeric code signifying how the passenger paid for the trip. 82 | tests: 83 | - accepted_values: 84 | values: "{{ var('payment_type_values') }}" 85 | severity: warn 86 | quote: false 87 | - name: payment_type_description 88 | description: Description of the payment_type code 89 | - name: Fare_amount 90 | description: > 91 | The time-and-distance fare calculated by the meter. 92 | Extra Miscellaneous extras and surcharges. Currently, this only includes 93 | the $0.50 and $1 rush hour and overnight charges. 94 | MTA_tax $0.50 MTA tax that is automatically triggered based on the metered 95 | rate in use.
96 | - name: Improvement_surcharge 97 | description: > 98 | $0.30 improvement surcharge assessed trips at the flag drop. The 99 | improvement surcharge began being levied in 2015. 100 | - name: Tip_amount 101 | description: > 102 | Tip amount. This field is automatically populated for credit card 103 | tips. Cash tips are not included. 104 | - name: Tolls_amount 105 | description: Total amount of all tolls paid in trip. 106 | - name: Total_amount 107 | description: The total amount charged to passengers. Does not include cash tips. 108 | 109 | - name: stg_yellow_tripdata 110 | description: > 111 | Trips made by New York City's iconic yellow taxis. 112 | Yellow taxis are the only vehicles permitted to respond to a street hail from a passenger in all five 113 | boroughs. They may also be hailed using an e-hail app like Curb or Arro. 114 | The records were collected and provided to the NYC Taxi and Limousine Commission (TLC) by 115 | technology service providers. 116 | columns: 117 | - name: tripid 118 | description: Primary key for this table, generated with a concatenation of vendorid+pickup_datetime 119 | tests: 120 | - unique: 121 | severity: warn 122 | - not_null: 123 | severity: warn 124 | - name: VendorID 125 | description: > 126 | A code indicating the TPEP provider that provided the record. 127 | 1= Creative Mobile Technologies, LLC; 128 | 2= VeriFone Inc. 129 | - name: pickup_datetime 130 | description: The date and time when the meter was engaged. 131 | - name: dropoff_datetime 132 | description: The date and time when the meter was disengaged. 133 | - name: Passenger_count 134 | description: The number of passengers in the vehicle. This is a driver-entered value. 135 | - name: Trip_distance 136 | description: The elapsed trip distance in miles reported by the taximeter. 137 | - name: Pickup_locationid 138 | description: locationid where the meter was engaged. 139 | tests: 140 | - relationships: 141 | to: ref('taxi_zone_lookup') 142 | field: locationid 143 | severity: warn 144 | - name: dropoff_locationid 145 | description: locationid where the meter was disengaged. 146 | tests: 147 | - relationships: 148 | to: ref('taxi_zone_lookup') 149 | field: locationid 150 | severity: warn 151 | - name: RateCodeID 152 | description: > 153 | The final rate code in effect at the end of the trip. 154 | 1= Standard rate 155 | 2=JFK 156 | 3=Newark 157 | 4=Nassau or Westchester 158 | 5=Negotiated fare 159 | 6=Group ride 160 | - name: Store_and_fwd_flag 161 | description: > 162 | This flag indicates whether the trip record was held in vehicle 163 | memory before sending to the vendor, aka “store and forward,” 164 | because the vehicle did not have a connection to the server. 165 | Y= store and forward trip 166 | N= not a store and forward trip 167 | - name: Dropoff_longitude 168 | description: Longitude where the meter was disengaged. 169 | - name: Dropoff_latitude 170 | description: Latitude where the meter was disengaged. 171 | - name: Payment_type 172 | description: > 173 | A numeric code signifying how the passenger paid for the trip. 174 | tests: 175 | - accepted_values: 176 | values: "{{ var('payment_type_values') }}" 177 | severity: warn 178 | quote: false 179 | - name: payment_type_description 180 | description: Description of the payment_type code 181 | - name: Fare_amount 182 | description: > 183 | The time-and-distance fare calculated by the meter. 184 | Extra Miscellaneous extras and surcharges. Currently, this only includes 185 | the $0.50 and $1 rush hour and overnight charges.
186 | MTA_tax $0.50 MTA tax that is automatically triggered based on the metered 187 | rate in use. 188 | - name: Improvement_surcharge 189 | description: > 190 | $0.30 improvement surcharge assessed trips at the flag drop. The 191 | improvement surcharge began being levied in 2015. 192 | - name: Tip_amount 193 | description: > 194 | Tip amount. This field is automatically populated for credit card 195 | tips. Cash tips are not included. 196 | - name: Tolls_amount 197 | description: Total amount of all tolls paid in trip. 198 | - name: Total_amount 199 | description: The total amount charged to passengers. Does not include cash tips. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Week 5/Code/05 Spark Join and GroupBy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5e43dfad", 6 | "metadata": {}, 7 | "source": [ 8 | "## Section I\n", 9 | "\n", 10 | "Group By" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "b5d349d0", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pyspark\n", 21 | "from pyspark.sql import SparkSession\n", 22 | "\n", 23 | "spark = SparkSession.builder \\\n", 24 | " .master(\"local[*]\") \\\n", 25 | " .appName('test') \\\n", 26 | " .getOrCreate()" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "id": "44762dd8", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "df_green = spark.read.parquet('D:/data/pq/green/*/*')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "id": "4f81ad58", 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stderr", 47 | "output_type": "stream", 48 | "text": [ 49 | "C:\\Users\\balaj\\anaconda3\\lib\\site-packages\\pyspark\\sql\\dataframe.py:138: FutureWarning: Deprecated in 2.0, use createOrReplaceTempView instead.\n", 50 | " warnings.warn(\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "df_green.registerTempTable('green')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "id": "1800625a", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "df_green_revenue = spark.sql(\"\"\"\n", 66 | "SELECT \n", 67 | " date_trunc('hour', lpep_pickup_datetime) AS hour, \n", 68 | " PULocationID AS zone,\n", 69 | "\n", 70 | " SUM(total_amount) AS amount,\n", 71 | " COUNT(1) AS number_records\n", 72 | "FROM\n", 73 | " green\n", 74 | "WHERE\n", 75 | " lpep_pickup_datetime >= '2020-01-01 00:00:00'\n", 76 | "GROUP BY\n", 77 | " 1, 2\n", 78 | "ORDER BY \n", 79 | " 1, 2\n", 80 | "\"\"\")" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 7, 86 | "id": "371d744a", 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "+-------------------+----+------------------+--------------+\n", 94 | "| hour|zone| amount|number_records|\n", 95 | "+-------------------+----+------------------+--------------+\n", 96 | "|2020-01-01 00:00:00| 7| 769.7299999999997| 45|\n", 97 | "|2020-01-01 00:00:00| 17|195.03000000000006| 9|\n", 98 | "|2020-01-01 00:00:00| 18| 7.8| 1|\n", 99 | "|2020-01-01 00:00:00| 22| 15.8| 1|\n", 100 | "|2020-01-01 00:00:00| 24| 87.6| 3|\n", 101 | "|2020-01-01 00:00:00| 25| 531.0000000000001| 26|\n", 102 | "|2020-01-01 00:00:00| 29| 61.3| 1|\n", 103 | "|2020-01-01 00:00:00| 32| 68.94999999999999| 2|\n", 104 | "|2020-01-01 00:00:00| 33|317.27000000000004| 11|\n", 105 | "|2020-01-01 00:00:00| 35| 129.96| 5|\n", 106 | "|2020-01-01 00:00:00| 36| 295.34| 11|\n", 107 | "|2020-01-01 00:00:00| 37| 175.67| 6|\n", 108 | "|2020-01-01 00:00:00| 38| 98.78999999999999| 2|\n", 109 | "|2020-01-01 00:00:00| 40| 168.98| 8|\n", 110 | "|2020-01-01 00:00:00| 41|1363.9599999999987| 84|\n", 111 | "|2020-01-01 00:00:00| 42| 799.7599999999996| 52|\n", 112 | "|2020-01-01 00:00:00| 43| 107.52| 6|\n", 113 | "|2020-01-01 00:00:00| 47| 13.3| 1|\n", 114 | "|2020-01-01 00:00:00| 49| 266.7600000000001| 14|\n", 115 | "|2020-01-01 00:00:00| 51| 17.8| 2|\n", 116 | 
"+-------------------+----+------------------+--------------+\n", 117 | "only showing top 20 rows\n", 118 | "\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "df_green_revenue.show()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 13, 129 | "id": "f8f868b0", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "df_green_revenue \\\n", 134 | " .repartition(20) \\\n", 135 | " .write.parquet('D:data/report/revenue/green', mode='overwrite')" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 14, 141 | "id": "8ca7e6b4", 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "df_yellow = spark.read.parquet('D:/data/pq/yellow/*/*')\n", 146 | "df_yellow.registerTempTable('yellow')" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 15, 152 | "id": "ee29e4f4", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "df_yellow_revenue = spark.sql(\"\"\"\n", 157 | "SELECT \n", 158 | " date_trunc('hour', tpep_pickup_datetime) AS hour, \n", 159 | " PULocationID AS zone,\n", 160 | "\n", 161 | " SUM(total_amount) AS amount,\n", 162 | " COUNT(1) AS number_records\n", 163 | "FROM\n", 164 | " yellow\n", 165 | "WHERE\n", 166 | " tpep_pickup_datetime >= '2020-01-01 00:00:00'\n", 167 | "GROUP BY\n", 168 | " 1, 2\n", 169 | "\"\"\")" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 16, 175 | "id": "2148f001", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "df_yellow_revenue \\\n", 180 | " .repartition(20) \\\n", 181 | " .write.parquet('D:/data/report/revenue/yellow', mode='overwrite')" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "id": "15ba0334", 187 | "metadata": {}, 188 | "source": [ 189 | "## Section II\n", 190 | "\n", 191 | "Joins" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "id": "9b2b3060", 197 | "metadata": {}, 198 | "source": [ 199 | "Type 1 - Tables of equal/similar size" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 17, 205 | "id": "8ffae439", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "df_green_revenue = spark.read.parquet('D:/data/report/revenue/green')\n", 210 | "df_yellow_revenue = spark.read.parquet('D:/data/report/revenue/yellow')" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 18, 216 | "id": "4ff23cd0", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "df_green_revenue_tmp = df_green_revenue \\\n", 221 | " .withColumnRenamed('amount', 'green_amount') \\\n", 222 | " .withColumnRenamed('number_records', 'green_number_records')\n", 223 | "\n", 224 | "df_yellow_revenue_tmp = df_yellow_revenue \\\n", 225 | " .withColumnRenamed('amount', 'yellow_amount') \\\n", 226 | " .withColumnRenamed('number_records', 'yellow_number_records')" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 20, 232 | "id": "98ac3e6a", 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "df_join = df_green_revenue_tmp.join(df_yellow_revenue_tmp, on=['hour', 'zone'], how='outer')" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 21, 242 | "id": "a0fadc51", 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "df_join.write.parquet('D:/data/report/revenue/total', mode='overwrite')" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 22, 252 | "id": "31cb79f0", 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 
256 | "df_join = spark.read.parquet('D:/data/report/revenue/total')" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 23, 262 | "id": "c2edcaec", 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "data": { 267 | "text/plain": [ 268 | "DataFrame[hour: timestamp, zone: int, green_amount: double, green_number_records: bigint, yellow_amount: double, yellow_number_records: bigint]" 269 | ] 270 | }, 271 | "execution_count": 23, 272 | "metadata": {}, 273 | "output_type": "execute_result" 274 | } 275 | ], 276 | "source": [ 277 | "df_join" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "id": "08900d84", 283 | "metadata": {}, 284 | "source": [ 285 | "Type 2 - Big table and a smaller table" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 48, 291 | "id": "0924acf1", 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stderr", 296 | "output_type": "stream", 297 | "text": [ 298 | "find: '/I': No such file or directory\n", 299 | "find: '/N': No such file or directory\n", 300 | "find: 'SoundMixer.exe': No such file or directory\n", 301 | " % Total % Received % Xferd Average Speed Time Time Time Current\n", 302 | " Dload Upload Total Spent Left Speed\n", 303 | "\n", 304 | " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\n", 305 | " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\n", 306 | "\n", 307 | " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\n", 308 | "100 12322 100 12322 0 0 16245 0 --:--:-- --:--:-- --:--:-- 56522\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "import os\n", 314 | "\n", 315 | "directory = \"C:/users/balaj/zones\"\n", 316 | "if not os.path.exists(directory):\n", 317 | " os.makedirs(directory)\n", 318 | " \n", 319 | "!curl -L -o \"C:/users/balaj/zones/taxi_zone_lookup.csv\" \"https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv\"" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 60, 325 | "id": "c9eb3b30", 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "df_temp = spark.read.csv(\"C:/users/balaj/zones/taxi_zone_lookup.csv\", header=True)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 62, 335 | "id": "af494ba3", 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "df_temp.write.parquet(\"C:/users/balaj/zones/taxi_zone_lookup\")" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 63, 345 | "id": "d76d0c0f", 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "df_zones = spark.read.parquet('C:/users/balaj/zones/taxi_zone_lookup/')" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 64, 355 | "id": "7715c3e0", 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "df_result = df_join.join(df_zones, df_join.zone == df_zones.LocationID)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 65, 365 | "id": "bc8a653f", 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "df_result.drop('LocationID', 'zone').write.parquet('tmp/revenue-zones')" 370 | ] 371 | } 372 | ], 373 | "metadata": { 374 | "kernelspec": { 375 | "display_name": "Python 3 (ipykernel)", 376 | "language": "python", 377 | "name": "python3" 378 | }, 379 | "language_info": { 380 | "codemirror_mode": { 381 | "name": "ipython", 382 | "version": 3 383 | }, 384 | "file_extension": ".py", 385 | "mimetype": "text/x-python", 386 | "name": "python", 387 | "nbconvert_exporter": "python", 388 | 
"pygments_lexer": "ipython3", 389 | "version": "3.9.15" 390 | } 391 | }, 392 | "nbformat": 4, 393 | "nbformat_minor": 5 394 | } 395 | -------------------------------------------------------------------------------- /Week 4/dbt_files/seeds/taxi_zone_lookup.csv: -------------------------------------------------------------------------------- 1 | "LocationID","Borough","Zone","service_zone" 2 | 1,"EWR","Newark Airport","EWR" 3 | 2,"Queens","Jamaica Bay","Boro Zone" 4 | 3,"Bronx","Allerton/Pelham Gardens","Boro Zone" 5 | 4,"Manhattan","Alphabet City","Yellow Zone" 6 | 5,"Staten Island","Arden Heights","Boro Zone" 7 | 6,"Staten Island","Arrochar/Fort Wadsworth","Boro Zone" 8 | 7,"Queens","Astoria","Boro Zone" 9 | 8,"Queens","Astoria Park","Boro Zone" 10 | 9,"Queens","Auburndale","Boro Zone" 11 | 10,"Queens","Baisley Park","Boro Zone" 12 | 11,"Brooklyn","Bath Beach","Boro Zone" 13 | 12,"Manhattan","Battery Park","Yellow Zone" 14 | 13,"Manhattan","Battery Park City","Yellow Zone" 15 | 14,"Brooklyn","Bay Ridge","Boro Zone" 16 | 15,"Queens","Bay Terrace/Fort Totten","Boro Zone" 17 | 16,"Queens","Bayside","Boro Zone" 18 | 17,"Brooklyn","Bedford","Boro Zone" 19 | 18,"Bronx","Bedford Park","Boro Zone" 20 | 19,"Queens","Bellerose","Boro Zone" 21 | 20,"Bronx","Belmont","Boro Zone" 22 | 21,"Brooklyn","Bensonhurst East","Boro Zone" 23 | 22,"Brooklyn","Bensonhurst West","Boro Zone" 24 | 23,"Staten Island","Bloomfield/Emerson Hill","Boro Zone" 25 | 24,"Manhattan","Bloomingdale","Yellow Zone" 26 | 25,"Brooklyn","Boerum Hill","Boro Zone" 27 | 26,"Brooklyn","Borough Park","Boro Zone" 28 | 27,"Queens","Breezy Point/Fort Tilden/Riis Beach","Boro Zone" 29 | 28,"Queens","Briarwood/Jamaica Hills","Boro Zone" 30 | 29,"Brooklyn","Brighton Beach","Boro Zone" 31 | 30,"Queens","Broad Channel","Boro Zone" 32 | 31,"Bronx","Bronx Park","Boro Zone" 33 | 32,"Bronx","Bronxdale","Boro Zone" 34 | 33,"Brooklyn","Brooklyn Heights","Boro Zone" 35 | 34,"Brooklyn","Brooklyn Navy Yard","Boro Zone" 36 | 35,"Brooklyn","Brownsville","Boro Zone" 37 | 36,"Brooklyn","Bushwick North","Boro Zone" 38 | 37,"Brooklyn","Bushwick South","Boro Zone" 39 | 38,"Queens","Cambria Heights","Boro Zone" 40 | 39,"Brooklyn","Canarsie","Boro Zone" 41 | 40,"Brooklyn","Carroll Gardens","Boro Zone" 42 | 41,"Manhattan","Central Harlem","Boro Zone" 43 | 42,"Manhattan","Central Harlem North","Boro Zone" 44 | 43,"Manhattan","Central Park","Yellow Zone" 45 | 44,"Staten Island","Charleston/Tottenville","Boro Zone" 46 | 45,"Manhattan","Chinatown","Yellow Zone" 47 | 46,"Bronx","City Island","Boro Zone" 48 | 47,"Bronx","Claremont/Bathgate","Boro Zone" 49 | 48,"Manhattan","Clinton East","Yellow Zone" 50 | 49,"Brooklyn","Clinton Hill","Boro Zone" 51 | 50,"Manhattan","Clinton West","Yellow Zone" 52 | 51,"Bronx","Co-Op City","Boro Zone" 53 | 52,"Brooklyn","Cobble Hill","Boro Zone" 54 | 53,"Queens","College Point","Boro Zone" 55 | 54,"Brooklyn","Columbia Street","Boro Zone" 56 | 55,"Brooklyn","Coney Island","Boro Zone" 57 | 56,"Queens","Corona","Boro Zone" 58 | 57,"Queens","Corona","Boro Zone" 59 | 58,"Bronx","Country Club","Boro Zone" 60 | 59,"Bronx","Crotona Park","Boro Zone" 61 | 60,"Bronx","Crotona Park East","Boro Zone" 62 | 61,"Brooklyn","Crown Heights North","Boro Zone" 63 | 62,"Brooklyn","Crown Heights South","Boro Zone" 64 | 63,"Brooklyn","Cypress Hills","Boro Zone" 65 | 64,"Queens","Douglaston","Boro Zone" 66 | 65,"Brooklyn","Downtown Brooklyn/MetroTech","Boro Zone" 67 | 66,"Brooklyn","DUMBO/Vinegar Hill","Boro Zone" 68 | 67,"Brooklyn","Dyker Heights","Boro Zone" 
69 | 68,"Manhattan","East Chelsea","Yellow Zone" 70 | 69,"Bronx","East Concourse/Concourse Village","Boro Zone" 71 | 70,"Queens","East Elmhurst","Boro Zone" 72 | 71,"Brooklyn","East Flatbush/Farragut","Boro Zone" 73 | 72,"Brooklyn","East Flatbush/Remsen Village","Boro Zone" 74 | 73,"Queens","East Flushing","Boro Zone" 75 | 74,"Manhattan","East Harlem North","Boro Zone" 76 | 75,"Manhattan","East Harlem South","Boro Zone" 77 | 76,"Brooklyn","East New York","Boro Zone" 78 | 77,"Brooklyn","East New York/Pennsylvania Avenue","Boro Zone" 79 | 78,"Bronx","East Tremont","Boro Zone" 80 | 79,"Manhattan","East Village","Yellow Zone" 81 | 80,"Brooklyn","East Williamsburg","Boro Zone" 82 | 81,"Bronx","Eastchester","Boro Zone" 83 | 82,"Queens","Elmhurst","Boro Zone" 84 | 83,"Queens","Elmhurst/Maspeth","Boro Zone" 85 | 84,"Staten Island","Eltingville/Annadale/Prince's Bay","Boro Zone" 86 | 85,"Brooklyn","Erasmus","Boro Zone" 87 | 86,"Queens","Far Rockaway","Boro Zone" 88 | 87,"Manhattan","Financial District North","Yellow Zone" 89 | 88,"Manhattan","Financial District South","Yellow Zone" 90 | 89,"Brooklyn","Flatbush/Ditmas Park","Boro Zone" 91 | 90,"Manhattan","Flatiron","Yellow Zone" 92 | 91,"Brooklyn","Flatlands","Boro Zone" 93 | 92,"Queens","Flushing","Boro Zone" 94 | 93,"Queens","Flushing Meadows-Corona Park","Boro Zone" 95 | 94,"Bronx","Fordham South","Boro Zone" 96 | 95,"Queens","Forest Hills","Boro Zone" 97 | 96,"Queens","Forest Park/Highland Park","Boro Zone" 98 | 97,"Brooklyn","Fort Greene","Boro Zone" 99 | 98,"Queens","Fresh Meadows","Boro Zone" 100 | 99,"Staten Island","Freshkills Park","Boro Zone" 101 | 100,"Manhattan","Garment District","Yellow Zone" 102 | 101,"Queens","Glen Oaks","Boro Zone" 103 | 102,"Queens","Glendale","Boro Zone" 104 | 103,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone" 105 | 104,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone" 106 | 105,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone" 107 | 106,"Brooklyn","Gowanus","Boro Zone" 108 | 107,"Manhattan","Gramercy","Yellow Zone" 109 | 108,"Brooklyn","Gravesend","Boro Zone" 110 | 109,"Staten Island","Great Kills","Boro Zone" 111 | 110,"Staten Island","Great Kills Park","Boro Zone" 112 | 111,"Brooklyn","Green-Wood Cemetery","Boro Zone" 113 | 112,"Brooklyn","Greenpoint","Boro Zone" 114 | 113,"Manhattan","Greenwich Village North","Yellow Zone" 115 | 114,"Manhattan","Greenwich Village South","Yellow Zone" 116 | 115,"Staten Island","Grymes Hill/Clifton","Boro Zone" 117 | 116,"Manhattan","Hamilton Heights","Boro Zone" 118 | 117,"Queens","Hammels/Arverne","Boro Zone" 119 | 118,"Staten Island","Heartland Village/Todt Hill","Boro Zone" 120 | 119,"Bronx","Highbridge","Boro Zone" 121 | 120,"Manhattan","Highbridge Park","Boro Zone" 122 | 121,"Queens","Hillcrest/Pomonok","Boro Zone" 123 | 122,"Queens","Hollis","Boro Zone" 124 | 123,"Brooklyn","Homecrest","Boro Zone" 125 | 124,"Queens","Howard Beach","Boro Zone" 126 | 125,"Manhattan","Hudson Sq","Yellow Zone" 127 | 126,"Bronx","Hunts Point","Boro Zone" 128 | 127,"Manhattan","Inwood","Boro Zone" 129 | 128,"Manhattan","Inwood Hill Park","Boro Zone" 130 | 129,"Queens","Jackson Heights","Boro Zone" 131 | 130,"Queens","Jamaica","Boro Zone" 132 | 131,"Queens","Jamaica Estates","Boro Zone" 133 | 132,"Queens","JFK Airport","Airports" 134 | 133,"Brooklyn","Kensington","Boro Zone" 135 | 134,"Queens","Kew Gardens","Boro Zone" 136 | 135,"Queens","Kew Gardens Hills","Boro Zone" 137 | 136,"Bronx","Kingsbridge Heights","Boro Zone" 
138 | 137,"Manhattan","Kips Bay","Yellow Zone" 139 | 138,"Queens","LaGuardia Airport","Airports" 140 | 139,"Queens","Laurelton","Boro Zone" 141 | 140,"Manhattan","Lenox Hill East","Yellow Zone" 142 | 141,"Manhattan","Lenox Hill West","Yellow Zone" 143 | 142,"Manhattan","Lincoln Square East","Yellow Zone" 144 | 143,"Manhattan","Lincoln Square West","Yellow Zone" 145 | 144,"Manhattan","Little Italy/NoLiTa","Yellow Zone" 146 | 145,"Queens","Long Island City/Hunters Point","Boro Zone" 147 | 146,"Queens","Long Island City/Queens Plaza","Boro Zone" 148 | 147,"Bronx","Longwood","Boro Zone" 149 | 148,"Manhattan","Lower East Side","Yellow Zone" 150 | 149,"Brooklyn","Madison","Boro Zone" 151 | 150,"Brooklyn","Manhattan Beach","Boro Zone" 152 | 151,"Manhattan","Manhattan Valley","Yellow Zone" 153 | 152,"Manhattan","Manhattanville","Boro Zone" 154 | 153,"Manhattan","Marble Hill","Boro Zone" 155 | 154,"Brooklyn","Marine Park/Floyd Bennett Field","Boro Zone" 156 | 155,"Brooklyn","Marine Park/Mill Basin","Boro Zone" 157 | 156,"Staten Island","Mariners Harbor","Boro Zone" 158 | 157,"Queens","Maspeth","Boro Zone" 159 | 158,"Manhattan","Meatpacking/West Village West","Yellow Zone" 160 | 159,"Bronx","Melrose South","Boro Zone" 161 | 160,"Queens","Middle Village","Boro Zone" 162 | 161,"Manhattan","Midtown Center","Yellow Zone" 163 | 162,"Manhattan","Midtown East","Yellow Zone" 164 | 163,"Manhattan","Midtown North","Yellow Zone" 165 | 164,"Manhattan","Midtown South","Yellow Zone" 166 | 165,"Brooklyn","Midwood","Boro Zone" 167 | 166,"Manhattan","Morningside Heights","Boro Zone" 168 | 167,"Bronx","Morrisania/Melrose","Boro Zone" 169 | 168,"Bronx","Mott Haven/Port Morris","Boro Zone" 170 | 169,"Bronx","Mount Hope","Boro Zone" 171 | 170,"Manhattan","Murray Hill","Yellow Zone" 172 | 171,"Queens","Murray Hill-Queens","Boro Zone" 173 | 172,"Staten Island","New Dorp/Midland Beach","Boro Zone" 174 | 173,"Queens","North Corona","Boro Zone" 175 | 174,"Bronx","Norwood","Boro Zone" 176 | 175,"Queens","Oakland Gardens","Boro Zone" 177 | 176,"Staten Island","Oakwood","Boro Zone" 178 | 177,"Brooklyn","Ocean Hill","Boro Zone" 179 | 178,"Brooklyn","Ocean Parkway South","Boro Zone" 180 | 179,"Queens","Old Astoria","Boro Zone" 181 | 180,"Queens","Ozone Park","Boro Zone" 182 | 181,"Brooklyn","Park Slope","Boro Zone" 183 | 182,"Bronx","Parkchester","Boro Zone" 184 | 183,"Bronx","Pelham Bay","Boro Zone" 185 | 184,"Bronx","Pelham Bay Park","Boro Zone" 186 | 185,"Bronx","Pelham Parkway","Boro Zone" 187 | 186,"Manhattan","Penn Station/Madison Sq West","Yellow Zone" 188 | 187,"Staten Island","Port Richmond","Boro Zone" 189 | 188,"Brooklyn","Prospect-Lefferts Gardens","Boro Zone" 190 | 189,"Brooklyn","Prospect Heights","Boro Zone" 191 | 190,"Brooklyn","Prospect Park","Boro Zone" 192 | 191,"Queens","Queens Village","Boro Zone" 193 | 192,"Queens","Queensboro Hill","Boro Zone" 194 | 193,"Queens","Queensbridge/Ravenswood","Boro Zone" 195 | 194,"Manhattan","Randalls Island","Yellow Zone" 196 | 195,"Brooklyn","Red Hook","Boro Zone" 197 | 196,"Queens","Rego Park","Boro Zone" 198 | 197,"Queens","Richmond Hill","Boro Zone" 199 | 198,"Queens","Ridgewood","Boro Zone" 200 | 199,"Bronx","Rikers Island","Boro Zone" 201 | 200,"Bronx","Riverdale/North Riverdale/Fieldston","Boro Zone" 202 | 201,"Queens","Rockaway Park","Boro Zone" 203 | 202,"Manhattan","Roosevelt Island","Boro Zone" 204 | 203,"Queens","Rosedale","Boro Zone" 205 | 204,"Staten Island","Rossville/Woodrow","Boro Zone" 206 | 205,"Queens","Saint Albans","Boro Zone" 207 | 206,"Staten 
Island","Saint George/New Brighton","Boro Zone" 208 | 207,"Queens","Saint Michaels Cemetery/Woodside","Boro Zone" 209 | 208,"Bronx","Schuylerville/Edgewater Park","Boro Zone" 210 | 209,"Manhattan","Seaport","Yellow Zone" 211 | 210,"Brooklyn","Sheepshead Bay","Boro Zone" 212 | 211,"Manhattan","SoHo","Yellow Zone" 213 | 212,"Bronx","Soundview/Bruckner","Boro Zone" 214 | 213,"Bronx","Soundview/Castle Hill","Boro Zone" 215 | 214,"Staten Island","South Beach/Dongan Hills","Boro Zone" 216 | 215,"Queens","South Jamaica","Boro Zone" 217 | 216,"Queens","South Ozone Park","Boro Zone" 218 | 217,"Brooklyn","South Williamsburg","Boro Zone" 219 | 218,"Queens","Springfield Gardens North","Boro Zone" 220 | 219,"Queens","Springfield Gardens South","Boro Zone" 221 | 220,"Bronx","Spuyten Duyvil/Kingsbridge","Boro Zone" 222 | 221,"Staten Island","Stapleton","Boro Zone" 223 | 222,"Brooklyn","Starrett City","Boro Zone" 224 | 223,"Queens","Steinway","Boro Zone" 225 | 224,"Manhattan","Stuy Town/Peter Cooper Village","Yellow Zone" 226 | 225,"Brooklyn","Stuyvesant Heights","Boro Zone" 227 | 226,"Queens","Sunnyside","Boro Zone" 228 | 227,"Brooklyn","Sunset Park East","Boro Zone" 229 | 228,"Brooklyn","Sunset Park West","Boro Zone" 230 | 229,"Manhattan","Sutton Place/Turtle Bay North","Yellow Zone" 231 | 230,"Manhattan","Times Sq/Theatre District","Yellow Zone" 232 | 231,"Manhattan","TriBeCa/Civic Center","Yellow Zone" 233 | 232,"Manhattan","Two Bridges/Seward Park","Yellow Zone" 234 | 233,"Manhattan","UN/Turtle Bay South","Yellow Zone" 235 | 234,"Manhattan","Union Sq","Yellow Zone" 236 | 235,"Bronx","University Heights/Morris Heights","Boro Zone" 237 | 236,"Manhattan","Upper East Side North","Yellow Zone" 238 | 237,"Manhattan","Upper East Side South","Yellow Zone" 239 | 238,"Manhattan","Upper West Side North","Yellow Zone" 240 | 239,"Manhattan","Upper West Side South","Yellow Zone" 241 | 240,"Bronx","Van Cortlandt Park","Boro Zone" 242 | 241,"Bronx","Van Cortlandt Village","Boro Zone" 243 | 242,"Bronx","Van Nest/Morris Park","Boro Zone" 244 | 243,"Manhattan","Washington Heights North","Boro Zone" 245 | 244,"Manhattan","Washington Heights South","Boro Zone" 246 | 245,"Staten Island","West Brighton","Boro Zone" 247 | 246,"Manhattan","West Chelsea/Hudson Yards","Yellow Zone" 248 | 247,"Bronx","West Concourse","Boro Zone" 249 | 248,"Bronx","West Farms/Bronx River","Boro Zone" 250 | 249,"Manhattan","West Village","Yellow Zone" 251 | 250,"Bronx","Westchester Village/Unionport","Boro Zone" 252 | 251,"Staten Island","Westerleigh","Boro Zone" 253 | 252,"Queens","Whitestone","Boro Zone" 254 | 253,"Queens","Willets Point","Boro Zone" 255 | 254,"Bronx","Williamsbridge/Olinville","Boro Zone" 256 | 255,"Brooklyn","Williamsburg (North Side)","Boro Zone" 257 | 256,"Brooklyn","Williamsburg (South Side)","Boro Zone" 258 | 257,"Brooklyn","Windsor Terrace","Boro Zone" 259 | 258,"Queens","Woodhaven","Boro Zone" 260 | 259,"Bronx","Woodlawn/Wakefield","Boro Zone" 261 | 260,"Queens","Woodside","Boro Zone" 262 | 261,"Manhattan","World Trade Center","Yellow Zone" 263 | 262,"Manhattan","Yorkville East","Yellow Zone" 264 | 263,"Manhattan","Yorkville West","Yellow Zone" 265 | 264,"Unknown","NV","N/A" 266 | 265,"Unknown","NA","N/A" 267 | -------------------------------------------------------------------------------- /Week 5/Code/04 Spark SQL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "1a8cf1a8", 7 | "metadata": {}, 
8 | "outputs": [], 9 | "source": [ 10 | "import pyspark\n", 11 | "from pyspark.sql import SparkSession\n", 12 | "\n", 13 | "spark = SparkSession.builder \\\n", 14 | " .master(\"local[*]\") \\\n", 15 | " .appName('test') \\\n", 16 | " .getOrCreate()" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 9, 22 | "id": "17ff4c2a", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "df_green = spark.read.parquet('D:/data/pq/green/*/*')" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 10, 32 | "id": "7568f742", 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "root\n", 40 | " |-- VendorID: integer (nullable = true)\n", 41 | " |-- lpep_pickup_datetime: timestamp (nullable = true)\n", 42 | " |-- lpep_dropoff_datetime: timestamp (nullable = true)\n", 43 | " |-- store_and_fwd_flag: string (nullable = true)\n", 44 | " |-- RatecodeID: integer (nullable = true)\n", 45 | " |-- PULocationID: integer (nullable = true)\n", 46 | " |-- DOLocationID: integer (nullable = true)\n", 47 | " |-- passenger_count: integer (nullable = true)\n", 48 | " |-- trip_distance: double (nullable = true)\n", 49 | " |-- fare_amount: double (nullable = true)\n", 50 | " |-- extra: double (nullable = true)\n", 51 | " |-- mta_tax: double (nullable = true)\n", 52 | " |-- tip_amount: double (nullable = true)\n", 53 | " |-- tolls_amount: double (nullable = true)\n", 54 | " |-- ehail_fee: string (nullable = true)\n", 55 | " |-- improvement_surcharge: double (nullable = true)\n", 56 | " |-- total_amount: double (nullable = true)\n", 57 | " |-- payment_type: integer (nullable = true)\n", 58 | " |-- trip_type: integer (nullable = true)\n", 59 | " |-- congestion_surcharge: double (nullable = true)\n", 60 | "\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "df_green.printSchema()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 11, 71 | "id": "ca99c5db", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# Changing the column names for the date columns\n", 76 | "\n", 77 | "df_green = df_green \\\n", 78 | " .withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime') \\\n", 79 | " .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime')" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 12, 85 | "id": "15991733", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "root\n", 93 | " |-- VendorID: integer (nullable = true)\n", 94 | " |-- pickup_datetime: timestamp (nullable = true)\n", 95 | " |-- dropoff_datetime: timestamp (nullable = true)\n", 96 | " |-- store_and_fwd_flag: string (nullable = true)\n", 97 | " |-- RatecodeID: integer (nullable = true)\n", 98 | " |-- PULocationID: integer (nullable = true)\n", 99 | " |-- DOLocationID: integer (nullable = true)\n", 100 | " |-- passenger_count: integer (nullable = true)\n", 101 | " |-- trip_distance: double (nullable = true)\n", 102 | " |-- fare_amount: double (nullable = true)\n", 103 | " |-- extra: double (nullable = true)\n", 104 | " |-- mta_tax: double (nullable = true)\n", 105 | " |-- tip_amount: double (nullable = true)\n", 106 | " |-- tolls_amount: double (nullable = true)\n", 107 | " |-- ehail_fee: string (nullable = true)\n", 108 | " |-- improvement_surcharge: double (nullable = true)\n", 109 | " |-- total_amount: double (nullable = true)\n", 110 | " |-- payment_type: integer (nullable = true)\n", 111 | " |-- trip_type: integer 
(nullable = true)\n", 112 | " |-- congestion_surcharge: double (nullable = true)\n", 113 | "\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "# checking the schema to see if the column names are changed\n", 119 | "\n", 120 | "df_green.printSchema()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 13, 126 | "id": "2a5dba50", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "df_yellow = spark.read.parquet('D:/data/pq/yellow/*/*')" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 14, 136 | "id": "faf1413f", 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "root\n", 144 | " |-- VendorID: integer (nullable = true)\n", 145 | " |-- tpep_pickup_datetime: timestamp (nullable = true)\n", 146 | " |-- tpep_dropoff_datetime: timestamp (nullable = true)\n", 147 | " |-- passenger_count: integer (nullable = true)\n", 148 | " |-- trip_distance: double (nullable = true)\n", 149 | " |-- RatecodeID: integer (nullable = true)\n", 150 | " |-- store_and_fwd_flag: string (nullable = true)\n", 151 | " |-- PULocationID: integer (nullable = true)\n", 152 | " |-- DOLocationID: integer (nullable = true)\n", 153 | " |-- payment_type: integer (nullable = true)\n", 154 | " |-- fare_amount: double (nullable = true)\n", 155 | " |-- extra: double (nullable = true)\n", 156 | " |-- mta_tax: double (nullable = true)\n", 157 | " |-- tip_amount: double (nullable = true)\n", 158 | " |-- tolls_amount: double (nullable = true)\n", 159 | " |-- improvement_surcharge: double (nullable = true)\n", 160 | " |-- total_amount: double (nullable = true)\n", 161 | " |-- congestion_surcharge: double (nullable = true)\n", 162 | "\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "df_yellow.printSchema()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 15, 173 | "id": "a78482c4", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "# Changing the column names for the date columns\n", 178 | "\n", 179 | "df_yellow = df_yellow \\\n", 180 | " .withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime') \\\n", 181 | " .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime')" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 16, 187 | "id": "52a5bf4f", 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "root\n", 195 | " |-- VendorID: integer (nullable = true)\n", 196 | " |-- pickup_datetime: timestamp (nullable = true)\n", 197 | " |-- dropoff_datetime: timestamp (nullable = true)\n", 198 | " |-- passenger_count: integer (nullable = true)\n", 199 | " |-- trip_distance: double (nullable = true)\n", 200 | " |-- RatecodeID: integer (nullable = true)\n", 201 | " |-- store_and_fwd_flag: string (nullable = true)\n", 202 | " |-- PULocationID: integer (nullable = true)\n", 203 | " |-- DOLocationID: integer (nullable = true)\n", 204 | " |-- payment_type: integer (nullable = true)\n", 205 | " |-- fare_amount: double (nullable = true)\n", 206 | " |-- extra: double (nullable = true)\n", 207 | " |-- mta_tax: double (nullable = true)\n", 208 | " |-- tip_amount: double (nullable = true)\n", 209 | " |-- tolls_amount: double (nullable = true)\n", 210 | " |-- improvement_surcharge: double (nullable = true)\n", 211 | " |-- total_amount: double (nullable = true)\n", 212 | " |-- congestion_surcharge: double (nullable = true)\n", 213 | "\n" 214 | ] 215 | } 216 | ], 
217 | "source": [ 218 | "# checking the schema to see if the column names are changed\n", 219 | "\n", 220 | "df_yellow.printSchema()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 18, 226 | "id": "aeb8195e", 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/plain": [ 232 | "['VendorID',\n", 233 | " 'pickup_datetime',\n", 234 | " 'dropoff_datetime',\n", 235 | " 'store_and_fwd_flag',\n", 236 | " 'RatecodeID',\n", 237 | " 'PULocationID',\n", 238 | " 'DOLocationID',\n", 239 | " 'passenger_count',\n", 240 | " 'trip_distance',\n", 241 | " 'fare_amount',\n", 242 | " 'extra',\n", 243 | " 'mta_tax',\n", 244 | " 'tip_amount',\n", 245 | " 'tolls_amount',\n", 246 | " 'improvement_surcharge',\n", 247 | " 'total_amount',\n", 248 | " 'payment_type',\n", 249 | " 'congestion_surcharge']" 250 | ] 251 | }, 252 | "execution_count": 18, 253 | "metadata": {}, 254 | "output_type": "execute_result" 255 | } 256 | ], 257 | "source": [ 258 | "# Preserving the row order and choosing only the columns common in both the taxi datasets\n", 259 | "\n", 260 | "common_columns = []\n", 261 | "\n", 262 | "cols = set(df_yellow.columns)\n", 263 | "\n", 264 | "for col in df_green.columns:\n", 265 | " if col in cols:\n", 266 | " common_columns.append(col)\n", 267 | "\n", 268 | "common_columns" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 22, 274 | "id": "933bee9c", 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "from pyspark.sql import functions as F" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 25, 284 | "id": "c6eed3e5", 285 | "metadata": { 286 | "scrolled": true 287 | }, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "['VendorID',\n", 293 | " 'pickup_datetime',\n", 294 | " 'dropoff_datetime',\n", 295 | " 'store_and_fwd_flag',\n", 296 | " 'RatecodeID',\n", 297 | " 'PULocationID',\n", 298 | " 'DOLocationID',\n", 299 | " 'passenger_count',\n", 300 | " 'trip_distance',\n", 301 | " 'fare_amount',\n", 302 | " 'extra',\n", 303 | " 'mta_tax',\n", 304 | " 'tip_amount',\n", 305 | " 'tolls_amount',\n", 306 | " 'improvement_surcharge',\n", 307 | " 'total_amount',\n", 308 | " 'payment_type',\n", 309 | " 'congestion_surcharge',\n", 310 | " 'service_type']" 311 | ] 312 | }, 313 | "execution_count": 25, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "# Adding an additional column to differentiate the taxi color\n", 320 | "\n", 321 | "df_green_sel = df_green.select(common_columns).withColumn(\"service_type\", F.lit('green'))\n", 322 | "\n", 323 | "df_green_sel.columns" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 26, 329 | "id": "0cc38d27", 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "data": { 334 | "text/plain": [ 335 | "['VendorID',\n", 336 | " 'pickup_datetime',\n", 337 | " 'dropoff_datetime',\n", 338 | " 'store_and_fwd_flag',\n", 339 | " 'RatecodeID',\n", 340 | " 'PULocationID',\n", 341 | " 'DOLocationID',\n", 342 | " 'passenger_count',\n", 343 | " 'trip_distance',\n", 344 | " 'fare_amount',\n", 345 | " 'extra',\n", 346 | " 'mta_tax',\n", 347 | " 'tip_amount',\n", 348 | " 'tolls_amount',\n", 349 | " 'improvement_surcharge',\n", 350 | " 'total_amount',\n", 351 | " 'payment_type',\n", 352 | " 'congestion_surcharge',\n", 353 | " 'service_type']" 354 | ] 355 | }, 356 | "execution_count": 26, 357 | "metadata": {}, 358 | "output_type": "execute_result" 359 | } 360 | ], 361 | "source": [ 
362 | "# Adding an additional column to differentiate the taxi color\n", 363 | "\n", 364 | "df_yellow_sel = df_yellow.select(common_columns).withColumn(\"service_type\", F.lit('yellow'))\n", 365 | "\n", 366 | "df_yellow_sel.columns" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 27, 372 | "id": "f0c117d3", 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "# Combining the two datasets\n", 377 | "\n", 378 | "df_trips_data = df_green_sel.unionAll(df_yellow_sel)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 45, 384 | "id": "e415c4c0", 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "+------------+--------+\n", 392 | "|service_type| count|\n", 393 | "+------------+--------+\n", 394 | "| yellow|39649199|\n", 395 | "| green| 2304517|\n", 396 | "+------------+--------+\n", 397 | "\n" 398 | ] 399 | } 400 | ], 401 | "source": [ 402 | "df_trips_data.groupBy('service_type').count().orderBy('count', ascending = False).show()" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 48, 408 | "id": "e242ddd9", 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "name": "stderr", 413 | "output_type": "stream", 414 | "text": [ 415 | "C:\\Users\\balaj\\anaconda3\\lib\\site-packages\\pyspark\\sql\\dataframe.py:138: FutureWarning: Deprecated in 2.0, use createOrReplaceTempView instead.\n", 416 | " warnings.warn(\n" 417 | ] 418 | } 419 | ], 420 | "source": [ 421 | "# Registering the spark dataframe as a temporary table to be used for Spark SQL\n", 422 | "\n", 423 | "df_trips_data.registerTempTable('trips_data')" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 57, 429 | "id": "7bbd40ae", 430 | "metadata": {}, 431 | "outputs": [ 432 | { 433 | "name": "stdout", 434 | "output_type": "stream", 435 | "text": [ 436 | "+------------+--------+\n", 437 | "|service_type| count|\n", 438 | "+------------+--------+\n", 439 | "| yellow|39649199|\n", 440 | "| green| 2304517|\n", 441 | "+------------+--------+\n", 442 | "\n" 443 | ] 444 | } 445 | ], 446 | "source": [ 447 | "# Sample SQL query\n", 448 | "\n", 449 | "spark.sql(\"\"\"\n", 450 | "SELECT \n", 451 | " service_type, count(1) as count\n", 452 | "FROM\n", 453 | " trips_data\n", 454 | "GROUP BY 1\n", 455 | "ORDER BY 1 desc\n", 456 | "\"\"\"\n", 457 | ").show()" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 58, 463 | "id": "f8350e5e", 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "# Saving a result from a SQL query\n", 468 | "\n", 469 | "df_result = spark.sql(\"\"\"\n", 470 | "SELECT \n", 471 | " -- Reveneue grouping \n", 472 | " PULocationID AS revenue_zone,\n", 473 | " date_trunc('month', pickup_datetime) AS revenue_month, \n", 474 | " service_type, \n", 475 | "\n", 476 | " -- Revenue calculation \n", 477 | " SUM(fare_amount) AS revenue_monthly_fare,\n", 478 | " SUM(extra) AS revenue_monthly_extra,\n", 479 | " SUM(mta_tax) AS revenue_monthly_mta_tax,\n", 480 | " SUM(tip_amount) AS revenue_monthly_tip_amount,\n", 481 | " SUM(tolls_amount) AS revenue_monthly_tolls_amount,\n", 482 | " SUM(improvement_surcharge) AS revenue_monthly_improvement_surcharge,\n", 483 | " SUM(total_amount) AS revenue_monthly_total_amount,\n", 484 | " SUM(congestion_surcharge) AS revenue_monthly_congestion_surcharge,\n", 485 | "\n", 486 | " -- Additional calculations\n", 487 | " AVG(passenger_count) AS avg_montly_passenger_count,\n", 488 | 
" AVG(trip_distance) AS avg_montly_trip_distance\n", 489 | "FROM\n", 490 | " trips_data\n", 491 | "GROUP BY\n", 492 | " 1, 2, 3\n", 493 | "\"\"\")\n", 494 | "\n", 495 | "df_result.coalesce(1).write.parquet('D:/data/report/revenue/', mode='overwrite')" 496 | ] 497 | } 498 | ], 499 | "metadata": { 500 | "kernelspec": { 501 | "display_name": "Python 3 (ipykernel)", 502 | "language": "python", 503 | "name": "python3" 504 | }, 505 | "language_info": { 506 | "codemirror_mode": { 507 | "name": "ipython", 508 | "version": 3 509 | }, 510 | "file_extension": ".py", 511 | "mimetype": "text/x-python", 512 | "name": "python", 513 | "nbconvert_exporter": "python", 514 | "pygments_lexer": "ipython3", 515 | "version": "3.9.15" 516 | } 517 | }, 518 | "nbformat": 4, 519 | "nbformat_minor": 5 520 | } 521 | -------------------------------------------------------------------------------- /Week 5/Code/06 RDDs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b40d2766", 6 | "metadata": {}, 7 | "source": [ 8 | "## Section I\n", 9 | "\n", 10 | "Operations on Spark RDDs" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "722b5468", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pyspark\n", 21 | "from pyspark.sql import SparkSession\n", 22 | "\n", 23 | "spark = SparkSession.builder \\\n", 24 | " .master(\"local[*]\") \\\n", 25 | " .appName('test') \\\n", 26 | " .getOrCreate()" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "id": "7f88bc7b", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "df_green = spark.read.parquet('D:/data/pq/green/*/*')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "9e4913cf", 42 | "metadata": {}, 43 | "source": [ 44 | "#### We will be Implementing the below SQL code using RDDs\n", 45 | "\n", 46 | "SELECT \n", 47 | " date_trunc('hour', lpep_pickup_datetime) AS hour, \n", 48 | " PULocationID AS zone, \n", 49 | " \n", 50 | " SUM(total_amount) AS amount,\n", 51 | " COUNT(1) AS number_records \n", 52 | "FROM\n", 53 | " green \\\n", 54 | "WHERE \n", 55 | " lpep_pickup_datetime >= '2020-01-01 00:00:00' \\\n", 56 | "GROUP BY\n", 57 | " 1, 2;" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "id": "34a91beb", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# STEP 1\n", 68 | "\n", 69 | "rdd = df_green \\\n", 70 | " .select('lpep_pickup_datetime', 'PULocationID', 'total_amount') \\\n", 71 | " .rdd" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "id": "1f594261", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "[Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 23, 13, 10, 15), PULocationID=74, total_amount=44.97),\n", 84 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 20, 15, 9), PULocationID=67, total_amount=33.45),\n", 85 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 15, 20, 23, 41), PULocationID=260, total_amount=8.3),\n", 86 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 5, 16, 32, 26), PULocationID=82, total_amount=8.3),\n", 87 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 29, 19, 22, 42), PULocationID=166, total_amount=12.74)]" 88 | ] 89 | }, 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "rdd.take(5)" 97 | ] 98 | }, 99 | { 100 | "cell_type": 
"code", 101 | "execution_count": 6, 102 | "id": "46b797f9", 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "[Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 23, 13, 10, 15), PULocationID=74, total_amount=44.97),\n", 109 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 20, 15, 9), PULocationID=67, total_amount=33.45),\n", 110 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 15, 20, 23, 41), PULocationID=260, total_amount=8.3),\n", 111 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 5, 16, 32, 26), PULocationID=82, total_amount=8.3),\n", 112 | " Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 29, 19, 22, 42), PULocationID=166, total_amount=12.74)]" 113 | ] 114 | }, 115 | "execution_count": 6, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "# STEP 2\n", 122 | "\n", 123 | "from datetime import datetime\n", 124 | "\n", 125 | "start = datetime(year=2020, month=1, day=1)\n", 126 | "\n", 127 | "def filter_outliers(row):\n", 128 | " return row.lpep_pickup_datetime >= start\n", 129 | "\n", 130 | "rdd.filter(filter_outliers).take(5)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 7, 136 | "id": "c2a85921", 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 23, 13, 10, 15), PULocationID=74, total_amount=44.97)" 143 | ] 144 | }, 145 | "execution_count": 7, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "# Used this row for testing purpose\n", 152 | "\n", 153 | "rows = rdd.take(10)\n", 154 | "row = rows[0]\n", 155 | "\n", 156 | "row" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 8, 162 | "id": "2cb29aae", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "# STEP 3\n", 167 | "\n", 168 | "def prepare_for_grouping(row): \n", 169 | " hour = row.lpep_pickup_datetime.replace(minute=0, second=0, microsecond=0)\n", 170 | " zone = row.PULocationID\n", 171 | " key = (hour, zone)\n", 172 | " \n", 173 | " amount = row.total_amount\n", 174 | " count = 1\n", 175 | " value = (amount, count)\n", 176 | "\n", 177 | " return (key, value)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 9, 183 | "id": "91dcfda7", 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "[((datetime.datetime(2020, 1, 23, 13, 0), 74), (44.97, 1)),\n", 190 | " ((datetime.datetime(2020, 1, 20, 15, 0), 67), (33.45, 1)),\n", 191 | " ((datetime.datetime(2020, 1, 15, 20, 0), 260), (8.3, 1)),\n", 192 | " ((datetime.datetime(2020, 1, 5, 16, 0), 82), (8.3, 1)),\n", 193 | " ((datetime.datetime(2020, 1, 29, 19, 0), 166), (12.74, 1))]" 194 | ] 195 | }, 196 | "execution_count": 9, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "rdd.filter(filter_outliers).map(prepare_for_grouping).take(5)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 10, 208 | "id": "308bf061", 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "[((datetime.datetime(2020, 1, 20, 15, 0), 67), (79.5, 3)),\n", 215 | " ((datetime.datetime(2020, 1, 16, 8, 0), 41), (736.1399999999994, 54)),\n", 216 | " ((datetime.datetime(2020, 1, 20, 15, 0), 75), (609.0, 47)),\n", 217 | " ((datetime.datetime(2020, 1, 17, 21, 0), 74), (594.87, 39)),\n", 218 | " 
205 | {
206 | "cell_type": "code",
207 | "execution_count": 10,
208 | "id": "308bf061",
209 | "metadata": {},
210 | "outputs": [
211 | {
212 | "data": {
213 | "text/plain": [
214 | "[((datetime.datetime(2020, 1, 20, 15, 0), 67), (79.5, 3)),\n",
215 | " ((datetime.datetime(2020, 1, 16, 8, 0), 41), (736.1399999999994, 54)),\n",
216 | " ((datetime.datetime(2020, 1, 20, 15, 0), 75), (609.0, 47)),\n",
217 | " ((datetime.datetime(2020, 1, 17, 21, 0), 74), (594.87, 39)),\n",
218 | " ((datetime.datetime(2020, 1, 3, 9, 0), 61), (142.21, 9))]"
219 | ]
220 | },
221 | "execution_count": 10,
222 | "metadata": {},
223 | "output_type": "execute_result"
224 | }
225 | ],
226 | "source": [
227 | "# STEP 4\n",
228 | "\n",
229 | "def calculate_revenue(left_value, right_value):\n",
230 | " left_amount, left_count = left_value\n",
231 | " right_amount, right_count = right_value\n",
232 | " \n",
233 | " output_amount = left_amount + right_amount\n",
234 | " output_count = left_count + right_count\n",
235 | " \n",
236 | " return (output_amount, output_count)\n",
237 | "\n",
238 | "rdd.filter(filter_outliers).map(prepare_for_grouping).reduceByKey(calculate_revenue).take(5)"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 11,
244 | "id": "6d2fa321",
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "data": {
249 | "text/plain": [
250 | "[RevenueRow(hour=datetime.datetime(2020, 1, 20, 15, 0), zone=67, revenue=79.5, count=3),\n",
251 | " RevenueRow(hour=datetime.datetime(2020, 1, 16, 8, 0), zone=41, revenue=736.1399999999994, count=54),\n",
252 | " RevenueRow(hour=datetime.datetime(2020, 1, 20, 15, 0), zone=75, revenue=609.0, count=47),\n",
253 | " RevenueRow(hour=datetime.datetime(2020, 1, 17, 21, 0), zone=74, revenue=594.87, count=39),\n",
254 | " RevenueRow(hour=datetime.datetime(2020, 1, 3, 9, 0), zone=61, revenue=142.21, count=9)]"
255 | ]
256 | },
257 | "execution_count": 11,
258 | "metadata": {},
259 | "output_type": "execute_result"
260 | }
261 | ],
262 | "source": [
263 | "# STEP 5\n",
264 | "\n",
265 | "from collections import namedtuple\n",
266 | "\n",
267 | "RevenueRow = namedtuple('RevenueRow', ['hour', 'zone', 'revenue', 'count'])\n",
268 | "\n",
269 | "def unwrap(row):\n",
270 | " return RevenueRow(\n",
271 | " hour=row[0][0], \n",
272 | " zone=row[0][1],\n",
273 | " revenue=row[1][0],\n",
274 | " count=row[1][1]\n",
275 | " )\n",
276 | "\n",
277 | "rdd.filter(filter_outliers).map(prepare_for_grouping).reduceByKey(calculate_revenue).map(unwrap).take(5)"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": 18,
283 | "id": "c7623de8",
284 | "metadata": {},
285 | "outputs": [
286 | {
287 | "name": "stdout",
288 | "output_type": "stream",
289 | "text": [
290 | "+-------------------+----+-----------------+-----+\n",
291 | "| hour|zone| revenue|count|\n",
292 | "+-------------------+----+-----------------+-----+\n",
293 | "|2020-01-20 15:00:00| 67| 79.5| 3|\n",
294 | "|2020-01-16 08:00:00| 41|736.1399999999994| 54|\n",
295 | "|2020-01-20 15:00:00| 75| 609.0| 47|\n",
296 | "|2020-01-17 21:00:00| 74| 594.87| 39|\n",
297 | "|2020-01-03 09:00:00| 61| 142.21| 9|\n",
298 | "+-------------------+----+-----------------+-----+\n",
299 | "only showing top 5 rows\n",
300 | "\n"
301 | ]
302 | }
303 | ],
304 | "source": [
305 | "# If you don't specify the schema in toDF(), it takes a bit longer to run because Spark has to sample the RDD to infer the schema\n",
306 | "\n",
307 | "rdd.filter(filter_outliers).map(prepare_for_grouping).reduceByKey(calculate_revenue).map(unwrap).toDF().show(5)"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 19,
313 | "id": "a800b5cb",
314 | "metadata": {},
315 | "outputs": [
316 | {
317 | "name": "stdout",
318 | "output_type": "stream",
319 | "text": [
320 | "+-------------------+----+-----------------+-----+\n",
321 | "| hour|zone| revenue|count|\n",
322 | "+-------------------+----+-----------------+-----+\n",
323 | "|2020-01-20 15:00:00| 67| 79.5| 3|\n",
324 | "|2020-01-16 
08:00:00| 41|736.1399999999994| 54|\n", 325 | "|2020-01-20 15:00:00| 75| 609.0| 47|\n", 326 | "|2020-01-17 21:00:00| 74| 594.87| 39|\n", 327 | "|2020-01-03 09:00:00| 61| 142.21| 9|\n", 328 | "+-------------------+----+-----------------+-----+\n", 329 | "only showing top 5 rows\n", 330 | "\n" 331 | ] 332 | } 333 | ], 334 | "source": [ 335 | "# STEP 6\n", 336 | "\n", 337 | "from pyspark.sql import types\n", 338 | "\n", 339 | "result_schema = types.StructType([\n", 340 | " types.StructField('hour', types.TimestampType(), True),\n", 341 | " types.StructField('zone', types.IntegerType(), True),\n", 342 | " types.StructField('revenue', types.DoubleType(), True),\n", 343 | " types.StructField('count', types.IntegerType(), True)\n", 344 | "])\n", 345 | "\n", 346 | "rdd.filter(filter_outliers).map(prepare_for_grouping).reduceByKey(calculate_revenue).map(unwrap).toDF(result_schema).show(5)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 20, 352 | "id": "d3d64007", 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "df_result = rdd \\\n", 357 | " .filter(filter_outliers) \\\n", 358 | " .map(prepare_for_grouping) \\\n", 359 | " .reduceByKey(calculate_revenue) \\\n", 360 | " .map(unwrap) \\\n", 361 | " .toDF(result_schema) " 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 15, 367 | "id": "72399664", 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "df_result.write.parquet('D:/tmp/green-revenue')" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "id": "ffa1c598", 377 | "metadata": {}, 378 | "source": [ 379 | "## Section II\n", 380 | "\n", 381 | "Spark RDD mapPartitions" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 21, 387 | "id": "c3f3c1fe", 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "columns = ['VendorID', 'lpep_pickup_datetime', 'PULocationID', 'DOLocationID', 'trip_distance']\n", 392 | "\n", 393 | "duration_rdd = df_green \\\n", 394 | " .select(columns) \\\n", 395 | " .rdd" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 23, 401 | "id": "436aca6e", 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/plain": [ 407 | "[Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2020, 1, 23, 13, 10, 15), PULocationID=74, DOLocationID=130, trip_distance=12.77),\n", 408 | " Row(VendorID=None, lpep_pickup_datetime=datetime.datetime(2020, 1, 20, 15, 9), PULocationID=67, DOLocationID=39, trip_distance=8.0),\n", 409 | " Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2020, 1, 15, 20, 23, 41), PULocationID=260, DOLocationID=157, trip_distance=1.27),\n", 410 | " Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2020, 1, 5, 16, 32, 26), PULocationID=82, DOLocationID=83, trip_distance=1.25),\n", 411 | " Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2020, 1, 29, 19, 22, 42), PULocationID=166, DOLocationID=42, trip_distance=1.84)]" 412 | ] 413 | }, 414 | "execution_count": 23, 415 | "metadata": {}, 416 | "output_type": "execute_result" 417 | } 418 | ], 419 | "source": [ 420 | "duration_rdd.take(5)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 29, 426 | "id": "cbdfa98f", 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "import pandas as pd\n", 431 | "\n", 432 | "#model = ...\n", 433 | "\n", 434 | "def model_predict(df):\n", 435 | "# y_pred = model.predict(df)\n", 436 | " y_pred = df.trip_distance * 5\n", 437 | " return y_pred\n", 438 | 
"\n", 439 | "\n", 440 | "def apply_model_in_batch(rows):\n", 441 | " df = pd.DataFrame(rows, columns=columns)\n", 442 | " predictions = model_predict(df)\n", 443 | " df['predicted_duration'] = predictions\n", 444 | "\n", 445 | " for row in df.itertuples():\n", 446 | " yield row" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 31, 452 | "id": "b00263cf", 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "data": { 457 | "text/plain": [ 458 | "[Pandas(Index=0, VendorID=2.0, lpep_pickup_datetime=Timestamp('2020-01-23 13:10:15'), PULocationID=74, DOLocationID=130, trip_distance=12.77, predicted_duration=63.849999999999994),\n", 459 | " Pandas(Index=1, VendorID=nan, lpep_pickup_datetime=Timestamp('2020-01-20 15:09:00'), PULocationID=67, DOLocationID=39, trip_distance=8.0, predicted_duration=40.0),\n", 460 | " Pandas(Index=2, VendorID=2.0, lpep_pickup_datetime=Timestamp('2020-01-15 20:23:41'), PULocationID=260, DOLocationID=157, trip_distance=1.27, predicted_duration=6.35),\n", 461 | " Pandas(Index=3, VendorID=2.0, lpep_pickup_datetime=Timestamp('2020-01-05 16:32:26'), PULocationID=82, DOLocationID=83, trip_distance=1.25, predicted_duration=6.25),\n", 462 | " Pandas(Index=4, VendorID=2.0, lpep_pickup_datetime=Timestamp('2020-01-29 19:22:42'), PULocationID=166, DOLocationID=42, trip_distance=1.84, predicted_duration=9.200000000000001)]" 463 | ] 464 | }, 465 | "execution_count": 31, 466 | "metadata": {}, 467 | "output_type": "execute_result" 468 | } 469 | ], 470 | "source": [ 471 | "duration_rdd.mapPartitions(apply_model_in_batch).take(5)" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 32, 477 | "id": "cf0ed02e", 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "df_predicts = duration_rdd \\\n", 482 | " .mapPartitions(apply_model_in_batch)\\\n", 483 | " .toDF() \\\n", 484 | " .drop('Index')" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 33, 490 | "id": "a8a6821d", 491 | "metadata": {}, 492 | "outputs": [ 493 | { 494 | "name": "stdout", 495 | "output_type": "stream", 496 | "text": [ 497 | "+------------------+\n", 498 | "|predicted_duration|\n", 499 | "+------------------+\n", 500 | "|63.849999999999994|\n", 501 | "| 40.0|\n", 502 | "| 6.35|\n", 503 | "| 6.25|\n", 504 | "| 9.200000000000001|\n", 505 | "| 3.8|\n", 506 | "|16.599999999999998|\n", 507 | "| 11.05|\n", 508 | "| 4.5|\n", 509 | "| 30.5|\n", 510 | "| 8.7|\n", 511 | "|5.8999999999999995|\n", 512 | "| 11.0|\n", 513 | "| 15.2|\n", 514 | "| 4.25|\n", 515 | "|25.299999999999997|\n", 516 | "|7.8500000000000005|\n", 517 | "| 34.0|\n", 518 | "| 5.300000000000001|\n", 519 | "| 6.15|\n", 520 | "+------------------+\n", 521 | "only showing top 20 rows\n", 522 | "\n" 523 | ] 524 | } 525 | ], 526 | "source": [ 527 | "df_predicts.select('predicted_duration').show()" 528 | ] 529 | } 530 | ], 531 | "metadata": { 532 | "kernelspec": { 533 | "display_name": "Python 3 (ipykernel)", 534 | "language": "python", 535 | "name": "python3" 536 | }, 537 | "language_info": { 538 | "codemirror_mode": { 539 | "name": "ipython", 540 | "version": 3 541 | }, 542 | "file_extension": ".py", 543 | "mimetype": "text/x-python", 544 | "name": "python", 545 | "nbconvert_exporter": "python", 546 | "pygments_lexer": "ipython3", 547 | "version": "3.9.15" 548 | } 549 | }, 550 | "nbformat": 4, 551 | "nbformat_minor": 5 552 | } 553 | -------------------------------------------------------------------------------- /Week 5/Code/03 Taxi Schema.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "60993230", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pyspark\n", 11 | "from pyspark.sql import SparkSession" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "id": "03f3af5b", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "spark = SparkSession.builder \\\n", 22 | " .master(\"local[*]\") \\\n", 23 | " .appName('test') \\\n", 24 | " .getOrCreate()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "ef0f2f45", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# import pandas as pd\n", 35 | "# from pyspark.sql import types\n", 36 | "\n", 37 | "# green_schema = types.StructType([\n", 38 | "# types.StructField(\"VendorID\", types.IntegerType(), True),\n", 39 | "# types.StructField(\"lpep_pickup_datetime\", types.TimestampType(), True),\n", 40 | "# types.StructField(\"lpep_dropoff_datetime\", types.TimestampType(), True),\n", 41 | "# types.StructField(\"store_and_fwd_flag\", types.StringType(), True),\n", 42 | "# types.StructField(\"RatecodeID\", types.IntegerType(), True),\n", 43 | "# types.StructField(\"PULocationID\", types.IntegerType(), True),\n", 44 | "# types.StructField(\"DOLocationID\", types.IntegerType(), True),\n", 45 | "# types.StructField(\"passenger_count\", types.IntegerType(), True),\n", 46 | "# types.StructField(\"trip_distance\", types.DoubleType(), True),\n", 47 | "# types.StructField(\"fare_amount\", types.DoubleType(), True),\n", 48 | "# types.StructField(\"extra\", types.DoubleType(), True),\n", 49 | "# types.StructField(\"mta_tax\", types.DoubleType(), True),\n", 50 | "# types.StructField(\"tip_amount\", types.DoubleType(), True),\n", 51 | "# types.StructField(\"tolls_amount\", types.DoubleType(), True),\n", 52 | "# types.StructField(\"ehail_fee\", types.DoubleType(), True),\n", 53 | "# types.StructField(\"improvement_surcharge\", types.DoubleType(), True),\n", 54 | "# types.StructField(\"total_amount\", types.DoubleType(), True),\n", 55 | "# types.StructField(\"payment_type\", types.IntegerType(), True),\n", 56 | "# types.StructField(\"trip_type\", types.IntegerType(), True),\n", 57 | "# types.StructField(\"congestion_surcharge\", types.DoubleType(), True)\n", 58 | "# ])\n", 59 | "\n", 60 | "# yellow_schema = types.StructType([\n", 61 | "# types.StructField(\"VendorID\", types.IntegerType(), True),\n", 62 | "# types.StructField(\"tpep_pickup_datetime\", types.TimestampType(), True),\n", 63 | "# types.StructField(\"tpep_dropoff_datetime\", types.TimestampType(), True),\n", 64 | "# types.StructField(\"passenger_count\", types.IntegerType(), True),\n", 65 | "# types.StructField(\"trip_distance\", types.DoubleType(), True),\n", 66 | "# types.StructField(\"RatecodeID\", types.IntegerType(), True),\n", 67 | "# types.StructField(\"store_and_fwd_flag\", types.StringType(), True),\n", 68 | "# types.StructField(\"PULocationID\", types.IntegerType(), True),\n", 69 | "# types.StructField(\"DOLocationID\", types.IntegerType(), True),\n", 70 | "# types.StructField(\"payment_type\", types.IntegerType(), True),\n", 71 | "# types.StructField(\"fare_amount\", types.DoubleType(), True),\n", 72 | "# types.StructField(\"extra\", types.DoubleType(), True),\n", 73 | "# types.StructField(\"mta_tax\", types.DoubleType(), True),\n", 74 | "# types.StructField(\"tip_amount\", types.DoubleType(), True),\n", 
75 | "# types.StructField(\"tolls_amount\", types.DoubleType(), True),\n", 76 | "# types.StructField(\"improvement_surcharge\", types.DoubleType(), True),\n", 77 | "# types.StructField(\"total_amount\", types.DoubleType(), True),\n", 78 | "# types.StructField(\"congestion_surcharge\", types.DoubleType(), True)\n", 79 | "# ])" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "id": "1f3183f7", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "processing data for 2020/1\n", 93 | "processing data for 2020/2\n", 94 | "processing data for 2020/3\n", 95 | "processing data for 2020/4\n", 96 | "processing data for 2020/5\n", 97 | "processing data for 2020/6\n", 98 | "processing data for 2020/7\n", 99 | "processing data for 2020/8\n", 100 | "processing data for 2020/9\n", 101 | "processing data for 2020/10\n", 102 | "processing data for 2020/11\n", 103 | "processing data for 2020/12\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "year = 2020\n", 109 | "\n", 110 | "for month in range(1, 13):\n", 111 | " print(f'processing data for {year}/{month}')\n", 112 | "\n", 113 | " input_path = f'D:/data/raw/green/{year}/{month:02d}/'\n", 114 | " output_path = f'D:/data/pq/green/{year}/{month:02d}/'\n", 115 | "\n", 116 | " df_green = spark.read \\\n", 117 | " .option(\"header\", \"true\") \\\n", 118 | " .option(\"inferSchema\", \"true\") \\ # .schema(green_schema)\n", 119 | " .csv(input_path)\n", 120 | "\n", 121 | " df_green \\\n", 122 | " .repartition(4) \\\n", 123 | " .write.parquet(output_path)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 5, 129 | "id": "0ee5b906", 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "processing data for 2021/1\n", 137 | "processing data for 2021/2\n", 138 | "processing data for 2021/3\n", 139 | "processing data for 2021/4\n", 140 | "processing data for 2021/5\n", 141 | "processing data for 2021/6\n", 142 | "processing data for 2021/7\n", 143 | "processing data for 2021/8\n" 144 | ] 145 | }, 146 | { 147 | "ename": "AnalysisException", 148 | "evalue": "Path does not exist: file:/D:/data/raw/green/2021/08", 149 | "output_type": "error", 150 | "traceback": [ 151 | "\u001b[1;31m--------------------\u001b[0m", 152 | "\u001b[1;31mAnalysisException\u001b[0mTraceback (most recent call last)", 153 | "Cell \u001b[1;32mIn[5], line 9\u001b[0m\n\u001b[0;32m 6\u001b[0m input_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mD:/data/raw/green/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00myear\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmonth\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m02d\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m 7\u001b[0m output_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mD:/data/pq/green/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00myear\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmonth\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m02d\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m----> 9\u001b[0m df_green \u001b[38;5;241m=\u001b[39m \u001b[43mspark\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m 
\u001b[49m\u001b[43m\\\u001b[49m\n\u001b[0;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moption\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mheader\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtrue\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m\\\u001b[49m\n\u001b[0;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moption\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43minferSchema\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtrue\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m\\\u001b[49m\n\u001b[0;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcsv\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 14\u001b[0m df_green \\\n\u001b[0;32m 15\u001b[0m \u001b[38;5;241m.\u001b[39mrepartition(\u001b[38;5;241m4\u001b[39m) \\\n\u001b[0;32m 16\u001b[0m \u001b[38;5;241m.\u001b[39mwrite\u001b[38;5;241m.\u001b[39mparquet(output_path)\n", 154 | "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pyspark\\sql\\readwriter.py:410\u001b[0m, in \u001b[0;36mDataFrameReader.csv\u001b[1;34m(self, path, schema, sep, encoding, quote, escape, comment, header, inferSchema, ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, dateFormat, timestampFormat, maxColumns, maxCharsPerColumn, maxMalformedLogPerPartition, mode, columnNameOfCorruptRecord, multiLine, charToEscapeQuoteEscaping, samplingRatio, enforceSchema, emptyValue, locale, lineSep, pathGlobFilter, recursiveFileLookup, modifiedBefore, modifiedAfter, unescapedQuoteHandling)\u001b[0m\n\u001b[0;32m 408\u001b[0m path \u001b[38;5;241m=\u001b[39m [path]\n\u001b[0;32m 409\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(path) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlist\u001b[39m:\n\u001b[1;32m--> 410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_df(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jreader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcsv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_spark\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jvm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPythonUtils\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtoSeq\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 411\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(path, RDD):\n\u001b[0;32m 412\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfunc\u001b[39m(iterator):\n", 155 | "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py:1321\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 1315\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[0;32m 1316\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[0;32m 1317\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[0;32m 1318\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[0;32m 1320\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[1;32m-> 1321\u001b[0m return_value \u001b[38;5;241m=\u001b[39m \u001b[43mget_return_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1322\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1324\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[0;32m 1325\u001b[0m temp_arg\u001b[38;5;241m.\u001b[39m_detach()\n", 156 | "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pyspark\\sql\\utils.py:117\u001b[0m, in \u001b[0;36mcapture_sql_exception..deco\u001b[1;34m(*a, **kw)\u001b[0m\n\u001b[0;32m 113\u001b[0m converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n\u001b[0;32m 114\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(converted, UnknownException):\n\u001b[0;32m 115\u001b[0m \u001b[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001b[39;00m\n\u001b[0;32m 116\u001b[0m \u001b[38;5;66;03m# JVM exception message.\u001b[39;00m\n\u001b[1;32m--> 117\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m converted \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[0;32m 118\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 119\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n", 157 | "\u001b[1;31mAnalysisException\u001b[0m: Path does not exist: file:/D:/data/raw/green/2021/08" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "year = 2021\n", 163 | "\n", 164 | "for month in range(1, 13):\n", 165 | " print(f'processing data for {year}/{month}')\n", 166 | "\n", 167 | " input_path = f'D:/data/raw/green/{year}/{month:02d}/'\n", 168 | " output_path = f'D:/data/pq/green/{year}/{month:02d}/'\n", 169 | "\n", 170 | " df_green = spark.read \\\n", 171 | " .option(\"header\", \"true\") \\\n", 172 | " .option(\"inferSchema\", \"true\") \\\n", 173 | " .csv(input_path)  # alternative: .schema(green_schema)\n", 174 | "\n", 175 | " df_green \\\n", 176 | " .repartition(4) \\\n", 177 | " .write.parquet(output_path)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 6, 183 | "id": "1fa4ae14", 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "processing data for 2020/1\n", 191 | "processing data for 2020/2\n", 192 | "processing data for 2020/3\n", 193 | "processing data for 2020/4\n", 194 | "processing data for 2020/5\n", 195 | "processing data for 2020/6\n", 196 | "processing data for 2020/7\n", 197 | "processing data for 2020/8\n", 198 | "processing data for 2020/9\n", 199 | "processing data for 2020/10\n", 200 | "processing data
for 2020/11\n", 201 | "processing data for 2020/12\n" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "year = 2020\n", 207 | "\n", 208 | "for month in range(1, 13):\n", 209 | " print(f'processing data for {year}/{month}')\n", 210 | "\n", 211 | " input_path = f'D:/data/raw/yellow/{year}/{month:02d}/'\n", 212 | " output_path = f'D:/data/pq/yellow/{year}/{month:02d}/'\n", 213 | "\n", 214 | " df_green = spark.read \\\n", 215 | " .option(\"header\", \"true\") \\\n", 216 | " .option(\"inferSchema\", \"true\") \\\n", 217 | " .csv(input_path)  # alternative: .schema(yellow_schema)\n", 218 | "\n", 219 | " df_green \\\n", 220 | " .repartition(4) \\\n", 221 | " .write.parquet(output_path)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 7, 227 | "id": "738bbef7", 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "processing data for 2021/1\n", 235 | "processing data for 2021/2\n", 236 | "processing data for 2021/3\n", 237 | "processing data for 2021/4\n", 238 | "processing data for 2021/5\n", 239 | "processing data for 2021/6\n", 240 | "processing data for 2021/7\n", 241 | "processing data for 2021/8\n" 242 | ] 243 | }, 244 | { 245 | "ename": "AnalysisException", 246 | "evalue": "Path does not exist: file:/D:/data/raw/yellow/2021/08", 247 | "output_type": "error", 248 | "traceback": [ 249 | "\u001b[1;31m--------------------\u001b[0m", 250 | "\u001b[1;31mAnalysisException\u001b[0mTraceback (most recent call last)", 251 | "Cell \u001b[1;32mIn[7], line 9\u001b[0m\n\u001b[0;32m 6\u001b[0m input_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mD:/data/raw/yellow/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00myear\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmonth\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m02d\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m 7\u001b[0m output_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mD:/data/pq/yellow/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00myear\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmonth\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m02d\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m----> 9\u001b[0m df_green \u001b[38;5;241m=\u001b[39m \u001b[43mspark\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m \u001b[49m\u001b[43m\\\u001b[49m\n\u001b[0;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moption\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mheader\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtrue\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m\\\u001b[49m\n\u001b[0;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moption\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43minferSchema\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtrue\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m
\u001b[49m\u001b[43m\\\u001b[49m\n\u001b[0;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcsv\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 14\u001b[0m df_green \\\n\u001b[0;32m 15\u001b[0m \u001b[38;5;241m.\u001b[39mrepartition(\u001b[38;5;241m4\u001b[39m) \\\n\u001b[0;32m 16\u001b[0m \u001b[38;5;241m.\u001b[39mwrite\u001b[38;5;241m.\u001b[39mparquet(output_path)\n", 252 | "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pyspark\\sql\\readwriter.py:410\u001b[0m, in \u001b[0;36mDataFrameReader.csv\u001b[1;34m(self, path, schema, sep, encoding, quote, escape, comment, header, inferSchema, ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, dateFormat, timestampFormat, maxColumns, maxCharsPerColumn, maxMalformedLogPerPartition, mode, columnNameOfCorruptRecord, multiLine, charToEscapeQuoteEscaping, samplingRatio, enforceSchema, emptyValue, locale, lineSep, pathGlobFilter, recursiveFileLookup, modifiedBefore, modifiedAfter, unescapedQuoteHandling)\u001b[0m\n\u001b[0;32m 408\u001b[0m path \u001b[38;5;241m=\u001b[39m [path]\n\u001b[0;32m 409\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(path) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlist\u001b[39m:\n\u001b[1;32m--> 410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_df(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jreader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcsv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_spark\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jvm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPythonUtils\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtoSeq\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 411\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(path, RDD):\n\u001b[0;32m 412\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfunc\u001b[39m(iterator):\n", 253 | "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py:1321\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 1315\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[0;32m 1316\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[0;32m 1317\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[0;32m 1318\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[0;32m 1320\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[1;32m-> 1321\u001b[0m return_value \u001b[38;5;241m=\u001b[39m \u001b[43mget_return_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1322\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1324\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[0;32m 1325\u001b[0m temp_arg\u001b[38;5;241m.\u001b[39m_detach()\n", 254 | "File \u001b[1;32m~\\anaconda3\\lib\\site-packages\\pyspark\\sql\\utils.py:117\u001b[0m, in \u001b[0;36mcapture_sql_exception..deco\u001b[1;34m(*a, **kw)\u001b[0m\n\u001b[0;32m 113\u001b[0m converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n\u001b[0;32m 114\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(converted, UnknownException):\n\u001b[0;32m 115\u001b[0m \u001b[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001b[39;00m\n\u001b[0;32m 116\u001b[0m \u001b[38;5;66;03m# JVM exception message.\u001b[39;00m\n\u001b[1;32m--> 117\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m converted \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[0;32m 118\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 119\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n", 255 | "\u001b[1;31mAnalysisException\u001b[0m: Path does not exist: file:/D:/data/raw/yellow/2021/08" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "year = 2021\n", 261 | "\n", 262 | "for month in range(1, 13):\n", 263 | " print(f'processing data for {year}/{month}')\n", 264 | "\n", 265 | " input_path = f'D:/data/raw/yellow/{year}/{month:02d}/'\n", 266 | " output_path = f'D:/data/pq/yellow/{year}/{month:02d}/'\n", 267 | "\n", 268 | " df_green = spark.read \\\n", 269 | " .option(\"header\", \"true\") \\\n", 270 | " .option(\"inferSchema\", \"true\") \\\n", 271 | " .csv(input_path)  # alternative: .schema(yellow_schema)\n", 272 | "\n", 273 | " df_green \\\n", 274 | " .repartition(4) \\\n", 275 | " .write.parquet(output_path)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 8, 281 | "id": "36675614", 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "df = spark.read.parquet('D:/data/pq/yellow/2021/01/')" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 9, 291 | "id": "fda63bac", 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "root\n", 299 | " |-- VendorID: integer (nullable = true)\n", 300 | " |-- tpep_pickup_datetime: timestamp (nullable = true)\n", 301 | " |-- tpep_dropoff_datetime: timestamp (nullable = true)\n", 302 | " |-- passenger_count: integer (nullable = true)\n", 303 | " |-- trip_distance: double (nullable = true)\n", 304 | " |-- RatecodeID: integer (nullable = true)\n", 305 | " |-- store_and_fwd_flag: string (nullable = true)\n", 306 | " |-- PULocationID: integer (nullable = true)\n", 307 | " |-- DOLocationID: integer (nullable = true)\n", 308 | " |-- payment_type: integer (nullable = true)\n", 309 | " |-- fare_amount: double (nullable = true)\n", 310 | " |-- extra: double (nullable = true)\n", 311 | " |-- mta_tax: double (nullable = true)\n", 312 | " |-- tip_amount: double (nullable = true)\n", 313 | " |-- tolls_amount: double (nullable = true)\n", 314 | " |-- improvement_surcharge: double (nullable = true)\n", 315 | " |-- total_amount: double (nullable = true)\n", 316 | " |-- congestion_surcharge: double (nullable = true)\n", 317 | "\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "df.printSchema()" 323 | ] 324 | } 325 | ],
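Both 2021 loops above stop with `AnalysisException: Path does not exist` at 2021/08, because the raw taxi data on disk only runs through July 2021. A small guard lets the loop cover whatever months are actually present instead of crashing — a sketch assuming the same local `D:/data/raw/...` layout and an already-created `spark` session:

```python
import os

# Sketch only: skip months that were never downloaded instead of crashing.
year = 2021

for month in range(1, 13):
    input_path = f'D:/data/raw/yellow/{year}/{month:02d}/'
    output_path = f'D:/data/pq/yellow/{year}/{month:02d}/'

    # The raw data is read from the local filesystem, so a plain
    # existence check is enough to detect a missing month
    if not os.path.exists(input_path):
        print(f'no raw data for {year}/{month}, skipping')
        continue

    print(f'processing data for {year}/{month}')

    df_yellow = spark.read \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .csv(input_path)

    df_yellow.repartition(4).write.parquet(output_path)
```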
326 | "metadata": { 327 | "kernelspec": { 328 | "display_name": "Python 3 (ipykernel)", 329 | "language": "python", 330 | "name": "python3" 331 | }, 332 | "language_info": { 333 | "codemirror_mode": { 334 | "name": "ipython", 335 | "version": 3 336 | }, 337 | "file_extension": ".py", 338 | "mimetype": "text/x-python", 339 | "name": "python", 340 | "nbconvert_exporter": "python", 341 | "pygments_lexer": "ipython3", 342 | "version": "3.9.15" 343 | } 344 | }, 345 | "nbformat": 4, 346 | "nbformat_minor": 5 347 | } 348 | -------------------------------------------------------------------------------- /Week 5/Data Engineering Zoomcamp Week 5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "26d71c85", 6 | "metadata": {}, 7 | "source": [ 8 | "# Data Engineering Zoom Camp - Detailed Week 5 Notes" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "f84dce6a", 14 | "metadata": {}, 15 | "source": [ 16 | "# 1) Batch Processing\n", 17 | "\n", 18 | "There are two key approaches to processing data:\n", 19 | "\n", 20 | "- Batch processing\n", 21 | "- Stream processing (sometimes called real-time processing)\n", 22 | "\n", 23 | "In some circles, you’ll hear the first talked about as being the old way of doing things and the second as the more modern approach. The same sort of language is used when comparing monolithic apps to microservices or on-premise solutions to the cloud.\n", 24 | "\n", 25 | "In reality, things aren’t quite that simple in this case or in those other cases mentioned. Stream processing isn’t so much a replacement for batch processing as it is a different approach, and it’s not without its challenges.\n", 26 | "\n", 27 | "### What is Batch Processing?\n", 28 | "\n", 29 | "Batch processing is a term used to describe collecting, modifying, or exporting multiple data records at a regular cadence with downtime in between batches. Because large amounts of data can be processed all at once in these batches, it can be a very efficient approach and is best suited for handling frequent, repetitive tasks. It is the most common form of data processing and fits many businesses' data needs.\n", 30 | "\n", 31 | "Many businesses face increasingly complicated and diverse data challenges due to the sheer magnitude of data available. Batch processing has increased in sophistication, and is also often used in conjunction with other processing techniques for modern analysis. While batch processing used to be by far the most common and widely used method of data processing, recently real-time or near real-time stream processing has proven to be a worthy competitor. As traditional batch systems run overnight to process data accumulated during the day, there is naturally a delta between the real world and what the data actually describes. 
Advanced Batch Processing partially solves this issue, but even the most advanced systems cannot compete with stream processing for real-time continuous data.\n", 32 | "\n", 33 | "\n", 34 | "### Process Flow\n", 35 | "\n", 36 | "The general flow of batch processing can be broken down into the following steps:\n", 37 | "\n", 38 | "- `Data acquisition`: This involves obtaining data from various sources, such as databases, flat files, or web services.\n", 39 | "\n", 40 | "- `Data preparation`: This involves cleaning, filtering, and transforming data to make it ready for processing.\n", 41 | "\n", 42 | "- `Batch scheduling`: This involves scheduling batch processing jobs using a batch scheduling tool, which automates the execution of the jobs.\n", 43 | "\n", 44 | "- `Batch processing`: This involves executing the batch processing jobs, which can include tasks such as data integration, data transformation, and data analysis.\n", 45 | "\n", 46 | "- `Error handling and recovery`: This involves detecting and handling errors that may occur during batch processing, such as missing or invalid data.\n", 47 | "\n", 48 | "- `Reporting and analysis`: This involves generating reports and analyzing the processed data using business intelligence and analytics tools.\n", 49 | "\n", 50 | "- `Archiving and storage`: This involves archiving and storing the processed data for future use or reference.\n", 51 | "\n", 52 | "\n", 53 | "### Tech Stack\n", 54 | "\n", 55 | "Batch processing is a technique that can be used with a wide range of technologies and tools, depending on the specific requirements and constraints of the application. Some of the common technologies used for batch processing are:\n", 56 | "\n", 57 | "`Batch scheduling tools`: These tools are used to schedule and automate batch processing jobs. Some popular batch scheduling tools include Control-M, IBM Tivoli Workload Scheduler, and Autosys.\n", 58 | "\n", 59 | "`Data integration tools`: These tools are used to extract, transform, and load (ETL) data from various sources into a target system. Some popular data integration tools include Informatica, Talend, and SSIS.\n", 60 | "\n", 61 | "`Scripting languages`: Scripting languages like Python, Perl, and shell scripts are often used to write the code for batch processing tasks such as data transformation, file handling, and error handling.\n", 62 | "\n", 63 | "`Relational database management systems (RDBMS)`: RDBMS such as Oracle, MySQL, and SQL Server are commonly used for storing and processing large volumes of data.\n", 64 | "\n", 65 | "`Big data technologies`: Big data technologies like Apache Hadoop, Spark, and Hive are used for processing large volumes of unstructured or semi-structured data.\n", 66 | "\n", 67 | "`Workflow automation tools`: Workflow automation tools like Apache Airflow, Luigi, and Azkaban are used for automating the workflow of batch processing jobs.\n", 68 | "\n", 69 | "`Business intelligence and analytics tools`: Business intelligence and analytics tools like Tableau, QlikView, and Power BI are used for analyzing and visualizing the processed data.\n", 70 | "\n", 71 | "### Advantages\n", 72 | "\n", 73 | "1) Efficiency\n", 74 | "\n", 75 | "Batch processing allows a company to process data when computing or other resources are available. For example, a common schedule is to process data overnight when the database and servers aren't being used by employees. 
If data isn't frequently updated, one can simply change the batch processing schedule to make it less frequent as well.\n", 76 | "\n", 77 | "2) Simplicity\n", 78 | "\n", 79 | "Compared to stream processing, batch processing is usually less complex and doesn't require special hardware or system support for incoming data. Batch processing systems typically require less maintenance than stream processing.\n", 80 | "\n", 81 | "3) Processing Speed\n", 82 | "\n", 83 | "Because batch processing allows companies to process large amounts of data at once, it shortens overall processing time and delivers data that companies can use in a timely fashion.\n", 84 | "\n", 85 | "\n", 86 | "### Disadvantages\n", 87 | "\n", 88 | "1) Processing delays\n", 89 | "\n", 90 | "Batch processing can cause delays in processing large volumes of data or transactions, which may impact the overall performance of the system.\n", 91 | "\n", 92 | "2) Limited real-time processing\n", 93 | "\n", 94 | "Batch processing is limited to processing data or transactions in a batch mode, which may not be suitable for applications that require real-time processing.\n", 95 | "\n", 96 | "3) Security\n", 97 | "\n", 98 | "Batch processing may pose security risks, as large volumes of data or transactions are processed at once, making it easier for cyber attackers to access sensitive information.\n", 99 | "\n", 100 | "\n", 101 | "### Applications\n", 102 | "\n", 103 | "Batch processing is a widely used technique for processing large volumes of data or transactions in various industries. Here are some real-world examples of batch processing:\n", 104 | "\n", 105 | "`Banking and Finance`: In banking and finance, batch processing is used to process large volumes of financial transactions, such as clearing and settlement of trades, reconciling account balances, and generating financial reports. These tasks are typically run overnight, and the results are made available to users the following morning.\n", 106 | "\n", 107 | "`Retail and E-commerce`: In retail and e-commerce, batch processing is used to update inventory levels, process customer orders, and generate reports. For example, at the end of the day, a retailer may run a batch process to update the inventory levels in their system based on the sales that were made during the day.\n", 108 | "\n", 109 | "`Healthcare`: In healthcare, batch processing is used for tasks such as claims processing, billing, and patient record updates. For example, a health insurer may run a batch process at the end of the day to process claims submitted by healthcare providers during the day.\n", 110 | "\n", 111 | "`Manufacturing`: In manufacturing, batch processing is used to manage production runs of batches of products. For example, a food manufacturer may run a batch process to produce a specific quantity of a product, with each batch consisting of a set number of units.\n", 112 | "\n", 113 | "`Marketing`: In marketing, batch processing is used to manage large volumes of customer data, such as contact information and purchase history. 
For example, a company may run a batch process to update their marketing database with the latest customer information, allowing them to target specific customers with personalized marketing campaigns.\n", 114 | "\n", 115 | "Overall, batch processing is a common technique used in various industries to process large volumes of data or transactions efficiently and effectively.\n", 116 | "\n", 117 | "\n", 118 | "### Advanced Batch Processing\n", 119 | "\n", 120 | "Traditionally, batch processing was usually configured to run sequentially. Each job was processed one after another on a single machine. The need for more sophistication led to the rise of concurrent and parallel batch processing.\n", 121 | "\n", 122 | "Concurrent Batch Processing\n", 123 | "\n", 124 | "Concurrent batch processing typically refers to jobs that run batches partially overlapping in time. This overlap allows for a piece of the data to always be analyzed at a given time. Concurrent batch processing gives the illusion of parallelism without requiring more than a single CPU core. Due to this concurrent \"multi-threading\" behavior, the architecture for concurrent batch processing must be designed with fault tolerance in mind. As batches are not run one after another, a single batch failure could cause a domino effect on other batches should the architecture be configured poorly.\n", 125 | "\n", 126 | "Parallel Batch Processing\n", 127 | "\n", 128 | "Parallel batch processing takes a similar approach to concurrent batch processing; however, instead of overlapping parts of batches over time, entire batches are scheduled in parallel. By taking advantage of the relative cheapness of multicore machines in the modern age, parallel batch processing can multitask effectively.\n", 129 | "\n", 130 | "Modern Batch Processing\n", 131 | "\n", 132 | "Modern-day batch processing methods often use a combination of both concurrent and parallel batch processing, also called parallel concurrent batch processing. This approach balances how each CPU core handles multiple tasks against how each worker system handles a single task; when properly configured, it is a state-of-the-art solution. Institutions that require greater stability and security, such as the financial sector, most commonly use parallel concurrent batch processing. For the most important data, multiple redundant batches are often run so that even if one batch fails, the others can cover for the failure.\n", 133 | "\n", 134 | "As mentioned earlier, live data streaming has traditionally been a challenge for batch processing. While attempts have been made to use concurrent and parallel batch processing methods to analyze \"microbatches\" stacked on top of each other on extremely powerful machines, the use case for complex architectures like this is niche. For the majority of live data cases, stream processing is still preferred. The main business use case for batch processing in this application is when such large quantities of data need to be analyzed that stream data processing is not a viable option." 
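The concurrent-versus-parallel distinction above maps neatly onto Python's standard library: a thread pool interleaves batches in time (concurrency), while a process pool runs whole batches simultaneously on separate CPU cores (parallelism). A toy sketch follows — `process_batch` and the batch list are hypothetical stand-ins, not part of any framework named above:

```python
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def process_batch(batch_id):
    # Hypothetical batch job: in practice this would load, transform,
    # and write one batch of records.
    return batch_id, sum(range(1_000_000))

if __name__ == '__main__':
    batches = range(8)

    # Concurrent: batches overlap in time, interleaved by the scheduler
    # on however many cores happen to be available.
    with ThreadPoolExecutor(max_workers=4) as pool:
        concurrent_results = list(pool.map(process_batch, batches))

    # Parallel: whole batches run at the same time in separate processes,
    # each on its own CPU core.
    with ProcessPoolExecutor(max_workers=4) as pool:
        parallel_results = list(pool.map(process_batch, batches))
```

For CPU-bound work like this, only the process pool yields a real speedup in CPython; the thread pool merely interleaves the batches, which is exactly the "illusion of parallelism" the notes describe.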
135 | ] 136 | }, 137 | { 138 | "attachments": { 139 | "image.png": { 140 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAwsAAAFyCAIAAAA1dtHNAAAgAElEQVR4nOydeXxTVfr/n5s9adI2aQoplKVQqUBFLEwtWhEXYGYAGRAVxQFkHEAdBkfg64LIIII6wCiDiqBTAcVxQfwxqAg4LlgVKhRBFoFKKRS6pU2XNHvu/f1x2sPlZmnaJmmW5/3ij+Tm3KXJ4XM+5znPOYdpMFsAQRAEQRAE4SHq6gdAEARBEASJONAhIQiCIAiCCEGHhCAIgiAIIgQdEoIgCIIgiBB0SAiCIAiCIELQISEIgiAIgghBh4QgCIIgCCJEEkihhtkDQv0cSIyRtPG050GsSEiAeNYfrDxI4ND6g9UGoXhtlfyDMSQEQRAEQRAh6JAQBEEQBEGEoENCEARBEAQRgg4JQRAEQRBECDokBEEQBEEQIeiQEARBEARBhKBDQhAEQRAEEYIOCUEQBEEQRAg6JARBEARBECHokBAEQRAEQYSgQ0IQBEEQBBGCDglBEARBEEQIOiQEQRAEQRAh6JAQBEEQBEGEoENCEARBEAQRgg4JQRAEQRBECDokBEEQBEEQIZKufoCYQr+gAADsxwqbdhf4KaadsVys7+k2XjRtXhKuR0MQBEEQIZqxs+TZ+QBgXDOrq58l4ogjh6QZ2/Lz+7cvXkvKs3JlfbMBwHpoj8tY7utEZc5oz1ukPvUfxeB8a/FeWv8UQ26WGDJclaUd+TOQgFHlTVDdOEnaIxMAWLPJ3VjbpnmNUjzrGIIgEYhEn54waqq0zyB6xFl2ovnr9/w0K6FGnp1PWy4/SPTpymFjAMDdUGPZv7MDBaKROHJIydOXkRcSQ4af4E3SlIWJk+aT17Q11Yyf69X9BIJicD7wzBMSBiT69NQn35UYMnjHMgBAmTM6aeoT5Q8M4Bcmhti/941wsI5FNe218vKsXPWt0ySGviK1FgCcl0qcZScatq32WphU71hqtKIUiT5dO3sV+a/KR5kzOnHSfFdlacWCkV3yYAHiMparbpgoy8wBAHFSqmf9pH9d7bqHu+D5QkMcOSSK6oaJfhySevT04N7OXVch1qU5SoqDe1nED9QeuesqHOeOkYPixBSJIYM0KnyodY7e8BLWsSilXVaelNcvKJD2HnjFQUOGMme0evR0894tnj6JVG9XZSk6pC5Eok/vtnS7WJdG3roqS121Fzm7VaJPF6mTxbq0K+tAhFK77hHDqi8ZmTJx8qMCtdSMnUXskbV4byzVtHh0SCK1Vr+gwOt4hHbGcs8WtJNcmpcb3Asi/tGMnUXkxna8sGblvYJP5Vkx+HNgHYtS2mXlVXkTdHPWMDIlALBmk6PsOGe3AoBEny7tPVCk1iZOmi9KSMLsxgiE2iNr8d76zc8IwtVk6K2LHq0duIzlTZ9uTJw037MN1YyfCwDuuooYG+iPO4fEOayMTKnIzpfo0z1HVVQ3TKRluuLpkCBAsg4BwLRxkeen9lNF/Lc05wxBwkx7rXzytCVElyyF22vXzxcUTvnLK2JdmnrMTFdlafRGQ2MS7YzlxB55/nAEl7Hc1yBppNGwbbVy2Bhp74HKnNGqvAkkXKRfUED+wPqty7v6AYNM3Dkk+5lDsj6DRWpt8oxnBW6XBJA4h9V+5pDnaHGHSX3qP5KUnq7ai54iKNGnJ894Vj5gOOkvuusqmr/5IFr+q0Q7ZAaHODGl5e34ueoxM8lrkhCQtmYfADR8uMqyf6d2xnLFkJvpW3qFhFFT6ZCH58+nGTtLMWy0NK0/ja5zDqvz/Mn6d1fwjRq9kdtUpRk/l9YH1mwi4ybyrFzN+LmyvtnkOvQ4vYJnHSNHbEe/MW1eop2xXH719fQ5nedPmjY9LXCKqrwJiRPn8cs07lgnz8pVDLnZa9VFOk+7rLz/VtZ+qqh62WRfIyBI10I63qzZ5NUeeSmfNyHprkUAULFgJGkjSI4a0aX2qgoAaH73oLT3QGKvyX9tXwNhmrGzVDdMpIXddRVNn7wuqE7GNbNITUu6a5Fl/055Vq4iOwbH1whx55A4u9Xy/Q71mJmCMJJEn07qcfPX74v1PYN4R0lKT69jzCJ1Mqln9IhYl5Y4ab5swDBskDqD23iRvEiZ92rV0om+iqnHzOT/LlRuKORTeVZu0l2LaElxUip5oV9QQDOjXZWlYp2B/HzSPoOo8xbcAgAYmVKWmZP6xDuVi269XPcMGQCQdNcisc7Arw9k3ETaZ5AiO9/zONtUR5XLs46RIwqAHuuKBH+atPdA/WNvXpwzhB7hD9/QMinzXiPpTb6+QCSctNnKuozl1qJdqvzJIrU2acpC7GhFCPKsXNLhsZ8+GOAp4qRU8t9ZM3ZW4uRHBeOt7VIVfreHQP5rS9OzPGuI4fk9gsJiXVry9GWOc8f43ouOtUkMGSkPrZVlXsfIlLE3vkaIxxUjTZuXsGYTI1Mmz3iWHkye8axIrWXNprCN4ovUWlfludp1D1+Y1uvCtF7Vz97prqsAAMXgfBz66QxNn25gzSYAkGXm9FhXpJ3hPfDb8OGq+i1LLYXbyVtL4fb6LUvJP34x9ZiZYp3BUVJsLd5rLd5LkkWSpiwk9shSuP3CtF4VC0aWPzDAWrwXAJQ5o+n4iKPkcOPHayvmjyA/8YVpvcjtSPdL8DxincF+5hCtD+Y9m8hxZc5o5/mT9VuWkuONH68lxwNJXCAqad6ziTxDxfwRzvMnAUCk1vLrmPaBFYxMyTms9C7Vz97pKClGexRS+Fbef0nayjrKjvspZv5yK3nBn0yOdC1kmRgAsB8rbO+5JMvedryQiA852C5VkfYeaC3eW/3snaRk/ZalnMMKAJpxsyX6dEFhiaGvpXA7LUzvSHKM+DRsW02URJU/mYhM7I2vEeIuhkQgYSTSmNlPFUn06SROaPl+R9iewVVZWvnkGPqWHydX3TAR4+QdxmUsN721OHnaErEujWRmJIy6x3n+pOX7HfxvlQSENWNnqfInA4Dj7BGv37mrsrTm+fsEKWsJN98NAI6SYn6H3rhmVs8NR0VqrfrWaaTL5dndr10/Xz5ohFiXxqg0go9sxwr5nTDT5iWqGyaK1FpXZSk/EtawbXXCzXeLdWmMTNHmV+Guq+AncbuM5Y071qXMew0A5Nn55O+lsxMa3nuBfgP2U0VVSyemrdkXFVNsopSmTzeQn5hYeevB3b66Z7SVdZw+5OeC9lNFJIeSDMogkYCv/0GqvAk0IE0QhGrAR4Jau1RFsEYa+Q+ePH0ZI1Nqxs0R1Dd+CAoAjGtmpb912ld1omNtEKPja4R4jCEBgGnzEhKwSb5vMQAkz3iWkSnDGUDyistYToy5INSJtBfL/p2X5uWa92wiwSQShU6evixtzT5V3oR2Xcp5qcRz4klLRoiHn2bN9QDgqVN8OIctwFuTq3XmCp4lPYVM1m8IAHAOK5ryMEOsPBEiYuXT3zrdfdkOzxCyrN+15AXbVOf/mu66ylA8Kt
JhfOVsJN21KHn6Mv4/z1CN7dDeAO8SuCY07S4gqii/+nrBR55Tl/xXJ3pTRh6zE5viNIYEAE2fvJ48fZksM0eVN4EEkBq3v9zVDwXuxloAwJl0QcG0eYlp8xJV3gSaqCgxZOjmrHGbqgR9tXZB1o0FAMWw0TTZliBSJ3uWJ1OWxPqe9mOFdEZ35NA6SwBb1i7Asn8nmQdAgknEyssyc9RjZvLnBPj33EgkQ8dSBbhqLx/vQKS2M6rCmutFam0gQWg/aGevoglSJDMkJrtYceyQdheQlDeSo4pTZGMV0ghJ9Okp816VZeYwMmXyfYv9ZHAHTpsTHlV5E7QPrKA6gmteI15p08o7y04EWHk62ewhQYfuLiXS6PjH+cNnvbZeCPyCkaAqdH3Ixo/XqkdPF6m1iZMfjeptCXwRvw4JAMx7NpERWfK6qx8HCSEuY3nV0okkTyhYuTWNH6/1OupBunTyrFxivt11FfYTPzjOHgEAeXY+ncyPIHz8WHlazdqsumT8N/AxFyTUWA/tIQnXsgHDOn+1SFAViT49cfKjAOA8f7Jh22q2qS55+jKvC+jEAHHtkJp2F8j6XcuoNJylKUICSGR5HjLdAAk6JLzcSSlxN9S0XI03396TxDv/RiaIVS+bTLtWTbsL0tbsiyiH5Kq9KDFkSAwZXtdQRcKMVyt/uZXtN8TPuTTHjqQzIpGAy1hOVs2QXzWs8//FgqIqJBmgwzaajK9xDqtp09Pk7mS/Nv4akjFDnGZqU2rXzzeumRXgQl5hgGgiClyIINJA494dw7J/J7GwZIkaX0hSegKAu64ywm0HzQblL34BABJ9Og7ZdBUkSZ+2efw5HJ6TtCma3z1IXtBp/0gkYD24GwAYmbLNZR3apPOqcnl9pl8OdOD0y/uvFe2i2Zy16x4hkpg8LdZ2vIl3hxRcSC2hyzT7R2LI0C8ooHon0ad3X7aD1N1wLjoQe2jGzkqaslDQkPC/XtvRbzzPotOFAsF2rBAAZJk5nostybNyyXpIZMKIWGfgP4l2xnKxzhD4jcJA0+4CYhmVOaPJRCrN2FkpD63tvuIzXA+pq/C08s1fvwd+W1lV3gSy77qrsrQzExGQoGPavIT8lGRZh6QpCzt8qfaqiiI7n69RZHcaIHNXP93Q3rvT8TV3XQU/rEDWkAQAsS5NvyAiRmOCRVyPsnUAMi3T83j9lqVNuwtcleekvQfKMnN6bb3gqiwli8T7gnNYlTmjlTmjyX8eup6y7XhhhAz5RSny7HxlzujESfNZs4lOmKcDFo6SYv6aDtZDe5KmPsHIlKr8yfJBIwBArEu7MK2X/1vUb36G7AGiHjNTdcNEV2Wpu7FW2iOTkSnEurT6LUvtp4os3+8g2STdV3xmP32QkSvJdjeh+aM7Rc3z95GdNclEKnLQXVeBa2qHFM3YWSKNrvnr9/jxAJKH5Gnl6ViGLDPH8Pwewb4x2hnLE0bdAwCcw1q3seMNMBIiap6/j+xSTFbep+okUie3SxM6oCpkQTgyWZXKYNOnGzsQhaLz15o+eV3wUcO21aoRd0gMGTE21oYOKZg07lhHZxm0OcprO1boLDuhHj2d1lrOYRUsG4h0APuxQmJfBClHXr9el7G8bsMC8qvRXc/avIXLWF69bDLJqBWptbLMy3ehm7Q37S6QGDISRt0jUmvJfBPWbGr8eK20z6BIm9RG/pykuxbJMq8DAM5hs/9ywLR5CdnaCQkR7bLyAFC77hH9ggJp74HS3gO7PfORu66CiAztXHEOa92GBV4DSBJDhtcJU2125JCg4DKWVywYqZ2xXDl8LNEZvjqxZpOj7Lin7fCkvapCot2K7Hxarzx3dQwQOr7mqw9ft3Fh6hPvMDJl8rQljpLDEZ5dECBMg9nSZqGG2QPC8CihQ5U7zlL0KXkBAKF+7Th7xGUsD7w8iETipFRpn0G2n77yWoa8iC6SNp72PBjOiiTRpyuHjSFLhriNF9tczUGelSvrm93eCavkLiKNjsw28jydFAAAd0NN1PWrSL6wYFne8OBZf6JdhTzRjJ2lGT/XM0rnv6dEF08SHHeUFNeue8Sz9vqfSR6rDonWnwisNlQTCB2YJB+IqpDfnf7nJcuQRqMKBRGvrZJ/4sIhdRs8zZxQDwDq5mQAiLrX1cejL++yyx0S0knkWbndnvkIAMx7NoV/rfl4cEiE9lp5gipvgjQ9SzZgWEvabPHepk9ex/QjSiQ7pPAgcEgIdMghxcUom/uNV9R//gt5AQDR9/qGSExeQWIGsjsh/4hEn66d+Rx0NKMTCRyXsbwDeYeW/TsBWjYWTJz8KM1odF4qsR8rjPNQAYIEi7iIIQEHKT+YAKB2hBYgCl8zYfiOggzGkKKItDX7xDqDu67SVXuRs1vFiSlkZWfoogASxFMMqfNI9OnJM55VZOfT3YpidewscDCGhDEkTzCG5AOmxXMQqxGNrxEkdDgvlYh1BrJuJD3Y4YxOJMy4jOWkFdTOWC7rNyRYS8YjCBIfDgmu9BnR+BpBQgbN5aRLzDvOHsElJ6KOLon2IZFJ/Zal0LoDEtJh4sYhIQjiF7RECBIz4H/noIBraiMIgiAIgghBh4QgCIIgCCIEHRKCIAiCIIgQdEgIgiAIgiBC0CEhCIIgCIIIQYeEIAiCIAgiBB0SgiAIgiCIEHRICIIgCIIgQtAhIQiCIAiCCEGHhCAIgiAIIgQdEoIgCIIgiBB0SAiCIAiCIELQISEIgiAIgghBh4QgCIIgCCIEHRKCIAiCIIgQdEgIgiAIgiBC0CEhCIIgCIIIQYeEIAiCIAgiBB0SgiAIgiCIEHRICIIgCIIgQtAhIQiCIAiCCEGHhCAIgiAIIgQdEoIgCIIgiBB0SAiCIAiCIELQISEIgiAIgghBh4QgCIIgCCIEHRKCIAiCIIgQdEgIgiAIgiBC0CEhCIIgCIIIQYeEIAiCIAgiBB0SgiAIgiCIEHRICIIgCIIgQiRd/QAIgiAIgvhDM3YWeWE9tMdlLA/uxSX69NQn37Ud/ca0eUlwrxztoENCEARBkAhFv6BAkZ3PyJTkbfL0ZbbjhTUr7w3iLZTDxkgMGa49m4J4zdgAHRKChBbN2FnqMTMbPlxl2b8z8LO0M5azzQ0N21aH7sEQBIlw0tbskxgyrMV7mz553X6qCHjBpCAiz84HAOuhPUG/crSDDikI+G8CsamLOnptvUBfcw6r/cyhzvTY5Nn5EkOGf3uU8tBa6+EvaBlV3gT1mJmuylKsNggSt6Q+9R+JIcO8ZxN/8Ktpd4FnSXlWLgAQCwUAqrwJ4qRU8DEkRz51nDtGy0v06azZ5DKWy7NyZX2z3Q017erOxTBx5JA0Y2clT1/mrqu4NC+Xf1yiT+++4jORWlu77uGOVQs/TSA2dVEH6aJZCrc7zh4BAMWw0YrB+SkPra1dP79jF5T2yHTXVfgpIM/KVeVPJrcjWPbvVF53u/nLrR27I4IEAnbeIhl5Vq5icL6jpNhPblDPDUftpw+KE1NkmTlk6E2iT9cvKJD2HkgKJE9fZincTrSLXxgAOIe1ctGtxD+Jd
QZ3XWXqU/9RDM4nJ0rTs7BiQFzNZSOBRLEujdhtinb2KpFaCwAdds1+mkDL/p2Wwu11Gxd27MpI+JH1uxYAzF9ubdpd0LS7oGblvazZJMu8jl9GlTch5aG1qrwJgnM1Y2eRf/yDYp3Bv0Mi1xF0DWvXz6c9PII8K1czdha9qUSf7nkv+hgpD60V1HMkPKSt2ddzw9FASpLGTKJPD/UjEZKmLEyaclmISOdNNeKO8NwdaS+a8XMBoP7dFX7KiNRaRXY+a7fUb1na+NFLANBt6XaJoW/9lqUXpvWq37IUAGjXixR2VZ67MK1X48drGZlSM24O/Uhi6MvZrRXzR1TMH8E5rFgxCHEUQyKBREamUN86jbY9SVMWyq8axppNrLleUFg5bAwAeMYb6Uc0SinWGZznT4K36CUAeI09kIbNMwSqypugvO52/oALEmYkhr6cw8r/BRmZgnPYyGtV3oTkaUtE6mTWXK/Kn6y6cZJxzSx6XKxLI8WSpy8jIUl5Vi4jU7oqz3m9lypvQsq818hrMrRHztIvKFDmjL4wrRcAdF+2Q6TW2o5+ox4zk5Qk1idh1D0keVMxbDQdBEyaslAzbjZ5WlX+5PotS73G5JHQIdYZfP3cAjTj5ihzRpP6EwbUo6fbTx+kbzFOGeFIe2TyhYgkJJHX1c/eaT9V1NKIFO2iTYx+QYFYl0YHQ0hnj2QXCQo3f/1e4qT5Yn1Pz48IVPHinDhySCSQyDmsNB4g0adrxs22Fu1S5v7OVXuRliTNDJ07oJuzpvyBAeS1dsZy2jKxZtPFOUNIE+hurKU1mHNYa164n9RsflOnnbFcPWZm7bqHtQ+sIFErzfi5dMjPV9OLhBmRWuuuq6Rv9QsKGJmSqIxEn659YIWrsrR62WSXsTxtzT5pj0wAkGfl6uasYc31RLnIj05ESnHNSOB14wRY9u+07N/Zc8NRd11l5ZNj6HFpj0xXZSl9HrHOIOs3pGL+CAAwrPoyYdQ9rspzpI71WFckSelJSqryJiROmk+D6r22XpBn56NDCicSfTojU3pmfnhN75D1G0J/ZT6asbNk/a41f7lVEEQkH9EftF29OHlWrkittR8r5BcLcOBYnpWrypuA43Hhhy9EjpLDzksl0h6ZEkMG+U1ptJuWkQ8Y7igpppWBdPZIbSSFGz5cRT4ijSBnafL8CAC81uH4JI4ckkittZ8+6Cw7kThpvkSf7jKW6xcUuOsqzV9uVeVPdl0sIcWSpixMnDTfUVJcu+4R0grSKxCLYy3eS7wLCY/L+mYDgHzAcMv3O0wLRpJsJ834uaQS85s6YtiT7lrUuP3lpt0FpB0lkuer6UXCj8SQwTms5HcX6wyMTGnes4m0DSnzXgWAqqUTgcQLdQbbsUIA0M1ezTls5LcDAIk+nQ6rSfsMAh/JlS2306eTmsk/SKOS5Hmc50+SmwIA57Cx5npqp/hdPe0DKxwlxaTZ085YDgCCFhEJNcSX0K+dZH4wcqUgvSPlobWq/MnkSK+tFziHlfTBvIYAac9KN2cNMesuY3m7enHpb50mb5OnL6OJKbTzRrJPSC8OACT6dMOqL8nsBJrU4qoslRgyVCPuqFgwMixfJAIAIFIn09fk/3Xamn20QRFEu4kJdlV+RU8R69Kox5IY+pJcbFoYAKyHv/D8iISUUDoI8eKQ6K9uPbQncdJ8Mv4qMfStXHRrwqip0DoLgESV+A2SWGewnzlEPkoYdY/teCEN7ZAqRdKbTG8tJs7d3VADrd4crmzqpD0yWbOp5vn7yImkDCnvq+lFwgzJ8nGeP+lurCXxABIWIp9Kew+0HStU5U1Q3ThJkZ3vPH/SuGaWPCuXzDehEkOileQ13y1pZyxXDLmZ3ouszyZoUwGAPzBHnsf+ywH6qcBOidTJRDFVeRNEaq3l+5c1Y2epbpgoy8wx79mEAaQwQ9TAce4YeUsyP2zHCmn8TzXijoZtq2vXzzd/ubXbMx/RgB/4DgHSnlXz1++7KkuJPWpXL678gQHdl+2Q9h5IXRTwOm+sqZrcnShY0l2LGJmSJLXoFxSI1Mn8yGjov0KkBdZskhgy6O9CoO0RXGmAoLWv7qppmYerypsg1qVZi/d6LawcPtZVWUquLNal8ZNM+GNzSLw4JPqru4zlzvMnlcPHitTJTZ9udBnLSS+f1BXNuDmMTGna9DQ5i7RVREH4wsGHNIG0EpM5lmRURZCDQtwSbUcZlYbe12vTG9IvBPEK6Vo17XqTpBB1e+YjmrWmypvAyJTKnNHyAcNdlaUN771A/EeLieHNm+WbGL5bclWWOvU96b1ahj88ViLhD8wJJvEKunfkXu7GgwCgvO52ACCzNZ0Vv/KNHRI2JPp02q33n97hOfzqKwQo6Fl1oBcHHg0k8DpvjrNHVPmTiXABgGLoLY6SYvupIu2M5dLeA0lFkujTZX2zvY4JIiGi/t0V3Z75KHnaEhJ3lOjTE0ZNZWRKOtzBN0DQ6suVw8Y0bFutnbFcdcNEAHCWnaCFSQF5Vq525nMidXLdhgVeryMIKcU58eKQ+L+6/ZcD6jEzHSXFZOiEPxNN1m8IP27JVzEyf9Kz1RHrDHzhIA0eaTsFTZ0gY5f24Xw1vUj4kfTMhFbbaj9V5Dx/UjH0FvIRaUI8c59JF99Rcpi8TbprEbS2bQK3RCbHCe/YuhIJPcIfmOM/D7QafRqi4MefiOGmYyVIl8A3xJ5pIvz0DsHwq58QoKBn1YFeHHi0gnw5atpdkDx9GYlXaWcsZ2SK2nWPAID86uuJQJHmlnPYap6/L2jfFNIW9lNFjR+vVY+enjhpfuKkFpNNAz/Ef1MDRMo7SoplmTm9tl5wVZaa925JnDTf9vM+WliZM5pMB2HNproNC3xdRxBSinPixSHxu1BNn25wVZbSXjt/IEyk1vIrB2mBiE4xMoXnZfmdeHrEaw6KZ5eR9vx8Nb1I+JGk9OT7XTImmzRlYcO21dZDe5KnL0sYNZX8TElTFkpSe9Wun+82XgSApLsWNXy4KnnGswpeTIjUHwCQZ+X6iuiQeqUZO0uk0dl+3mc/VcS37ILnEWYe8O7lLDuhzBlN1m2S6NM14+Zgam2YERhiwY8liP8JlgjxFQL07Fl1oBfn2QoK5MhdV8HIlS0hqGOFLcEqQ19Gpkx94h1X5TnL9ztwx67w07BtdcO21V6Xf/Ta3apaOpGEtIn7of/9iVmvXfewOClVkNfveR3BeoFxThw5JNrzdhnLaZ0QCBBrNskyc1R5E9ymKu3M5ySGvlTFnJdKJIYM0liq8iYk3Hpfzcp7PZNI+CElvggKuowt971YAgC+mt4Qfh2IDySGDNvxy79mw7bV6tHTSe6Iy1huKdyuyp9M+mGcw0pyxZo+3aAcPlaVP1mVP9l2vNB2rFA+YDgRMuuhPZrxc5U5o6U9Mn2luJJp/MnTl7FmE9tUZz9VJEjT5j+PYKyEH39q2LZaOWwMeQwAYM0m894tQf9+ED8QNeCPa1yRJnJlegf/VwbfIUDPnlV7e3H01iScQBDIEWuul6T0TJ7xLOew0RE6Rqak+UxIF9Ku
[... base64 PNG data for the attached image omitted ...]" 141 | } 142 | }, 143 | "cell_type": "markdown", 144 | "id": "502fdd00", 145 | "metadata": {}, 146 | "source": [ 147 | "# 2) Apache Spark\n", 148 | "\n", 149 | "Apache Spark is an open-source, distributed processing system used for big data workloads. It uses in-memory caching and optimized query execution to run fast analytic queries against data of any size. It provides development APIs in Java, Scala, Python, and R, and supports code reuse across multiple workloads - batch processing, interactive queries, real-time analytics, machine learning, and graph processing. It is used by organizations across many industries, including FINRA, Yelp, Zillow, DataXu, Urban Institute, and CrowdStrike, and has become one of the most popular distributed processing frameworks for big data, with 365,000 meetup members as of 2017.\n", 150 | "\n", 151 | "Spark is designed to be fast, scalable, and easy to use, and it provides a range of features that make it well-suited for big data processing, machine learning, and real-time stream processing. Some of the key features of Spark include:\n", 152 | "\n", 153 | "`In-memory processing`: Spark processes data in memory, which allows for faster processing than traditional disk-based systems.\n", 154 | "\n", 155 | "`Distributed computing`: Spark is designed to run on a cluster of machines, allowing it to process large amounts of data in parallel across multiple nodes.\n", 156 | "\n", 157 | "`Data processing APIs`: Spark provides a range of APIs for processing data, including SQL, streaming, machine learning, and graph processing APIs.\n", 158 | "\n", 159 | "`Fault tolerance`: Spark is designed to be fault-tolerant, meaning that it can recover from failures in the cluster without losing data.\n", 160 | "\n", 161 | "`Community support`: Spark has a large and active community of users and developers who contribute to the development and maintenance of the system.\n", 162 | "\n", 163 | "\n", 164 | "### When to use Spark?\n", 165 | "\n", 166 | "Spark is a versatile tool for processing big data and can be used in a wide range of applications. Here are some scenarios where Spark is particularly well-suited:\n", 167 | "\n", 168 | "1) Processing large volumes of data \n", 169 | " \n", 170 | "Spark is designed to process large volumes of data quickly and efficiently. 
If you have large datasets that are too big to fit into memory on a single machine, Spark can distribute the processing across a cluster of machines, allowing you to process the data faster.\n", 171 | "\n", 172 | "2) Real-time stream processing \n", 173 | "\n", 174 | "Spark Streaming is a component of Spark that lets you process real-time data streams. If you need to process data in real time, Spark Streaming provides a scalable and fault-tolerant platform for doing so.\n", 175 | "\n", 176 | "3) Machine learning \n", 177 | "\n", 178 | "Spark's machine learning library (MLlib) provides a range of algorithms for building machine learning models. If you need to train models on large datasets, Spark can distribute the data and computation across a cluster, allowing you to train models faster.\n", 179 | "\n", 180 | "4) Graph processing \n", 181 | "\n", 182 | "Spark provides a graph processing API (GraphX) that allows you to process large-scale graphs. If you need to perform graph analysis on large datasets, Spark can distribute the computation across a cluster, allowing you to process the graph faster.\n", 183 | "\n", 184 | "5) Ad-hoc data analysis \n", 185 | "\n", 186 | "Spark provides a SQL API (Spark SQL) that allows you to run SQL queries on large datasets. If you need to perform ad-hoc analysis on large datasets, Spark SQL can help you do so quickly and efficiently.\n", 187 | "\n", 188 | "Overall, Spark is well-suited for applications that involve processing large volumes of data, real-time stream processing, machine learning, graph processing, and ad-hoc data analysis. If you have data processing needs in any of these areas, Spark may be a good choice for your application. (Two short, hedged PySpark sketches of these use cases are appended after this notebook dump.)\n", 189 | "\n", 190 | "![image.png](attachment:image.png)\n", 191 | "\n", 192 | "### Spark and PySpark Installation\n", 193 | "\n", 194 | "Follow the instructions here - \n", 195 | "\n", 196 | "https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/week_5_batch_processing/setup/windows.md\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "id": "fd6315b9", 202 | "metadata": {}, 203 | "source": [ 204 | "# 3) Remaining Notes\n", 205 | "\n", 206 | "The remaining notes can be found as individual code files, along with explanations, here - https://github.com/Balajirvp/DE-Zoomcamp/tree/main/Week%205/Code" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "id": "761e0215", 212 | "metadata": {}, 213 | "source": [ 214 | "# 4) References\n", 215 | "\n", 216 | "https://dataengineering.wiki/Concepts/Batch+Data+Processing \\\n", 217 | "https://www.montecarlodata.com/blog-stream-vs-batch-processing/" 218 | ] 219 | } 220 | ], 221 | "metadata": { 222 | "kernelspec": { 223 | "display_name": "Python 3 (ipykernel)", 224 | "language": "python", 225 | "name": "python3" 226 | }, 227 | "language_info": { 228 | "codemirror_mode": { 229 | "name": "ipython", 230 | "version": 3 231 | }, 232 | "file_extension": ".py", 233 | "mimetype": "text/x-python", 234 | "name": "python", 235 | "nbconvert_exporter": "python", 236 | "pygments_lexer": "ipython3", 237 | "version": "3.9.15" 238 | } 239 | }, 240 | "nbformat": 4, 241 | "nbformat_minor": 5 242 | } 243 | --------------------------------------------------------------------------------
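The use cases above are easier to see in code. Below is a minimal PySpark sketch of use cases 1 and 5 - a small batch job plus an ad-hoc Spark SQL query. The file path, app name, and column names are illustrative assumptions, not values taken from the Week 5 notebook or code files.

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = (
    SparkSession.builder
    .master("local[*]")              # run locally, using all available cores
    .appName("week5-batch-sketch")   # hypothetical app name
    .getOrCreate()
)

# Use case 1: Spark reads the file in partitions and spreads the work
# across executor cores, so the dataset never has to fit in one process.
df = spark.read.parquet("data/green_tripdata_2021-01.parquet")  # hypothetical path

# DataFrame API: a small transformation pipeline.
trips = (
    df.withColumn("pickup_date", F.to_date("lpep_pickup_datetime"))  # assumed column
      .filter(F.col("trip_distance") > 0)                            # assumed column
)

# Use case 5: register a temp view and run ad-hoc SQL against it.
trips.createOrReplaceTempView("trips")
revenue_per_day = spark.sql("""
    SELECT pickup_date,
           COUNT(1)          AS n_trips,
           SUM(total_amount) AS revenue   -- assumed column
    FROM trips
    GROUP BY pickup_date
    ORDER BY pickup_date
""")

revenue_per_day.show(5)
spark.stop()
```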
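And a minimal Structured Streaming sketch for use case 2. The socket source (fed by e.g. `nc -lk 9999`) is a stand-in for a real stream such as Kafka; everything here is illustrative.

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[*]").appName("streaming-sketch").getOrCreate()

# Read a stream of text lines from a local socket.
lines = (
    spark.readStream.format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

# Running word count: the classic minimal streaming aggregation.
counts = (
    lines.select(F.explode(F.split(F.col("value"), " ")).alias("word"))
         .groupBy("word")
         .count()
)

# Continuously write the updated result to the console.
query = (
    counts.writeStream.outputMode("complete")
    .format("console")
    .start()
)
query.awaitTermination()
```

Structured Streaming reuses the DataFrame API, which is why the batch and streaming sketches above look almost identical - the same transformations run over an unbounded source instead of a static file.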