├── .gitignore ├── README.md ├── week1 ├── Dockerfile ├── README.md ├── docker-compose.yaml ├── homework │ ├── homework-partB.md │ ├── homework-partB.solution.md │ ├── homework.md │ └── homework.solution.md ├── img │ ├── Screenshot at 2023-01-21 12-00-42.png │ ├── db.png │ ├── register-server1.png │ ├── register-server2.png │ ├── register-server3.png │ ├── register-server4.png │ ├── register-server5.png │ └── register-server6.png ├── ingest_data.py ├── ingest_data_green_taxi.py ├── ingest_taxi_zone_lookup.py ├── terraform │ ├── .terraform-version │ ├── README.md │ ├── main.tf │ └── variables.tf └── upload-data.ipynb ├── week2 ├── Dockerfile ├── README.md ├── blocks │ ├── make_docker_block.py │ └── make_gcp_blocks.py ├── de-zoomcamp-week2.yaml ├── docker-requirements.txt ├── docker_deploy.py ├── etl_gcs_to_bq.py ├── etl_web_to_gcs.py ├── homework │ ├── etl_gcs_to_bq.py │ ├── etl_web_to_gcs.py │ └── homework.md ├── img │ ├── bq_create_table1.png │ ├── bq_create_table2.png │ ├── bq_delete_data1.png │ ├── bq_delete_data2.png │ ├── bq_delete_data3.png │ ├── deployment.png │ ├── docker_block1.png │ ├── docker_block2.png │ ├── docker_deploy_result.png │ ├── gcp_credentials_block.png │ ├── gcs_bucket_block1.png │ ├── gcs_bucket_block2.png │ ├── gcs_bucket_block3.png │ ├── gcs_bucket_block4.png │ ├── gcs_parameterized_flow.png │ ├── homework_Q2.png │ ├── homework_Q5_1.png │ ├── homework_Q5_2.png │ ├── homework_Q5_3.png │ ├── homework_Q5_4.png │ ├── homework_Q6.png │ ├── quickrun.png │ ├── records.png │ ├── schedule1.png │ ├── schedule2.png │ ├── schedule3.png │ ├── sql-block.png │ ├── subflows.png │ └── uploaded_data_gcp.png ├── ingest_data_flow.py ├── ingest_data_flow_etl.py ├── ingest_data_flow_etl_with_sql_block.py ├── parameterized_flow.py └── requirements.txt ├── week3 ├── README.md ├── big_query_ml.sql ├── download.py ├── homework.md └── img │ ├── citibike_stations1.png │ ├── citibike_stations2.png │ ├── clustering.png │ ├── information_schema_partitions.png │ ├── partition.png │ ├── partitioning_vs_clustering.png │ ├── result_non_partitioned.png │ ├── result_partitioned.png │ ├── results_clustered.png │ └── results_unclustered.png ├── week4 ├── README.md ├── homework.md ├── img │ ├── artifacts.png │ ├── bigquery.png │ ├── charts.png │ ├── control_menu.png │ ├── data_source.png │ ├── dbt.png │ ├── dbt_init.png │ ├── dbt_job1.png │ ├── dbt_job2.png │ ├── dbt_job3.png │ ├── dbt_job4.png │ ├── dbt_prod_env.png │ ├── etl_vs_elt.png │ ├── homework-question2.png │ ├── homework-question5.png │ ├── lineage.png │ ├── select_table.png │ ├── taxi_zone_lookup.png │ ├── ts_chart.png │ └── ts_chart_2019_2020.png ├── taxi_rides_ny │ ├── .gitignore │ ├── README.md │ ├── analyses │ │ └── .gitkeep │ ├── dbt_project.yml │ ├── macros │ │ ├── .gitkeep │ │ └── get_payment_type_description.sql │ ├── models │ │ ├── core │ │ │ ├── dim_monthly_zone_revenue.sql │ │ │ ├── dim_zones.sql │ │ │ ├── fact_trips.sql │ │ │ ├── fhv_fact_trips.sql │ │ │ └── schema.yml │ │ └── staging │ │ │ ├── schema.yml │ │ │ ├── stg_fhv_tripdata.sql │ │ │ ├── stg_green_tripdata.sql │ │ │ └── stg_yellow_tripdata.sql │ ├── packages.yml │ ├── seeds │ │ ├── .gitkeep │ │ └── taxi_zone_lookup.csv │ ├── snapshots │ │ └── .gitkeep │ └── tests │ │ └── .gitkeep └── web_to_gcs.py ├── week5 ├── 04_pyspark.ipynb ├── 05_taxi_schema.ipynb ├── 06_spark_sql.ipynb ├── 07_groupby_join.ipynb ├── 08_rdds.ipynb ├── 09_spark_gcs.ipynb ├── 10_local_spark_cluster.ipynb ├── 10_local_spark_cluster.py ├── 11_big_query.py ├── README.md ├── de-zoomcamp-week5.yaml ├── 
download_data.sh └── img │ ├── bigquery1.png │ ├── bigquery2.png │ ├── cluster1.png │ ├── cluster2.png │ ├── create_cluster.png │ ├── groupby1.png │ ├── groupby2.png │ ├── join1.png │ ├── join2.png │ ├── join3.png │ ├── join4.png │ ├── mapPartition.png │ ├── spark-master.png │ ├── spark-session.png │ ├── submit_job.png │ └── worker.png └── week6 ├── README.md ├── img ├── api-key1.png ├── api-key2.png ├── avro.png ├── basic-cluster1.png ├── basic-cluster2.png ├── basic-cluster3.png ├── connector.png ├── global_ktable.png ├── join-example.png ├── kafka-streams-basics1.png ├── kafka-streams-basics2.png ├── kafka-streams-basics3.png ├── ktables.png ├── messages1.png ├── messages2.png ├── offset-example.png ├── partition-example.png ├── rides-location-topic.png ├── schema-registry1.png ├── schema-registry2.png ├── topic1.png ├── topic2.png └── vendor-info-topic.png ├── java └── kafka_examples │ ├── .gitignore │ ├── build.gradle │ ├── build │ └── generated-main-avro-java │ │ └── schemaregistry │ │ ├── RideRecord.java │ │ ├── RideRecordCompatible.java │ │ └── RideRecordNoneCompatible.java │ ├── gradle │ └── wrapper │ │ ├── gradle-wrapper.jar │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ ├── settings.gradle │ └── src │ ├── main │ ├── avro │ │ ├── rides.avsc │ │ ├── rides_compatible.avsc │ │ └── rides_non_compatible.avsc │ ├── java │ │ └── org │ │ │ └── example │ │ │ ├── AvroProducer.java │ │ │ ├── JsonConsumer.java │ │ │ ├── JsonKStream.java │ │ │ ├── JsonKStreamJoins.java │ │ │ ├── JsonKStreamWindow.java │ │ │ ├── JsonProducer.java │ │ │ ├── JsonProducerPickupLocation.java │ │ │ ├── Secrets.java │ │ │ ├── Topics.java │ │ │ ├── customserdes │ │ │ └── CustomSerdes.java │ │ │ └── data │ │ │ ├── PickupLocation.java │ │ │ ├── Ride.java │ │ │ └── VendorInfo.java │ └── resources │ │ └── rides.csv │ └── test │ └── java │ └── org │ └── example │ ├── JsonKStreamJoinsTest.java │ ├── JsonKStreamTest.java │ └── helper │ └── DataGeneratorHelper.java └── ksqldb └── commands.md /.gitignore: -------------------------------------------------------------------------------- 1 | **/.terraform 2 | **/.terraform.lock.hcl 3 | **/terraform.tfstate 4 | **/__pycache__ 5 | **/.ipynb_checkpoints/ 6 | dtc-de-375514-849c13503247.json 7 | week2/data 8 | prefect.api.key 9 | dtc-de-375514-319899402561-bigquery.json 10 | **/.prefectignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repository contains my notes of the awesome [Data Engineering Zoomcamp by DataTalksClub](https://github.com/DataTalksClub/data-engineering-zoomcamp). 2 | 3 | If you find any errors/inconsistencies or have any suggestions for improvement, do not hesitate to create a pull request. :smiley: 4 | 5 | My final project for the zoomcamp is available in [padilha/nyc-motor-vehicle-collisions](https://github.com/padilha/nyc-motor-vehicle-collisions). 
-------------------------------------------------------------------------------- /week1/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.1 2 | 3 | RUN apt-get update && apt-get install -y wget 4 | RUN pip install pandas==1.5.2 sqlalchemy==1.4.39 pyarrow==8.0.0 psycopg2==2.9.5 psycopg2-binary==2.9.5 5 | 6 | WORKDIR /app 7 | COPY ingest_data.py ingest_data.py 8 | 9 | ENTRYPOINT ["python", "ingest_data.py"] -------------------------------------------------------------------------------- /week1/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | pgdatabase: 3 | image: postgres:13 4 | environment: 5 | - POSTGRES_USER=root 6 | - POSTGRES_PASSWORD=root 7 | - POSTGRES_DB=ny_taxi 8 | volumes: 9 | - "../ny_taxi_postgres_data:/var/lib/postgresql/data:rw" 10 | ports: 11 | - "5432:5432" 12 | pgadmin: 13 | image: dpage/pgadmin4 14 | environment: 15 | - PGADMIN_DEFAULT_EMAIL=admin@admin.com 16 | - PGADMIN_DEFAULT_PASSWORD=root 17 | volumes: 18 | - "../pgadmin_conn_data:/var/lib/pgadmin:rw" 19 | ports: 20 | - "8080:80" -------------------------------------------------------------------------------- /week1/homework/homework-partB.md: -------------------------------------------------------------------------------- 1 | ## Week 1 Homework 2 | 3 | In this homework we'll prepare the environment by creating resources in GCP with Terraform. 4 | 5 | In your VM on GCP install Terraform. Copy the files from the course repo 6 | [here](https://github.com/DataTalksClub/data-engineering-zoomcamp/tree/main/week_1_basics_n_setup/1_terraform_gcp/terraform) to your VM. 7 | 8 | Modify the files as necessary to create a GCP Bucket and BigQuery Dataset. 9 | 10 | 11 | ## Question 1. Creating Resources 12 | 13 | After updating the main.tf and variables.tf files run: 14 | 15 | ``` 16 | terraform apply 17 | ``` 18 | 19 | Paste the output of this command into the homework submission form. 20 | 21 | 22 | ## Submitting the solutions 23 | 24 | * Form for submitting: [form](https://forms.gle/S57Xs3HL9nB3YTzj9) 25 | * You can submit your homework multiple times. In this case, only the last submission will be used. 26 | 27 | Deadline: 26 January (Thursday), 22:00 CET 28 | 29 | -------------------------------------------------------------------------------- /week1/homework/homework-partB.solution.md: -------------------------------------------------------------------------------- 1 | ## Question 1. Creating Resources 2 | 3 | Command: 4 | ``` 5 | terraform apply -var="project=dtc-de-375514" 6 | ``` 7 | 8 | Output: 9 | 10 | Terraform used the selected providers to generate the following execution plan.
Resource actions are indicated with the following symbols: 11 | + create 12 | 13 | Terraform will perform the following actions: 14 | 15 | # google_bigquery_dataset.dataset will be created 16 | + resource "google_bigquery_dataset" "dataset" { 17 | + creation_time = (known after apply) 18 | + dataset_id = "trips_data_all" 19 | + delete_contents_on_destroy = false 20 | + etag = (known after apply) 21 | + id = (known after apply) 22 | + labels = (known after apply) 23 | + last_modified_time = (known after apply) 24 | + location = "europe-west6" 25 | + project = "dtc-de-375514" 26 | + self_link = (known after apply) 27 | 28 | + access { 29 | + domain = (known after apply) 30 | + group_by_email = (known after apply) 31 | + role = (known after apply) 32 | + special_group = (known after apply) 33 | + user_by_email = (known after apply) 34 | 35 | + dataset { 36 | + target_types = (known after apply) 37 | 38 | + dataset { 39 | + dataset_id = (known after apply) 40 | + project_id = (known after apply) 41 | } 42 | } 43 | 44 | + routine { 45 | + dataset_id = (known after apply) 46 | + project_id = (known after apply) 47 | + routine_id = (known after apply) 48 | } 49 | 50 | + view { 51 | + dataset_id = (known after apply) 52 | + project_id = (known after apply) 53 | + table_id = (known after apply) 54 | } 55 | } 56 | } 57 | 58 | # google_storage_bucket.data-lake-bucket will be created 59 | + resource "google_storage_bucket" "data-lake-bucket" { 60 | + force_destroy = true 61 | + id = (known after apply) 62 | + location = "EUROPE-WEST6" 63 | + name = "dtc_data_lake_dtc-de-375514" 64 | + project = (known after apply) 65 | + public_access_prevention = (known after apply) 66 | + self_link = (known after apply) 67 | + storage_class = "STANDARD" 68 | + uniform_bucket_level_access = true 69 | + url = (known after apply) 70 | 71 | + lifecycle_rule { 72 | + action { 73 | + type = "Delete" 74 | } 75 | 76 | + condition { 77 | + age = 30 78 | + matches_prefix = [] 79 | + matches_storage_class = [] 80 | + matches_suffix = [] 81 | + with_state = (known after apply) 82 | } 83 | } 84 | 85 | + versioning { 86 | + enabled = true 87 | } 88 | 89 | + website { 90 | + main_page_suffix = (known after apply) 91 | + not_found_page = (known after apply) 92 | } 93 | } 94 | 95 | Plan: 2 to add, 0 to change, 0 to destroy. 96 | 97 | Do you want to perform these actions? 98 | Terraform will perform the actions described above. 99 | Only 'yes' will be accepted to approve. 100 | 101 | Enter a value: yes 102 | 103 | google_bigquery_dataset.dataset: Creating... 104 | google_storage_bucket.data-lake-bucket: Creating... 105 | google_storage_bucket.data-lake-bucket: Creation complete after 4s [id=dtc_data_lake_dtc-de-375514] 106 | google_bigquery_dataset.dataset: Creation complete after 9s [id=projects/dtc-de-375514/datasets/trips_data_all] 107 | 108 | Apply complete! Resources: 2 added, 0 changed, 0 destroyed. 109 | 110 | ## Learning in public 111 | [LinkedIn](https://www.linkedin.com/posts/victor-padilha_dataengineering-dataanalytics-dezoomcamp-activity-7023252179117350912-Ch-d?utm_source=share&utm_medium=member_desktop) -------------------------------------------------------------------------------- /week1/homework/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 1 Homework 2 | 3 | In this homework we'll prepare the environment 4 | and practice with Docker and SQL 5 | 6 | 7 | ## Question 1. 
Knowing docker tags 8 | 9 | Run the command to get information on Docker 10 | 11 | ```docker --help``` 12 | 13 | Now run the command to get help on the "docker build" command 14 | 15 | Which tag has the following text? - *Write the image ID to the file* 16 | 17 | - `--imageid string` 18 | - `--iidfile string` 19 | - `--idimage string` 20 | - `--idfile string` 21 | 22 | 23 | ## Question 2. Understanding docker first run 24 | 25 | Run docker with the python:3.9 image in an interactive mode and the entrypoint of bash. 26 | Now check the python modules that are installed ( use pip list). 27 | How many python packages/modules are installed? 28 | 29 | - 1 30 | - 6 31 | - 3 32 | - 7 33 | 34 | # Prepare Postgres 35 | 36 | Run Postgres and load data as shown in the videos 37 | We'll use the green taxi trips from January 2019: 38 | 39 | ```wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-01.csv.gz``` 40 | 41 | You will also need the dataset with zones: 42 | 43 | ```wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv``` 44 | 45 | Download this data and put it into Postgres (with jupyter notebooks or with a pipeline) 46 | 47 | 48 | ## Question 3. Count records 49 | 50 | How many taxi trips were totally made on January 15? 51 | 52 | Tip: started and finished on 2019-01-15. 53 | 54 | Remember that `lpep_pickup_datetime` and `lpep_dropoff_datetime` columns are in the format timestamp (date and hour+min+sec) and not in date. 55 | 56 | - 20689 57 | - 20530 58 | - 17630 59 | - 21090 60 | 61 | ## Question 4. Largest trip for each day 62 | 63 | Which was the day with the largest trip distance 64 | Use the pick up time for your calculations. 65 | 66 | - 2019-01-18 67 | - 2019-01-28 68 | - 2019-01-15 69 | - 2019-01-10 70 | 71 | ## Question 5. The number of passengers 72 | 73 | In 2019-01-01 how many trips had 2 and 3 passengers? 74 | 75 | - 2: 1282 ; 3: 266 76 | - 2: 1532 ; 3: 126 77 | - 2: 1282 ; 3: 254 78 | - 2: 1282 ; 3: 274 79 | 80 | 81 | ## Question 6. Largest tip 82 | 83 | For the passengers picked up in the Astoria Zone which was the drop off zone that had the largest tip? 84 | We want the name of the zone, not the id. 85 | 86 | Note: it's not a typo, it's `tip` , not `trip` 87 | 88 | - Central Park 89 | - Jamaica 90 | - South Ozone Park 91 | - Long Island City/Queens Plaza 92 | 93 | 94 | ## Submitting the solutions 95 | 96 | * Form for submitting: [form](https://forms.gle/EjphSkR1b3nsdojv7) 97 | * You can submit your homework multiple times. In this case, only the last submission will be used. 98 | 99 | Deadline: 26 January (Thursday), 22:00 CET 100 | 101 | 102 | ## Solution 103 | 104 | We will publish the solution here 105 | -------------------------------------------------------------------------------- /week1/homework/homework.solution.md: -------------------------------------------------------------------------------- 1 | ## Question 1. 2 | ```--iidfile string``` has the text _Write the image ID to the file_ 3 | 4 | ## Question 2. 5 | Command output: 6 | 7 | Package Version 8 | ---------- ------- 9 | pip 22.0.4 10 | setuptools 58.1.0 11 | wheel 0.38.4 12 | 13 | python:3.9 has a total of 3 packages installed. 14 | 15 | # Prepare Postgres 16 | 17 | Before solving the next questions, we need to ingest the green taxi trips dataset using the following commands. For some reason, GitHub does not allow us to wget the green_tripdata_2019-01.csv.gz described in the homework. 
So, I downloaded it manually, saved in the week1 directory and run the code ingest_data_green_taxi.py (see below). It is not a good practice to copy a code and just change a few lines to be able to run it for a new file. I only did that as a quickfix for this homework. 18 | ``` 19 | docker run -it \ 20 | -e POSTGRES_USER="root" \ 21 | -e POSTGRES_PASSWORD="root" \ 22 | -e POSTGRES_DB="ny_taxi" \ 23 | -v /home/padilha/projects/de-zoomcamp/ny_taxi_postgres_data:/var/lib/postgresql/data \ 24 | -p 5432:5432 \ 25 | --network=pg-network \ 26 | --name=pg-database \ 27 | postgres:13 28 | 29 | python ingest_data_green_taxi.py 30 | --user=root 31 | --password=root 32 | --host=localhost 33 | --port=5432 34 | --db=ny_taxi 35 | --table_name=green_taxi_trips 36 | 37 | python ingest_taxi_zone_lookup.py 38 | ``` 39 | 40 | ## Question 3. Count records 41 | 42 | Query: 43 | ```sql 44 | SELECT 45 | CAST(lpep_pickup_datetime AS DATE), 46 | CAST(lpep_dropoff_datetime AS DATE), 47 | COUNT(1) 48 | FROM 49 | green_taxi_trips 50 | WHERE 51 | CAST(lpep_pickup_datetime AS DATE) = TO_DATE('2019-01-15', 'YYYY-MM-DD') AND 52 | CAST(lpep_dropoff_datetime AS DATE) = TO_DATE('2019-01-15', 'YYYY-MM-DD') 53 | GROUP BY 1, 2; 54 | ``` 55 | 56 | Output: 57 | 58 | +----------------------+-----------------------+-------+ 59 | | lpep_pickup_datetime | lpep_dropoff_datetime | count | 60 | |----------------------+-----------------------+-------| 61 | | 2019-01-15 | 2019-01-15 | 20530 | 62 | +----------------------+-----------------------+-------+ 63 | SELECT 1 64 | Time: 0.079s 65 | 66 | ## Question 4. Largest trip for each day 67 | 68 | Query: 69 | ```sql 70 | SELECT 71 | CAST(lpep_pickup_datetime AS DATE), 72 | MAX(trip_distance) 73 | FROM 74 | green_taxi_trips 75 | WHERE 76 | CAST(lpep_pickup_datetime AS DATE) = TO_DATE('2019-01-18', 'YYYY-MM-DD') OR 77 | CAST(lpep_pickup_datetime AS DATE) = TO_DATE('2019-01-28', 'YYYY-MM-DD') OR 78 | CAST(lpep_pickup_datetime AS DATE) = TO_DATE('2019-01-15', 'YYYY-MM-DD') OR 79 | CAST(lpep_pickup_datetime AS DATE) = TO_DATE('2019-01-10', 'YYYY-MM-DD') 80 | GROUP BY 1; 81 | ``` 82 | 83 | Output: 84 | 85 | +----------------------+--------+ 86 | | lpep_pickup_datetime | max | 87 | |----------------------+--------| 88 | | 2019-01-18 | 80.96 | 89 | | 2019-01-15 | 117.99 | 90 | | 2019-01-10 | 64.2 | 91 | | 2019-01-28 | 64.27 | 92 | +----------------------+--------+ 93 | SELECT 4 94 | Time: 0.175s 95 | 96 | ## Question 5. The number of passengers 97 | 98 | Since the question does not specify if we must consider lpep_pickup_datetime or lpep_dropoff_datetime, I consider lpep_pickup_datetime as in Question 4. 99 | 100 | Query: 101 | ```sql 102 | SELECT 103 | CAST(lpep_pickup_datetime AS DATE), 104 | passenger_count, 105 | COUNT(1) 106 | FROM 107 | green_taxi_trips 108 | WHERE 109 | CAST(lpep_pickup_datetime AS DATE) = TO_DATE('2019-01-01', 'YYYY-MM-DD') 110 | GROUP BY 1, 2; 111 | ``` 112 | 113 | Output: 114 | 115 | +----------------------+-----------------+-------+ 116 | | lpep_pickup_datetime | passenger_count | count | 117 | |----------------------+-----------------+-------| 118 | | 2019-01-01 | 0 | 21 | 119 | | 2019-01-01 | 1 | 12415 | 120 | | 2019-01-01 | 2 | 1282 | 121 | | 2019-01-01 | 3 | 254 | 122 | | 2019-01-01 | 4 | 129 | 123 | | 2019-01-01 | 5 | 616 | 124 | | 2019-01-01 | 6 | 273 | 125 | +----------------------+-----------------+-------+ 126 | 127 | In 2019-01-01 1282 trips had 2 passengers and 254 trips had 3 passengers. 128 | 129 | ## Question 6. 
Largest tip 130 | 131 | Query: 132 | ```sql 133 | SELECT 134 | MAX(tip_amount), 135 | zpu."Zone" AS "pickup_loc", 136 | zdo."Zone" AS "dropoff_loc" 137 | FROM 138 | green_taxi_trips t 139 | JOIN zones zpu ON t."PULocationID" = zpu."LocationID" 140 | JOIN zones zdo ON t."DOLocationID" = zdo."LocationID" 141 | WHERE 142 | zpu."Zone" = 'Astoria' 143 | GROUP BY 2, 3 144 | ORDER BY 1 DESC LIMIT 10; 145 | ``` 146 | 147 | Output: 148 | 149 | +-------+------------+-------------------------------+ 150 | | max | pickup_loc | dropoff_loc | 151 | |-------+------------+-------------------------------| 152 | | 88.0 | Astoria | Long Island City/Queens Plaza | 153 | | 30.0 | Astoria | Central Park | 154 | | 25.0 | Astoria | | 155 | | 25.0 | Astoria | Jamaica | 156 | | 18.16 | Astoria | Astoria | 157 | | 16.95 | Astoria | Coney Island | 158 | | 15.0 | Astoria | South Ozone Park | 159 | | 14.96 | Astoria | Marine Park/Mill Basin | 160 | | 14.42 | Astoria | Old Astoria | 161 | | 13.58 | Astoria | Arrochar/Fort Wadsworth | 162 | +-------+------------+-------------------------------+ 163 | SELECT 10 164 | Time: 0.037s 165 | 166 | For the passengers picked up in the Astoria Zone, Long Island City/Queens Plaza was the drop off zone that had the largest tip. 167 | 168 | ## Learning in public 169 | [LinkedIn](https://www.linkedin.com/posts/victor-padilha_dataengineering-dataanalytics-dezoomcamp-activity-7023252179117350912-Ch-d?utm_source=share&utm_medium=member_desktop) -------------------------------------------------------------------------------- /week1/img/Screenshot at 2023-01-21 12-00-42.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week1/img/Screenshot at 2023-01-21 12-00-42.png -------------------------------------------------------------------------------- /week1/img/db.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week1/img/db.png -------------------------------------------------------------------------------- /week1/img/register-server1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week1/img/register-server1.png -------------------------------------------------------------------------------- /week1/img/register-server2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week1/img/register-server2.png -------------------------------------------------------------------------------- /week1/img/register-server3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week1/img/register-server3.png -------------------------------------------------------------------------------- /week1/img/register-server4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week1/img/register-server4.png -------------------------------------------------------------------------------- /week1/img/register-server5.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week1/img/register-server5.png -------------------------------------------------------------------------------- /week1/img/register-server6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week1/img/register-server6.png -------------------------------------------------------------------------------- /week1/ingest_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import argparse 3 | import os 4 | from sqlalchemy import create_engine 5 | from time import time 6 | 7 | def parquet_to_csv(parquet_file, csv_file): 8 | df = pd.read_parquet(parquet_file, engine = 'pyarrow') 9 | df.to_csv(csv_file, index=False) 10 | 11 | def ingest(csv_file, table_name, engine, chunksize=100000): 12 | df_iter = pd.read_csv(csv_file, iterator=True, chunksize=chunksize) 13 | run = True 14 | while run: 15 | try: 16 | t_start = time() 17 | df = next(df_iter) 18 | df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime']) 19 | df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime']) 20 | df.to_sql(name=table_name, con=engine, if_exists='append') 21 | t_end = time() 22 | print(f'inserted another chunk, took {t_end-t_start:.3f} seconds') 23 | except Exception: 24 | run = False 25 | 26 | def main(params): 27 | user = params.user 28 | password = params.password 29 | host = params.host 30 | port = params.port 31 | db = params.db 32 | table_name = params.table_name 33 | url = params.url 34 | parquet_file = 'output.parquet' 35 | csv_file = 'output.csv' 36 | os.system(f'wget {url} -O {parquet_file}') 37 | parquet_to_csv(parquet_file, csv_file) 38 | engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}') 39 | engine.connect() 40 | ingest(csv_file, table_name, engine) 41 | 42 | 43 | if __name__ == '__main__': 44 | parser = argparse.ArgumentParser(description='Ingest CSV data to Postgres') 45 | parser.add_argument('--user', required=True, help='user name for postgres') 46 | parser.add_argument('--password', required=True, help='password for postgres') 47 | parser.add_argument('--host', required=True, help='host for postgres') 48 | parser.add_argument('--port', required=True, help='port for postgres') 49 | parser.add_argument('--db', required=True, help='database name for postgres') 50 | parser.add_argument('--table_name', required=True, help='name of the table where we will write the results to') 51 | parser.add_argument('--url', required=True, help='url of the csv file') 52 | args = parser.parse_args() 53 | main(args) -------------------------------------------------------------------------------- /week1/ingest_data_green_taxi.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import argparse 3 | import os 4 | from sqlalchemy import create_engine 5 | from time import time 6 | 7 | def ingest(csv_file, table_name, engine, chunksize=100000): 8 | df_iter = pd.read_csv(csv_file, iterator=True, chunksize=chunksize, compression='gzip') 9 | run = True 10 | while run: 11 | try: 12 | t_start = time() 13 | df = next(df_iter) 14 | df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime']) 15 | df['lpep_dropoff_datetime'] = 
pd.to_datetime(df['lpep_dropoff_datetime']) 16 | df.to_sql(name=table_name, con=engine, if_exists='append') 17 | t_end = time() 18 | print(f'inserted another chunk, took {t_end-t_start:.3f} seconds') 19 | except Exception: 20 | run = False 21 | 22 | def main(params): 23 | user = params.user 24 | password = params.password 25 | host = params.host 26 | port = params.port 27 | db = params.db 28 | table_name = params.table_name 29 | csv_file = 'green_tripdata_2019-01.csv.gz' 30 | engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}') 31 | engine.connect() 32 | ingest(csv_file, table_name, engine) 33 | 34 | 35 | if __name__ == '__main__': 36 | parser = argparse.ArgumentParser(description='Ingest CSV data to Postgres') 37 | parser.add_argument('--user', required=True, help='user name for postgres') 38 | parser.add_argument('--password', required=True, help='password for postgres') 39 | parser.add_argument('--host', required=True, help='host for postgres') 40 | parser.add_argument('--port', required=True, help='port for postgres') 41 | parser.add_argument('--db', required=True, help='database name for postgres') 42 | parser.add_argument('--table_name', required=True, help='name of the table where we will write the results to') 43 | args = parser.parse_args() 44 | main(args) -------------------------------------------------------------------------------- /week1/ingest_taxi_zone_lookup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from sqlalchemy import create_engine 4 | url = 'https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv' 5 | os.system(f'wget {url}') 6 | engine = create_engine(f'postgresql://root:root@localhost:5432/ny_taxi') 7 | engine.connect() 8 | df_zones = pd.read_csv('taxi+_zone_lookup.csv') 9 | df_zones.to_sql(name='zones', con=engine, if_exists='replace') -------------------------------------------------------------------------------- /week1/terraform/.terraform-version: -------------------------------------------------------------------------------- 1 | 1.3.7 -------------------------------------------------------------------------------- /week1/terraform/README.md: -------------------------------------------------------------------------------- 1 | ### Concepts 2 | * [Terraform_overview](../1_terraform_overview.md) 3 | * [Audio](https://drive.google.com/file/d/1IqMRDwJV-m0v9_le_i2HA_UbM_sIWgWx/view?usp=sharing) 4 | 5 | ### Execution 6 | 7 | ```shell 8 | # Refresh service-account's auth-token for this session 9 | gcloud auth application-default login 10 | 11 | # Initialize state file (.tfstate) 12 | terraform init 13 | 14 | # Check changes to new infra plan 15 | terraform plan -var="project=<your-gcp-project-id>" 16 | ``` 17 | 18 | ```shell 19 | # Create new infra 20 | terraform apply -var="project=<your-gcp-project-id>" 21 | ``` 22 | 23 | ```shell 24 | # Delete infra after your work, to avoid costs on any running services 25 | terraform destroy 26 | ``` 27 | -------------------------------------------------------------------------------- /week1/terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.0" 3 | backend "local" {} # Can change from "local" to "gcs" (for google) or "s3" (for aws), if you would like to preserve your tf-state online 4 | required_providers { 5 | google = { 6 | source = "hashicorp/google" 7 | } 8 | } 9 | } 10 | 11 | provider "google" { 12 | project = var.project 13 | region = var.region 14 | //
credentials = file(var.credentials) # Use this if you do not want to set env-var GOOGLE_APPLICATION_CREDENTIALS 15 | } 16 | 17 | # Data Lake Bucket 18 | # Ref: https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket 19 | resource "google_storage_bucket" "data-lake-bucket" { 20 | name = "${local.data_lake_bucket}_${var.project}" # Concatenating DL bucket & Project name for unique naming 21 | location = var.region 22 | 23 | # Optional, but recommended settings: 24 | storage_class = var.storage_class 25 | uniform_bucket_level_access = true 26 | 27 | versioning { 28 | enabled = true 29 | } 30 | 31 | lifecycle_rule { 32 | action { 33 | type = "Delete" 34 | } 35 | condition { 36 | age = 30 // days 37 | } 38 | } 39 | 40 | force_destroy = true 41 | } 42 | 43 | # DWH 44 | # Ref: https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/bigquery_dataset 45 | resource "google_bigquery_dataset" "dataset" { 46 | dataset_id = var.BQ_DATASET 47 | project = var.project 48 | location = var.region 49 | } 50 | -------------------------------------------------------------------------------- /week1/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | data_lake_bucket = "dtc_data_lake" 3 | } 4 | 5 | variable "project" { 6 | description = "Your GCP Project ID" 7 | } 8 | 9 | variable "region" { 10 | description = "Region for GCP resources. Choose as per your location: https://cloud.google.com/about/locations" 11 | default = "europe-west6" 12 | type = string 13 | } 14 | 15 | variable "storage_class" { 16 | description = "Storage class type for your bucket. Check official docs for more info." 17 | default = "STANDARD" 18 | } 19 | 20 | variable "BQ_DATASET" { 21 | description = "BigQuery Dataset that raw data (from GCS) will be written to" 22 | type = string 23 | default = "trips_data_all" 24 | } 25 | -------------------------------------------------------------------------------- /week1/upload-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4797e788", 6 | "metadata": {}, 7 | "source": [ 8 | "## Upload data" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "0e7596d2", 14 | "metadata": {}, 15 | "source": [ 16 | "In the original zoomcamp video, the NYC dataset file is downloaded as a csv file. Currently, the dataset is only available in parquet format. Therefore, we first convert it from parquet to csv (look at Kyle A and taro.wp's comments in the [zoomcamp video](https://www.youtube.com/watch?v=2JM-ziJt0WI&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb))." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "id": "056376b1", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import pandas as pd\n", 27 | "parquet_file = './yellow_tripdata_2021-01.parquet'\n", 28 | "df = pd.read_parquet(parquet_file, engine = 'pyarrow')\n", 29 | "df.to_csv(parquet_file.replace('parquet', 'csv.gz'), index=False, compression='gzip')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "b66d10b8", 35 | "metadata": {}, 36 | "source": [ 37 | "Below we generate the SQL to create the table in the database." 
38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "id": "977050bb", 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "CREATE TABLE \"yellow_taxi_data\" (\n", 51 | "\"VendorID\" INTEGER,\n", 52 | " \"tpep_pickup_datetime\" TEXT,\n", 53 | " \"tpep_dropoff_datetime\" TEXT,\n", 54 | " \"passenger_count\" REAL,\n", 55 | " \"trip_distance\" REAL,\n", 56 | " \"RatecodeID\" REAL,\n", 57 | " \"store_and_fwd_flag\" TEXT,\n", 58 | " \"PULocationID\" INTEGER,\n", 59 | " \"DOLocationID\" INTEGER,\n", 60 | " \"payment_type\" INTEGER,\n", 61 | " \"fare_amount\" REAL,\n", 62 | " \"extra\" REAL,\n", 63 | " \"mta_tax\" REAL,\n", 64 | " \"tip_amount\" REAL,\n", 65 | " \"tolls_amount\" REAL,\n", 66 | " \"improvement_surcharge\" REAL,\n", 67 | " \"total_amount\" REAL,\n", 68 | " \"congestion_surcharge\" REAL,\n", 69 | " \"airport_fee\" REAL\n", 70 | ")\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "df = pd.read_csv('./yellow_tripdata_2021-01.csv.gz', nrows=100, compression='gzip')\n", 76 | "print(pd.io.sql.get_schema(df, name='yellow_taxi_data'))" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "id": "1288a603", 82 | "metadata": {}, 83 | "source": [ 84 | "Next, we run a simple script to ingest the data to Postgres. Note that Postgres must be running, otherwise we will not be able to connect to the database." 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 4, 90 | "id": "f55ed1a5", 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "inserted another chunk, took 3.692 seconds\n", 98 | "inserted another chunk, took 3.603 seconds\n", 99 | "inserted another chunk, took 3.637 seconds\n", 100 | "inserted another chunk, took 3.767 seconds\n", 101 | "inserted another chunk, took 3.614 seconds\n", 102 | "inserted another chunk, took 3.631 seconds\n", 103 | "inserted another chunk, took 3.632 seconds\n", 104 | "inserted another chunk, took 3.555 seconds\n", 105 | "inserted another chunk, took 3.593 seconds\n", 106 | "inserted another chunk, took 3.603 seconds\n", 107 | "inserted another chunk, took 3.615 seconds\n", 108 | "inserted another chunk, took 3.757 seconds\n" 109 | ] 110 | }, 111 | { 112 | "name": "stderr", 113 | "output_type": "stream", 114 | "text": [ 115 | "/tmp/ipykernel_14775/2370894584.py:11: DtypeWarning: Columns (6) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", 116 | " df = next(df_iter)\n" 117 | ] 118 | }, 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "inserted another chunk, took 3.989 seconds\n", 124 | "inserted another chunk, took 2.360 seconds\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "from sqlalchemy import create_engine\n", 130 | "from time import time\n", 131 | "engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')\n", 132 | "engine.connect()\n", 133 | "\n", 134 | "df_iter = pd.read_csv('./yellow_tripdata_2021-01.csv.gz', iterator=True, chunksize=100000)\n", 135 | "run = True\n", 136 | "while run:\n", 137 | " try:\n", 138 | " t_start = time()\n", 139 | " df = next(df_iter)\n", 140 | " df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])\n", 141 | " df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])\n", 142 | " df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')\n", 143 | " t_end = time()\n", 144 | " print(f'inserted another chunk, took {t_end-t_start:.3f} seconds')\n", 145 | " except Exception:\n", 146 | " run = False" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "368cb484", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 3 (ipykernel)", 161 | "language": "python", 162 | "name": "python3" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.9.15" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 5 179 | } 180 | -------------------------------------------------------------------------------- /week2/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM prefecthq/prefect:2.7.7-python3.9 2 | 3 | COPY docker-requirements.txt . 4 | 5 | RUN pip install -r docker-requirements.txt --trusted-host pypi.python.org --no-cache-dir 6 | RUN mkdir /opt/prefect/flows 7 | 8 | COPY parameterized_flow.py /opt/prefect/flows/parameterized_flow.py -------------------------------------------------------------------------------- /week2/blocks/make_docker_block.py: -------------------------------------------------------------------------------- 1 | from prefect.infrastructure.docker import DockerContainer 2 | 3 | # alternative to creating DockerContainer block in the UI 4 | docker_block = DockerContainer( 5 | image="discdiver/prefect:zoom", # insert your image here 6 | image_pull_policy="ALWAYS", 7 | auto_remove=True, 8 | ) 9 | 10 | docker_block.save("zoom", overwrite=True) -------------------------------------------------------------------------------- /week2/blocks/make_gcp_blocks.py: -------------------------------------------------------------------------------- 1 | from prefect_gcp import GcpCredentials 2 | from prefect_gcp.cloud_storage import GcsBucket 3 | 4 | # alternative to creating GCP blocks in the UI 5 | # insert your own service_account_file path or service_account_info dictionary from the json file 6 | # IMPORTANT - do not store credentials in a publicly available repository! 7 | 8 | 9 | credentials_block = GcpCredentials( 10 | service_account_info={} # enter your credentials info or use the file method. 
11 | ) 12 | credentials_block.save("zoom-gcp-creds", overwrite=True) 13 | 14 | 15 | bucket_block = GcsBucket( 16 | gcp_credentials=GcpCredentials.load("zoom-gcp-creds"), 17 | bucket="prefect-de-zoomcamp", # insert your GCS bucket name 18 | ) 19 | 20 | bucket_block.save("zoom-gcs", overwrite=True) -------------------------------------------------------------------------------- /week2/de-zoomcamp-week2.yaml: -------------------------------------------------------------------------------- 1 | name: de-zoomcamp-week2 2 | channels: 3 | - defaults 4 | dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - _openmp_mutex=5.1=1_gnu 7 | - ca-certificates=2023.01.10=h06a4308_0 8 | - certifi=2022.12.7=py39h06a4308_0 9 | - ld_impl_linux-64=2.38=h1181459_1 10 | - libffi=3.4.2=h6a678d5_6 11 | - libgcc-ng=11.2.0=h1234567_1 12 | - libgomp=11.2.0=h1234567_1 13 | - libstdcxx-ng=11.2.0=h1234567_1 14 | - ncurses=6.3=h5eee18b_3 15 | - openssl=1.1.1s=h7f8727e_0 16 | - pip=22.3.1=py39h06a4308_0 17 | - python=3.9.16=h7a1cb2a_0 18 | - readline=8.2=h5eee18b_0 19 | - setuptools=65.6.3=py39h06a4308_0 20 | - sqlite=3.40.1=h5082296_0 21 | - tk=8.6.12=h1ccaba5_0 22 | - wheel=0.37.1=pyhd3eb1b0_0 23 | - xz=5.2.10=h5eee18b_1 24 | - zlib=1.2.13=h5eee18b_0 25 | - pip: 26 | - aiosqlite==0.18.0 27 | - alembic==1.9.2 28 | - anyio==3.6.2 29 | - apprise==1.2.1 30 | - asgi-lifespan==2.0.0 31 | - asyncpg==0.27.0 32 | - cachetools==5.3.0 33 | - cffi==1.15.1 34 | - charset-normalizer==3.0.1 35 | - click==8.1.3 36 | - cloudpickle==2.2.1 37 | - colorama==0.4.6 38 | - coolname==2.2.0 39 | - croniter==1.3.8 40 | - cryptography==39.0.0 41 | - dateparser==1.1.6 42 | - db-dtypes==1.0.5 43 | - docker==6.0.1 44 | - fastapi==0.89.1 45 | - fsspec==2023.1.0 46 | - google-api-core==2.11.0 47 | - google-api-python-client==2.74.0 48 | - google-auth==2.16.0 49 | - google-auth-httplib2==0.1.0 50 | - google-auth-oauthlib==0.8.0 51 | - google-cloud-bigquery==3.4.2 52 | - google-cloud-bigquery-storage==2.18.1 53 | - google-cloud-core==2.3.2 54 | - google-cloud-storage==2.7.0 55 | - google-crc32c==1.5.0 56 | - google-resumable-media==2.4.1 57 | - googleapis-common-protos==1.58.0 58 | - greenlet==2.0.1 59 | - griffe==0.25.4 60 | - grpcio==1.51.1 61 | - grpcio-status==1.51.1 62 | - h11==0.14.0 63 | - h2==4.1.0 64 | - hpack==4.0.0 65 | - httpcore==0.16.3 66 | - httplib2==0.21.0 67 | - httpx==0.23.3 68 | - hyperframe==6.0.1 69 | - idna==3.4 70 | - importlib-metadata==6.0.0 71 | - jinja2==3.1.2 72 | - jsonpatch==1.32 73 | - jsonpointer==2.3 74 | - kubernetes==25.3.0 75 | - mako==1.2.4 76 | - markdown==3.4.1 77 | - markdown-it-py==2.1.0 78 | - markupsafe==2.1.2 79 | - mdurl==0.1.2 80 | - numpy==1.24.1 81 | - oauthlib==3.2.2 82 | - orjson==3.8.5 83 | - packaging==23.0 84 | - pandas==1.5.2 85 | - pandas-gbq==0.18.1 86 | - pathspec==0.11.0 87 | - pendulum==2.1.2 88 | - prefect==2.7.7 89 | - prefect-gcp==0.2.4 90 | - prefect-sqlalchemy==0.2.2 91 | - proto-plus==1.22.2 92 | - protobuf==4.21.11 93 | - psycopg2-binary==2.9.5 94 | - pyarrow==10.0.1 95 | - pyasn1==0.4.8 96 | - pyasn1-modules==0.2.8 97 | - pycparser==2.21 98 | - pydantic==1.10.4 99 | - pydata-google-auth==1.5.0 100 | - pygments==2.14.0 101 | - pyparsing==3.0.9 102 | - python-dateutil==2.8.2 103 | - python-slugify==7.0.0 104 | - pytz==2022.7.1 105 | - pytz-deprecation-shim==0.1.0.post0 106 | - pytzdata==2020.1 107 | - pyyaml==6.0 108 | - readchar==4.0.3 109 | - regex==2022.10.31 110 | - requests==2.28.2 111 | - requests-oauthlib==1.3.1 112 | - rfc3986==1.5.0 113 | - rich==13.2.0 114 | - rsa==4.9 115 | - 
six==1.16.0 116 | - sniffio==1.3.0 117 | - sqlalchemy==1.4.46 118 | - starlette==0.22.0 119 | - text-unidecode==1.3 120 | - toml==0.10.2 121 | - typer==0.7.0 122 | - typing-extensions==4.4.0 123 | - tzdata==2022.7 124 | - tzlocal==4.2 125 | - uritemplate==4.1.1 126 | - urllib3==1.26.14 127 | - uvicorn==0.20.0 128 | - websocket-client==1.4.2 129 | - zipp==3.11.0 130 | prefix: /home/padilha/miniconda3/envs/de-zoomcamp-week2 131 | -------------------------------------------------------------------------------- /week2/docker-requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.5.2 2 | prefect-gcp[cloud_storage]==0.2.4 3 | protobuf==4.21.11 4 | pyarrow==10.0.1 5 | pandas-gbq==0.18.1 -------------------------------------------------------------------------------- /week2/docker_deploy.py: -------------------------------------------------------------------------------- 1 | from prefect.deployments import Deployment 2 | from parameterized_flow import etl_parent_flow 3 | from prefect.infrastructure.docker import DockerContainer 4 | 5 | docker_block = DockerContainer.load("zoom") 6 | 7 | docker_dep = Deployment.build_from_flow( 8 | flow=etl_parent_flow, 9 | name='docker-flow', 10 | infrastructure=docker_block 11 | ) 12 | 13 | if __name__ == '__main__': 14 | docker_dep.apply() -------------------------------------------------------------------------------- /week2/etl_gcs_to_bq.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pathlib import Path 3 | from prefect import flow, task 4 | from prefect_gcp.cloud_storage import GcsBucket 5 | from prefect_gcp import GcpCredentials 6 | 7 | @task(retries=3) 8 | def extract_from_gcs(color: str, year: int, month: int) -> Path: 9 | """Download trip data from GCS""" 10 | gcs_path = f'data/{color}/{color}_tripdata_{year}-{month:02}.parquet' 11 | gcs_block = GcsBucket.load('zoomcamp-gcs') 12 | gcs_block.get_directory(from_path=gcs_path, local_path='./') 13 | return Path(gcs_path) 14 | 15 | @task() 16 | def transform(path: Path) -> pd.DataFrame: 17 | """Data cleaning example""" 18 | df = pd.read_parquet(path) 19 | print(f"pre: missing passenger count: {df['passenger_count'].isna().sum()}") 20 | df['passenger_count'].fillna(0, inplace=True) 21 | print(f"post: missing passenger count: {df['passenger_count'].isna().sum()}") 22 | return df 23 | 24 | @task() 25 | def write_bq(df: pd.DataFrame) -> None: 26 | """Write DataFrame to BigQuery""" 27 | gcp_credentials_block = GcpCredentials.load('zoomcamp-gcp-credentials') 28 | df.to_gbq( 29 | destination_table='trips_data_all.yellow_taxi_trips', 30 | project_id='dtc-de-375514', 31 | credentials=gcp_credentials_block.get_credentials_from_service_account(), 32 | chunksize=500_000, 33 | if_exists='append' 34 | ) 35 | 36 | @flow() 37 | def etl_gcs_to_bq(): 38 | """Main ETL flow to load data into BigQuery""" 39 | color = 'yellow' 40 | year = 2021 41 | month = 1 42 | path = extract_from_gcs(color, year, month) 43 | df = transform(path) 44 | write_bq(df) 45 | 46 | if __name__ == '__main__': 47 | etl_gcs_to_bq() -------------------------------------------------------------------------------- /week2/etl_web_to_gcs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pathlib import Path 3 | from prefect import flow, task 4 | from prefect_gcp.cloud_storage import GcsBucket 5 | # from random import randint 6 | 7 | @task(retries=3) 8 | def 
fetch(dataset_url: str) -> pd.DataFrame: 9 | """Read taxi data from web into pandas DataFrame""" 10 | # simulating failure to test retries 11 | # if randint(0, 1) == 1: 12 | # raise Exception() 13 | df = pd.read_csv(dataset_url) 14 | return df 15 | 16 | @task(log_prints=True) 17 | def clean(df: pd.DataFrame) -> pd.DataFrame: 18 | """Fix some dtype issues""" 19 | df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime']) 20 | df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime']) 21 | print(df.head(5)) 22 | print(f'columns: {df.dtypes}') 23 | print(f'rows: {len(df)}') 24 | return df 25 | 26 | @task() 27 | def write_local(df: pd.DataFrame, color: str, dataset_file: str) -> Path: 28 | """Write DataFrame out as parquet file""" 29 | data_dir = f'data/{color}' 30 | Path(data_dir).mkdir(parents=True, exist_ok=True) 31 | path = Path(f'{data_dir}/{dataset_file}.parquet') 32 | df.to_parquet(path, compression='gzip') 33 | return path 34 | 35 | @task() 36 | def write_gcs(path: Path) -> None: 37 | """Upload local parquet file to GCS""" 38 | gcp_cloud_storage_bucket_block = GcsBucket.load("zoomcamp-gcs") 39 | gcp_cloud_storage_bucket_block.upload_from_path(from_path=path, to_path=path) 40 | 41 | @flow() 42 | def etl_web_to_gcs() -> None: 43 | """The main ETL function""" 44 | color = 'yellow' 45 | year = 2021 46 | month = 1 47 | dataset_file = f'{color}_tripdata_{year}-{month:02}' 48 | dataset_url = f'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{color}/{dataset_file}.csv.gz' 49 | df = fetch(dataset_url) 50 | df = clean(df) 51 | path = write_local(df, color, dataset_file) 52 | write_gcs(path) 53 | 54 | if __name__ == '__main__': 55 | etl_web_to_gcs() -------------------------------------------------------------------------------- /week2/homework/etl_gcs_to_bq.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | from prefect import flow, task 4 | from prefect_gcp.cloud_storage import GcsBucket 5 | from prefect_gcp import GcpCredentials 6 | 7 | @task(retries=3) 8 | def extract_from_gcs(color: str, year: int, month: int) -> Path: 9 | """Download trip data from GCS""" 10 | gcs_path = f"data/{color}/{color}_tripdata_{year}-{month:02}.parquet" 11 | gcs_block = GcsBucket.load("zoomcamp-gcs") 12 | gcs_block.get_directory(from_path=gcs_path, local_path=f"../data/") 13 | return Path(f"../data/{gcs_path}") 14 | 15 | @task() 16 | def write_bq(df: pd.DataFrame) -> None: 17 | """Write DataFrame to BiqQuery""" 18 | gcp_credentials_block = GcpCredentials.load("zoomcamp-gcp-credentials") 19 | df.to_gbq( 20 | destination_table="trips_data_all.yellow_taxi_trips", 21 | project_id="dtc-de-375514", 22 | credentials=gcp_credentials_block.get_credentials_from_service_account(), 23 | chunksize=500_000, 24 | if_exists="append" 25 | ) 26 | 27 | @flow(log_prints=True) 28 | def etl_gcs_to_bq(months: list[int] = [2, 3], year: int = 2019, color: str = 'yellow'): 29 | """Main ETL flow to load data into Big Query""" 30 | total_rows = 0 31 | for month in months: 32 | path = extract_from_gcs(color, year, month) 33 | df = pd.read_parquet(path) 34 | print(f'{color} {year}-{month} rows: {len(df)}') 35 | total_rows += len(df) 36 | write_bq(df) 37 | print(f'Processed rows (total): {total_rows}') 38 | 39 | if __name__ == "__main__": 40 | etl_gcs_to_bq() 41 | -------------------------------------------------------------------------------- /week2/homework/etl_web_to_gcs.py: 
-------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | from prefect import flow, task 4 | from prefect_gcp.cloud_storage import GcsBucket 5 | from random import randint 6 | 7 | 8 | @task(retries=3) 9 | def fetch(dataset_url: str) -> pd.DataFrame: 10 | """Read taxi data from web into pandas DataFrame""" 11 | # if randint(0, 1) > 0: 12 | # raise Exception 13 | 14 | df = pd.read_csv(dataset_url) 15 | return df 16 | 17 | 18 | @task(log_prints=True) 19 | def clean(df: pd.DataFrame) -> pd.DataFrame: 20 | """Fix dtype issues""" 21 | df["lpep_pickup_datetime"] = pd.to_datetime(df["lpep_pickup_datetime"]) 22 | df["lpep_dropoff_datetime"] = pd.to_datetime(df["lpep_dropoff_datetime"]) 23 | print(df.head(2)) 24 | print(f"columns: {df.dtypes}") 25 | print(f"rows: {len(df)}") 26 | return df 27 | 28 | 29 | @task() 30 | def write_local(df: pd.DataFrame, color: str, dataset_file: str) -> Path: 31 | """Write DataFrame out locally as parquet file""" 32 | path = Path(f"/home/padilha/projects/de-zoomcamp/data/{color}/{dataset_file}.parquet") 33 | df.to_parquet(path, compression="gzip") 34 | return path 35 | 36 | 37 | @task() 38 | def write_gcs(path: Path) -> None: 39 | """Upload local parquet file to GCS""" 40 | gcs_block = GcsBucket.load("zoomcamp-gcs") 41 | gcs_block.upload_from_path(from_path=path, to_path=path) 42 | return 43 | 44 | 45 | @flow(log_prints=True) 46 | def etl_web_to_gcs(color='green', year=2019, month=4) -> None: 47 | """The main ETL function""" 48 | dataset_file = f"{color}_tripdata_{year}-{month:02}" 49 | dataset_url = f"https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{color}/{dataset_file}.csv.gz" 50 | df = fetch(dataset_url) 51 | df_clean = clean(df) 52 | print(f'Processed rows: {len(df_clean)}') 53 | path = write_local(df_clean, color, dataset_file) 54 | write_gcs(path) 55 | 56 | if __name__ == "__main__": 57 | etl_web_to_gcs() 58 | -------------------------------------------------------------------------------- /week2/img/bq_create_table1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/bq_create_table1.png -------------------------------------------------------------------------------- /week2/img/bq_create_table2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/bq_create_table2.png -------------------------------------------------------------------------------- /week2/img/bq_delete_data1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/bq_delete_data1.png -------------------------------------------------------------------------------- /week2/img/bq_delete_data2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/bq_delete_data2.png -------------------------------------------------------------------------------- /week2/img/bq_delete_data3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/bq_delete_data3.png -------------------------------------------------------------------------------- /week2/img/deployment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/deployment.png -------------------------------------------------------------------------------- /week2/img/docker_block1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/docker_block1.png -------------------------------------------------------------------------------- /week2/img/docker_block2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/docker_block2.png -------------------------------------------------------------------------------- /week2/img/docker_deploy_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/docker_deploy_result.png -------------------------------------------------------------------------------- /week2/img/gcp_credentials_block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/gcp_credentials_block.png -------------------------------------------------------------------------------- /week2/img/gcs_bucket_block1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/gcs_bucket_block1.png -------------------------------------------------------------------------------- /week2/img/gcs_bucket_block2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/gcs_bucket_block2.png -------------------------------------------------------------------------------- /week2/img/gcs_bucket_block3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/gcs_bucket_block3.png -------------------------------------------------------------------------------- /week2/img/gcs_bucket_block4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/gcs_bucket_block4.png -------------------------------------------------------------------------------- /week2/img/gcs_parameterized_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/gcs_parameterized_flow.png -------------------------------------------------------------------------------- /week2/img/homework_Q2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/homework_Q2.png -------------------------------------------------------------------------------- /week2/img/homework_Q5_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/homework_Q5_1.png -------------------------------------------------------------------------------- /week2/img/homework_Q5_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/homework_Q5_2.png -------------------------------------------------------------------------------- /week2/img/homework_Q5_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/homework_Q5_3.png -------------------------------------------------------------------------------- /week2/img/homework_Q5_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/homework_Q5_4.png -------------------------------------------------------------------------------- /week2/img/homework_Q6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/homework_Q6.png -------------------------------------------------------------------------------- /week2/img/quickrun.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/quickrun.png -------------------------------------------------------------------------------- /week2/img/records.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/records.png -------------------------------------------------------------------------------- /week2/img/schedule1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/schedule1.png -------------------------------------------------------------------------------- /week2/img/schedule2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/schedule2.png -------------------------------------------------------------------------------- /week2/img/schedule3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/schedule3.png -------------------------------------------------------------------------------- /week2/img/sql-block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/sql-block.png 
-------------------------------------------------------------------------------- /week2/img/subflows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/subflows.png -------------------------------------------------------------------------------- /week2/img/uploaded_data_gcp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week2/img/uploaded_data_gcp.png -------------------------------------------------------------------------------- /week2/ingest_data_flow.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | from sqlalchemy import create_engine 4 | from time import time 5 | from prefect import flow, task 6 | 7 | def parquet_to_csv(parquet_file, csv_file): 8 | df = pd.read_parquet(parquet_file, engine = 'pyarrow') 9 | df.to_csv(csv_file, index=False) 10 | 11 | @task(log_prints=True, retries=3) 12 | def ingest(csv_file, table_name, engine, chunksize=100000): 13 | df_iter = pd.read_csv(csv_file, iterator=True, chunksize=chunksize) 14 | run = True 15 | while run: 16 | try: 17 | t_start = time() 18 | df = next(df_iter) 19 | df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime']) 20 | df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime']) 21 | df.to_sql(name=table_name, con=engine, if_exists='append') 22 | t_end = time() 23 | print(f'inserted another chunk, took {t_end-t_start:.3f} seconds') 24 | except Exception: 25 | run = False 26 | 27 | @flow(name='Ingest Data') 28 | def main_flow(): 29 | user = 'root' 30 | password = 'root' 31 | host = 'localhost' 32 | port = '5432' 33 | db = 'ny_taxi' 34 | table_name = 'yellow_taxi_trips' 35 | url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet' 36 | parquet_file = 'output.parquet' 37 | csv_file = 'output.csv' 38 | os.system(f'wget {url} -O {parquet_file}') 39 | parquet_to_csv(parquet_file, csv_file) 40 | engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}') 41 | engine.connect() 42 | ingest(csv_file, table_name, engine) 43 | 44 | 45 | if __name__ == '__main__': 46 | # this flow uses hardcoded connection parameters, so no CLI arguments are needed 47 | main_flow() -------------------------------------------------------------------------------- /week2/ingest_data_flow_etl.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import argparse 4 | from sqlalchemy import create_engine 5 | from time import time 6 | from prefect import flow, task 7 | 8 | @task(log_prints=True) 9 | def extract(url: str, parque_file: str = 
'output.parquet', csv_file: str = 'output.csv'): 10 | parquet_file = 'output.parquet' 11 | os.system(f'wget {url} -O {parquet_file}') 12 | df = pd.read_parquet(parquet_file, engine = 'pyarrow') 13 | df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime']) 14 | df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime']) 15 | return df 16 | 17 | @task(log_prints=True) 18 | def transform(df: pd.DataFrame): 19 | print(f"pre: missing passenger count: {(df['passenger_count'] == 0).sum()}") 20 | df = df[df['passenger_count'] != 0] 21 | print(f"post: missing passenger count: {(df['passenger_count'] == 0).sum()}") 22 | return df 23 | 24 | @task(log_prints=True, retries=3) 25 | def load(user, password, host, port, db, table_name, df): 26 | engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}') 27 | engine.connect() 28 | df.to_sql(name=table_name, con=engine, if_exists='append') 29 | 30 | @flow(name='Ingest Data') 31 | def main_flow(args): 32 | user = args.user 33 | password = args.password 34 | host = args.host 35 | port = args.port 36 | db = args.db 37 | table_name = args.table_name 38 | url = args.url 39 | raw_data = extract(url) 40 | data = transform(raw_data) 41 | load(user, password, host, port, db, table_name, data) 42 | 43 | if __name__ == '__main__': 44 | parser = argparse.ArgumentParser(description='Ingest CSV data to Postgres') 45 | parser.add_argument('--user', required=False, help='user name for postgres', default='root') 46 | parser.add_argument('--password', required=False, help='password for postgres', default='root') 47 | parser.add_argument('--host', required=False, help='host for postgres', default='localhost') 48 | parser.add_argument('--port', required=False, help='port for postgres', default='5432') 49 | parser.add_argument('--db', required=False, help='database name for postgres', default='ny_taxi') 50 | parser.add_argument('--table_name', required=False, help='name of the table where we will write the results to', default='yellow_taxi_trips') 51 | DEFAULT_URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet' 52 | parser.add_argument('--url', required=False, help='url of the csv file', default=DEFAULT_URL) 53 | args = parser.parse_args() 54 | main_flow(args) -------------------------------------------------------------------------------- /week2/ingest_data_flow_etl_with_sql_block.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import argparse 4 | from sqlalchemy import create_engine 5 | from datetime import timedelta 6 | from prefect import flow, task 7 | from prefect.tasks import task_input_hash 8 | from prefect_sqlalchemy import SqlAlchemyConnector 9 | 10 | @task(log_prints=True, tags=['extract'], cache_key_fn=task_input_hash, cache_expiration=timedelta(days=1)) 11 | def extract(url: str, parque_file: str = 'output.parquet', csv_file: str = 'output.csv'): 12 | parquet_file = 'output.parquet' 13 | os.system(f'wget {url} -O {parquet_file}') 14 | df = pd.read_parquet(parquet_file, engine = 'pyarrow') 15 | df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime']) 16 | df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime']) 17 | return df 18 | 19 | @task(log_prints=True) 20 | def transform(df: pd.DataFrame): 21 | print(f"pre: missing passenger count: {(df['passenger_count'] == 0).sum()}") 22 | df = df[df['passenger_count'] != 0] 23 | print(f"post: missing passenger count: 
{(df['passenger_count'] == 0).sum()}") 24 | return df 25 | 26 | @task(log_prints=True, retries=3) 27 | def load(table_name, df): 28 | connection_block = SqlAlchemyConnector.load("postgres-connector") 29 | with connection_block.get_connection(begin=False) as engine: 30 | df.to_sql(name=table_name, con=engine, if_exists='append') 31 | 32 | @flow(name='Ingest Data') 33 | def main_flow(args): 34 | table_name = args.table_name 35 | url = args.url 36 | raw_data = extract(url) 37 | data = transform(raw_data) 38 | load(table_name, data) 39 | 40 | if __name__ == '__main__': 41 | parser = argparse.ArgumentParser(description='Ingest CSV data to Postgres') 42 | parser.add_argument('--table_name', required=False, help='name of the table where we will write the results to', default='yellow_taxi_trips') 43 | DEFAULT_URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet' 44 | parser.add_argument('--url', required=False, help='url of the csv file', default=DEFAULT_URL) 45 | args = parser.parse_args() 46 | main_flow(args) -------------------------------------------------------------------------------- /week2/parameterized_flow.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pathlib import Path 3 | from datetime import timedelta 4 | from prefect import flow, task 5 | from prefect_gcp.cloud_storage import GcsBucket 6 | # from prefect.tasks import task_input_hash 7 | # from random import randint 8 | 9 | # For some reason, the commented @task below is responsible for an exception when deploying the code using 10 | # the docker block as the infrastructure. It has something to do with cache_key_fn and cache_expiration. 11 | # I discovered this solution by reading a thread in the course's Slack. 
12 | # See: 13 | # https://datatalks-club.slack.com/archives/C01FABYF2RG/p1674823816614039 14 | # https://github.com/PrefectHQ/prefect/issues/6086 15 | # @task(retries=3, cache_key_fn=task_input_hash, cache_expiration=timedelta(days=1)) 16 | @task(retries=3) 17 | def fetch(dataset_url: str) -> pd.DataFrame: 18 | """Read taxi data from web into pandas DataFrame""" 19 | # simulating failure to test retries 20 | # if randint(0, 1) == 1: 21 | # raise Exception() 22 | df = pd.read_csv(dataset_url) 23 | return df 24 | 25 | @task(log_prints=True) 26 | def clean(df: pd.DataFrame) -> pd.DataFrame: 27 | """Fix some dtype issues""" 28 | df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime']) 29 | df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime']) 30 | print(df.head(5)) 31 | print(f'columns: {df.dtypes}') 32 | print(f'rows: {len(df)}') 33 | return df 34 | 35 | @task() 36 | def write_local(df: pd.DataFrame, color: str, dataset_file: str) -> Path: 37 | """Write DataFrame out as parquet file""" 38 | data_dir = f'data/{color}' 39 | Path(data_dir).mkdir(parents=True, exist_ok=True) 40 | path = Path(f'{data_dir}/{dataset_file}.parquet') 41 | df.to_parquet(path, compression='gzip') 42 | return path 43 | 44 | @task() 45 | def write_gcs(path: Path) -> None: 46 | """Upload local parquet file to GCS""" 47 | gcp_cloud_storage_bucket_block = GcsBucket.load("zoomcamp-gcs") 48 | gcp_cloud_storage_bucket_block.upload_from_path(from_path=path, to_path=path) 49 | 50 | @flow() 51 | def etl_web_to_gcs(year: int, month: int, color: str) -> None: 52 | """The main ETL function""" 53 | dataset_file = f'{color}_tripdata_{year}-{month:02}' 54 | dataset_url = f'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{color}/{dataset_file}.csv.gz' 55 | df = fetch(dataset_url) 56 | df = clean(df) 57 | path = write_local(df, color, dataset_file) 58 | write_gcs(path) 59 | 60 | @flow() 61 | def etl_parent_flow(months: list[int] = [1, 2], year: int = 2021, color: str = 'yellow') -> None: 62 | for month in months: 63 | etl_web_to_gcs(year, month, color) 64 | 65 | if __name__ == '__main__': 66 | color = 'yellow' 67 | months = [1, 2, 3] 68 | year = 2021 69 | etl_parent_flow(months, year, color) -------------------------------------------------------------------------------- /week2/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.5.2 2 | prefect==2.7.7 3 | prefect-sqlalchemy==0.2.2 4 | prefect-gcp[cloud_storage]==0.2.4 5 | protobuf==4.21.11 6 | pyarrow==10.0.1 7 | pandas-gbq==0.18.1 8 | psycopg2-binary==2.9.5 9 | sqlalchemy==1.4.46 -------------------------------------------------------------------------------- /week3/big_query_ml.sql: -------------------------------------------------------------------------------- 1 | -- SELECT THE COLUMNS INTERESTED FOR YOU 2 | SELECT passenger_count, trip_distance, PULocationID, DOLocationID, payment_type, fare_amount, tolls_amount, tip_amount 3 | FROM `taxi-rides-ny.nytaxi.yellow_tripdata_partitoned` WHERE fare_amount != 0; 4 | 5 | -- CREATE A ML TABLE WITH APPROPRIATE TYPE 6 | CREATE OR REPLACE TABLE `taxi-rides-ny.nytaxi.yellow_tripdata_ml` ( 7 | `passenger_count` INTEGER, 8 | `trip_distance` FLOAT64, 9 | `PULocationID` STRING, 10 | `DOLocationID` STRING, 11 | `payment_type` STRING, 12 | `fare_amount` FLOAT64, 13 | `tolls_amount` FLOAT64, 14 | `tip_amount` FLOAT64 15 | ) AS ( 16 | SELECT passenger_count, trip_distance, cast(PULocationID AS STRING), CAST(DOLocationID AS STRING), 17 | 
CAST(payment_type AS STRING), fare_amount, tolls_amount, tip_amount 18 | FROM `taxi-rides-ny.nytaxi.yellow_tripdata_partitoned` WHERE fare_amount != 0 19 | ); 20 | 21 | -- CREATE MODEL WITH DEFAULT SETTING 22 | CREATE OR REPLACE MODEL `taxi-rides-ny.nytaxi.tip_model` 23 | OPTIONS 24 | (model_type='linear_reg', 25 | input_label_cols=['tip_amount'], 26 | DATA_SPLIT_METHOD='AUTO_SPLIT') AS 27 | SELECT 28 | * 29 | FROM 30 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 31 | WHERE 32 | tip_amount IS NOT NULL; 33 | 34 | -- CHECK FEATURES 35 | SELECT * FROM ML.FEATURE_INFO(MODEL `taxi-rides-ny.nytaxi.tip_model`); 36 | 37 | -- EVALUATE THE MODEL 38 | SELECT 39 | * 40 | FROM 41 | ML.EVALUATE(MODEL `taxi-rides-ny.nytaxi.tip_model`, 42 | ( 43 | SELECT 44 | * 45 | FROM 46 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 47 | WHERE 48 | tip_amount IS NOT NULL 49 | )); 50 | 51 | -- PREDICT THE MODEL 52 | SELECT 53 | * 54 | FROM 55 | ML.PREDICT(MODEL `taxi-rides-ny.nytaxi.tip_model`, 56 | ( 57 | SELECT 58 | * 59 | FROM 60 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 61 | WHERE 62 | tip_amount IS NOT NULL 63 | )); 64 | 65 | -- PREDICT AND EXPLAIN 66 | SELECT 67 | * 68 | FROM 69 | ML.EXPLAIN_PREDICT(MODEL `taxi-rides-ny.nytaxi.tip_model`, 70 | ( 71 | SELECT 72 | * 73 | FROM 74 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 75 | WHERE 76 | tip_amount IS NOT NULL 77 | ), STRUCT(3 as top_k_features)); 78 | 79 | -- HYPER PARAM TUNNING 80 | CREATE OR REPLACE MODEL `taxi-rides-ny.nytaxi.tip_hyperparam_model` 81 | OPTIONS 82 | (model_type='linear_reg', 83 | input_label_cols=['tip_amount'], 84 | DATA_SPLIT_METHOD='AUTO_SPLIT', 85 | num_trials=5, 86 | max_parallel_trials=2, 87 | l1_reg=hparam_range(0, 20), 88 | l2_reg=hparam_candidates([0, 0.1, 1, 10])) AS 89 | SELECT 90 | * 91 | FROM 92 | `taxi-rides-ny.nytaxi.yellow_tripdata_ml` 93 | WHERE 94 | tip_amount IS NOT NULL; -------------------------------------------------------------------------------- /week3/download.py: -------------------------------------------------------------------------------- 1 | # script to download the fhv tripdata 2 | import os 3 | for month in range(1, 13): 4 | url = f'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-{month:02d}.csv.gz' 5 | os.system('wget ' + url) -------------------------------------------------------------------------------- /week3/homework.md: -------------------------------------------------------------------------------- 1 | ## Week 3 Homework 2 | Important Note:

You can load the data however you would like, but keep the files in .GZ format. 3 | If you are using an orchestration tool such as Airflow or Prefect, do not load the data into Big Query using the orchestrator.
4 | Stop after loading the files into a bucket.

5 | NOTE: You can use the CSV format option for the .GZ files when creating an External Table.
6 | 7 | SETUP:
8 | Create an external table using the fhv 2019 data.
9 | Create a table in BQ using the fhv 2019 data (do not partition or cluster this table).
10 | Data can be found here: https://github.com/DataTalksClub/nyc-tlc-data/releases/tag/fhv
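For reference, the setup can be sketched with two statements like the ones below. This is a minimal sketch, assuming the twelve `fhv_tripdata_2019-*.csv.gz` files downloaded by `week3/download.py` were first uploaded to a GCS bucket; the bucket path is a placeholder, while the dataset and table names match the ones used in the solutions that follow.

```sql
-- External table that reads the .csv.gz files directly from the bucket
-- (the gs:// path is a placeholder; adjust it to your own bucket)
CREATE OR REPLACE EXTERNAL TABLE `dtc-de-375514.week3_homework.external_fhv_tripdata`
OPTIONS (
  format = 'CSV',
  uris = ['gs://<your-bucket>/fhv_tripdata_2019-*.csv.gz']
);

-- Regular (non-partitioned, non-clustered) BigQuery table materialized from the external table
CREATE OR REPLACE TABLE `dtc-de-375514.week3_homework.fhv_tripdata` AS
SELECT * FROM `dtc-de-375514.week3_homework.external_fhv_tripdata`;
```

A quick `SELECT COUNT(1)` on the external table (as in Question 1) is an easy way to check that all twelve files were picked up.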

11 | 12 | ## Question 1: 13 | What is the count of fhv vehicle records for the year 2019? 14 | - 65,623,481 15 | - 43,244,696 16 | - 22,978,333 17 | - 13,942,414 18 | 19 | ### Solution 20 | 21 | ```sql 22 | SELECT COUNT(1) FROM `dtc-de-375514.week3_homework.external_fhv_tripdata`; 23 | ``` 24 | 25 | **Answer:** 43,244,696 26 | 27 | ## Question 2: 28 | Write a query to count the number of distinct affiliated_base_number values for the entire dataset on both tables.
29 | What is the estimated amount of data that will be read when this query is executed on the External Table and the Table? 30 | 31 | - 25.2 MB for the External Table and 100.87MB for the BQ Table 32 | - 225.82 MB for the External Table and 47.60MB for the BQ Table 33 | - 0 MB for the External Table and 0MB for the BQ Table 34 | - 0 MB for the External Table and 317.94MB for the BQ Table 35 | 36 | ### Solution 37 | 38 | ```sql 39 | SELECT COUNT(DISTINCT affiliated_base_number) FROM `dtc-de-375514.week3_homework.external_fhv_tripdata`; 40 | ``` 41 | Output: This query will process 0 B when run. 42 | 43 | ```sql 44 | SELECT COUNT(DISTINCT affiliated_base_number) FROM `dtc-de-375514.week3_homework.fhv_tripdata`; 45 | ``` 46 | Output: This query will process 317.94 MB when run. 47 | 48 | **Answer:** 0 MB for the External Table and 317.94MB for the BQ Table 49 | 50 | ## Question 3: 51 | How many records have both a blank (null) PUlocationID and DOlocationID in the entire dataset? 52 | - 717,748 53 | - 1,215,687 54 | - 5 55 | - 20,332 56 | 57 | ### Solution 58 | 59 | ```sql 60 | SELECT COUNT(1) from `dtc-de-375514.week3_homework.fhv_tripdata` WHERE PUlocationID IS NULL AND DOlocationID IS NULL; 61 | ``` 62 | 63 | **Answer:** 717,748 64 | 65 | ## Question 4: 66 | What is the best strategy to optimize the table if query always filter by pickup_datetime and order by affiliated_base_number? 67 | - Cluster on pickup_datetime Cluster on affiliated_base_number 68 | - Partition by pickup_datetime Cluster on affiliated_base_number 69 | - Partition by pickup_datetime Partition by affiliated_base_number 70 | - Partition by affiliated_base_number Cluster on pickup_datetime 71 | 72 | **Answer:** Partition by pickup_datetime Cluster on affiliated_base_number 73 | 74 | ## Question 5: 75 | Implement the optimized solution you chose for question 4. Write a query to retrieve the distinct affiliated_base_number between pickup_datetime 2019/03/01 and 2019/03/31 (inclusive).
76 | Use the BQ table you created earlier in your from clause and note the estimated bytes. Now change the table in the from clause to the partitioned table you created for question 4 and note the estimated bytes processed. What are these values? Choose the answer which most closely matches. 77 | - 12.82 MB for non-partitioned table and 647.87 MB for the partitioned table 78 | - 647.87 MB for non-partitioned table and 23.06 MB for the partitioned table 79 | - 582.63 MB for non-partitioned table and 0 MB for the partitioned table 80 | - 646.25 MB for non-partitioned table and 646.25 MB for the partitioned table 81 | 82 | ### Solution 83 | 84 | Querying the non-partitioned table. 85 | ```sql 86 | SELECT COUNT(DISTINCT affiliated_base_number) 87 | FROM `dtc-de-375514.week3_homework.fhv_tripdata` 88 | WHERE DATE(pickup_datetime) BETWEEN '2019-03-01' and '2019-03-31'; 89 | ``` 90 | 91 | Creating and querying the partitioned and clustered table. 92 | ```sql 93 | CREATE OR REPLACE TABLE `dtc-de-375514.week3_homework.fhv_tripdata_partitioned_clustered` 94 | PARTITION BY DATE(pickup_datetime) 95 | CLUSTER BY Affiliated_base_number AS 96 | SELECT * FROM `dtc-de-375514.week3_homework.fhv_tripdata`; 97 | 98 | SELECT COUNT(DISTINCT affiliated_base_number) 99 | FROM `dtc-de-375514.week3_homework.fhv_tripdata_partitioned_clustered` 100 | WHERE DATE(pickup_datetime) BETWEEN '2019-03-01' and '2019-03-31'; 101 | ``` 102 | 103 | **Answer:** 647.87 MB for non-partitioned table and 23.06 MB for the partitioned table 104 | 105 | ## Question 6: 106 | Where is the data stored in the External Table you created? 107 | 108 | - Big Query 109 | - GCP Bucket 110 | - Container Registry 111 | - Big Table 112 | 113 | **Answer:** GCP Bucket 114 | 115 | ## Question 7: 116 | It is best practice in Big Query to always cluster your data: 117 | - True 118 | - False 119 | 120 | **Answer:** False. For instance, if the cardinality of the number of values in a column or group of columns is small. 121 | 122 | ## (Not required) Question 8: 123 | A better format to store these files may be parquet. Create a data pipeline to download the gzip files and convert them into parquet. Upload the files to your GCP Bucket and create an External and BQ Table. 124 | 125 | 126 | Note: Column types for all files used in an External Table must have the same datatype. While an External Table may be created and shown in the side panel in Big Query, this will need to be validated by running a count query on the External Table to check if any errors occur. 127 | 128 | ## Submitting the solutions 129 | 130 | * Form for submitting: https://forms.gle/rLdvQW2igsAT73HTA 131 | * You can submit your homework multiple times. In this case, only the last submission will be used. 
132 | 133 | Deadline: 13 February (Monday), 22:00 CET 134 | 135 | 136 | ## Solution 137 | 138 | We will publish the solution here 139 | -------------------------------------------------------------------------------- /week3/img/citibike_stations1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week3/img/citibike_stations1.png -------------------------------------------------------------------------------- /week3/img/citibike_stations2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week3/img/citibike_stations2.png -------------------------------------------------------------------------------- /week3/img/clustering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week3/img/clustering.png -------------------------------------------------------------------------------- /week3/img/information_schema_partitions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week3/img/information_schema_partitions.png -------------------------------------------------------------------------------- /week3/img/partition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week3/img/partition.png -------------------------------------------------------------------------------- /week3/img/partitioning_vs_clustering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week3/img/partitioning_vs_clustering.png -------------------------------------------------------------------------------- /week3/img/result_non_partitioned.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week3/img/result_non_partitioned.png -------------------------------------------------------------------------------- /week3/img/result_partitioned.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week3/img/result_partitioned.png -------------------------------------------------------------------------------- /week3/img/results_clustered.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week3/img/results_clustered.png -------------------------------------------------------------------------------- /week3/img/results_unclustered.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week3/img/results_unclustered.png -------------------------------------------------------------------------------- /week4/homework.md: 
-------------------------------------------------------------------------------- 1 | ## Week 4 Homework 2 | 3 | In this homework, we'll use the models developed during the week 4 videos and enhance the already presented dbt project using the already loaded Taxi data for fhv vehicles for year 2019 in our DWH. 4 | 5 | This means that in this homework we use the following data [Datasets list](https://github.com/DataTalksClub/nyc-tlc-data/) 6 | * Yellow taxi data - Years 2019 and 2020 7 | * Green taxi data - Years 2019 and 2020 8 | * fhv data - Year 2019. 9 | 10 | We will use the data loaded for: 11 | 12 | * Building a source table: `stg_fhv_tripdata` 13 | * Building a fact table: `fact_fhv_trips` 14 | * Create a dashboard 15 | 16 | If you don't have access to GCP, you can do this locally using the ingested data from your Postgres database 17 | instead. If you have access to GCP, you don't need to do it for local Postgres - 18 | only if you want to. 19 | 20 | > **Note**: if your answer doesn't match exactly, select the closest option 21 | 22 | ## Question 1: 23 | 24 | **What is the count of records in the model fact_trips after running all models with the test run variable disabled and filtering for 2019 and 2020 data only (pickup datetime)?** 25 | 26 | You'll need to have completed the ["Build the first dbt models"](https://www.youtube.com/watch?v=UVI30Vxzd6c) video and have been able to run the models via the CLI. 27 | You should find the views and models for querying in your DWH. 28 | 29 | - 41648442 30 | - 51648442 31 | - 61648442 32 | - 71648442 33 | 34 | ### Solution 35 | 36 | ```sql 37 | SELECT COUNT(1) FROM `dtc-de-375514.production.fact_trips`; 38 | ``` 39 | 40 | **Answer:** 61541140. Closest one is 61648442. 41 | 42 | ## Question 2: 43 | 44 | **What is the distribution between service type filtering by years 2019 and 2020 data as done in the videos?** 45 | 46 | You will need to complete "Visualising the data" videos, either using [google data studio](https://www.youtube.com/watch?v=39nLTs74A3E) or [metabase](https://www.youtube.com/watch?v=BnLkrA7a6gM). 47 | 48 | - 89.9/10.1 49 | - 94/6 50 | - 76.3/23.7 51 | - 99.1/0.9 52 | 53 | ### Solution 54 | 55 | ![](./img/homework-question2.png) 56 | 57 | **Answer:** 89.8/10.2. Closest one is 89.9/10.1. 58 | 59 | ## Question 3: 60 | 61 | **What is the count of records in the model stg_fhv_tripdata after running all models with the test run variable disabled (:false)?** 62 | 63 | Create a staging model for the fhv data for 2019 and do not add a deduplication step. Run it via the CLI without limits (is_test_run: false). 64 | Filter records with pickup time in year 2019. 65 | 66 | - 33244696 67 | - 43244696 68 | - 53244696 69 | - 63244696 70 | 71 | ### Solution 72 | 73 | ```sql 74 | SELECT COUNT(1) FROM `dtc-de-375514.production.stg_fhv_tripdata`; 75 | ``` 76 | 77 | **Answer:** 43097500. Closest one is 43244696. 78 | 79 | 80 | ## Question 4: 81 | 82 | **What is the count of records in the model fact_fhv_trips after running all dependencies with the test run variable disabled (:false)?** 83 | 84 | Create a core model for the stg_fhv_tripdata joining with dim_zones. 85 | Similar to what we've done in fact_trips, keep only records with known pickup and dropoff locations entries for pickup and dropoff locations. 86 | Run it via the CLI without limits (is_test_run: false) and filter records with pickup time in year 2019. 
87 | 88 | - 12998722 89 | - 22998722 90 | - 32998722 91 | - 42998722 92 | 93 | ### Solution 94 | 95 | ```sql 96 | SELECT COUNT(1) FROM `dtc-de-375514.production.fhv_fact_trips`; 97 | ``` 98 | 99 | **Answer:** 22935061. Closest one is 22998722. 100 | 101 | ## Question 5: 102 | 103 | **What is the month with the biggest amount of rides after building a tile for the fact_fhv_trips table?** 104 | 105 | Create a dashboard with some tiles that you find interesting to explore the data. One tile should show the amount of trips per month, as done in the videos for fact_trips, based on the fact_fhv_trips table. 106 | 107 | - March 108 | - April 109 | - January 110 | - December 111 | 112 | ### Solution 113 | 114 | ![](./img/homework-question5.png) 115 | 116 | **Answer:** January. 117 | 118 | ## Submitting the solutions 119 | 120 | * Form for submitting: https://forms.gle/6A94GPutZJTuT5Y16 121 | * You can submit your homework multiple times. In this case, only the last submission will be used. 122 | 123 | Deadline: 25 February (Saturday), 22:00 CET 124 | 125 | 126 | ## Solution 127 | 128 | * Video: https://www.youtube.com/watch?v=I_K0lNu9WQw&list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW 129 | * Answers: 130 | * Question 1: 61648442, 131 | * Question 2: 89.9/10.1 132 | * Question 3: 43244696 133 | * Question 4: 22998722 134 | * Question 5: January -------------------------------------------------------------------------------- /week4/img/artifacts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/artifacts.png -------------------------------------------------------------------------------- /week4/img/bigquery.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/bigquery.png -------------------------------------------------------------------------------- /week4/img/charts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/charts.png -------------------------------------------------------------------------------- /week4/img/control_menu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/control_menu.png -------------------------------------------------------------------------------- /week4/img/data_source.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/data_source.png -------------------------------------------------------------------------------- /week4/img/dbt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/dbt.png -------------------------------------------------------------------------------- /week4/img/dbt_init.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/dbt_init.png 
-------------------------------------------------------------------------------- /week4/img/dbt_job1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/dbt_job1.png -------------------------------------------------------------------------------- /week4/img/dbt_job2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/dbt_job2.png -------------------------------------------------------------------------------- /week4/img/dbt_job3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/dbt_job3.png -------------------------------------------------------------------------------- /week4/img/dbt_job4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/dbt_job4.png -------------------------------------------------------------------------------- /week4/img/dbt_prod_env.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/dbt_prod_env.png -------------------------------------------------------------------------------- /week4/img/etl_vs_elt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/etl_vs_elt.png -------------------------------------------------------------------------------- /week4/img/homework-question2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/homework-question2.png -------------------------------------------------------------------------------- /week4/img/homework-question5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/homework-question5.png -------------------------------------------------------------------------------- /week4/img/lineage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/lineage.png -------------------------------------------------------------------------------- /week4/img/select_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/select_table.png -------------------------------------------------------------------------------- /week4/img/taxi_zone_lookup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/taxi_zone_lookup.png -------------------------------------------------------------------------------- /week4/img/ts_chart.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/ts_chart.png -------------------------------------------------------------------------------- /week4/img/ts_chart_2019_2020.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/img/ts_chart_2019_2020.png -------------------------------------------------------------------------------- /week4/taxi_rides_ny/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /week4/taxi_rides_ny/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [dbt community](http://community.getbdt.com/) to learn from other analytics engineers 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /week4/taxi_rides_ny/analyses/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/taxi_rides_ny/analyses/.gitkeep -------------------------------------------------------------------------------- /week4/taxi_rides_ny/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'taxi_rides_ny' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'default' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `source-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | target-path: "target" # directory which will store compiled SQL files 23 | clean-targets: # directories to be removed by `dbt clean` 24 | - "target" 25 | - "dbt_packages" 26 | 27 | 28 | # Configuring models 29 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 30 | 31 | # In this example config, we tell dbt to build all models in the example/ directory 32 | # as tables. These settings can be overridden in the individual model files 33 | # using the `{{ config(...) }}` macro. 
34 | models: 35 | taxi_rides_ny: 36 | 37 | vars: 38 | payment_type_values: [1, 2, 3, 4, 5, 6] 39 | 40 | seeds: 41 | taxi_rides_ny: 42 | taxi_zone_lookup: 43 | +column_types: 44 | locationid: numeric -------------------------------------------------------------------------------- /week4/taxi_rides_ny/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/taxi_rides_ny/macros/.gitkeep -------------------------------------------------------------------------------- /week4/taxi_rides_ny/macros/get_payment_type_description.sql: -------------------------------------------------------------------------------- 1 | {# This macro returns the description of the payment_type #} 2 | 3 | {% macro get_payment_type_description(payment_type) -%} 4 | 5 | case {{ payment_type }} 6 | when 1 then 'Credit card' 7 | when 2 then 'Cash' 8 | when 3 then 'No charge' 9 | when 4 then 'Dispute' 10 | when 5 then 'Unknown' 11 | when 6 then 'Voided trip' 12 | end 13 | 14 | {%- endmacro %} 15 | -------------------------------------------------------------------------------- /week4/taxi_rides_ny/models/core/dim_monthly_zone_revenue.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | with trips_data as ( 4 | select * from {{ ref('fact_trips') }} 5 | ) 6 | select 7 | -- Reveneue grouping 8 | pickup_zone as revenue_zone, 9 | --date_trunc('month', pickup_datetime) as revenue_month, 10 | --Note: For BQ use instead: date_trunc(pickup_datetime, month) as revenue_month, 11 | date_trunc(pickup_datetime, month) as revenue_month, 12 | 13 | service_type, 14 | 15 | -- Revenue calculation 16 | sum(fare_amount) as revenue_monthly_fare, 17 | sum(extra) as revenue_monthly_extra, 18 | sum(mta_tax) as revenue_monthly_mta_tax, 19 | sum(tip_amount) as revenue_monthly_tip_amount, 20 | sum(tolls_amount) as revenue_monthly_tolls_amount, 21 | sum(ehail_fee) as revenue_monthly_ehail_fee, 22 | sum(improvement_surcharge) as revenue_monthly_improvement_surcharge, 23 | sum(total_amount) as revenue_monthly_total_amount, 24 | sum(congestion_surcharge) as revenue_monthly_congestion_surcharge, 25 | 26 | -- Additional calculations 27 | count(tripid) as total_monthly_trips, 28 | avg(passenger_count) as avg_montly_passenger_count, 29 | avg(trip_distance) as avg_montly_trip_distance 30 | 31 | from trips_data 32 | group by 1,2,3 -------------------------------------------------------------------------------- /week4/taxi_rides_ny/models/core/dim_zones.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | select 4 | locationid, 5 | borough, 6 | zone, 7 | replace(service_zone,'Boro','Green') as service_zone 8 | from {{ ref('taxi_zone_lookup') }} -------------------------------------------------------------------------------- /week4/taxi_rides_ny/models/core/fact_trips.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | with green_data as ( 4 | select *, 5 | 'Green' as service_type 6 | from {{ ref('stg_green_tripdata') }} 7 | ), 8 | 9 | yellow_data as ( 10 | select *, 11 | 'Yellow' as service_type 12 | from {{ ref('stg_yellow_tripdata') }} 13 | ), 14 | 15 | trips_unioned as ( 16 | select * from green_data 17 | union all 18 | select * from yellow_data 19 | ), 20 | 21 | dim_zones as ( 22 | 
select * from {{ ref('dim_zones') }} 23 | where borough != 'Unknown' 24 | ) 25 | select 26 | trips_unioned.tripid, 27 | trips_unioned.vendorid, 28 | trips_unioned.service_type, 29 | trips_unioned.ratecodeid, 30 | trips_unioned.pickup_locationid, 31 | pickup_zone.borough as pickup_borough, 32 | pickup_zone.zone as pickup_zone, 33 | trips_unioned.dropoff_locationid, 34 | dropoff_zone.borough as dropoff_borough, 35 | dropoff_zone.zone as dropoff_zone, 36 | trips_unioned.pickup_datetime, 37 | trips_unioned.dropoff_datetime, 38 | trips_unioned.store_and_fwd_flag, 39 | trips_unioned.passenger_count, 40 | trips_unioned.trip_distance, 41 | trips_unioned.trip_type, 42 | trips_unioned.fare_amount, 43 | trips_unioned.extra, 44 | trips_unioned.mta_tax, 45 | trips_unioned.tip_amount, 46 | trips_unioned.tolls_amount, 47 | trips_unioned.ehail_fee, 48 | trips_unioned.improvement_surcharge, 49 | trips_unioned.total_amount, 50 | trips_unioned.payment_type, 51 | trips_unioned.payment_type_description, 52 | trips_unioned.congestion_surcharge 53 | from trips_unioned 54 | inner join dim_zones as pickup_zone 55 | on trips_unioned.pickup_locationid = pickup_zone.locationid 56 | inner join dim_zones as dropoff_zone 57 | on trips_unioned.dropoff_locationid = dropoff_zone.locationid -------------------------------------------------------------------------------- /week4/taxi_rides_ny/models/core/fhv_fact_trips.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | with fhv_data as ( 4 | select * from {{ ref('stg_fhv_tripdata') }} 5 | ), 6 | 7 | dim_zones as ( 8 | select * from {{ ref('dim_zones') }} 9 | where borough != 'Unknown' 10 | ) 11 | 12 | select 13 | fhv_data.tripid, 14 | fhv_data.dispatching_base_num, 15 | fhv_data.pickup_locationid, 16 | pickup_zone.borough as pickup_borough, 17 | pickup_zone.zone as pickup_zone, 18 | fhv_data.dropoff_locationid, 19 | dropoff_zone.borough as dropoff_borough, 20 | dropoff_zone.zone as dropoff_zone, 21 | fhv_data.pickup_datetime, 22 | fhv_data.dropoff_datetime, 23 | fhv_data.is_shared 24 | from fhv_data 25 | inner join dim_zones as pickup_zone 26 | on fhv_data.pickup_locationid = pickup_zone.locationid 27 | inner join dim_zones as dropoff_zone 28 | on fhv_data.dropoff_locationid = dropoff_zone.locationid -------------------------------------------------------------------------------- /week4/taxi_rides_ny/models/core/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: dim_zones 5 | description: > 6 | List of unique zones identified by locationid. 7 | Includes the service zone they correspond to (Green or yellow). 8 | 9 | - name: fact_trips 10 | description: > 11 | Taxi trips corresponding to both service zones (Green and yellow). 12 | The table contains records where both pickup and dropoff locations are valid and known zones. 13 | Each record corresponds to a trip uniquely identified by tripid. 14 | 15 | - name: fhv_fact_trips 16 | description: > 17 | FHV trips. The table contains records where both pickup and dropoff locations are valid and known zones. 18 | Each record corresponds to a trip uniquely identified by tripid. 19 | 20 | - name: dm_monthly_zone_revenue 21 | description: > 22 | Aggregated table of all taxi trips corresponding to both service zones (Green and yellow) per pickup zone, month and service. 23 | The table contains monthly sums of the fare elements used to calculate the monthly revenue. 
24 | The table contains also monthly indicators like number of trips, and average trip distance. 25 | columns: 26 | - name: revenue_monthly_total_amount 27 | description: Monthly sum of the the total_amount of the fare charged for the trip per pickup zone, month and service. 28 | tests: 29 | - not_null: 30 | severity: error -------------------------------------------------------------------------------- /week4/taxi_rides_ny/models/staging/schema.yml: -------------------------------------------------------------------------------- 1 | 2 | version: 2 3 | 4 | sources: 5 | - name: staging 6 | #For bigquery: 7 | database: dtc-de-375514 8 | 9 | # For postgres: 10 | #database: production 11 | 12 | schema: trips_data_all 13 | 14 | # loaded_at_field: record_loaded_at 15 | tables: 16 | - name: green_tripdata 17 | - name: yellow_tripdata 18 | - name: fhv_tripdata 19 | # freshness: 20 | # error_after: {count: 6, period: hour} 21 | 22 | models: 23 | - name: stg_green_tripdata 24 | description: > 25 | Trip made by green taxis, also known as boro taxis and street-hail liveries. 26 | Green taxis may respond to street hails,but only in the areas indicated in green on the 27 | map (i.e. above W 110 St/E 96th St in Manhattan and in the boroughs). 28 | The records were collected and provided to the NYC Taxi and Limousine Commission (TLC) by 29 | technology service providers. 30 | columns: 31 | - name: tripid 32 | description: Primary key for this table, generated with a concatenation of vendorid+pickup_datetime 33 | tests: 34 | - unique: 35 | severity: warn 36 | - not_null: 37 | severity: warn 38 | - name: VendorID 39 | description: > 40 | A code indicating the TPEP provider that provided the record. 41 | 1= Creative Mobile Technologies, LLC; 42 | 2= VeriFone Inc. 43 | - name: pickup_datetime 44 | description: The date and time when the meter was engaged. 45 | - name: dropoff_datetime 46 | description: The date and time when the meter was disengaged. 47 | - name: Passenger_count 48 | description: The number of passengers in the vehicle. This is a driver-entered value. 49 | - name: Trip_distance 50 | description: The elapsed trip distance in miles reported by the taximeter. 51 | - name: Pickup_locationid 52 | description: locationid where the meter was engaged. 53 | tests: 54 | - relationships: 55 | to: ref('taxi_zone_lookup') 56 | field: locationid 57 | severity: warn 58 | - name: dropoff_locationid 59 | description: locationid where the meter was engaged. 60 | tests: 61 | - relationships: 62 | to: ref('taxi_zone_lookup') 63 | field: locationid 64 | - name: RateCodeID 65 | description: > 66 | The final rate code in effect at the end of the trip. 67 | 1= Standard rate 68 | 2=JFK 69 | 3=Newark 70 | 4=Nassau or Westchester 71 | 5=Negotiated fare 72 | 6=Group ride 73 | - name: Store_and_fwd_flag 74 | description: > 75 | This flag indicates whether the trip record was held in vehicle 76 | memory before sending to the vendor, aka “store and forward,” 77 | because the vehicle did not have a connection to the server. 78 | Y= store and forward trip 79 | N= not a store and forward trip 80 | - name: Dropoff_longitude 81 | description: Longitude where the meter was disengaged. 82 | - name: Dropoff_latitude 83 | description: Latitude where the meter was disengaged. 84 | - name: Payment_type 85 | description: > 86 | A numeric code signifying how the passenger paid for the trip. 
87 | tests: 88 | - accepted_values: 89 | values: "{{ var('payment_type_values') }}" 90 | severity: warn 91 | quote: false 92 | - name: payment_type_description 93 | description: Description of the payment_type code 94 | - name: Fare_amount 95 | description: > 96 | The time-and-distance fare calculated by the meter. 97 | Extra Miscellaneous extras and surcharges. Currently, this only includes 98 | the $0.50 and $1 rush hour and overnight charges. 99 | MTA_tax $0.50 MTA tax that is automatically triggered based on the metered 100 | rate in use. 101 | - name: Improvement_surcharge 102 | description: > 103 | $0.30 improvement surcharge assessed trips at the flag drop. The 104 | improvement surcharge began being levied in 2015. 105 | - name: Tip_amount 106 | description: > 107 | Tip amount. This field is automatically populated for credit card 108 | tips. Cash tips are not included. 109 | - name: Tolls_amount 110 | description: Total amount of all tolls paid in trip. 111 | - name: Total_amount 112 | description: The total amount charged to passengers. Does not include cash tips. 113 | 114 | - name: stg_yellow_tripdata 115 | description: > 116 | Trips made by New York City's iconic yellow taxis. 117 | Yellow taxis are the only vehicles permitted to respond to a street hail from a passenger in all five 118 | boroughs. They may also be hailed using an e-hail app like Curb or Arro. 119 | The records were collected and provided to the NYC Taxi and Limousine Commission (TLC) by 120 | technology service providers. 121 | columns: 122 | - name: tripid 123 | description: Primary key for this table, generated with a concatenation of vendorid+pickup_datetime 124 | tests: 125 | - unique: 126 | severity: warn 127 | - not_null: 128 | severity: warn 129 | - name: VendorID 130 | description: > 131 | A code indicating the TPEP provider that provided the record. 132 | 1= Creative Mobile Technologies, LLC; 133 | 2= VeriFone Inc. 134 | - name: pickup_datetime 135 | description: The date and time when the meter was engaged. 136 | - name: dropoff_datetime 137 | description: The date and time when the meter was disengaged. 138 | - name: Passenger_count 139 | description: The number of passengers in the vehicle. This is a driver-entered value. 140 | - name: Trip_distance 141 | description: The elapsed trip distance in miles reported by the taximeter. 142 | - name: Pickup_locationid 143 | description: locationid where the meter was engaged. 144 | tests: 145 | - relationships: 146 | to: ref('taxi_zone_lookup') 147 | field: locationid 148 | severity: warn 149 | - name: dropoff_locationid 150 | description: locationid where the meter was engaged. 151 | tests: 152 | - relationships: 153 | to: ref('taxi_zone_lookup') 154 | field: locationid 155 | severity: warn 156 | - name: RateCodeID 157 | description: > 158 | The final rate code in effect at the end of the trip. 159 | 1= Standard rate 160 | 2=JFK 161 | 3=Newark 162 | 4=Nassau or Westchester 163 | 5=Negotiated fare 164 | 6=Group ride 165 | - name: Store_and_fwd_flag 166 | description: > 167 | This flag indicates whether the trip record was held in vehicle 168 | memory before sending to the vendor, aka “store and forward,” 169 | because the vehicle did not have a connection to the server. 170 | Y= store and forward trip 171 | N= not a store and forward trip 172 | - name: Dropoff_longitude 173 | description: Longitude where the meter was disengaged. 174 | - name: Dropoff_latitude 175 | description: Latitude where the meter was disengaged. 
176 | - name: Payment_type 177 | description: > 178 | A numeric code signifying how the passenger paid for the trip. 179 | tests: 180 | - accepted_values: 181 | values: "{{ var('payment_type_values') }}" 182 | severity: warn 183 | quote: false 184 | - name: payment_type_description 185 | description: Description of the payment_type code 186 | - name: Fare_amount 187 | description: > 188 | The time-and-distance fare calculated by the meter. 189 | Extra Miscellaneous extras and surcharges. Currently, this only includes 190 | the $0.50 and $1 rush hour and overnight charges. 191 | MTA_tax $0.50 MTA tax that is automatically triggered based on the metered 192 | rate in use. 193 | - name: Improvement_surcharge 194 | description: > 195 | $0.30 improvement surcharge assessed trips at the flag drop. The 196 | improvement surcharge began being levied in 2015. 197 | - name: Tip_amount 198 | description: > 199 | Tip amount. This field is automatically populated for credit card 200 | tips. Cash tips are not included. 201 | - name: Tolls_amount 202 | description: Total amount of all tolls paid in trip. 203 | - name: Total_amount 204 | description: The total amount charged to passengers. Does not include cash tips. 205 | 206 | - name: stg_fhv_tripdata 207 | description: > 208 | For-Hire Vehicles (FHVs) provide pre-arranged transportation throughout New York City. 209 | For-Hire trips are arranged through TLC licensed bases. FHV bases accept trip requests from passengers, 210 | dispatch TLC licensed drivers in TLC licensed vehicles, and are responsible for collecting and paying 211 | taxes, as well as providing driver benefits. 212 | columns: 213 | - name: tripid 214 | description: Primary key for this table, generated with a concatenation of dispatching_base_num+pickup_datetime+dropoff_datetime 215 | tests: 216 | - unique: 217 | severity: warn 218 | - not_null: 219 | severity: warn 220 | - name: dispatching_base_num 221 | description: The TLC Base License Number of the base that dispatched the trip 222 | - name: pickup_datetime 223 | description: The date and time of the trip pick-up 224 | - name: dropoff_datetime 225 | description: The date and time of the trip dropoff 226 | - name: pickup_locationid 227 | description: TLC Taxi Zone in which the trip began 228 | - name: dropoff_locationid 229 | description: TLC Taxi Zone in which the trip ended 230 | - name: SR_Flag 231 | description: > 232 | Indicates if the trip was a part of a shared ride chain offered by a 233 | High Volume FHV company (e.g. Uber Pool, Lyft Line). For shared 234 | trips, the value is 1. For non-shared rides, this field is null. 
235 | -------------------------------------------------------------------------------- /week4/taxi_rides_ny/models/staging/stg_fhv_tripdata.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | with tripdata as 4 | ( 5 | select *, 6 | row_number() over(partition by Dispatching_base_num, Pickup_datetime, DropOff_datetime) as rn 7 | from {{ source('staging','fhv_tripdata') }} 8 | ) 9 | select 10 | -- identifiers 11 | {{ dbt_utils.surrogate_key(['Dispatching_base_num', 'Pickup_datetime', 'DropOff_datetime']) }} as tripid, 12 | Dispatching_base_num as dispatching_base_num, 13 | cast(PULocationID as integer) as pickup_locationid, 14 | cast(DOLocationID as integer) as dropoff_locationid, 15 | 16 | -- timestamps 17 | cast(Pickup_datetime as timestamp) as pickup_datetime, 18 | cast(DropOff_datetime as timestamp) as dropoff_datetime, 19 | 20 | -- trip info 21 | cast(SR_Flag as integer) as is_shared 22 | 23 | from tripdata 24 | where rn = 1 25 | 26 | -- dbt build --m --var 'is_test_run: false' 27 | {% if var('is_test_run', default=true) %} 28 | limit 100 29 | {% endif %} -------------------------------------------------------------------------------- /week4/taxi_rides_ny/models/staging/stg_green_tripdata.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | with tripdata as 4 | ( 5 | select *, 6 | row_number() over(partition by vendorid, lpep_pickup_datetime) as rn 7 | from {{ source('staging','green_tripdata') }} 8 | where vendorid is not null 9 | ) 10 | select 11 | -- identifiers 12 | {{ dbt_utils.surrogate_key(['vendorid', 'lpep_pickup_datetime']) }} as tripid, 13 | cast(vendorid as integer) as vendorid, 14 | cast(ratecodeid as integer) as ratecodeid, 15 | cast(pulocationid as integer) as pickup_locationid, 16 | cast(dolocationid as integer) as dropoff_locationid, 17 | 18 | -- timestamps 19 | cast(lpep_pickup_datetime as timestamp) as pickup_datetime, 20 | cast(lpep_dropoff_datetime as timestamp) as dropoff_datetime, 21 | 22 | -- trip info 23 | store_and_fwd_flag, 24 | cast(passenger_count as integer) as passenger_count, 25 | cast(trip_distance as numeric) as trip_distance, 26 | cast(trip_type as integer) as trip_type, 27 | 28 | -- payment info 29 | cast(fare_amount as numeric) as fare_amount, 30 | cast(extra as numeric) as extra, 31 | cast(mta_tax as numeric) as mta_tax, 32 | cast(tip_amount as numeric) as tip_amount, 33 | cast(tolls_amount as numeric) as tolls_amount, 34 | cast(ehail_fee as numeric) as ehail_fee, 35 | cast(improvement_surcharge as numeric) as improvement_surcharge, 36 | cast(total_amount as numeric) as total_amount, 37 | cast(payment_type as integer) as payment_type, 38 | {{ get_payment_type_description('payment_type') }} as payment_type_description, 39 | cast(congestion_surcharge as numeric) as congestion_surcharge 40 | 41 | from tripdata 42 | where rn = 1 43 | 44 | -- dbt build --m --var 'is_test_run: false' 45 | {% if var('is_test_run', default=true) %} 46 | limit 100 47 | {% endif %} -------------------------------------------------------------------------------- /week4/taxi_rides_ny/models/staging/stg_yellow_tripdata.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | with tripdata as 4 | ( 5 | select *, 6 | row_number() over(partition by vendorid, tpep_pickup_datetime) as rn 7 | from {{ source('staging','yellow_tripdata') }} 8 | where 
vendorid is not null 9 | ) 10 | select 11 | -- identifiers 12 | {{ dbt_utils.surrogate_key(['vendorid', 'tpep_pickup_datetime']) }} as tripid, 13 | cast(vendorid as integer) as vendorid, 14 | cast(ratecodeid as integer) as ratecodeid, 15 | cast(pulocationid as integer) as pickup_locationid, 16 | cast(dolocationid as integer) as dropoff_locationid, 17 | 18 | -- timestamps 19 | cast(tpep_pickup_datetime as timestamp) as pickup_datetime, 20 | cast(tpep_dropoff_datetime as timestamp) as dropoff_datetime, 21 | 22 | -- trip info 23 | store_and_fwd_flag, 24 | cast(passenger_count as integer) as passenger_count, 25 | cast(trip_distance as numeric) as trip_distance, 26 | -- yellow cabs are always street-hail 27 | 1 as trip_type, 28 | 29 | -- payment info 30 | cast(fare_amount as numeric) as fare_amount, 31 | cast(extra as numeric) as extra, 32 | cast(mta_tax as numeric) as mta_tax, 33 | cast(tip_amount as numeric) as tip_amount, 34 | cast(tolls_amount as numeric) as tolls_amount, 35 | cast(0 as numeric) as ehail_fee, 36 | cast(improvement_surcharge as numeric) as improvement_surcharge, 37 | cast(total_amount as numeric) as total_amount, 38 | cast(payment_type as integer) as payment_type, 39 | {{ get_payment_type_description('payment_type') }} as payment_type_description, 40 | cast(congestion_surcharge as numeric) as congestion_surcharge 41 | from tripdata 42 | where rn = 1 43 | 44 | -- dbt build --m --var 'is_test_run: false' 45 | {% if var('is_test_run', default=true) %} 46 | limit 100 47 | {% endif %} -------------------------------------------------------------------------------- /week4/taxi_rides_ny/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 0.8.0 -------------------------------------------------------------------------------- /week4/taxi_rides_ny/seeds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/taxi_rides_ny/seeds/.gitkeep -------------------------------------------------------------------------------- /week4/taxi_rides_ny/seeds/taxi_zone_lookup.csv: -------------------------------------------------------------------------------- 1 | "LocationID","Borough","Zone","service_zone" 2 | 1,"EWR","Newark Airport","EWR" 3 | 2,"Queens","Jamaica Bay","Boro Zone" 4 | 3,"Bronx","Allerton/Pelham Gardens","Boro Zone" 5 | 4,"Manhattan","Alphabet City","Yellow Zone" 6 | 5,"Staten Island","Arden Heights","Boro Zone" 7 | 6,"Staten Island","Arrochar/Fort Wadsworth","Boro Zone" 8 | 7,"Queens","Astoria","Boro Zone" 9 | 8,"Queens","Astoria Park","Boro Zone" 10 | 9,"Queens","Auburndale","Boro Zone" 11 | 10,"Queens","Baisley Park","Boro Zone" 12 | 11,"Brooklyn","Bath Beach","Boro Zone" 13 | 12,"Manhattan","Battery Park","Yellow Zone" 14 | 13,"Manhattan","Battery Park City","Yellow Zone" 15 | 14,"Brooklyn","Bay Ridge","Boro Zone" 16 | 15,"Queens","Bay Terrace/Fort Totten","Boro Zone" 17 | 16,"Queens","Bayside","Boro Zone" 18 | 17,"Brooklyn","Bedford","Boro Zone" 19 | 18,"Bronx","Bedford Park","Boro Zone" 20 | 19,"Queens","Bellerose","Boro Zone" 21 | 20,"Bronx","Belmont","Boro Zone" 22 | 21,"Brooklyn","Bensonhurst East","Boro Zone" 23 | 22,"Brooklyn","Bensonhurst West","Boro Zone" 24 | 23,"Staten Island","Bloomfield/Emerson Hill","Boro Zone" 25 | 24,"Manhattan","Bloomingdale","Yellow Zone" 26 | 25,"Brooklyn","Boerum Hill","Boro Zone" 27 | 26,"Brooklyn","Borough 
Park","Boro Zone" 28 | 27,"Queens","Breezy Point/Fort Tilden/Riis Beach","Boro Zone" 29 | 28,"Queens","Briarwood/Jamaica Hills","Boro Zone" 30 | 29,"Brooklyn","Brighton Beach","Boro Zone" 31 | 30,"Queens","Broad Channel","Boro Zone" 32 | 31,"Bronx","Bronx Park","Boro Zone" 33 | 32,"Bronx","Bronxdale","Boro Zone" 34 | 33,"Brooklyn","Brooklyn Heights","Boro Zone" 35 | 34,"Brooklyn","Brooklyn Navy Yard","Boro Zone" 36 | 35,"Brooklyn","Brownsville","Boro Zone" 37 | 36,"Brooklyn","Bushwick North","Boro Zone" 38 | 37,"Brooklyn","Bushwick South","Boro Zone" 39 | 38,"Queens","Cambria Heights","Boro Zone" 40 | 39,"Brooklyn","Canarsie","Boro Zone" 41 | 40,"Brooklyn","Carroll Gardens","Boro Zone" 42 | 41,"Manhattan","Central Harlem","Boro Zone" 43 | 42,"Manhattan","Central Harlem North","Boro Zone" 44 | 43,"Manhattan","Central Park","Yellow Zone" 45 | 44,"Staten Island","Charleston/Tottenville","Boro Zone" 46 | 45,"Manhattan","Chinatown","Yellow Zone" 47 | 46,"Bronx","City Island","Boro Zone" 48 | 47,"Bronx","Claremont/Bathgate","Boro Zone" 49 | 48,"Manhattan","Clinton East","Yellow Zone" 50 | 49,"Brooklyn","Clinton Hill","Boro Zone" 51 | 50,"Manhattan","Clinton West","Yellow Zone" 52 | 51,"Bronx","Co-Op City","Boro Zone" 53 | 52,"Brooklyn","Cobble Hill","Boro Zone" 54 | 53,"Queens","College Point","Boro Zone" 55 | 54,"Brooklyn","Columbia Street","Boro Zone" 56 | 55,"Brooklyn","Coney Island","Boro Zone" 57 | 56,"Queens","Corona","Boro Zone" 58 | 57,"Queens","Corona","Boro Zone" 59 | 58,"Bronx","Country Club","Boro Zone" 60 | 59,"Bronx","Crotona Park","Boro Zone" 61 | 60,"Bronx","Crotona Park East","Boro Zone" 62 | 61,"Brooklyn","Crown Heights North","Boro Zone" 63 | 62,"Brooklyn","Crown Heights South","Boro Zone" 64 | 63,"Brooklyn","Cypress Hills","Boro Zone" 65 | 64,"Queens","Douglaston","Boro Zone" 66 | 65,"Brooklyn","Downtown Brooklyn/MetroTech","Boro Zone" 67 | 66,"Brooklyn","DUMBO/Vinegar Hill","Boro Zone" 68 | 67,"Brooklyn","Dyker Heights","Boro Zone" 69 | 68,"Manhattan","East Chelsea","Yellow Zone" 70 | 69,"Bronx","East Concourse/Concourse Village","Boro Zone" 71 | 70,"Queens","East Elmhurst","Boro Zone" 72 | 71,"Brooklyn","East Flatbush/Farragut","Boro Zone" 73 | 72,"Brooklyn","East Flatbush/Remsen Village","Boro Zone" 74 | 73,"Queens","East Flushing","Boro Zone" 75 | 74,"Manhattan","East Harlem North","Boro Zone" 76 | 75,"Manhattan","East Harlem South","Boro Zone" 77 | 76,"Brooklyn","East New York","Boro Zone" 78 | 77,"Brooklyn","East New York/Pennsylvania Avenue","Boro Zone" 79 | 78,"Bronx","East Tremont","Boro Zone" 80 | 79,"Manhattan","East Village","Yellow Zone" 81 | 80,"Brooklyn","East Williamsburg","Boro Zone" 82 | 81,"Bronx","Eastchester","Boro Zone" 83 | 82,"Queens","Elmhurst","Boro Zone" 84 | 83,"Queens","Elmhurst/Maspeth","Boro Zone" 85 | 84,"Staten Island","Eltingville/Annadale/Prince's Bay","Boro Zone" 86 | 85,"Brooklyn","Erasmus","Boro Zone" 87 | 86,"Queens","Far Rockaway","Boro Zone" 88 | 87,"Manhattan","Financial District North","Yellow Zone" 89 | 88,"Manhattan","Financial District South","Yellow Zone" 90 | 89,"Brooklyn","Flatbush/Ditmas Park","Boro Zone" 91 | 90,"Manhattan","Flatiron","Yellow Zone" 92 | 91,"Brooklyn","Flatlands","Boro Zone" 93 | 92,"Queens","Flushing","Boro Zone" 94 | 93,"Queens","Flushing Meadows-Corona Park","Boro Zone" 95 | 94,"Bronx","Fordham South","Boro Zone" 96 | 95,"Queens","Forest Hills","Boro Zone" 97 | 96,"Queens","Forest Park/Highland Park","Boro Zone" 98 | 97,"Brooklyn","Fort Greene","Boro Zone" 99 | 98,"Queens","Fresh Meadows","Boro Zone" 100 | 
99,"Staten Island","Freshkills Park","Boro Zone" 101 | 100,"Manhattan","Garment District","Yellow Zone" 102 | 101,"Queens","Glen Oaks","Boro Zone" 103 | 102,"Queens","Glendale","Boro Zone" 104 | 103,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone" 105 | 104,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone" 106 | 105,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone" 107 | 106,"Brooklyn","Gowanus","Boro Zone" 108 | 107,"Manhattan","Gramercy","Yellow Zone" 109 | 108,"Brooklyn","Gravesend","Boro Zone" 110 | 109,"Staten Island","Great Kills","Boro Zone" 111 | 110,"Staten Island","Great Kills Park","Boro Zone" 112 | 111,"Brooklyn","Green-Wood Cemetery","Boro Zone" 113 | 112,"Brooklyn","Greenpoint","Boro Zone" 114 | 113,"Manhattan","Greenwich Village North","Yellow Zone" 115 | 114,"Manhattan","Greenwich Village South","Yellow Zone" 116 | 115,"Staten Island","Grymes Hill/Clifton","Boro Zone" 117 | 116,"Manhattan","Hamilton Heights","Boro Zone" 118 | 117,"Queens","Hammels/Arverne","Boro Zone" 119 | 118,"Staten Island","Heartland Village/Todt Hill","Boro Zone" 120 | 119,"Bronx","Highbridge","Boro Zone" 121 | 120,"Manhattan","Highbridge Park","Boro Zone" 122 | 121,"Queens","Hillcrest/Pomonok","Boro Zone" 123 | 122,"Queens","Hollis","Boro Zone" 124 | 123,"Brooklyn","Homecrest","Boro Zone" 125 | 124,"Queens","Howard Beach","Boro Zone" 126 | 125,"Manhattan","Hudson Sq","Yellow Zone" 127 | 126,"Bronx","Hunts Point","Boro Zone" 128 | 127,"Manhattan","Inwood","Boro Zone" 129 | 128,"Manhattan","Inwood Hill Park","Boro Zone" 130 | 129,"Queens","Jackson Heights","Boro Zone" 131 | 130,"Queens","Jamaica","Boro Zone" 132 | 131,"Queens","Jamaica Estates","Boro Zone" 133 | 132,"Queens","JFK Airport","Airports" 134 | 133,"Brooklyn","Kensington","Boro Zone" 135 | 134,"Queens","Kew Gardens","Boro Zone" 136 | 135,"Queens","Kew Gardens Hills","Boro Zone" 137 | 136,"Bronx","Kingsbridge Heights","Boro Zone" 138 | 137,"Manhattan","Kips Bay","Yellow Zone" 139 | 138,"Queens","LaGuardia Airport","Airports" 140 | 139,"Queens","Laurelton","Boro Zone" 141 | 140,"Manhattan","Lenox Hill East","Yellow Zone" 142 | 141,"Manhattan","Lenox Hill West","Yellow Zone" 143 | 142,"Manhattan","Lincoln Square East","Yellow Zone" 144 | 143,"Manhattan","Lincoln Square West","Yellow Zone" 145 | 144,"Manhattan","Little Italy/NoLiTa","Yellow Zone" 146 | 145,"Queens","Long Island City/Hunters Point","Boro Zone" 147 | 146,"Queens","Long Island City/Queens Plaza","Boro Zone" 148 | 147,"Bronx","Longwood","Boro Zone" 149 | 148,"Manhattan","Lower East Side","Yellow Zone" 150 | 149,"Brooklyn","Madison","Boro Zone" 151 | 150,"Brooklyn","Manhattan Beach","Boro Zone" 152 | 151,"Manhattan","Manhattan Valley","Yellow Zone" 153 | 152,"Manhattan","Manhattanville","Boro Zone" 154 | 153,"Manhattan","Marble Hill","Boro Zone" 155 | 154,"Brooklyn","Marine Park/Floyd Bennett Field","Boro Zone" 156 | 155,"Brooklyn","Marine Park/Mill Basin","Boro Zone" 157 | 156,"Staten Island","Mariners Harbor","Boro Zone" 158 | 157,"Queens","Maspeth","Boro Zone" 159 | 158,"Manhattan","Meatpacking/West Village West","Yellow Zone" 160 | 159,"Bronx","Melrose South","Boro Zone" 161 | 160,"Queens","Middle Village","Boro Zone" 162 | 161,"Manhattan","Midtown Center","Yellow Zone" 163 | 162,"Manhattan","Midtown East","Yellow Zone" 164 | 163,"Manhattan","Midtown North","Yellow Zone" 165 | 164,"Manhattan","Midtown South","Yellow Zone" 166 | 165,"Brooklyn","Midwood","Boro Zone" 167 | 166,"Manhattan","Morningside 
Heights","Boro Zone" 168 | 167,"Bronx","Morrisania/Melrose","Boro Zone" 169 | 168,"Bronx","Mott Haven/Port Morris","Boro Zone" 170 | 169,"Bronx","Mount Hope","Boro Zone" 171 | 170,"Manhattan","Murray Hill","Yellow Zone" 172 | 171,"Queens","Murray Hill-Queens","Boro Zone" 173 | 172,"Staten Island","New Dorp/Midland Beach","Boro Zone" 174 | 173,"Queens","North Corona","Boro Zone" 175 | 174,"Bronx","Norwood","Boro Zone" 176 | 175,"Queens","Oakland Gardens","Boro Zone" 177 | 176,"Staten Island","Oakwood","Boro Zone" 178 | 177,"Brooklyn","Ocean Hill","Boro Zone" 179 | 178,"Brooklyn","Ocean Parkway South","Boro Zone" 180 | 179,"Queens","Old Astoria","Boro Zone" 181 | 180,"Queens","Ozone Park","Boro Zone" 182 | 181,"Brooklyn","Park Slope","Boro Zone" 183 | 182,"Bronx","Parkchester","Boro Zone" 184 | 183,"Bronx","Pelham Bay","Boro Zone" 185 | 184,"Bronx","Pelham Bay Park","Boro Zone" 186 | 185,"Bronx","Pelham Parkway","Boro Zone" 187 | 186,"Manhattan","Penn Station/Madison Sq West","Yellow Zone" 188 | 187,"Staten Island","Port Richmond","Boro Zone" 189 | 188,"Brooklyn","Prospect-Lefferts Gardens","Boro Zone" 190 | 189,"Brooklyn","Prospect Heights","Boro Zone" 191 | 190,"Brooklyn","Prospect Park","Boro Zone" 192 | 191,"Queens","Queens Village","Boro Zone" 193 | 192,"Queens","Queensboro Hill","Boro Zone" 194 | 193,"Queens","Queensbridge/Ravenswood","Boro Zone" 195 | 194,"Manhattan","Randalls Island","Yellow Zone" 196 | 195,"Brooklyn","Red Hook","Boro Zone" 197 | 196,"Queens","Rego Park","Boro Zone" 198 | 197,"Queens","Richmond Hill","Boro Zone" 199 | 198,"Queens","Ridgewood","Boro Zone" 200 | 199,"Bronx","Rikers Island","Boro Zone" 201 | 200,"Bronx","Riverdale/North Riverdale/Fieldston","Boro Zone" 202 | 201,"Queens","Rockaway Park","Boro Zone" 203 | 202,"Manhattan","Roosevelt Island","Boro Zone" 204 | 203,"Queens","Rosedale","Boro Zone" 205 | 204,"Staten Island","Rossville/Woodrow","Boro Zone" 206 | 205,"Queens","Saint Albans","Boro Zone" 207 | 206,"Staten Island","Saint George/New Brighton","Boro Zone" 208 | 207,"Queens","Saint Michaels Cemetery/Woodside","Boro Zone" 209 | 208,"Bronx","Schuylerville/Edgewater Park","Boro Zone" 210 | 209,"Manhattan","Seaport","Yellow Zone" 211 | 210,"Brooklyn","Sheepshead Bay","Boro Zone" 212 | 211,"Manhattan","SoHo","Yellow Zone" 213 | 212,"Bronx","Soundview/Bruckner","Boro Zone" 214 | 213,"Bronx","Soundview/Castle Hill","Boro Zone" 215 | 214,"Staten Island","South Beach/Dongan Hills","Boro Zone" 216 | 215,"Queens","South Jamaica","Boro Zone" 217 | 216,"Queens","South Ozone Park","Boro Zone" 218 | 217,"Brooklyn","South Williamsburg","Boro Zone" 219 | 218,"Queens","Springfield Gardens North","Boro Zone" 220 | 219,"Queens","Springfield Gardens South","Boro Zone" 221 | 220,"Bronx","Spuyten Duyvil/Kingsbridge","Boro Zone" 222 | 221,"Staten Island","Stapleton","Boro Zone" 223 | 222,"Brooklyn","Starrett City","Boro Zone" 224 | 223,"Queens","Steinway","Boro Zone" 225 | 224,"Manhattan","Stuy Town/Peter Cooper Village","Yellow Zone" 226 | 225,"Brooklyn","Stuyvesant Heights","Boro Zone" 227 | 226,"Queens","Sunnyside","Boro Zone" 228 | 227,"Brooklyn","Sunset Park East","Boro Zone" 229 | 228,"Brooklyn","Sunset Park West","Boro Zone" 230 | 229,"Manhattan","Sutton Place/Turtle Bay North","Yellow Zone" 231 | 230,"Manhattan","Times Sq/Theatre District","Yellow Zone" 232 | 231,"Manhattan","TriBeCa/Civic Center","Yellow Zone" 233 | 232,"Manhattan","Two Bridges/Seward Park","Yellow Zone" 234 | 233,"Manhattan","UN/Turtle Bay South","Yellow Zone" 235 | 234,"Manhattan","Union 
Sq","Yellow Zone" 236 | 235,"Bronx","University Heights/Morris Heights","Boro Zone" 237 | 236,"Manhattan","Upper East Side North","Yellow Zone" 238 | 237,"Manhattan","Upper East Side South","Yellow Zone" 239 | 238,"Manhattan","Upper West Side North","Yellow Zone" 240 | 239,"Manhattan","Upper West Side South","Yellow Zone" 241 | 240,"Bronx","Van Cortlandt Park","Boro Zone" 242 | 241,"Bronx","Van Cortlandt Village","Boro Zone" 243 | 242,"Bronx","Van Nest/Morris Park","Boro Zone" 244 | 243,"Manhattan","Washington Heights North","Boro Zone" 245 | 244,"Manhattan","Washington Heights South","Boro Zone" 246 | 245,"Staten Island","West Brighton","Boro Zone" 247 | 246,"Manhattan","West Chelsea/Hudson Yards","Yellow Zone" 248 | 247,"Bronx","West Concourse","Boro Zone" 249 | 248,"Bronx","West Farms/Bronx River","Boro Zone" 250 | 249,"Manhattan","West Village","Yellow Zone" 251 | 250,"Bronx","Westchester Village/Unionport","Boro Zone" 252 | 251,"Staten Island","Westerleigh","Boro Zone" 253 | 252,"Queens","Whitestone","Boro Zone" 254 | 253,"Queens","Willets Point","Boro Zone" 255 | 254,"Bronx","Williamsbridge/Olinville","Boro Zone" 256 | 255,"Brooklyn","Williamsburg (North Side)","Boro Zone" 257 | 256,"Brooklyn","Williamsburg (South Side)","Boro Zone" 258 | 257,"Brooklyn","Windsor Terrace","Boro Zone" 259 | 258,"Queens","Woodhaven","Boro Zone" 260 | 259,"Bronx","Woodlawn/Wakefield","Boro Zone" 261 | 260,"Queens","Woodside","Boro Zone" 262 | 261,"Manhattan","World Trade Center","Yellow Zone" 263 | 262,"Manhattan","Yorkville East","Yellow Zone" 264 | 263,"Manhattan","Yorkville West","Yellow Zone" 265 | 264,"Unknown","NV","N/A" 266 | 265,"Unknown","NA","N/A" 267 | -------------------------------------------------------------------------------- /week4/taxi_rides_ny/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/taxi_rides_ny/snapshots/.gitkeep -------------------------------------------------------------------------------- /week4/taxi_rides_ny/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week4/taxi_rides_ny/tests/.gitkeep -------------------------------------------------------------------------------- /week4/web_to_gcs.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import requests 4 | import pandas as pd 5 | import pyarrow 6 | from google.cloud import storage 7 | 8 | """ 9 | Pre-reqs: 10 | 1. `pip install pandas pyarrow google-cloud-storage` 11 | 2. Set GOOGLE_APPLICATION_CREDENTIALS to your project/service-account key 12 | 3. Set GCP_GCS_BUCKET as your bucket or change default value of BUCKET 13 | """ 14 | 15 | init_url = 'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{service}/{service}_tripdata_{year}-{month}.csv.gz' 16 | # switch out the bucketname 17 | BUCKET = "dtc_data_lake_dtc-de-375514" 18 | 19 | def upload_to_gcs(bucket, object_name, local_file): 20 | """ 21 | Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python 22 | """ 23 | # # WORKAROUND to prevent timeout for files > 6 MB on 800 kbps upload speed. 
24 | # # (Ref: https://github.com/googleapis/python-storage/issues/74) 25 | # storage.blob._MAX_MULTIPART_SIZE = 5 * 1024 * 1024 # 5 MB 26 | # storage.blob._DEFAULT_CHUNKSIZE = 5 * 1024 * 1024 # 5 MB 27 | 28 | client = storage.Client() 29 | bucket = client.bucket(bucket) 30 | blob = bucket.blob(object_name) 31 | blob.upload_from_filename(local_file) 32 | 33 | 34 | def web_to_gcs(year, service): 35 | for i in range(12): 36 | 37 | # sets the month part of the file_name string 38 | month = '0'+str(i+1) 39 | month = month[-2:] 40 | 41 | # download it using requests via a pandas df 42 | request_url = init_url.format(service=service, year=year, month=month) 43 | file_name = request_url.split('/')[-1] 44 | os.system('wget ' + request_url) 45 | #r = requests.get(request_url) 46 | #pd.DataFrame(io.StringIO(r.text)).to_csv(file_name, compression='gzip') 47 | print(f"Local: {file_name}") 48 | 49 | # read it back into a parquet file 50 | df = pd.read_csv(file_name).astype({"PUlocationID": "Int64", "DOlocationID": "Int64"}) 51 | file_name = file_name.replace('.csv.gz', '.parquet') 52 | df.to_parquet(file_name, engine='pyarrow') 53 | print(f"Parquet: {file_name}") 54 | 55 | # upload it to gcs 56 | upload_to_gcs(BUCKET, f"{service}/{file_name}", file_name) 57 | print(f"GCS: {service}/{file_name}") 58 | 59 | 60 | web_to_gcs('2019', 'green') 61 | web_to_gcs('2020', 'green') 62 | web_to_gcs('2019', 'yellow') 63 | web_to_gcs('2020', 'yellow') 64 | web_to_gcs('2019', 'fhv') -------------------------------------------------------------------------------- /week5/09_spark_gcs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "50696932", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pyspark\n", 11 | "from pyspark.sql import SparkSession\n", 12 | "from pyspark.conf import SparkConf\n", 13 | "from pyspark.context import SparkContext" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "42094fb2", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "credentials_location = '/home/padilha/projects/de-zoomcamp/dtc-de-375514-849c13503247.json'\n", 24 | "\n", 25 | "conf = SparkConf() \\\n", 26 | " .setMaster('local[*]') \\\n", 27 | " .setAppName('test') \\\n", 28 | " .set(\"spark.jars\", \"./lib/gcs-connector-hadoop3-latest.jar\") \\\n", 29 | " .set(\"spark.hadoop.google.cloud.auth.service.account.enable\", \"true\") \\\n", 30 | " .set(\"spark.hadoop.google.cloud.auth.service.account.json.keyfile\", credentials_location)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "id": "cc1ce80f", 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stderr", 41 | "output_type": "stream", 42 | "text": [ 43 | "23/03/12 16:51:37 WARN Utils: Your hostname, padilha-A70-HYB resolves to a loopback address: 127.0.1.1; using 192.168.15.5 instead (on interface wlo1)\n", 44 | "23/03/12 16:51:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 45 | "WARNING: An illegal reflective access operation has occurred\n", 46 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/padilha/miniconda3/envs/de-zoomcamp-week5/lib/python3.9/site-packages/pyspark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 47 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 48 | "WARNING: Use 
--illegal-access=warn to enable warnings of further illegal reflective access operations\n", 49 | "WARNING: All illegal access operations will be denied in a future release\n", 50 | "23/03/12 16:51:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", 51 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 52 | "Setting default log level to \"WARN\".\n", 53 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 54 | "23/03/12 16:51:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "sc = SparkContext(conf=conf)\n", 60 | "\n", 61 | "hadoop_conf = sc._jsc.hadoopConfiguration()\n", 62 | "\n", 63 | "hadoop_conf.set(\"fs.AbstractFileSystem.gs.impl\", \"com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS\")\n", 64 | "hadoop_conf.set(\"fs.gs.impl\", \"com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem\")\n", 65 | "hadoop_conf.set(\"fs.gs.auth.service.account.json.keyfile\", credentials_location)\n", 66 | "hadoop_conf.set(\"fs.gs.auth.service.account.enable\", \"true\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "id": "ab61f2a4", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "spark = SparkSession.builder \\\n", 77 | " .config(conf=sc.getConf()) \\\n", 78 | " .getOrCreate()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 5, 84 | "id": "43d57114", 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stderr", 89 | "output_type": "stream", 90 | "text": [ 91 | " \r" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "df_green = spark.read.parquet('gs://dtc_data_lake_dtc-de-375514/pq/green/*/*')" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 7, 102 | "id": "e0a61d09", 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stderr", 107 | "output_type": "stream", 108 | "text": [ 109 | " \r" 110 | ] 111 | }, 112 | { 113 | "data": { 114 | "text/plain": [ 115 | "2304517" 116 | ] 117 | }, 118 | "execution_count": 7, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "df_green.count()" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "id": "b778aa3e", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [] 134 | } 135 | ], 136 | "metadata": { 137 | "kernelspec": { 138 | "display_name": "Python 3 (ipykernel)", 139 | "language": "python", 140 | "name": "python3" 141 | }, 142 | "language_info": { 143 | "codemirror_mode": { 144 | "name": "ipython", 145 | "version": 3 146 | }, 147 | "file_extension": ".py", 148 | "mimetype": "text/x-python", 149 | "name": "python", 150 | "nbconvert_exporter": "python", 151 | "pygments_lexer": "ipython3", 152 | "version": "3.9.16" 153 | } 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 5 157 | } 158 | -------------------------------------------------------------------------------- /week5/10_local_spark_cluster.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from pyspark.sql import SparkSession 4 | from pyspark.sql import functions as F 5 | 6 | parser = argparse.ArgumentParser(description='Ingest CSV data to Postgres') 7 | parser.add_argument('--input_green', required=True) 8 | parser.add_argument('--input_yellow', required=True) 9 | parser.add_argument('--output', 
required=True) 10 | args = parser.parse_args() 11 | 12 | input_green = args.input_green 13 | input_yellow = args.input_yellow 14 | output = args.output 15 | 16 | spark = SparkSession.builder \ 17 | .appName('test') \ 18 | .getOrCreate() 19 | 20 | df_green = spark.read.parquet(input_green) 21 | 22 | df_yellow = spark.read.parquet(input_yellow) 23 | 24 | df_green = df_green \ 25 | .withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime') \ 26 | .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime') 27 | 28 | df_yellow = df_yellow \ 29 | .withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime') \ 30 | .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime') 31 | df_yellow.columns 32 | 33 | common_columns = [ 34 | 'VendorID', 35 | 'pickup_datetime', 36 | 'dropoff_datetime', 37 | 'store_and_fwd_flag', 38 | 'RatecodeID', 39 | 'PULocationID', 40 | 'DOLocationID', 41 | 'passenger_count', 42 | 'trip_distance', 43 | 'fare_amount', 44 | 'extra', 45 | 'mta_tax', 46 | 'tip_amount', 47 | 'tolls_amount', 48 | 'ehail_fee', 49 | 'improvement_surcharge', 50 | 'total_amount', 51 | 'payment_type', 52 | 'trip_type', 53 | 'congestion_surcharge' 54 | ] 55 | 56 | df_green_sel = \ 57 | df_green \ 58 | .select(common_columns) \ 59 | .withColumn('service_type', F.lit('green')) 60 | 61 | df_yellow_sel = \ 62 | df_yellow \ 63 | .select(common_columns) \ 64 | .withColumn('service_type', F.lit('yellow')) 65 | 66 | df_trips_data = df_green_sel.unionAll(df_yellow_sel) 67 | 68 | df_trips_data.registerTempTable('trips_data') 69 | 70 | df_result = spark.sql(""" 71 | SELECT 72 | -- Reveneue grouping 73 | PULocationID AS revenue_zone, 74 | date_trunc('month', pickup_datetime) AS revenue_month, 75 | service_type, 76 | 77 | -- Revenue calculation 78 | SUM(fare_amount) AS revenue_monthly_fare, 79 | SUM(extra) AS revenue_monthly_extra, 80 | SUM(mta_tax) AS revenue_monthly_mta_tax, 81 | SUM(tip_amount) AS revenue_monthly_tip_amount, 82 | SUM(tolls_amount) AS revenue_monthly_tolls_amount, 83 | SUM(improvement_surcharge) AS revenue_monthly_improvement_surcharge, 84 | SUM(total_amount) AS revenue_monthly_total_amount, 85 | SUM(congestion_surcharge) AS revenue_monthly_congestion_surcharge, 86 | 87 | -- Additional calculations 88 | AVG(passenger_count) AS avg_montly_passenger_count, 89 | AVG(trip_distance) AS avg_montly_trip_distance 90 | FROM 91 | trips_data 92 | GROUP BY 93 | 1, 2, 3 94 | """) 95 | 96 | df_result.coalesce(1).write.parquet(output, mode='overwrite') 97 | -------------------------------------------------------------------------------- /week5/11_big_query.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from pyspark.sql import SparkSession 4 | from pyspark.sql import functions as F 5 | 6 | parser = argparse.ArgumentParser(description='Ingest CSV data to Postgres') 7 | parser.add_argument('--input_green', required=True) 8 | parser.add_argument('--input_yellow', required=True) 9 | parser.add_argument('--output', required=True) 10 | args = parser.parse_args() 11 | 12 | input_green = args.input_green 13 | input_yellow = args.input_yellow 14 | output = args.output 15 | 16 | spark = SparkSession.builder \ 17 | .appName('test') \ 18 | .getOrCreate() 19 | 20 | spark.conf.set('temporaryGcsBucket', 'dataproc-temp-europe-west6-20820862035-y4bk6fli') 21 | 22 | df_green = spark.read.parquet(input_green) 23 | 24 | df_yellow = spark.read.parquet(input_yellow) 25 | 26 | df_green = df_green \ 27 | .withColumnRenamed('lpep_pickup_datetime', 
'pickup_datetime') \ 28 | .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime') 29 | 30 | df_yellow = df_yellow \ 31 | .withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime') \ 32 | .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime') 33 | df_yellow.columns 34 | 35 | common_columns = [ 36 | 'VendorID', 37 | 'pickup_datetime', 38 | 'dropoff_datetime', 39 | 'store_and_fwd_flag', 40 | 'RatecodeID', 41 | 'PULocationID', 42 | 'DOLocationID', 43 | 'passenger_count', 44 | 'trip_distance', 45 | 'fare_amount', 46 | 'extra', 47 | 'mta_tax', 48 | 'tip_amount', 49 | 'tolls_amount', 50 | 'ehail_fee', 51 | 'improvement_surcharge', 52 | 'total_amount', 53 | 'payment_type', 54 | 'trip_type', 55 | 'congestion_surcharge' 56 | ] 57 | 58 | df_green_sel = \ 59 | df_green \ 60 | .select(common_columns) \ 61 | .withColumn('service_type', F.lit('green')) 62 | 63 | df_yellow_sel = \ 64 | df_yellow \ 65 | .select(common_columns) \ 66 | .withColumn('service_type', F.lit('yellow')) 67 | 68 | df_trips_data = df_green_sel.unionAll(df_yellow_sel) 69 | 70 | df_trips_data.registerTempTable('trips_data') 71 | 72 | df_result = spark.sql(""" 73 | SELECT 74 | -- Reveneue grouping 75 | PULocationID AS revenue_zone, 76 | date_trunc('month', pickup_datetime) AS revenue_month, 77 | service_type, 78 | 79 | -- Revenue calculation 80 | SUM(fare_amount) AS revenue_monthly_fare, 81 | SUM(extra) AS revenue_monthly_extra, 82 | SUM(mta_tax) AS revenue_monthly_mta_tax, 83 | SUM(tip_amount) AS revenue_monthly_tip_amount, 84 | SUM(tolls_amount) AS revenue_monthly_tolls_amount, 85 | SUM(improvement_surcharge) AS revenue_monthly_improvement_surcharge, 86 | SUM(total_amount) AS revenue_monthly_total_amount, 87 | SUM(congestion_surcharge) AS revenue_monthly_congestion_surcharge, 88 | 89 | -- Additional calculations 90 | AVG(passenger_count) AS avg_montly_passenger_count, 91 | AVG(trip_distance) AS avg_montly_trip_distance 92 | FROM 93 | trips_data 94 | GROUP BY 95 | 1, 2, 3 96 | """) 97 | 98 | df_result.write.format('bigquery') \ 99 | .option('table', output) \ 100 | .save() 101 | -------------------------------------------------------------------------------- /week5/de-zoomcamp-week5.yaml: -------------------------------------------------------------------------------- 1 | name: de-zoomcamp-week5 2 | channels: 3 | - defaults 4 | dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - _openmp_mutex=5.1=1_gnu 7 | - abseil-cpp=20211102.0=hd4dd3e8_0 8 | - anyio=3.5.0=py39h06a4308_0 9 | - argon2-cffi=21.3.0=pyhd3eb1b0_0 10 | - argon2-cffi-bindings=21.2.0=py39h7f8727e_0 11 | - arrow-cpp=8.0.0=py39h60b952e_1 12 | - asttokens=2.0.5=pyhd3eb1b0_0 13 | - attrs=22.1.0=py39h06a4308_0 14 | - aws-c-common=0.4.57=he6710b0_1 15 | - aws-c-event-stream=0.1.6=h2531618_5 16 | - aws-checksums=0.1.9=he6710b0_0 17 | - aws-sdk-cpp=1.8.185=hce553d0_0 18 | - babel=2.11.0=py39h06a4308_0 19 | - backcall=0.2.0=pyhd3eb1b0_0 20 | - beautifulsoup4=4.11.1=py39h06a4308_0 21 | - blas=1.0=mkl 22 | - bleach=4.1.0=pyhd3eb1b0_0 23 | - boost-cpp=1.73.0=h7f8727e_12 24 | - bottleneck=1.3.5=py39h7deecbd_0 25 | - brotlipy=0.7.0=py39h27cfd23_1003 26 | - bzip2=1.0.8=h7b6447c_0 27 | - c-ares=1.18.1=h7f8727e_0 28 | - ca-certificates=2023.01.10=h06a4308_0 29 | - certifi=2022.12.7=py39h06a4308_0 30 | - cffi=1.15.1=py39h5eee18b_3 31 | - charset-normalizer=2.0.4=pyhd3eb1b0_0 32 | - comm=0.1.2=py39h06a4308_0 33 | - cryptography=38.0.4=py39h9ce1e76_0 34 | - dbus=1.13.18=hb2f20db_0 35 | - debugpy=1.5.1=py39h295c915_0 36 | - decorator=5.1.1=pyhd3eb1b0_0 37 | - 
defusedxml=0.7.1=pyhd3eb1b0_0 38 | - entrypoints=0.4=py39h06a4308_0 39 | - executing=0.8.3=pyhd3eb1b0_0 40 | - expat=2.4.9=h6a678d5_0 41 | - flit-core=3.6.0=pyhd3eb1b0_0 42 | - fontconfig=2.14.1=h52c9d5c_1 43 | - freetype=2.12.1=h4a9f257_0 44 | - gflags=2.2.2=he6710b0_0 45 | - giflib=5.2.1=h5eee18b_1 46 | - glib=2.69.1=he621ea3_2 47 | - glog=0.5.0=h2531618_0 48 | - grpc-cpp=1.46.1=h33aed49_1 49 | - gst-plugins-base=1.14.0=h8213a91_2 50 | - gstreamer=1.14.0=h28cd5cc_2 51 | - icu=58.2=he6710b0_3 52 | - idna=3.4=py39h06a4308_0 53 | - importlib-metadata=4.11.3=py39h06a4308_0 54 | - intel-openmp=2021.4.0=h06a4308_3561 55 | - ipykernel=6.19.2=py39hb070fc8_0 56 | - ipython=8.10.0=py39h06a4308_0 57 | - ipython_genutils=0.2.0=pyhd3eb1b0_1 58 | - ipywidgets=7.6.5=pyhd3eb1b0_1 59 | - jedi=0.18.1=py39h06a4308_1 60 | - jinja2=3.1.2=py39h06a4308_0 61 | - jpeg=9e=h7f8727e_0 62 | - json5=0.9.6=pyhd3eb1b0_0 63 | - jsonschema=4.17.3=py39h06a4308_0 64 | - jupyter=1.0.0=py39h06a4308_8 65 | - jupyter_client=7.4.9=py39h06a4308_0 66 | - jupyter_console=6.4.4=py39h06a4308_0 67 | - jupyter_core=5.2.0=py39h06a4308_0 68 | - jupyter_server=1.23.4=py39h06a4308_0 69 | - jupyterlab=3.5.3=py39h06a4308_0 70 | - jupyterlab_pygments=0.1.2=py_0 71 | - jupyterlab_server=2.16.5=py39h06a4308_0 72 | - jupyterlab_widgets=1.0.0=pyhd3eb1b0_1 73 | - krb5=1.19.4=h568e23c_0 74 | - ld_impl_linux-64=2.38=h1181459_1 75 | - lerc=3.0=h295c915_0 76 | - libboost=1.73.0=h28710b8_12 77 | - libbrotlicommon=1.0.9=h5eee18b_7 78 | - libbrotlidec=1.0.9=h5eee18b_7 79 | - libbrotlienc=1.0.9=h5eee18b_7 80 | - libclang=10.0.1=default_hb85057a_2 81 | - libcurl=7.87.0=h91b91d3_0 82 | - libdeflate=1.8=h7f8727e_5 83 | - libedit=3.1.20221030=h5eee18b_0 84 | - libev=4.33=h7f8727e_1 85 | - libevent=2.1.12=h8f2d780_0 86 | - libffi=3.4.2=h6a678d5_6 87 | - libgcc-ng=11.2.0=h1234567_1 88 | - libgomp=11.2.0=h1234567_1 89 | - libllvm10=10.0.1=hbcb73fb_5 90 | - libnghttp2=1.46.0=hce63b2e_0 91 | - libpng=1.6.37=hbc83047_0 92 | - libpq=12.9=h16c4e8d_3 93 | - libprotobuf=3.20.3=he621ea3_0 94 | - libsodium=1.0.18=h7b6447c_0 95 | - libssh2=1.10.0=h8f2d780_0 96 | - libstdcxx-ng=11.2.0=h1234567_1 97 | - libthrift=0.15.0=hcc01f38_0 98 | - libtiff=4.5.0=h6a678d5_1 99 | - libuuid=1.41.5=h5eee18b_0 100 | - libwebp=1.2.4=h11a3e52_0 101 | - libwebp-base=1.2.4=h5eee18b_0 102 | - libxcb=1.15=h7f8727e_0 103 | - libxkbcommon=1.0.1=hfa300c1_0 104 | - libxml2=2.9.14=h74e7548_0 105 | - libxslt=1.1.35=h4e12654_0 106 | - lxml=4.9.1=py39h1edc446_0 107 | - lz4-c=1.9.4=h6a678d5_0 108 | - markupsafe=2.1.1=py39h7f8727e_0 109 | - matplotlib-inline=0.1.6=py39h06a4308_0 110 | - mistune=0.8.4=py39h27cfd23_1000 111 | - mkl=2021.4.0=h06a4308_640 112 | - mkl-service=2.4.0=py39h7f8727e_0 113 | - mkl_fft=1.3.1=py39hd3c417c_0 114 | - mkl_random=1.2.2=py39h51133e4_0 115 | - nbclassic=0.4.8=py39h06a4308_0 116 | - nbclient=0.5.13=py39h06a4308_0 117 | - nbconvert=6.5.4=py39h06a4308_0 118 | - nbformat=5.7.0=py39h06a4308_0 119 | - ncurses=6.4=h6a678d5_0 120 | - nest-asyncio=1.5.6=py39h06a4308_0 121 | - notebook=6.5.2=py39h06a4308_0 122 | - notebook-shim=0.2.2=py39h06a4308_0 123 | - nspr=4.33=h295c915_0 124 | - nss=3.74=h0370c37_0 125 | - numexpr=2.8.4=py39he184ba9_0 126 | - numpy=1.23.5=py39h14f4228_0 127 | - numpy-base=1.23.5=py39h31eccc5_0 128 | - openjdk=11.0.13=h87a67e3_0 129 | - openssl=1.1.1t=h7f8727e_0 130 | - orc=1.7.4=hb3bc3d3_1 131 | - packaging=22.0=py39h06a4308_0 132 | - pandas=1.5.2=py39h417a72b_0 133 | - pandocfilters=1.5.0=pyhd3eb1b0_0 134 | - parso=0.8.3=pyhd3eb1b0_0 135 | - 
pcre=8.45=h295c915_0 136 | - pexpect=4.8.0=pyhd3eb1b0_3 137 | - pickleshare=0.7.5=pyhd3eb1b0_1003 138 | - pip=22.3.1=py39h06a4308_0 139 | - platformdirs=2.5.2=py39h06a4308_0 140 | - ply=3.11=py39h06a4308_0 141 | - prometheus_client=0.14.1=py39h06a4308_0 142 | - prompt-toolkit=3.0.36=py39h06a4308_0 143 | - prompt_toolkit=3.0.36=hd3eb1b0_0 144 | - psutil=5.9.0=py39h5eee18b_0 145 | - ptyprocess=0.7.0=pyhd3eb1b0_2 146 | - pure_eval=0.2.2=pyhd3eb1b0_0 147 | - py4j=0.10.9.3=py39h06a4308_0 148 | - pyarrow=8.0.0=py39h992f0b0_0 149 | - pycparser=2.21=pyhd3eb1b0_0 150 | - pygments=2.11.2=pyhd3eb1b0_0 151 | - pyopenssl=22.0.0=pyhd3eb1b0_0 152 | - pyqt=5.15.7=py39h6a678d5_1 153 | - pyqt5-sip=12.11.0=py39h6a678d5_1 154 | - pyrsistent=0.18.0=py39heee7806_0 155 | - pysocks=1.7.1=py39h06a4308_0 156 | - pyspark=3.2.1=py39h06a4308_0 157 | - python=3.9.16=h7a1cb2a_0 158 | - python-dateutil=2.8.2=pyhd3eb1b0_0 159 | - python-fastjsonschema=2.16.2=py39h06a4308_0 160 | - pytz=2022.7=py39h06a4308_0 161 | - pyzmq=23.2.0=py39h6a678d5_0 162 | - qt-main=5.15.2=h327a75a_7 163 | - qt-webengine=5.15.9=hd2b0992_4 164 | - qtconsole=5.4.0=py39h06a4308_0 165 | - qtpy=2.2.0=py39h06a4308_0 166 | - qtwebkit=5.212=h4eab89a_4 167 | - re2=2022.04.01=h295c915_0 168 | - readline=8.2=h5eee18b_0 169 | - requests=2.28.1=py39h06a4308_0 170 | - send2trash=1.8.0=pyhd3eb1b0_1 171 | - setuptools=65.6.3=py39h06a4308_0 172 | - sip=6.6.2=py39h6a678d5_0 173 | - six=1.16.0=pyhd3eb1b0_1 174 | - snappy=1.1.9=h295c915_0 175 | - sniffio=1.2.0=py39h06a4308_1 176 | - soupsieve=2.3.2.post1=py39h06a4308_0 177 | - sqlite=3.40.1=h5082296_0 178 | - stack_data=0.2.0=pyhd3eb1b0_0 179 | - terminado=0.17.1=py39h06a4308_0 180 | - tinycss2=1.2.1=py39h06a4308_0 181 | - tk=8.6.12=h1ccaba5_0 182 | - toml=0.10.2=pyhd3eb1b0_0 183 | - tomli=2.0.1=py39h06a4308_0 184 | - tornado=6.2=py39h5eee18b_0 185 | - traitlets=5.7.1=py39h06a4308_0 186 | - typing-extensions=4.4.0=py39h06a4308_0 187 | - typing_extensions=4.4.0=py39h06a4308_0 188 | - tzdata=2022g=h04d1e81_0 189 | - urllib3=1.26.14=py39h06a4308_0 190 | - utf8proc=2.6.1=h27cfd23_0 191 | - wcwidth=0.2.5=pyhd3eb1b0_0 192 | - webencodings=0.5.1=py39h06a4308_1 193 | - websocket-client=0.58.0=py39h06a4308_4 194 | - wheel=0.38.4=py39h06a4308_0 195 | - widgetsnbextension=3.5.2=py39h06a4308_0 196 | - xz=5.2.10=h5eee18b_1 197 | - zeromq=4.3.4=h2531618_0 198 | - zipp=3.11.0=py39h06a4308_0 199 | - zlib=1.2.13=h5eee18b_0 200 | - zstd=1.5.2=ha4553b6_0 201 | prefix: /home/padilha/miniconda3/envs/de-zoomcamp-week5 202 | -------------------------------------------------------------------------------- /week5/download_data.sh: -------------------------------------------------------------------------------- 1 | set -e # Exit immediately if a command exits with a non-zero status. 
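# Usage: bash download_data.sh <taxi_type> <year>
# e.g. bash download_data.sh yellow 2020 downloads the twelve monthly CSV.gz files
# for 2020 into data/raw/yellow/2020/<month>/.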
2 | 3 | TAXI_TYPE=$1 #"yellow" 4 | YEAR=$2 #2020 5 | URL_PREFIX="https://github.com/DataTalksClub/nyc-tlc-data/releases/download" 6 | 7 | for MONTH in {1..12}; do 8 | FMONTH=`printf "%02d" ${MONTH}` 9 | URL="${URL_PREFIX}/${TAXI_TYPE}/${TAXI_TYPE}_tripdata_${YEAR}-${FMONTH}.csv.gz" 10 | LOCAL_PREFIX="data/raw/${TAXI_TYPE}/${YEAR}/${FMONTH}" 11 | LOCAL_FILE="${TAXI_TYPE}_tripdata_${YEAR}_${FMONTH}.csv.gz" 12 | LOCAL_PATH="${LOCAL_PREFIX}/${LOCAL_FILE}" 13 | mkdir -p ${LOCAL_PREFIX} 14 | wget ${URL} -O ${LOCAL_PATH} 15 | done -------------------------------------------------------------------------------- /week5/img/bigquery1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/bigquery1.png -------------------------------------------------------------------------------- /week5/img/bigquery2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/bigquery2.png -------------------------------------------------------------------------------- /week5/img/cluster1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/cluster1.png -------------------------------------------------------------------------------- /week5/img/cluster2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/cluster2.png -------------------------------------------------------------------------------- /week5/img/create_cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/create_cluster.png -------------------------------------------------------------------------------- /week5/img/groupby1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/groupby1.png -------------------------------------------------------------------------------- /week5/img/groupby2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/groupby2.png -------------------------------------------------------------------------------- /week5/img/join1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/join1.png -------------------------------------------------------------------------------- /week5/img/join2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/join2.png -------------------------------------------------------------------------------- /week5/img/join3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/join3.png -------------------------------------------------------------------------------- /week5/img/join4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/join4.png -------------------------------------------------------------------------------- /week5/img/mapPartition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/mapPartition.png -------------------------------------------------------------------------------- /week5/img/spark-master.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/spark-master.png -------------------------------------------------------------------------------- /week5/img/spark-session.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/spark-session.png -------------------------------------------------------------------------------- /week5/img/submit_job.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/submit_job.png -------------------------------------------------------------------------------- /week5/img/worker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week5/img/worker.png -------------------------------------------------------------------------------- /week6/img/api-key1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/api-key1.png -------------------------------------------------------------------------------- /week6/img/api-key2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/api-key2.png -------------------------------------------------------------------------------- /week6/img/avro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/avro.png -------------------------------------------------------------------------------- /week6/img/basic-cluster1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/basic-cluster1.png -------------------------------------------------------------------------------- /week6/img/basic-cluster2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/basic-cluster2.png 
-------------------------------------------------------------------------------- /week6/img/basic-cluster3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/basic-cluster3.png -------------------------------------------------------------------------------- /week6/img/connector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/connector.png -------------------------------------------------------------------------------- /week6/img/global_ktable.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/global_ktable.png -------------------------------------------------------------------------------- /week6/img/join-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/join-example.png -------------------------------------------------------------------------------- /week6/img/kafka-streams-basics1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/kafka-streams-basics1.png -------------------------------------------------------------------------------- /week6/img/kafka-streams-basics2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/kafka-streams-basics2.png -------------------------------------------------------------------------------- /week6/img/kafka-streams-basics3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/kafka-streams-basics3.png -------------------------------------------------------------------------------- /week6/img/ktables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/ktables.png -------------------------------------------------------------------------------- /week6/img/messages1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/messages1.png -------------------------------------------------------------------------------- /week6/img/messages2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/messages2.png -------------------------------------------------------------------------------- /week6/img/offset-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/offset-example.png 
-------------------------------------------------------------------------------- /week6/img/partition-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/partition-example.png -------------------------------------------------------------------------------- /week6/img/rides-location-topic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/rides-location-topic.png -------------------------------------------------------------------------------- /week6/img/schema-registry1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/schema-registry1.png -------------------------------------------------------------------------------- /week6/img/schema-registry2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/schema-registry2.png -------------------------------------------------------------------------------- /week6/img/topic1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/topic1.png -------------------------------------------------------------------------------- /week6/img/topic2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/topic2.png -------------------------------------------------------------------------------- /week6/img/vendor-info-topic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/img/vendor-info-topic.png -------------------------------------------------------------------------------- /week6/java/kafka_examples/.gitignore: -------------------------------------------------------------------------------- 1 | .gradle 2 | bin 3 | !src/main/resources/rides.csv 4 | 5 | build/classes 6 | build/generated 7 | build/libs 8 | build/reports 9 | build/resources 10 | build/test-results 11 | build/tmp 12 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'java' 3 | id "com.github.davidmc24.gradle.plugin.avro" version "1.5.0" 4 | } 5 | 6 | 7 | group 'org.example' 8 | version '1.0-SNAPSHOT' 9 | 10 | repositories { 11 | mavenCentral() 12 | maven { 13 | url "https://packages.confluent.io/maven" 14 | } 15 | } 16 | 17 | dependencies { 18 | implementation 'org.apache.kafka:kafka-clients:3.3.1' 19 | implementation 'com.opencsv:opencsv:5.7.1' 20 | implementation 'io.confluent:kafka-json-serializer:7.3.1' 21 | implementation 'org.apache.kafka:kafka-streams:3.3.1' 22 | implementation 'io.confluent:kafka-avro-serializer:7.3.1' 23 | implementation 'io.confluent:kafka-schema-registry-client:7.3.1' 24 | implementation 'io.confluent:kafka-streams-avro-serde:7.3.1' 25 | 
implementation "org.apache.avro:avro:1.11.0" 26 | testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.1' 27 | testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine:5.8.1' 28 | testImplementation 'org.apache.kafka:kafka-streams-test-utils:3.3.1' 29 | } 30 | 31 | sourceSets.main.java.srcDirs = ['build/generated-main-avro-java','src/main/java'] 32 | 33 | test { 34 | useJUnitPlatform() 35 | } 36 | 37 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/padilha/de-zoomcamp/d40c00ffe7014458f64940a3d00f634b0543a883/week6/java/kafka_examples/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /week6/java/kafka_examples/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.5.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/gradlew: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # Copyright © 2015-2021 the original authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ############################################################################## 20 | # 21 | # Gradle start up script for POSIX generated by Gradle. 22 | # 23 | # Important for running: 24 | # 25 | # (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is 26 | # noncompliant, but you have some other compliant shell such as ksh or 27 | # bash, then to run this script, type that shell name before the whole 28 | # command line, like: 29 | # 30 | # ksh Gradle 31 | # 32 | # Busybox and similar reduced shells will NOT work, because this script 33 | # requires all of these POSIX shell features: 34 | # * functions; 35 | # * expansions «$var», «${var}», «${var:-default}», «${var+SET}», 36 | # «${var#prefix}», «${var%suffix}», and «$( cmd )»; 37 | # * compound commands having a testable exit status, especially «case»; 38 | # * various built-in commands including «command», «set», and «ulimit». 39 | # 40 | # Important for patching: 41 | # 42 | # (2) This script targets any POSIX shell, so it avoids extensions provided 43 | # by Bash, Ksh, etc; in particular arrays are avoided. 44 | # 45 | # The "traditional" practice of packing multiple parameters into a 46 | # space-separated string is a well documented source of bugs and security 47 | # problems, so this is (mostly) avoided, by progressively accumulating 48 | # options in "$@", and eventually passing that to Java. 
49 | # 50 | # Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, 51 | # and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; 52 | # see the in-line comments for details. 53 | # 54 | # There are tweaks for specific operating systems such as AIX, CygWin, 55 | # Darwin, MinGW, and NonStop. 56 | # 57 | # (3) This script is generated from the Groovy template 58 | # https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt 59 | # within the Gradle project. 60 | # 61 | # You can find Gradle at https://github.com/gradle/gradle/. 62 | # 63 | ############################################################################## 64 | 65 | # Attempt to set APP_HOME 66 | 67 | # Resolve links: $0 may be a link 68 | app_path=$0 69 | 70 | # Need this for daisy-chained symlinks. 71 | while 72 | APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path 73 | [ -h "$app_path" ] 74 | do 75 | ls=$( ls -ld "$app_path" ) 76 | link=${ls#*' -> '} 77 | case $link in #( 78 | /*) app_path=$link ;; #( 79 | *) app_path=$APP_HOME$link ;; 80 | esac 81 | done 82 | 83 | APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit 84 | 85 | APP_NAME="Gradle" 86 | APP_BASE_NAME=${0##*/} 87 | 88 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 89 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 90 | 91 | # Use the maximum available, or set MAX_FD != -1 to use that value. 92 | MAX_FD=maximum 93 | 94 | warn () { 95 | echo "$*" 96 | } >&2 97 | 98 | die () { 99 | echo 100 | echo "$*" 101 | echo 102 | exit 1 103 | } >&2 104 | 105 | # OS specific support (must be 'true' or 'false'). 106 | cygwin=false 107 | msys=false 108 | darwin=false 109 | nonstop=false 110 | case "$( uname )" in #( 111 | CYGWIN* ) cygwin=true ;; #( 112 | Darwin* ) darwin=true ;; #( 113 | MSYS* | MINGW* ) msys=true ;; #( 114 | NONSTOP* ) nonstop=true ;; 115 | esac 116 | 117 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 118 | 119 | 120 | # Determine the Java command to use to start the JVM. 121 | if [ -n "$JAVA_HOME" ] ; then 122 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 123 | # IBM's JDK on AIX uses strange locations for the executables 124 | JAVACMD=$JAVA_HOME/jre/sh/java 125 | else 126 | JAVACMD=$JAVA_HOME/bin/java 127 | fi 128 | if [ ! -x "$JAVACMD" ] ; then 129 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 130 | 131 | Please set the JAVA_HOME variable in your environment to match the 132 | location of your Java installation." 133 | fi 134 | else 135 | JAVACMD=java 136 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 137 | 138 | Please set the JAVA_HOME variable in your environment to match the 139 | location of your Java installation." 140 | fi 141 | 142 | # Increase the maximum file descriptors if we can. 143 | if ! "$cygwin" && ! "$darwin" && ! 
"$nonstop" ; then 144 | case $MAX_FD in #( 145 | max*) 146 | MAX_FD=$( ulimit -H -n ) || 147 | warn "Could not query maximum file descriptor limit" 148 | esac 149 | case $MAX_FD in #( 150 | '' | soft) :;; #( 151 | *) 152 | ulimit -n "$MAX_FD" || 153 | warn "Could not set maximum file descriptor limit to $MAX_FD" 154 | esac 155 | fi 156 | 157 | # Collect all arguments for the java command, stacking in reverse order: 158 | # * args from the command line 159 | # * the main class name 160 | # * -classpath 161 | # * -D...appname settings 162 | # * --module-path (only if needed) 163 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 164 | 165 | # For Cygwin or MSYS, switch paths to Windows format before running java 166 | if "$cygwin" || "$msys" ; then 167 | APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) 168 | CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) 169 | 170 | JAVACMD=$( cygpath --unix "$JAVACMD" ) 171 | 172 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 173 | for arg do 174 | if 175 | case $arg in #( 176 | -*) false ;; # don't mess with options #( 177 | /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath 178 | [ -e "$t" ] ;; #( 179 | *) false ;; 180 | esac 181 | then 182 | arg=$( cygpath --path --ignore --mixed "$arg" ) 183 | fi 184 | # Roll the args list around exactly as many times as the number of 185 | # args, so each arg winds up back in the position where it started, but 186 | # possibly modified. 187 | # 188 | # NB: a `for` loop captures its iteration list before it begins, so 189 | # changing the positional parameters here affects neither the number of 190 | # iterations, nor the values presented in `arg`. 191 | shift # remove old arg 192 | set -- "$@" "$arg" # push replacement arg 193 | done 194 | fi 195 | 196 | # Collect all arguments for the java command; 197 | # * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of 198 | # shell script including quotes and variable substitutions, so put them in 199 | # double quotes to make sure that they get re-expanded; and 200 | # * put everything else in single quotes, so that it's not re-expanded. 201 | 202 | set -- \ 203 | "-Dorg.gradle.appname=$APP_BASE_NAME" \ 204 | -classpath "$CLASSPATH" \ 205 | org.gradle.wrapper.GradleWrapperMain \ 206 | "$@" 207 | 208 | # Stop when "xargs" is not available. 209 | if ! command -v xargs >/dev/null 2>&1 210 | then 211 | die "xargs is not available" 212 | fi 213 | 214 | # Use "xargs" to parse quoted args. 215 | # 216 | # With -n1 it outputs one arg per line, with the quotes and backslashes removed. 217 | # 218 | # In Bash we could simply go: 219 | # 220 | # readarray ARGS < <( xargs -n1 <<<"$var" ) && 221 | # set -- "${ARGS[@]}" "$@" 222 | # 223 | # but POSIX shell has neither arrays nor command substitution, so instead we 224 | # post-process each arg (as a line of input to sed) to backslash-escape any 225 | # character that might be a shell metacharacter, then use eval to reverse 226 | # that process (while maintaining the separation between arguments), and wrap 227 | # the whole thing up as a single "set" statement. 228 | # 229 | # This will of course break if any of these variables contains a newline or 230 | # an unmatched quote. 
231 | # 232 | 233 | eval "set -- $( 234 | printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | 235 | xargs -n1 | 236 | sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | 237 | tr '\n' ' ' 238 | )" '"$@"' 239 | 240 | exec "$JAVACMD" "$@" 241 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%"=="" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%"=="" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if %ERRORLEVEL% equ 0 goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if %ERRORLEVEL% equ 0 goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 
83 | set EXIT_CODE=%ERRORLEVEL% 84 | if %EXIT_CODE% equ 0 set EXIT_CODE=1 85 | if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% 86 | exit /b %EXIT_CODE% 87 | 88 | :mainEnd 89 | if "%OS%"=="Windows_NT" endlocal 90 | 91 | :omega 92 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/settings.gradle: -------------------------------------------------------------------------------- 1 | pluginManagement { 2 | repositories { 3 | gradlePluginPortal() 4 | mavenCentral() 5 | } 6 | } 7 | rootProject.name = 'kafka_examples' -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/avro/rides.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name":"RideRecord", 4 | "namespace": "schemaregistry", 5 | "fields":[ 6 | {"name":"vendor_id","type":"string"}, 7 | {"name":"passenger_count","type":"int"}, 8 | {"name":"trip_distance","type":"double"} 9 | ] 10 | } -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/avro/rides_compatible.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name":"RideRecordCompatible", 4 | "namespace": "schemaregistry", 5 | "fields":[ 6 | {"name":"vendorId","type":"string"}, 7 | {"name":"passenger_count","type":"int"}, 8 | {"name":"trip_distance","type":"double"}, 9 | {"name":"pu_location_id", "type": [ "null", "long" ], "default": null} 10 | ] 11 | } -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/avro/rides_non_compatible.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name":"RideRecordNoneCompatible", 4 | "namespace": "schemaregistry", 5 | "fields":[ 6 | {"name":"vendorId","type":"int"}, 7 | {"name":"passenger_count","type":"int"}, 8 | {"name":"trip_distance","type":"double"} 9 | ] 10 | } -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/java/org/example/AvroProducer.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import com.opencsv.CSVReader; 4 | import com.opencsv.exceptions.CsvException; 5 | import io.confluent.kafka.serializers.AbstractKafkaAvroSerDeConfig; 6 | import io.confluent.kafka.serializers.KafkaAvroSerializer; 7 | import org.apache.kafka.clients.producer.KafkaProducer; 8 | import org.apache.kafka.clients.producer.ProducerConfig; 9 | import org.apache.kafka.clients.producer.ProducerRecord; 10 | import org.apache.kafka.streams.StreamsConfig; 11 | import schemaregistry.RideRecord; 12 | 13 | import java.io.FileReader; 14 | import java.io.IOException; 15 | import java.util.List; 16 | import java.util.Properties; 17 | import java.util.concurrent.ExecutionException; 18 | import java.util.stream.Collectors; 19 | 20 | public class AvroProducer { 21 | 22 | private Properties props = new Properties(); 23 | 24 | public AvroProducer() { 25 | props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-lzoyy.europe-west6.gcp.confluent.cloud:9092"); 26 | props.put("security.protocol", "SASL_SSL"); 27 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 28 | 
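        // Confluent Cloud brokers are reached over SASL_SSL with the PLAIN mechanism; the cluster key/secret
        // placeholders come from Secrets.java. The Avro value serializer and the Schema Registry endpoint
        // (with its own basic-auth key/secret) are configured a few lines further down.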
props.put("sasl.mechanism", "PLAIN"); 29 | props.put("client.dns.lookup", "use_all_dns_ips"); 30 | props.put("session.timeout.ms", "45000"); 31 | props.put(ProducerConfig.ACKS_CONFIG, "all"); 32 | props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); 33 | props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, KafkaAvroSerializer.class.getName()); 34 | 35 | props.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, "https://psrc-kk5gg.europe-west3.gcp.confluent.cloud"); 36 | props.put("basic.auth.credentials.source", "USER_INFO"); 37 | props.put("basic.auth.user.info", Secrets.SCHEMA_REGISTRY_KEY+":"+Secrets.SCHEMA_REGISTRY_SECRET); 38 | } 39 | 40 | public List getRides() throws IOException, CsvException { 41 | var ridesStream = this.getClass().getResource("/rides.csv"); 42 | var reader = new CSVReader(new FileReader(ridesStream.getFile())); 43 | reader.skip(1); 44 | 45 | return reader.readAll().stream().map(row -> 46 | RideRecord.newBuilder() 47 | .setVendorId(row[0]) 48 | .setTripDistance(Double.parseDouble(row[4])) 49 | .setPassengerCount(Integer.parseInt(row[3])) 50 | .build() 51 | ).collect(Collectors.toList()); 52 | } 53 | 54 | public void publishRides(List rides) throws ExecutionException, InterruptedException { 55 | KafkaProducer kafkaProducer = new KafkaProducer<>(props); 56 | for (RideRecord ride : rides) { 57 | var record = kafkaProducer.send(new ProducerRecord<>("rides_avro", String.valueOf(ride.getVendorId()), ride), (metadata, exception) -> { 58 | if (exception != null) { 59 | System.out.println(exception.getMessage()); 60 | } 61 | }); 62 | System.out.println(record.get().offset()); 63 | Thread.sleep(500); 64 | } 65 | } 66 | 67 | public static void main(String[] args) throws IOException, CsvException, ExecutionException, InterruptedException { 68 | var producer = new AvroProducer(); 69 | var rideRecords = producer.getRides(); 70 | producer.publishRides(rideRecords); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/java/org/example/JsonConsumer.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.clients.consumer.ConsumerRecord; 5 | import org.apache.kafka.clients.consumer.KafkaConsumer; 6 | import org.apache.kafka.clients.producer.ProducerConfig; 7 | import org.example.data.Ride; 8 | 9 | import java.time.Duration; 10 | import java.time.temporal.ChronoUnit; 11 | import java.time.temporal.TemporalUnit; 12 | import java.util.List; 13 | import java.util.Properties; 14 | import io.confluent.kafka.serializers.KafkaJsonDeserializerConfig; 15 | public class JsonConsumer { 16 | 17 | private Properties props = new Properties(); 18 | private KafkaConsumer consumer; 19 | public JsonConsumer() { 20 | props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-lzoyy.europe-west6.gcp.confluent.cloud:9092"); 21 | props.put("security.protocol", "SASL_SSL"); 22 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 23 | props.put("sasl.mechanism", "PLAIN"); 24 | props.put("client.dns.lookup", "use_all_dns_ips"); 25 | props.put("session.timeout.ms", "45000"); 26 | props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, 
"org.apache.kafka.common.serialization.StringDeserializer"); 27 | props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "io.confluent.kafka.serializers.KafkaJsonDeserializer"); 28 | props.put(ConsumerConfig.GROUP_ID_CONFIG, "kafka_tutorial_example.jsonconsumer.v2"); 29 | props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); 30 | props.put(KafkaJsonDeserializerConfig.JSON_VALUE_TYPE, Ride.class); 31 | consumer = new KafkaConsumer(props); 32 | consumer.subscribe(List.of("rides")); 33 | 34 | } 35 | 36 | public void consumeFromKafka() { 37 | System.out.println("Consuming form kafka started"); 38 | var results = consumer.poll(Duration.of(1, ChronoUnit.SECONDS)); 39 | var i = 0; 40 | do { 41 | 42 | for(ConsumerRecord result: results) { 43 | System.out.println(result.value().DOLocationID); 44 | } 45 | results = consumer.poll(Duration.of(1, ChronoUnit.SECONDS)); 46 | System.out.println("RESULTS:::" + results.count()); 47 | i++; 48 | } 49 | while(!results.isEmpty() || i < 10); 50 | } 51 | 52 | public static void main(String[] args) { 53 | JsonConsumer jsonConsumer = new JsonConsumer(); 54 | jsonConsumer.consumeFromKafka(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/java/org/example/JsonKStream.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.common.serialization.Serdes; 5 | import org.apache.kafka.streams.KafkaStreams; 6 | import org.apache.kafka.streams.StreamsBuilder; 7 | import org.apache.kafka.streams.StreamsConfig; 8 | import org.apache.kafka.streams.Topology; 9 | import org.apache.kafka.streams.kstream.Consumed; 10 | import org.apache.kafka.streams.kstream.Produced; 11 | import org.example.customserdes.CustomSerdes; 12 | import org.example.data.Ride; 13 | 14 | import java.util.Properties; 15 | 16 | public class JsonKStream { 17 | private Properties props = new Properties(); 18 | 19 | public JsonKStream() { 20 | props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-lzoyy.europe-west6.gcp.confluent.cloud:9092"); 21 | props.put("security.protocol", "SASL_SSL"); 22 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 23 | props.put("sasl.mechanism", "PLAIN"); 24 | props.put("client.dns.lookup", "use_all_dns_ips"); 25 | props.put("session.timeout.ms", "45000"); 26 | props.put(StreamsConfig.APPLICATION_ID_CONFIG, "kafka_tutorial.kstream.count.plocation.v1"); 27 | props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest"); 28 | props.put(StreamsConfig.CACHE_MAX_BYTES_BUFFERING_CONFIG, 0); 29 | 30 | } 31 | 32 | public Topology createTopology() { 33 | StreamsBuilder streamsBuilder = new StreamsBuilder(); 34 | var ridesStream = streamsBuilder.stream("rides", Consumed.with(Serdes.String(), CustomSerdes.getSerde(Ride.class))); 35 | var puLocationCount = ridesStream.groupByKey().count().toStream(); 36 | puLocationCount.to("rides-pulocation-count", Produced.with(Serdes.String(), Serdes.Long())); 37 | return streamsBuilder.build(); 38 | } 39 | 40 | public void countPLocation() throws InterruptedException { 41 | var topology = createTopology(); 42 | var kStreams = new KafkaStreams(topology, props); 43 | kStreams.start(); 44 | while (kStreams.state() != KafkaStreams.State.RUNNING) { 45 | 
System.out.println(kStreams.state()); 46 | Thread.sleep(1000); 47 | } 48 | System.out.println(kStreams.state()); 49 | Runtime.getRuntime().addShutdownHook(new Thread(kStreams::close)); 50 | } 51 | 52 | public static void main(String[] args) throws InterruptedException { 53 | var object = new JsonKStream(); 54 | object.countPLocation(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/java/org/example/JsonKStreamJoins.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.common.serialization.Serdes; 5 | import org.apache.kafka.streams.KafkaStreams; 6 | import org.apache.kafka.streams.StreamsBuilder; 7 | import org.apache.kafka.streams.StreamsConfig; 8 | import org.apache.kafka.streams.Topology; 9 | import org.apache.kafka.streams.errors.StreamsUncaughtExceptionHandler; 10 | import org.apache.kafka.streams.kstream.*; 11 | import org.example.customserdes.CustomSerdes; 12 | import org.example.data.PickupLocation; 13 | import org.example.data.Ride; 14 | import org.example.data.VendorInfo; 15 | 16 | import java.time.Duration; 17 | import java.util.Optional; 18 | import java.util.Properties; 19 | public class JsonKStreamJoins { 20 | private Properties props = new Properties(); 21 | 22 | public JsonKStreamJoins() { 23 | props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-lzoyy.europe-west6.gcp.confluent.cloud:9092"); 24 | props.put("security.protocol", "SASL_SSL"); 25 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 26 | props.put("sasl.mechanism", "PLAIN"); 27 | props.put("client.dns.lookup", "use_all_dns_ips"); 28 | props.put("session.timeout.ms", "45000"); 29 | props.put(StreamsConfig.APPLICATION_ID_CONFIG, "kafka_tutorial.kstream.joined.rides.pickuplocation.v1"); 30 | props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest"); 31 | props.put(StreamsConfig.CACHE_MAX_BYTES_BUFFERING_CONFIG, 0); 32 | } 33 | 34 | public Topology createTopology() { 35 | StreamsBuilder streamsBuilder = new StreamsBuilder(); 36 | KStream rides = streamsBuilder.stream(Topics.INPUT_RIDE_TOPIC, Consumed.with(Serdes.String(), CustomSerdes.getSerde(Ride.class))); 37 | KStream pickupLocations = streamsBuilder.stream(Topics.INPUT_RIDE_LOCATION_TOPIC, Consumed.with(Serdes.String(), CustomSerdes.getSerde(PickupLocation.class))); 38 | 39 | var pickupLocationsKeyedOnPUId = pickupLocations.selectKey((key, value) -> String.valueOf(value.PULocationID)); 40 | 41 | var joined = rides.join(pickupLocationsKeyedOnPUId, (ValueJoiner>) (ride, pickupLocation) -> { 42 | var period = Duration.between(ride.tpep_dropoff_datetime, pickupLocation.tpep_pickup_datetime); 43 | if (period.abs().toMinutes() > 10) return Optional.empty(); 44 | else return Optional.of(new VendorInfo(ride.VendorID, pickupLocation.PULocationID, pickupLocation.tpep_pickup_datetime, ride.tpep_dropoff_datetime)); 45 | }, JoinWindows.ofTimeDifferenceAndGrace(Duration.ofMinutes(20), Duration.ofMinutes(5)), 46 | StreamJoined.with(Serdes.String(), CustomSerdes.getSerde(Ride.class), CustomSerdes.getSerde(PickupLocation.class))); 47 | 48 | joined.filter(((key, value) -> value.isPresent())).mapValues(Optional::get) 49 | .to(Topics.OUTPUT_TOPIC, Produced.with(Serdes.String(), 
CustomSerdes.getSerde(VendorInfo.class))); 50 | 51 | return streamsBuilder.build(); 52 | } 53 | 54 | public void joinRidesPickupLocation() throws InterruptedException { 55 | var topology = createTopology(); 56 | var kStreams = new KafkaStreams(topology, props); 57 | 58 | kStreams.setUncaughtExceptionHandler(exception -> { 59 | System.out.println(exception.getMessage()); 60 | return StreamsUncaughtExceptionHandler.StreamThreadExceptionResponse.SHUTDOWN_APPLICATION; 61 | }); 62 | kStreams.start(); 63 | while (kStreams.state() != KafkaStreams.State.RUNNING) { 64 | System.out.println(kStreams.state()); 65 | Thread.sleep(1000); 66 | } 67 | System.out.println(kStreams.state()); 68 | Runtime.getRuntime().addShutdownHook(new Thread(kStreams::close)); 69 | 70 | } 71 | 72 | public static void main(String[] args) throws InterruptedException { 73 | var object = new JsonKStreamJoins(); 74 | object.joinRidesPickupLocation(); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/java/org/example/JsonKStreamWindow.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.common.serialization.Serdes; 5 | import org.apache.kafka.streams.KafkaStreams; 6 | import org.apache.kafka.streams.StreamsBuilder; 7 | import org.apache.kafka.streams.StreamsConfig; 8 | import org.apache.kafka.streams.Topology; 9 | import org.apache.kafka.streams.kstream.Consumed; 10 | import org.apache.kafka.streams.kstream.Produced; 11 | import org.apache.kafka.streams.kstream.TimeWindows; 12 | import org.apache.kafka.streams.kstream.WindowedSerdes; 13 | import org.example.customserdes.CustomSerdes; 14 | import org.example.data.Ride; 15 | 16 | import java.time.Duration; 17 | import java.time.temporal.ChronoUnit; 18 | import java.util.Properties; 19 | 20 | public class JsonKStreamWindow { 21 | private Properties props = new Properties(); 22 | 23 | public JsonKStreamWindow() { 24 | props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-lzoyy.europe-west6.gcp.confluent.cloud:9092"); 25 | props.put("security.protocol", "SASL_SSL"); 26 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 27 | props.put("sasl.mechanism", "PLAIN"); 28 | props.put("client.dns.lookup", "use_all_dns_ips"); 29 | props.put("session.timeout.ms", "45000"); 30 | props.put(StreamsConfig.APPLICATION_ID_CONFIG, "kafka_tutorial.kstream.count.plocation.v1"); 31 | props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest"); 32 | props.put(StreamsConfig.CACHE_MAX_BYTES_BUFFERING_CONFIG, 0); 33 | 34 | } 35 | 36 | public Topology createTopology() { 37 | StreamsBuilder streamsBuilder = new StreamsBuilder(); 38 | var ridesStream = streamsBuilder.stream("rides", Consumed.with(Serdes.String(), CustomSerdes.getSerde(Ride.class))); 39 | var puLocationCount = ridesStream.groupByKey() 40 | .windowedBy(TimeWindows.ofSizeAndGrace(Duration.ofSeconds(10), Duration.ofSeconds(5))) 41 | .count().toStream(); 42 | var windowSerde = WindowedSerdes.timeWindowedSerdeFrom(String.class, 10*1000); 43 | 44 | puLocationCount.to("rides-pulocation-window-count", Produced.with(windowSerde, Serdes.Long())); 45 | return streamsBuilder.build(); 46 | } 47 | 48 | public void countPLocationWindowed() { 49 | var topology = createTopology(); 50 | var 
kStreams = new KafkaStreams(topology, props); 51 | kStreams.start(); 52 | 53 | Runtime.getRuntime().addShutdownHook(new Thread(kStreams::close)); 54 | } 55 | 56 | public static void main(String[] args) { 57 | var object = new JsonKStreamWindow(); 58 | object.countPLocationWindowed(); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/java/org/example/JsonProducer.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import com.opencsv.CSVReader; 4 | import com.opencsv.exceptions.CsvException; 5 | import org.apache.kafka.clients.producer.*; 6 | import org.apache.kafka.streams.StreamsConfig; 7 | import org.example.data.Ride; 8 | 9 | import java.io.FileReader; 10 | import java.io.IOException; 11 | import java.time.LocalDateTime; 12 | import java.util.List; 13 | import java.util.Properties; 14 | import java.util.concurrent.ExecutionException; 15 | import java.util.stream.Collectors; 16 | 17 | public class JsonProducer { 18 | private Properties props = new Properties(); 19 | public JsonProducer() { 20 | props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-lzoyy.europe-west6.gcp.confluent.cloud:9092"); 21 | props.put("security.protocol", "SASL_SSL"); 22 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 23 | props.put("sasl.mechanism", "PLAIN"); 24 | props.put("client.dns.lookup", "use_all_dns_ips"); 25 | props.put("session.timeout.ms", "45000"); 26 | props.put(ProducerConfig.ACKS_CONFIG, "all"); 27 | props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); 28 | props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "io.confluent.kafka.serializers.KafkaJsonSerializer"); 29 | } 30 | 31 | public List getRides() throws IOException, CsvException { 32 | var ridesStream = this.getClass().getResource("/rides.csv"); 33 | var reader = new CSVReader(new FileReader(ridesStream.getFile())); 34 | reader.skip(1); 35 | return reader.readAll().stream().map(arr -> new Ride(arr)) 36 | .collect(Collectors.toList()); 37 | 38 | } 39 | 40 | public void publishRides(List rides) throws ExecutionException, InterruptedException { 41 | KafkaProducer kafkaProducer = new KafkaProducer(props); 42 | for(Ride ride: rides) { 43 | ride.tpep_pickup_datetime = LocalDateTime.now().minusMinutes(20); 44 | ride.tpep_dropoff_datetime = LocalDateTime.now(); 45 | var record = kafkaProducer.send(new ProducerRecord<>("rides", String.valueOf(ride.DOLocationID), ride), (metadata, exception) -> { 46 | if(exception != null) { 47 | System.out.println(exception.getMessage()); 48 | } 49 | }); 50 | System.out.println(record.get().offset()); 51 | System.out.println(ride.DOLocationID); 52 | Thread.sleep(500); 53 | } 54 | } 55 | 56 | public static void main(String[] args) throws IOException, CsvException, ExecutionException, InterruptedException { 57 | var producer = new JsonProducer(); 58 | var rides = producer.getRides(); 59 | producer.publishRides(rides); 60 | } 61 | } -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/java/org/example/JsonProducerPickupLocation.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import com.opencsv.exceptions.CsvException; 4 | import 
org.apache.kafka.clients.producer.KafkaProducer; 5 | import org.apache.kafka.clients.producer.ProducerConfig; 6 | import org.apache.kafka.clients.producer.ProducerRecord; 7 | import org.example.data.PickupLocation; 8 | 9 | import java.io.IOException; 10 | import java.time.LocalDateTime; 11 | import java.util.Properties; 12 | import java.util.concurrent.ExecutionException; 13 | 14 | public class JsonProducerPickupLocation { 15 | private Properties props = new Properties(); 16 | 17 | public JsonProducerPickupLocation() { 18 | props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "pkc-lzoyy.europe-west6.gcp.confluent.cloud:9092"); 19 | props.put("security.protocol", "SASL_SSL"); 20 | props.put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username='"+Secrets.KAFKA_CLUSTER_KEY+"' password='"+Secrets.KAFKA_CLUSTER_SECRET+"';"); 21 | props.put("sasl.mechanism", "PLAIN"); 22 | props.put("client.dns.lookup", "use_all_dns_ips"); 23 | props.put("session.timeout.ms", "45000"); 24 | props.put(ProducerConfig.ACKS_CONFIG, "all"); 25 | props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); 26 | props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "io.confluent.kafka.serializers.KafkaJsonSerializer"); 27 | } 28 | 29 | public void publish(PickupLocation pickupLocation) throws ExecutionException, InterruptedException { 30 | KafkaProducer kafkaProducer = new KafkaProducer(props); 31 | var record = kafkaProducer.send(new ProducerRecord<>("rides_location", String.valueOf(pickupLocation.PULocationID), pickupLocation), (metadata, exception) -> { 32 | if (exception != null) { 33 | System.out.println(exception.getMessage()); 34 | } 35 | }); 36 | System.out.println(record.get().offset()); 37 | } 38 | 39 | 40 | public static void main(String[] args) throws IOException, CsvException, ExecutionException, InterruptedException { 41 | var producer = new JsonProducerPickupLocation(); 42 | producer.publish(new PickupLocation(186, LocalDateTime.now())); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/java/org/example/Secrets.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | public class Secrets { 4 | public static final String KAFKA_CLUSTER_KEY = "REPLACE_WITH_YOUR_KAFKA_CLUSTER_KEY"; 5 | public static final String KAFKA_CLUSTER_SECRET = "REPLACE_WITH_YOUR_KAFKA_CLUSTER_SECRET"; 6 | 7 | public static final String SCHEMA_REGISTRY_KEY = "REPLACE_WITH_SCHEMA_REGISTRY_KEY"; 8 | public static final String SCHEMA_REGISTRY_SECRET = "REPLACE_WITH_SCHEMA_REGISTRY_SECRET"; 9 | 10 | } 11 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/java/org/example/Topics.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | public class Topics { 4 | public static final String INPUT_RIDE_TOPIC = "rides"; 5 | public static final String INPUT_RIDE_LOCATION_TOPIC = "rides_location"; 6 | public static final String OUTPUT_TOPIC = "vendor_info"; 7 | } 8 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/java/org/example/customserdes/CustomSerdes.java: -------------------------------------------------------------------------------- 1 | package org.example.customserdes; 2 | 3 | import 
io.confluent.kafka.serializers.AbstractKafkaAvroSerDeConfig; 4 | import io.confluent.kafka.serializers.KafkaJsonDeserializer; 5 | import io.confluent.kafka.serializers.KafkaJsonSerializer; 6 | import io.confluent.kafka.streams.serdes.avro.SpecificAvroSerde; 7 | import org.apache.avro.specific.SpecificRecordBase; 8 | import org.apache.kafka.common.serialization.Deserializer; 9 | import org.apache.kafka.common.serialization.Serde; 10 | import org.apache.kafka.common.serialization.Serdes; 11 | import org.apache.kafka.common.serialization.Serializer; 12 | import org.example.data.PickupLocation; 13 | import org.example.data.Ride; 14 | import org.example.data.VendorInfo; 15 | 16 | import java.util.HashMap; 17 | import java.util.Map; 18 | 19 | public class CustomSerdes { 20 | 21 | public static Serde getSerde(Class classOf) { 22 | Map serdeProps = new HashMap<>(); 23 | serdeProps.put("json.value.type", classOf); 24 | final Serializer mySerializer = new KafkaJsonSerializer<>(); 25 | mySerializer.configure(serdeProps, false); 26 | 27 | final Deserializer myDeserializer = new KafkaJsonDeserializer<>(); 28 | myDeserializer.configure(serdeProps, false); 29 | return Serdes.serdeFrom(mySerializer, myDeserializer); 30 | } 31 | 32 | public static SpecificAvroSerde getAvroSerde(boolean isKey, String schemaRegistryUrl) { 33 | var serde = new SpecificAvroSerde(); 34 | 35 | Map serdeProps = new HashMap<>(); 36 | serdeProps.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, schemaRegistryUrl); 37 | serde.configure(serdeProps, isKey); 38 | return serde; 39 | } 40 | 41 | 42 | } 43 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/java/org/example/data/PickupLocation.java: -------------------------------------------------------------------------------- 1 | package org.example.data; 2 | 3 | import java.time.LocalDateTime; 4 | 5 | public class PickupLocation { 6 | public PickupLocation(long PULocationID, LocalDateTime tpep_pickup_datetime) { 7 | this.PULocationID = PULocationID; 8 | this.tpep_pickup_datetime = tpep_pickup_datetime; 9 | } 10 | 11 | public PickupLocation() { 12 | } 13 | 14 | public long PULocationID; 15 | public LocalDateTime tpep_pickup_datetime; 16 | } 17 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/java/org/example/data/Ride.java: -------------------------------------------------------------------------------- 1 | package org.example.data; 2 | 3 | import java.nio.DoubleBuffer; 4 | import java.time.LocalDate; 5 | import java.time.LocalDateTime; 6 | import java.time.format.DateTimeFormatter; 7 | 8 | public class Ride { 9 | public Ride(String[] arr) { 10 | VendorID = arr[0]; 11 | tpep_pickup_datetime = LocalDateTime.parse(arr[1], DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 12 | tpep_dropoff_datetime = LocalDateTime.parse(arr[2], DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 13 | passenger_count = Integer.parseInt(arr[3]); 14 | trip_distance = Double.parseDouble(arr[4]); 15 | RatecodeID = Long.parseLong(arr[5]); 16 | store_and_fwd_flag = arr[6]; 17 | PULocationID = Long.parseLong(arr[7]); 18 | DOLocationID = Long.parseLong(arr[8]); 19 | payment_type = arr[9]; 20 | fare_amount = Double.parseDouble(arr[10]); 21 | extra = Double.parseDouble(arr[11]); 22 | mta_tax = Double.parseDouble(arr[12]); 23 | tip_amount = Double.parseDouble(arr[13]); 24 | tolls_amount = Double.parseDouble(arr[14]); 25 | improvement_surcharge = 
Double.parseDouble(arr[15]); 26 | total_amount = Double.parseDouble(arr[16]); 27 | congestion_surcharge = Double.parseDouble(arr[17]); 28 | } 29 | public Ride(){} 30 | public String VendorID; 31 | public LocalDateTime tpep_pickup_datetime; 32 | public LocalDateTime tpep_dropoff_datetime; 33 | public int passenger_count; 34 | public double trip_distance; 35 | public long RatecodeID; 36 | public String store_and_fwd_flag; 37 | public long PULocationID; 38 | public long DOLocationID; 39 | public String payment_type; 40 | public double fare_amount; 41 | public double extra; 42 | public double mta_tax; 43 | public double tip_amount; 44 | public double tolls_amount; 45 | public double improvement_surcharge; 46 | public double total_amount; 47 | public double congestion_surcharge; 48 | 49 | } 50 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/main/java/org/example/data/VendorInfo.java: -------------------------------------------------------------------------------- 1 | package org.example.data; 2 | 3 | import java.time.LocalDateTime; 4 | 5 | public class VendorInfo { 6 | 7 | public VendorInfo(String vendorID, long PULocationID, LocalDateTime pickupTime, LocalDateTime lastDropoffTime) { 8 | VendorID = vendorID; 9 | this.PULocationID = PULocationID; 10 | this.pickupTime = pickupTime; 11 | this.lastDropoffTime = lastDropoffTime; 12 | } 13 | 14 | public VendorInfo() { 15 | } 16 | 17 | public String VendorID; 18 | public long PULocationID; 19 | public LocalDateTime pickupTime; 20 | public LocalDateTime lastDropoffTime; 21 | } 22 | -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/test/java/org/example/JsonKStreamJoinsTest.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.common.internals.Topic; 5 | import org.apache.kafka.common.serialization.Serdes; 6 | import org.apache.kafka.streams.*; 7 | import org.example.customserdes.CustomSerdes; 8 | import org.example.data.PickupLocation; 9 | import org.example.data.Ride; 10 | import org.example.data.VendorInfo; 11 | import org.example.helper.DataGeneratorHelper; 12 | import org.junit.jupiter.api.AfterAll; 13 | import org.junit.jupiter.api.BeforeEach; 14 | import org.junit.jupiter.api.Test; 15 | 16 | import javax.xml.crypto.Data; 17 | import java.util.Properties; 18 | 19 | import static org.junit.jupiter.api.Assertions.*; 20 | 21 | class JsonKStreamJoinsTest { 22 | private Properties props = new Properties(); 23 | private static TopologyTestDriver testDriver; 24 | private TestInputTopic ridesTopic; 25 | private TestInputTopic pickLocationTopic; 26 | private TestOutputTopic outputTopic; 27 | 28 | private Topology topology = new JsonKStreamJoins().createTopology(); 29 | @BeforeEach 30 | public void setup() { 31 | props = new Properties(); 32 | props.setProperty(StreamsConfig.APPLICATION_ID_CONFIG, "testing_count_application"); 33 | props.setProperty(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "dummy:1234"); 34 | if (testDriver != null) { 35 | testDriver.close(); 36 | } 37 | testDriver = new TopologyTestDriver(topology, props); 38 | ridesTopic = testDriver.createInputTopic(Topics.INPUT_RIDE_TOPIC, Serdes.String().serializer(), CustomSerdes.getSerde(Ride.class).serializer()); 39 | pickLocationTopic = testDriver.createInputTopic(Topics.INPUT_RIDE_LOCATION_TOPIC, 
Serdes.String().serializer(), CustomSerdes.getSerde(PickupLocation.class).serializer()); 40 | outputTopic = testDriver.createOutputTopic(Topics.OUTPUT_TOPIC, Serdes.String().deserializer(), CustomSerdes.getSerde(VendorInfo.class).deserializer()); 41 | } 42 | 43 | @Test 44 | public void testIfJoinWorksOnSameDropOffPickupLocationId() { 45 | Ride ride = DataGeneratorHelper.generateRide(); 46 | PickupLocation pickupLocation = DataGeneratorHelper.generatePickUpLocation(ride.DOLocationID); 47 | ridesTopic.pipeInput(String.valueOf(ride.DOLocationID), ride); 48 | pickLocationTopic.pipeInput(String.valueOf(pickupLocation.PULocationID), pickupLocation); 49 | 50 | assertEquals(outputTopic.getQueueSize(), 1); 51 | var expected = new VendorInfo(ride.VendorID, pickupLocation.PULocationID, pickupLocation.tpep_pickup_datetime, ride.tpep_dropoff_datetime); 52 | var result = outputTopic.readKeyValue(); 53 | assertEquals(result.key, String.valueOf(ride.DOLocationID)); 54 | assertEquals(result.value.VendorID, expected.VendorID); 55 | assertEquals(result.value.pickupTime, expected.pickupTime); 56 | } 57 | 58 | 59 | @AfterAll 60 | public static void shutdown() { 61 | testDriver.close(); 62 | } 63 | } -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/test/java/org/example/JsonKStreamTest.java: -------------------------------------------------------------------------------- 1 | package org.example; 2 | 3 | import org.apache.kafka.common.serialization.Serdes; 4 | import org.apache.kafka.streams.*; 5 | import org.example.customserdes.CustomSerdes; 6 | import org.example.data.Ride; 7 | import org.example.helper.DataGeneratorHelper; 8 | import org.junit.jupiter.api.AfterAll; 9 | import org.junit.jupiter.api.BeforeEach; 10 | import org.junit.jupiter.api.Test; 11 | import static org.junit.jupiter.api.Assertions.*; 12 | import java.util.Properties; 13 | 14 | class JsonKStreamTest { 15 | private Properties props; 16 | private static TopologyTestDriver testDriver; 17 | private TestInputTopic inputTopic; 18 | private TestOutputTopic outputTopic; 19 | private Topology topology = new JsonKStream().createTopology(); 20 | 21 | @BeforeEach 22 | public void setup() { 23 | props = new Properties(); 24 | props.setProperty(StreamsConfig.APPLICATION_ID_CONFIG, "testing_count_application"); 25 | props.setProperty(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "dummy:1234"); 26 | if (testDriver != null) { 27 | testDriver.close(); 28 | } 29 | testDriver = new TopologyTestDriver(topology, props); 30 | inputTopic = testDriver.createInputTopic("rides", Serdes.String().serializer(), CustomSerdes.getSerde(Ride.class).serializer()); 31 | outputTopic = testDriver.createOutputTopic("rides-pulocation-count", Serdes.String().deserializer(), Serdes.Long().deserializer()); 32 | } 33 | 34 | @Test 35 | public void testIfOneMessageIsPassedToInputTopicWeGetCountOfOne() { 36 | Ride ride = DataGeneratorHelper.generateRide(); 37 | inputTopic.pipeInput(String.valueOf(ride.DOLocationID), ride); 38 | 39 | assertEquals(outputTopic.readKeyValue(), KeyValue.pair(String.valueOf(ride.DOLocationID), 1L)); 40 | assertTrue(outputTopic.isEmpty()); 41 | } 42 | 43 | @Test 44 | public void testIfTwoMessageArePassedWithDifferentKey() { 45 | Ride ride1 = DataGeneratorHelper.generateRide(); 46 | ride1.DOLocationID = 100L; 47 | inputTopic.pipeInput(String.valueOf(ride1.DOLocationID), ride1); 48 | 49 | Ride ride2 = DataGeneratorHelper.generateRide(); 50 | ride2.DOLocationID = 200L; 51 | 
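        // ride1 (key 100) and ride2 (key 200) fall into separate groupByKey() buckets, so the topology
        // should emit an independent count of 1 for each key, as asserted below.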
inputTopic.pipeInput(String.valueOf(ride2.DOLocationID), ride2); 52 | 53 | assertEquals(outputTopic.readKeyValue(), KeyValue.pair(String.valueOf(ride1.DOLocationID), 1L)); 54 | assertEquals(outputTopic.readKeyValue(), KeyValue.pair(String.valueOf(ride2.DOLocationID), 1L)); 55 | assertTrue(outputTopic.isEmpty()); 56 | } 57 | 58 | @Test 59 | public void testIfTwoMessageArePassedWithSameKey() { 60 | Ride ride1 = DataGeneratorHelper.generateRide(); 61 | ride1.DOLocationID = 100L; 62 | inputTopic.pipeInput(String.valueOf(ride1.DOLocationID), ride1); 63 | 64 | Ride ride2 = DataGeneratorHelper.generateRide(); 65 | ride2.DOLocationID = 100L; 66 | inputTopic.pipeInput(String.valueOf(ride2.DOLocationID), ride2); 67 | 68 | assertEquals(outputTopic.readKeyValue(), KeyValue.pair("100", 1L)); 69 | assertEquals(outputTopic.readKeyValue(), KeyValue.pair("100", 2L)); 70 | assertTrue(outputTopic.isEmpty()); 71 | } 72 | 73 | 74 | @AfterAll 75 | public static void tearDown() { 76 | testDriver.close(); 77 | } 78 | 79 | 80 | } -------------------------------------------------------------------------------- /week6/java/kafka_examples/src/test/java/org/example/helper/DataGeneratorHelper.java: -------------------------------------------------------------------------------- 1 | package org.example.helper; 2 | 3 | import org.example.data.PickupLocation; 4 | import org.example.data.Ride; 5 | import org.example.data.VendorInfo; 6 | 7 | import java.time.LocalDateTime; 8 | import java.time.format.DateTimeFormatter; 9 | import java.util.List; 10 | 11 | public class DataGeneratorHelper { 12 | public static Ride generateRide() { 13 | var arrivalTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 14 | var departureTime = LocalDateTime.now().minusMinutes(30).format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")); 15 | return new Ride(new String[]{"1", departureTime, arrivalTime,"1","1.50","1","N","238","75","2","8","0.5","0.5","0","0","0.3","9.3","0"}); 16 | } 17 | 18 | public static PickupLocation generatePickUpLocation(long pickupLocationId) { 19 | return new PickupLocation(pickupLocationId, LocalDateTime.now()); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /week6/ksqldb/commands.md: -------------------------------------------------------------------------------- 1 | ## KSQL DB Examples 2 | ### Create streams 3 | ```sql 4 | CREATE STREAM ride_streams ( 5 | VendorId varchar, 6 | trip_distance double, 7 | payment_type varchar 8 | ) WITH (KAFKA_TOPIC='rides', 9 | VALUE_FORMAT='JSON'); 10 | ``` 11 | 12 | ### Query stream 13 | ```sql 14 | select * from RIDE_STREAMS 15 | EMIT CHANGES; 16 | ``` 17 | 18 | ### Query stream count 19 | ```sql 20 | SELECT VENDORID, count(*) FROM RIDE_STREAMS 21 | GROUP BY VENDORID 22 | EMIT CHANGES; 23 | ``` 24 | 25 | ### Query stream with filters 26 | ```sql 27 | SELECT payment_type, count(*) FROM RIDE_STREAMS 28 | WHERE payment_type IN ('1', '2') 29 | GROUP BY payment_type 30 | EMIT CHANGES; 31 | ``` 32 | 33 | ### Query stream with window functions 34 | ```sql 35 | CREATE TABLE payment_type_sessions AS 36 | SELECT payment_type, 37 | count(*) 38 | FROM RIDE_STREAMS 39 | WINDOW SESSION (60 SECONDS) 40 | GROUP BY payment_type 41 | EMIT CHANGES; 42 | ``` 43 | 44 | ## KSQL documentation for details 45 | [KSQL DB Documentation](https://docs.ksqldb.io/en/latest/developer-guide/ksqldb-reference/quick-reference/) 46 | 47 | [KSQL DB Java 
client](https://docs.ksqldb.io/en/latest/developer-guide/ksqldb-clients/java-client/) --------------------------------------------------------------------------------