├── LICENSE ├── README.md ├── week1 ├── .gitignore ├── .ipynb_checkpoints │ └── postgres_connection-checkpoint.ipynb ├── Dockerfile ├── README.md ├── docker-compose.yaml ├── ingest_data.py ├── postgres_connection.ipynb └── terraform │ ├── .terraform-version │ ├── main.tf │ └── variables.tf ├── week2 ├── .gitignore ├── README.md └── airflow │ ├── Dockerfile │ ├── dags │ └── dag_ingestion_gcs.py │ ├── docker-compose.yaml │ └── requirements.txt ├── week3 ├── .gitignore ├── README.md └── airflow │ ├── dags │ ├── __pycache__ │ │ └── gcp_to_bq_dag.cpython-37.pyc │ └── gcp_to_bq_dag.py │ └── docker-compose.yaml ├── week4 ├── README.md ├── data_to_gcs │ ├── .gitignore │ └── upload_to_gcs.py └── dbt │ ├── .gitignore │ ├── analyses │ └── .gitkeep │ ├── data │ └── taxi_zone.csv │ ├── dbt_project.yml │ ├── macros │ ├── .gitkeep │ └── get_payment_type_description.sql │ ├── models │ ├── core │ │ ├── dim_zones.sql │ │ ├── dm_monthly_zone_revenue.sql │ │ ├── fact_trips.sql │ │ └── schema.yml │ └── staging │ │ ├── schema.yml │ │ ├── stg_green_tripdata.sql │ │ └── stg_yellow_tripdata.sql │ ├── packages.yml │ ├── profiles.yml │ ├── seeds │ └── .gitkeep │ ├── snapshots │ └── .gitkeep │ └── tests │ └── .gitkeep └── week6 ├── README.md ├── avro_example ├── consumer.py ├── data │ └── rides_new.csv ├── producer.py ├── taxi_ride_key.avsc └── taxi_ride_value.avsc ├── consumer.py ├── docker-compose.yml ├── producer.py ├── requirements.txt └── streams ├── __pycache__ └── taxi_rides.cpython-37.pyc ├── branch_price.py ├── producer_taxi_json.py ├── stream.py └── taxi_rides.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Pedro C. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### data-engineering-bootcamp 2 | This repo contains all the material developed during the 9-week bootcamp provided by DPhi in collaboration with DataTalks Club 3 | - Link: https://dphi.tech 4 | - Notes: https://pcrespoo.notion.site/Tech-Data-Engineering-Bootcamp-475acaace06042188da8600e1e45d7f5 5 | - **Updates**: posted by the end of the week currently in progress 6 | 7 | 8 | ### Topics 9 | - Week 1: Docker, Docker Compose, GCP and Terraform 10 | - Week 2: Airflow, Data Ingestion to Google Cloud Storage 11 | - Week 3: BigQuery, Partitioned and Clustered tables, Airflow, how to move files in Google Cloud Storage 12 | - Week 4: dbt and Google Data Studio 13 | - Week 5: Apache Spark (not yet implemented) 14 | - Week 6: Kafka 15 | -------------------------------------------------------------------------------- /week1/.gitignore: -------------------------------------------------------------------------------- 1 | terraform/.terraform/ 2 | ny_taxi_postgres_data/ 3 | *.csv 4 | *.parquet 5 | terraform/terraform.tfstate 6 | terraform/terraform.tfstate.backup 7 | terraform/.terraform.lock.hcl 8 | 9 | -------------------------------------------------------------------------------- /week1/.ipynb_checkpoints/postgres_connection-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "'0.25.1'" 12 | ] 13 | }, 14 | "execution_count": 3, 15 | "metadata": {}, 16 | "output_type": "execute_result" 17 | } 18 | ], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "pd.__version__" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 4, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/html": [ 32 | "
[HTML table preview stripped in extraction — 1369769 rows × 19 columns; the same data is rendered as text below]\n",
" 318 | ], 319 | "text/plain": [ 320 | " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n", 321 | "0 1 2021-01-01 00:30:10 2021-01-01 00:36:12 1.0 \n", 322 | "1 1 2021-01-01 00:51:20 2021-01-01 00:52:19 1.0 \n", 323 | "2 1 2021-01-01 00:43:30 2021-01-01 01:11:06 1.0 \n", 324 | "3 1 2021-01-01 00:15:48 2021-01-01 00:31:01 0.0 \n", 325 | "4 2 2021-01-01 00:31:49 2021-01-01 00:48:21 1.0 \n", 326 | "... ... ... ... ... \n", 327 | "1369764 2 2021-01-31 23:03:00 2021-01-31 23:33:00 NaN \n", 328 | "1369765 2 2021-01-31 23:29:00 2021-01-31 23:51:00 NaN \n", 329 | "1369766 2 2021-01-31 23:25:00 2021-01-31 23:38:00 NaN \n", 330 | "1369767 6 2021-01-31 23:01:06 2021-02-01 00:02:03 NaN \n", 331 | "1369768 2 2021-01-31 23:08:29 2021-01-31 23:31:22 NaN \n", 332 | "\n", 333 | " trip_distance RatecodeID store_and_fwd_flag PULocationID \\\n", 334 | "0 2.10 1.0 N 142 \n", 335 | "1 0.20 1.0 N 238 \n", 336 | "2 14.70 1.0 N 132 \n", 337 | "3 10.60 1.0 N 138 \n", 338 | "4 4.94 1.0 N 68 \n", 339 | "... ... ... ... ... \n", 340 | "1369764 8.89 NaN None 229 \n", 341 | "1369765 7.43 NaN None 41 \n", 342 | "1369766 6.26 NaN None 74 \n", 343 | "1369767 19.70 NaN None 265 \n", 344 | "1369768 4.68 NaN None 89 \n", 345 | "\n", 346 | " DOLocationID payment_type fare_amount extra mta_tax tip_amount \\\n", 347 | "0 43 2 8.00 3.00 0.5 0.00 \n", 348 | "1 151 2 3.00 0.50 0.5 0.00 \n", 349 | "2 165 1 42.00 0.50 0.5 8.65 \n", 350 | "3 132 1 29.00 0.50 0.5 6.05 \n", 351 | "4 33 1 16.50 0.50 0.5 4.06 \n", 352 | "... ... ... ... ... ... ... \n", 353 | "1369764 181 0 27.78 0.00 0.5 7.46 \n", 354 | "1369765 70 0 32.58 0.00 0.5 0.00 \n", 355 | "1369766 137 0 16.85 0.00 0.5 3.90 \n", 356 | "1369767 188 0 53.68 0.00 0.5 0.00 \n", 357 | "1369768 61 0 25.45 2.75 0.5 0.00 \n", 358 | "\n", 359 | " tolls_amount improvement_surcharge total_amount \\\n", 360 | "0 0.00 0.3 11.80 \n", 361 | "1 0.00 0.3 4.30 \n", 362 | "2 0.00 0.3 51.95 \n", 363 | "3 0.00 0.3 36.35 \n", 364 | "4 0.00 0.3 24.36 \n", 365 | "... ... ... ... \n", 366 | "1369764 0.00 0.3 38.54 \n", 367 | "1369765 6.12 0.3 39.50 \n", 368 | "1369766 0.00 0.3 24.05 \n", 369 | "1369767 0.00 0.3 54.48 \n", 370 | "1369768 0.00 0.3 29.00 \n", 371 | "\n", 372 | " congestion_surcharge airport_fee \n", 373 | "0 2.5 NaN \n", 374 | "1 0.0 NaN \n", 375 | "2 0.0 NaN \n", 376 | "3 0.0 NaN \n", 377 | "4 2.5 NaN \n", 378 | "... ... ... 
\n", 379 | "1369764 NaN NaN \n", 380 | "1369765 NaN NaN \n", 381 | "1369766 NaN NaN \n", 382 | "1369767 NaN NaN \n", 383 | "1369768 NaN NaN \n", 384 | "\n", 385 | "[1369769 rows x 19 columns]" 386 | ] 387 | }, 388 | "execution_count": 4, 389 | "metadata": {}, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "pd.read_parquet('yellow_tripdata_2021-01.parquet', engine='pyarrow')\n" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [] 403 | } 404 | ], 405 | "metadata": { 406 | "kernelspec": { 407 | "display_name": "Python 3", 408 | "language": "python", 409 | "name": "python3" 410 | }, 411 | "language_info": { 412 | "codemirror_mode": { 413 | "name": "ipython", 414 | "version": 3 415 | }, 416 | "file_extension": ".py", 417 | "mimetype": "text/x-python", 418 | "name": "python", 419 | "nbconvert_exporter": "python", 420 | "pygments_lexer": "ipython3", 421 | "version": "3.7.4" 422 | } 423 | }, 424 | "nbformat": 4, 425 | "nbformat_minor": 2 426 | } 427 | -------------------------------------------------------------------------------- /week1/Dockerfile: -------------------------------------------------------------------------------- 1 | #how to create a new docker image 2 | 3 | #specify the python version of the image 4 | FROM python:3.9 5 | 6 | #install all the dependencies of our application 7 | RUN apt-get install wget 8 | RUN pip install pandas sqlalchemy psycopg2 pyarrow 9 | 10 | #create a app folder where pipeline.py will be stored 11 | WORKDIR /app 12 | COPY ingest_data.py ingest_data.py 13 | 14 | #run the pipeline script once the docker image is run 15 | ENTRYPOINT [ "python", "ingest_data.py"] 16 | 17 | -------------------------------------------------------------------------------- /week1/README.md: -------------------------------------------------------------------------------- 1 | ## Topics covered in week 1: 2 | - Docker 3 | - Docker Compose 4 | - PostgreSQL 5 | - pgAdmin 6 | - pgcli 7 | - Terraform 8 | - Google Cloud Plataform 9 | 10 | ### Notes: 11 | Notion page: https://www.notion.so/pcrespoo/Week-1-adea5a2dbf0f44e49749238957f99754 12 | 13 | ### Commands learned in week 1: 14 | 15 | - dataset: https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.parquet 16 | 17 | - build an image: 18 | ``` 19 | docker build -t IMAGE_NAME . 
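# -t tags the image with a name of your choice; the trailing "." uses the current directory (where the Dockerfile lives) as the build context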
20 | ``` 21 | - run an image : 22 | ``` 23 | docker run -it IMAGE_NAME 24 | ``` 25 | - run a postgres image 26 | ``` 27 | docker run -it \ 28 | -e POSTGRES_USER='root' \ 29 | -e POSTGRES_PASSWORD='root' \ 30 | -e POSTGRES_DB='ny_taxi' \ 31 | -v "YOUR_PATH/ny_taxi_postgres_data:/var/lib/postgresql/data" \ 32 | -p 5432:5432 \ 33 | postgres:13 34 | ``` 35 | 36 | - how to connect to a postgres database with pgcli: 37 | ``` 38 | pgcli -h localhost -p 5432 -u root -d ny_taxi 39 | ``` 40 | 41 | - pgAdmin image: 42 | ``` 43 | docker run -it \ 44 | -e PGADMIN_DEFAULT_EMAIL=admin@admin.com \ 45 | -e PGADMIN_DEFAULT_PASSWORD="root" \ 46 | -p 8080:80 \ 47 | dpage/pgadmin4 48 | ``` 49 | - 8080: the port in the local machine 50 | - 80: the port used by pgAdmin 51 | - 8080:80 is the setup to connect the local machine with pgAdmin 52 | 53 | - create a docker network: 54 | ``` 55 | docker network create pedro_network 56 | ``` 57 | 58 | - update PostgreSQL image: 59 | ``` 60 | docker run -it \ 61 | -e POSTGRES_USER='root' \ 62 | -e POSTGRES_PASSWORD='root' \ 63 | -e POSTGRES_DB='ny_taxi' \ 64 | -v "YOUR_PATH/ny_taxi_postgres_data:/var/lib/postgresql/data" \ 65 | -p 5432:5432 \ 66 | --network=pedro_network \ 67 | --name pg-database-teste \ 68 | postgres:13 69 | ``` 70 | 71 | - update pgAdmin image with network settings: 72 | ``` 73 | docker run -it \ 74 | -e PGADMIN_DEFAULT_EMAIL=admin@admin.com \ 75 | -e PGADMIN_DEFAULT_PASSWORD="root" \ 76 | -p 8080:80 \ 77 | --network=pedro_network \ 78 | --name pgAdmin-bootcamp \ 79 | dpage/pgadmin4 80 | ``` 81 | 82 | - build an image for the data ingestion process: 83 | ``` 84 | docker build -t taxi_ingest:v001 . 85 | ``` 86 | 87 | - run the docker image for data ingestion using the same network used by pgAdmin and PostgreSQL: 88 | ``` 89 | docker run -it \ 90 | --network=pedro_network \ 91 | taxi_ingest:v001 \ 92 | --user=root \ 93 | --password=root \ 94 | --host=pg-database-teste\ 95 | --port=5432 \ 96 | --db=ny_taxi \ 97 | --table_name=yellow_taxi_data \ 98 | --url="https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.parquet" 99 | ``` 100 | 101 | - Docker Compose: 102 | ``` 103 | docker-compose up 104 | ``` 105 | 106 | - Terraform 107 | - terraform init: here we are basically initializing default parameters, like specifying the provider, the backend state file that will manage all the resources to be created, etc 108 | 109 | - terraform plan: here, we will pass to the state file which resources we want to have and their parameters 110 | 111 | - terraform apply: the state file will have all the plan to be executed. 
Then, running this command, it will create all the resources for us 112 | - if we decide to create more resources during development stage, we basically need to add more resources to the main file, run 113 | “terraform plan” and “terraform apply” to apply the changes 114 | 115 | 116 | - terraform destroy: if we want to remove all the resources created -------------------------------------------------------------------------------- /week1/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | pgdatabase: 3 | image: postgres:13 4 | environment: 5 | - POSTGRES_USER=root 6 | - POSTGRES_PASSWORD=root 7 | - POSTGRES_DB=ny_taxi 8 | volumes: 9 | - "./ny_taxi_postgres_data:/var/lib/postgresql/data:rw" 10 | ports: 11 | - "5432:5432" 12 | pgadmin: 13 | image: dpage/pgadmin4 14 | environment: 15 | - PGADMIN_DEFAULT_EMAIL=admin@admin.com 16 | - PGADMIN_DEFAULT_PASSWORD=root 17 | ports: 18 | - "8080:80" -------------------------------------------------------------------------------- /week1/ingest_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | from sqlalchemy import create_engine 5 | import pandas as pd 6 | from time import time 7 | import argparse 8 | import os 9 | 10 | def main(params): 11 | user = params.user 12 | password = params.password 13 | database = params.db 14 | url = params.url 15 | host = params.host 16 | table_name = params.table_name 17 | port = params.port 18 | 19 | #download the parquet and convert to csv 20 | parquet_name = 'output.parquet' 21 | os.system(f'wget {url} -O {parquet_name}') 22 | csv_name = 'output.csv' 23 | 24 | df_parquet = pd.read_parquet(parquet_name, engine='pyarrow') 25 | df_parquet['tpep_pickup_datetime'] = pd.to_datetime(df_parquet['tpep_pickup_datetime']) 26 | df_parquet['tpep_dropoff_datetime'] = pd.to_datetime(df_parquet['tpep_dropoff_datetime']) 27 | df_parquet.to_csv(csv_name,sep=';') 28 | 29 | #create a conn with Postgres 30 | engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{database}') 31 | 32 | #read data in chunks 33 | df_iter = pd.read_csv(csv_name,sep=';',iterator=True, chunksize=100000,index_col=0) 34 | 35 | #create a first chunk 36 | df = next(df_iter) 37 | 38 | #adjust date columns 39 | df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime']) 40 | df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime']) 41 | 42 | #create table 43 | df.head(0).to_sql(con=engine, name=table_name, if_exists='replace') 44 | 45 | #insert chunks of data into the table 46 | while True: 47 | t_start = time() 48 | 49 | df = next(df_iter) 50 | 51 | df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime']) 52 | df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime']) 53 | 54 | df.to_sql(con=engine, name=table_name, if_exists='append') 55 | 56 | t_final = time() 57 | 58 | print(f'chunk insertion took {t_final - t_start}') 59 | 60 | if __name__ == '__main__': 61 | #parse the CLI parameters 62 | parser = argparse.ArgumentParser(description='Ingest csv data to postgresql') 63 | parser.add_argument('--user', help='username for postgresql') 64 | parser.add_argument('--password', help='password for postgresql') 65 | parser.add_argument('--port', help='port for postgresql') 66 | parser.add_argument('--host', help='host for postgresql') 67 | parser.add_argument('--db', help='database name') 68 | parser.add_argument('--table_name', help='name of 
the table') 69 | parser.add_argument('--url', help='url of the csv file') 70 | args = parser.parse_args() 71 | 72 | main(args) 73 | -------------------------------------------------------------------------------- /week1/postgres_connection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "'0.25.1'" 12 | ] 13 | }, 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "output_type": "execute_result" 17 | } 18 | ], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "pd.__version__" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "#read the original data and convert to csv before proceeding\n", 31 | "df = pd.read_parquet('yellow_tripdata_2021-01.parquet', engine='pyarrow')\n", 32 | "df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])\n", 33 | "df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])\n", 34 | "df.to_csv('yellow_tripdata_2021-01.csv',sep=';')" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stderr", 44 | "output_type": "stream", 45 | "text": [ 46 | "C:\\Users\\pedro\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3058: DtypeWarning: Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.\n", 47 | " interactivity=interactivity, compiler=compiler, result=result)\n", 48 | "C:\\Users\\pedro\\Anaconda3\\lib\\site-packages\\numpy\\lib\\arraysetops.py:569: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", 49 | " mask |= (ar1 == a)\n" 50 | ] 51 | }, 52 | { 53 | "data": { 54 | "text/html": [ 55 | "
[HTML table preview stripped in extraction — first five rows of the DataFrame; the same data is rendered as text below]\n",
" 208 | ], 209 | "text/plain": [ 210 | " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n", 211 | "0 1 2021-01-01 00:30:10 2021-01-01 00:36:12 1.0 \n", 212 | "1 1 2021-01-01 00:51:20 2021-01-01 00:52:19 1.0 \n", 213 | "2 1 2021-01-01 00:43:30 2021-01-01 01:11:06 1.0 \n", 214 | "3 1 2021-01-01 00:15:48 2021-01-01 00:31:01 0.0 \n", 215 | "4 2 2021-01-01 00:31:49 2021-01-01 00:48:21 1.0 \n", 216 | "\n", 217 | " trip_distance RatecodeID store_and_fwd_flag PULocationID DOLocationID \\\n", 218 | "0 2.10 1.0 N 142 43 \n", 219 | "1 0.20 1.0 N 238 151 \n", 220 | "2 14.70 1.0 N 132 165 \n", 221 | "3 10.60 1.0 N 138 132 \n", 222 | "4 4.94 1.0 N 68 33 \n", 223 | "\n", 224 | " payment_type fare_amount extra mta_tax tip_amount tolls_amount \\\n", 225 | "0 2 8.0 3.0 0.5 0.00 0.0 \n", 226 | "1 2 3.0 0.5 0.5 0.00 0.0 \n", 227 | "2 1 42.0 0.5 0.5 8.65 0.0 \n", 228 | "3 1 29.0 0.5 0.5 6.05 0.0 \n", 229 | "4 1 16.5 0.5 0.5 4.06 0.0 \n", 230 | "\n", 231 | " improvement_surcharge total_amount congestion_surcharge airport_fee \n", 232 | "0 0.3 11.80 2.5 NaN \n", 233 | "1 0.3 4.30 0.0 NaN \n", 234 | "2 0.3 51.95 0.0 NaN \n", 235 | "3 0.3 36.35 0.0 NaN \n", 236 | "4 0.3 24.36 2.5 NaN " 237 | ] 238 | }, 239 | "execution_count": 3, 240 | "metadata": {}, 241 | "output_type": "execute_result" 242 | } 243 | ], 244 | "source": [ 245 | "#read csv\n", 246 | "df = pd.read_csv('yellow_tripdata_2021-01.csv',sep=';',index_col=0)\n", 247 | "df.head()" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 4, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "#read data in chunks\n", 257 | "df_iter = df = pd.read_csv('yellow_tripdata_2021-01.csv',sep=';',iterator=True, chunksize=100000,index_col=0)\n", 258 | "\n", 259 | "#create a first chunk\n", 260 | "df = next(df_iter)\n", 261 | "\n", 262 | "#adjust date columns \n", 263 | "df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])\n", 264 | "df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 5, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "#create a conn with Postgres\n", 274 | "from sqlalchemy import create_engine\n", 275 | "engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 6, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "name": "stdout", 285 | "output_type": "stream", 286 | "text": [ 287 | "\n", 288 | "CREATE TABLE yellow_taxi_data (\n", 289 | "\t\"VendorID\" BIGINT, \n", 290 | "\ttpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, \n", 291 | "\ttpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, \n", 292 | "\tpassenger_count FLOAT(53), \n", 293 | "\ttrip_distance FLOAT(53), \n", 294 | "\t\"RatecodeID\" FLOAT(53), \n", 295 | "\tstore_and_fwd_flag TEXT, \n", 296 | "\t\"PULocationID\" BIGINT, \n", 297 | "\t\"DOLocationID\" BIGINT, \n", 298 | "\tpayment_type BIGINT, \n", 299 | "\tfare_amount FLOAT(53), \n", 300 | "\textra FLOAT(53), \n", 301 | "\tmta_tax FLOAT(53), \n", 302 | "\ttip_amount FLOAT(53), \n", 303 | "\ttolls_amount FLOAT(53), \n", 304 | "\timprovement_surcharge FLOAT(53), \n", 305 | "\ttotal_amount FLOAT(53), \n", 306 | "\tcongestion_surcharge FLOAT(53), \n", 307 | "\tairport_fee FLOAT(53)\n", 308 | ")\n", 309 | "\n", 310 | "\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "#sample of schema of the table to be created on Postgres\n", 316 | 
"print(pd.io.sql.get_schema(df,name='yellow_taxi_data',con=engine))" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 7, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "#create table\n", 326 | "df.head(0).to_sql(con=engine, name='yellow_taxi_data', if_exists='replace')" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 8, 332 | "metadata": {}, 333 | "outputs": [ 334 | { 335 | "name": "stdout", 336 | "output_type": "stream", 337 | "text": [ 338 | "chunk insertion took 140.19736647605896\n", 339 | "chunk insertion took 122.36996746063232\n", 340 | "chunk insertion took 126.62409663200378\n", 341 | "chunk insertion took 127.74477791786194\n", 342 | "chunk insertion took 132.0837414264679\n", 343 | "chunk insertion took 125.15627336502075\n", 344 | "chunk insertion took 137.10601329803467\n", 345 | "chunk insertion took 88.83106231689453\n", 346 | "chunk insertion took 87.03130531311035\n", 347 | "chunk insertion took 86.5908088684082\n", 348 | "chunk insertion took 87.42471408843994\n", 349 | "chunk insertion took 91.67677354812622\n", 350 | "chunk insertion took 59.76935338973999\n" 351 | ] 352 | }, 353 | { 354 | "ename": "StopIteration", 355 | "evalue": "", 356 | "output_type": "error", 357 | "traceback": [ 358 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 359 | "\u001b[1;31mStopIteration\u001b[0m Traceback (most recent call last)", 360 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mt_start\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_iter\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'tpep_pickup_datetime'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_datetime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'tpep_pickup_datetime'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 361 | "\u001b[1;32mC:\\Users\\pedro\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__next__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1126\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__next__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1127\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1128\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_chunk\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1129\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1130\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 362 | "\u001b[1;32mC:\\Users\\pedro\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mget_chunk\u001b[1;34m(self, size)\u001b[0m\n\u001b[0;32m 1186\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1187\u001b[0m \u001b[0msize\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msize\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnrows\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_currow\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1188\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msize\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1189\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1190\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 363 | "\u001b[1;32mC:\\Users\\pedro\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, nrows)\u001b[0m\n\u001b[0;32m 1152\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1153\u001b[0m \u001b[0mnrows\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_validate_integer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"nrows\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1154\u001b[1;33m \u001b[0mret\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1155\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1156\u001b[0m \u001b[1;31m# May alter columns / col_dict\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 364 | "\u001b[1;32mC:\\Users\\pedro\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, nrows)\u001b[0m\n\u001b[0;32m 2057\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2058\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2059\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2060\u001b[0m \u001b[1;32mexcept\u001b[0m 
\u001b[0mStopIteration\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2061\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_first_chunk\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 365 | "\u001b[1;32mpandas\\_libs\\parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read\u001b[1;34m()\u001b[0m\n", 366 | "\u001b[1;32mpandas\\_libs\\parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_low_memory\u001b[1;34m()\u001b[0m\n", 367 | "\u001b[1;31mStopIteration\u001b[0m: " 368 | ] 369 | } 370 | ], 371 | "source": [ 372 | "#insert chunks of data into the table\n", 373 | "from time import time\n", 374 | "while True:\n", 375 | " t_start = time()\n", 376 | " \n", 377 | " df = next(df_iter)\n", 378 | " \n", 379 | " df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])\n", 380 | " df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])\n", 381 | " \n", 382 | " df.to_sql(con=engine, name='yellow_taxi_data', if_exists='append')\n", 383 | " \n", 384 | " t_final = time()\n", 385 | " \n", 386 | " print(f'chunk insertion took {t_final - t_start}')" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [] 395 | } 396 | ], 397 | "metadata": { 398 | "kernelspec": { 399 | "display_name": "Python 3", 400 | "language": "python", 401 | "name": "python3" 402 | }, 403 | "language_info": { 404 | "codemirror_mode": { 405 | "name": "ipython", 406 | "version": 3 407 | }, 408 | "file_extension": ".py", 409 | "mimetype": "text/x-python", 410 | "name": "python", 411 | "nbconvert_exporter": "python", 412 | "pygments_lexer": "ipython3", 413 | "version": "3.7.4" 414 | } 415 | }, 416 | "nbformat": 4, 417 | "nbformat_minor": 2 418 | } 419 | -------------------------------------------------------------------------------- /week1/terraform/.terraform-version: -------------------------------------------------------------------------------- 1 | 1.2.0 -------------------------------------------------------------------------------- /week1/terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.0" 3 | backend "local" {} # Can change from "local" to "gcs" (for google) or "s3" (for aws), if you would like to preserve your tf-state online 4 | required_providers { 5 | google = { 6 | source = "hashicorp/google" 7 | } 8 | } 9 | } 10 | 11 | provider "google" { 12 | project = var.project 13 | region = var.region 14 | // credentials = file(var.credentials) # Use this if you do not want to set env-var GOOGLE_APPLICATION_CREDENTIALS 15 | } 16 | 17 | # Data Lake Bucket 18 | # Ref: https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket 19 | resource "google_storage_bucket" "data-lake-bucket" { 20 | name = "${local.data_lake_bucket}_${var.project}" # Concatenating DL bucket & Project name for unique naming 21 | location = var.region 22 | 23 | # Optional, but recommended settings: 24 | storage_class = var.storage_class 25 | uniform_bucket_level_access = true 26 | 27 | versioning { 28 | enabled = true 29 | } 30 | 31 | lifecycle_rule { 32 | action { 33 | type = "Delete" 34 | } 35 | condition { 36 | age = 30 // days 37 | } 38 | } 39 | 40 | force_destroy = true 41 | } 42 | 43 | # DWH 44 | # Ref: 
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/bigquery_dataset 45 | resource "google_bigquery_dataset" "dataset" { 46 | dataset_id = var.BQ_DATASET 47 | project = var.project 48 | location = var.region 49 | } -------------------------------------------------------------------------------- /week1/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | data_lake_bucket = "dtc_data_lake" 3 | } 4 | 5 | variable "project" { 6 | description = "Your GCP Project ID" 7 | } 8 | 9 | variable "region" { 10 | description = "Region for GCP resources. Choose as per your location: https://cloud.google.com/about/locations" 11 | default = "southamerica-east1" 12 | type = string 13 | } 14 | 15 | variable "storage_class" { 16 | description = "Storage class type for your bucket. Check official docs for more info." 17 | default = "STANDARD" 18 | } 19 | 20 | variable "BQ_DATASET" { 21 | description = "BigQuery Dataset that raw data (from GCS) will be written to" 22 | type = string 23 | default = "trips_data_all" 24 | } 25 | 26 | variable "TABLE_NAME"{ 27 | description = "BigQuery Table" 28 | type = string 29 | default = "ny_trips" 30 | } -------------------------------------------------------------------------------- /week2/.gitignore: -------------------------------------------------------------------------------- 1 | google/ 2 | .env 3 | logs/ 4 | plugins/ 5 | **__pycache__/ -------------------------------------------------------------------------------- /week2/README.md: -------------------------------------------------------------------------------- 1 | ## Topics covered in week 2: 2 | - Airflow 3 | - Data Ingestion to GCP with Airflow 4 | 5 | ### Notes: 6 | Notion page: https://www.notion.so/pcrespoo/Week-2-eca5926ca202477998bb0296f6487d83 7 | 8 | -------------------------------------------------------------------------------- /week2/airflow/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:2.2.3 2 | 3 | ENV AIRFLOW_HOME=/opt/airflow 4 | 5 | USER root 6 | RUN apt-get update -qq && apt-get install vim -qqq 7 | 8 | COPY requirements.txt . 
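# copying requirements.txt on its own keeps the pip install layer cached until the requirements change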
9 | RUN pip install --no-cache-dir -r requirements.txt 10 | 11 | ## GOOGLE IMAGE 12 | SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"] 13 | 14 | ARG CLOUD_SDK_VERSION=322.0.0 15 | ENV GCLOUD_HOME=/home/google-cloud-sdk 16 | 17 | ENV PATH="${GCLOUD_HOME}/bin/:${PATH}" 18 | 19 | RUN DOWNLOAD_URL="https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz" \ 20 | && TMP_DIR="$(mktemp -d)" \ 21 | && curl -fL "${DOWNLOAD_URL}" --output "${TMP_DIR}/google-cloud-sdk.tar.gz" \ 22 | && mkdir -p "${GCLOUD_HOME}" \ 23 | && tar xzf "${TMP_DIR}/google-cloud-sdk.tar.gz" -C "${GCLOUD_HOME}" --strip-components=1 \ 24 | && "${GCLOUD_HOME}/install.sh" \ 25 | --bash-completion=false \ 26 | --path-update=false \ 27 | --usage-reporting=false \ 28 | --quiet \ 29 | && rm -rf "${TMP_DIR}" \ 30 | && gcloud --version 31 | 32 | WORKDIR $AIRFLOW_HOME 33 | 34 | USER $AIRFLOW_UID -------------------------------------------------------------------------------- /week2/airflow/dags/dag_ingestion_gcs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from airflow import DAG 3 | from airflow.utils.dates import days_ago 4 | from airflow.operators.bash import BashOperator 5 | from airflow.operators.python import PythonOperator 6 | from google.cloud import storage 7 | from airflow.providers.google.cloud.operators.bigquery import BigQueryCreateExternalTableOperator 8 | 9 | PROJECT_ID = os.environ.get("GCP_PROJECT_ID") 10 | BUCKET = os.environ.get("GCP_GCS_BUCKET") 11 | 12 | dataset_file = "yellow_tripdata_2021-01.parquet" 13 | dataset_url = f"https://s3.amazonaws.com/nyc-tlc/trip+data/{dataset_file}" 14 | path_to_local_home = os.environ.get("AIRFLOW_HOME", "/opt/airflow/") 15 | BIGQUERY_DATASET = os.environ.get("BIGQUERY_DATASET", 'trips_data_all') 16 | 17 | def upload_to_gcs(bucket, object_name, local_file): 18 | """ 19 | Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python 20 | :param bucket: GCS bucket name 21 | :param object_name: target path & file-name 22 | :param local_file: source path & file-name 23 | :return: 24 | """ 25 | # WORKAROUND to prevent timeout for files > 6 MB on 800 kbps upload speed. 
26 | # (Ref: https://github.com/googleapis/python-storage/issues/74) 27 | storage.blob._MAX_MULTIPART_SIZE = 5 * 1024 * 1024 # 5 MB 28 | storage.blob._DEFAULT_CHUNKSIZE = 5 * 1024 * 1024 # 5 MB 29 | # End of Workaround 30 | 31 | client = storage.Client() 32 | bucket = client.bucket(bucket) 33 | 34 | blob = bucket.blob(object_name) 35 | blob.upload_from_filename(local_file) 36 | 37 | 38 | default_args = { 39 | "owner": "airflow", 40 | "start_date": days_ago(1), 41 | "depends_on_past": False, 42 | "retries": 1, 43 | } 44 | 45 | # NOTE: DAG declaration - using a Context Manager (an implicit way) 46 | with DAG( 47 | dag_id="data_ingestion_gcs_dag", 48 | schedule_interval="@daily", 49 | default_args=default_args, 50 | catchup=False, 51 | max_active_runs=1, 52 | tags=['dtc-de'], 53 | ) as dag: 54 | 55 | download_dataset_task = BashOperator( 56 | task_id="download_dataset_task", 57 | bash_command=f"curl -sS {dataset_url} > {path_to_local_home}/{dataset_file}" 58 | ) 59 | 60 | # TODO: Homework - research and try XCOM to communicate output values between 2 tasks/operators 61 | local_to_gcs_task = PythonOperator( 62 | task_id="local_to_gcs_task", 63 | python_callable=upload_to_gcs, 64 | op_kwargs={ 65 | "bucket": BUCKET, 66 | "object_name": f"raw/{dataset_file}", 67 | "local_file": f"{path_to_local_home}/{dataset_file}", 68 | }, 69 | ) 70 | 71 | bigquery_external_table_task = BigQueryCreateExternalTableOperator( 72 | task_id="bigquery_external_table_task", 73 | table_resource={ 74 | "tableReference": { 75 | "projectId": PROJECT_ID, 76 | "datasetId": BIGQUERY_DATASET, 77 | "tableId": "external_table", 78 | }, 79 | "externalDataConfiguration": { 80 | "sourceFormat": "PARQUET", 81 | "sourceUris": [f"gs://{BUCKET}/raw/{dataset_file}"], 82 | }, 83 | }, 84 | ) 85 | 86 | download_dataset_task >> local_to_gcs_task >> bigquery_external_table_task 87 | -------------------------------------------------------------------------------- /week2/airflow/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 
27 | # Default: apache/airflow:2.2.3 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 31 | # 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 33 | # Default: airflow 34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 35 | # Default: airflow 36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 37 | # Default: '' 38 | # 39 | # Feel free to modify this file to suit your needs. 40 | --- 41 | version: '3' 42 | x-airflow-common: 43 | &airflow-common 44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 46 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 47 | build: 48 | context: . 49 | dockerfile: ./Dockerfile 50 | environment: 51 | &airflow-common-env 52 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 53 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 54 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 55 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 56 | AIRFLOW__CORE__FERNET_KEY: '' 57 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 58 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 59 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' 60 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 61 | GOOGLE_APPLICATION_CREDENTIALS: /google/credentials/google_credentials.json 62 | AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT: 'google-cloud-platform://?extra__google_cloud_platform__key_path=/google/credentials/google_credentials.json' 63 | 64 | # TODO: Please change GCP_PROJECT_ID & GCP_GCS_BUCKET, as per your config 65 | GCP_PROJECT_ID: 'dtc-boot-7639' 66 | GCP_GCS_BUCKET: 'dtc_data_lake_dtc-boot-7639' 67 | 68 | volumes: 69 | - ./dags:/opt/airflow/dags 70 | - ./logs:/opt/airflow/logs 71 | - ./plugins:/opt/airflow/plugins 72 | - C:/Users/pedro/Documents/Estudos_DS/repos/data-engineering-bootcamp/week2/airflow/google/credentials/:/google/credentials:ro 73 | 74 | user: "${AIRFLOW_UID:-50000}:0" 75 | depends_on: 76 | &airflow-common-depends-on 77 | redis: 78 | condition: service_healthy 79 | postgres: 80 | condition: service_healthy 81 | 82 | services: 83 | postgres: 84 | image: postgres:13 85 | environment: 86 | POSTGRES_USER: airflow 87 | POSTGRES_PASSWORD: airflow 88 | POSTGRES_DB: airflow 89 | volumes: 90 | - postgres-db-volume:/var/lib/postgresql/data 91 | healthcheck: 92 | test: ["CMD", "pg_isready", "-U", "airflow"] 93 | interval: 5s 94 | retries: 5 95 | restart: always 96 | 97 | redis: 98 | image: redis:latest 99 | expose: 100 | - 6379 101 | healthcheck: 102 | test: ["CMD", "redis-cli", "ping"] 103 | interval: 5s 104 | timeout: 30s 105 | retries: 50 106 | restart: always 107 | 108 | airflow-webserver: 109 | <<: *airflow-common 110 | command: webserver 111 | ports: 112 | - 8080:8080 113 | healthcheck: 114 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 115 | interval: 10s 116 | timeout: 10s 117 | retries: 5 118 | restart: always 119 | depends_on: 120 | <<: *airflow-common-depends-on 121 | airflow-init: 122 | condition: service_completed_successfully 123 | 124 | airflow-scheduler: 125 | <<: *airflow-common 126 | 
command: scheduler 127 | healthcheck: 128 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"'] 129 | interval: 10s 130 | timeout: 10s 131 | retries: 5 132 | restart: always 133 | depends_on: 134 | <<: *airflow-common-depends-on 135 | airflow-init: 136 | condition: service_completed_successfully 137 | 138 | airflow-worker: 139 | <<: *airflow-common 140 | command: celery worker 141 | healthcheck: 142 | test: 143 | - "CMD-SHELL" 144 | - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 145 | interval: 10s 146 | timeout: 10s 147 | retries: 5 148 | environment: 149 | <<: *airflow-common-env 150 | # Required to handle warm shutdown of the celery workers properly 151 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 152 | DUMB_INIT_SETSID: "0" 153 | restart: always 154 | depends_on: 155 | <<: *airflow-common-depends-on 156 | airflow-init: 157 | condition: service_completed_successfully 158 | 159 | airflow-triggerer: 160 | <<: *airflow-common 161 | command: triggerer 162 | healthcheck: 163 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 164 | interval: 10s 165 | timeout: 10s 166 | retries: 5 167 | restart: always 168 | depends_on: 169 | <<: *airflow-common-depends-on 170 | airflow-init: 171 | condition: service_completed_successfully 172 | 173 | airflow-init: 174 | <<: *airflow-common 175 | entrypoint: /bin/bash 176 | # yamllint disable rule:line-length 177 | command: 178 | - -c 179 | - | 180 | function ver() { 181 | printf "%04d%04d%04d%04d" $${1//./ } 182 | } 183 | airflow_version=$$(gosu airflow airflow version) 184 | airflow_version_comparable=$$(ver $${airflow_version}) 185 | min_airflow_version=2.2.0 186 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 187 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 188 | echo 189 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 190 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 191 | echo 192 | exit 1 193 | fi 194 | if [[ -z "${AIRFLOW_UID}" ]]; then 195 | echo 196 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 197 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 198 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 199 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 200 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user" 201 | echo 202 | fi 203 | one_meg=1048576 204 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 205 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 206 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 207 | warning_resources="false" 208 | if (( mem_available < 4000 )) ; then 209 | echo 210 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 211 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 212 | echo 213 | warning_resources="true" 214 | fi 215 | if (( cpus_available < 2 )); then 216 | echo 217 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 218 | echo "At least 2 CPUs recommended. 
You have $${cpus_available}" 219 | echo 220 | warning_resources="true" 221 | fi 222 | if (( disk_available < one_meg * 10 )); then 223 | echo 224 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 225 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 226 | echo 227 | warning_resources="true" 228 | fi 229 | if [[ $${warning_resources} == "true" ]]; then 230 | echo 231 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 232 | echo "Please follow the instructions to increase amount of resources available:" 233 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin" 234 | echo 235 | fi 236 | mkdir -p /sources/logs /sources/dags /sources/plugins 237 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 238 | exec /entrypoint airflow version 239 | # yamllint enable rule:line-length 240 | environment: 241 | <<: *airflow-common-env 242 | _AIRFLOW_DB_UPGRADE: 'true' 243 | _AIRFLOW_WWW_USER_CREATE: 'true' 244 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 245 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 246 | user: "0:0" 247 | volumes: 248 | - .:/sources 249 | 250 | airflow-cli: 251 | <<: *airflow-common 252 | profiles: 253 | - debug 254 | environment: 255 | <<: *airflow-common-env 256 | CONNECTION_CHECK_MAX_COUNT: "0" 257 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 258 | command: 259 | - bash 260 | - -c 261 | - airflow 262 | 263 | flower: 264 | <<: *airflow-common 265 | command: celery flower 266 | ports: 267 | - 5555:5555 268 | healthcheck: 269 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 270 | interval: 10s 271 | timeout: 10s 272 | retries: 5 273 | restart: always 274 | depends_on: 275 | <<: *airflow-common-depends-on 276 | airflow-init: 277 | condition: service_completed_successfully 278 | 279 | volumes: 280 | postgres-db-volume: -------------------------------------------------------------------------------- /week2/airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow-providers-google 2 | pyarrow 3 | -------------------------------------------------------------------------------- /week3/.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | logs/ -------------------------------------------------------------------------------- /week3/README.md: -------------------------------------------------------------------------------- 1 | ## Topics covered in week 3: 2 | - BigQuery 3 | - Table Partition 4 | - Table Clustering 5 | - Machine Learning in BigQuery 6 | - General usage with BigQuery 7 | - Airflow 8 | 9 | ### Notes: 10 | Notion page: https://www.notion.so/pcrespoo/Week-3-2e55578d253b45d4ab16c6213c3cf9f4 11 | -------------------------------------------------------------------------------- /week3/airflow/dags/__pycache__/gcp_to_bq_dag.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pcrespoo/data-engineering-bootcamp/a1835f9c1557d4974b73837a0622ac5d24db9dfa/week3/airflow/dags/__pycache__/gcp_to_bq_dag.cpython-37.pyc -------------------------------------------------------------------------------- /week3/airflow/dags/gcp_to_bq_dag.py: -------------------------------------------------------------------------------- 1 | import os 
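# gcs_2_bq_dag: moves the raw parquet files within the GCS bucket, exposes them to BigQuery as an external table, then materializes a date-partitioned table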
2 | from airflow import DAG 3 | from airflow.utils.dates import days_ago 4 | from airflow.providers.google.cloud.operators.bigquery import BigQueryCreateExternalTableOperator, BigQueryInsertJobOperator 5 | from airflow.providers.google.cloud.transfers.gcs_to_gcs import GCSToGCSOperator 6 | 7 | PROJECT_ID = os.environ.get("GCP_PROJECT_ID") 8 | BUCKET = os.environ.get("GCP_GCS_BUCKET") 9 | 10 | path_to_local_home = os.environ.get("AIRFLOW_HOME", "/opt/airflow/") 11 | BIGQUERY_DATASET = os.environ.get("BIGQUERY_DATASET", 'trips_data_all') 12 | 13 | default_args = { 14 | "owner": "airflow", 15 | "start_date": days_ago(1), 16 | "depends_on_past": False, 17 | "retries": 1, 18 | } 19 | 20 | with DAG( 21 | dag_id="gcs_2_bq_dag", 22 | schedule_interval="@daily", 23 | default_args=default_args, 24 | catchup=False, 25 | max_active_runs=1, 26 | tags=['dtc-de'], 27 | ) as dag: 28 | 29 | #move files in GCS 30 | gcs_2_gcs_task = GCSToGCSOperator( 31 | task_id = "gcs_2_gcs_task", 32 | source_bucket = BUCKET, 33 | source_object = "raw/yellow_tripdata*.parquet", 34 | destination_bucket = BUCKET, 35 | move_object = True, 36 | destination_object = 'yellow/' 37 | ) 38 | 39 | #create a table from the new file location 40 | gcs_2_bq_ext_task = BigQueryCreateExternalTableOperator( 41 | task_id="gcs_2_bq_ext_task", 42 | table_resource={ 43 | "tableReference": { 44 | "projectId": PROJECT_ID, 45 | "datasetId": BIGQUERY_DATASET, 46 | "tableId": "external_yellow_tripdata", 47 | }, 48 | "externalDataConfiguration": { 49 | "sourceFormat": "PARQUET", 50 | "sourceUris": [f"gs://{BUCKET}/yellow/*"], 51 | }, 52 | }, 53 | ) 54 | 55 | #partition a table 56 | CREATE_PART_TBL_QUERY = f"""CREATE OR REPLACE TABLE {BIGQUERY_DATASET}.yellow_tripdata_partitioned 57 | PARTITION BY DATE(tpep_pickup_datetime) AS SELECT * FROM {BIGQUERY_DATASET}.external_yellow_tripdata""" 58 | bq_ext_2_part_task = BigQueryInsertJobOperator( 59 | task_id = "bq_ext_2_part_task", 60 | configuration = { 61 | 'query':{ 62 | 'query': CREATE_PART_TBL_QUERY, 63 | 'useLegacySql':False, 64 | } 65 | }, 66 | ) 67 | 68 | 69 | gcs_2_gcs_task >> gcs_2_bq_ext_task >> bq_ext_2_part_task -------------------------------------------------------------------------------- /week3/airflow/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 
22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.2.3 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 31 | # 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 33 | # Default: airflow 34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 35 | # Default: airflow 36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 37 | # Default: '' 38 | # 39 | # Feel free to modify this file to suit your needs. 40 | --- 41 | version: '3' 42 | x-airflow-common: 43 | &airflow-common 44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 46 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 47 | build: 48 | context: . 49 | dockerfile: ./Dockerfile 50 | environment: 51 | &airflow-common-env 52 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 53 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 54 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 55 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 56 | AIRFLOW__CORE__FERNET_KEY: '' 57 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 58 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 59 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' 60 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 61 | GOOGLE_APPLICATION_CREDENTIALS: /google/credentials/google_credentials.json 62 | AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT: 'google-cloud-platform://?extra__google_cloud_platform__key_path=/google/credentials/google_credentials.json' 63 | 64 | # TODO: Please change GCP_PROJECT_ID & GCP_GCS_BUCKET, as per your config 65 | GCP_PROJECT_ID: 'dtc-boot-7639' 66 | GCP_GCS_BUCKET: 'dtc_data_lake_dtc-boot-7639' 67 | 68 | volumes: 69 | - ./dags:/opt/airflow/dags 70 | - ./logs:/opt/airflow/logs 71 | - ./plugins:/opt/airflow/plugins 72 | - C:/Users/pedro/Documents/Estudos_DS/repos/data-engineering-bootcamp/week2/airflow/google/credentials/:/google/credentials:ro 73 | 74 | user: "${AIRFLOW_UID:-50000}:0" 75 | depends_on: 76 | &airflow-common-depends-on 77 | redis: 78 | condition: service_healthy 79 | postgres: 80 | condition: service_healthy 81 | 82 | services: 83 | postgres: 84 | image: postgres:13 85 | environment: 86 | POSTGRES_USER: airflow 87 | POSTGRES_PASSWORD: airflow 88 | POSTGRES_DB: airflow 89 | volumes: 90 | - postgres-db-volume:/var/lib/postgresql/data 91 | healthcheck: 92 | test: ["CMD", "pg_isready", "-U", "airflow"] 93 | interval: 5s 94 | retries: 5 95 | restart: always 96 | 97 | redis: 98 | image: redis:latest 99 | expose: 100 | - 6379 101 | healthcheck: 102 | test: ["CMD", "redis-cli", "ping"] 103 | interval: 5s 104 | timeout: 30s 105 | retries: 50 106 | restart: always 107 | 108 | airflow-webserver: 109 | <<: *airflow-common 110 | command: webserver 111 | ports: 112 | - 8080:8080 113 | healthcheck: 114 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 115 | interval: 10s 116 | timeout: 10s 117 
| retries: 5 118 | restart: always 119 | depends_on: 120 | <<: *airflow-common-depends-on 121 | airflow-init: 122 | condition: service_completed_successfully 123 | 124 | airflow-scheduler: 125 | <<: *airflow-common 126 | command: scheduler 127 | healthcheck: 128 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"'] 129 | interval: 10s 130 | timeout: 10s 131 | retries: 5 132 | restart: always 133 | depends_on: 134 | <<: *airflow-common-depends-on 135 | airflow-init: 136 | condition: service_completed_successfully 137 | 138 | airflow-worker: 139 | <<: *airflow-common 140 | command: celery worker 141 | healthcheck: 142 | test: 143 | - "CMD-SHELL" 144 | - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 145 | interval: 10s 146 | timeout: 10s 147 | retries: 5 148 | environment: 149 | <<: *airflow-common-env 150 | # Required to handle warm shutdown of the celery workers properly 151 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 152 | DUMB_INIT_SETSID: "0" 153 | restart: always 154 | depends_on: 155 | <<: *airflow-common-depends-on 156 | airflow-init: 157 | condition: service_completed_successfully 158 | 159 | airflow-triggerer: 160 | <<: *airflow-common 161 | command: triggerer 162 | healthcheck: 163 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 164 | interval: 10s 165 | timeout: 10s 166 | retries: 5 167 | restart: always 168 | depends_on: 169 | <<: *airflow-common-depends-on 170 | airflow-init: 171 | condition: service_completed_successfully 172 | 173 | airflow-init: 174 | <<: *airflow-common 175 | entrypoint: /bin/bash 176 | # yamllint disable rule:line-length 177 | command: 178 | - -c 179 | - | 180 | function ver() { 181 | printf "%04d%04d%04d%04d" $${1//./ } 182 | } 183 | airflow_version=$$(gosu airflow airflow version) 184 | airflow_version_comparable=$$(ver $${airflow_version}) 185 | min_airflow_version=2.2.0 186 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 187 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 188 | echo 189 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 190 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 191 | echo 192 | exit 1 193 | fi 194 | if [[ -z "${AIRFLOW_UID}" ]]; then 195 | echo 196 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 197 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 198 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 199 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 200 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user" 201 | echo 202 | fi 203 | one_meg=1048576 204 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 205 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 206 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 207 | warning_resources="false" 208 | if (( mem_available < 4000 )) ; then 209 | echo 210 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 211 | echo "At least 4GB of memory required. 
You have $$(numfmt --to iec $$((mem_available * one_meg)))" 212 | echo 213 | warning_resources="true" 214 | fi 215 | if (( cpus_available < 2 )); then 216 | echo 217 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 218 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 219 | echo 220 | warning_resources="true" 221 | fi 222 | if (( disk_available < one_meg * 10 )); then 223 | echo 224 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 225 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 226 | echo 227 | warning_resources="true" 228 | fi 229 | if [[ $${warning_resources} == "true" ]]; then 230 | echo 231 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 232 | echo "Please follow the instructions to increase amount of resources available:" 233 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin" 234 | echo 235 | fi 236 | mkdir -p /sources/logs /sources/dags /sources/plugins 237 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 238 | exec /entrypoint airflow version 239 | # yamllint enable rule:line-length 240 | environment: 241 | <<: *airflow-common-env 242 | _AIRFLOW_DB_UPGRADE: 'true' 243 | _AIRFLOW_WWW_USER_CREATE: 'true' 244 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 245 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 246 | user: "0:0" 247 | volumes: 248 | - .:/sources 249 | 250 | airflow-cli: 251 | <<: *airflow-common 252 | profiles: 253 | - debug 254 | environment: 255 | <<: *airflow-common-env 256 | CONNECTION_CHECK_MAX_COUNT: "0" 257 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 258 | command: 259 | - bash 260 | - -c 261 | - airflow 262 | 263 | flower: 264 | <<: *airflow-common 265 | command: celery flower 266 | ports: 267 | - 5555:5555 268 | healthcheck: 269 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 270 | interval: 10s 271 | timeout: 10s 272 | retries: 5 273 | restart: always 274 | depends_on: 275 | <<: *airflow-common-depends-on 276 | airflow-init: 277 | condition: service_completed_successfully 278 | 279 | volumes: 280 | postgres-db-volume: -------------------------------------------------------------------------------- /week4/README.md: -------------------------------------------------------------------------------- 1 | ## Topics covered in week 4: 2 | - dbt 3 | - Models creation 4 | - Macros 5 | - Seeds 6 | - Staging and Production environments 7 | - Deployment 8 | 9 | - Google Data Studio 10 | - Dashboards using BigQuery sources 11 | 12 | ### Notes: 13 | Notion page: https://www.notion.so/pcrespoo/Week-4-9de8ba99495b44839eccb082d49dc516 14 | -------------------------------------------------------------------------------- /week4/data_to_gcs/.gitignore: -------------------------------------------------------------------------------- 1 | google/ -------------------------------------------------------------------------------- /week4/data_to_gcs/upload_to_gcs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from google.cloud import storage 3 | 4 | init_url = 'https://nyc-tlc.s3.amazonaws.com/trip+data/' 5 | BUCKET = os.environ.get("GCP_GCS_BUCKET", "dtc_data_lake_dtc-boot-7639") 6 | GOOGLE_APPLICATION_CREDENTIALS = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", 
"/google/credentials/google_credentials.json") 7 | 8 | def upload_to_gcs(bucket, object_name, local_file): 9 | client = storage.Client() 10 | bucket = client.bucket(bucket) 11 | blob = bucket.blob(object_name) 12 | blob.upload_from_filename(local_file) 13 | 14 | def web_to_gcs(year, service): 15 | for i in range(12): 16 | 17 | # sets the month part of the file_name string 18 | month = '0'+str(i+1) 19 | month = month[-2:] 20 | 21 | # parquet file_name 22 | file_name = service + '_tripdata_' + year + '-' + month + '.parquet' 23 | 24 | # download it using bash command 25 | os.system(f'wget {init_url + file_name} -O {file_name}') 26 | 27 | # upload it to gcs 28 | upload_to_gcs(BUCKET, f"{service}/{file_name}", file_name) 29 | print(f"GCS: {service}/{file_name}") 30 | 31 | 32 | web_to_gcs('2019', 'green') 33 | web_to_gcs('2020', 'green') 34 | # web_to_gcs('2019', 'yellow') 35 | # web_to_gcs('2020', 'yellow') -------------------------------------------------------------------------------- /week4/dbt/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /week4/dbt/analyses/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pcrespoo/data-engineering-bootcamp/a1835f9c1557d4974b73837a0622ac5d24db9dfa/week4/dbt/analyses/.gitkeep -------------------------------------------------------------------------------- /week4/dbt/data/taxi_zone.csv: -------------------------------------------------------------------------------- 1 | "LocationID","Borough","Zone","service_zone" 2 | 1,"EWR","Newark Airport","EWR" 3 | 2,"Queens","Jamaica Bay","Boro Zone" 4 | 3,"Bronx","Allerton/Pelham Gardens","Boro Zone" 5 | 4,"Manhattan","Alphabet City","Yellow Zone" 6 | 5,"Staten Island","Arden Heights","Boro Zone" 7 | 6,"Staten Island","Arrochar/Fort Wadsworth","Boro Zone" 8 | 7,"Queens","Astoria","Boro Zone" 9 | 8,"Queens","Astoria Park","Boro Zone" 10 | 9,"Queens","Auburndale","Boro Zone" 11 | 10,"Queens","Baisley Park","Boro Zone" 12 | 11,"Brooklyn","Bath Beach","Boro Zone" 13 | 12,"Manhattan","Battery Park","Yellow Zone" 14 | 13,"Manhattan","Battery Park City","Yellow Zone" 15 | 14,"Brooklyn","Bay Ridge","Boro Zone" 16 | 15,"Queens","Bay Terrace/Fort Totten","Boro Zone" 17 | 16,"Queens","Bayside","Boro Zone" 18 | 17,"Brooklyn","Bedford","Boro Zone" 19 | 18,"Bronx","Bedford Park","Boro Zone" 20 | 19,"Queens","Bellerose","Boro Zone" 21 | 20,"Bronx","Belmont","Boro Zone" 22 | 21,"Brooklyn","Bensonhurst East","Boro Zone" 23 | 22,"Brooklyn","Bensonhurst West","Boro Zone" 24 | 23,"Staten Island","Bloomfield/Emerson Hill","Boro Zone" 25 | 24,"Manhattan","Bloomingdale","Yellow Zone" 26 | 25,"Brooklyn","Boerum Hill","Boro Zone" 27 | 26,"Brooklyn","Borough Park","Boro Zone" 28 | 27,"Queens","Breezy Point/Fort Tilden/Riis Beach","Boro Zone" 29 | 28,"Queens","Briarwood/Jamaica Hills","Boro Zone" 30 | 29,"Brooklyn","Brighton Beach","Boro Zone" 31 | 30,"Queens","Broad Channel","Boro Zone" 32 | 31,"Bronx","Bronx Park","Boro Zone" 33 | 32,"Bronx","Bronxdale","Boro Zone" 34 | 33,"Brooklyn","Brooklyn Heights","Boro Zone" 35 | 34,"Brooklyn","Brooklyn Navy Yard","Boro Zone" 36 | 35,"Brooklyn","Brownsville","Boro Zone" 37 | 36,"Brooklyn","Bushwick North","Boro Zone" 38 | 37,"Brooklyn","Bushwick South","Boro Zone" 39 | 38,"Queens","Cambria Heights","Boro Zone" 40 | 39,"Brooklyn","Canarsie","Boro Zone" 41 | 
40,"Brooklyn","Carroll Gardens","Boro Zone" 42 | 41,"Manhattan","Central Harlem","Boro Zone" 43 | 42,"Manhattan","Central Harlem North","Boro Zone" 44 | 43,"Manhattan","Central Park","Yellow Zone" 45 | 44,"Staten Island","Charleston/Tottenville","Boro Zone" 46 | 45,"Manhattan","Chinatown","Yellow Zone" 47 | 46,"Bronx","City Island","Boro Zone" 48 | 47,"Bronx","Claremont/Bathgate","Boro Zone" 49 | 48,"Manhattan","Clinton East","Yellow Zone" 50 | 49,"Brooklyn","Clinton Hill","Boro Zone" 51 | 50,"Manhattan","Clinton West","Yellow Zone" 52 | 51,"Bronx","Co-Op City","Boro Zone" 53 | 52,"Brooklyn","Cobble Hill","Boro Zone" 54 | 53,"Queens","College Point","Boro Zone" 55 | 54,"Brooklyn","Columbia Street","Boro Zone" 56 | 55,"Brooklyn","Coney Island","Boro Zone" 57 | 56,"Queens","Corona","Boro Zone" 58 | 57,"Queens","Corona","Boro Zone" 59 | 58,"Bronx","Country Club","Boro Zone" 60 | 59,"Bronx","Crotona Park","Boro Zone" 61 | 60,"Bronx","Crotona Park East","Boro Zone" 62 | 61,"Brooklyn","Crown Heights North","Boro Zone" 63 | 62,"Brooklyn","Crown Heights South","Boro Zone" 64 | 63,"Brooklyn","Cypress Hills","Boro Zone" 65 | 64,"Queens","Douglaston","Boro Zone" 66 | 65,"Brooklyn","Downtown Brooklyn/MetroTech","Boro Zone" 67 | 66,"Brooklyn","DUMBO/Vinegar Hill","Boro Zone" 68 | 67,"Brooklyn","Dyker Heights","Boro Zone" 69 | 68,"Manhattan","East Chelsea","Yellow Zone" 70 | 69,"Bronx","East Concourse/Concourse Village","Boro Zone" 71 | 70,"Queens","East Elmhurst","Boro Zone" 72 | 71,"Brooklyn","East Flatbush/Farragut","Boro Zone" 73 | 72,"Brooklyn","East Flatbush/Remsen Village","Boro Zone" 74 | 73,"Queens","East Flushing","Boro Zone" 75 | 74,"Manhattan","East Harlem North","Boro Zone" 76 | 75,"Manhattan","East Harlem South","Boro Zone" 77 | 76,"Brooklyn","East New York","Boro Zone" 78 | 77,"Brooklyn","East New York/Pennsylvania Avenue","Boro Zone" 79 | 78,"Bronx","East Tremont","Boro Zone" 80 | 79,"Manhattan","East Village","Yellow Zone" 81 | 80,"Brooklyn","East Williamsburg","Boro Zone" 82 | 81,"Bronx","Eastchester","Boro Zone" 83 | 82,"Queens","Elmhurst","Boro Zone" 84 | 83,"Queens","Elmhurst/Maspeth","Boro Zone" 85 | 84,"Staten Island","Eltingville/Annadale/Prince's Bay","Boro Zone" 86 | 85,"Brooklyn","Erasmus","Boro Zone" 87 | 86,"Queens","Far Rockaway","Boro Zone" 88 | 87,"Manhattan","Financial District North","Yellow Zone" 89 | 88,"Manhattan","Financial District South","Yellow Zone" 90 | 89,"Brooklyn","Flatbush/Ditmas Park","Boro Zone" 91 | 90,"Manhattan","Flatiron","Yellow Zone" 92 | 91,"Brooklyn","Flatlands","Boro Zone" 93 | 92,"Queens","Flushing","Boro Zone" 94 | 93,"Queens","Flushing Meadows-Corona Park","Boro Zone" 95 | 94,"Bronx","Fordham South","Boro Zone" 96 | 95,"Queens","Forest Hills","Boro Zone" 97 | 96,"Queens","Forest Park/Highland Park","Boro Zone" 98 | 97,"Brooklyn","Fort Greene","Boro Zone" 99 | 98,"Queens","Fresh Meadows","Boro Zone" 100 | 99,"Staten Island","Freshkills Park","Boro Zone" 101 | 100,"Manhattan","Garment District","Yellow Zone" 102 | 101,"Queens","Glen Oaks","Boro Zone" 103 | 102,"Queens","Glendale","Boro Zone" 104 | 103,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone" 105 | 104,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone" 106 | 105,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone" 107 | 106,"Brooklyn","Gowanus","Boro Zone" 108 | 107,"Manhattan","Gramercy","Yellow Zone" 109 | 108,"Brooklyn","Gravesend","Boro Zone" 110 | 109,"Staten Island","Great Kills","Boro Zone" 111 | 110,"Staten 
Island","Great Kills Park","Boro Zone" 112 | 111,"Brooklyn","Green-Wood Cemetery","Boro Zone" 113 | 112,"Brooklyn","Greenpoint","Boro Zone" 114 | 113,"Manhattan","Greenwich Village North","Yellow Zone" 115 | 114,"Manhattan","Greenwich Village South","Yellow Zone" 116 | 115,"Staten Island","Grymes Hill/Clifton","Boro Zone" 117 | 116,"Manhattan","Hamilton Heights","Boro Zone" 118 | 117,"Queens","Hammels/Arverne","Boro Zone" 119 | 118,"Staten Island","Heartland Village/Todt Hill","Boro Zone" 120 | 119,"Bronx","Highbridge","Boro Zone" 121 | 120,"Manhattan","Highbridge Park","Boro Zone" 122 | 121,"Queens","Hillcrest/Pomonok","Boro Zone" 123 | 122,"Queens","Hollis","Boro Zone" 124 | 123,"Brooklyn","Homecrest","Boro Zone" 125 | 124,"Queens","Howard Beach","Boro Zone" 126 | 125,"Manhattan","Hudson Sq","Yellow Zone" 127 | 126,"Bronx","Hunts Point","Boro Zone" 128 | 127,"Manhattan","Inwood","Boro Zone" 129 | 128,"Manhattan","Inwood Hill Park","Boro Zone" 130 | 129,"Queens","Jackson Heights","Boro Zone" 131 | 130,"Queens","Jamaica","Boro Zone" 132 | 131,"Queens","Jamaica Estates","Boro Zone" 133 | 132,"Queens","JFK Airport","Airports" 134 | 133,"Brooklyn","Kensington","Boro Zone" 135 | 134,"Queens","Kew Gardens","Boro Zone" 136 | 135,"Queens","Kew Gardens Hills","Boro Zone" 137 | 136,"Bronx","Kingsbridge Heights","Boro Zone" 138 | 137,"Manhattan","Kips Bay","Yellow Zone" 139 | 138,"Queens","LaGuardia Airport","Airports" 140 | 139,"Queens","Laurelton","Boro Zone" 141 | 140,"Manhattan","Lenox Hill East","Yellow Zone" 142 | 141,"Manhattan","Lenox Hill West","Yellow Zone" 143 | 142,"Manhattan","Lincoln Square East","Yellow Zone" 144 | 143,"Manhattan","Lincoln Square West","Yellow Zone" 145 | 144,"Manhattan","Little Italy/NoLiTa","Yellow Zone" 146 | 145,"Queens","Long Island City/Hunters Point","Boro Zone" 147 | 146,"Queens","Long Island City/Queens Plaza","Boro Zone" 148 | 147,"Bronx","Longwood","Boro Zone" 149 | 148,"Manhattan","Lower East Side","Yellow Zone" 150 | 149,"Brooklyn","Madison","Boro Zone" 151 | 150,"Brooklyn","Manhattan Beach","Boro Zone" 152 | 151,"Manhattan","Manhattan Valley","Yellow Zone" 153 | 152,"Manhattan","Manhattanville","Boro Zone" 154 | 153,"Manhattan","Marble Hill","Boro Zone" 155 | 154,"Brooklyn","Marine Park/Floyd Bennett Field","Boro Zone" 156 | 155,"Brooklyn","Marine Park/Mill Basin","Boro Zone" 157 | 156,"Staten Island","Mariners Harbor","Boro Zone" 158 | 157,"Queens","Maspeth","Boro Zone" 159 | 158,"Manhattan","Meatpacking/West Village West","Yellow Zone" 160 | 159,"Bronx","Melrose South","Boro Zone" 161 | 160,"Queens","Middle Village","Boro Zone" 162 | 161,"Manhattan","Midtown Center","Yellow Zone" 163 | 162,"Manhattan","Midtown East","Yellow Zone" 164 | 163,"Manhattan","Midtown North","Yellow Zone" 165 | 164,"Manhattan","Midtown South","Yellow Zone" 166 | 165,"Brooklyn","Midwood","Boro Zone" 167 | 166,"Manhattan","Morningside Heights","Boro Zone" 168 | 167,"Bronx","Morrisania/Melrose","Boro Zone" 169 | 168,"Bronx","Mott Haven/Port Morris","Boro Zone" 170 | 169,"Bronx","Mount Hope","Boro Zone" 171 | 170,"Manhattan","Murray Hill","Yellow Zone" 172 | 171,"Queens","Murray Hill-Queens","Boro Zone" 173 | 172,"Staten Island","New Dorp/Midland Beach","Boro Zone" 174 | 173,"Queens","North Corona","Boro Zone" 175 | 174,"Bronx","Norwood","Boro Zone" 176 | 175,"Queens","Oakland Gardens","Boro Zone" 177 | 176,"Staten Island","Oakwood","Boro Zone" 178 | 177,"Brooklyn","Ocean Hill","Boro Zone" 179 | 178,"Brooklyn","Ocean Parkway South","Boro Zone" 180 | 179,"Queens","Old 
Astoria","Boro Zone" 181 | 180,"Queens","Ozone Park","Boro Zone" 182 | 181,"Brooklyn","Park Slope","Boro Zone" 183 | 182,"Bronx","Parkchester","Boro Zone" 184 | 183,"Bronx","Pelham Bay","Boro Zone" 185 | 184,"Bronx","Pelham Bay Park","Boro Zone" 186 | 185,"Bronx","Pelham Parkway","Boro Zone" 187 | 186,"Manhattan","Penn Station/Madison Sq West","Yellow Zone" 188 | 187,"Staten Island","Port Richmond","Boro Zone" 189 | 188,"Brooklyn","Prospect-Lefferts Gardens","Boro Zone" 190 | 189,"Brooklyn","Prospect Heights","Boro Zone" 191 | 190,"Brooklyn","Prospect Park","Boro Zone" 192 | 191,"Queens","Queens Village","Boro Zone" 193 | 192,"Queens","Queensboro Hill","Boro Zone" 194 | 193,"Queens","Queensbridge/Ravenswood","Boro Zone" 195 | 194,"Manhattan","Randalls Island","Yellow Zone" 196 | 195,"Brooklyn","Red Hook","Boro Zone" 197 | 196,"Queens","Rego Park","Boro Zone" 198 | 197,"Queens","Richmond Hill","Boro Zone" 199 | 198,"Queens","Ridgewood","Boro Zone" 200 | 199,"Bronx","Rikers Island","Boro Zone" 201 | 200,"Bronx","Riverdale/North Riverdale/Fieldston","Boro Zone" 202 | 201,"Queens","Rockaway Park","Boro Zone" 203 | 202,"Manhattan","Roosevelt Island","Boro Zone" 204 | 203,"Queens","Rosedale","Boro Zone" 205 | 204,"Staten Island","Rossville/Woodrow","Boro Zone" 206 | 205,"Queens","Saint Albans","Boro Zone" 207 | 206,"Staten Island","Saint George/New Brighton","Boro Zone" 208 | 207,"Queens","Saint Michaels Cemetery/Woodside","Boro Zone" 209 | 208,"Bronx","Schuylerville/Edgewater Park","Boro Zone" 210 | 209,"Manhattan","Seaport","Yellow Zone" 211 | 210,"Brooklyn","Sheepshead Bay","Boro Zone" 212 | 211,"Manhattan","SoHo","Yellow Zone" 213 | 212,"Bronx","Soundview/Bruckner","Boro Zone" 214 | 213,"Bronx","Soundview/Castle Hill","Boro Zone" 215 | 214,"Staten Island","South Beach/Dongan Hills","Boro Zone" 216 | 215,"Queens","South Jamaica","Boro Zone" 217 | 216,"Queens","South Ozone Park","Boro Zone" 218 | 217,"Brooklyn","South Williamsburg","Boro Zone" 219 | 218,"Queens","Springfield Gardens North","Boro Zone" 220 | 219,"Queens","Springfield Gardens South","Boro Zone" 221 | 220,"Bronx","Spuyten Duyvil/Kingsbridge","Boro Zone" 222 | 221,"Staten Island","Stapleton","Boro Zone" 223 | 222,"Brooklyn","Starrett City","Boro Zone" 224 | 223,"Queens","Steinway","Boro Zone" 225 | 224,"Manhattan","Stuy Town/Peter Cooper Village","Yellow Zone" 226 | 225,"Brooklyn","Stuyvesant Heights","Boro Zone" 227 | 226,"Queens","Sunnyside","Boro Zone" 228 | 227,"Brooklyn","Sunset Park East","Boro Zone" 229 | 228,"Brooklyn","Sunset Park West","Boro Zone" 230 | 229,"Manhattan","Sutton Place/Turtle Bay North","Yellow Zone" 231 | 230,"Manhattan","Times Sq/Theatre District","Yellow Zone" 232 | 231,"Manhattan","TriBeCa/Civic Center","Yellow Zone" 233 | 232,"Manhattan","Two Bridges/Seward Park","Yellow Zone" 234 | 233,"Manhattan","UN/Turtle Bay South","Yellow Zone" 235 | 234,"Manhattan","Union Sq","Yellow Zone" 236 | 235,"Bronx","University Heights/Morris Heights","Boro Zone" 237 | 236,"Manhattan","Upper East Side North","Yellow Zone" 238 | 237,"Manhattan","Upper East Side South","Yellow Zone" 239 | 238,"Manhattan","Upper West Side North","Yellow Zone" 240 | 239,"Manhattan","Upper West Side South","Yellow Zone" 241 | 240,"Bronx","Van Cortlandt Park","Boro Zone" 242 | 241,"Bronx","Van Cortlandt Village","Boro Zone" 243 | 242,"Bronx","Van Nest/Morris Park","Boro Zone" 244 | 243,"Manhattan","Washington Heights North","Boro Zone" 245 | 244,"Manhattan","Washington Heights South","Boro Zone" 246 | 245,"Staten Island","West Brighton","Boro 
Zone" 247 | 246,"Manhattan","West Chelsea/Hudson Yards","Yellow Zone" 248 | 247,"Bronx","West Concourse","Boro Zone" 249 | 248,"Bronx","West Farms/Bronx River","Boro Zone" 250 | 249,"Manhattan","West Village","Yellow Zone" 251 | 250,"Bronx","Westchester Village/Unionport","Boro Zone" 252 | 251,"Staten Island","Westerleigh","Boro Zone" 253 | 252,"Queens","Whitestone","Boro Zone" 254 | 253,"Queens","Willets Point","Boro Zone" 255 | 254,"Bronx","Williamsbridge/Olinville","Boro Zone" 256 | 255,"Brooklyn","Williamsburg (North Side)","Boro Zone" 257 | 256,"Brooklyn","Williamsburg (South Side)","Boro Zone" 258 | 257,"Brooklyn","Windsor Terrace","Boro Zone" 259 | 258,"Queens","Woodhaven","Boro Zone" 260 | 259,"Bronx","Woodlawn/Wakefield","Boro Zone" 261 | 260,"Queens","Woodside","Boro Zone" 262 | 261,"Manhattan","World Trade Center","Yellow Zone" 263 | 262,"Manhattan","Yorkville East","Yellow Zone" 264 | 263,"Manhattan","Yorkville West","Yellow Zone" 265 | 264,"Unknown","NV","N/A" 266 | 265,"Unknown","NA","N/A" -------------------------------------------------------------------------------- /week4/dbt/dbt_project.yml: -------------------------------------------------------------------------------- 1 | name: 'taxi_rides_ny' 2 | version: '1.0.0' 3 | config-version: 2 4 | 5 | # This setting configures which "profile" dbt uses for this project. 6 | profile: 'my_profile' 7 | 8 | # These configurations specify where dbt should look for different types of files. 9 | # The `source-paths` config, for example, states that models in this project can be 10 | # found in the "models/" directory. You probably won't need to change these! 11 | model-paths: ["models"] 12 | analysis-paths: ["analysis"] 13 | test-paths: ["tests"] 14 | seed-paths: ["data"] 15 | macro-paths: ["macros"] 16 | snapshot-paths: ["snapshots"] 17 | 18 | target-path: "target" # directory which will store compiled SQL files 19 | clean-targets: # directories to be removed by `dbt clean` 20 | - "target" 21 | - "dbt_packages" 22 | 23 | 24 | # Configuring models 25 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 26 | 27 | # In this example config, we tell dbt to build all models in the example/ directory 28 | # as tables. These settings can be overridden in the individual model files 29 | # using the `{{ config(...) }}` macro. 
30 | models: 31 | taxi_rides_ny: 32 | # Applies to all files under models/.../ 33 | vars: 34 | payment_type_values: [1, 2, 3, 4, 5, 6] 35 | 36 | seeds: 37 | taxi_rides_ny: 38 | taxi_zone: 39 | +column_types: 40 | LocationID: numeric -------------------------------------------------------------------------------- /week4/dbt/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pcrespoo/data-engineering-bootcamp/a1835f9c1557d4974b73837a0622ac5d24db9dfa/week4/dbt/macros/.gitkeep -------------------------------------------------------------------------------- /week4/dbt/macros/get_payment_type_description.sql: -------------------------------------------------------------------------------- 1 | {# 2 | This macro returns the description of the payment_type 3 | #} 4 | 5 | {% macro get_payment_type_description(payment_type) -%} 6 | 7 | case {{ payment_type }} 8 | when 1 then 'Credit card' 9 | when 2 then 'Cash' 10 | when 3 then 'No charge' 11 | when 4 then 'Dispute' 12 | when 5 then 'Unknown' 13 | when 6 then 'Voided trip' 14 | end 15 | 16 | {%- endmacro %} 17 | -------------------------------------------------------------------------------- /week4/dbt/models/core/dim_zones.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | select 3 | Locationid, 4 | Borough, 5 | zone, 6 | replace(service_zone,'Boro','Green') as service_zone 7 | from {{ ref('taxi_zone') }} -------------------------------------------------------------------------------- /week4/dbt/models/core/dm_monthly_zone_revenue.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | with trips_data as ( 4 | select * from {{ ref('fact_trips') }} 5 | ) 6 | select 7 | -- Revenue grouping 8 | pickup_zone as revenue_zone, 9 | date_trunc(pickup_datetime,month) as revenue_month, 10 | --Note: For BQ use instead: date_trunc(pickup_datetime, month) as revenue_month, 11 | 12 | service_type, 13 | 14 | -- Revenue calculation 15 | sum(fare_amount) as revenue_monthly_fare, 16 | sum(extra) as revenue_monthly_extra, 17 | sum(mta_tax) as revenue_monthly_mta_tax, 18 | sum(tip_amount) as revenue_monthly_tip_amount, 19 | sum(tolls_amount) as revenue_monthly_tolls_amount, 20 | sum(improvement_surcharge) as revenue_monthly_improvement_surcharge, 21 | sum(total_amount) as revenue_monthly_total_amount, 22 | sum(congestion_surcharge) as revenue_monthly_congestion_surcharge, 23 | 24 | -- Additional calculations 25 | count(tripid) as total_monthly_trips, 26 | avg(passenger_count) as avg_montly_passenger_count, 27 | avg(trip_distance) as avg_montly_trip_distance 28 | 29 | from trips_data 30 | group by 1,2,3 -------------------------------------------------------------------------------- /week4/dbt/models/core/fact_trips.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | with green_data as ( 4 | select *, 5 | 'Green' as service_type 6 | from {{ ref('stg_green_tripdata') }} 7 | ), 8 | 9 | yellow_data as ( 10 | select *, 11 | 'Yellow' as service_type 12 | from {{ ref('stg_yellow_tripdata') }} 13 | ), 14 | 15 | trips_unioned as ( 16 | select * from green_data 17 | union all 18 | select * from yellow_data 19 | ), 20 | 21 | dim_zones as ( 22 | select * from {{ ref('dim_zones') }} 23 | where Borough != 'Unknown' 24 | ) 25 | select 26 | trips_unioned.tripid, 27 |
trips_unioned.vendorid, 28 | trips_unioned.service_type, 29 | trips_unioned.ratecodeid, 30 | trips_unioned.pickup_locationid, 31 | pickup_zone.borough as pickup_borough, 32 | pickup_zone.zone as pickup_zone, 33 | trips_unioned.dropoff_locationid, 34 | dropoff_zone.borough as dropoff_borough, 35 | dropoff_zone.zone as dropoff_zone, 36 | trips_unioned.pickup_datetime, 37 | trips_unioned.dropoff_datetime, 38 | trips_unioned.store_and_fwd_flag, 39 | trips_unioned.passenger_count, 40 | trips_unioned.trip_distance, 41 | trips_unioned.trip_type, 42 | trips_unioned.fare_amount, 43 | trips_unioned.extra, 44 | trips_unioned.mta_tax, 45 | trips_unioned.tip_amount, 46 | trips_unioned.tolls_amount, 47 | trips_unioned.improvement_surcharge, 48 | trips_unioned.total_amount, 49 | trips_unioned.payment_type, 50 | trips_unioned.payment_type_description, 51 | trips_unioned.congestion_surcharge 52 | from trips_unioned 53 | inner join dim_zones as pickup_zone 54 | on trips_unioned.pickup_LocationID = pickup_zone.LocationID 55 | inner join dim_zones as dropoff_zone 56 | on trips_unioned.dropoff_LocationID = dropoff_zone.LocationID -------------------------------------------------------------------------------- /week4/dbt/models/core/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: dim_zones 5 | description: > 6 | List of unique zones identified by locationid. 7 | Includes the service zone they correspond to (Green or yellow). 8 | - name: fact_trips 9 | description: > 10 | Taxi trips corresponding to both service zones (Green and yellow). 11 | The table contains records where both pickup and dropoff locations are valid and known zones. 12 | Each record corresponds to a trip uniquely identified by tripid. 13 | 14 | - name: dm_monthly_zone_revenue 15 | description: > 16 | Aggregated table of all taxi trips corresponding to both service zones (Green and yellow) per pickup zone, month and service. 17 | The table contains monthly sums of the fare elements used to calculate the monthly revenue. 18 | The table also contains monthly indicators such as the number of trips and the average trip distance. 19 | columns: 20 | - name: revenue_monthly_total_amount 21 | description: Monthly sum of the total_amount of the fare charged for the trip per pickup zone, month and service. 22 | tests: 23 | - not_null: 24 | severity: error -------------------------------------------------------------------------------- /week4/dbt/models/staging/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: staging 5 | database: dtc-boot-7639 6 | schema: trips_data_all 7 | 8 | tables: 9 | - name: green_partitioned_clustered_table 10 | - name: yellow_partitioned_clustered_table 11 | models: 12 | - name: stg_green_tripdata 13 | description: > 14 | Trips made by green taxis, also known as boro taxis and street-hail liveries. 15 | Green taxis may respond to street hails, but only in the areas indicated in green on the 16 | map (i.e. above W 110 St/E 96th St in Manhattan and in the boroughs). 17 | The records were collected and provided to the NYC Taxi and Limousine Commission (TLC) by 18 | technology service providers.
19 | columns: 20 | - name: tripid 21 | description: Primary key for this table, generated with a concatenation of vendorid+pickup_datetime 22 | tests: 23 | - unique: 24 | severity: warn 25 | - not_null: 26 | severity: warn 27 | - name: VendorID 28 | description: > 29 | A code indicating the TPEP provider that provided the record. 30 | 1= Creative Mobile Technologies, LLC; 31 | 2= VeriFone Inc. 32 | - name: pickup_datetime 33 | description: The date and time when the meter was engaged. 34 | - name: dropoff_datetime 35 | description: The date and time when the meter was disengaged. 36 | - name: Passenger_count 37 | description: The number of passengers in the vehicle. This is a driver-entered value. 38 | - name: Trip_distance 39 | description: The elapsed trip distance in miles reported by the taximeter. 40 | - name: Pickup_locationid 41 | description: locationid where the meter was engaged. 42 | tests: 43 | - relationships: 44 | to: ref('taxi_zone') 45 | field: LocationID 46 | severity: warn 47 | - name: dropoff_locationid 48 | description: locationid where the meter was disengaged. 49 | tests: 50 | - relationships: 51 | to: ref('taxi_zone') 52 | field: LocationID 53 | - name: RateCodeID 54 | description: > 55 | The final rate code in effect at the end of the trip. 56 | 1= Standard rate 57 | 2=JFK 58 | 3=Newark 59 | 4=Nassau or Westchester 60 | 5=Negotiated fare 61 | 6=Group ride 62 | - name: Store_and_fwd_flag 63 | description: > 64 | This flag indicates whether the trip record was held in vehicle 65 | memory before sending to the vendor, aka “store and forward,” 66 | because the vehicle did not have a connection to the server. 67 | Y= store and forward trip 68 | N= not a store and forward trip 69 | - name: Dropoff_longitude 70 | description: Longitude where the meter was disengaged. 71 | - name: Dropoff_latitude 72 | description: Latitude where the meter was disengaged. 73 | - name: Payment_type 74 | description: > 75 | A numeric code signifying how the passenger paid for the trip. 76 | tests: 77 | - accepted_values: 78 | values: "{{ var('payment_type_values') }}" 79 | severity: warn 80 | quote: false 81 | - name: payment_type_description 82 | description: Description of the payment_type code 83 | - name: Fare_amount 84 | description: > 85 | The time-and-distance fare calculated by the meter. 86 | Extra Miscellaneous extras and surcharges. Currently, this only includes 87 | the $0.50 and $1 rush hour and overnight charges. 88 | MTA_tax $0.50 MTA tax that is automatically triggered based on the metered 89 | rate in use. 90 | - name: Improvement_surcharge 91 | description: > 92 | $0.30 improvement surcharge assessed trips at the flag drop. The 93 | improvement surcharge began being levied in 2015. 94 | - name: Tip_amount 95 | description: > 96 | Tip amount. This field is automatically populated for credit card 97 | tips. Cash tips are not included. 98 | - name: Tolls_amount 99 | description: Total amount of all tolls paid in trip. 100 | - name: Total_amount 101 | description: The total amount charged to passengers. Does not include cash tips. 102 | 103 | - name: stg_yellow_tripdata 104 | description: > 105 | Trips made by New York City's iconic yellow taxis. 106 | Yellow taxis are the only vehicles permitted to respond to a street hail from a passenger in all five 107 | boroughs. They may also be hailed using an e-hail app like Curb or Arro. 108 | The records were collected and provided to the NYC Taxi and Limousine Commission (TLC) by 109 | technology service providers.
110 | columns: 111 | - name: tripid 112 | description: Primary key for this table, generated with a concatenation of vendorid+pickup_datetime 113 | tests: 114 | - unique: 115 | severity: warn 116 | - not_null: 117 | severity: warn 118 | - name: VendorID 119 | description: > 120 | A code indicating the TPEP provider that provided the record. 121 | 1= Creative Mobile Technologies, LLC; 122 | 2= VeriFone Inc. 123 | - name: pickup_datetime 124 | description: The date and time when the meter was engaged. 125 | - name: dropoff_datetime 126 | description: The date and time when the meter was disengaged. 127 | - name: Passenger_count 128 | description: The number of passengers in the vehicle. This is a driver-entered value. 129 | - name: Trip_distance 130 | description: The elapsed trip distance in miles reported by the taximeter. 131 | - name: Pickup_locationid 132 | description: locationid where the meter was engaged. 133 | tests: 134 | - relationships: 135 | to: ref('taxi_zone') 136 | field: LocationID 137 | severity: warn 138 | - name: dropoff_locationid 139 | description: locationid where the meter was disengaged. 140 | tests: 141 | - relationships: 142 | to: ref('taxi_zone') 143 | field: LocationID 144 | severity: warn 145 | - name: RateCodeID 146 | description: > 147 | The final rate code in effect at the end of the trip. 148 | 1= Standard rate 149 | 2=JFK 150 | 3=Newark 151 | 4=Nassau or Westchester 152 | 5=Negotiated fare 153 | 6=Group ride 154 | - name: Store_and_fwd_flag 155 | description: > 156 | This flag indicates whether the trip record was held in vehicle 157 | memory before sending to the vendor, aka “store and forward,” 158 | because the vehicle did not have a connection to the server. 159 | Y= store and forward trip 160 | N= not a store and forward trip 161 | - name: Dropoff_longitude 162 | description: Longitude where the meter was disengaged. 163 | - name: Dropoff_latitude 164 | description: Latitude where the meter was disengaged. 165 | - name: Payment_type 166 | description: > 167 | A numeric code signifying how the passenger paid for the trip. 168 | tests: 169 | - accepted_values: 170 | values: "{{ var('payment_type_values') }}" 171 | severity: warn 172 | quote: false 173 | - name: payment_type_description 174 | description: Description of the payment_type code 175 | - name: Fare_amount 176 | description: > 177 | The time-and-distance fare calculated by the meter. 178 | Extra Miscellaneous extras and surcharges. Currently, this only includes 179 | the $0.50 and $1 rush hour and overnight charges. 180 | MTA_tax $0.50 MTA tax that is automatically triggered based on the metered 181 | rate in use. 182 | - name: Improvement_surcharge 183 | description: > 184 | $0.30 improvement surcharge assessed trips at the flag drop. The 185 | improvement surcharge began being levied in 2015. 186 | - name: Tip_amount 187 | description: > 188 | Tip amount. This field is automatically populated for credit card 189 | tips. Cash tips are not included. 190 | - name: Tolls_amount 191 | description: Total amount of all tolls paid in trip. 192 | - name: Total_amount 193 | description: The total amount charged to passengers. Does not include cash tips.
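# A minimal sketch of how these models and tests are typically exercised locally, assuming
# dbt-bigquery is installed and the profile referenced in dbt_project.yml is configured:
#   dbt deps   # install dbt_utils (declared in packages.yml)
#   dbt seed   # load data/taxi_zone.csv as the taxi_zone seed
#   dbt run    # build the staging views and the core tables
#   dbt test   # run the unique / not_null / relationships / accepted_values tests defined above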
194 | -------------------------------------------------------------------------------- /week4/dbt/models/staging/stg_green_tripdata.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | with tripdata as 4 | ( 5 | select *, 6 | row_number() over(partition by vendorid, lpep_pickup_datetime) as rn 7 | from {{source('staging','green_partitioned_clustered_table')}} 8 | where vendorid is not null 9 | ) 10 | 11 | select 12 | -- identifiers 13 | {{ dbt_utils.surrogate_key(['vendorid', 'lpep_pickup_datetime']) }} as tripid, 14 | cast(vendorid as integer) as vendorid, 15 | cast(ratecodeid as integer) as ratecodeid, 16 | cast(pulocationid as integer) as pickup_locationid, 17 | cast(dolocationid as integer) as dropoff_locationid, 18 | 19 | -- timestamps 20 | cast(lpep_pickup_datetime as timestamp) as pickup_datetime, 21 | cast(lpep_dropoff_datetime as timestamp) as dropoff_datetime, 22 | 23 | -- trip info 24 | store_and_fwd_flag, 25 | cast(passenger_count as integer) as passenger_count, 26 | cast(trip_distance as numeric) as trip_distance, 27 | cast(trip_type as integer) as trip_type, 28 | 29 | -- payment info 30 | cast(fare_amount as numeric) as fare_amount, 31 | cast(extra as numeric) as extra, 32 | cast(mta_tax as numeric) as mta_tax, 33 | cast(tip_amount as numeric) as tip_amount, 34 | cast(tolls_amount as numeric) as tolls_amount, 35 | cast(improvement_surcharge as numeric) as improvement_surcharge, 36 | cast(total_amount as numeric) as total_amount, 37 | cast(payment_type as integer) as payment_type, 38 | {{ get_payment_type_description('payment_type') }} as payment_type_description, 39 | cast(congestion_surcharge as numeric) as congestion_surcharge 40 | from tripdata 41 | where rn = 1 42 | -- dbt build --m --var 'is_test_run: false' 43 | {% if var('is_test_run', default=true) %} 44 | 45 | limit 100 46 | 47 | {% endif %} -------------------------------------------------------------------------------- /week4/dbt/models/staging/stg_yellow_tripdata.sql: -------------------------------------------------------------------------------- 1 | 2 | {{ config(materialized='view') }} 3 | 4 | with tripdata as 5 | ( 6 | select *, 7 | row_number() over(partition by vendorid, tpep_pickup_datetime) as rn 8 | from {{source('staging','yellow_partitioned_clustered_table')}} 9 | where vendorid is not null 10 | ) 11 | 12 | select 13 | -- identifiers 14 | {{ dbt_utils.surrogate_key(['vendorid', 'tpep_pickup_datetime']) }} as tripid, 15 | cast(vendorid as integer) as vendorid, 16 | cast(ratecodeid as integer) as ratecodeid, 17 | cast(pulocationid as integer) as pickup_locationid, 18 | cast(dolocationid as integer) as dropoff_locationid, 19 | 20 | -- timestamps 21 | cast(tpep_pickup_datetime as timestamp) as pickup_datetime, 22 | cast(tpep_dropoff_datetime as timestamp) as dropoff_datetime, 23 | 24 | -- trip info 25 | store_and_fwd_flag, 26 | cast(passenger_count as integer) as passenger_count, 27 | cast(trip_distance as numeric) as trip_distance, 28 | -- yellow cabs are always street-hail 29 | 1 as trip_type, 30 | 31 | -- payment info 32 | cast(fare_amount as numeric) as fare_amount, 33 | cast(extra as numeric) as extra, 34 | cast(mta_tax as numeric) as mta_tax, 35 | cast(tip_amount as numeric) as tip_amount, 36 | cast(tolls_amount as numeric) as tolls_amount, 37 | cast(improvement_surcharge as numeric) as improvement_surcharge, 38 | cast(total_amount as numeric) as total_amount, 39 | cast(payment_type as integer) as payment_type, 40 | {{ 
get_payment_type_description('payment_type') }} as payment_type_description, 41 | cast(congestion_surcharge as numeric) as congestion_surcharge 42 | from tripdata 43 | where rn = 1 44 | -- dbt build --m --var 'is_test_run: false' 45 | {% if var('is_test_run', default=true) %} 46 | 47 | limit 100 48 | 49 | {% endif %} -------------------------------------------------------------------------------- /week4/dbt/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 0.8.0 -------------------------------------------------------------------------------- /week4/dbt/profiles.yml: -------------------------------------------------------------------------------- 1 | my-profile: 2 | target: dev 3 | outputs: 4 | dev: 5 | type: bigquery 6 | method: oauth 7 | project: dtc-boot-7639 8 | dataset: dbt_pcrespoo 9 | location: southamerica-east1 -------------------------------------------------------------------------------- /week4/dbt/seeds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pcrespoo/data-engineering-bootcamp/a1835f9c1557d4974b73837a0622ac5d24db9dfa/week4/dbt/seeds/.gitkeep -------------------------------------------------------------------------------- /week4/dbt/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pcrespoo/data-engineering-bootcamp/a1835f9c1557d4974b73837a0622ac5d24db9dfa/week4/dbt/snapshots/.gitkeep -------------------------------------------------------------------------------- /week4/dbt/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pcrespoo/data-engineering-bootcamp/a1835f9c1557d4974b73837a0622ac5d24db9dfa/week4/dbt/tests/.gitkeep -------------------------------------------------------------------------------- /week6/README.md: -------------------------------------------------------------------------------- 1 | ## Topics covered in week 6: 2 | - Kafka 3 | - Basic terminology 4 | - Kafka workflow 5 | - Avro and Kafka 6 | - Kafka Streams 7 | - KSQL 8 | ### Notes: 9 | Notion page: https://www.notion.so/pcrespoo/Week-6-7b58575a54f64b64825a30ff02370a27 10 | -------------------------------------------------------------------------------- /week6/avro_example/consumer.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka.avro import AvroConsumer 2 | 3 | 4 | def read_messages(): 5 | consumer_config = {"bootstrap.servers": "localhost:9092", 6 | "schema.registry.url": "http://localhost:8081", 7 | "group.id": "datatalkclubs.taxirides.avro.consumer.2", 8 | "auto.offset.reset": "earliest"} 9 | 10 | consumer = AvroConsumer(consumer_config) 11 | consumer.subscribe(["datatalkclub.yellow_taxi_rides"]) 12 | 13 | while(True): 14 | try: 15 | message = consumer.poll(5) 16 | except Exception as e: 17 | print(f"Exception while trying to poll messages - {e}") 18 | else: 19 | if message: 20 | print(f"Successfully poll a record from " 21 | f"Kafka topic: {message.topic()}, partition: {message.partition()}, offset: {message.offset()}\n" 22 | f"message key: {message.key()} || message value: {message.value()}") 23 | consumer.commit() 24 | else: 25 | print("No new messages at this point. 
Try again later.") 26 | consumer.close() 27 | 28 | 29 | if __name__ == "__main__": 30 | read_messages() -------------------------------------------------------------------------------- /week6/avro_example/data/rides_new.csv: -------------------------------------------------------------------------------- 1 | VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge 2 | 1,2020-07-01 00:25:32,2020-07-01 00:33:39,1,1.5,1,N,238,75,2,8.0,0.5,0.5,0.0,0.0,0.3,9.3,0.0 3 | 1,2020-07-01 00:03:19,2020-07-01 00:25:43,1,9.5,1,N,138,216,1,26.5,0.5,0.5,0.0,0.0,0.3,27.8,0.0 4 | 2,2020-07-01 00:15:11,2020-07-01 00:29:24,1,5.85,1,N,230,88,2,18.5,0.5,0.5,0.0,0.0,0.3,22.3,2.5 5 | 2,2020-07-01 00:30:49,2020-07-01 00:38:26,1,1.9,1,N,88,232,1,8.0,0.5,0.5,2.36,0.0,0.3,14.16,2.5 6 | 2,2020-07-01 00:31:26,2020-07-01 00:38:02,1,1.25,1,N,37,17,2,6.5,0.5,0.5,0.0,0.0,0.3,7.8,0.0 7 | 1,2020-07-01 00:09:00,2020-07-01 00:34:39,1,9.7,1,N,140,61,1,30.0,3.0,0.5,0.0,0.0,0.3,33.8,2.5 8 | 2,2020-07-01 00:44:08,2020-07-01 00:58:12,1,5.27,1,N,137,260,1,16.5,0.5,0.5,6.09,0.0,0.3,26.39,2.5 9 | 2,2020-07-01 00:49:20,2020-07-01 00:56:44,1,1.32,1,N,166,41,2,7.5,0.5,0.5,0.0,0.0,0.3,8.8,0.0 10 | 2,2020-07-01 00:21:59,2020-07-01 00:25:12,1,0.73,1,N,239,142,1,5.0,0.5,0.5,1.32,0.0,0.3,10.12,2.5 11 | 2,2020-07-01 00:08:28,2020-07-01 00:36:18,1,18.65,2,N,132,249,1,52.0,0.0,0.5,11.06,0.0,0.3,66.36,2.5 12 | 1,2020-07-01 00:26:44,2020-07-01 00:43:46,2,8.0,1,N,138,112,1,24.0,0.5,0.5,3.0,0.0,0.3,28.3,0.0 13 | 2,2020-07-01 00:40:49,2020-07-01 00:51:59,3,4.97,1,N,79,195,2,16.0,0.5,0.5,0.0,0.0,0.3,19.8,2.5 14 | 2,2020-07-01 00:03:34,2020-07-01 00:03:42,1,0.0,2,N,45,45,1,52.0,0.0,0.5,11.06,0.0,0.3,66.36,2.5 15 | 2,2020-07-01 00:08:53,2020-07-01 00:12:42,1,0.57,1,N,263,263,2,4.5,0.5,0.5,0.0,0.0,0.3,8.3,2.5 16 | 2,2020-07-01 00:16:31,2020-07-01 00:16:41,1,0.0,1,N,263,263,1,2.5,0.5,0.5,1.89,0.0,0.3,8.19,2.5 17 | 2,2020-07-01 00:36:43,2020-07-01 01:02:48,1,9.41,1,N,170,116,1,29.5,0.5,0.5,3.0,0.0,0.3,36.3,2.5 18 | 1,2020-07-01 00:16:31,2020-07-01 00:16:43,1,2.8,1,Y,141,141,2,2.5,2.5,0.5,0.0,0.0,0.3,5.8,2.5 19 | 1,2020-07-01 00:33:37,2020-07-01 00:55:26,2,13.5,1,Y,137,254,2,36.5,2.5,0.5,0.0,0.0,0.3,39.8,2.5 20 | 2,2020-07-01 00:15:15,2020-07-01 00:17:44,1,0.48,1,N,140,140,1,4.0,0.5,0.5,1.56,0.0,0.3,9.36,2.5 21 | 2,2020-07-01 00:38:24,2020-07-01 00:46:57,1,1.67,1,N,238,75,1,8.0,0.5,0.5,2.79,0.0,0.3,12.09,0.0 22 | 2,2020-07-01 00:17:10,2020-07-01 00:25:45,1,3.75,1,N,137,75,1,12.5,0.5,0.5,4.89,0.0,0.3,21.19,2.5 23 | 2,2020-07-01 00:40:45,2020-07-01 00:50:45,1,3.45,1,N,263,48,1,12.0,0.5,0.5,3.16,0.0,0.3,18.96,2.5 24 | 1,2020-07-01 00:53:13,2020-07-01 01:13:32,2,6.7,1,N,249,74,1,21.0,3.0,0.5,6.0,0.0,0.3,30.8,2.5 25 | 1,2020-07-01 00:07:57,2020-07-01 00:14:42,2,1.7,1,N,142,186,2,7.5,3.0,0.5,0.0,0.0,0.3,11.3,2.5 26 | 1,2020-07-01 00:26:14,2020-07-01 00:30:19,1,1.1,1,N,140,262,1,6.0,3.0,0.5,1.0,0.0,0.3,10.8,2.5 27 | 2,2020-07-01 00:15:30,2020-07-01 00:22:33,1,1.36,1,N,43,237,2,7.5,0.5,0.5,0.0,0.0,0.3,11.3,2.5 28 | 2,2020-07-01 00:30:01,2020-07-01 00:39:54,1,2.01,1,N,141,239,2,9.5,0.5,0.5,0.0,0.0,0.3,13.3,2.5 29 | 2,2020-07-01 00:32:58,2020-07-01 00:40:18,3,0.77,1,N,114,234,2,4.5,0.5,0.5,0.0,0.0,0.3,8.3,2.5 30 | 1,2020-07-01 00:43:40,2020-07-01 00:46:32,1,1.1,1,N,249,186,2,5.0,3.0,0.5,0.0,0.0,0.3,8.8,2.5 31 | 1,2020-07-01 00:48:48,2020-07-01 
00:59:46,1,3.5,1,N,186,262,2,12.0,3.0,0.5,0.0,0.0,0.3,15.8,2.5 32 | 2,2020-07-01 00:25:25,2020-07-01 00:30:18,1,0.94,1,N,68,186,1,6.0,0.5,0.5,1.96,0.0,0.3,11.76,2.5 33 | 2,2020-07-01 00:32:51,2020-07-01 00:38:43,1,1.5,1,N,90,230,1,7.0,0.5,0.5,0.0,0.0,0.3,10.8,2.5 34 | 2,2020-07-01 00:02:23,2020-07-01 00:09:57,1,1.86,1,N,79,186,2,8.5,0.5,0.5,0.0,0.0,0.3,12.3,2.5 35 | 2,2020-07-01 00:17:00,2020-07-01 00:41:20,1,7.85,1,N,164,225,1,25.0,0.5,0.5,6.98,6.12,0.3,41.9,2.5 36 | 1,2020-07-01 00:40:37,2020-07-01 00:47:02,1,1.4,1,N,249,231,1,7.0,3.0,0.5,0.0,0.0,0.3,10.8,2.5 37 | 1,2020-07-01 00:57:28,2020-07-01 01:02:12,1,0.9,1,N,186,48,1,5.5,3.0,0.5,1.85,0.0,0.3,11.15,2.5 38 | 2,2020-07-01 00:09:44,2020-07-01 00:19:33,1,2.33,1,N,166,74,1,9.5,0.5,0.5,2.16,0.0,0.3,12.96,0.0 39 | 2,2020-07-01 00:13:57,2020-07-01 00:22:10,1,1.41,1,N,74,75,2,7.5,0.5,0.5,0.0,0.0,0.3,8.8,0.0 40 | 2,2020-07-01 00:27:00,2020-07-01 00:42:58,1,5.03,1,N,75,249,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5 41 | 2,2020-07-01 00:45:14,2020-07-01 00:53:13,1,1.44,1,N,114,79,1,7.5,0.5,0.5,0.0,0.0,0.3,11.3,2.5 42 | 2,2020-07-01 00:17:22,2020-07-01 00:38:33,1,9.3,1,N,138,166,2,28.0,0.5,0.5,0.0,6.12,0.3,35.42,0.0 43 | 2,2020-07-01 00:46:05,2020-07-01 00:47:58,1,0.75,1,N,238,236,1,4.0,0.5,0.5,0.8,0.0,0.3,6.1,0.0 44 | 2,2020-07-01 00:53:17,2020-07-01 01:04:18,1,4.44,1,N,263,7,2,14.5,0.5,0.5,0.0,0.0,0.3,18.3,2.5 45 | 2,2020-07-01 00:03:02,2020-07-01 00:07:35,2,2.39,1,N,137,232,1,8.5,0.5,0.5,2.46,0.0,0.3,14.76,2.5 46 | 2,2020-07-01 00:36:59,2020-07-01 00:51:01,3,6.42,1,N,137,97,2,20.5,0.5,0.5,0.0,0.0,0.3,24.3,2.5 47 | 2,2020-07-01 00:48:36,2020-07-01 00:57:29,1,3.77,1,N,107,263,1,11.5,0.5,0.5,1.53,0.0,0.3,16.83,2.5 48 | 2,2020-07-01 00:05:24,2020-07-01 00:14:11,1,2.89,1,N,263,170,2,10.0,0.5,0.5,0.0,0.0,0.3,13.8,2.5 49 | 2,2020-06-30 23:59:23,2020-07-01 00:13:26,1,5.07,1,N,114,236,1,16.5,0.5,0.5,3.0,0.0,0.3,23.3,2.5 50 | 2,2020-07-01 00:15:59,2020-07-01 00:29:58,3,2.55,1,N,75,166,1,12.0,0.5,0.5,0.0,0.0,0.3,13.3,0.0 51 | 2,2020-07-01 00:45:47,2020-07-01 00:54:11,4,1.95,1,N,142,164,2,8.5,0.5,0.5,0.0,0.0,0.3,12.3,2.5 52 | 2,2020-07-01 01:00:18,2020-07-01 01:27:13,1,12.66,1,N,68,248,1,36.5,0.5,0.5,2.75,0.0,0.3,40.55,0.0 53 | 2,2020-07-01 00:26:27,2020-07-01 00:57:14,1,11.6,1,N,114,128,1,35.5,0.5,0.5,0.0,0.0,0.3,39.3,2.5 54 | 2,2020-07-01 00:06:29,2020-07-01 00:26:40,1,10.27,1,N,138,238,1,29.5,0.5,0.5,9.86,6.12,0.3,49.28,2.5 55 | 2,2020-07-01 00:34:01,2020-07-01 00:47:57,1,5.31,1,N,142,4,1,17.5,0.5,0.5,5.32,0.0,0.3,26.62,2.5 56 | 2,2020-07-01 00:05:27,2020-07-01 00:20:15,1,7.48,1,N,140,25,1,22.5,0.5,0.5,6.58,0.0,0.3,32.88,2.5 57 | 2,2020-07-01 00:22:32,2020-07-01 00:46:40,1,5.27,1,N,25,89,1,20.0,0.5,0.5,4.26,0.0,0.3,25.56,0.0 58 | 2,2020-07-01 00:02:08,2020-07-01 00:18:04,1,4.98,1,N,79,13,1,17.0,0.5,0.5,4.16,0.0,0.3,24.96,2.5 59 | 2,2020-07-01 00:21:09,2020-07-01 00:34:00,1,5.66,1,N,13,137,1,18.0,0.5,0.5,3.0,0.0,0.3,24.8,2.5 60 | 1,2020-07-01 00:29:32,2020-07-01 00:35:36,1,1.3,1,N,211,232,1,7.0,3.0,0.5,2.15,0.0,0.3,12.95,2.5 61 | 1,2020-07-01 00:04:26,2020-07-01 00:17:19,2,2.4,1,N,179,8,2,11.0,0.5,0.5,0.0,0.0,0.3,12.3,0.0 62 | 2,2020-07-01 00:10:41,2020-07-01 00:19:54,1,2.12,1,N,142,140,1,9.0,0.5,0.5,3.2,0.0,0.3,16.0,2.5 63 | 2,2020-07-01 00:24:13,2020-07-01 00:28:34,2,1.67,1,N,141,75,1,6.5,0.5,0.5,2.06,0.0,0.3,12.36,2.5 64 | 2,2020-07-01 00:39:54,2020-07-01 00:49:20,2,2.77,1,N,141,100,1,10.0,0.5,0.5,3.45,0.0,0.3,17.25,2.5 65 | 2,2020-07-01 00:39:29,2020-07-01 00:57:16,1,5.99,1,N,166,7,1,19.5,0.5,0.5,5.0,6.12,0.3,31.92,0.0 66 | 2,2020-07-01 
00:18:54,2020-07-01 00:25:14,1,2.33,1,N,114,162,1,8.5,0.5,0.5,0.0,0.0,0.3,12.3,2.5 67 | 2,2020-07-01 00:10:56,2020-07-01 00:13:01,6,0.73,1,N,75,74,2,4.0,0.5,0.5,0.0,0.0,0.3,5.3,0.0 68 | 2,2020-07-01 00:18:59,2020-07-01 00:35:22,1,6.52,1,N,137,80,4,-21.0,-0.5,-0.5,0.0,0.0,-0.3,-24.8,-2.5 69 | 2,2020-07-01 00:18:59,2020-07-01 00:35:22,1,6.52,1,N,137,80,2,21.0,0.5,0.5,0.0,0.0,0.3,24.8,2.5 70 | 2,2020-07-01 00:01:58,2020-07-01 00:27:31,1,12.23,1,N,138,48,1,35.5,0.5,0.5,6.0,6.12,0.3,51.42,2.5 71 | 2,2020-06-30 23:42:25,2020-07-01 00:05:46,1,6.98,1,N,143,247,2,23.5,0.5,0.5,0.0,0.0,0.3,27.3,2.5 72 | 1,2020-07-01 00:01:40,2020-07-01 00:12:19,2,2.4,1,N,151,116,1,10.5,0.5,0.5,2.95,0.0,0.3,14.75,0.0 73 | 1,2020-07-01 00:30:51,2020-07-01 00:36:44,2,1.5,1,N,238,166,1,7.5,3.0,0.5,0.0,0.0,0.3,11.3,2.5 74 | 1,2020-07-01 00:12:23,2020-07-01 00:22:57,1,2.3,1,N,68,148,2,10.0,3.0,0.5,0.0,0.0,0.3,13.8,2.5 75 | 1,2020-07-01 00:27:23,2020-07-01 00:30:13,1,0.7,1,N,148,79,1,4.5,3.0,0.5,1.0,0.0,0.3,9.3,2.5 76 | 2,2020-07-01 00:24:51,2020-07-01 00:32:30,2,2.1,1,N,48,158,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5 77 | 2,2020-07-01 00:42:04,2020-07-01 00:45:39,2,1.05,1,N,48,48,2,5.0,0.5,0.5,0.0,0.0,0.3,8.8,2.5 78 | 1,2020-07-01 00:04:38,2020-07-01 00:09:25,1,1.0,1,N,249,186,2,6.0,3.0,0.5,0.0,0.0,0.3,9.8,2.5 79 | 1,2020-07-01 00:18:56,2020-07-01 00:20:35,1,0.5,1,N,125,249,1,3.5,3.0,0.5,1.0,0.0,0.3,8.3,2.5 80 | 1,2020-07-01 00:31:45,2020-07-01 00:50:50,2,6.2,1,N,249,37,1,20.5,3.0,0.5,4.85,0.0,0.3,29.15,2.5 81 | 1,2020-07-01 00:13:01,2020-07-01 00:35:14,1,14.6,1,N,162,16,2,40.5,3.0,0.5,0.0,0.0,0.3,44.3,2.5 82 | 2,2020-07-01 00:09:30,2020-07-01 00:12:54,1,0.7,1,N,230,100,2,4.5,0.5,0.5,0.0,0.0,0.3,8.3,2.5 83 | 1,2020-07-01 00:07:46,2020-07-01 00:16:16,1,2.0,1,N,170,141,1,8.5,3.0,0.5,3.65,0.0,0.3,15.95,2.5 84 | 1,2020-07-01 00:34:45,2020-07-01 00:53:39,1,4.9,1,N,249,262,1,17.5,3.0,0.5,2.7,0.0,0.3,24.0,2.5 85 | 2,2020-07-01 00:48:58,2020-07-01 01:01:27,1,4.82,1,N,75,145,1,15.0,0.5,0.5,0.0,0.0,0.3,18.8,2.5 86 | 2,2020-07-01 00:12:29,2020-07-01 00:19:56,1,1.73,1,N,263,238,1,8.0,0.5,0.5,1.0,0.0,0.3,12.8,2.5 87 | 1,2020-07-01 00:11:19,2020-07-01 00:16:24,1,1.1,1,N,138,70,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,0.0 88 | 1,2020-07-01 00:38:12,2020-07-01 01:14:52,2,25.9,1,N,138,156,4,68.0,0.5,0.5,0.0,12.24,0.3,81.54,0.0 89 | 2,2020-07-01 00:14:26,2020-07-01 00:17:41,2,0.77,1,N,137,107,1,4.5,0.5,0.5,1.66,0.0,0.3,9.96,2.5 90 | 1,2020-07-01 00:36:09,2020-07-01 00:42:25,1,2.3,1,N,48,236,1,8.5,3.0,0.5,2.45,0.0,0.3,14.75,2.5 91 | 2,2020-07-01 00:58:59,2020-07-01 01:06:59,1,3.57,1,N,74,119,1,11.5,0.5,0.5,2.56,0.0,0.3,15.36,0.0 92 | 1,2020-07-01 00:55:52,2020-07-01 01:14:25,1,7.8,1,N,68,159,1,24.5,3.0,0.5,5.65,0.0,0.3,33.95,2.5 93 | 2,2020-07-01 00:04:05,2020-07-01 00:18:10,1,6.31,1,N,137,97,1,19.5,0.5,0.5,0.0,0.0,0.3,23.3,2.5 94 | 1,2020-07-01 00:16:21,2020-07-01 00:46:16,1,18.5,4,N,50,265,1,67.0,3.0,0.5,5.0,0.0,0.3,75.8,2.5 95 | 2,2020-07-01 00:16:15,2020-07-01 00:44:00,2,10.8,1,N,186,26,1,32.5,0.5,0.5,3.7,0.0,0.3,40.0,2.5 96 | 1,2020-07-01 00:24:19,2020-07-01 00:32:39,1,1.6,1,N,234,144,1,8.0,3.0,0.5,1.0,0.0,0.3,12.8,2.5 97 | 1,2020-07-01 00:46:44,2020-07-01 01:05:20,1,4.1,1,N,90,238,1,17.0,3.0,0.5,4.15,0.0,0.3,24.95,2.5 98 | 1,2020-07-01 00:08:43,2020-07-01 00:33:43,1,5.7,1,N,186,217,2,21.5,3.0,0.5,0.0,0.0,0.3,25.3,2.5 99 | 2,2020-07-01 00:15:37,2020-07-01 00:34:29,1,14.19,1,N,132,7,1,38.5,0.5,0.5,7.96,0.0,0.3,47.76,0.0 100 | 2,2020-07-01 00:38:02,2020-07-01 00:49:24,1,2.47,1,N,7,145,2,10.5,0.5,0.5,0.0,0.0,0.3,11.8,0.0 101 | 2,2020-07-01 
00:15:10,2020-07-01 00:17:55,1,0.88,1,N,141,229,1,4.5,0.5,0.5,0.5,0.0,0.3,8.8,2.5 102 | 2,2020-07-01 00:33:27,2020-07-01 00:41:52,1,2.22,1,N,114,261,1,9.0,0.5,0.5,2.0,0.0,0.3,14.8,2.5 103 | 1,2020-07-01 00:15:07,2020-07-01 00:39:00,1,7.0,1,N,263,173,2,23.5,3.0,0.5,0.0,0.0,0.3,27.3,2.5 104 | 1,2020-07-01 00:47:31,2020-07-01 01:05:14,1,7.9,1,N,138,96,2,24.0,0.5,0.5,0.0,0.0,0.3,25.3,0.0 105 | 2,2020-07-01 00:39:43,2020-07-01 00:42:22,2,1.07,1,N,75,41,1,5.0,0.5,0.5,0.0,0.0,0.3,6.3,0.0 106 | 2,2020-07-01 00:19:37,2020-07-01 00:27:20,1,1.54,1,N,231,232,2,7.5,0.5,0.5,0.0,0.0,0.3,11.3,2.5 107 | 2,2020-07-01 00:33:35,2020-07-02 00:27:20,1,8.31,1,N,148,41,1,25.0,0.5,0.5,2.88,0.0,0.3,31.68,2.5 108 | 2,2020-07-01 00:59:08,2020-07-01 01:07:28,1,1.61,1,N,166,244,2,8.0,0.5,0.5,0.0,0.0,0.3,9.3,0.0 109 | 2,2020-07-01 00:08:12,2020-07-01 00:17:32,1,2.33,1,N,229,142,1,9.5,0.5,0.5,2.66,0.0,0.3,15.96,2.5 110 | 2,2020-07-01 00:27:41,2020-07-01 00:35:32,1,3.21,1,N,249,143,2,10.5,0.5,0.5,0.0,0.0,0.3,14.3,2.5 111 | 2,2020-07-01 00:08:42,2020-07-01 00:20:20,2,4.17,1,N,137,193,2,14.0,0.5,0.5,0.0,0.0,0.3,17.8,2.5 112 | 2,2020-07-01 00:39:57,2020-07-01 00:44:22,1,1.51,1,N,229,262,1,6.5,0.5,0.5,2.06,0.0,0.3,12.36,2.5 113 | 2,2020-07-01 00:45:33,2020-07-01 00:49:30,1,1.3,1,N,262,74,2,6.0,0.5,0.5,0.0,0.0,0.3,9.8,2.5 114 | 1,2020-07-01 00:05:05,2020-07-01 00:16:12,1,4.2,1,N,231,142,1,13.5,3.0,0.5,3.46,0.0,0.3,20.76,2.5 115 | 1,2020-07-01 00:21:25,2020-07-01 00:50:22,1,7.4,1,N,48,244,1,27.5,3.0,0.5,0.0,0.0,0.3,31.3,2.5 116 | 1,2020-07-01 00:12:49,2020-07-01 00:17:58,1,0.9,1,N,41,151,1,6.0,0.5,0.5,1.45,0.0,0.3,8.75,0.0 117 | 1,2020-07-01 00:35:12,2020-07-01 00:43:47,1,1.6,1,N,48,229,2,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5 118 | 1,2020-07-01 00:04:13,2020-07-01 00:07:08,1,0.9,1,N,140,263,1,5.0,3.0,0.5,1.0,0.0,0.3,9.8,2.5 119 | 1,2020-07-01 00:14:39,2020-07-01 00:16:51,1,0.9,1,N,263,141,1,4.5,3.0,0.5,1.65,0.0,0.3,9.95,2.5 120 | 1,2020-07-01 00:26:16,2020-07-01 00:30:30,1,0.7,1,N,262,262,1,5.0,3.0,0.5,2.2,0.0,0.3,11.0,2.5 121 | 2,2020-07-01 00:03:36,2020-07-01 00:16:32,1,3.86,1,N,87,246,1,14.0,0.5,0.5,0.0,0.0,0.3,17.8,2.5 122 | 2,2020-07-01 00:24:03,2020-07-01 00:32:35,1,3.11,1,N,186,261,2,10.5,0.5,0.5,0.0,0.0,0.3,14.3,2.5 123 | 2,2020-07-01 00:38:09,2020-07-01 00:48:00,1,2.89,1,N,125,48,2,11.0,0.5,0.5,0.0,0.0,0.3,14.8,2.5 124 | 2,2020-07-01 00:14:57,2020-07-01 00:49:16,1,3.49,1,N,140,74,2,23.0,0.5,0.5,0.0,0.0,0.3,26.8,2.5 125 | 2,2020-07-01 00:00:16,2020-07-01 00:14:52,1,2.51,1,N,80,148,1,12.0,0.5,0.5,3.16,0.0,0.3,18.96,2.5 126 | 2,2020-07-01 00:25:32,2020-07-01 00:31:39,1,2.17,1,N,233,263,1,8.0,0.5,0.5,2.36,0.0,0.3,14.16,2.5 127 | 2,2020-07-01 00:05:52,2020-07-01 00:18:37,1,2.57,1,N,244,41,1,11.5,0.5,0.5,2.56,0.0,0.3,15.36,0.0 128 | 2,2020-07-01 00:23:36,2020-07-01 00:31:16,1,1.04,1,N,41,74,2,7.0,0.5,0.5,0.0,0.0,0.3,8.3,0.0 129 | 2,2020-07-01 00:34:33,2020-07-01 00:48:41,1,4.79,1,N,75,193,2,15.5,0.5,0.5,0.0,0.0,0.3,19.3,2.5 130 | 2,2020-07-01 00:50:13,2020-07-01 01:18:50,1,17.21,1,N,193,76,2,47.0,0.5,0.5,0.0,0.0,0.3,48.3,0.0 131 | 1,2020-07-01 00:28:51,2020-07-01 00:38:34,1,3.5,1,N,79,141,1,11.5,3.0,0.5,3.05,0.0,0.3,18.35,2.5 132 | 1,2020-07-01 00:08:17,2020-07-01 00:23:00,2,10.6,1,N,138,191,1,29.5,0.5,0.5,7.7,0.0,0.3,38.5,0.0 133 | 2,2020-07-01 00:37:29,2020-07-01 00:51:43,1,8.75,1,N,74,259,2,25.5,0.5,0.5,0.0,0.0,0.3,26.8,0.0 134 | 1,2020-07-01 00:16:06,2020-07-01 00:20:01,2,1.0,1,N,48,48,1,5.0,3.0,0.5,0.0,0.0,0.3,8.8,2.5 135 | 2,2020-07-01 00:04:30,2020-07-01 05:10:03,1,21.22,1,N,224,224,2,173.5,0.5,0.5,0.0,0.0,0.3,177.3,2.5 
136 | 1,2020-07-01 00:51:08,2020-07-01 01:18:28,1,0.0,1,N,247,37,1,28.2,0.0,0.5,0.0,0.0,0.3,29.0,0.0 137 | 2,2020-07-01 00:01:34,2020-07-01 00:20:19,1,4.82,1,N,113,189,1,18.0,0.5,0.5,4.36,0.0,0.3,26.16,2.5 138 | 2,2020-07-01 00:24:53,2020-07-01 00:53:42,1,9.3,5,N,229,265,2,40.0,0.0,0.5,0.0,11.75,0.3,55.05,2.5 139 | 1,2020-07-01 00:53:12,2020-07-01 01:07:22,1,4.1,1,N,68,65,1,15.0,3.0,0.5,3.0,0.0,0.3,21.8,2.5 140 | 2,2020-07-01 00:52:34,2020-07-01 01:00:22,3,1.6,1,N,239,263,1,8.0,0.5,0.5,2.95,0.0,0.3,14.75,2.5 141 | 2,2020-07-01 00:38:25,2020-07-01 00:50:55,1,3.93,1,N,68,140,1,13.5,0.5,0.5,3.46,0.0,0.3,20.76,2.5 142 | 2,2020-07-01 00:59:10,2020-07-01 01:02:20,1,0.86,1,N,229,141,1,5.0,0.5,0.5,2.2,0.0,0.3,11.0,2.5 143 | 2,2020-07-01 00:17:00,2020-07-01 00:30:59,1,4.76,1,N,68,75,2,15.5,0.5,0.5,0.0,0.0,0.3,19.3,2.5 144 | 2,2020-07-01 00:32:59,2020-07-01 00:36:36,1,1.03,1,N,75,262,2,5.5,0.5,0.5,0.0,0.0,0.3,9.3,2.5 145 | 2,2020-07-01 00:34:29,2020-07-01 00:38:21,1,0.6,1,N,151,43,2,5.0,0.5,0.5,0.0,0.0,0.3,6.3,0.0 146 | 2,2020-07-01 00:05:01,2020-07-01 00:07:22,6,1.09,1,N,249,186,1,5.0,0.5,0.5,2.64,0.0,0.3,11.44,2.5 147 | 2,2020-07-01 01:00:13,2020-07-01 01:16:45,6,9.22,1,N,148,7,1,27.0,0.5,0.5,9.24,0.0,0.3,40.04,2.5 148 | 2,2020-07-01 00:32:28,2020-07-01 00:48:25,1,10.65,1,N,183,64,1,30.0,0.5,0.5,2.75,6.12,0.3,40.17,0.0 149 | 2,2020-07-01 00:50:08,2020-07-01 00:50:18,1,0.0,2,N,74,74,1,52.0,0.0,0.5,0.0,6.12,0.3,58.92,0.0 150 | 2,2020-07-01 00:57:42,2020-07-01 01:01:13,2,0.82,1,N,141,263,1,5.0,0.5,0.5,0.0,0.0,0.3,8.8,2.5 151 | 2,2020-07-01 00:07:52,2020-07-01 00:11:51,1,0.99,1,N,263,141,1,5.5,0.5,0.5,2.33,0.0,0.3,11.63,2.5 152 | 2,2020-07-01 00:38:02,2020-07-01 00:56:43,1,5.81,1,N,48,226,1,19.0,0.5,0.5,5.0,0.0,0.3,27.8,2.5 153 | 2,2020-07-01 00:12:06,2020-07-01 00:37:51,1,16.43,1,N,132,188,1,44.0,0.5,0.5,5.0,0.0,0.3,50.3,0.0 154 | 2,2020-07-01 00:19:56,2020-07-01 00:37:09,1,8.17,1,N,75,88,1,24.5,0.5,0.5,5.66,0.0,0.3,33.96,2.5 155 | 2,2020-07-01 00:13:41,2020-07-01 00:22:54,1,6.15,1,N,132,135,2,18.0,0.5,0.5,0.0,0.0,0.3,19.3,0.0 156 | 2,2020-07-01 00:23:55,2020-07-01 00:27:12,2,0.95,1,N,263,74,1,5.0,0.5,0.5,1.76,0.0,0.3,10.56,2.5 157 | 2,2020-07-01 00:32:56,2020-07-01 00:36:43,2,1.19,1,N,263,229,1,5.5,0.5,0.5,1.86,0.0,0.3,11.16,2.5 158 | 2,2020-07-01 00:51:36,2020-07-01 01:08:08,2,4.95,1,N,137,112,2,17.5,0.5,0.5,0.0,0.0,0.3,21.3,2.5 159 | 1,2020-07-01 00:19:28,2020-07-01 00:57:28,1,9.3,1,N,68,120,2,34.5,3.0,0.5,0.0,0.0,0.3,38.3,2.5 160 | 2,2020-07-01 00:23:07,2020-07-01 00:26:26,2,0.62,1,N,48,164,1,4.5,0.5,0.5,0.0,0.0,0.3,8.3,2.5 161 | 2,2020-07-01 00:33:28,2020-07-01 00:34:44,3,0.62,1,N,48,68,1,4.0,0.5,0.5,1.56,0.0,0.3,9.36,2.5 162 | 2,2020-07-01 00:09:00,2020-07-01 00:09:22,2,0.05,1,N,113,113,2,2.5,0.5,0.5,0.0,0.0,0.3,6.3,2.5 163 | 1,2020-07-01 00:15:23,2020-07-01 00:20:04,1,0.8,1,N,79,4,1,5.5,3.0,0.5,1.85,0.0,0.3,11.15,2.5 164 | 1,2020-07-01 00:19:27,2020-07-01 00:44:15,1,7.4,1,N,113,129,1,24.5,3.0,0.5,4.0,0.0,0.3,32.3,2.5 165 | 2,2020-07-01 00:07:06,2020-07-01 00:16:37,1,3.0,1,N,143,152,1,10.5,0.5,0.5,2.86,0.0,0.3,17.16,2.5 166 | 2,2020-07-01 00:03:33,2020-07-01 00:10:21,1,2.51,1,N,263,170,1,9.0,0.5,0.5,1.92,0.0,0.3,14.72,2.5 167 | 1,2020-07-01 00:04:39,2020-07-01 00:13:28,1,0.0,1,N,236,170,1,9.2,3.0,0.5,2.6,0.0,0.3,15.6,2.5 168 | 1,2020-07-01 00:17:43,2020-07-01 00:32:35,1,8.6,1,N,233,212,1,25.0,3.0,0.5,5.75,0.0,0.3,34.55,2.5 169 | 1,2020-07-01 00:51:58,2020-07-01 00:55:37,1,1.5,1,N,229,137,1,6.0,3.0,0.5,1.96,0.0,0.3,11.76,2.5 170 | 2,2020-07-01 00:19:11,2020-07-01 
00:24:01,3,1.44,1,N,186,50,1,6.0,0.5,0.5,2.94,0.0,0.3,12.74,2.5 171 | 2,2020-07-01 00:50:23,2020-07-01 00:56:23,4,1.74,1,N,68,234,1,7.5,0.5,0.5,2.26,0.0,0.3,13.56,2.5 172 | 2,2020-07-01 00:15:19,2020-07-01 00:22:27,1,2.25,1,N,114,65,2,9.5,0.5,0.5,0.0,0.0,0.3,13.3,2.5 173 | 2,2020-07-01 00:12:10,2020-07-01 01:05:19,1,29.88,1,N,259,86,1,80.0,0.5,0.5,2.75,6.12,0.3,90.17,0.0 174 | 1,2020-07-01 00:40:36,2020-07-01 00:50:32,1,2.1,1,N,237,263,1,9.5,3.0,0.5,2.65,0.0,0.3,15.95,2.5 175 | 2,2020-07-01 00:31:32,2020-07-01 00:36:52,1,1.63,1,N,263,74,2,7.0,0.5,0.5,0.0,0.0,0.3,10.8,2.5 176 | 2,2020-07-01 00:54:23,2020-07-01 00:58:57,2,1.81,1,N,263,74,1,7.0,0.5,0.5,2.0,0.0,0.3,12.8,2.5 177 | 1,2020-07-01 00:09:18,2020-07-01 00:36:31,1,0.0,1,N,137,76,1,40.2,0.0,0.5,0.0,6.12,0.3,47.12,0.0 178 | 2,2020-07-01 00:10:02,2020-07-01 00:19:46,1,2.13,1,N,68,125,2,9.0,0.5,0.5,0.0,0.0,0.3,12.8,2.5 179 | 2,2020-07-01 00:14:31,2020-07-01 00:30:18,1,2.2,1,N,116,41,2,12.0,0.5,0.5,0.0,0.0,0.3,13.3,0.0 180 | 2,2020-07-01 00:02:54,2020-07-01 00:09:25,1,1.88,1,N,48,229,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5 181 | 2,2020-07-01 00:48:38,2020-07-01 00:57:47,1,3.44,1,N,143,166,2,11.0,0.5,0.5,0.0,0.0,0.3,14.8,2.5 182 | 1,2020-07-01 00:54:28,2020-07-01 01:21:16,1,0.0,1,N,100,259,1,39.2,0.0,0.5,0.0,0.0,0.3,40.0,0.0 183 | 2,2020-07-01 00:02:21,2020-07-01 00:08:56,4,1.96,1,N,100,114,1,8.0,0.5,0.5,2.36,0.0,0.3,14.16,2.5 184 | 2,2020-07-01 00:09:00,2020-07-01 00:22:11,1,4.36,1,N,137,146,2,15.0,0.5,0.5,0.0,0.0,0.3,18.8,2.5 185 | 2,2020-07-01 00:53:38,2020-07-01 00:58:08,1,1.54,1,N,142,186,2,6.5,0.5,0.5,0.0,0.0,0.3,10.3,2.5 186 | 2,2020-07-01 00:52:58,2020-07-01 01:01:28,1,2.85,1,N,237,79,1,10.0,0.5,0.5,2.76,0.0,0.3,16.56,2.5 187 | 1,2020-07-01 00:06:43,2020-07-01 00:32:05,1,7.0,1,N,238,226,2,23.0,3.0,0.5,0.0,0.0,0.3,26.8,2.5 188 | 1,2020-07-01 00:05:01,2020-07-01 00:24:49,2,9.2,1,N,138,17,2,27.5,0.5,0.5,0.0,0.0,0.3,28.8,0.0 189 | 2,2020-07-01 00:02:07,2020-07-01 00:08:02,1,1.29,1,N,142,48,1,6.5,0.5,0.5,3.7,0.0,0.3,14.0,2.5 190 | 2,2020-07-01 00:11:00,2020-07-01 00:21:26,1,3.23,1,N,230,263,1,11.5,0.5,0.5,3.83,0.0,0.3,19.13,2.5 191 | 2,2020-07-01 00:26:40,2020-07-01 00:45:39,1,11.67,1,N,141,28,2,32.0,0.5,0.5,0.0,0.0,0.3,35.8,2.5 192 | 2,2020-07-01 00:59:08,2020-07-01 01:03:01,1,1.17,1,N,258,102,2,5.5,0.5,0.5,0.0,0.0,0.3,6.8,0.0 193 | 2,2020-07-01 00:00:52,2020-07-01 00:08:38,1,1.85,1,N,68,158,1,8.0,0.5,0.5,2.36,0.0,0.3,14.16,2.5 194 | 2,2020-07-01 00:42:59,2020-07-01 01:02:30,1,7.46,1,N,249,41,1,24.0,0.5,0.5,2.0,0.0,0.3,29.8,2.5 195 | 1,2020-07-01 00:42:46,2020-07-01 00:58:19,1,4.8,1,N,186,193,2,16.5,3.0,0.5,0.0,0.0,0.3,20.3,2.5 196 | 2,2020-07-01 00:00:54,2020-07-01 00:11:46,1,2.02,1,N,170,90,2,9.5,0.5,0.5,0.0,0.0,0.3,13.3,2.5 197 | 1,2020-07-01 00:01:32,2020-07-01 00:21:31,1,3.2,1,N,236,186,1,15.0,3.5,0.5,0.0,0.0,0.3,19.3,2.5 198 | 1,2020-07-01 00:55:22,2020-07-01 01:02:53,1,1.7,1,N,107,229,2,8.0,3.5,0.5,0.0,0.0,0.3,12.3,2.5 199 | 2,2020-07-01 00:51:31,2020-07-01 00:51:39,1,0.03,1,N,42,264,2,2.5,0.5,0.5,0.0,0.0,0.3,3.8,0.0 200 | 1,2020-07-01 00:27:40,2020-07-01 00:41:35,1,9.1,1,N,132,39,1,26.0,0.5,0.5,0.02,0.0,0.3,27.32,0.0 201 | 1,2020-07-01 00:23:18,2020-07-01 00:56:51,1,22.8,1,N,138,123,1,60.5,0.5,0.5,0.05,0.0,0.3,61.85,0.0 202 | 2,2020-07-01 00:10:41,2020-07-01 00:11:38,2,0.91,5,N,70,70,1,35.0,0.0,0.5,0.0,0.0,0.3,35.8,0.0 203 | 2,2020-07-01 00:02:23,2020-07-01 00:10:22,1,1.35,1,N,107,186,2,7.5,0.5,0.5,0.0,0.0,0.3,11.3,2.5 204 | 1,2020-07-01 00:27:23,2020-07-01 00:46:42,1,0.0,1,N,231,14,1,23.2,3.0,0.5,0.0,6.12,0.3,33.12,2.5 205 | 
1,2020-07-01 00:16:54,2020-07-01 00:44:13,1,7.5,1,N,264,264,1,25.0,0.5,0.5,5.25,0.0,0.3,31.55,0.0 206 | 2,2020-07-01 00:14:45,2020-07-01 00:23:33,1,2.51,1,N,263,142,1,9.5,0.5,0.5,2.66,0.0,0.3,15.96,2.5 207 | 2,2020-07-01 00:47:25,2020-07-01 01:13:52,1,10.84,1,N,231,244,1,32.0,0.5,0.5,7.16,0.0,0.3,42.96,2.5 208 | 2,2020-07-01 00:10:53,2020-07-01 00:25:28,1,5.08,1,N,90,75,2,16.0,0.5,0.5,0.0,0.0,0.3,19.8,2.5 209 | 2,2020-07-01 00:00:37,2020-07-01 00:16:20,1,6.34,1,N,70,229,1,19.5,0.5,0.5,3.5,0.0,0.3,26.8,2.5 210 | 1,2020-07-01 00:20:01,2020-07-01 00:25:18,0,1.7,1,N,48,68,3,7.0,3.0,0.5,0.0,0.0,0.3,10.8,2.5 211 | 1,2020-07-01 00:14:42,2020-07-01 00:36:34,1,8.6,1,N,162,196,2,26.0,3.0,0.5,0.0,0.0,0.3,29.8,2.5 212 | 2,2020-07-01 00:20:57,2020-07-01 00:26:10,3,1.33,1,N,137,100,2,6.5,0.5,0.5,0.0,0.0,0.3,10.3,2.5 213 | 1,2020-07-01 00:59:56,2020-07-01 01:01:52,1,0.6,1,N,234,113,1,4.0,3.0,0.5,1.95,0.0,0.3,9.75,2.5 214 | 2,2020-07-01 00:07:20,2020-07-01 00:12:24,1,1.81,1,N,90,48,1,7.5,0.5,0.5,0.0,0.0,0.3,11.3,2.5 215 | 2,2020-07-01 00:15:58,2020-07-01 00:21:47,1,2.34,1,N,48,90,1,8.0,0.5,0.5,2.36,0.0,0.3,14.16,2.5 216 | 2,2020-07-01 00:19:37,2020-07-01 00:22:34,1,1.32,1,N,170,79,1,5.5,0.5,0.5,1.86,0.0,0.3,11.16,2.5 217 | 2,2020-07-01 00:33:21,2020-07-01 00:38:35,1,0.64,1,N,100,186,2,5.5,0.5,0.5,0.0,0.0,0.3,9.3,2.5 218 | 2,2020-07-01 00:44:42,2020-07-01 00:51:50,1,1.64,1,N,231,261,1,7.5,0.5,0.5,2.26,0.0,0.3,13.56,2.5 219 | 2,2020-07-01 00:27:04,2020-07-01 00:31:28,1,0.67,1,N,113,79,1,5.0,0.5,0.5,1.76,0.0,0.3,10.56,2.5 220 | 2,2020-07-01 00:47:27,2020-07-01 01:05:31,1,6.53,1,N,137,198,2,21.0,0.5,0.5,0.0,0.0,0.3,24.8,2.5 221 | 2,2020-07-01 00:02:03,2020-07-01 00:10:13,1,2.17,1,N,48,233,2,9.0,0.5,0.5,0.0,0.0,0.3,12.8,2.5 222 | 2,2020-07-01 00:16:38,2020-07-02 00:10:13,1,4.54,1,N,164,256,1,16.0,0.5,0.5,3.96,0.0,0.3,23.76,2.5 223 | 1,2020-07-01 00:29:08,2020-07-01 00:49:35,1,9.5,1,N,138,116,2,27.5,1.0,0.5,0.0,6.12,0.3,35.42,0.0 224 | 2,2020-07-01 00:17:53,2020-07-01 00:22:15,1,1.15,1,N,137,137,1,5.5,0.5,0.5,2.33,0.0,0.3,11.63,2.5 225 | 2,2020-07-01 00:27:27,2020-07-01 00:41:14,1,6.93,1,N,137,159,1,21.0,0.5,0.5,2.0,0.0,0.3,26.8,2.5 226 | 1,2020-07-01 00:11:27,2020-07-01 00:29:59,2,8.8,1,N,138,198,2,25.5,0.5,0.5,0.0,0.0,0.3,26.8,0.0 227 | 1,2020-07-01 00:06:50,2020-07-01 00:23:37,1,8.4,1,N,138,80,1,25.0,0.5,0.5,5.25,0.0,0.3,31.55,0.0 228 | 2,2020-07-01 00:31:49,2020-07-01 00:46:41,2,5.42,1,N,161,42,1,17.0,0.5,0.5,5.2,0.0,0.3,26.0,2.5 229 | 1,2020-07-01 00:13:14,2020-07-01 00:29:10,1,4.1,1,N,181,177,2,15.0,0.5,0.5,0.0,0.0,0.3,16.3,0.0 230 | 1,2020-07-01 00:57:45,2020-07-01 01:00:31,1,0.4,1,N,49,49,2,4.0,0.5,0.5,0.0,0.0,0.3,5.3,0.0 231 | 2,2020-07-01 00:09:29,2020-07-01 00:20:40,1,3.11,1,N,141,7,2,11.0,0.5,0.5,0.0,0.0,0.3,14.8,2.5 232 | 2,2020-07-01 00:23:00,2020-07-01 00:41:10,1,4.42,1,N,7,140,1,16.5,0.5,0.5,6.09,0.0,0.3,26.39,2.5 233 | 2,2020-07-01 00:19:14,2020-07-01 00:51:53,1,27.71,4,N,132,265,2,91.0,0.5,0.5,0.0,6.12,0.3,98.42,0.0 234 | 1,2020-07-01 00:01:46,2020-07-01 00:24:45,3,12.9,1,N,138,61,2,36.5,0.5,0.5,0.0,0.0,0.3,37.8,0.0 235 | 2,2020-06-30 18:11:01,2020-06-30 18:15:03,1,0.55,1,N,74,75,2,4.5,0.5,0.5,0.0,0.0,0.3,5.8,0.0 236 | 2,2020-06-30 18:18:33,2020-06-30 18:23:32,1,1.67,1,N,236,237,2,7.0,0.5,0.5,0.0,0.0,0.3,10.8,2.5 237 | 2,2020-06-30 18:32:20,2020-07-01 18:07:16,1,7.45,1,N,161,159,1,28.5,0.5,0.5,3.0,0.0,0.3,35.3,2.5 238 | 2,2020-07-01 00:18:13,2020-07-01 00:23:50,2,0.76,1,N,137,234,2,6.0,0.5,0.5,0.0,0.0,0.3,9.8,2.5 239 | 1,2020-07-01 00:14:00,2020-07-01 
00:38:57,1,14.2,1,N,237,22,2,40.0,3.0,0.5,0.0,0.0,0.3,43.8,2.5 240 | 2,2020-07-01 00:21:46,2020-07-01 00:30:04,1,0.73,1,N,75,236,1,7.0,0.5,0.5,2.7,0.0,0.3,13.5,2.5 241 | 2,2020-07-01 00:28:49,2020-07-01 00:41:14,1,2.7,1,N,42,238,1,11.5,0.5,0.5,1.0,0.0,0.3,13.8,0.0 242 | 2,2020-07-01 00:52:31,2020-07-01 00:58:39,1,1.98,1,N,142,246,2,7.5,0.5,0.5,0.0,0.0,0.3,11.3,2.5 243 | 1,2020-07-01 00:14:43,2020-07-01 00:19:19,1,1.0,1,N,161,100,2,5.5,3.0,0.5,0.0,0.0,0.3,9.3,2.5 244 | 2,2020-07-01 00:05:15,2020-07-01 00:11:26,3,2.08,1,N,262,74,1,8.0,0.5,0.5,2.95,0.0,0.3,14.75,2.5 245 | 1,2020-07-01 01:38:25,2020-07-01 01:38:46,1,0.0,1,N,10,10,1,39.2,0.0,0.5,0.0,0.0,0.3,40.0,0.0 246 | 1,2020-07-01 01:20:14,2020-07-01 01:52:58,1,0.0,1,N,168,35,1,38.2,0.0,0.5,0.0,6.12,0.3,45.12,0.0 247 | 2,2020-07-01 01:31:41,2020-07-01 01:42:22,1,2.1,1,N,50,141,1,9.5,0.5,0.5,3.33,0.0,0.3,16.63,2.5 248 | 2,2020-07-01 01:48:22,2020-07-01 02:07:06,1,4.37,1,N,262,166,1,17.0,0.5,0.5,4.16,0.0,0.3,24.96,2.5 249 | 2,2020-07-01 01:27:57,2020-07-01 01:38:03,1,2.64,1,N,246,229,2,10.5,0.5,0.5,0.0,0.0,0.3,14.3,2.5 250 | 2,2020-07-01 01:48:33,2020-07-01 01:52:03,1,1.22,1,N,48,239,2,5.5,0.5,0.5,0.0,0.0,0.3,9.3,2.5 251 | 2,2020-07-01 01:57:36,2020-07-01 02:32:21,1,21.5,2,N,142,132,2,52.0,0.0,0.5,0.0,6.12,0.3,61.42,2.5 252 | 2,2020-07-01 01:36:00,2020-07-01 01:39:28,1,0.85,1,N,74,42,1,5.0,0.5,0.5,0.0,0.0,0.3,6.3,0.0 253 | 1,2020-07-01 01:10:43,2020-07-01 01:16:41,1,1.6,1,N,146,7,2,7.0,0.5,0.5,0.0,0.0,0.3,8.3,0.0 254 | 1,2020-07-01 01:47:34,2020-07-01 01:50:10,1,0.7,1,N,137,107,1,4.5,3.0,0.5,1.65,0.0,0.3,9.95,2.5 255 | 2,2020-07-01 01:55:37,2020-07-01 02:04:49,1,3.89,1,N,116,143,2,13.0,0.5,0.5,0.0,0.0,0.3,16.8,2.5 256 | 2,2020-07-01 01:06:11,2020-07-01 01:16:47,1,6.0,1,N,230,244,2,18.0,0.5,0.5,0.0,0.0,0.3,21.8,2.5 257 | 2,2020-07-01 01:38:55,2020-07-01 01:44:05,1,1.12,1,N,166,24,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,0.0 258 | 1,2020-07-01 01:28:19,2020-07-01 01:59:40,1,0.0,1,N,168,258,1,36.2,0.0,0.5,0.0,6.12,0.3,43.12,0.0 259 | 1,2020-07-01 01:23:23,2020-07-01 01:28:24,1,0.9,1,N,107,170,2,5.5,3.0,0.5,0.0,0.0,0.3,9.3,2.5 260 | 2,2020-07-01 01:31:24,2020-07-01 01:44:24,1,3.75,1,N,141,41,1,13.0,0.5,0.5,2.0,0.0,0.3,18.8,2.5 261 | 2,2020-07-01 01:38:26,2020-07-01 01:49:37,1,3.28,1,N,48,211,2,12.0,0.5,0.5,0.0,0.0,0.3,15.8,2.5 262 | 1,2020-07-01 01:35:50,2020-07-01 01:40:24,1,1.4,1,N,140,236,1,6.5,3.0,0.5,2.05,0.0,0.3,12.35,2.5 263 | 1,2020-07-01 01:48:38,2020-07-01 01:51:55,1,1.3,1,N,140,75,2,5.5,3.0,0.5,0.0,0.0,0.3,9.3,2.5 264 | 1,2020-07-01 01:55:54,2020-07-01 02:05:23,1,3.2,1,N,236,230,2,11.5,3.0,0.5,0.0,0.0,0.3,15.3,2.5 265 | 1,2020-07-01 01:48:27,2020-07-01 01:52:21,1,0.3,1,N,158,249,1,4.5,3.0,0.5,1.65,0.0,0.3,9.95,2.5 266 | 1,2020-07-01 01:49:27,2020-07-01 01:55:52,1,0.8,1,N,140,237,2,6.5,3.0,0.5,0.0,0.0,0.3,10.3,2.5 267 | 2,2020-07-01 01:49:40,2020-07-01 01:56:37,3,2.64,1,N,263,161,1,9.0,0.5,0.5,0.0,0.0,0.3,12.8,2.5 268 | -------------------------------------------------------------------------------- /week6/avro_example/producer.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka import avro 2 | from confluent_kafka.avro import AvroProducer 3 | import csv 4 | from time import sleep 5 | 6 | 7 | def load_avro_schema_from_file(): 8 | key_schema = avro.load("taxi_ride_key.avsc") 9 | value_schema = avro.load("taxi_ride_value.avsc") 10 | 11 | return key_schema, value_schema 12 | 13 | 14 | def send_record(): 15 | key_schema, value_schema = load_avro_schema_from_file() 16 | 17 | producer_config = { 
18 | "bootstrap.servers": "localhost:9092", 19 | "schema.registry.url": "http://localhost:8081", 20 | "acks": "1" 21 | } 22 | 23 | producer = AvroProducer(producer_config, default_key_schema=key_schema, default_value_schema=value_schema) 24 | 25 | file = open('./data/rides_new.csv') 26 | 27 | csvreader = csv.reader(file) 28 | header = next(csvreader) 29 | for row in csvreader: 30 | key = {"vendorId": int(row[0])} 31 | value = {"vendorId": int(row[0]), "passenger_count": int(row[3]), "trip_distance": float(row[4]), "payment_type": int(row[9]), "total_amount": float(row[16])} 32 | 33 | try: 34 | producer.produce(topic='datatalkclub.yellow_taxi_rides', key=key, value=value) 35 | except Exception as e: 36 | print(f"Exception while producing record value - {value}: {e}") 37 | else: 38 | print(f"Successfully producing record value - {value}") 39 | 40 | producer.flush() 41 | sleep(1) 42 | 43 | if __name__ == "__main__": 44 | send_record() -------------------------------------------------------------------------------- /week6/avro_example/taxi_ride_key.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "namespace": "com.datatalksclub.taxi", 3 | "type": "record", 4 | "name": "TaxiRideKey", 5 | "fields": [ 6 | { 7 | "name": "vendorId", 8 | "type": "int" 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /week6/avro_example/taxi_ride_value.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "namespace": "com.datatalksclub.taxi", 3 | "type": "record", 4 | "name": "TaxiRide", 5 | "fields": [ 6 | { 7 | "name": "vendorId", 8 | "type": "int" 9 | }, 10 | { 11 | "name": "passenger_count", 12 | "type": "int" 13 | }, 14 | { 15 | "name": "trip_distance", 16 | "type": "float" 17 | }, 18 | { 19 | "name": "payment_type", 20 | "type": "int" 21 | }, 22 | { 23 | "name": "total_amount", 24 | "type": "float" 25 | } 26 | ] 27 | } -------------------------------------------------------------------------------- /week6/consumer.py: -------------------------------------------------------------------------------- 1 | from kafka import KafkaConsumer 2 | from json import loads 3 | from time import sleep 4 | 5 | consumer = KafkaConsumer( 6 | 'demo_1', 7 | bootstrap_servers=['localhost:9092'], 8 | auto_offset_reset='earliest', 9 | enable_auto_commit=True, 10 | group_id='consumer.group.id.demo.1', 11 | value_deserializer=lambda x: loads(x.decode('utf-8'))) 12 | 13 | 14 | while(True): 15 | print("inside while") 16 | for message in consumer: 17 | message = message.value 18 | print(message) 19 | sleep(1) -------------------------------------------------------------------------------- /week6/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | zookeeper: 5 | image: confluentinc/cp-zookeeper:5.4.0 6 | hostname: zookeeper 7 | container_name: zookeeper 8 | ports: 9 | - "2181:2181" 10 | environment: 11 | ZOOKEEPER_CLIENT_PORT: 2181 12 | ZOOKEEPER_TICK_TIME: 2000 13 | 14 | broker: 15 | image: confluentinc/cp-server:5.4.0 16 | hostname: broker 17 | container_name: broker 18 | depends_on: 19 | - zookeeper 20 | ports: 21 | - "9092:9092" 22 | environment: 23 | KAFKA_BROKER_ID: 1 24 | KAFKA_ZOOKEEPER_CONNECT: "zookeeper:2181" 25 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 26 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092 27 | 
KAFKA_METRIC_REPORTERS: io.confluent.metrics.reporter.ConfluentMetricsReporter 28 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 29 | KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0 30 | KAFKA_CONFLUENT_LICENSE_TOPIC_REPLICATION_FACTOR: 1 31 | CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: broker:29092 32 | CONFLUENT_METRICS_REPORTER_ZOOKEEPER_CONNECT: zookeeper:2181 33 | CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1 34 | CONFLUENT_METRICS_ENABLE: "true" 35 | CONFLUENT_SUPPORT_CUSTOMER_ID: "anonymous" 36 | 37 | kafka-tools: 38 | image: confluentinc/cp-kafka:5.4.0 39 | hostname: kafka-tools 40 | container_name: kafka-tools 41 | command: ["tail", "-f", "/dev/null"] 42 | network_mode: "host" 43 | 44 | schema-registry: 45 | image: confluentinc/cp-schema-registry:5.4.0 46 | hostname: schema-registry 47 | container_name: schema-registry 48 | depends_on: 49 | - zookeeper 50 | - broker 51 | ports: 52 | - "8081:8081" 53 | environment: 54 | SCHEMA_REGISTRY_HOST_NAME: schema-registry 55 | SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: "zookeeper:2181" 56 | 57 | control-center: 58 | image: confluentinc/cp-enterprise-control-center:5.4.0 59 | hostname: control-center 60 | container_name: control-center 61 | depends_on: 62 | - zookeeper 63 | - broker 64 | - schema-registry 65 | ports: 66 | - "9021:9021" 67 | environment: 68 | CONTROL_CENTER_BOOTSTRAP_SERVERS: 'broker:29092' 69 | CONTROL_CENTER_ZOOKEEPER_CONNECT: 'zookeeper:2181' 70 | CONTROL_CENTER_SCHEMA_REGISTRY_URL: "http://schema-registry:8081" 71 | CONTROL_CENTER_REPLICATION_FACTOR: 1 72 | CONTROL_CENTER_INTERNAL_TOPICS_PARTITIONS: 1 73 | CONTROL_CENTER_MONITORING_INTERCEPTOR_TOPIC_PARTITIONS: 1 74 | CONFLUENT_METRICS_TOPIC_REPLICATION: 1 75 | PORT: 9021 -------------------------------------------------------------------------------- /week6/producer.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | from json import dumps 3 | from kafka import KafkaProducer 4 | 5 | 6 | producer = KafkaProducer(bootstrap_servers=['localhost:9092'], 7 | value_serializer=lambda x: 8 | dumps(x).encode('utf-8')) 9 | 10 | for e in range(1000): 11 | data = {'number' : e} 12 | producer.send('demo_1', value=data) 13 | print("producing") 14 | sleep(1) -------------------------------------------------------------------------------- /week6/requirements.txt: -------------------------------------------------------------------------------- 1 | kafka-python==1.4.6 2 | confluent_kafka 3 | requests 4 | avro 5 | faust 6 | fastavro -------------------------------------------------------------------------------- /week6/streams/__pycache__/taxi_rides.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pcrespoo/data-engineering-bootcamp/a1835f9c1557d4974b73837a0622ac5d24db9dfa/week6/streams/__pycache__/taxi_rides.cpython-37.pyc -------------------------------------------------------------------------------- /week6/streams/branch_price.py: -------------------------------------------------------------------------------- 1 | import faust 2 | from taxi_rides import TaxiRide 3 | from faust import current_event 4 | 5 | app = faust.App('datatalksclub.stream.v3', broker='kafka://localhost:9092', consumer_auto_offset_reset="earliest") 6 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide) 7 | 8 | high_amount_rides = app.topic('datatalks.yellow_taxi_rides.high_amount') 9 | low_amount_rides = app.topic('datatalks.yellow_taxi_rides.low_amount') 10 
| 11 | 12 | @app.agent(topic) 13 | async def process(stream): 14 | async for event in stream: 15 | if event.total_amount >= 40.0: 16 | await current_event().forward(high_amount_rides) 17 | else: 18 | await current_event().forward(low_amount_rides) 19 | 20 | if __name__ == '__main__': 21 | app.main() -------------------------------------------------------------------------------- /week6/streams/producer_taxi_json.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from json import dumps 3 | from kafka import KafkaProducer 4 | from time import sleep 5 | 6 | 7 | producer = KafkaProducer(bootstrap_servers=['localhost:9092'], 8 | key_serializer=lambda x: dumps(x).encode('utf-8'), 9 | value_serializer=lambda x: dumps(x).encode('utf-8')) 10 | 11 | file = open('../avro_example/data/rides_new.csv') 12 | 13 | csvreader = csv.reader(file) 14 | header = next(csvreader) 15 | for row in csvreader: 16 | key = {"vendorId": int(row[0])} 17 | value = {"vendorId": int(row[0]), "passenger_count": int(row[3]), "trip_distance": float(row[4]), "payment_type": int(row[9]), "total_amount": float(row[16])} 18 | producer.send('datatalkclub.yellow_taxi_ride.json', value=value, key=key) 19 | print("producing") 20 | sleep(1) -------------------------------------------------------------------------------- /week6/streams/stream.py: -------------------------------------------------------------------------------- 1 | import faust 2 | from taxi_rides import TaxiRide 3 | 4 | 5 | app = faust.App('datatalksclub.stream.v2', broker='kafka://localhost:9092') 6 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide) 7 | 8 | 9 | @app.agent(topic) 10 | async def start_reading(records): 11 | async for record in records: 12 | print(record) 13 | 14 | 15 | if __name__ == '__main__': 16 | app.main() -------------------------------------------------------------------------------- /week6/streams/taxi_rides.py: -------------------------------------------------------------------------------- 1 | import faust 2 | 3 | 4 | class TaxiRide(faust.Record, validation=True): 5 | vendorId: str 6 | passenger_count: int 7 | trip_distance: float 8 | payment_type: int 9 | total_amount: float --------------------------------------------------------------------------------
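
A minimal sketch (not part of the repository) of how the two branched topics created by `week6/streams/branch_price.py` could be read back with `kafka-python`, mirroring the style of `week6/consumer.py`. The topic names and broker address are taken from the files above; the consumer group id is a made-up placeholder.

```python
# Hypothetical consumer for the high/low amount branches written by branch_price.py.
# Assumes the docker-compose broker above is running and reachable on localhost:9092.
from json import loads
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    'datatalks.yellow_taxi_rides.high_amount',   # topic names as defined in branch_price.py
    'datatalks.yellow_taxi_rides.low_amount',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='earliest',
    group_id='branch.reader.demo',               # placeholder group id, not from the repo
    value_deserializer=lambda x: loads(x.decode('utf-8')))

for message in consumer:
    # message.topic shows which price branch the ride was routed to
    print(message.topic, message.value)
```

Since `stream.py` and `branch_price.py` both call `app.main()`, each can presumably be started as a Faust worker (e.g. `python branch_price.py worker -l info`) once `producer_taxi_json.py` is publishing rides to `datatalkclub.yellow_taxi_ride.json`.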