├── LICENSE
├── README.md
├── week1
│   ├── .gitignore
│   ├── .ipynb_checkpoints
│   │   └── postgres_connection-checkpoint.ipynb
│   ├── Dockerfile
│   ├── README.md
│   ├── docker-compose.yaml
│   ├── ingest_data.py
│   ├── postgres_connection.ipynb
│   └── terraform
│       ├── .terraform-version
│       ├── main.tf
│       └── variables.tf
├── week2
│   ├── .gitignore
│   ├── README.md
│   └── airflow
│       ├── Dockerfile
│       ├── dags
│       │   └── dag_ingestion_gcs.py
│       ├── docker-compose.yaml
│       └── requirements.txt
├── week3
│   ├── .gitignore
│   ├── README.md
│   └── airflow
│       ├── dags
│       │   ├── __pycache__
│       │   │   └── gcp_to_bq_dag.cpython-37.pyc
│       │   └── gcp_to_bq_dag.py
│       └── docker-compose.yaml
├── week4
│   ├── README.md
│   ├── data_to_gcs
│   │   ├── .gitignore
│   │   └── upload_to_gcs.py
│   └── dbt
│       ├── .gitignore
│       ├── analyses
│       │   └── .gitkeep
│       ├── data
│       │   └── taxi_zone.csv
│       ├── dbt_project.yml
│       ├── macros
│       │   ├── .gitkeep
│       │   └── get_payment_type_description.sql
│       ├── models
│       │   ├── core
│       │   │   ├── dim_zones.sql
│       │   │   ├── dm_monthly_zone_revenue.sql
│       │   │   ├── fact_trips.sql
│       │   │   └── schema.yml
│       │   └── staging
│       │       ├── schema.yml
│       │       ├── stg_green_tripdata.sql
│       │       └── stg_yellow_tripdata.sql
│       ├── packages.yml
│       ├── profiles.yml
│       ├── seeds
│       │   └── .gitkeep
│       ├── snapshots
│       │   └── .gitkeep
│       └── tests
│           └── .gitkeep
└── week6
    ├── README.md
    ├── avro_example
    │   ├── consumer.py
    │   ├── data
    │   │   └── rides_new.csv
    │   ├── producer.py
    │   ├── taxi_ride_key.avsc
    │   └── taxi_ride_value.avsc
    ├── consumer.py
    ├── docker-compose.yml
    ├── producer.py
    ├── requirements.txt
    └── streams
        ├── __pycache__
        │   └── taxi_rides.cpython-37.pyc
        ├── branch_price.py
        ├── producer_taxi_json.py
        ├── stream.py
        └── taxi_rides.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Pedro C.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### data-engineering-bootcamp
2 | This repo contains all the material developed during the 9-week bootcamp provided by DPhi in collaboration with DataTalks.Club
3 | - Link: https://dphi.tech
4 | - Notes: https://pcrespoo.notion.site/Tech-Data-Engineering-Bootcamp-475acaace06042188da8600e1e45d7f5
5 | - **Updates**: the repo is updated by the end of the week in progress
6 |
7 |
8 | ### Topics
9 | - Week 1: Docker, Docker Compose, GCP and Terraform
10 | - Week 2: Airflow, Data Ingestion to Google Cloud Storage
11 | - Week 3: BigQuery, Partitioned and Clustered tables, Airflow, how to move files in Google Cloud Storage
12 | - Week 4: dbt and Google Data Studio
13 | - Week 5: Apache Spark (not yet implemented)
14 | - Week 6: Kafka
15 |
--------------------------------------------------------------------------------
/week1/.gitignore:
--------------------------------------------------------------------------------
1 | terraform/.terraform/
2 | ny_taxi_postgres_data/
3 | *.csv
4 | *.parquet
5 | terraform/terraform.tfstate
6 | terraform/terraform.tfstate.backup
7 | terraform/.terraform.lock.hcl
8 |
9 |
--------------------------------------------------------------------------------
/week1/.ipynb_checkpoints/postgres_connection-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "data": {
10 | "text/plain": [
11 | "'0.25.1'"
12 | ]
13 | },
14 | "execution_count": 3,
15 | "metadata": {},
16 | "output_type": "execute_result"
17 | }
18 | ],
19 | "source": [
20 | "import pandas as pd\n",
21 | "pd.__version__"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 4,
27 | "metadata": {},
28 | "outputs": [
29 | {
30 | "data": {
31 | "text/html": [
32 | "
\n",
33 | "\n",
46 | "
\n",
47 | " \n",
48 | " \n",
49 | " | \n",
50 | " VendorID | \n",
51 | " tpep_pickup_datetime | \n",
52 | " tpep_dropoff_datetime | \n",
53 | " passenger_count | \n",
54 | " trip_distance | \n",
55 | " RatecodeID | \n",
56 | " store_and_fwd_flag | \n",
57 | " PULocationID | \n",
58 | " DOLocationID | \n",
59 | " payment_type | \n",
60 | " fare_amount | \n",
61 | " extra | \n",
62 | " mta_tax | \n",
63 | " tip_amount | \n",
64 | " tolls_amount | \n",
65 | " improvement_surcharge | \n",
66 | " total_amount | \n",
67 | " congestion_surcharge | \n",
68 | " airport_fee | \n",
69 | "
\n",
70 | " \n",
71 | " \n",
72 | " \n",
73 | " 0 | \n",
74 | " 1 | \n",
75 | " 2021-01-01 00:30:10 | \n",
76 | " 2021-01-01 00:36:12 | \n",
77 | " 1.0 | \n",
78 | " 2.10 | \n",
79 | " 1.0 | \n",
80 | " N | \n",
81 | " 142 | \n",
82 | " 43 | \n",
83 | " 2 | \n",
84 | " 8.00 | \n",
85 | " 3.00 | \n",
86 | " 0.5 | \n",
87 | " 0.00 | \n",
88 | " 0.00 | \n",
89 | " 0.3 | \n",
90 | " 11.80 | \n",
91 | " 2.5 | \n",
92 | " NaN | \n",
93 | "
\n",
94 | " \n",
95 | " 1 | \n",
96 | " 1 | \n",
97 | " 2021-01-01 00:51:20 | \n",
98 | " 2021-01-01 00:52:19 | \n",
99 | " 1.0 | \n",
100 | " 0.20 | \n",
101 | " 1.0 | \n",
102 | " N | \n",
103 | " 238 | \n",
104 | " 151 | \n",
105 | " 2 | \n",
106 | " 3.00 | \n",
107 | " 0.50 | \n",
108 | " 0.5 | \n",
109 | " 0.00 | \n",
110 | " 0.00 | \n",
111 | " 0.3 | \n",
112 | " 4.30 | \n",
113 | " 0.0 | \n",
114 | " NaN | \n",
115 | "
\n",
116 | " \n",
117 | " 2 | \n",
118 | " 1 | \n",
119 | " 2021-01-01 00:43:30 | \n",
120 | " 2021-01-01 01:11:06 | \n",
121 | " 1.0 | \n",
122 | " 14.70 | \n",
123 | " 1.0 | \n",
124 | " N | \n",
125 | " 132 | \n",
126 | " 165 | \n",
127 | " 1 | \n",
128 | " 42.00 | \n",
129 | " 0.50 | \n",
130 | " 0.5 | \n",
131 | " 8.65 | \n",
132 | " 0.00 | \n",
133 | " 0.3 | \n",
134 | " 51.95 | \n",
135 | " 0.0 | \n",
136 | " NaN | \n",
137 | "
\n",
138 | " \n",
139 | " 3 | \n",
140 | " 1 | \n",
141 | " 2021-01-01 00:15:48 | \n",
142 | " 2021-01-01 00:31:01 | \n",
143 | " 0.0 | \n",
144 | " 10.60 | \n",
145 | " 1.0 | \n",
146 | " N | \n",
147 | " 138 | \n",
148 | " 132 | \n",
149 | " 1 | \n",
150 | " 29.00 | \n",
151 | " 0.50 | \n",
152 | " 0.5 | \n",
153 | " 6.05 | \n",
154 | " 0.00 | \n",
155 | " 0.3 | \n",
156 | " 36.35 | \n",
157 | " 0.0 | \n",
158 | " NaN | \n",
159 | "
\n",
160 | " \n",
161 | " 4 | \n",
162 | " 2 | \n",
163 | " 2021-01-01 00:31:49 | \n",
164 | " 2021-01-01 00:48:21 | \n",
165 | " 1.0 | \n",
166 | " 4.94 | \n",
167 | " 1.0 | \n",
168 | " N | \n",
169 | " 68 | \n",
170 | " 33 | \n",
171 | " 1 | \n",
172 | " 16.50 | \n",
173 | " 0.50 | \n",
174 | " 0.5 | \n",
175 | " 4.06 | \n",
176 | " 0.00 | \n",
177 | " 0.3 | \n",
178 | " 24.36 | \n",
179 | " 2.5 | \n",
180 | " NaN | \n",
181 | "
\n",
182 | " \n",
183 | " ... | \n",
184 | " ... | \n",
185 | " ... | \n",
186 | " ... | \n",
187 | " ... | \n",
188 | " ... | \n",
189 | " ... | \n",
190 | " ... | \n",
191 | " ... | \n",
192 | " ... | \n",
193 | " ... | \n",
194 | " ... | \n",
195 | " ... | \n",
196 | " ... | \n",
197 | " ... | \n",
198 | " ... | \n",
199 | " ... | \n",
200 | " ... | \n",
201 | " ... | \n",
202 | " ... | \n",
203 | "
\n",
204 | " \n",
205 | " 1369764 | \n",
206 | " 2 | \n",
207 | " 2021-01-31 23:03:00 | \n",
208 | " 2021-01-31 23:33:00 | \n",
209 | " NaN | \n",
210 | " 8.89 | \n",
211 | " NaN | \n",
212 | " None | \n",
213 | " 229 | \n",
214 | " 181 | \n",
215 | " 0 | \n",
216 | " 27.78 | \n",
217 | " 0.00 | \n",
218 | " 0.5 | \n",
219 | " 7.46 | \n",
220 | " 0.00 | \n",
221 | " 0.3 | \n",
222 | " 38.54 | \n",
223 | " NaN | \n",
224 | " NaN | \n",
225 | "
\n",
226 | " \n",
227 | " 1369765 | \n",
228 | " 2 | \n",
229 | " 2021-01-31 23:29:00 | \n",
230 | " 2021-01-31 23:51:00 | \n",
231 | " NaN | \n",
232 | " 7.43 | \n",
233 | " NaN | \n",
234 | " None | \n",
235 | " 41 | \n",
236 | " 70 | \n",
237 | " 0 | \n",
238 | " 32.58 | \n",
239 | " 0.00 | \n",
240 | " 0.5 | \n",
241 | " 0.00 | \n",
242 | " 6.12 | \n",
243 | " 0.3 | \n",
244 | " 39.50 | \n",
245 | " NaN | \n",
246 | " NaN | \n",
247 | "
\n",
248 | " \n",
249 | " 1369766 | \n",
250 | " 2 | \n",
251 | " 2021-01-31 23:25:00 | \n",
252 | " 2021-01-31 23:38:00 | \n",
253 | " NaN | \n",
254 | " 6.26 | \n",
255 | " NaN | \n",
256 | " None | \n",
257 | " 74 | \n",
258 | " 137 | \n",
259 | " 0 | \n",
260 | " 16.85 | \n",
261 | " 0.00 | \n",
262 | " 0.5 | \n",
263 | " 3.90 | \n",
264 | " 0.00 | \n",
265 | " 0.3 | \n",
266 | " 24.05 | \n",
267 | " NaN | \n",
268 | " NaN | \n",
269 | "
\n",
270 | " \n",
271 | " 1369767 | \n",
272 | " 6 | \n",
273 | " 2021-01-31 23:01:06 | \n",
274 | " 2021-02-01 00:02:03 | \n",
275 | " NaN | \n",
276 | " 19.70 | \n",
277 | " NaN | \n",
278 | " None | \n",
279 | " 265 | \n",
280 | " 188 | \n",
281 | " 0 | \n",
282 | " 53.68 | \n",
283 | " 0.00 | \n",
284 | " 0.5 | \n",
285 | " 0.00 | \n",
286 | " 0.00 | \n",
287 | " 0.3 | \n",
288 | " 54.48 | \n",
289 | " NaN | \n",
290 | " NaN | \n",
291 | "
\n",
292 | " \n",
293 | " 1369768 | \n",
294 | " 2 | \n",
295 | " 2021-01-31 23:08:29 | \n",
296 | " 2021-01-31 23:31:22 | \n",
297 | " NaN | \n",
298 | " 4.68 | \n",
299 | " NaN | \n",
300 | " None | \n",
301 | " 89 | \n",
302 | " 61 | \n",
303 | " 0 | \n",
304 | " 25.45 | \n",
305 | " 2.75 | \n",
306 | " 0.5 | \n",
307 | " 0.00 | \n",
308 | " 0.00 | \n",
309 | " 0.3 | \n",
310 | " 29.00 | \n",
311 | " NaN | \n",
312 | " NaN | \n",
313 | "
\n",
314 | " \n",
315 | "
\n",
316 | "
1369769 rows × 19 columns
\n",
317 | "
"
318 | ],
319 | "text/plain": [
320 | " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n",
321 | "0 1 2021-01-01 00:30:10 2021-01-01 00:36:12 1.0 \n",
322 | "1 1 2021-01-01 00:51:20 2021-01-01 00:52:19 1.0 \n",
323 | "2 1 2021-01-01 00:43:30 2021-01-01 01:11:06 1.0 \n",
324 | "3 1 2021-01-01 00:15:48 2021-01-01 00:31:01 0.0 \n",
325 | "4 2 2021-01-01 00:31:49 2021-01-01 00:48:21 1.0 \n",
326 | "... ... ... ... ... \n",
327 | "1369764 2 2021-01-31 23:03:00 2021-01-31 23:33:00 NaN \n",
328 | "1369765 2 2021-01-31 23:29:00 2021-01-31 23:51:00 NaN \n",
329 | "1369766 2 2021-01-31 23:25:00 2021-01-31 23:38:00 NaN \n",
330 | "1369767 6 2021-01-31 23:01:06 2021-02-01 00:02:03 NaN \n",
331 | "1369768 2 2021-01-31 23:08:29 2021-01-31 23:31:22 NaN \n",
332 | "\n",
333 | " trip_distance RatecodeID store_and_fwd_flag PULocationID \\\n",
334 | "0 2.10 1.0 N 142 \n",
335 | "1 0.20 1.0 N 238 \n",
336 | "2 14.70 1.0 N 132 \n",
337 | "3 10.60 1.0 N 138 \n",
338 | "4 4.94 1.0 N 68 \n",
339 | "... ... ... ... ... \n",
340 | "1369764 8.89 NaN None 229 \n",
341 | "1369765 7.43 NaN None 41 \n",
342 | "1369766 6.26 NaN None 74 \n",
343 | "1369767 19.70 NaN None 265 \n",
344 | "1369768 4.68 NaN None 89 \n",
345 | "\n",
346 | " DOLocationID payment_type fare_amount extra mta_tax tip_amount \\\n",
347 | "0 43 2 8.00 3.00 0.5 0.00 \n",
348 | "1 151 2 3.00 0.50 0.5 0.00 \n",
349 | "2 165 1 42.00 0.50 0.5 8.65 \n",
350 | "3 132 1 29.00 0.50 0.5 6.05 \n",
351 | "4 33 1 16.50 0.50 0.5 4.06 \n",
352 | "... ... ... ... ... ... ... \n",
353 | "1369764 181 0 27.78 0.00 0.5 7.46 \n",
354 | "1369765 70 0 32.58 0.00 0.5 0.00 \n",
355 | "1369766 137 0 16.85 0.00 0.5 3.90 \n",
356 | "1369767 188 0 53.68 0.00 0.5 0.00 \n",
357 | "1369768 61 0 25.45 2.75 0.5 0.00 \n",
358 | "\n",
359 | " tolls_amount improvement_surcharge total_amount \\\n",
360 | "0 0.00 0.3 11.80 \n",
361 | "1 0.00 0.3 4.30 \n",
362 | "2 0.00 0.3 51.95 \n",
363 | "3 0.00 0.3 36.35 \n",
364 | "4 0.00 0.3 24.36 \n",
365 | "... ... ... ... \n",
366 | "1369764 0.00 0.3 38.54 \n",
367 | "1369765 6.12 0.3 39.50 \n",
368 | "1369766 0.00 0.3 24.05 \n",
369 | "1369767 0.00 0.3 54.48 \n",
370 | "1369768 0.00 0.3 29.00 \n",
371 | "\n",
372 | " congestion_surcharge airport_fee \n",
373 | "0 2.5 NaN \n",
374 | "1 0.0 NaN \n",
375 | "2 0.0 NaN \n",
376 | "3 0.0 NaN \n",
377 | "4 2.5 NaN \n",
378 | "... ... ... \n",
379 | "1369764 NaN NaN \n",
380 | "1369765 NaN NaN \n",
381 | "1369766 NaN NaN \n",
382 | "1369767 NaN NaN \n",
383 | "1369768 NaN NaN \n",
384 | "\n",
385 | "[1369769 rows x 19 columns]"
386 | ]
387 | },
388 | "execution_count": 4,
389 | "metadata": {},
390 | "output_type": "execute_result"
391 | }
392 | ],
393 | "source": [
394 | "pd.read_parquet('yellow_tripdata_2021-01.parquet', engine='pyarrow')\n"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": null,
400 | "metadata": {},
401 | "outputs": [],
402 | "source": []
403 | }
404 | ],
405 | "metadata": {
406 | "kernelspec": {
407 | "display_name": "Python 3",
408 | "language": "python",
409 | "name": "python3"
410 | },
411 | "language_info": {
412 | "codemirror_mode": {
413 | "name": "ipython",
414 | "version": 3
415 | },
416 | "file_extension": ".py",
417 | "mimetype": "text/x-python",
418 | "name": "python",
419 | "nbconvert_exporter": "python",
420 | "pygments_lexer": "ipython3",
421 | "version": "3.7.4"
422 | }
423 | },
424 | "nbformat": 4,
425 | "nbformat_minor": 2
426 | }
427 |
--------------------------------------------------------------------------------
/week1/Dockerfile:
--------------------------------------------------------------------------------
1 | #how to create a new docker image
2 |
3 | #specify the python version of the image
4 | FROM python:3.9
5 |
6 | #install all the dependencies of our application
7 | RUN apt-get update && apt-get install -y wget
8 | RUN pip install pandas sqlalchemy psycopg2 pyarrow
9 |
10 | #create an /app folder where ingest_data.py will be stored
11 | WORKDIR /app
12 | COPY ingest_data.py ingest_data.py
13 |
14 | #run the ingestion script when a container starts from this image
15 | ENTRYPOINT [ "python", "ingest_data.py"]
16 |
17 |
--------------------------------------------------------------------------------
/week1/README.md:
--------------------------------------------------------------------------------
1 | ## Topics covered in week 1:
2 | - Docker
3 | - Docker Compose
4 | - PostgreSQL
5 | - pgAdmin
6 | - pgcli
7 | - Terraform
8 | - Google Cloud Platform
9 |
10 | ### Notes:
11 | Notion page: https://www.notion.so/pcrespoo/Week-1-adea5a2dbf0f44e49749238957f99754
12 |
13 | ### Commands learned in week 1:
14 |
15 | - dataset: https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.parquet
16 |
17 | - build an image:
18 | ```
19 | docker build -t IMAGE_NAME .
20 | ```
21 | - run an image:
22 | ```
23 | docker run -it IMAGE_NAME
24 | ```
25 | - run a Postgres container:
26 | ```
27 | docker run -it \
28 | -e POSTGRES_USER='root' \
29 | -e POSTGRES_PASSWORD='root' \
30 | -e POSTGRES_DB='ny_taxi' \
31 | -v "YOUR_PATH/ny_taxi_postgres_data:/var/lib/postgresql/data" \
32 | -p 5432:5432 \
33 | postgres:13
34 | ```
35 |
36 | - how to connect to a postgres database with pgcli:
37 | ```
38 | pgcli -h localhost -p 5432 -u root -d ny_taxi
39 | ```
40 |
41 | - run a pgAdmin container:
42 | ```
43 | docker run -it \
44 | -e PGADMIN_DEFAULT_EMAIL=admin@admin.com \
45 | -e PGADMIN_DEFAULT_PASSWORD="root" \
46 | -p 8080:80 \
47 | dpage/pgadmin4
48 | ```
49 |     - 8080: the port on the local machine
50 |     - 80: the port pgAdmin listens on inside the container
51 |     - 8080:80 maps the local port to the container port, so pgAdmin is reachable at http://localhost:8080
52 |
53 | - create a docker network:
54 | ```
55 | docker network create pedro_network
56 | ```
57 |
58 | - run the Postgres container again, now attached to the network and with a name:
59 | ```
60 | docker run -it \
61 | -e POSTGRES_USER='root' \
62 | -e POSTGRES_PASSWORD='root' \
63 | -e POSTGRES_DB='ny_taxi' \
64 | -v "YOUR_PATH/ny_taxi_postgres_data:/var/lib/postgresql/data" \
65 | -p 5432:5432 \
66 | --network=pedro_network \
67 | --name pg-database-teste \
68 | postgres:13
69 | ```
70 |
71 | - run the pgAdmin container again with the network settings:
72 | ```
73 | docker run -it \
74 | -e PGADMIN_DEFAULT_EMAIL=admin@admin.com \
75 | -e PGADMIN_DEFAULT_PASSWORD="root" \
76 | -p 8080:80 \
77 | --network=pedro_network \
78 | --name pgAdmin-bootcamp \
79 | dpage/pgadmin4
80 | ```
81 |
82 | - build an image for the data ingestion process:
83 | ```
84 | docker build -t taxi_ingest:v001 .
85 | ```
86 |
87 | - run the docker image for data ingestion using the same network used by pgAdmin and PostgreSQL:
88 | ```
89 | docker run -it \
90 | --network=pedro_network \
91 | taxi_ingest:v001 \
92 | --user=root \
93 | --password=root \
94 |     --host=pg-database-teste \
95 | --port=5432 \
96 | --db=ny_taxi \
97 | --table_name=yellow_taxi_data \
98 | --url="https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.parquet"
99 | ```
100 |
101 | - Docker Compose:
102 | ```
103 | docker-compose up
104 | ```
105 |
106 | - Terraform (see the full command sequence below):
107 |     - terraform init: initializes the working directory, downloading the provider plugins and configuring the backend that stores the state file
108 | 
109 |     - terraform plan: compares the configuration with the current state and shows which resources will be created, changed or destroyed
110 | 
111 |     - terraform apply: executes the plan and creates all the resources for us
112 |         - if we decide to add more resources during development, we just declare them in the main file and run
113 |           "terraform plan" and "terraform apply" again to apply the changes
114 | 
115 | 
116 |     - terraform destroy: removes all the resources managed by the state file
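117 | 
118 | - Terraform command sequence, a minimal sketch for this repo's setup (the `project` variable in variables.tf has no default, so pass your own GCP project id; the value below is just a placeholder):
119 | ```
120 | cd week1/terraform
121 | terraform init
122 | terraform plan -var="project=YOUR_GCP_PROJECT_ID"
123 | terraform apply -var="project=YOUR_GCP_PROJECT_ID"
124 | terraform destroy -var="project=YOUR_GCP_PROJECT_ID"
125 | ```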
--------------------------------------------------------------------------------
/week1/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | services:
2 | pgdatabase:
3 | image: postgres:13
4 | environment:
5 | - POSTGRES_USER=root
6 | - POSTGRES_PASSWORD=root
7 | - POSTGRES_DB=ny_taxi
8 | volumes:
9 | - "./ny_taxi_postgres_data:/var/lib/postgresql/data:rw"
10 | ports:
11 | - "5432:5432"
12 | pgadmin:
13 | image: dpage/pgadmin4
14 | environment:
15 | - PGADMIN_DEFAULT_EMAIL=admin@admin.com
16 | - PGADMIN_DEFAULT_PASSWORD=root
17 | ports:
18 | - "8080:80"
--------------------------------------------------------------------------------
/week1/ingest_data.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | from sqlalchemy import create_engine
5 | import pandas as pd
6 | from time import time
7 | import argparse
8 | import os
9 |
10 | def main(params):
11 | user = params.user
12 | password = params.password
13 | database = params.db
14 | url = params.url
15 | host = params.host
16 | table_name = params.table_name
17 | port = params.port
18 |
19 | #download the parquet and convert to csv
20 | parquet_name = 'output.parquet'
21 | os.system(f'wget {url} -O {parquet_name}')
22 | csv_name = 'output.csv'
23 |
24 | df_parquet = pd.read_parquet(parquet_name, engine='pyarrow')
25 | df_parquet['tpep_pickup_datetime'] = pd.to_datetime(df_parquet['tpep_pickup_datetime'])
26 | df_parquet['tpep_dropoff_datetime'] = pd.to_datetime(df_parquet['tpep_dropoff_datetime'])
27 | df_parquet.to_csv(csv_name,sep=';')
28 |
29 | #create a conn with Postgres
30 | engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{database}')
31 |
32 | #read data in chunks
33 | df_iter = pd.read_csv(csv_name,sep=';',iterator=True, chunksize=100000,index_col=0)
34 |
35 | #create a first chunk
36 | df = next(df_iter)
37 |
38 | #adjust date columns
39 | df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
40 | df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
41 |
42 |     #create the table schema and insert the first chunk
43 |     df.head(0).to_sql(con=engine, name=table_name, if_exists='replace')
44 |     df.to_sql(con=engine, name=table_name, if_exists='append')
45 | 
46 |     #insert the remaining chunks of data into the table
47 |     while True:
48 |         t_start = time()
49 |         try:
50 |             df = next(df_iter)
51 |         except StopIteration:
52 |             #iterator exhausted: all chunks have been inserted
53 |             break
54 | 
55 |         df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
56 |         df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
57 |         df.to_sql(con=engine, name=table_name, if_exists='append')
58 |         print(f'chunk insertion took {time() - t_start:.1f} seconds')
59 |
60 | if __name__ == '__main__':
61 | #parse the CLI parameters
62 | parser = argparse.ArgumentParser(description='Ingest csv data to postgresql')
63 | parser.add_argument('--user', help='username for postgresql')
64 | parser.add_argument('--password', help='password for postgresql')
65 | parser.add_argument('--port', help='port for postgresql')
66 | parser.add_argument('--host', help='host for postgresql')
67 | parser.add_argument('--db', help='database name')
68 | parser.add_argument('--table_name', help='name of the table')
69 |     parser.add_argument('--url', help='url of the parquet file to ingest')
70 | args = parser.parse_args()
71 |
72 | main(args)
73 |
--------------------------------------------------------------------------------
/week1/postgres_connection.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "data": {
10 | "text/plain": [
11 | "'0.25.1'"
12 | ]
13 | },
14 | "execution_count": 1,
15 | "metadata": {},
16 | "output_type": "execute_result"
17 | }
18 | ],
19 | "source": [
20 | "import pandas as pd\n",
21 | "pd.__version__"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "#read the original data and convert to csv before proceeding\n",
31 | "df = pd.read_parquet('yellow_tripdata_2021-01.parquet', engine='pyarrow')\n",
32 | "df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])\n",
33 | "df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])\n",
34 | "df.to_csv('yellow_tripdata_2021-01.csv',sep=';')"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 3,
40 | "metadata": {},
41 | "outputs": [
42 | {
43 | "name": "stderr",
44 | "output_type": "stream",
45 | "text": [
46 | "C:\\Users\\pedro\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3058: DtypeWarning: Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.\n",
47 | " interactivity=interactivity, compiler=compiler, result=result)\n",
48 | "C:\\Users\\pedro\\Anaconda3\\lib\\site-packages\\numpy\\lib\\arraysetops.py:569: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n",
49 | " mask |= (ar1 == a)\n"
50 | ]
51 | },
52 | {
53 | "data": {
54 | "text/html": [
55 | "\n",
56 | "\n",
69 | "
\n",
70 | " \n",
71 | " \n",
72 | " | \n",
73 | " VendorID | \n",
74 | " tpep_pickup_datetime | \n",
75 | " tpep_dropoff_datetime | \n",
76 | " passenger_count | \n",
77 | " trip_distance | \n",
78 | " RatecodeID | \n",
79 | " store_and_fwd_flag | \n",
80 | " PULocationID | \n",
81 | " DOLocationID | \n",
82 | " payment_type | \n",
83 | " fare_amount | \n",
84 | " extra | \n",
85 | " mta_tax | \n",
86 | " tip_amount | \n",
87 | " tolls_amount | \n",
88 | " improvement_surcharge | \n",
89 | " total_amount | \n",
90 | " congestion_surcharge | \n",
91 | " airport_fee | \n",
92 | "
\n",
93 | " \n",
94 | " \n",
95 | " \n",
96 | " 0 | \n",
97 | " 1 | \n",
98 | " 2021-01-01 00:30:10 | \n",
99 | " 2021-01-01 00:36:12 | \n",
100 | " 1.0 | \n",
101 | " 2.10 | \n",
102 | " 1.0 | \n",
103 | " N | \n",
104 | " 142 | \n",
105 | " 43 | \n",
106 | " 2 | \n",
107 | " 8.0 | \n",
108 | " 3.0 | \n",
109 | " 0.5 | \n",
110 | " 0.00 | \n",
111 | " 0.0 | \n",
112 | " 0.3 | \n",
113 | " 11.80 | \n",
114 | " 2.5 | \n",
115 | " NaN | \n",
116 | "
\n",
117 | " \n",
118 | " 1 | \n",
119 | " 1 | \n",
120 | " 2021-01-01 00:51:20 | \n",
121 | " 2021-01-01 00:52:19 | \n",
122 | " 1.0 | \n",
123 | " 0.20 | \n",
124 | " 1.0 | \n",
125 | " N | \n",
126 | " 238 | \n",
127 | " 151 | \n",
128 | " 2 | \n",
129 | " 3.0 | \n",
130 | " 0.5 | \n",
131 | " 0.5 | \n",
132 | " 0.00 | \n",
133 | " 0.0 | \n",
134 | " 0.3 | \n",
135 | " 4.30 | \n",
136 | " 0.0 | \n",
137 | " NaN | \n",
138 | "
\n",
139 | " \n",
140 | " 2 | \n",
141 | " 1 | \n",
142 | " 2021-01-01 00:43:30 | \n",
143 | " 2021-01-01 01:11:06 | \n",
144 | " 1.0 | \n",
145 | " 14.70 | \n",
146 | " 1.0 | \n",
147 | " N | \n",
148 | " 132 | \n",
149 | " 165 | \n",
150 | " 1 | \n",
151 | " 42.0 | \n",
152 | " 0.5 | \n",
153 | " 0.5 | \n",
154 | " 8.65 | \n",
155 | " 0.0 | \n",
156 | " 0.3 | \n",
157 | " 51.95 | \n",
158 | " 0.0 | \n",
159 | " NaN | \n",
160 | "
\n",
161 | " \n",
162 | " 3 | \n",
163 | " 1 | \n",
164 | " 2021-01-01 00:15:48 | \n",
165 | " 2021-01-01 00:31:01 | \n",
166 | " 0.0 | \n",
167 | " 10.60 | \n",
168 | " 1.0 | \n",
169 | " N | \n",
170 | " 138 | \n",
171 | " 132 | \n",
172 | " 1 | \n",
173 | " 29.0 | \n",
174 | " 0.5 | \n",
175 | " 0.5 | \n",
176 | " 6.05 | \n",
177 | " 0.0 | \n",
178 | " 0.3 | \n",
179 | " 36.35 | \n",
180 | " 0.0 | \n",
181 | " NaN | \n",
182 | "
\n",
183 | " \n",
184 | " 4 | \n",
185 | " 2 | \n",
186 | " 2021-01-01 00:31:49 | \n",
187 | " 2021-01-01 00:48:21 | \n",
188 | " 1.0 | \n",
189 | " 4.94 | \n",
190 | " 1.0 | \n",
191 | " N | \n",
192 | " 68 | \n",
193 | " 33 | \n",
194 | " 1 | \n",
195 | " 16.5 | \n",
196 | " 0.5 | \n",
197 | " 0.5 | \n",
198 | " 4.06 | \n",
199 | " 0.0 | \n",
200 | " 0.3 | \n",
201 | " 24.36 | \n",
202 | " 2.5 | \n",
203 | " NaN | \n",
204 | "
\n",
205 | " \n",
206 | "
\n",
207 | "
"
208 | ],
209 | "text/plain": [
210 | " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n",
211 | "0 1 2021-01-01 00:30:10 2021-01-01 00:36:12 1.0 \n",
212 | "1 1 2021-01-01 00:51:20 2021-01-01 00:52:19 1.0 \n",
213 | "2 1 2021-01-01 00:43:30 2021-01-01 01:11:06 1.0 \n",
214 | "3 1 2021-01-01 00:15:48 2021-01-01 00:31:01 0.0 \n",
215 | "4 2 2021-01-01 00:31:49 2021-01-01 00:48:21 1.0 \n",
216 | "\n",
217 | " trip_distance RatecodeID store_and_fwd_flag PULocationID DOLocationID \\\n",
218 | "0 2.10 1.0 N 142 43 \n",
219 | "1 0.20 1.0 N 238 151 \n",
220 | "2 14.70 1.0 N 132 165 \n",
221 | "3 10.60 1.0 N 138 132 \n",
222 | "4 4.94 1.0 N 68 33 \n",
223 | "\n",
224 | " payment_type fare_amount extra mta_tax tip_amount tolls_amount \\\n",
225 | "0 2 8.0 3.0 0.5 0.00 0.0 \n",
226 | "1 2 3.0 0.5 0.5 0.00 0.0 \n",
227 | "2 1 42.0 0.5 0.5 8.65 0.0 \n",
228 | "3 1 29.0 0.5 0.5 6.05 0.0 \n",
229 | "4 1 16.5 0.5 0.5 4.06 0.0 \n",
230 | "\n",
231 | " improvement_surcharge total_amount congestion_surcharge airport_fee \n",
232 | "0 0.3 11.80 2.5 NaN \n",
233 | "1 0.3 4.30 0.0 NaN \n",
234 | "2 0.3 51.95 0.0 NaN \n",
235 | "3 0.3 36.35 0.0 NaN \n",
236 | "4 0.3 24.36 2.5 NaN "
237 | ]
238 | },
239 | "execution_count": 3,
240 | "metadata": {},
241 | "output_type": "execute_result"
242 | }
243 | ],
244 | "source": [
245 | "#read csv\n",
246 | "df = pd.read_csv('yellow_tripdata_2021-01.csv',sep=';',index_col=0)\n",
247 | "df.head()"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 4,
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "#read data in chunks\n",
257 | "df_iter = df = pd.read_csv('yellow_tripdata_2021-01.csv',sep=';',iterator=True, chunksize=100000,index_col=0)\n",
258 | "\n",
259 | "#create a first chunk\n",
260 | "df = next(df_iter)\n",
261 | "\n",
262 | "#adjust date columns \n",
263 | "df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])\n",
264 | "df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 5,
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "#create a conn with Postgres\n",
274 | "from sqlalchemy import create_engine\n",
275 | "engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 6,
281 | "metadata": {},
282 | "outputs": [
283 | {
284 | "name": "stdout",
285 | "output_type": "stream",
286 | "text": [
287 | "\n",
288 | "CREATE TABLE yellow_taxi_data (\n",
289 | "\t\"VendorID\" BIGINT, \n",
290 | "\ttpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, \n",
291 | "\ttpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, \n",
292 | "\tpassenger_count FLOAT(53), \n",
293 | "\ttrip_distance FLOAT(53), \n",
294 | "\t\"RatecodeID\" FLOAT(53), \n",
295 | "\tstore_and_fwd_flag TEXT, \n",
296 | "\t\"PULocationID\" BIGINT, \n",
297 | "\t\"DOLocationID\" BIGINT, \n",
298 | "\tpayment_type BIGINT, \n",
299 | "\tfare_amount FLOAT(53), \n",
300 | "\textra FLOAT(53), \n",
301 | "\tmta_tax FLOAT(53), \n",
302 | "\ttip_amount FLOAT(53), \n",
303 | "\ttolls_amount FLOAT(53), \n",
304 | "\timprovement_surcharge FLOAT(53), \n",
305 | "\ttotal_amount FLOAT(53), \n",
306 | "\tcongestion_surcharge FLOAT(53), \n",
307 | "\tairport_fee FLOAT(53)\n",
308 | ")\n",
309 | "\n",
310 | "\n"
311 | ]
312 | }
313 | ],
314 | "source": [
315 | "#sample of schema of the table to be created on Postgres\n",
316 | "print(pd.io.sql.get_schema(df,name='yellow_taxi_data',con=engine))"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 7,
322 | "metadata": {},
323 | "outputs": [],
324 | "source": [
325 | "#create table\n",
326 | "df.head(0).to_sql(con=engine, name='yellow_taxi_data', if_exists='replace')"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 8,
332 | "metadata": {},
333 | "outputs": [
334 | {
335 | "name": "stdout",
336 | "output_type": "stream",
337 | "text": [
338 | "chunk insertion took 140.19736647605896\n",
339 | "chunk insertion took 122.36996746063232\n",
340 | "chunk insertion took 126.62409663200378\n",
341 | "chunk insertion took 127.74477791786194\n",
342 | "chunk insertion took 132.0837414264679\n",
343 | "chunk insertion took 125.15627336502075\n",
344 | "chunk insertion took 137.10601329803467\n",
345 | "chunk insertion took 88.83106231689453\n",
346 | "chunk insertion took 87.03130531311035\n",
347 | "chunk insertion took 86.5908088684082\n",
348 | "chunk insertion took 87.42471408843994\n",
349 | "chunk insertion took 91.67677354812622\n",
350 | "chunk insertion took 59.76935338973999\n"
351 | ]
352 | },
353 | {
354 | "ename": "StopIteration",
355 | "evalue": "",
356 | "output_type": "error",
357 | "traceback": [
358 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
359 | "\u001b[1;31mStopIteration\u001b[0m Traceback (most recent call last)",
360 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mt_start\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_iter\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'tpep_pickup_datetime'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_datetime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'tpep_pickup_datetime'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
361 | "\u001b[1;32mC:\\Users\\pedro\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__next__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1126\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__next__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1127\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1128\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_chunk\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1129\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1130\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
362 | "\u001b[1;32mC:\\Users\\pedro\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mget_chunk\u001b[1;34m(self, size)\u001b[0m\n\u001b[0;32m 1186\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1187\u001b[0m \u001b[0msize\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msize\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnrows\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_currow\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1188\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msize\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1189\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1190\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
363 | "\u001b[1;32mC:\\Users\\pedro\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, nrows)\u001b[0m\n\u001b[0;32m 1152\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1153\u001b[0m \u001b[0mnrows\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_validate_integer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"nrows\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1154\u001b[1;33m \u001b[0mret\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1155\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1156\u001b[0m \u001b[1;31m# May alter columns / col_dict\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
364 | "\u001b[1;32mC:\\Users\\pedro\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, nrows)\u001b[0m\n\u001b[0;32m 2057\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2058\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2059\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2060\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2061\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_first_chunk\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
365 | "\u001b[1;32mpandas\\_libs\\parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read\u001b[1;34m()\u001b[0m\n",
366 | "\u001b[1;32mpandas\\_libs\\parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_low_memory\u001b[1;34m()\u001b[0m\n",
367 | "\u001b[1;31mStopIteration\u001b[0m: "
368 | ]
369 | }
370 | ],
371 | "source": [
372 | "#insert chunks of data into the table\n",
373 | "from time import time\n",
374 | "while True:\n",
375 | " t_start = time()\n",
376 | " \n",
377 | " df = next(df_iter)\n",
378 | " \n",
379 | " df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])\n",
380 | " df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])\n",
381 | " \n",
382 | " df.to_sql(con=engine, name='yellow_taxi_data', if_exists='append')\n",
383 | " \n",
384 | " t_final = time()\n",
385 | " \n",
386 | " print(f'chunk insertion took {t_final - t_start}')"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": []
395 | }
396 | ],
397 | "metadata": {
398 | "kernelspec": {
399 | "display_name": "Python 3",
400 | "language": "python",
401 | "name": "python3"
402 | },
403 | "language_info": {
404 | "codemirror_mode": {
405 | "name": "ipython",
406 | "version": 3
407 | },
408 | "file_extension": ".py",
409 | "mimetype": "text/x-python",
410 | "name": "python",
411 | "nbconvert_exporter": "python",
412 | "pygments_lexer": "ipython3",
413 | "version": "3.7.4"
414 | }
415 | },
416 | "nbformat": 4,
417 | "nbformat_minor": 2
418 | }
419 |
--------------------------------------------------------------------------------
/week1/terraform/.terraform-version:
--------------------------------------------------------------------------------
1 | 1.2.0
--------------------------------------------------------------------------------
/week1/terraform/main.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_version = ">= 1.0"
3 | backend "local" {} # Can change from "local" to "gcs" (for google) or "s3" (for aws), if you would like to preserve your tf-state online
4 | required_providers {
5 | google = {
6 | source = "hashicorp/google"
7 | }
8 | }
9 | }
10 |
11 | provider "google" {
12 | project = var.project
13 | region = var.region
14 | // credentials = file(var.credentials) # Use this if you do not want to set env-var GOOGLE_APPLICATION_CREDENTIALS
15 | }
16 |
17 | # Data Lake Bucket
18 | # Ref: https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket
19 | resource "google_storage_bucket" "data-lake-bucket" {
20 | name = "${local.data_lake_bucket}_${var.project}" # Concatenating DL bucket & Project name for unique naming
21 | location = var.region
22 |
23 | # Optional, but recommended settings:
24 | storage_class = var.storage_class
25 | uniform_bucket_level_access = true
26 |
27 | versioning {
28 | enabled = true
29 | }
30 |
31 | lifecycle_rule {
32 | action {
33 | type = "Delete"
34 | }
35 | condition {
36 | age = 30 // days
37 | }
38 | }
39 |
40 | force_destroy = true
41 | }
42 |
43 | # DWH
44 | # Ref: https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/bigquery_dataset
45 | resource "google_bigquery_dataset" "dataset" {
46 | dataset_id = var.BQ_DATASET
47 | project = var.project
48 | location = var.region
49 | }
--------------------------------------------------------------------------------
/week1/terraform/variables.tf:
--------------------------------------------------------------------------------
1 | locals {
2 | data_lake_bucket = "dtc_data_lake"
3 | }
4 |
5 | variable "project" {
6 | description = "Your GCP Project ID"
7 | }
8 |
9 | variable "region" {
10 | description = "Region for GCP resources. Choose as per your location: https://cloud.google.com/about/locations"
11 | default = "southamerica-east1"
12 | type = string
13 | }
14 |
15 | variable "storage_class" {
16 | description = "Storage class type for your bucket. Check official docs for more info."
17 | default = "STANDARD"
18 | }
19 |
20 | variable "BQ_DATASET" {
21 | description = "BigQuery Dataset that raw data (from GCS) will be written to"
22 | type = string
23 | default = "trips_data_all"
24 | }
25 |
26 | variable "TABLE_NAME"{
27 | description = "BigQuery Table"
28 | type = string
29 | default = "ny_trips"
30 | }
--------------------------------------------------------------------------------
/week2/.gitignore:
--------------------------------------------------------------------------------
1 | google/
2 | .env
3 | logs/
4 | plugins/
5 | **__pycache__/
--------------------------------------------------------------------------------
/week2/README.md:
--------------------------------------------------------------------------------
1 | ## Topics covered in week 2:
2 | - Airflow
3 | - Data Ingestion to GCP with Airflow
4 |
5 | ### Notes:
6 | Notion page: https://www.notion.so/pcrespoo/Week-2-eca5926ca202477998bb0296f6487d83
7 |
8 |
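9 | ### Running the local Airflow setup
10 | A minimal sketch, assuming the Dockerfile and docker-compose.yaml in week2/airflow and a Docker host with at least 4 GB of memory available for the containers (the init service checks for this):
11 | ```
12 | cd week2/airflow
13 | docker-compose build
14 | docker-compose up airflow-init
15 | docker-compose up -d
16 | ```
17 | The webserver is then available at http://localhost:8080 (default user/password: airflow/airflow, as set in the compose file).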
--------------------------------------------------------------------------------
/week2/airflow/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM apache/airflow:2.2.3
2 |
3 | ENV AIRFLOW_HOME=/opt/airflow
4 |
5 | USER root
6 | RUN apt-get update -qq && apt-get install vim -qqq
7 |
8 | COPY requirements.txt .
9 | RUN pip install --no-cache-dir -r requirements.txt
10 |
11 | ## GOOGLE IMAGE
12 | SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"]
13 |
14 | ARG CLOUD_SDK_VERSION=322.0.0
15 | ENV GCLOUD_HOME=/home/google-cloud-sdk
16 |
17 | ENV PATH="${GCLOUD_HOME}/bin/:${PATH}"
18 |
19 | RUN DOWNLOAD_URL="https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz" \
20 | && TMP_DIR="$(mktemp -d)" \
21 | && curl -fL "${DOWNLOAD_URL}" --output "${TMP_DIR}/google-cloud-sdk.tar.gz" \
22 | && mkdir -p "${GCLOUD_HOME}" \
23 | && tar xzf "${TMP_DIR}/google-cloud-sdk.tar.gz" -C "${GCLOUD_HOME}" --strip-components=1 \
24 | && "${GCLOUD_HOME}/install.sh" \
25 | --bash-completion=false \
26 | --path-update=false \
27 | --usage-reporting=false \
28 | --quiet \
29 | && rm -rf "${TMP_DIR}" \
30 | && gcloud --version
31 |
32 | WORKDIR $AIRFLOW_HOME
33 |
34 | USER $AIRFLOW_UID
--------------------------------------------------------------------------------
/week2/airflow/dags/dag_ingestion_gcs.py:
--------------------------------------------------------------------------------
1 | import os
2 | from airflow import DAG
3 | from airflow.utils.dates import days_ago
4 | from airflow.operators.bash import BashOperator
5 | from airflow.operators.python import PythonOperator
6 | from google.cloud import storage
7 | from airflow.providers.google.cloud.operators.bigquery import BigQueryCreateExternalTableOperator
8 |
9 | PROJECT_ID = os.environ.get("GCP_PROJECT_ID")
10 | BUCKET = os.environ.get("GCP_GCS_BUCKET")
11 |
12 | dataset_file = "yellow_tripdata_2021-01.parquet"
13 | dataset_url = f"https://s3.amazonaws.com/nyc-tlc/trip+data/{dataset_file}"
14 | path_to_local_home = os.environ.get("AIRFLOW_HOME", "/opt/airflow/")
15 | BIGQUERY_DATASET = os.environ.get("BIGQUERY_DATASET", 'trips_data_all')
16 |
17 | def upload_to_gcs(bucket, object_name, local_file):
18 | """
19 | Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python
20 | :param bucket: GCS bucket name
21 | :param object_name: target path & file-name
22 | :param local_file: source path & file-name
23 | :return:
24 | """
25 | # WORKAROUND to prevent timeout for files > 6 MB on 800 kbps upload speed.
26 | # (Ref: https://github.com/googleapis/python-storage/issues/74)
27 | storage.blob._MAX_MULTIPART_SIZE = 5 * 1024 * 1024 # 5 MB
28 | storage.blob._DEFAULT_CHUNKSIZE = 5 * 1024 * 1024 # 5 MB
29 | # End of Workaround
30 |
31 | client = storage.Client()
32 | bucket = client.bucket(bucket)
33 |
34 | blob = bucket.blob(object_name)
35 | blob.upload_from_filename(local_file)
36 |
37 |
38 | default_args = {
39 | "owner": "airflow",
40 | "start_date": days_ago(1),
41 | "depends_on_past": False,
42 | "retries": 1,
43 | }
44 |
45 | # NOTE: DAG declaration - using a Context Manager (an implicit way)
46 | with DAG(
47 | dag_id="data_ingestion_gcs_dag",
48 | schedule_interval="@daily",
49 | default_args=default_args,
50 | catchup=False,
51 | max_active_runs=1,
52 | tags=['dtc-de'],
53 | ) as dag:
54 |
55 | download_dataset_task = BashOperator(
56 | task_id="download_dataset_task",
57 | bash_command=f"curl -sS {dataset_url} > {path_to_local_home}/{dataset_file}"
58 | )
59 |
60 | # TODO: Homework - research and try XCOM to communicate output values between 2 tasks/operators
61 | local_to_gcs_task = PythonOperator(
62 | task_id="local_to_gcs_task",
63 | python_callable=upload_to_gcs,
64 | op_kwargs={
65 | "bucket": BUCKET,
66 | "object_name": f"raw/{dataset_file}",
67 | "local_file": f"{path_to_local_home}/{dataset_file}",
68 | },
69 | )
70 |
71 | bigquery_external_table_task = BigQueryCreateExternalTableOperator(
72 | task_id="bigquery_external_table_task",
73 | table_resource={
74 | "tableReference": {
75 | "projectId": PROJECT_ID,
76 | "datasetId": BIGQUERY_DATASET,
77 | "tableId": "external_table",
78 | },
79 | "externalDataConfiguration": {
80 | "sourceFormat": "PARQUET",
81 | "sourceUris": [f"gs://{BUCKET}/raw/{dataset_file}"],
82 | },
83 | },
84 | )
85 |
86 | download_dataset_task >> local_to_gcs_task >> bigquery_external_table_task
87 |
--------------------------------------------------------------------------------
/week2/airflow/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | #
18 |
19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
20 | #
21 | # WARNING: This configuration is for local development. Do not use it in a production deployment.
22 | #
23 | # This configuration supports basic configuration using environment variables or an .env file
24 | # The following variables are supported:
25 | #
26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow.
27 | # Default: apache/airflow:2.2.3
28 | # AIRFLOW_UID - User ID in Airflow containers
29 | # Default: 50000
30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
31 | #
32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested).
33 | # Default: airflow
34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested).
35 | # Default: airflow
36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
37 | # Default: ''
38 | #
39 | # Feel free to modify this file to suit your needs.
40 | ---
41 | version: '3'
42 | x-airflow-common:
43 | &airflow-common
44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image.
45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
46 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images.
47 | build:
48 | context: .
49 | dockerfile: ./Dockerfile
50 | environment:
51 | &airflow-common-env
52 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor
53 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
54 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
55 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
56 | AIRFLOW__CORE__FERNET_KEY: ''
57 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
58 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
59 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth'
60 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
61 | GOOGLE_APPLICATION_CREDENTIALS: /google/credentials/google_credentials.json
62 | AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT: 'google-cloud-platform://?extra__google_cloud_platform__key_path=/google/credentials/google_credentials.json'
63 |
64 | # TODO: Please change GCP_PROJECT_ID & GCP_GCS_BUCKET, as per your config
65 | GCP_PROJECT_ID: 'dtc-boot-7639'
66 | GCP_GCS_BUCKET: 'dtc_data_lake_dtc-boot-7639'
67 |
68 | volumes:
69 | - ./dags:/opt/airflow/dags
70 | - ./logs:/opt/airflow/logs
71 | - ./plugins:/opt/airflow/plugins
72 | - C:/Users/pedro/Documents/Estudos_DS/repos/data-engineering-bootcamp/week2/airflow/google/credentials/:/google/credentials:ro
73 |
74 | user: "${AIRFLOW_UID:-50000}:0"
75 | depends_on:
76 | &airflow-common-depends-on
77 | redis:
78 | condition: service_healthy
79 | postgres:
80 | condition: service_healthy
81 |
82 | services:
83 | postgres:
84 | image: postgres:13
85 | environment:
86 | POSTGRES_USER: airflow
87 | POSTGRES_PASSWORD: airflow
88 | POSTGRES_DB: airflow
89 | volumes:
90 | - postgres-db-volume:/var/lib/postgresql/data
91 | healthcheck:
92 | test: ["CMD", "pg_isready", "-U", "airflow"]
93 | interval: 5s
94 | retries: 5
95 | restart: always
96 |
97 | redis:
98 | image: redis:latest
99 | expose:
100 | - 6379
101 | healthcheck:
102 | test: ["CMD", "redis-cli", "ping"]
103 | interval: 5s
104 | timeout: 30s
105 | retries: 50
106 | restart: always
107 |
108 | airflow-webserver:
109 | <<: *airflow-common
110 | command: webserver
111 | ports:
112 | - 8080:8080
113 | healthcheck:
114 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
115 | interval: 10s
116 | timeout: 10s
117 | retries: 5
118 | restart: always
119 | depends_on:
120 | <<: *airflow-common-depends-on
121 | airflow-init:
122 | condition: service_completed_successfully
123 |
124 | airflow-scheduler:
125 | <<: *airflow-common
126 | command: scheduler
127 | healthcheck:
128 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"']
129 | interval: 10s
130 | timeout: 10s
131 | retries: 5
132 | restart: always
133 | depends_on:
134 | <<: *airflow-common-depends-on
135 | airflow-init:
136 | condition: service_completed_successfully
137 |
138 | airflow-worker:
139 | <<: *airflow-common
140 | command: celery worker
141 | healthcheck:
142 | test:
143 | - "CMD-SHELL"
144 | - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
145 | interval: 10s
146 | timeout: 10s
147 | retries: 5
148 | environment:
149 | <<: *airflow-common-env
150 | # Required to handle warm shutdown of the celery workers properly
151 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
152 | DUMB_INIT_SETSID: "0"
153 | restart: always
154 | depends_on:
155 | <<: *airflow-common-depends-on
156 | airflow-init:
157 | condition: service_completed_successfully
158 |
159 | airflow-triggerer:
160 | <<: *airflow-common
161 | command: triggerer
162 | healthcheck:
163 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
164 | interval: 10s
165 | timeout: 10s
166 | retries: 5
167 | restart: always
168 | depends_on:
169 | <<: *airflow-common-depends-on
170 | airflow-init:
171 | condition: service_completed_successfully
172 |
173 | airflow-init:
174 | <<: *airflow-common
175 | entrypoint: /bin/bash
176 | # yamllint disable rule:line-length
177 | command:
178 | - -c
179 | - |
180 | function ver() {
181 | printf "%04d%04d%04d%04d" $${1//./ }
182 | }
183 | airflow_version=$$(gosu airflow airflow version)
184 | airflow_version_comparable=$$(ver $${airflow_version})
185 | min_airflow_version=2.2.0
186 | min_airflow_version_comparable=$$(ver $${min_airflow_version})
187 | if (( airflow_version_comparable < min_airflow_version_comparable )); then
188 | echo
189 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m"
190 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!"
191 | echo
192 | exit 1
193 | fi
194 | if [[ -z "${AIRFLOW_UID}" ]]; then
195 | echo
196 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
197 | echo "If you are on Linux, you SHOULD follow the instructions below to set "
198 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
199 | echo "For other operating systems you can get rid of the warning with manually created .env file:"
200 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user"
201 | echo
202 | fi
203 | one_meg=1048576
204 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
205 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
206 | disk_available=$$(df / | tail -1 | awk '{print $$4}')
207 | warning_resources="false"
208 | if (( mem_available < 4000 )) ; then
209 | echo
210 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
211 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
212 | echo
213 | warning_resources="true"
214 | fi
215 | if (( cpus_available < 2 )); then
216 | echo
217 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
218 | echo "At least 2 CPUs recommended. You have $${cpus_available}"
219 | echo
220 | warning_resources="true"
221 | fi
222 | if (( disk_available < one_meg * 10 )); then
223 | echo
224 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
225 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
226 | echo
227 | warning_resources="true"
228 | fi
229 | if [[ $${warning_resources} == "true" ]]; then
230 | echo
231 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
232 | echo "Please follow the instructions to increase amount of resources available:"
233 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin"
234 | echo
235 | fi
236 | mkdir -p /sources/logs /sources/dags /sources/plugins
237 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
238 | exec /entrypoint airflow version
239 | # yamllint enable rule:line-length
240 | environment:
241 | <<: *airflow-common-env
242 | _AIRFLOW_DB_UPGRADE: 'true'
243 | _AIRFLOW_WWW_USER_CREATE: 'true'
244 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
245 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
246 | user: "0:0"
247 | volumes:
248 | - .:/sources
249 |
250 | airflow-cli:
251 | <<: *airflow-common
252 | profiles:
253 | - debug
254 | environment:
255 | <<: *airflow-common-env
256 | CONNECTION_CHECK_MAX_COUNT: "0"
257 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
258 | command:
259 | - bash
260 | - -c
261 | - airflow
262 |
263 | flower:
264 | <<: *airflow-common
265 | command: celery flower
266 | ports:
267 | - 5555:5555
268 | healthcheck:
269 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
270 | interval: 10s
271 | timeout: 10s
272 | retries: 5
273 | restart: always
274 | depends_on:
275 | <<: *airflow-common-depends-on
276 | airflow-init:
277 | condition: service_completed_successfully
278 |
279 | volumes:
280 | postgres-db-volume:
--------------------------------------------------------------------------------
/week2/airflow/requirements.txt:
--------------------------------------------------------------------------------
1 | apache-airflow-providers-google
2 | pyarrow
3 |
--------------------------------------------------------------------------------
/week3/.gitignore:
--------------------------------------------------------------------------------
1 | .env
2 | logs/
--------------------------------------------------------------------------------
/week3/README.md:
--------------------------------------------------------------------------------
1 | ## Topics covered in week 3:
2 | - BigQuery
3 | - Table partitioning
4 | - Table clustering
5 | - Machine Learning in BigQuery
6 | - General BigQuery usage
7 | - Airflow
8 |
9 | ### Notes:
10 | Notion page: https://www.notion.so/pcrespoo/Week-3-2e55578d253b45d4ab16c6213c3cf9f4
11 |
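12 | ### Example: partitioned and clustered table
13 | The DAG in `airflow/dags/gcp_to_bq_dag.py` only creates the partitioned table. Below is a minimal sketch of issuing the same kind of DDL from Python with a clustering column added for illustration; the dataset, table and column names follow the DAG, but treat them as placeholders for your own project:
14 |
15 | ```python
16 | from google.cloud import bigquery
17 |
18 | client = bigquery.Client()  # picks up GOOGLE_APPLICATION_CREDENTIALS
19 |
20 | sql = """
21 | CREATE OR REPLACE TABLE trips_data_all.yellow_tripdata_partitioned_clustered
22 | PARTITION BY DATE(tpep_pickup_datetime)
23 | CLUSTER BY VendorID AS
24 | SELECT * FROM trips_data_all.external_yellow_tripdata
25 | """
26 | client.query(sql).result()  # wait for the DDL job to finish
27 | ```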
--------------------------------------------------------------------------------
/week3/airflow/dags/__pycache__/gcp_to_bq_dag.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pcrespoo/data-engineering-bootcamp/a1835f9c1557d4974b73837a0622ac5d24db9dfa/week3/airflow/dags/__pycache__/gcp_to_bq_dag.cpython-37.pyc
--------------------------------------------------------------------------------
/week3/airflow/dags/gcp_to_bq_dag.py:
--------------------------------------------------------------------------------
1 | import os
2 | from airflow import DAG
3 | from airflow.utils.dates import days_ago
4 | from airflow.providers.google.cloud.operators.bigquery import BigQueryCreateExternalTableOperator, BigQueryInsertJobOperator
5 | from airflow.providers.google.cloud.transfers.gcs_to_gcs import GCSToGCSOperator
6 |
7 | PROJECT_ID = os.environ.get("GCP_PROJECT_ID")
8 | BUCKET = os.environ.get("GCP_GCS_BUCKET")
9 |
10 | path_to_local_home = os.environ.get("AIRFLOW_HOME", "/opt/airflow/")
11 | BIGQUERY_DATASET = os.environ.get("BIGQUERY_DATASET", 'trips_data_all')
12 |
13 | default_args = {
14 | "owner": "airflow",
15 | "start_date": days_ago(1),
16 | "depends_on_past": False,
17 | "retries": 1,
18 | }
19 |
20 | with DAG(
21 | dag_id="gcs_2_bq_dag",
22 | schedule_interval="@daily",
23 | default_args=default_args,
24 | catchup=False,
25 | max_active_runs=1,
26 | tags=['dtc-de'],
27 | ) as dag:
28 |
29 |     # move the raw yellow tripdata parquet files within GCS
30 | gcs_2_gcs_task = GCSToGCSOperator(
31 | task_id = "gcs_2_gcs_task",
32 | source_bucket = BUCKET,
33 | source_object = "raw/yellow_tripdata*.parquet",
34 | destination_bucket = BUCKET,
35 | move_object = True,
36 | destination_object = 'yellow/'
37 | )
38 |
39 |     # create an external BigQuery table pointing at the new file location
40 | gcs_2_bq_ext_task = BigQueryCreateExternalTableOperator(
41 | task_id="gcs_2_bq_ext_task",
42 | table_resource={
43 | "tableReference": {
44 | "projectId": PROJECT_ID,
45 | "datasetId": BIGQUERY_DATASET,
46 | "tableId": "external_yellow_tripdata",
47 | },
48 | "externalDataConfiguration": {
49 | "sourceFormat": "PARQUET",
50 | "sourceUris": [f"gs://{BUCKET}/yellow/*"],
51 | },
52 | },
53 | )
54 |
55 |     # create a partitioned table from the external table
56 | CREATE_PART_TBL_QUERY = f"""CREATE OR REPLACE TABLE {BIGQUERY_DATASET}.yellow_tripdata_partitioned
57 | PARTITION BY DATE(tpep_pickup_datetime) AS SELECT * FROM {BIGQUERY_DATASET}.external_yellow_tripdata"""
58 | bq_ext_2_part_task = BigQueryInsertJobOperator(
59 | task_id = "bq_ext_2_part_task",
60 | configuration = {
61 | 'query':{
62 | 'query': CREATE_PART_TBL_QUERY,
63 | 'useLegacySql':False,
64 | }
65 | },
66 | )
67 |
68 |
69 | gcs_2_gcs_task >> gcs_2_bq_ext_task >> bq_ext_2_part_task
--------------------------------------------------------------------------------
/week3/airflow/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | #
18 |
19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
20 | #
21 | # WARNING: This configuration is for local development. Do not use it in a production deployment.
22 | #
23 | # This configuration supports basic configuration using environment variables or an .env file
24 | # The following variables are supported:
25 | #
26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow.
27 | # Default: apache/airflow:2.2.3
28 | # AIRFLOW_UID - User ID in Airflow containers
29 | # Default: 50000
30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
31 | #
32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested).
33 | # Default: airflow
34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested).
35 | # Default: airflow
36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
37 | # Default: ''
38 | #
39 | # Feel free to modify this file to suit your needs.
40 | ---
41 | version: '3'
42 | x-airflow-common:
43 | &airflow-common
44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image.
45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
46 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images.
47 | build:
48 | context: .
49 | dockerfile: ./Dockerfile
50 | environment:
51 | &airflow-common-env
52 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor
53 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
54 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
55 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
56 | AIRFLOW__CORE__FERNET_KEY: ''
57 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
58 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
59 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth'
60 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
61 | GOOGLE_APPLICATION_CREDENTIALS: /google/credentials/google_credentials.json
62 | AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT: 'google-cloud-platform://?extra__google_cloud_platform__key_path=/google/credentials/google_credentials.json'
63 |
64 | # TODO: Please change GCP_PROJECT_ID & GCP_GCS_BUCKET, as per your config
65 | GCP_PROJECT_ID: 'dtc-boot-7639'
66 | GCP_GCS_BUCKET: 'dtc_data_lake_dtc-boot-7639'
67 |
68 | volumes:
69 | - ./dags:/opt/airflow/dags
70 | - ./logs:/opt/airflow/logs
71 | - ./plugins:/opt/airflow/plugins
72 | - C:/Users/pedro/Documents/Estudos_DS/repos/data-engineering-bootcamp/week2/airflow/google/credentials/:/google/credentials:ro
73 |
74 | user: "${AIRFLOW_UID:-50000}:0"
75 | depends_on:
76 | &airflow-common-depends-on
77 | redis:
78 | condition: service_healthy
79 | postgres:
80 | condition: service_healthy
81 |
82 | services:
83 | postgres:
84 | image: postgres:13
85 | environment:
86 | POSTGRES_USER: airflow
87 | POSTGRES_PASSWORD: airflow
88 | POSTGRES_DB: airflow
89 | volumes:
90 | - postgres-db-volume:/var/lib/postgresql/data
91 | healthcheck:
92 | test: ["CMD", "pg_isready", "-U", "airflow"]
93 | interval: 5s
94 | retries: 5
95 | restart: always
96 |
97 | redis:
98 | image: redis:latest
99 | expose:
100 | - 6379
101 | healthcheck:
102 | test: ["CMD", "redis-cli", "ping"]
103 | interval: 5s
104 | timeout: 30s
105 | retries: 50
106 | restart: always
107 |
108 | airflow-webserver:
109 | <<: *airflow-common
110 | command: webserver
111 | ports:
112 | - 8080:8080
113 | healthcheck:
114 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
115 | interval: 10s
116 | timeout: 10s
117 | retries: 5
118 | restart: always
119 | depends_on:
120 | <<: *airflow-common-depends-on
121 | airflow-init:
122 | condition: service_completed_successfully
123 |
124 | airflow-scheduler:
125 | <<: *airflow-common
126 | command: scheduler
127 | healthcheck:
128 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"']
129 | interval: 10s
130 | timeout: 10s
131 | retries: 5
132 | restart: always
133 | depends_on:
134 | <<: *airflow-common-depends-on
135 | airflow-init:
136 | condition: service_completed_successfully
137 |
138 | airflow-worker:
139 | <<: *airflow-common
140 | command: celery worker
141 | healthcheck:
142 | test:
143 | - "CMD-SHELL"
144 | - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
145 | interval: 10s
146 | timeout: 10s
147 | retries: 5
148 | environment:
149 | <<: *airflow-common-env
150 | # Required to handle warm shutdown of the celery workers properly
151 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
152 | DUMB_INIT_SETSID: "0"
153 | restart: always
154 | depends_on:
155 | <<: *airflow-common-depends-on
156 | airflow-init:
157 | condition: service_completed_successfully
158 |
159 | airflow-triggerer:
160 | <<: *airflow-common
161 | command: triggerer
162 | healthcheck:
163 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
164 | interval: 10s
165 | timeout: 10s
166 | retries: 5
167 | restart: always
168 | depends_on:
169 | <<: *airflow-common-depends-on
170 | airflow-init:
171 | condition: service_completed_successfully
172 |
173 | airflow-init:
174 | <<: *airflow-common
175 | entrypoint: /bin/bash
176 | # yamllint disable rule:line-length
177 | command:
178 | - -c
179 | - |
180 | function ver() {
181 | printf "%04d%04d%04d%04d" $${1//./ }
182 | }
183 | airflow_version=$$(gosu airflow airflow version)
184 | airflow_version_comparable=$$(ver $${airflow_version})
185 | min_airflow_version=2.2.0
186 | min_airflow_version_comparable=$$(ver $${min_airflow_version})
187 | if (( airflow_version_comparable < min_airflow_version_comparable )); then
188 | echo
189 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m"
190 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!"
191 | echo
192 | exit 1
193 | fi
194 | if [[ -z "${AIRFLOW_UID}" ]]; then
195 | echo
196 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
197 | echo "If you are on Linux, you SHOULD follow the instructions below to set "
198 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
199 | echo "For other operating systems you can get rid of the warning with manually created .env file:"
200 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user"
201 | echo
202 | fi
203 | one_meg=1048576
204 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
205 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
206 | disk_available=$$(df / | tail -1 | awk '{print $$4}')
207 | warning_resources="false"
208 | if (( mem_available < 4000 )) ; then
209 | echo
210 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
211 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
212 | echo
213 | warning_resources="true"
214 | fi
215 | if (( cpus_available < 2 )); then
216 | echo
217 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
218 | echo "At least 2 CPUs recommended. You have $${cpus_available}"
219 | echo
220 | warning_resources="true"
221 | fi
222 | if (( disk_available < one_meg * 10 )); then
223 | echo
224 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
225 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
226 | echo
227 | warning_resources="true"
228 | fi
229 | if [[ $${warning_resources} == "true" ]]; then
230 | echo
231 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
232 | echo "Please follow the instructions to increase amount of resources available:"
233 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin"
234 | echo
235 | fi
236 | mkdir -p /sources/logs /sources/dags /sources/plugins
237 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
238 | exec /entrypoint airflow version
239 | # yamllint enable rule:line-length
240 | environment:
241 | <<: *airflow-common-env
242 | _AIRFLOW_DB_UPGRADE: 'true'
243 | _AIRFLOW_WWW_USER_CREATE: 'true'
244 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
245 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
246 | user: "0:0"
247 | volumes:
248 | - .:/sources
249 |
250 | airflow-cli:
251 | <<: *airflow-common
252 | profiles:
253 | - debug
254 | environment:
255 | <<: *airflow-common-env
256 | CONNECTION_CHECK_MAX_COUNT: "0"
257 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
258 | command:
259 | - bash
260 | - -c
261 | - airflow
262 |
263 | flower:
264 | <<: *airflow-common
265 | command: celery flower
266 | ports:
267 | - 5555:5555
268 | healthcheck:
269 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
270 | interval: 10s
271 | timeout: 10s
272 | retries: 5
273 | restart: always
274 | depends_on:
275 | <<: *airflow-common-depends-on
276 | airflow-init:
277 | condition: service_completed_successfully
278 |
279 | volumes:
280 | postgres-db-volume:
--------------------------------------------------------------------------------
/week4/README.md:
--------------------------------------------------------------------------------
1 | ## Topics covered in week 4:
2 | - dbt
3 |   - Model creation
4 | - Macros
5 | - Seeds
6 | - Staging and Production environments
7 | - Deployment
8 |
9 | - Google Data Studio
10 | - Dashboards using BigQuery sources
11 |
12 | ### Notes:
13 | Notion page: https://www.notion.so/pcrespoo/Week-4-9de8ba99495b44839eccb082d49dc516
14 |
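15 | A typical local run of this project (assuming the BigQuery profile in `dbt/profiles.yml` is configured) is roughly: `dbt deps` to install `dbt_utils`, `dbt seed` to load `data/taxi_zone.csv`, then `dbt run` and `dbt test` (or `dbt build --var 'is_test_run: false'` for a full, non-limited run, as noted in the staging models).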
--------------------------------------------------------------------------------
/week4/data_to_gcs/.gitignore:
--------------------------------------------------------------------------------
1 | google/
--------------------------------------------------------------------------------
/week4/data_to_gcs/upload_to_gcs.py:
--------------------------------------------------------------------------------
1 | import os
2 | from google.cloud import storage
3 |
4 | init_url = 'https://nyc-tlc.s3.amazonaws.com/trip+data/'
5 | BUCKET = os.environ.get("GCP_GCS_BUCKET", "dtc_data_lake_dtc-boot-7639")
6 | GOOGLE_APPLICATION_CREDENTIALS = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", "/google/credentials/google_credentials.json")
7 |
8 | def upload_to_gcs(bucket, object_name, local_file):
9 | client = storage.Client()
10 | bucket = client.bucket(bucket)
11 | blob = bucket.blob(object_name)
12 | blob.upload_from_filename(local_file)
13 |
14 | def web_to_gcs(year, service):
15 | for i in range(12):
16 |
17 | # sets the month part of the file_name string
18 | month = '0'+str(i+1)
19 | month = month[-2:]
20 |
21 | # parquet file_name
22 | file_name = service + '_tripdata_' + year + '-' + month + '.parquet'
23 |
24 | # download it using bash command
25 | os.system(f'wget {init_url + file_name} -O {file_name}')
26 |
27 | # upload it to gcs
28 | upload_to_gcs(BUCKET, f"{service}/{file_name}", file_name)
29 | print(f"GCS: {service}/{file_name}")
30 |
31 |
32 | web_to_gcs('2019', 'green')
33 | web_to_gcs('2020', 'green')
34 | # web_to_gcs('2019', 'yellow')
35 | # web_to_gcs('2020', 'yellow')
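36 |
37 | # How to run (notes added for clarity; assumptions, not part of the original script):
38 | # storage.Client() reads credentials from the GOOGLE_APPLICATION_CREDENTIALS environment
39 | # variable, and the download step shells out to `wget`, so both must be available
40 | # before executing `python upload_to_gcs.py`.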
--------------------------------------------------------------------------------
/week4/dbt/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | target/
3 | dbt_packages/
4 | logs/
5 |
--------------------------------------------------------------------------------
/week4/dbt/analyses/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pcrespoo/data-engineering-bootcamp/a1835f9c1557d4974b73837a0622ac5d24db9dfa/week4/dbt/analyses/.gitkeep
--------------------------------------------------------------------------------
/week4/dbt/data/taxi_zone.csv:
--------------------------------------------------------------------------------
1 | "LocationID","Borough","Zone","service_zone"
2 | 1,"EWR","Newark Airport","EWR"
3 | 2,"Queens","Jamaica Bay","Boro Zone"
4 | 3,"Bronx","Allerton/Pelham Gardens","Boro Zone"
5 | 4,"Manhattan","Alphabet City","Yellow Zone"
6 | 5,"Staten Island","Arden Heights","Boro Zone"
7 | 6,"Staten Island","Arrochar/Fort Wadsworth","Boro Zone"
8 | 7,"Queens","Astoria","Boro Zone"
9 | 8,"Queens","Astoria Park","Boro Zone"
10 | 9,"Queens","Auburndale","Boro Zone"
11 | 10,"Queens","Baisley Park","Boro Zone"
12 | 11,"Brooklyn","Bath Beach","Boro Zone"
13 | 12,"Manhattan","Battery Park","Yellow Zone"
14 | 13,"Manhattan","Battery Park City","Yellow Zone"
15 | 14,"Brooklyn","Bay Ridge","Boro Zone"
16 | 15,"Queens","Bay Terrace/Fort Totten","Boro Zone"
17 | 16,"Queens","Bayside","Boro Zone"
18 | 17,"Brooklyn","Bedford","Boro Zone"
19 | 18,"Bronx","Bedford Park","Boro Zone"
20 | 19,"Queens","Bellerose","Boro Zone"
21 | 20,"Bronx","Belmont","Boro Zone"
22 | 21,"Brooklyn","Bensonhurst East","Boro Zone"
23 | 22,"Brooklyn","Bensonhurst West","Boro Zone"
24 | 23,"Staten Island","Bloomfield/Emerson Hill","Boro Zone"
25 | 24,"Manhattan","Bloomingdale","Yellow Zone"
26 | 25,"Brooklyn","Boerum Hill","Boro Zone"
27 | 26,"Brooklyn","Borough Park","Boro Zone"
28 | 27,"Queens","Breezy Point/Fort Tilden/Riis Beach","Boro Zone"
29 | 28,"Queens","Briarwood/Jamaica Hills","Boro Zone"
30 | 29,"Brooklyn","Brighton Beach","Boro Zone"
31 | 30,"Queens","Broad Channel","Boro Zone"
32 | 31,"Bronx","Bronx Park","Boro Zone"
33 | 32,"Bronx","Bronxdale","Boro Zone"
34 | 33,"Brooklyn","Brooklyn Heights","Boro Zone"
35 | 34,"Brooklyn","Brooklyn Navy Yard","Boro Zone"
36 | 35,"Brooklyn","Brownsville","Boro Zone"
37 | 36,"Brooklyn","Bushwick North","Boro Zone"
38 | 37,"Brooklyn","Bushwick South","Boro Zone"
39 | 38,"Queens","Cambria Heights","Boro Zone"
40 | 39,"Brooklyn","Canarsie","Boro Zone"
41 | 40,"Brooklyn","Carroll Gardens","Boro Zone"
42 | 41,"Manhattan","Central Harlem","Boro Zone"
43 | 42,"Manhattan","Central Harlem North","Boro Zone"
44 | 43,"Manhattan","Central Park","Yellow Zone"
45 | 44,"Staten Island","Charleston/Tottenville","Boro Zone"
46 | 45,"Manhattan","Chinatown","Yellow Zone"
47 | 46,"Bronx","City Island","Boro Zone"
48 | 47,"Bronx","Claremont/Bathgate","Boro Zone"
49 | 48,"Manhattan","Clinton East","Yellow Zone"
50 | 49,"Brooklyn","Clinton Hill","Boro Zone"
51 | 50,"Manhattan","Clinton West","Yellow Zone"
52 | 51,"Bronx","Co-Op City","Boro Zone"
53 | 52,"Brooklyn","Cobble Hill","Boro Zone"
54 | 53,"Queens","College Point","Boro Zone"
55 | 54,"Brooklyn","Columbia Street","Boro Zone"
56 | 55,"Brooklyn","Coney Island","Boro Zone"
57 | 56,"Queens","Corona","Boro Zone"
58 | 57,"Queens","Corona","Boro Zone"
59 | 58,"Bronx","Country Club","Boro Zone"
60 | 59,"Bronx","Crotona Park","Boro Zone"
61 | 60,"Bronx","Crotona Park East","Boro Zone"
62 | 61,"Brooklyn","Crown Heights North","Boro Zone"
63 | 62,"Brooklyn","Crown Heights South","Boro Zone"
64 | 63,"Brooklyn","Cypress Hills","Boro Zone"
65 | 64,"Queens","Douglaston","Boro Zone"
66 | 65,"Brooklyn","Downtown Brooklyn/MetroTech","Boro Zone"
67 | 66,"Brooklyn","DUMBO/Vinegar Hill","Boro Zone"
68 | 67,"Brooklyn","Dyker Heights","Boro Zone"
69 | 68,"Manhattan","East Chelsea","Yellow Zone"
70 | 69,"Bronx","East Concourse/Concourse Village","Boro Zone"
71 | 70,"Queens","East Elmhurst","Boro Zone"
72 | 71,"Brooklyn","East Flatbush/Farragut","Boro Zone"
73 | 72,"Brooklyn","East Flatbush/Remsen Village","Boro Zone"
74 | 73,"Queens","East Flushing","Boro Zone"
75 | 74,"Manhattan","East Harlem North","Boro Zone"
76 | 75,"Manhattan","East Harlem South","Boro Zone"
77 | 76,"Brooklyn","East New York","Boro Zone"
78 | 77,"Brooklyn","East New York/Pennsylvania Avenue","Boro Zone"
79 | 78,"Bronx","East Tremont","Boro Zone"
80 | 79,"Manhattan","East Village","Yellow Zone"
81 | 80,"Brooklyn","East Williamsburg","Boro Zone"
82 | 81,"Bronx","Eastchester","Boro Zone"
83 | 82,"Queens","Elmhurst","Boro Zone"
84 | 83,"Queens","Elmhurst/Maspeth","Boro Zone"
85 | 84,"Staten Island","Eltingville/Annadale/Prince's Bay","Boro Zone"
86 | 85,"Brooklyn","Erasmus","Boro Zone"
87 | 86,"Queens","Far Rockaway","Boro Zone"
88 | 87,"Manhattan","Financial District North","Yellow Zone"
89 | 88,"Manhattan","Financial District South","Yellow Zone"
90 | 89,"Brooklyn","Flatbush/Ditmas Park","Boro Zone"
91 | 90,"Manhattan","Flatiron","Yellow Zone"
92 | 91,"Brooklyn","Flatlands","Boro Zone"
93 | 92,"Queens","Flushing","Boro Zone"
94 | 93,"Queens","Flushing Meadows-Corona Park","Boro Zone"
95 | 94,"Bronx","Fordham South","Boro Zone"
96 | 95,"Queens","Forest Hills","Boro Zone"
97 | 96,"Queens","Forest Park/Highland Park","Boro Zone"
98 | 97,"Brooklyn","Fort Greene","Boro Zone"
99 | 98,"Queens","Fresh Meadows","Boro Zone"
100 | 99,"Staten Island","Freshkills Park","Boro Zone"
101 | 100,"Manhattan","Garment District","Yellow Zone"
102 | 101,"Queens","Glen Oaks","Boro Zone"
103 | 102,"Queens","Glendale","Boro Zone"
104 | 103,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone"
105 | 104,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone"
106 | 105,"Manhattan","Governor's Island/Ellis Island/Liberty Island","Yellow Zone"
107 | 106,"Brooklyn","Gowanus","Boro Zone"
108 | 107,"Manhattan","Gramercy","Yellow Zone"
109 | 108,"Brooklyn","Gravesend","Boro Zone"
110 | 109,"Staten Island","Great Kills","Boro Zone"
111 | 110,"Staten Island","Great Kills Park","Boro Zone"
112 | 111,"Brooklyn","Green-Wood Cemetery","Boro Zone"
113 | 112,"Brooklyn","Greenpoint","Boro Zone"
114 | 113,"Manhattan","Greenwich Village North","Yellow Zone"
115 | 114,"Manhattan","Greenwich Village South","Yellow Zone"
116 | 115,"Staten Island","Grymes Hill/Clifton","Boro Zone"
117 | 116,"Manhattan","Hamilton Heights","Boro Zone"
118 | 117,"Queens","Hammels/Arverne","Boro Zone"
119 | 118,"Staten Island","Heartland Village/Todt Hill","Boro Zone"
120 | 119,"Bronx","Highbridge","Boro Zone"
121 | 120,"Manhattan","Highbridge Park","Boro Zone"
122 | 121,"Queens","Hillcrest/Pomonok","Boro Zone"
123 | 122,"Queens","Hollis","Boro Zone"
124 | 123,"Brooklyn","Homecrest","Boro Zone"
125 | 124,"Queens","Howard Beach","Boro Zone"
126 | 125,"Manhattan","Hudson Sq","Yellow Zone"
127 | 126,"Bronx","Hunts Point","Boro Zone"
128 | 127,"Manhattan","Inwood","Boro Zone"
129 | 128,"Manhattan","Inwood Hill Park","Boro Zone"
130 | 129,"Queens","Jackson Heights","Boro Zone"
131 | 130,"Queens","Jamaica","Boro Zone"
132 | 131,"Queens","Jamaica Estates","Boro Zone"
133 | 132,"Queens","JFK Airport","Airports"
134 | 133,"Brooklyn","Kensington","Boro Zone"
135 | 134,"Queens","Kew Gardens","Boro Zone"
136 | 135,"Queens","Kew Gardens Hills","Boro Zone"
137 | 136,"Bronx","Kingsbridge Heights","Boro Zone"
138 | 137,"Manhattan","Kips Bay","Yellow Zone"
139 | 138,"Queens","LaGuardia Airport","Airports"
140 | 139,"Queens","Laurelton","Boro Zone"
141 | 140,"Manhattan","Lenox Hill East","Yellow Zone"
142 | 141,"Manhattan","Lenox Hill West","Yellow Zone"
143 | 142,"Manhattan","Lincoln Square East","Yellow Zone"
144 | 143,"Manhattan","Lincoln Square West","Yellow Zone"
145 | 144,"Manhattan","Little Italy/NoLiTa","Yellow Zone"
146 | 145,"Queens","Long Island City/Hunters Point","Boro Zone"
147 | 146,"Queens","Long Island City/Queens Plaza","Boro Zone"
148 | 147,"Bronx","Longwood","Boro Zone"
149 | 148,"Manhattan","Lower East Side","Yellow Zone"
150 | 149,"Brooklyn","Madison","Boro Zone"
151 | 150,"Brooklyn","Manhattan Beach","Boro Zone"
152 | 151,"Manhattan","Manhattan Valley","Yellow Zone"
153 | 152,"Manhattan","Manhattanville","Boro Zone"
154 | 153,"Manhattan","Marble Hill","Boro Zone"
155 | 154,"Brooklyn","Marine Park/Floyd Bennett Field","Boro Zone"
156 | 155,"Brooklyn","Marine Park/Mill Basin","Boro Zone"
157 | 156,"Staten Island","Mariners Harbor","Boro Zone"
158 | 157,"Queens","Maspeth","Boro Zone"
159 | 158,"Manhattan","Meatpacking/West Village West","Yellow Zone"
160 | 159,"Bronx","Melrose South","Boro Zone"
161 | 160,"Queens","Middle Village","Boro Zone"
162 | 161,"Manhattan","Midtown Center","Yellow Zone"
163 | 162,"Manhattan","Midtown East","Yellow Zone"
164 | 163,"Manhattan","Midtown North","Yellow Zone"
165 | 164,"Manhattan","Midtown South","Yellow Zone"
166 | 165,"Brooklyn","Midwood","Boro Zone"
167 | 166,"Manhattan","Morningside Heights","Boro Zone"
168 | 167,"Bronx","Morrisania/Melrose","Boro Zone"
169 | 168,"Bronx","Mott Haven/Port Morris","Boro Zone"
170 | 169,"Bronx","Mount Hope","Boro Zone"
171 | 170,"Manhattan","Murray Hill","Yellow Zone"
172 | 171,"Queens","Murray Hill-Queens","Boro Zone"
173 | 172,"Staten Island","New Dorp/Midland Beach","Boro Zone"
174 | 173,"Queens","North Corona","Boro Zone"
175 | 174,"Bronx","Norwood","Boro Zone"
176 | 175,"Queens","Oakland Gardens","Boro Zone"
177 | 176,"Staten Island","Oakwood","Boro Zone"
178 | 177,"Brooklyn","Ocean Hill","Boro Zone"
179 | 178,"Brooklyn","Ocean Parkway South","Boro Zone"
180 | 179,"Queens","Old Astoria","Boro Zone"
181 | 180,"Queens","Ozone Park","Boro Zone"
182 | 181,"Brooklyn","Park Slope","Boro Zone"
183 | 182,"Bronx","Parkchester","Boro Zone"
184 | 183,"Bronx","Pelham Bay","Boro Zone"
185 | 184,"Bronx","Pelham Bay Park","Boro Zone"
186 | 185,"Bronx","Pelham Parkway","Boro Zone"
187 | 186,"Manhattan","Penn Station/Madison Sq West","Yellow Zone"
188 | 187,"Staten Island","Port Richmond","Boro Zone"
189 | 188,"Brooklyn","Prospect-Lefferts Gardens","Boro Zone"
190 | 189,"Brooklyn","Prospect Heights","Boro Zone"
191 | 190,"Brooklyn","Prospect Park","Boro Zone"
192 | 191,"Queens","Queens Village","Boro Zone"
193 | 192,"Queens","Queensboro Hill","Boro Zone"
194 | 193,"Queens","Queensbridge/Ravenswood","Boro Zone"
195 | 194,"Manhattan","Randalls Island","Yellow Zone"
196 | 195,"Brooklyn","Red Hook","Boro Zone"
197 | 196,"Queens","Rego Park","Boro Zone"
198 | 197,"Queens","Richmond Hill","Boro Zone"
199 | 198,"Queens","Ridgewood","Boro Zone"
200 | 199,"Bronx","Rikers Island","Boro Zone"
201 | 200,"Bronx","Riverdale/North Riverdale/Fieldston","Boro Zone"
202 | 201,"Queens","Rockaway Park","Boro Zone"
203 | 202,"Manhattan","Roosevelt Island","Boro Zone"
204 | 203,"Queens","Rosedale","Boro Zone"
205 | 204,"Staten Island","Rossville/Woodrow","Boro Zone"
206 | 205,"Queens","Saint Albans","Boro Zone"
207 | 206,"Staten Island","Saint George/New Brighton","Boro Zone"
208 | 207,"Queens","Saint Michaels Cemetery/Woodside","Boro Zone"
209 | 208,"Bronx","Schuylerville/Edgewater Park","Boro Zone"
210 | 209,"Manhattan","Seaport","Yellow Zone"
211 | 210,"Brooklyn","Sheepshead Bay","Boro Zone"
212 | 211,"Manhattan","SoHo","Yellow Zone"
213 | 212,"Bronx","Soundview/Bruckner","Boro Zone"
214 | 213,"Bronx","Soundview/Castle Hill","Boro Zone"
215 | 214,"Staten Island","South Beach/Dongan Hills","Boro Zone"
216 | 215,"Queens","South Jamaica","Boro Zone"
217 | 216,"Queens","South Ozone Park","Boro Zone"
218 | 217,"Brooklyn","South Williamsburg","Boro Zone"
219 | 218,"Queens","Springfield Gardens North","Boro Zone"
220 | 219,"Queens","Springfield Gardens South","Boro Zone"
221 | 220,"Bronx","Spuyten Duyvil/Kingsbridge","Boro Zone"
222 | 221,"Staten Island","Stapleton","Boro Zone"
223 | 222,"Brooklyn","Starrett City","Boro Zone"
224 | 223,"Queens","Steinway","Boro Zone"
225 | 224,"Manhattan","Stuy Town/Peter Cooper Village","Yellow Zone"
226 | 225,"Brooklyn","Stuyvesant Heights","Boro Zone"
227 | 226,"Queens","Sunnyside","Boro Zone"
228 | 227,"Brooklyn","Sunset Park East","Boro Zone"
229 | 228,"Brooklyn","Sunset Park West","Boro Zone"
230 | 229,"Manhattan","Sutton Place/Turtle Bay North","Yellow Zone"
231 | 230,"Manhattan","Times Sq/Theatre District","Yellow Zone"
232 | 231,"Manhattan","TriBeCa/Civic Center","Yellow Zone"
233 | 232,"Manhattan","Two Bridges/Seward Park","Yellow Zone"
234 | 233,"Manhattan","UN/Turtle Bay South","Yellow Zone"
235 | 234,"Manhattan","Union Sq","Yellow Zone"
236 | 235,"Bronx","University Heights/Morris Heights","Boro Zone"
237 | 236,"Manhattan","Upper East Side North","Yellow Zone"
238 | 237,"Manhattan","Upper East Side South","Yellow Zone"
239 | 238,"Manhattan","Upper West Side North","Yellow Zone"
240 | 239,"Manhattan","Upper West Side South","Yellow Zone"
241 | 240,"Bronx","Van Cortlandt Park","Boro Zone"
242 | 241,"Bronx","Van Cortlandt Village","Boro Zone"
243 | 242,"Bronx","Van Nest/Morris Park","Boro Zone"
244 | 243,"Manhattan","Washington Heights North","Boro Zone"
245 | 244,"Manhattan","Washington Heights South","Boro Zone"
246 | 245,"Staten Island","West Brighton","Boro Zone"
247 | 246,"Manhattan","West Chelsea/Hudson Yards","Yellow Zone"
248 | 247,"Bronx","West Concourse","Boro Zone"
249 | 248,"Bronx","West Farms/Bronx River","Boro Zone"
250 | 249,"Manhattan","West Village","Yellow Zone"
251 | 250,"Bronx","Westchester Village/Unionport","Boro Zone"
252 | 251,"Staten Island","Westerleigh","Boro Zone"
253 | 252,"Queens","Whitestone","Boro Zone"
254 | 253,"Queens","Willets Point","Boro Zone"
255 | 254,"Bronx","Williamsbridge/Olinville","Boro Zone"
256 | 255,"Brooklyn","Williamsburg (North Side)","Boro Zone"
257 | 256,"Brooklyn","Williamsburg (South Side)","Boro Zone"
258 | 257,"Brooklyn","Windsor Terrace","Boro Zone"
259 | 258,"Queens","Woodhaven","Boro Zone"
260 | 259,"Bronx","Woodlawn/Wakefield","Boro Zone"
261 | 260,"Queens","Woodside","Boro Zone"
262 | 261,"Manhattan","World Trade Center","Yellow Zone"
263 | 262,"Manhattan","Yorkville East","Yellow Zone"
264 | 263,"Manhattan","Yorkville West","Yellow Zone"
265 | 264,"Unknown","NV","N/A"
266 | 265,"Unknown","NA","N/A"
--------------------------------------------------------------------------------
/week4/dbt/dbt_project.yml:
--------------------------------------------------------------------------------
1 | name: 'taxi_rides_ny'
2 | version: '1.0.0'
3 | config-version: 2
4 |
5 | # This setting configures which "profile" dbt uses for this project.
6 | profile: 'my-profile'
7 |
8 | # These configurations specify where dbt should look for different types of files.
9 | # The `model-paths` config, for example, states that models in this project can be
10 | # found in the "models/" directory. You probably won't need to change these!
11 | model-paths: ["models"]
12 | analysis-paths: ["analyses"]
13 | test-paths: ["tests"]
14 | seed-paths: ["data"]
15 | macro-paths: ["macros"]
16 | snapshot-paths: ["snapshots"]
17 |
18 | target-path: "target" # directory which will store compiled SQL files
19 | clean-targets: # directories to be removed by `dbt clean`
20 | - "target"
21 | - "dbt_packages"
22 |
23 |
24 | # Configuring models
25 | # Full documentation: https://docs.getdbt.com/docs/configuring-models
26 |
27 | # In this example config, we tell dbt to build all models in the example/ directory
28 | # as tables. These settings can be overridden in the individual model files
29 | # using the `{{ config(...) }}` macro.
30 | models:
31 | taxi_rides_ny:
32 | # Applies to all files under models/.../
33 | vars:
34 | payment_type_values: [1, 2, 3, 4, 5, 6]
35 |
36 | seeds:
37 | taxi_rides_ny:
38 |     taxi_zone:
39 | +column_types:
40 | LocationID: numeric
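41 |
42 | # Note: `payment_type_values` is consumed by the accepted_values tests in
43 | # models/staging/schema.yml via "{{ var('payment_type_values') }}".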
--------------------------------------------------------------------------------
/week4/dbt/macros/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pcrespoo/data-engineering-bootcamp/a1835f9c1557d4974b73837a0622ac5d24db9dfa/week4/dbt/macros/.gitkeep
--------------------------------------------------------------------------------
/week4/dbt/macros/get_payment_type_description.sql:
--------------------------------------------------------------------------------
1 | {#
2 | This macro returns the description of the payment_type
3 | #}
4 |
5 | {% macro get_payment_type_description(payment_type) -%}
6 |
7 | case {{ payment_type }}
8 | when 1 then 'Credit card'
9 | when 2 then 'Cash'
10 | when 3 then 'No charge'
11 | when 4 then 'Dispute'
12 | when 5 then 'Unknown'
13 | when 6 then 'Voided trip'
14 | end
15 |
16 | {%- endmacro %}
17 |
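18 | {#
19 | Usage example (as in the staging models):
20 |   {{ get_payment_type_description('payment_type') }} as payment_type_description
21 | #}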
--------------------------------------------------------------------------------
/week4/dbt/models/core/dim_zones.sql:
--------------------------------------------------------------------------------
1 | {{ config(materialized='table') }}
2 | select
3 | Locationid,
4 | Borough,
5 | zone,
6 | replace(service_zone,'Boro','Green') as service_zone
7 | from {{ ref('taxi_zone') }}
--------------------------------------------------------------------------------
/week4/dbt/models/core/dm_monthly_zone_revenue.sql:
--------------------------------------------------------------------------------
1 | {{ config(materialized='table') }}
2 |
3 | with trips_data as (
4 | select * from {{ ref('fact_trips') }}
5 | )
6 | select
7 |     -- Revenue grouping
8 |     pickup_zone as revenue_zone,
9 |     date_trunc(pickup_datetime, month) as revenue_month,
10 |     -- note: date_trunc above uses the BigQuery argument order (column first, then date part)
11 |
12 | service_type,
13 |
14 | -- Revenue calculation
15 | sum(fare_amount) as revenue_monthly_fare,
16 | sum(extra) as revenue_monthly_extra,
17 | sum(mta_tax) as revenue_monthly_mta_tax,
18 | sum(tip_amount) as revenue_monthly_tip_amount,
19 | sum(tolls_amount) as revenue_monthly_tolls_amount,
20 | sum(improvement_surcharge) as revenue_monthly_improvement_surcharge,
21 | sum(total_amount) as revenue_monthly_total_amount,
22 | sum(congestion_surcharge) as revenue_monthly_congestion_surcharge,
23 |
24 | -- Additional calculations
25 | count(tripid) as total_monthly_trips,
26 | avg(passenger_count) as avg_montly_passenger_count,
27 | avg(trip_distance) as avg_montly_trip_distance
28 |
29 | from trips_data
30 | group by 1,2,3
--------------------------------------------------------------------------------
/week4/dbt/models/core/fact_trips.sql:
--------------------------------------------------------------------------------
1 | {{ config(materialized='table') }}
2 |
3 | with green_data as (
4 | select *,
5 | 'Green' as service_type
6 | from {{ ref('stg_green_tripdata') }}
7 | ),
8 |
9 | yellow_data as (
10 | select *,
11 | 'Yellow' as service_type
12 | from {{ ref('stg_yellow_tripdata') }}
13 | ),
14 |
15 | trips_unioned as (
16 | select * from green_data
17 | union all
18 | select * from yellow_data
19 | ),
20 |
21 | dim_zones as (
22 | select * from {{ ref('dim_zones') }}
23 | where Borough != 'Unknown'
24 | )
25 | select
26 | trips_unioned.tripid,
27 | trips_unioned.vendorid,
28 | trips_unioned.service_type,
29 | trips_unioned.ratecodeid,
30 | trips_unioned.pickup_locationid,
31 | pickup_zone.borough as pickup_borough,
32 | pickup_zone.zone as pickup_zone,
33 | trips_unioned.dropoff_locationid,
34 | dropoff_zone.borough as dropoff_borough,
35 | dropoff_zone.zone as dropoff_zone,
36 | trips_unioned.pickup_datetime,
37 | trips_unioned.dropoff_datetime,
38 | trips_unioned.store_and_fwd_flag,
39 | trips_unioned.passenger_count,
40 | trips_unioned.trip_distance,
41 | trips_unioned.trip_type,
42 | trips_unioned.fare_amount,
43 | trips_unioned.extra,
44 | trips_unioned.mta_tax,
45 | trips_unioned.tip_amount,
46 | trips_unioned.tolls_amount,
47 | trips_unioned.improvement_surcharge,
48 | trips_unioned.total_amount,
49 | trips_unioned.payment_type,
50 | trips_unioned.payment_type_description,
51 | trips_unioned.congestion_surcharge
52 | from trips_unioned
53 | inner join dim_zones as pickup_zone
54 | on trips_unioned.pickup_LocationID = pickup_zone.LocationID
55 | inner join dim_zones as dropoff_zone
56 | on trips_unioned.dropoff_LocationID = dropoff_zone.LocationID
--------------------------------------------------------------------------------
/week4/dbt/models/core/schema.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | models:
4 | - name: dim_zones
5 | description: >
6 |       List of unique zones identified by locationid.
7 | Includes the service zone they correspond to (Green or yellow).
8 | - name: fact_trips
9 | description: >
10 | Taxi trips corresponding to both service zones (Green and yellow).
11 | The table contains records where both pickup and dropoff locations are valid and known zones.
12 | Each record corresponds to a trip uniquely identified by tripid.
13 |
14 | - name: dm_monthly_zone_revenue
15 | description: >
16 | Aggregated table of all taxi trips corresponding to both service zones (Green and yellow) per pickup zone, month and service.
17 | The table contains monthly sums of the fare elements used to calculate the monthly revenue.
18 | The table contains also monthly indicators like number of trips, and average trip distance.
19 | columns:
20 | - name: revenue_monthly_total_amount
21 |           description: Monthly sum of the total_amount of the fare charged for the trip per pickup zone, month and service.
22 | tests:
23 | - not_null:
24 | severity: error
--------------------------------------------------------------------------------
/week4/dbt/models/staging/schema.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | sources:
4 | - name: staging
5 | database: dtc-boot-7639
6 | schema: trips_data_all
7 |
8 | tables:
9 | - name: green_partitioned_clustered_table
10 | - name: yellow_partitioned_clustered_table
11 | models:
12 | - name: stg_green_tripdata
13 | description: >
14 |       Trips made by green taxis, also known as boro taxis and street-hail liveries.
15 |       Green taxis may respond to street hails, but only in the areas indicated in green on the
16 | map (i.e. above W 110 St/E 96th St in Manhattan and in the boroughs).
17 | The records were collected and provided to the NYC Taxi and Limousine Commission (TLC) by
18 | technology service providers.
19 | columns:
20 | - name: tripid
21 | description: Primary key for this table, generated with a concatenation of vendorid+pickup_datetime
22 | tests:
23 | - unique:
24 | severity: warn
25 | - not_null:
26 | severity: warn
27 | - name: VendorID
28 | description: >
29 | A code indicating the TPEP provider that provided the record.
30 | 1= Creative Mobile Technologies, LLC;
31 | 2= VeriFone Inc.
32 | - name: pickup_datetime
33 | description: The date and time when the meter was engaged.
34 | - name: dropoff_datetime
35 | description: The date and time when the meter was disengaged.
36 | - name: Passenger_count
37 | description: The number of passengers in the vehicle. This is a driver-entered value.
38 | - name: Trip_distance
39 | description: The elapsed trip distance in miles reported by the taximeter.
40 | - name: Pickup_locationid
41 | description: locationid where the meter was engaged.
42 | tests:
43 | - relationships:
44 | to: ref('taxi_zone')
45 | field: LocationID
46 | severity: warn
47 | - name: dropoff_locationid
48 |         description: locationid where the meter was disengaged.
49 | tests:
50 | - relationships:
51 | to: ref('taxi_zone')
52 | field: LocationID
53 | - name: RateCodeID
54 | description: >
55 | The final rate code in effect at the end of the trip.
56 | 1= Standard rate
57 | 2=JFK
58 | 3=Newark
59 | 4=Nassau or Westchester
60 | 5=Negotiated fare
61 | 6=Group ride
62 | - name: Store_and_fwd_flag
63 | description: >
64 | This flag indicates whether the trip record was held in vehicle
65 | memory before sending to the vendor, aka “store and forward,”
66 | because the vehicle did not have a connection to the server.
67 | Y= store and forward trip
68 | N= not a store and forward trip
69 | - name: Dropoff_longitude
70 | description: Longitude where the meter was disengaged.
71 | - name: Dropoff_latitude
72 | description: Latitude where the meter was disengaged.
73 | - name: Payment_type
74 | description: >
75 | A numeric code signifying how the passenger paid for the trip.
76 | tests:
77 | - accepted_values:
78 | values: "{{ var('payment_type_values') }}"
79 | severity: warn
80 | quote: false
81 | - name: payment_type_description
82 | description: Description of the payment_type code
83 |       - name: Fare_amount
84 |         description: The time-and-distance fare calculated by the meter.
85 |       - name: Extra
86 |         description: >
87 |           Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges.
88 |       - name: MTA_tax
89 |         description: $0.50 MTA tax that is automatically triggered based on the metered rate in use.
90 | - name: Improvement_surcharge
91 | description: >
92 | $0.30 improvement surcharge assessed trips at the flag drop. The
93 | improvement surcharge began being levied in 2015.
94 | - name: Tip_amount
95 | description: >
96 | Tip amount. This field is automatically populated for credit card
97 | tips. Cash tips are not included.
98 | - name: Tolls_amount
99 | description: Total amount of all tolls paid in trip.
100 | - name: Total_amount
101 | description: The total amount charged to passengers. Does not include cash tips.
102 |
103 | - name: stg_yellow_tripdata
104 | description: >
105 | Trips made by New York City's iconic yellow taxis.
106 | Yellow taxis are the only vehicles permitted to respond to a street hail from a passenger in all five
107 | boroughs. They may also be hailed using an e-hail app like Curb or Arro.
108 | The records were collected and provided to the NYC Taxi and Limousine Commission (TLC) by
109 | technology service providers.
110 | columns:
111 | - name: tripid
112 | description: Primary key for this table, generated with a concatenation of vendorid+pickup_datetime
113 | tests:
114 | - unique:
115 | severity: warn
116 | - not_null:
117 | severity: warn
118 | - name: VendorID
119 | description: >
120 | A code indicating the TPEP provider that provided the record.
121 | 1= Creative Mobile Technologies, LLC;
122 | 2= VeriFone Inc.
123 | - name: pickup_datetime
124 | description: The date and time when the meter was engaged.
125 | - name: dropoff_datetime
126 | description: The date and time when the meter was disengaged.
127 | - name: Passenger_count
128 | description: The number of passengers in the vehicle. This is a driver-entered value.
129 | - name: Trip_distance
130 | description: The elapsed trip distance in miles reported by the taximeter.
131 | - name: Pickup_locationid
132 | description: locationid where the meter was engaged.
133 | tests:
134 | - relationships:
135 | to: ref('taxi_zone')
136 | field: LocationID
137 | severity: warn
138 | - name: dropoff_locationid
139 |         description: locationid where the meter was disengaged.
140 | tests:
141 | - relationships:
142 | to: ref('taxi_zone')
143 | field: LocationID
144 | severity: warn
145 | - name: RateCodeID
146 | description: >
147 | The final rate code in effect at the end of the trip.
148 | 1= Standard rate
149 | 2=JFK
150 | 3=Newark
151 | 4=Nassau or Westchester
152 | 5=Negotiated fare
153 | 6=Group ride
154 | - name: Store_and_fwd_flag
155 | description: >
156 | This flag indicates whether the trip record was held in vehicle
157 | memory before sending to the vendor, aka “store and forward,”
158 | because the vehicle did not have a connection to the server.
159 | Y= store and forward trip
160 | N= not a store and forward trip
161 | - name: Dropoff_longitude
162 | description: Longitude where the meter was disengaged.
163 | - name: Dropoff_latitude
164 | description: Latitude where the meter was disengaged.
165 | - name: Payment_type
166 | description: >
167 | A numeric code signifying how the passenger paid for the trip.
168 | tests:
169 | - accepted_values:
170 | values: "{{ var('payment_type_values') }}"
171 | severity: warn
172 | quote: false
173 | - name: payment_type_description
174 | description: Description of the payment_type code
175 |       - name: Fare_amount
176 |         description: The time-and-distance fare calculated by the meter.
177 |       - name: Extra
178 |         description: >
179 |           Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges.
180 |       - name: MTA_tax
181 |         description: $0.50 MTA tax that is automatically triggered based on the metered rate in use.
182 | - name: Improvement_surcharge
183 | description: >
184 | $0.30 improvement surcharge assessed trips at the flag drop. The
185 | improvement surcharge began being levied in 2015.
186 | - name: Tip_amount
187 | description: >
188 | Tip amount. This field is automatically populated for credit card
189 | tips. Cash tips are not included.
190 | - name: Tolls_amount
191 | description: Total amount of all tolls paid in trip.
192 | - name: Total_amount
193 | description: The total amount charged to passengers. Does not include cash tips.
194 |
--------------------------------------------------------------------------------
/week4/dbt/models/staging/stg_green_tripdata.sql:
--------------------------------------------------------------------------------
1 | {{ config(materialized='view') }}
2 |
3 | with tripdata as
4 | (
5 | select *,
6 | row_number() over(partition by vendorid, lpep_pickup_datetime) as rn
7 | from {{source('staging','green_partitioned_clustered_table')}}
8 | where vendorid is not null
9 | )
10 |
11 | select
12 | -- identifiers
13 | {{ dbt_utils.surrogate_key(['vendorid', 'lpep_pickup_datetime']) }} as tripid,
14 | cast(vendorid as integer) as vendorid,
15 | cast(ratecodeid as integer) as ratecodeid,
16 | cast(pulocationid as integer) as pickup_locationid,
17 | cast(dolocationid as integer) as dropoff_locationid,
18 |
19 | -- timestamps
20 | cast(lpep_pickup_datetime as timestamp) as pickup_datetime,
21 | cast(lpep_dropoff_datetime as timestamp) as dropoff_datetime,
22 |
23 | -- trip info
24 | store_and_fwd_flag,
25 | cast(passenger_count as integer) as passenger_count,
26 | cast(trip_distance as numeric) as trip_distance,
27 | cast(trip_type as integer) as trip_type,
28 |
29 | -- payment info
30 | cast(fare_amount as numeric) as fare_amount,
31 | cast(extra as numeric) as extra,
32 | cast(mta_tax as numeric) as mta_tax,
33 | cast(tip_amount as numeric) as tip_amount,
34 | cast(tolls_amount as numeric) as tolls_amount,
35 | cast(improvement_surcharge as numeric) as improvement_surcharge,
36 | cast(total_amount as numeric) as total_amount,
37 | cast(payment_type as integer) as payment_type,
38 | {{ get_payment_type_description('payment_type') }} as payment_type_description,
39 | cast(congestion_surcharge as numeric) as congestion_surcharge
40 | from tripdata
41 | where rn = 1
42 | -- dbt build --m --var 'is_test_run: false'
43 | {% if var('is_test_run', default=true) %}
44 |
45 | limit 100
46 |
47 | {% endif %}
--------------------------------------------------------------------------------
/week4/dbt/models/staging/stg_yellow_tripdata.sql:
--------------------------------------------------------------------------------
1 |
2 | {{ config(materialized='view') }}
3 |
4 | with tripdata as
5 | (
6 | select *,
7 | row_number() over(partition by vendorid, tpep_pickup_datetime) as rn
8 | from {{source('staging','yellow_partitioned_clustered_table')}}
9 | where vendorid is not null
10 | )
11 |
12 | select
13 | -- identifiers
14 | {{ dbt_utils.surrogate_key(['vendorid', 'tpep_pickup_datetime']) }} as tripid,
15 | cast(vendorid as integer) as vendorid,
16 | cast(ratecodeid as integer) as ratecodeid,
17 | cast(pulocationid as integer) as pickup_locationid,
18 | cast(dolocationid as integer) as dropoff_locationid,
19 |
20 | -- timestamps
21 | cast(tpep_pickup_datetime as timestamp) as pickup_datetime,
22 | cast(tpep_dropoff_datetime as timestamp) as dropoff_datetime,
23 |
24 | -- trip info
25 | store_and_fwd_flag,
26 | cast(passenger_count as integer) as passenger_count,
27 | cast(trip_distance as numeric) as trip_distance,
28 | -- yellow cabs are always street-hail
29 | 1 as trip_type,
30 |
31 | -- payment info
32 | cast(fare_amount as numeric) as fare_amount,
33 | cast(extra as numeric) as extra,
34 | cast(mta_tax as numeric) as mta_tax,
35 | cast(tip_amount as numeric) as tip_amount,
36 | cast(tolls_amount as numeric) as tolls_amount,
37 | cast(improvement_surcharge as numeric) as improvement_surcharge,
38 | cast(total_amount as numeric) as total_amount,
39 | cast(payment_type as integer) as payment_type,
40 | {{ get_payment_type_description('payment_type') }} as payment_type_description,
41 | cast(congestion_surcharge as numeric) as congestion_surcharge
42 | from tripdata
43 | where rn = 1
44 | -- dbt build --m --var 'is_test_run: false'
45 | {% if var('is_test_run', default=true) %}
46 |
47 | limit 100
48 |
49 | {% endif %}
--------------------------------------------------------------------------------
/week4/dbt/packages.yml:
--------------------------------------------------------------------------------
1 | packages:
2 | - package: dbt-labs/dbt_utils
3 | version: 0.8.0
--------------------------------------------------------------------------------
/week4/dbt/profiles.yml:
--------------------------------------------------------------------------------
1 | my-profile:
2 | target: dev
3 | outputs:
4 | dev:
5 | type: bigquery
6 | method: oauth
7 | project: dtc-boot-7639
8 | dataset: dbt_pcrespoo
9 | location: southamerica-east1
--------------------------------------------------------------------------------
/week4/dbt/seeds/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pcrespoo/data-engineering-bootcamp/a1835f9c1557d4974b73837a0622ac5d24db9dfa/week4/dbt/seeds/.gitkeep
--------------------------------------------------------------------------------
/week4/dbt/snapshots/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pcrespoo/data-engineering-bootcamp/a1835f9c1557d4974b73837a0622ac5d24db9dfa/week4/dbt/snapshots/.gitkeep
--------------------------------------------------------------------------------
/week4/dbt/tests/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pcrespoo/data-engineering-bootcamp/a1835f9c1557d4974b73837a0622ac5d24db9dfa/week4/dbt/tests/.gitkeep
--------------------------------------------------------------------------------
/week6/README.md:
--------------------------------------------------------------------------------
1 | ## Topics covered in week 6:
2 | - Kafka
3 | - Basic terminology
4 | - Kafka workflow
5 | - Avro and Kafka
6 | - Kafka Streams
7 | - KSQL
8 | ### Notes:
9 | Notion page: https://www.notion.so/pcrespoo/Week-6-7b58575a54f64b64825a30ff02370a27
10 |
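11 | For reference, a minimal producer sketch for the Avro example. The broker, schema registry and topic match `avro_example/consumer.py`; the record fields shown are placeholders and must match the fields defined in the `.avsc` schemas (run from the `avro_example` directory so the schema paths resolve):
12 |
13 | ```python
14 | from confluent_kafka import avro
15 | from confluent_kafka.avro import AvroProducer
16 |
17 | # schemas shipped with the example
18 | key_schema = avro.load("taxi_ride_key.avsc")
19 | value_schema = avro.load("taxi_ride_value.avsc")
20 |
21 | producer = AvroProducer(
22 |     {"bootstrap.servers": "localhost:9092",
23 |      "schema.registry.url": "http://localhost:8081"},
24 |     default_key_schema=key_schema,
25 |     default_value_schema=value_schema,
26 | )
27 |
28 | # field names below are placeholders; use the ones defined in the .avsc files
29 | producer.produce(topic="datatalkclub.yellow_taxi_rides",
30 |                  key={"vendor_id": 1},
31 |                  value={"vendor_id": 1, "passenger_count": 1, "trip_distance": 1.5})
32 | producer.flush()
33 | ```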
--------------------------------------------------------------------------------
/week6/avro_example/consumer.py:
--------------------------------------------------------------------------------
1 | from confluent_kafka.avro import AvroConsumer
2 |
3 |
4 | def read_messages():
5 | consumer_config = {"bootstrap.servers": "localhost:9092",
6 | "schema.registry.url": "http://localhost:8081",
7 | "group.id": "datatalkclubs.taxirides.avro.consumer.2",
8 | "auto.offset.reset": "earliest"}
9 |
10 | consumer = AvroConsumer(consumer_config)
11 | consumer.subscribe(["datatalkclub.yellow_taxi_rides"])
12 |
13 |     while True:
14 | try:
15 | message = consumer.poll(5)
16 | except Exception as e:
17 | print(f"Exception while trying to poll messages - {e}")
18 | else:
19 | if message:
20 |                 print(f"Successfully polled a record from "
21 | f"Kafka topic: {message.topic()}, partition: {message.partition()}, offset: {message.offset()}\n"
22 | f"message key: {message.key()} || message value: {message.value()}")
23 | consumer.commit()
24 | else:
25 | print("No new messages at this point. Try again later.")
26 | consumer.close()
27 |
28 |
29 | if __name__ == "__main__":
30 | read_messages()
--------------------------------------------------------------------------------
/week6/avro_example/data/rides_new.csv:
--------------------------------------------------------------------------------
1 | VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
2 | 1,2020-07-01 00:25:32,2020-07-01 00:33:39,1,1.5,1,N,238,75,2,8.0,0.5,0.5,0.0,0.0,0.3,9.3,0.0
3 | 1,2020-07-01 00:03:19,2020-07-01 00:25:43,1,9.5,1,N,138,216,1,26.5,0.5,0.5,0.0,0.0,0.3,27.8,0.0
4 | 2,2020-07-01 00:15:11,2020-07-01 00:29:24,1,5.85,1,N,230,88,2,18.5,0.5,0.5,0.0,0.0,0.3,22.3,2.5
5 | 2,2020-07-01 00:30:49,2020-07-01 00:38:26,1,1.9,1,N,88,232,1,8.0,0.5,0.5,2.36,0.0,0.3,14.16,2.5
6 | 2,2020-07-01 00:31:26,2020-07-01 00:38:02,1,1.25,1,N,37,17,2,6.5,0.5,0.5,0.0,0.0,0.3,7.8,0.0
7 | 1,2020-07-01 00:09:00,2020-07-01 00:34:39,1,9.7,1,N,140,61,1,30.0,3.0,0.5,0.0,0.0,0.3,33.8,2.5
8 | 2,2020-07-01 00:44:08,2020-07-01 00:58:12,1,5.27,1,N,137,260,1,16.5,0.5,0.5,6.09,0.0,0.3,26.39,2.5
9 | 2,2020-07-01 00:49:20,2020-07-01 00:56:44,1,1.32,1,N,166,41,2,7.5,0.5,0.5,0.0,0.0,0.3,8.8,0.0
10 | 2,2020-07-01 00:21:59,2020-07-01 00:25:12,1,0.73,1,N,239,142,1,5.0,0.5,0.5,1.32,0.0,0.3,10.12,2.5
11 | 2,2020-07-01 00:08:28,2020-07-01 00:36:18,1,18.65,2,N,132,249,1,52.0,0.0,0.5,11.06,0.0,0.3,66.36,2.5
12 | 1,2020-07-01 00:26:44,2020-07-01 00:43:46,2,8.0,1,N,138,112,1,24.0,0.5,0.5,3.0,0.0,0.3,28.3,0.0
13 | 2,2020-07-01 00:40:49,2020-07-01 00:51:59,3,4.97,1,N,79,195,2,16.0,0.5,0.5,0.0,0.0,0.3,19.8,2.5
14 | 2,2020-07-01 00:03:34,2020-07-01 00:03:42,1,0.0,2,N,45,45,1,52.0,0.0,0.5,11.06,0.0,0.3,66.36,2.5
15 | 2,2020-07-01 00:08:53,2020-07-01 00:12:42,1,0.57,1,N,263,263,2,4.5,0.5,0.5,0.0,0.0,0.3,8.3,2.5
16 | 2,2020-07-01 00:16:31,2020-07-01 00:16:41,1,0.0,1,N,263,263,1,2.5,0.5,0.5,1.89,0.0,0.3,8.19,2.5
17 | 2,2020-07-01 00:36:43,2020-07-01 01:02:48,1,9.41,1,N,170,116,1,29.5,0.5,0.5,3.0,0.0,0.3,36.3,2.5
18 | 1,2020-07-01 00:16:31,2020-07-01 00:16:43,1,2.8,1,Y,141,141,2,2.5,2.5,0.5,0.0,0.0,0.3,5.8,2.5
19 | 1,2020-07-01 00:33:37,2020-07-01 00:55:26,2,13.5,1,Y,137,254,2,36.5,2.5,0.5,0.0,0.0,0.3,39.8,2.5
20 | 2,2020-07-01 00:15:15,2020-07-01 00:17:44,1,0.48,1,N,140,140,1,4.0,0.5,0.5,1.56,0.0,0.3,9.36,2.5
21 | 2,2020-07-01 00:38:24,2020-07-01 00:46:57,1,1.67,1,N,238,75,1,8.0,0.5,0.5,2.79,0.0,0.3,12.09,0.0
22 | 2,2020-07-01 00:17:10,2020-07-01 00:25:45,1,3.75,1,N,137,75,1,12.5,0.5,0.5,4.89,0.0,0.3,21.19,2.5
23 | 2,2020-07-01 00:40:45,2020-07-01 00:50:45,1,3.45,1,N,263,48,1,12.0,0.5,0.5,3.16,0.0,0.3,18.96,2.5
24 | 1,2020-07-01 00:53:13,2020-07-01 01:13:32,2,6.7,1,N,249,74,1,21.0,3.0,0.5,6.0,0.0,0.3,30.8,2.5
25 | 1,2020-07-01 00:07:57,2020-07-01 00:14:42,2,1.7,1,N,142,186,2,7.5,3.0,0.5,0.0,0.0,0.3,11.3,2.5
26 | 1,2020-07-01 00:26:14,2020-07-01 00:30:19,1,1.1,1,N,140,262,1,6.0,3.0,0.5,1.0,0.0,0.3,10.8,2.5
27 | 2,2020-07-01 00:15:30,2020-07-01 00:22:33,1,1.36,1,N,43,237,2,7.5,0.5,0.5,0.0,0.0,0.3,11.3,2.5
28 | 2,2020-07-01 00:30:01,2020-07-01 00:39:54,1,2.01,1,N,141,239,2,9.5,0.5,0.5,0.0,0.0,0.3,13.3,2.5
29 | 2,2020-07-01 00:32:58,2020-07-01 00:40:18,3,0.77,1,N,114,234,2,4.5,0.5,0.5,0.0,0.0,0.3,8.3,2.5
30 | 1,2020-07-01 00:43:40,2020-07-01 00:46:32,1,1.1,1,N,249,186,2,5.0,3.0,0.5,0.0,0.0,0.3,8.8,2.5
31 | 1,2020-07-01 00:48:48,2020-07-01 00:59:46,1,3.5,1,N,186,262,2,12.0,3.0,0.5,0.0,0.0,0.3,15.8,2.5
32 | 2,2020-07-01 00:25:25,2020-07-01 00:30:18,1,0.94,1,N,68,186,1,6.0,0.5,0.5,1.96,0.0,0.3,11.76,2.5
33 | 2,2020-07-01 00:32:51,2020-07-01 00:38:43,1,1.5,1,N,90,230,1,7.0,0.5,0.5,0.0,0.0,0.3,10.8,2.5
34 | 2,2020-07-01 00:02:23,2020-07-01 00:09:57,1,1.86,1,N,79,186,2,8.5,0.5,0.5,0.0,0.0,0.3,12.3,2.5
35 | 2,2020-07-01 00:17:00,2020-07-01 00:41:20,1,7.85,1,N,164,225,1,25.0,0.5,0.5,6.98,6.12,0.3,41.9,2.5
36 | 1,2020-07-01 00:40:37,2020-07-01 00:47:02,1,1.4,1,N,249,231,1,7.0,3.0,0.5,0.0,0.0,0.3,10.8,2.5
37 | 1,2020-07-01 00:57:28,2020-07-01 01:02:12,1,0.9,1,N,186,48,1,5.5,3.0,0.5,1.85,0.0,0.3,11.15,2.5
38 | 2,2020-07-01 00:09:44,2020-07-01 00:19:33,1,2.33,1,N,166,74,1,9.5,0.5,0.5,2.16,0.0,0.3,12.96,0.0
39 | 2,2020-07-01 00:13:57,2020-07-01 00:22:10,1,1.41,1,N,74,75,2,7.5,0.5,0.5,0.0,0.0,0.3,8.8,0.0
40 | 2,2020-07-01 00:27:00,2020-07-01 00:42:58,1,5.03,1,N,75,249,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5
41 | 2,2020-07-01 00:45:14,2020-07-01 00:53:13,1,1.44,1,N,114,79,1,7.5,0.5,0.5,0.0,0.0,0.3,11.3,2.5
42 | 2,2020-07-01 00:17:22,2020-07-01 00:38:33,1,9.3,1,N,138,166,2,28.0,0.5,0.5,0.0,6.12,0.3,35.42,0.0
43 | 2,2020-07-01 00:46:05,2020-07-01 00:47:58,1,0.75,1,N,238,236,1,4.0,0.5,0.5,0.8,0.0,0.3,6.1,0.0
44 | 2,2020-07-01 00:53:17,2020-07-01 01:04:18,1,4.44,1,N,263,7,2,14.5,0.5,0.5,0.0,0.0,0.3,18.3,2.5
45 | 2,2020-07-01 00:03:02,2020-07-01 00:07:35,2,2.39,1,N,137,232,1,8.5,0.5,0.5,2.46,0.0,0.3,14.76,2.5
46 | 2,2020-07-01 00:36:59,2020-07-01 00:51:01,3,6.42,1,N,137,97,2,20.5,0.5,0.5,0.0,0.0,0.3,24.3,2.5
47 | 2,2020-07-01 00:48:36,2020-07-01 00:57:29,1,3.77,1,N,107,263,1,11.5,0.5,0.5,1.53,0.0,0.3,16.83,2.5
48 | 2,2020-07-01 00:05:24,2020-07-01 00:14:11,1,2.89,1,N,263,170,2,10.0,0.5,0.5,0.0,0.0,0.3,13.8,2.5
49 | 2,2020-06-30 23:59:23,2020-07-01 00:13:26,1,5.07,1,N,114,236,1,16.5,0.5,0.5,3.0,0.0,0.3,23.3,2.5
50 | 2,2020-07-01 00:15:59,2020-07-01 00:29:58,3,2.55,1,N,75,166,1,12.0,0.5,0.5,0.0,0.0,0.3,13.3,0.0
51 | 2,2020-07-01 00:45:47,2020-07-01 00:54:11,4,1.95,1,N,142,164,2,8.5,0.5,0.5,0.0,0.0,0.3,12.3,2.5
52 | 2,2020-07-01 01:00:18,2020-07-01 01:27:13,1,12.66,1,N,68,248,1,36.5,0.5,0.5,2.75,0.0,0.3,40.55,0.0
53 | 2,2020-07-01 00:26:27,2020-07-01 00:57:14,1,11.6,1,N,114,128,1,35.5,0.5,0.5,0.0,0.0,0.3,39.3,2.5
54 | 2,2020-07-01 00:06:29,2020-07-01 00:26:40,1,10.27,1,N,138,238,1,29.5,0.5,0.5,9.86,6.12,0.3,49.28,2.5
55 | 2,2020-07-01 00:34:01,2020-07-01 00:47:57,1,5.31,1,N,142,4,1,17.5,0.5,0.5,5.32,0.0,0.3,26.62,2.5
56 | 2,2020-07-01 00:05:27,2020-07-01 00:20:15,1,7.48,1,N,140,25,1,22.5,0.5,0.5,6.58,0.0,0.3,32.88,2.5
57 | 2,2020-07-01 00:22:32,2020-07-01 00:46:40,1,5.27,1,N,25,89,1,20.0,0.5,0.5,4.26,0.0,0.3,25.56,0.0
58 | 2,2020-07-01 00:02:08,2020-07-01 00:18:04,1,4.98,1,N,79,13,1,17.0,0.5,0.5,4.16,0.0,0.3,24.96,2.5
59 | 2,2020-07-01 00:21:09,2020-07-01 00:34:00,1,5.66,1,N,13,137,1,18.0,0.5,0.5,3.0,0.0,0.3,24.8,2.5
60 | 1,2020-07-01 00:29:32,2020-07-01 00:35:36,1,1.3,1,N,211,232,1,7.0,3.0,0.5,2.15,0.0,0.3,12.95,2.5
61 | 1,2020-07-01 00:04:26,2020-07-01 00:17:19,2,2.4,1,N,179,8,2,11.0,0.5,0.5,0.0,0.0,0.3,12.3,0.0
62 | 2,2020-07-01 00:10:41,2020-07-01 00:19:54,1,2.12,1,N,142,140,1,9.0,0.5,0.5,3.2,0.0,0.3,16.0,2.5
63 | 2,2020-07-01 00:24:13,2020-07-01 00:28:34,2,1.67,1,N,141,75,1,6.5,0.5,0.5,2.06,0.0,0.3,12.36,2.5
64 | 2,2020-07-01 00:39:54,2020-07-01 00:49:20,2,2.77,1,N,141,100,1,10.0,0.5,0.5,3.45,0.0,0.3,17.25,2.5
65 | 2,2020-07-01 00:39:29,2020-07-01 00:57:16,1,5.99,1,N,166,7,1,19.5,0.5,0.5,5.0,6.12,0.3,31.92,0.0
66 | 2,2020-07-01 00:18:54,2020-07-01 00:25:14,1,2.33,1,N,114,162,1,8.5,0.5,0.5,0.0,0.0,0.3,12.3,2.5
67 | 2,2020-07-01 00:10:56,2020-07-01 00:13:01,6,0.73,1,N,75,74,2,4.0,0.5,0.5,0.0,0.0,0.3,5.3,0.0
68 | 2,2020-07-01 00:18:59,2020-07-01 00:35:22,1,6.52,1,N,137,80,4,-21.0,-0.5,-0.5,0.0,0.0,-0.3,-24.8,-2.5
69 | 2,2020-07-01 00:18:59,2020-07-01 00:35:22,1,6.52,1,N,137,80,2,21.0,0.5,0.5,0.0,0.0,0.3,24.8,2.5
70 | 2,2020-07-01 00:01:58,2020-07-01 00:27:31,1,12.23,1,N,138,48,1,35.5,0.5,0.5,6.0,6.12,0.3,51.42,2.5
71 | 2,2020-06-30 23:42:25,2020-07-01 00:05:46,1,6.98,1,N,143,247,2,23.5,0.5,0.5,0.0,0.0,0.3,27.3,2.5
72 | 1,2020-07-01 00:01:40,2020-07-01 00:12:19,2,2.4,1,N,151,116,1,10.5,0.5,0.5,2.95,0.0,0.3,14.75,0.0
73 | 1,2020-07-01 00:30:51,2020-07-01 00:36:44,2,1.5,1,N,238,166,1,7.5,3.0,0.5,0.0,0.0,0.3,11.3,2.5
74 | 1,2020-07-01 00:12:23,2020-07-01 00:22:57,1,2.3,1,N,68,148,2,10.0,3.0,0.5,0.0,0.0,0.3,13.8,2.5
75 | 1,2020-07-01 00:27:23,2020-07-01 00:30:13,1,0.7,1,N,148,79,1,4.5,3.0,0.5,1.0,0.0,0.3,9.3,2.5
76 | 2,2020-07-01 00:24:51,2020-07-01 00:32:30,2,2.1,1,N,48,158,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5
77 | 2,2020-07-01 00:42:04,2020-07-01 00:45:39,2,1.05,1,N,48,48,2,5.0,0.5,0.5,0.0,0.0,0.3,8.8,2.5
78 | 1,2020-07-01 00:04:38,2020-07-01 00:09:25,1,1.0,1,N,249,186,2,6.0,3.0,0.5,0.0,0.0,0.3,9.8,2.5
79 | 1,2020-07-01 00:18:56,2020-07-01 00:20:35,1,0.5,1,N,125,249,1,3.5,3.0,0.5,1.0,0.0,0.3,8.3,2.5
80 | 1,2020-07-01 00:31:45,2020-07-01 00:50:50,2,6.2,1,N,249,37,1,20.5,3.0,0.5,4.85,0.0,0.3,29.15,2.5
81 | 1,2020-07-01 00:13:01,2020-07-01 00:35:14,1,14.6,1,N,162,16,2,40.5,3.0,0.5,0.0,0.0,0.3,44.3,2.5
82 | 2,2020-07-01 00:09:30,2020-07-01 00:12:54,1,0.7,1,N,230,100,2,4.5,0.5,0.5,0.0,0.0,0.3,8.3,2.5
83 | 1,2020-07-01 00:07:46,2020-07-01 00:16:16,1,2.0,1,N,170,141,1,8.5,3.0,0.5,3.65,0.0,0.3,15.95,2.5
84 | 1,2020-07-01 00:34:45,2020-07-01 00:53:39,1,4.9,1,N,249,262,1,17.5,3.0,0.5,2.7,0.0,0.3,24.0,2.5
85 | 2,2020-07-01 00:48:58,2020-07-01 01:01:27,1,4.82,1,N,75,145,1,15.0,0.5,0.5,0.0,0.0,0.3,18.8,2.5
86 | 2,2020-07-01 00:12:29,2020-07-01 00:19:56,1,1.73,1,N,263,238,1,8.0,0.5,0.5,1.0,0.0,0.3,12.8,2.5
87 | 1,2020-07-01 00:11:19,2020-07-01 00:16:24,1,1.1,1,N,138,70,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,0.0
88 | 1,2020-07-01 00:38:12,2020-07-01 01:14:52,2,25.9,1,N,138,156,4,68.0,0.5,0.5,0.0,12.24,0.3,81.54,0.0
89 | 2,2020-07-01 00:14:26,2020-07-01 00:17:41,2,0.77,1,N,137,107,1,4.5,0.5,0.5,1.66,0.0,0.3,9.96,2.5
90 | 1,2020-07-01 00:36:09,2020-07-01 00:42:25,1,2.3,1,N,48,236,1,8.5,3.0,0.5,2.45,0.0,0.3,14.75,2.5
91 | 2,2020-07-01 00:58:59,2020-07-01 01:06:59,1,3.57,1,N,74,119,1,11.5,0.5,0.5,2.56,0.0,0.3,15.36,0.0
92 | 1,2020-07-01 00:55:52,2020-07-01 01:14:25,1,7.8,1,N,68,159,1,24.5,3.0,0.5,5.65,0.0,0.3,33.95,2.5
93 | 2,2020-07-01 00:04:05,2020-07-01 00:18:10,1,6.31,1,N,137,97,1,19.5,0.5,0.5,0.0,0.0,0.3,23.3,2.5
94 | 1,2020-07-01 00:16:21,2020-07-01 00:46:16,1,18.5,4,N,50,265,1,67.0,3.0,0.5,5.0,0.0,0.3,75.8,2.5
95 | 2,2020-07-01 00:16:15,2020-07-01 00:44:00,2,10.8,1,N,186,26,1,32.5,0.5,0.5,3.7,0.0,0.3,40.0,2.5
96 | 1,2020-07-01 00:24:19,2020-07-01 00:32:39,1,1.6,1,N,234,144,1,8.0,3.0,0.5,1.0,0.0,0.3,12.8,2.5
97 | 1,2020-07-01 00:46:44,2020-07-01 01:05:20,1,4.1,1,N,90,238,1,17.0,3.0,0.5,4.15,0.0,0.3,24.95,2.5
98 | 1,2020-07-01 00:08:43,2020-07-01 00:33:43,1,5.7,1,N,186,217,2,21.5,3.0,0.5,0.0,0.0,0.3,25.3,2.5
99 | 2,2020-07-01 00:15:37,2020-07-01 00:34:29,1,14.19,1,N,132,7,1,38.5,0.5,0.5,7.96,0.0,0.3,47.76,0.0
100 | 2,2020-07-01 00:38:02,2020-07-01 00:49:24,1,2.47,1,N,7,145,2,10.5,0.5,0.5,0.0,0.0,0.3,11.8,0.0
101 | 2,2020-07-01 00:15:10,2020-07-01 00:17:55,1,0.88,1,N,141,229,1,4.5,0.5,0.5,0.5,0.0,0.3,8.8,2.5
102 | 2,2020-07-01 00:33:27,2020-07-01 00:41:52,1,2.22,1,N,114,261,1,9.0,0.5,0.5,2.0,0.0,0.3,14.8,2.5
103 | 1,2020-07-01 00:15:07,2020-07-01 00:39:00,1,7.0,1,N,263,173,2,23.5,3.0,0.5,0.0,0.0,0.3,27.3,2.5
104 | 1,2020-07-01 00:47:31,2020-07-01 01:05:14,1,7.9,1,N,138,96,2,24.0,0.5,0.5,0.0,0.0,0.3,25.3,0.0
105 | 2,2020-07-01 00:39:43,2020-07-01 00:42:22,2,1.07,1,N,75,41,1,5.0,0.5,0.5,0.0,0.0,0.3,6.3,0.0
106 | 2,2020-07-01 00:19:37,2020-07-01 00:27:20,1,1.54,1,N,231,232,2,7.5,0.5,0.5,0.0,0.0,0.3,11.3,2.5
107 | 2,2020-07-01 00:33:35,2020-07-02 00:27:20,1,8.31,1,N,148,41,1,25.0,0.5,0.5,2.88,0.0,0.3,31.68,2.5
108 | 2,2020-07-01 00:59:08,2020-07-01 01:07:28,1,1.61,1,N,166,244,2,8.0,0.5,0.5,0.0,0.0,0.3,9.3,0.0
109 | 2,2020-07-01 00:08:12,2020-07-01 00:17:32,1,2.33,1,N,229,142,1,9.5,0.5,0.5,2.66,0.0,0.3,15.96,2.5
110 | 2,2020-07-01 00:27:41,2020-07-01 00:35:32,1,3.21,1,N,249,143,2,10.5,0.5,0.5,0.0,0.0,0.3,14.3,2.5
111 | 2,2020-07-01 00:08:42,2020-07-01 00:20:20,2,4.17,1,N,137,193,2,14.0,0.5,0.5,0.0,0.0,0.3,17.8,2.5
112 | 2,2020-07-01 00:39:57,2020-07-01 00:44:22,1,1.51,1,N,229,262,1,6.5,0.5,0.5,2.06,0.0,0.3,12.36,2.5
113 | 2,2020-07-01 00:45:33,2020-07-01 00:49:30,1,1.3,1,N,262,74,2,6.0,0.5,0.5,0.0,0.0,0.3,9.8,2.5
114 | 1,2020-07-01 00:05:05,2020-07-01 00:16:12,1,4.2,1,N,231,142,1,13.5,3.0,0.5,3.46,0.0,0.3,20.76,2.5
115 | 1,2020-07-01 00:21:25,2020-07-01 00:50:22,1,7.4,1,N,48,244,1,27.5,3.0,0.5,0.0,0.0,0.3,31.3,2.5
116 | 1,2020-07-01 00:12:49,2020-07-01 00:17:58,1,0.9,1,N,41,151,1,6.0,0.5,0.5,1.45,0.0,0.3,8.75,0.0
117 | 1,2020-07-01 00:35:12,2020-07-01 00:43:47,1,1.6,1,N,48,229,2,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5
118 | 1,2020-07-01 00:04:13,2020-07-01 00:07:08,1,0.9,1,N,140,263,1,5.0,3.0,0.5,1.0,0.0,0.3,9.8,2.5
119 | 1,2020-07-01 00:14:39,2020-07-01 00:16:51,1,0.9,1,N,263,141,1,4.5,3.0,0.5,1.65,0.0,0.3,9.95,2.5
120 | 1,2020-07-01 00:26:16,2020-07-01 00:30:30,1,0.7,1,N,262,262,1,5.0,3.0,0.5,2.2,0.0,0.3,11.0,2.5
121 | 2,2020-07-01 00:03:36,2020-07-01 00:16:32,1,3.86,1,N,87,246,1,14.0,0.5,0.5,0.0,0.0,0.3,17.8,2.5
122 | 2,2020-07-01 00:24:03,2020-07-01 00:32:35,1,3.11,1,N,186,261,2,10.5,0.5,0.5,0.0,0.0,0.3,14.3,2.5
123 | 2,2020-07-01 00:38:09,2020-07-01 00:48:00,1,2.89,1,N,125,48,2,11.0,0.5,0.5,0.0,0.0,0.3,14.8,2.5
124 | 2,2020-07-01 00:14:57,2020-07-01 00:49:16,1,3.49,1,N,140,74,2,23.0,0.5,0.5,0.0,0.0,0.3,26.8,2.5
125 | 2,2020-07-01 00:00:16,2020-07-01 00:14:52,1,2.51,1,N,80,148,1,12.0,0.5,0.5,3.16,0.0,0.3,18.96,2.5
126 | 2,2020-07-01 00:25:32,2020-07-01 00:31:39,1,2.17,1,N,233,263,1,8.0,0.5,0.5,2.36,0.0,0.3,14.16,2.5
127 | 2,2020-07-01 00:05:52,2020-07-01 00:18:37,1,2.57,1,N,244,41,1,11.5,0.5,0.5,2.56,0.0,0.3,15.36,0.0
128 | 2,2020-07-01 00:23:36,2020-07-01 00:31:16,1,1.04,1,N,41,74,2,7.0,0.5,0.5,0.0,0.0,0.3,8.3,0.0
129 | 2,2020-07-01 00:34:33,2020-07-01 00:48:41,1,4.79,1,N,75,193,2,15.5,0.5,0.5,0.0,0.0,0.3,19.3,2.5
130 | 2,2020-07-01 00:50:13,2020-07-01 01:18:50,1,17.21,1,N,193,76,2,47.0,0.5,0.5,0.0,0.0,0.3,48.3,0.0
131 | 1,2020-07-01 00:28:51,2020-07-01 00:38:34,1,3.5,1,N,79,141,1,11.5,3.0,0.5,3.05,0.0,0.3,18.35,2.5
132 | 1,2020-07-01 00:08:17,2020-07-01 00:23:00,2,10.6,1,N,138,191,1,29.5,0.5,0.5,7.7,0.0,0.3,38.5,0.0
133 | 2,2020-07-01 00:37:29,2020-07-01 00:51:43,1,8.75,1,N,74,259,2,25.5,0.5,0.5,0.0,0.0,0.3,26.8,0.0
134 | 1,2020-07-01 00:16:06,2020-07-01 00:20:01,2,1.0,1,N,48,48,1,5.0,3.0,0.5,0.0,0.0,0.3,8.8,2.5
135 | 2,2020-07-01 00:04:30,2020-07-01 05:10:03,1,21.22,1,N,224,224,2,173.5,0.5,0.5,0.0,0.0,0.3,177.3,2.5
136 | 1,2020-07-01 00:51:08,2020-07-01 01:18:28,1,0.0,1,N,247,37,1,28.2,0.0,0.5,0.0,0.0,0.3,29.0,0.0
137 | 2,2020-07-01 00:01:34,2020-07-01 00:20:19,1,4.82,1,N,113,189,1,18.0,0.5,0.5,4.36,0.0,0.3,26.16,2.5
138 | 2,2020-07-01 00:24:53,2020-07-01 00:53:42,1,9.3,5,N,229,265,2,40.0,0.0,0.5,0.0,11.75,0.3,55.05,2.5
139 | 1,2020-07-01 00:53:12,2020-07-01 01:07:22,1,4.1,1,N,68,65,1,15.0,3.0,0.5,3.0,0.0,0.3,21.8,2.5
140 | 2,2020-07-01 00:52:34,2020-07-01 01:00:22,3,1.6,1,N,239,263,1,8.0,0.5,0.5,2.95,0.0,0.3,14.75,2.5
141 | 2,2020-07-01 00:38:25,2020-07-01 00:50:55,1,3.93,1,N,68,140,1,13.5,0.5,0.5,3.46,0.0,0.3,20.76,2.5
142 | 2,2020-07-01 00:59:10,2020-07-01 01:02:20,1,0.86,1,N,229,141,1,5.0,0.5,0.5,2.2,0.0,0.3,11.0,2.5
143 | 2,2020-07-01 00:17:00,2020-07-01 00:30:59,1,4.76,1,N,68,75,2,15.5,0.5,0.5,0.0,0.0,0.3,19.3,2.5
144 | 2,2020-07-01 00:32:59,2020-07-01 00:36:36,1,1.03,1,N,75,262,2,5.5,0.5,0.5,0.0,0.0,0.3,9.3,2.5
145 | 2,2020-07-01 00:34:29,2020-07-01 00:38:21,1,0.6,1,N,151,43,2,5.0,0.5,0.5,0.0,0.0,0.3,6.3,0.0
146 | 2,2020-07-01 00:05:01,2020-07-01 00:07:22,6,1.09,1,N,249,186,1,5.0,0.5,0.5,2.64,0.0,0.3,11.44,2.5
147 | 2,2020-07-01 01:00:13,2020-07-01 01:16:45,6,9.22,1,N,148,7,1,27.0,0.5,0.5,9.24,0.0,0.3,40.04,2.5
148 | 2,2020-07-01 00:32:28,2020-07-01 00:48:25,1,10.65,1,N,183,64,1,30.0,0.5,0.5,2.75,6.12,0.3,40.17,0.0
149 | 2,2020-07-01 00:50:08,2020-07-01 00:50:18,1,0.0,2,N,74,74,1,52.0,0.0,0.5,0.0,6.12,0.3,58.92,0.0
150 | 2,2020-07-01 00:57:42,2020-07-01 01:01:13,2,0.82,1,N,141,263,1,5.0,0.5,0.5,0.0,0.0,0.3,8.8,2.5
151 | 2,2020-07-01 00:07:52,2020-07-01 00:11:51,1,0.99,1,N,263,141,1,5.5,0.5,0.5,2.33,0.0,0.3,11.63,2.5
152 | 2,2020-07-01 00:38:02,2020-07-01 00:56:43,1,5.81,1,N,48,226,1,19.0,0.5,0.5,5.0,0.0,0.3,27.8,2.5
153 | 2,2020-07-01 00:12:06,2020-07-01 00:37:51,1,16.43,1,N,132,188,1,44.0,0.5,0.5,5.0,0.0,0.3,50.3,0.0
154 | 2,2020-07-01 00:19:56,2020-07-01 00:37:09,1,8.17,1,N,75,88,1,24.5,0.5,0.5,5.66,0.0,0.3,33.96,2.5
155 | 2,2020-07-01 00:13:41,2020-07-01 00:22:54,1,6.15,1,N,132,135,2,18.0,0.5,0.5,0.0,0.0,0.3,19.3,0.0
156 | 2,2020-07-01 00:23:55,2020-07-01 00:27:12,2,0.95,1,N,263,74,1,5.0,0.5,0.5,1.76,0.0,0.3,10.56,2.5
157 | 2,2020-07-01 00:32:56,2020-07-01 00:36:43,2,1.19,1,N,263,229,1,5.5,0.5,0.5,1.86,0.0,0.3,11.16,2.5
158 | 2,2020-07-01 00:51:36,2020-07-01 01:08:08,2,4.95,1,N,137,112,2,17.5,0.5,0.5,0.0,0.0,0.3,21.3,2.5
159 | 1,2020-07-01 00:19:28,2020-07-01 00:57:28,1,9.3,1,N,68,120,2,34.5,3.0,0.5,0.0,0.0,0.3,38.3,2.5
160 | 2,2020-07-01 00:23:07,2020-07-01 00:26:26,2,0.62,1,N,48,164,1,4.5,0.5,0.5,0.0,0.0,0.3,8.3,2.5
161 | 2,2020-07-01 00:33:28,2020-07-01 00:34:44,3,0.62,1,N,48,68,1,4.0,0.5,0.5,1.56,0.0,0.3,9.36,2.5
162 | 2,2020-07-01 00:09:00,2020-07-01 00:09:22,2,0.05,1,N,113,113,2,2.5,0.5,0.5,0.0,0.0,0.3,6.3,2.5
163 | 1,2020-07-01 00:15:23,2020-07-01 00:20:04,1,0.8,1,N,79,4,1,5.5,3.0,0.5,1.85,0.0,0.3,11.15,2.5
164 | 1,2020-07-01 00:19:27,2020-07-01 00:44:15,1,7.4,1,N,113,129,1,24.5,3.0,0.5,4.0,0.0,0.3,32.3,2.5
165 | 2,2020-07-01 00:07:06,2020-07-01 00:16:37,1,3.0,1,N,143,152,1,10.5,0.5,0.5,2.86,0.0,0.3,17.16,2.5
166 | 2,2020-07-01 00:03:33,2020-07-01 00:10:21,1,2.51,1,N,263,170,1,9.0,0.5,0.5,1.92,0.0,0.3,14.72,2.5
167 | 1,2020-07-01 00:04:39,2020-07-01 00:13:28,1,0.0,1,N,236,170,1,9.2,3.0,0.5,2.6,0.0,0.3,15.6,2.5
168 | 1,2020-07-01 00:17:43,2020-07-01 00:32:35,1,8.6,1,N,233,212,1,25.0,3.0,0.5,5.75,0.0,0.3,34.55,2.5
169 | 1,2020-07-01 00:51:58,2020-07-01 00:55:37,1,1.5,1,N,229,137,1,6.0,3.0,0.5,1.96,0.0,0.3,11.76,2.5
170 | 2,2020-07-01 00:19:11,2020-07-01 00:24:01,3,1.44,1,N,186,50,1,6.0,0.5,0.5,2.94,0.0,0.3,12.74,2.5
171 | 2,2020-07-01 00:50:23,2020-07-01 00:56:23,4,1.74,1,N,68,234,1,7.5,0.5,0.5,2.26,0.0,0.3,13.56,2.5
172 | 2,2020-07-01 00:15:19,2020-07-01 00:22:27,1,2.25,1,N,114,65,2,9.5,0.5,0.5,0.0,0.0,0.3,13.3,2.5
173 | 2,2020-07-01 00:12:10,2020-07-01 01:05:19,1,29.88,1,N,259,86,1,80.0,0.5,0.5,2.75,6.12,0.3,90.17,0.0
174 | 1,2020-07-01 00:40:36,2020-07-01 00:50:32,1,2.1,1,N,237,263,1,9.5,3.0,0.5,2.65,0.0,0.3,15.95,2.5
175 | 2,2020-07-01 00:31:32,2020-07-01 00:36:52,1,1.63,1,N,263,74,2,7.0,0.5,0.5,0.0,0.0,0.3,10.8,2.5
176 | 2,2020-07-01 00:54:23,2020-07-01 00:58:57,2,1.81,1,N,263,74,1,7.0,0.5,0.5,2.0,0.0,0.3,12.8,2.5
177 | 1,2020-07-01 00:09:18,2020-07-01 00:36:31,1,0.0,1,N,137,76,1,40.2,0.0,0.5,0.0,6.12,0.3,47.12,0.0
178 | 2,2020-07-01 00:10:02,2020-07-01 00:19:46,1,2.13,1,N,68,125,2,9.0,0.5,0.5,0.0,0.0,0.3,12.8,2.5
179 | 2,2020-07-01 00:14:31,2020-07-01 00:30:18,1,2.2,1,N,116,41,2,12.0,0.5,0.5,0.0,0.0,0.3,13.3,0.0
180 | 2,2020-07-01 00:02:54,2020-07-01 00:09:25,1,1.88,1,N,48,229,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5
181 | 2,2020-07-01 00:48:38,2020-07-01 00:57:47,1,3.44,1,N,143,166,2,11.0,0.5,0.5,0.0,0.0,0.3,14.8,2.5
182 | 1,2020-07-01 00:54:28,2020-07-01 01:21:16,1,0.0,1,N,100,259,1,39.2,0.0,0.5,0.0,0.0,0.3,40.0,0.0
183 | 2,2020-07-01 00:02:21,2020-07-01 00:08:56,4,1.96,1,N,100,114,1,8.0,0.5,0.5,2.36,0.0,0.3,14.16,2.5
184 | 2,2020-07-01 00:09:00,2020-07-01 00:22:11,1,4.36,1,N,137,146,2,15.0,0.5,0.5,0.0,0.0,0.3,18.8,2.5
185 | 2,2020-07-01 00:53:38,2020-07-01 00:58:08,1,1.54,1,N,142,186,2,6.5,0.5,0.5,0.0,0.0,0.3,10.3,2.5
186 | 2,2020-07-01 00:52:58,2020-07-01 01:01:28,1,2.85,1,N,237,79,1,10.0,0.5,0.5,2.76,0.0,0.3,16.56,2.5
187 | 1,2020-07-01 00:06:43,2020-07-01 00:32:05,1,7.0,1,N,238,226,2,23.0,3.0,0.5,0.0,0.0,0.3,26.8,2.5
188 | 1,2020-07-01 00:05:01,2020-07-01 00:24:49,2,9.2,1,N,138,17,2,27.5,0.5,0.5,0.0,0.0,0.3,28.8,0.0
189 | 2,2020-07-01 00:02:07,2020-07-01 00:08:02,1,1.29,1,N,142,48,1,6.5,0.5,0.5,3.7,0.0,0.3,14.0,2.5
190 | 2,2020-07-01 00:11:00,2020-07-01 00:21:26,1,3.23,1,N,230,263,1,11.5,0.5,0.5,3.83,0.0,0.3,19.13,2.5
191 | 2,2020-07-01 00:26:40,2020-07-01 00:45:39,1,11.67,1,N,141,28,2,32.0,0.5,0.5,0.0,0.0,0.3,35.8,2.5
192 | 2,2020-07-01 00:59:08,2020-07-01 01:03:01,1,1.17,1,N,258,102,2,5.5,0.5,0.5,0.0,0.0,0.3,6.8,0.0
193 | 2,2020-07-01 00:00:52,2020-07-01 00:08:38,1,1.85,1,N,68,158,1,8.0,0.5,0.5,2.36,0.0,0.3,14.16,2.5
194 | 2,2020-07-01 00:42:59,2020-07-01 01:02:30,1,7.46,1,N,249,41,1,24.0,0.5,0.5,2.0,0.0,0.3,29.8,2.5
195 | 1,2020-07-01 00:42:46,2020-07-01 00:58:19,1,4.8,1,N,186,193,2,16.5,3.0,0.5,0.0,0.0,0.3,20.3,2.5
196 | 2,2020-07-01 00:00:54,2020-07-01 00:11:46,1,2.02,1,N,170,90,2,9.5,0.5,0.5,0.0,0.0,0.3,13.3,2.5
197 | 1,2020-07-01 00:01:32,2020-07-01 00:21:31,1,3.2,1,N,236,186,1,15.0,3.5,0.5,0.0,0.0,0.3,19.3,2.5
198 | 1,2020-07-01 00:55:22,2020-07-01 01:02:53,1,1.7,1,N,107,229,2,8.0,3.5,0.5,0.0,0.0,0.3,12.3,2.5
199 | 2,2020-07-01 00:51:31,2020-07-01 00:51:39,1,0.03,1,N,42,264,2,2.5,0.5,0.5,0.0,0.0,0.3,3.8,0.0
200 | 1,2020-07-01 00:27:40,2020-07-01 00:41:35,1,9.1,1,N,132,39,1,26.0,0.5,0.5,0.02,0.0,0.3,27.32,0.0
201 | 1,2020-07-01 00:23:18,2020-07-01 00:56:51,1,22.8,1,N,138,123,1,60.5,0.5,0.5,0.05,0.0,0.3,61.85,0.0
202 | 2,2020-07-01 00:10:41,2020-07-01 00:11:38,2,0.91,5,N,70,70,1,35.0,0.0,0.5,0.0,0.0,0.3,35.8,0.0
203 | 2,2020-07-01 00:02:23,2020-07-01 00:10:22,1,1.35,1,N,107,186,2,7.5,0.5,0.5,0.0,0.0,0.3,11.3,2.5
204 | 1,2020-07-01 00:27:23,2020-07-01 00:46:42,1,0.0,1,N,231,14,1,23.2,3.0,0.5,0.0,6.12,0.3,33.12,2.5
205 | 1,2020-07-01 00:16:54,2020-07-01 00:44:13,1,7.5,1,N,264,264,1,25.0,0.5,0.5,5.25,0.0,0.3,31.55,0.0
206 | 2,2020-07-01 00:14:45,2020-07-01 00:23:33,1,2.51,1,N,263,142,1,9.5,0.5,0.5,2.66,0.0,0.3,15.96,2.5
207 | 2,2020-07-01 00:47:25,2020-07-01 01:13:52,1,10.84,1,N,231,244,1,32.0,0.5,0.5,7.16,0.0,0.3,42.96,2.5
208 | 2,2020-07-01 00:10:53,2020-07-01 00:25:28,1,5.08,1,N,90,75,2,16.0,0.5,0.5,0.0,0.0,0.3,19.8,2.5
209 | 2,2020-07-01 00:00:37,2020-07-01 00:16:20,1,6.34,1,N,70,229,1,19.5,0.5,0.5,3.5,0.0,0.3,26.8,2.5
210 | 1,2020-07-01 00:20:01,2020-07-01 00:25:18,0,1.7,1,N,48,68,3,7.0,3.0,0.5,0.0,0.0,0.3,10.8,2.5
211 | 1,2020-07-01 00:14:42,2020-07-01 00:36:34,1,8.6,1,N,162,196,2,26.0,3.0,0.5,0.0,0.0,0.3,29.8,2.5
212 | 2,2020-07-01 00:20:57,2020-07-01 00:26:10,3,1.33,1,N,137,100,2,6.5,0.5,0.5,0.0,0.0,0.3,10.3,2.5
213 | 1,2020-07-01 00:59:56,2020-07-01 01:01:52,1,0.6,1,N,234,113,1,4.0,3.0,0.5,1.95,0.0,0.3,9.75,2.5
214 | 2,2020-07-01 00:07:20,2020-07-01 00:12:24,1,1.81,1,N,90,48,1,7.5,0.5,0.5,0.0,0.0,0.3,11.3,2.5
215 | 2,2020-07-01 00:15:58,2020-07-01 00:21:47,1,2.34,1,N,48,90,1,8.0,0.5,0.5,2.36,0.0,0.3,14.16,2.5
216 | 2,2020-07-01 00:19:37,2020-07-01 00:22:34,1,1.32,1,N,170,79,1,5.5,0.5,0.5,1.86,0.0,0.3,11.16,2.5
217 | 2,2020-07-01 00:33:21,2020-07-01 00:38:35,1,0.64,1,N,100,186,2,5.5,0.5,0.5,0.0,0.0,0.3,9.3,2.5
218 | 2,2020-07-01 00:44:42,2020-07-01 00:51:50,1,1.64,1,N,231,261,1,7.5,0.5,0.5,2.26,0.0,0.3,13.56,2.5
219 | 2,2020-07-01 00:27:04,2020-07-01 00:31:28,1,0.67,1,N,113,79,1,5.0,0.5,0.5,1.76,0.0,0.3,10.56,2.5
220 | 2,2020-07-01 00:47:27,2020-07-01 01:05:31,1,6.53,1,N,137,198,2,21.0,0.5,0.5,0.0,0.0,0.3,24.8,2.5
221 | 2,2020-07-01 00:02:03,2020-07-01 00:10:13,1,2.17,1,N,48,233,2,9.0,0.5,0.5,0.0,0.0,0.3,12.8,2.5
222 | 2,2020-07-01 00:16:38,2020-07-02 00:10:13,1,4.54,1,N,164,256,1,16.0,0.5,0.5,3.96,0.0,0.3,23.76,2.5
223 | 1,2020-07-01 00:29:08,2020-07-01 00:49:35,1,9.5,1,N,138,116,2,27.5,1.0,0.5,0.0,6.12,0.3,35.42,0.0
224 | 2,2020-07-01 00:17:53,2020-07-01 00:22:15,1,1.15,1,N,137,137,1,5.5,0.5,0.5,2.33,0.0,0.3,11.63,2.5
225 | 2,2020-07-01 00:27:27,2020-07-01 00:41:14,1,6.93,1,N,137,159,1,21.0,0.5,0.5,2.0,0.0,0.3,26.8,2.5
226 | 1,2020-07-01 00:11:27,2020-07-01 00:29:59,2,8.8,1,N,138,198,2,25.5,0.5,0.5,0.0,0.0,0.3,26.8,0.0
227 | 1,2020-07-01 00:06:50,2020-07-01 00:23:37,1,8.4,1,N,138,80,1,25.0,0.5,0.5,5.25,0.0,0.3,31.55,0.0
228 | 2,2020-07-01 00:31:49,2020-07-01 00:46:41,2,5.42,1,N,161,42,1,17.0,0.5,0.5,5.2,0.0,0.3,26.0,2.5
229 | 1,2020-07-01 00:13:14,2020-07-01 00:29:10,1,4.1,1,N,181,177,2,15.0,0.5,0.5,0.0,0.0,0.3,16.3,0.0
230 | 1,2020-07-01 00:57:45,2020-07-01 01:00:31,1,0.4,1,N,49,49,2,4.0,0.5,0.5,0.0,0.0,0.3,5.3,0.0
231 | 2,2020-07-01 00:09:29,2020-07-01 00:20:40,1,3.11,1,N,141,7,2,11.0,0.5,0.5,0.0,0.0,0.3,14.8,2.5
232 | 2,2020-07-01 00:23:00,2020-07-01 00:41:10,1,4.42,1,N,7,140,1,16.5,0.5,0.5,6.09,0.0,0.3,26.39,2.5
233 | 2,2020-07-01 00:19:14,2020-07-01 00:51:53,1,27.71,4,N,132,265,2,91.0,0.5,0.5,0.0,6.12,0.3,98.42,0.0
234 | 1,2020-07-01 00:01:46,2020-07-01 00:24:45,3,12.9,1,N,138,61,2,36.5,0.5,0.5,0.0,0.0,0.3,37.8,0.0
235 | 2,2020-06-30 18:11:01,2020-06-30 18:15:03,1,0.55,1,N,74,75,2,4.5,0.5,0.5,0.0,0.0,0.3,5.8,0.0
236 | 2,2020-06-30 18:18:33,2020-06-30 18:23:32,1,1.67,1,N,236,237,2,7.0,0.5,0.5,0.0,0.0,0.3,10.8,2.5
237 | 2,2020-06-30 18:32:20,2020-07-01 18:07:16,1,7.45,1,N,161,159,1,28.5,0.5,0.5,3.0,0.0,0.3,35.3,2.5
238 | 2,2020-07-01 00:18:13,2020-07-01 00:23:50,2,0.76,1,N,137,234,2,6.0,0.5,0.5,0.0,0.0,0.3,9.8,2.5
239 | 1,2020-07-01 00:14:00,2020-07-01 00:38:57,1,14.2,1,N,237,22,2,40.0,3.0,0.5,0.0,0.0,0.3,43.8,2.5
240 | 2,2020-07-01 00:21:46,2020-07-01 00:30:04,1,0.73,1,N,75,236,1,7.0,0.5,0.5,2.7,0.0,0.3,13.5,2.5
241 | 2,2020-07-01 00:28:49,2020-07-01 00:41:14,1,2.7,1,N,42,238,1,11.5,0.5,0.5,1.0,0.0,0.3,13.8,0.0
242 | 2,2020-07-01 00:52:31,2020-07-01 00:58:39,1,1.98,1,N,142,246,2,7.5,0.5,0.5,0.0,0.0,0.3,11.3,2.5
243 | 1,2020-07-01 00:14:43,2020-07-01 00:19:19,1,1.0,1,N,161,100,2,5.5,3.0,0.5,0.0,0.0,0.3,9.3,2.5
244 | 2,2020-07-01 00:05:15,2020-07-01 00:11:26,3,2.08,1,N,262,74,1,8.0,0.5,0.5,2.95,0.0,0.3,14.75,2.5
245 | 1,2020-07-01 01:38:25,2020-07-01 01:38:46,1,0.0,1,N,10,10,1,39.2,0.0,0.5,0.0,0.0,0.3,40.0,0.0
246 | 1,2020-07-01 01:20:14,2020-07-01 01:52:58,1,0.0,1,N,168,35,1,38.2,0.0,0.5,0.0,6.12,0.3,45.12,0.0
247 | 2,2020-07-01 01:31:41,2020-07-01 01:42:22,1,2.1,1,N,50,141,1,9.5,0.5,0.5,3.33,0.0,0.3,16.63,2.5
248 | 2,2020-07-01 01:48:22,2020-07-01 02:07:06,1,4.37,1,N,262,166,1,17.0,0.5,0.5,4.16,0.0,0.3,24.96,2.5
249 | 2,2020-07-01 01:27:57,2020-07-01 01:38:03,1,2.64,1,N,246,229,2,10.5,0.5,0.5,0.0,0.0,0.3,14.3,2.5
250 | 2,2020-07-01 01:48:33,2020-07-01 01:52:03,1,1.22,1,N,48,239,2,5.5,0.5,0.5,0.0,0.0,0.3,9.3,2.5
251 | 2,2020-07-01 01:57:36,2020-07-01 02:32:21,1,21.5,2,N,142,132,2,52.0,0.0,0.5,0.0,6.12,0.3,61.42,2.5
252 | 2,2020-07-01 01:36:00,2020-07-01 01:39:28,1,0.85,1,N,74,42,1,5.0,0.5,0.5,0.0,0.0,0.3,6.3,0.0
253 | 1,2020-07-01 01:10:43,2020-07-01 01:16:41,1,1.6,1,N,146,7,2,7.0,0.5,0.5,0.0,0.0,0.3,8.3,0.0
254 | 1,2020-07-01 01:47:34,2020-07-01 01:50:10,1,0.7,1,N,137,107,1,4.5,3.0,0.5,1.65,0.0,0.3,9.95,2.5
255 | 2,2020-07-01 01:55:37,2020-07-01 02:04:49,1,3.89,1,N,116,143,2,13.0,0.5,0.5,0.0,0.0,0.3,16.8,2.5
256 | 2,2020-07-01 01:06:11,2020-07-01 01:16:47,1,6.0,1,N,230,244,2,18.0,0.5,0.5,0.0,0.0,0.3,21.8,2.5
257 | 2,2020-07-01 01:38:55,2020-07-01 01:44:05,1,1.12,1,N,166,24,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,0.0
258 | 1,2020-07-01 01:28:19,2020-07-01 01:59:40,1,0.0,1,N,168,258,1,36.2,0.0,0.5,0.0,6.12,0.3,43.12,0.0
259 | 1,2020-07-01 01:23:23,2020-07-01 01:28:24,1,0.9,1,N,107,170,2,5.5,3.0,0.5,0.0,0.0,0.3,9.3,2.5
260 | 2,2020-07-01 01:31:24,2020-07-01 01:44:24,1,3.75,1,N,141,41,1,13.0,0.5,0.5,2.0,0.0,0.3,18.8,2.5
261 | 2,2020-07-01 01:38:26,2020-07-01 01:49:37,1,3.28,1,N,48,211,2,12.0,0.5,0.5,0.0,0.0,0.3,15.8,2.5
262 | 1,2020-07-01 01:35:50,2020-07-01 01:40:24,1,1.4,1,N,140,236,1,6.5,3.0,0.5,2.05,0.0,0.3,12.35,2.5
263 | 1,2020-07-01 01:48:38,2020-07-01 01:51:55,1,1.3,1,N,140,75,2,5.5,3.0,0.5,0.0,0.0,0.3,9.3,2.5
264 | 1,2020-07-01 01:55:54,2020-07-01 02:05:23,1,3.2,1,N,236,230,2,11.5,3.0,0.5,0.0,0.0,0.3,15.3,2.5
265 | 1,2020-07-01 01:48:27,2020-07-01 01:52:21,1,0.3,1,N,158,249,1,4.5,3.0,0.5,1.65,0.0,0.3,9.95,2.5
266 | 1,2020-07-01 01:49:27,2020-07-01 01:55:52,1,0.8,1,N,140,237,2,6.5,3.0,0.5,0.0,0.0,0.3,10.3,2.5
267 | 2,2020-07-01 01:49:40,2020-07-01 01:56:37,3,2.64,1,N,263,161,1,9.0,0.5,0.5,0.0,0.0,0.3,12.8,2.5
268 |
--------------------------------------------------------------------------------
/week6/avro_example/producer.py:
--------------------------------------------------------------------------------
1 | from confluent_kafka import avro
2 | from confluent_kafka.avro import AvroProducer
3 | import csv
4 | from time import sleep
5 |
6 |
7 | def load_avro_schema_from_file():
8 | key_schema = avro.load("taxi_ride_key.avsc")
9 | value_schema = avro.load("taxi_ride_value.avsc")
10 |
11 | return key_schema, value_schema
12 |
13 |
14 | def send_record():
15 | key_schema, value_schema = load_avro_schema_from_file()
16 |
17 | producer_config = {
18 | "bootstrap.servers": "localhost:9092",
19 | "schema.registry.url": "http://localhost:8081",
20 | "acks": "1"
21 | }
22 |
23 | producer = AvroProducer(producer_config, default_key_schema=key_schema, default_value_schema=value_schema)
24 |
25 | file = open('./data/rides_new.csv')
26 |
27 | csvreader = csv.reader(file)
28 | header = next(csvreader)
29 | for row in csvreader:
30 | key = {"vendorId": int(row[0])}
31 | value = {"vendorId": int(row[0]), "passenger_count": int(row[3]), "trip_distance": float(row[4]), "payment_type": int(row[9]), "total_amount": float(row[16])}
32 |
33 | try:
34 | producer.produce(topic='datatalkclub.yellow_taxi_rides', key=key, value=value)
35 | except Exception as e:
36 | print(f"Exception while producing record value - {value}: {e}")
37 | else:
38 | print(f"Successfully producing record value - {value}")
39 |
40 | producer.flush()
41 | sleep(1)
42 |
43 | if __name__ == "__main__":
44 | send_record()
--------------------------------------------------------------------------------
/week6/avro_example/taxi_ride_key.avsc:
--------------------------------------------------------------------------------
1 | {
2 | "namespace": "com.datatalksclub.taxi",
3 | "type": "record",
4 | "name": "TaxiRideKey",
5 | "fields": [
6 | {
7 | "name": "vendorId",
8 | "type": "int"
9 | }
10 | ]
11 | }
--------------------------------------------------------------------------------
/week6/avro_example/taxi_ride_value.avsc:
--------------------------------------------------------------------------------
1 | {
2 | "namespace": "com.datatalksclub.taxi",
3 | "type": "record",
4 | "name": "TaxiRide",
5 | "fields": [
6 | {
7 | "name": "vendorId",
8 | "type": "int"
9 | },
10 | {
11 | "name": "passenger_count",
12 | "type": "int"
13 | },
14 | {
15 | "name": "trip_distance",
16 | "type": "float"
17 | },
18 | {
19 | "name": "payment_type",
20 | "type": "int"
21 | },
22 | {
23 | "name": "total_amount",
24 | "type": "float"
25 | }
26 | ]
27 | }
--------------------------------------------------------------------------------
/week6/consumer.py:
--------------------------------------------------------------------------------
1 | from kafka import KafkaConsumer
2 | from json import loads
3 | from time import sleep
4 |
5 | consumer = KafkaConsumer(
6 | 'demo_1',
7 | bootstrap_servers=['localhost:9092'],
8 | auto_offset_reset='earliest',
9 | enable_auto_commit=True,
10 | group_id='consumer.group.id.demo.1',
11 | value_deserializer=lambda x: loads(x.decode('utf-8')))
12 |
13 |
14 | while True:
15 | print("inside while")
16 | for message in consumer:
17 | message = message.value
18 | print(message)
19 | sleep(1)
--------------------------------------------------------------------------------
/week6/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 |
3 | services:
4 | zookeeper:
5 | image: confluentinc/cp-zookeeper:5.4.0
6 | hostname: zookeeper
7 | container_name: zookeeper
8 | ports:
9 | - "2181:2181"
10 | environment:
11 | ZOOKEEPER_CLIENT_PORT: 2181
12 | ZOOKEEPER_TICK_TIME: 2000
13 |
14 | broker:
15 | image: confluentinc/cp-server:5.4.0
16 | hostname: broker
17 | container_name: broker
18 | depends_on:
19 | - zookeeper
20 | ports:
21 | - "9092:9092"
22 | environment:
23 | KAFKA_BROKER_ID: 1
24 | KAFKA_ZOOKEEPER_CONNECT: "zookeeper:2181"
25 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
26 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092
27 | KAFKA_METRIC_REPORTERS: io.confluent.metrics.reporter.ConfluentMetricsReporter
28 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
29 | KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
30 | KAFKA_CONFLUENT_LICENSE_TOPIC_REPLICATION_FACTOR: 1
31 | CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: broker:29092
32 | CONFLUENT_METRICS_REPORTER_ZOOKEEPER_CONNECT: zookeeper:2181
33 | CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1
34 | CONFLUENT_METRICS_ENABLE: "true"
35 | CONFLUENT_SUPPORT_CUSTOMER_ID: "anonymous"
36 |
37 | kafka-tools:
38 | image: confluentinc/cp-kafka:5.4.0
39 | hostname: kafka-tools
40 | container_name: kafka-tools
41 | command: ["tail", "-f", "/dev/null"]
42 | network_mode: "host"
43 |
44 | schema-registry:
45 | image: confluentinc/cp-schema-registry:5.4.0
46 | hostname: schema-registry
47 | container_name: schema-registry
48 | depends_on:
49 | - zookeeper
50 | - broker
51 | ports:
52 | - "8081:8081"
53 | environment:
54 | SCHEMA_REGISTRY_HOST_NAME: schema-registry
55 | SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: "zookeeper:2181"
56 |
57 | control-center:
58 | image: confluentinc/cp-enterprise-control-center:5.4.0
59 | hostname: control-center
60 | container_name: control-center
61 | depends_on:
62 | - zookeeper
63 | - broker
64 | - schema-registry
65 | ports:
66 | - "9021:9021"
67 | environment:
68 | CONTROL_CENTER_BOOTSTRAP_SERVERS: 'broker:29092'
69 | CONTROL_CENTER_ZOOKEEPER_CONNECT: 'zookeeper:2181'
70 | CONTROL_CENTER_SCHEMA_REGISTRY_URL: "http://schema-registry:8081"
71 | CONTROL_CENTER_REPLICATION_FACTOR: 1
72 | CONTROL_CENTER_INTERNAL_TOPICS_PARTITIONS: 1
73 | CONTROL_CENTER_MONITORING_INTERCEPTOR_TOPIC_PARTITIONS: 1
74 | CONFLUENT_METRICS_TOPIC_REPLICATION: 1
75 | PORT: 9021
--------------------------------------------------------------------------------
/week6/producer.py:
--------------------------------------------------------------------------------
1 | from time import sleep
2 | from json import dumps
3 | from kafka import KafkaProducer
4 |
5 |
6 | producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
7 | value_serializer=lambda x:
8 | dumps(x).encode('utf-8'))
9 |
10 | for e in range(1000):
11 | data = {'number' : e}
12 | producer.send('demo_1', value=data)
13 | print("producing")
14 | sleep(1)
--------------------------------------------------------------------------------
/week6/requirements.txt:
--------------------------------------------------------------------------------
1 | kafka-python==1.4.6
2 | confluent_kafka
3 | requests
4 | avro
5 | faust
6 | fastavro
--------------------------------------------------------------------------------
/week6/streams/__pycache__/taxi_rides.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pcrespoo/data-engineering-bootcamp/a1835f9c1557d4974b73837a0622ac5d24db9dfa/week6/streams/__pycache__/taxi_rides.cpython-37.pyc
--------------------------------------------------------------------------------
/week6/streams/branch_price.py:
--------------------------------------------------------------------------------
1 | import faust
2 | from taxi_rides import TaxiRide
3 | from faust import current_event
4 |
5 | app = faust.App('datatalksclub.stream.v3', broker='kafka://localhost:9092', consumer_auto_offset_reset="earliest")
6 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide)
7 |
8 | high_amount_rides = app.topic('datatalks.yellow_taxi_rides.high_amount')
9 | low_amount_rides = app.topic('datatalks.yellow_taxi_rides.low_amount')
10 |
11 |
12 | @app.agent(topic)
13 | async def process(stream):
14 | async for event in stream:
15 | if event.total_amount >= 40.0:
16 | await current_event().forward(high_amount_rides)
17 | else:
18 | await current_event().forward(low_amount_rides)
19 |
20 | if __name__ == '__main__':
21 | app.main()
--------------------------------------------------------------------------------
/week6/streams/producer_taxi_json.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from json import dumps
3 | from kafka import KafkaProducer
4 | from time import sleep
5 |
6 |
7 | producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
8 | key_serializer=lambda x: dumps(x).encode('utf-8'),
9 | value_serializer=lambda x: dumps(x).encode('utf-8'))
10 |
11 | file = open('../avro_example/data/rides_new.csv')
12 |
13 | csvreader = csv.reader(file)
14 | header = next(csvreader)
15 | for row in csvreader:
16 | key = {"vendorId": int(row[0])}
17 | value = {"vendorId": int(row[0]), "passenger_count": int(row[3]), "trip_distance": float(row[4]), "payment_type": int(row[9]), "total_amount": float(row[16])}
18 | producer.send('datatalkclub.yellow_taxi_ride.json', value=value, key=key)
19 | print("producing")
20 | sleep(1)
--------------------------------------------------------------------------------
/week6/streams/stream.py:
--------------------------------------------------------------------------------
1 | import faust
2 | from taxi_rides import TaxiRide
3 |
4 |
5 | app = faust.App('datatalksclub.stream.v2', broker='kafka://localhost:9092')
6 | topic = app.topic('datatalkclub.yellow_taxi_ride.json', value_type=TaxiRide)
7 |
8 |
9 | @app.agent(topic)
10 | async def start_reading(records):
11 | async for record in records:
12 | print(record)
13 |
14 |
15 | if __name__ == '__main__':
16 | app.main()
--------------------------------------------------------------------------------
/week6/streams/taxi_rides.py:
--------------------------------------------------------------------------------
1 | import faust
2 |
3 |
4 | class TaxiRide(faust.Record, validation=True):
5 | vendorId: str
6 | passenger_count: int
7 | trip_distance: float
8 | payment_type: int
9 | total_amount: float
--------------------------------------------------------------------------------