├── .env ├── .gitignore ├── Makefile ├── README.md ├── airflow-docker-compose.yaml ├── airflow ├── .gitignore ├── Dockerfile ├── README.md ├── dags │ └── elt_pipeline_dag.py └── requirements.txt ├── batch_processing └── datalake_to_dw.py ├── config ├── datalake.yaml └── spark.yaml ├── data_validation ├── full_flow.ipynb ├── gx │ ├── .gitignore │ ├── checkpoints │ │ └── staging_tripdata_asset_checkpoint.yml │ ├── expectations │ │ ├── .ge_store_backend_id │ │ └── validate_trip_data.json │ ├── great_expectations.yml │ └── plugins │ │ └── custom_data_docs │ │ └── styles │ │ └── data_docs_custom_styles.css └── reload_and_validate.ipynb ├── dbt_nyc ├── .gitignore ├── .user.yml ├── README.md ├── analyses │ └── .gitkeep ├── dbt_project.yml ├── macros │ ├── .gitkeep │ ├── generate_schema_name.sql │ ├── get_payment_description.sql │ ├── get_rate_code_description.sql │ ├── get_service_name.sql │ └── get_vendor_description.sql ├── models │ └── production │ │ ├── dim_dropoff_location.sql │ │ ├── dim_payment.sql │ │ ├── dim_pickup_location.sql │ │ ├── dim_rate_code.sql │ │ ├── dim_service_type.sql │ │ ├── dim_vendor.sql │ │ ├── fact_trip.sql │ │ └── schema.yml ├── packages.yml ├── profiles.yml ├── seeds │ └── .gitkeep ├── snapshots │ └── .gitkeep └── tests │ └── .gitkeep ├── debezium ├── configs │ └── taxi-nyc-cdc.json └── run.sh ├── docker-compose.yaml ├── imgs ├── airflow_1.png ├── airflow_2.png ├── airflow_pipeline.png ├── batch_1.png ├── batch_2.png ├── batch_3.png ├── batch_5.png ├── batch_6.png ├── batch_8.png ├── big-data-diagram.png ├── big-data-diagram.svg ├── dbt.png ├── logo │ ├── airflow_logo.png │ └── dbt_logo.png ├── star_schema_updated.png ├── stream_1.png ├── stream_2.png ├── stream_3.png └── trino.png ├── requirements.txt ├── scripts ├── convert_to_delta.py ├── data │ └── taxi_lookup.csv ├── extract_load.py └── transform_data.py ├── stream-docker-compose.yaml ├── stream_processing ├── read_parquet_streaming.py ├── schema_config.json └── streaming_to_datalake.py ├── trino ├── catalog │ └── datalake.properties └── etc │ ├── config.properties │ ├── jvm.config │ └── node.properties └── utils ├── create_schema.py ├── create_table.py ├── helpers.py ├── minio_utils.py ├── postgresql_client.py ├── streaming_data_db.py ├── streaming_data_json.py └── trino_db_scripts_generate.py /.env: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/.env -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/.gitignore -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/README.md -------------------------------------------------------------------------------- /airflow-docker-compose.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/airflow-docker-compose.yaml -------------------------------------------------------------------------------- /airflow/.gitignore: -------------------------------------------------------------------------------- 1 | /logs/ -------------------------------------------------------------------------------- /airflow/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/airflow/Dockerfile -------------------------------------------------------------------------------- /airflow/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/airflow/README.md -------------------------------------------------------------------------------- /airflow/dags/elt_pipeline_dag.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/airflow/dags/elt_pipeline_dag.py -------------------------------------------------------------------------------- /airflow/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/airflow/requirements.txt -------------------------------------------------------------------------------- /batch_processing/datalake_to_dw.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/batch_processing/datalake_to_dw.py -------------------------------------------------------------------------------- /config/datalake.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/config/datalake.yaml -------------------------------------------------------------------------------- /config/spark.yaml: -------------------------------------------------------------------------------- 1 | spark_config: 2 | executor_memory: 8g 3 | -------------------------------------------------------------------------------- /data_validation/full_flow.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/data_validation/full_flow.ipynb -------------------------------------------------------------------------------- /data_validation/gx/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | uncommitted/ -------------------------------------------------------------------------------- /data_validation/gx/checkpoints/staging_tripdata_asset_checkpoint.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/data_validation/gx/checkpoints/staging_tripdata_asset_checkpoint.yml -------------------------------------------------------------------------------- /data_validation/gx/expectations/.ge_store_backend_id: -------------------------------------------------------------------------------- 1 | store_backend_id = ec2e8f5e-9559-468f-b182-cada6655e5d4 2 | -------------------------------------------------------------------------------- /data_validation/gx/expectations/validate_trip_data.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/data_validation/gx/expectations/validate_trip_data.json -------------------------------------------------------------------------------- /data_validation/gx/great_expectations.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/data_validation/gx/great_expectations.yml -------------------------------------------------------------------------------- /data_validation/gx/plugins/custom_data_docs/styles/data_docs_custom_styles.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/data_validation/gx/plugins/custom_data_docs/styles/data_docs_custom_styles.css -------------------------------------------------------------------------------- /data_validation/reload_and_validate.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/data_validation/reload_and_validate.ipynb -------------------------------------------------------------------------------- /dbt_nyc/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /dbt_nyc/.user.yml: -------------------------------------------------------------------------------- 1 | id: 0ae4c9e5-7325-48e8-97e2-807bd9d6f157 2 | -------------------------------------------------------------------------------- /dbt_nyc/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/README.md -------------------------------------------------------------------------------- /dbt_nyc/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt_nyc/dbt_project.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/dbt_project.yml -------------------------------------------------------------------------------- /dbt_nyc/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt_nyc/macros/generate_schema_name.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/macros/generate_schema_name.sql -------------------------------------------------------------------------------- /dbt_nyc/macros/get_payment_description.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/macros/get_payment_description.sql -------------------------------------------------------------------------------- /dbt_nyc/macros/get_rate_code_description.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/macros/get_rate_code_description.sql -------------------------------------------------------------------------------- /dbt_nyc/macros/get_service_name.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/macros/get_service_name.sql -------------------------------------------------------------------------------- /dbt_nyc/macros/get_vendor_description.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/macros/get_vendor_description.sql -------------------------------------------------------------------------------- /dbt_nyc/models/production/dim_dropoff_location.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/models/production/dim_dropoff_location.sql -------------------------------------------------------------------------------- /dbt_nyc/models/production/dim_payment.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/models/production/dim_payment.sql -------------------------------------------------------------------------------- /dbt_nyc/models/production/dim_pickup_location.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/models/production/dim_pickup_location.sql -------------------------------------------------------------------------------- /dbt_nyc/models/production/dim_rate_code.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/models/production/dim_rate_code.sql -------------------------------------------------------------------------------- /dbt_nyc/models/production/dim_service_type.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/models/production/dim_service_type.sql -------------------------------------------------------------------------------- /dbt_nyc/models/production/dim_vendor.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/models/production/dim_vendor.sql -------------------------------------------------------------------------------- /dbt_nyc/models/production/fact_trip.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/models/production/fact_trip.sql -------------------------------------------------------------------------------- /dbt_nyc/models/production/schema.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/models/production/schema.yml -------------------------------------------------------------------------------- /dbt_nyc/packages.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/packages.yml -------------------------------------------------------------------------------- /dbt_nyc/profiles.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/dbt_nyc/profiles.yml -------------------------------------------------------------------------------- /dbt_nyc/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt_nyc/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt_nyc/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /debezium/configs/taxi-nyc-cdc.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/debezium/configs/taxi-nyc-cdc.json -------------------------------------------------------------------------------- /debezium/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/debezium/run.sh -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/docker-compose.yaml -------------------------------------------------------------------------------- /imgs/airflow_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/airflow_1.png -------------------------------------------------------------------------------- /imgs/airflow_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/airflow_2.png -------------------------------------------------------------------------------- /imgs/airflow_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/airflow_pipeline.png -------------------------------------------------------------------------------- /imgs/batch_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/batch_1.png -------------------------------------------------------------------------------- /imgs/batch_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/batch_2.png -------------------------------------------------------------------------------- /imgs/batch_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/batch_3.png -------------------------------------------------------------------------------- /imgs/batch_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/batch_5.png -------------------------------------------------------------------------------- /imgs/batch_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/batch_6.png -------------------------------------------------------------------------------- /imgs/batch_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/batch_8.png -------------------------------------------------------------------------------- /imgs/big-data-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/big-data-diagram.png -------------------------------------------------------------------------------- /imgs/big-data-diagram.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/big-data-diagram.svg -------------------------------------------------------------------------------- /imgs/dbt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/dbt.png -------------------------------------------------------------------------------- /imgs/logo/airflow_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/logo/airflow_logo.png -------------------------------------------------------------------------------- /imgs/logo/dbt_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/logo/dbt_logo.png -------------------------------------------------------------------------------- /imgs/star_schema_updated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/star_schema_updated.png -------------------------------------------------------------------------------- /imgs/stream_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/stream_1.png -------------------------------------------------------------------------------- /imgs/stream_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/stream_2.png -------------------------------------------------------------------------------- /imgs/stream_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/stream_3.png -------------------------------------------------------------------------------- /imgs/trino.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/imgs/trino.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/requirements.txt -------------------------------------------------------------------------------- /scripts/convert_to_delta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/scripts/convert_to_delta.py -------------------------------------------------------------------------------- /scripts/data/taxi_lookup.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/scripts/data/taxi_lookup.csv -------------------------------------------------------------------------------- /scripts/extract_load.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/scripts/extract_load.py -------------------------------------------------------------------------------- /scripts/transform_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/scripts/transform_data.py -------------------------------------------------------------------------------- /stream-docker-compose.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/stream-docker-compose.yaml -------------------------------------------------------------------------------- /stream_processing/read_parquet_streaming.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/stream_processing/read_parquet_streaming.py -------------------------------------------------------------------------------- /stream_processing/schema_config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/stream_processing/schema_config.json -------------------------------------------------------------------------------- /stream_processing/streaming_to_datalake.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/stream_processing/streaming_to_datalake.py -------------------------------------------------------------------------------- /trino/catalog/datalake.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/trino/catalog/datalake.properties -------------------------------------------------------------------------------- /trino/etc/config.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/trino/etc/config.properties -------------------------------------------------------------------------------- /trino/etc/jvm.config: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/trino/etc/jvm.config -------------------------------------------------------------------------------- /trino/etc/node.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/trino/etc/node.properties -------------------------------------------------------------------------------- /utils/create_schema.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/utils/create_schema.py -------------------------------------------------------------------------------- /utils/create_table.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/utils/create_table.py -------------------------------------------------------------------------------- /utils/helpers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/utils/helpers.py -------------------------------------------------------------------------------- /utils/minio_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/utils/minio_utils.py -------------------------------------------------------------------------------- /utils/postgresql_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/utils/postgresql_client.py -------------------------------------------------------------------------------- /utils/streaming_data_db.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/utils/streaming_data_db.py -------------------------------------------------------------------------------- /utils/streaming_data_json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/utils/streaming_data_json.py -------------------------------------------------------------------------------- /utils/trino_db_scripts_generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trannhatnguyen2/NYC_Taxi_Data_Pipeline/HEAD/utils/trino_db_scripts_generate.py --------------------------------------------------------------------------------