├── public
│   ├── notebooks
│   │   ├── EDA.ipynb
│   │   └── Preprocessing.ipynb
│   └── images
│       ├── DataFlow.png
│       ├── DataLineage.png
│       ├── data_flow.png
│       ├── youtube_icon.png
│       ├── Data_flow_youtube.png
│       └── Data_flow_youtube2.png
├── etl_pipeline
│   ├── dbt_tranform
│   │   ├── analyses
│   │   │   └── .gitkeep
│   │   ├── macros
│   │   │   ├── .gitkeep
│   │   │   └── generate_schema_name.sql
│   │   ├── seeds
│   │   │   └── .gitkeep
│   │   ├── tests
│   │   │   └── .gitkeep
│   │   ├── snapshots
│   │   │   └── .gitkeep
│   │   ├── .gitignore
│   │   ├── .user.yml
│   │   ├── packages.yml
│   │   ├── package-lock.yml
│   │   ├── models
│   │   │   ├── youtube_trending
│   │   │   │   ├── search_linkvideo.sql
│   │   │   │   ├── search_videocategory.sql
│   │   │   │   └── search_information.sql
│   │   │   ├── sources.yml
│   │   │   └── schema.yml
│   │   ├── profiles.yml
│   │   ├── README.md
│   │   └── dbt_project.yml
│   ├── etl_pipeline
│   │   ├── jobs
│   │   │   └── __init__.py
│   │   ├── assets
│   │   │   ├── __init__.py
│   │   │   ├── dbt.py
│   │   │   ├── warehouse.py
│   │   │   ├── gold.py
│   │   │   ├── bronze.py
│   │   │   └── silver.py
│   │   ├── schedules
│   │   │   └── __init__.py
│   │   ├── constants.py
│   │   ├── partitions
│   │   │   └── __init__.py
│   │   ├── func_process.py
│   │   ├── __init__.py
│   │   └── resources
│   │       ├── mysql_io_manager.py
│   │       ├── __init__.py
│   │       ├── minio_io_manager.py
│   │       ├── psql_io_manager.py
│   │       ├── spark_io_manager.py
│   │       └── youtube_io_manager.py
│   ├── etl_pipeline_tests
│   │   ├── __init__.py
│   │   └── test_assets.py
│   ├── setup.cfg
│   ├── pyproject.toml
│   ├── setup.py
│   ├── requirements.txt
│   ├── README.md
│   └── Dockerfile
├── app
│   ├── icons
│   │   ├── video.png
│   │   ├── youtube.png
│   │   ├── youtube_v2.png
│   │   ├── icons8-like-48.png
│   │   ├── icons8-view-48.png
│   │   ├── icons8-channel-48.png
│   │   ├── icons8-category-48.png
│   │   └── icons8-thumbs-down-skin-type-4-48.png
│   ├── streamlit_app.py
│   └── pages
│       ├── search_video.py
│       └── video_detail.py
├── docker-images
│   ├── dagster
│   │   ├── requirements.txt
│   │   └── Dockerfile
│   ├── streamlit
│   │   ├── requirements.txt
│   │   └── Dockerfile
│   └── spark
│       ├── Dockerfile
│       └── spark-defaults.conf
├── dagster_home
│   ├── workspace.yaml
│   └── dagster.yaml
├── .gitignore
├── LICENSE
├── Makefile
├── load_dataset
│   ├── mysql_load.sql
│   ├── psql_schemas.sql
│   └── mysql_schemas.sql
├── docker-compose.yaml
└── README.md
/public/notebooks/EDA.ipynb: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/jobs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/assets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/schedules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline_tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline_tests/test_assets.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /etl_pipeline/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = etl_pipeline 3 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/.user.yml: -------------------------------------------------------------------------------- 1 | id: 34ac7379-38f0-4235-94d4-210cae8a4832 2 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/constants.py: -------------------------------------------------------------------------------- 1 | START_DATE = "2020-06-15" 2 | END_DATE = "2024-05-13" -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 -------------------------------------------------------------------------------- /app/icons/video.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/video.png -------------------------------------------------------------------------------- /app/icons/youtube.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/youtube.png -------------------------------------------------------------------------------- /app/icons/youtube_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/youtube_v2.png -------------------------------------------------------------------------------- /docker-images/dagster/requirements.txt: -------------------------------------------------------------------------------- 1 | dagster==1.7.3 2 | dagit==1.7.3 3 | dagster-postgres 4 | dagster-dbt==0.23.3 5 | dagster-spark==0.23.3 -------------------------------------------------------------------------------- /public/images/DataFlow.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/DataFlow.png -------------------------------------------------------------------------------- /app/icons/icons8-like-48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/icons8-like-48.png -------------------------------------------------------------------------------- /app/icons/icons8-view-48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/icons8-view-48.png -------------------------------------------------------------------------------- /dagster_home/workspace.yaml: -------------------------------------------------------------------------------- 1 | load_from: 2 | - grpc_server: 3 | host: etl_pipeline 4 | port: 4000 5 | location_name: "etl_pipeline" -------------------------------------------------------------------------------- /public/images/DataLineage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/DataLineage.png -------------------------------------------------------------------------------- /public/images/data_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/data_flow.png -------------------------------------------------------------------------------- /app/icons/icons8-channel-48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/icons8-channel-48.png -------------------------------------------------------------------------------- /public/images/youtube_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/youtube_icon.png -------------------------------------------------------------------------------- /app/icons/icons8-category-48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/icons8-category-48.png -------------------------------------------------------------------------------- /public/images/Data_flow_youtube.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/Data_flow_youtube.png -------------------------------------------------------------------------------- /public/images/Data_flow_youtube2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/Data_flow_youtube2.png -------------------------------------------------------------------------------- /docker-images/streamlit/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.34.0 2 | 
psycopg2-binary==2.9.9 3 | pandas==2.2.2 4 | polars==0.20.23 5 | # scikit-learn==1.5.0 6 | # surprise==0.1 -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | sha1_hash: a158c48c59c2bb7d729d2a4e215aabe5bb4f3353 5 | -------------------------------------------------------------------------------- /etl_pipeline/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.dagster] 6 | module_name = "etl_pipeline" 7 | -------------------------------------------------------------------------------- /app/icons/icons8-thumbs-down-skin-type-4-48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/icons8-thumbs-down-skin-type-4-48.png -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/models/youtube_trending/search_linkvideo.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | {{ config(materialized="table") }} 4 | 5 | SELECT 6 | video_id, 7 | link_video 8 | FROM {{ source("gold", "linkvideos") }} -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/models/youtube_trending/search_videocategory.sql: -------------------------------------------------------------------------------- 1 | 2 | {{ config(materialized="table") }} 3 | 4 | SELECT 5 | categoryid, 6 | categoryname 7 | FROM {{ source("gold", "videocategory") }} -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/partitions/__init__.py: -------------------------------------------------------------------------------- 1 | from dagster import MonthlyPartitionsDefinition 2 | from .. 
import constants 3 | 4 | 5 | monthly_partitions = MonthlyPartitionsDefinition( 6 | start_date=constants.START_DATE, 7 | end_date=constants.END_DATE 8 | ) -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/func_process.py: -------------------------------------------------------------------------------- 1 | def replace_str(value: str): 2 | return value.replace("default", "maxresdefault") 3 | 4 | def format_date(value: str): 5 | return value.replace("T", " ").replace("Z", "") 6 | 7 | def convert(value: str): 8 | return value.replace('"', '') -------------------------------------------------------------------------------- /etl_pipeline/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="etl_pipeline", 5 | packages=find_packages(exclude=["etl_pipeline_tests"]), 6 | install_requires=[ 7 | "dagster", 8 | "dagster-cloud" 9 | ], 10 | extras_require={"dev": ["dagster-webserver", "pytest"]}, 11 | ) 12 | -------------------------------------------------------------------------------- /etl_pipeline/requirements.txt: -------------------------------------------------------------------------------- 1 | dagster==1.7.3 2 | dagit==1.7.3 3 | pandas==2.2.2 4 | polars==0.20.23 5 | pyarrow==16.0.0 6 | minio==7.2.7 7 | pymysql==1.1.0 8 | cryptography==42.0.5 9 | psycopg2-binary==2.9.9 10 | dagster-postgres 11 | google-api-python-client==2.127.0 12 | pyspark==3.4.3 13 | dbt-postgres==1.7.13 14 | dagster-dbt==0.23.3 15 | dagster-spark==0.23.3 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .etl-pipeline/ 2 | .idea/ 3 | .pytest_cache/ 4 | 5 | .logs_queue/ 6 | .nux/ 7 | .telemetry/ 8 | history/ 9 | logs/ 10 | dagster_home/schedules/ 11 | 12 | dataset/ 13 | 14 | .mypy_cache/ 15 | __pycache__/ 16 | 17 | minio/ 18 | mysql/ 19 | postgresql/ 20 | venv/ 21 | 22 | .env/ 23 | data/ 24 | .streamlit/ 25 | 26 | .env.spark_master 27 | .env.spark_worker 28 | .env -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/macros/generate_schema_name.sql: -------------------------------------------------------------------------------- 1 | {% macro generate_schema_name(custom_schema_name, node) -%} 2 | 3 | {%- set default_schema = target.schema -%} 4 | {%- if custom_schema_name is none -%} 5 | 6 | {{ default_schema }} 7 | 8 | {%- else -%} 9 | 10 | {{ custom_schema_name | trim }} 11 | 12 | {%- endif -%} 13 | 14 | {%- endmacro %} -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/profiles.yml: -------------------------------------------------------------------------------- 1 | dbt_tranform: 2 | outputs: 3 | dev: 4 | dbname: youTube_trending_video 5 | host: "{{ env_var('POSTGRES_HOST') }}" 6 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 7 | port: 5432 8 | schema: gold 9 | threads: 1 10 | type: postgres 11 | user: "{{ env_var('POSTGRES_USER') }}" 12 | target: dev 13 | -------------------------------------------------------------------------------- /docker-images/dagster/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | ENV DAGSTER_HOME=/opt/dagster/dagster_home 4 | 5 | RUN mkdir -p $DAGSTER_HOME && \ 6 | mkdir -p $DAGSTER_HOME/storage && \ 7 | mkdir -p 
$DAGSTER_HOME/compute_logs && \ 8 | mkdir -p $DAGSTER_HOME/local_artifact_storage 9 | 10 | WORKDIR $DAGSTER_HOME 11 | 12 | COPY requirements.txt $DAGSTER_HOME 13 | 14 | RUN pip install --upgrade pip && pip install -r requirements.txt -------------------------------------------------------------------------------- /docker-images/streamlit/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | WORKDIR /app 4 | 5 | RUN apt-get update && apt-get install -y \ 6 | build-essential \ 7 | curl \ 8 | software-properties-common \ 9 | git \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | COPY . . 13 | 14 | RUN pip install --upgrade pip && pip install -r requirements.txt 15 | 16 | EXPOSE 8501 17 | 18 | HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health 19 | 20 | ENTRYPOINT ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"] -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/models/sources.yml: -------------------------------------------------------------------------------- 1 | 2 | version: 2 3 | 4 | sources: 5 | - name: gold 6 | tables: 7 | - name: videocategory 8 | meta: 9 | dagster: 10 | asset_key: ["warehouse", "gold", "videoCategory"] 11 | - name: linkvideos 12 | meta: 13 | dagster: 14 | asset_key: ["warehouse", "gold", "linkVideos"] 15 | - name: metricvideos 16 | meta: 17 | dagster: 18 | asset_key: ["warehouse", "gold", "metricVideos"] 19 | - name: informationvideos 20 | meta: 21 | dagster: 22 | asset_key: ["warehouse", "gold", "informationVideos"] -------------------------------------------------------------------------------- /docker-images/spark/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/bitnami/spark:3.4.3 2 | 3 | USER root 4 | 5 | # Install prerequisites 6 | RUN apt-get update && apt-get install -y curl 7 | 8 | RUN curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \ 9 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \ 10 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \ 11 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \ 12 | && mv s3-2.18.41.jar /opt/bitnami/spark/jars \ 13 | && mv aws-java-sdk-1.12.367.jar /opt/bitnami/spark/jars \ 14 | && mv delta-core_2.12-2.2.0.jar /opt/bitnami/spark/jars \ 15 | && mv delta-storage-2.2.0.jar /opt/bitnami/spark/jars -------------------------------------------------------------------------------- 
/docker-images/spark/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.jars jars/delta-core_2.12-2.2.0.jar,jars/hadoop-aws-3.3.2.jar,jars/delta-storage-2.2.0.jar,jars/aws-java-sdk-1.12.367.jar,jars/s3-2.18.41.jar,jars/aws-java-sdk-bundle-1.11.1026.jar 2 | spark.sql.extensions io.delta.sql.DeltaSparkSessionExtension 3 | spark.sql.catalog.spark_catalog org.apache.spark.sql.delta.catalog.DeltaCatalog 4 | spark.hadoop.fs.s3a.endpoint http://minio:9000 5 | spark.hadoop.fs.s3a.access.key minio 6 | spark.hadoop.fs.s3a.secret.key minio123 7 | ; spark.hadoop.fs.s3a.awsAccessKeyId minio 8 | ; spark.hadoop.fs.s3a.awsSecretAccessKey minio123 9 | spark.hadoop.fs.s3a.path.style.access true 10 | spark.hadoop.fs.s3a.connection.ssl.enabled false 11 | spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem 12 | spark.driver.memory 4g 13 | spark.executor.memory 4g -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dagster import Definitions, load_assets_from_modules 4 | from dagster_dbt import DbtCliResource 5 | 6 | from .assets import bronze, gold, silver, warehouse, dbt 7 | from .resources import mysql, minio, postgres, youtube, spark 8 | 9 | 10 | all_assets = load_assets_from_modules( 11 | [bronze, silver, gold, warehouse, dbt]) 12 | 13 | defs = Definitions( 14 | assets=all_assets, 15 | resources={ 16 | "mysql_io_manager": mysql, 17 | "minio_io_manager": minio, 18 | "psql_io_manager": postgres, 19 | "youtube_io_manager": youtube, 20 | "spark_io_manager": spark, 21 | "dbt": DbtCliResource( 22 | project_dir=os.fspath(dbt.DBT_PROJECT_DIR), 23 | profiles_dir=os.fspath(dbt.DBT_PROFILE_DIR) 24 | ), 25 | }, 26 | ) 27 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/models/youtube_trending/search_information.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | {{ config(materialized="table") }} 4 | 5 | select distinct 6 | i.video_id 7 | , i.title 8 | , i.channeltitle 9 | , v.categoryname 10 | , m.view 11 | , m.like as likes 12 | , m.dislike 13 | , m.publishedat 14 | , l.link_video 15 | , i.tags 16 | , i.thumbnail_link 17 | 18 | from {{ source('gold', 'informationvideos') }} i 19 | inner join {{ source('gold', 'linkvideos') }} l on i.video_id = l.video_id 20 | inner join {{ source('gold', 'videocategory') }} v on i.categoryid = v.categoryid 21 | inner join ( 22 | SELECT 23 | video_id 24 | , MAX(view_count) AS view 25 | , MAX(likes) as like 26 | , MAX(dislikes) as dislike 27 | , MAX(publishedat) as publishedat 28 | FROM {{ source('gold', 'metricvideos') }} 29 | GROUP BY video_id 30 | ) AS m on i.video_id = m.video_id 31 | 32 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/assets/dbt.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | from dagster import AssetExecutionContext 5 | from dagster_dbt import DbtCliResource, dbt_assets 6 | from dagster_dbt import DagsterDbtTranslator 7 | 8 | from typing import Mapping, Optional, Any 9 | 10 | 11 | 12 | DBT_PROJECT_DIR = Path(__file__).joinpath("..", "..", "..", "dbt_tranform").resolve() 13 | DBT_PROFILE_DIR = Path(__file__).joinpath("..", "..", "..", "dbt_tranform").resolve() 14 | 
DBT_MANIFEST_PATH = DBT_PROJECT_DIR.joinpath("target", "manifest.json") 15 | 16 | class CustomDagsterDbtTranslator(DagsterDbtTranslator): 17 | def get_group_name( 18 | self, dbt_resource_props: Mapping[str, Any] 19 | ) -> Optional[str]: 20 | return "warehouse" 21 | 22 | 23 | @dbt_assets( 24 | manifest=DBT_MANIFEST_PATH, 25 | dagster_dbt_translator=CustomDagsterDbtTranslator() 26 | ) 27 | def Brazilian_ECommerce_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource): 28 | yield from dbt.cli(["build"], context=context).stream() 29 | -------------------------------------------------------------------------------- /app/streamlit_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import psycopg2 3 | import polars as pl 4 | import pandas as pd 5 | from PIL import Image 6 | from io import BytesIO 7 | import requests 8 | 9 | icon = Image.open("./icons/youtube_v2.png", mode="r") 10 | 11 | st.set_page_config( 12 | page_title="YouTube RecoMaster", 13 | page_icon=icon, 14 | layout="centered", 15 | initial_sidebar_state="expanded" 16 | ) 17 | 18 | title, logo = st.columns([4,2.91]) 19 | with title: 20 | st.title("YouTube RecoMaster") 21 | with logo: 22 | st.write("") 23 | st.image(icon, width=70) 24 | 25 | 26 | st.markdown( 27 | f'''''', 30 | unsafe_allow_html=True 31 | ) -------------------------------------------------------------------------------- /dagster_home/dagster.yaml: -------------------------------------------------------------------------------- 1 | run_coordinator: 2 | module: dagster.core.run_coordinator 3 | class: QueuedRunCoordinator 4 | config: 5 | max_concurrent_runs: 3 6 | 7 | scheduler: 8 | module: dagster.core.scheduler 9 | class: DagsterDaemonScheduler 10 | config: 11 | max_catchup_runs: 5 12 | 13 | storage: 14 | postgres: 15 | postgres_db: 16 | username: 17 | env: DAGSTER_PG_USERNAME 18 | password: 19 | env: DAGSTER_PG_PASSWORD 20 | hostname: 21 | env: DAGSTER_PG_HOSTNAME 22 | db_name: 23 | env: DAGSTER_PG_DB 24 | port: 5432 25 | 26 | run_launcher: 27 | module: dagster.core.launcher 28 | class: DefaultRunLauncher 29 | 30 | compute_logs: 31 | module: dagster.core.storage.local_compute_log_manager 32 | class: LocalComputeLogManager 33 | config: 34 | base_dir: /opt/dagster/dagster_home/compute_logs 35 | 36 | local_artifact_storage: 37 | module: dagster.core.storage.root 38 | class: LocalArtifactStorage 39 | config: 40 | base_dir: /opt/dagster/dagster_home/local_artifact_storage -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Long Nguyen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/resources/mysql_io_manager.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from contextlib import contextmanager 3 | from sqlalchemy import create_engine 4 | 5 | from dagster import IOManager, OutputContext, InputContext 6 | 7 | 8 | @contextmanager 9 | def connect_mysql(config: dict): 10 | conn_info = ( 11 | f"mysql+pymysql://{config['user']}:{config['password']}" 12 | + f"@{config['host']}:{config['port']}" 13 | + f"/{config['database']}" 14 | ) 15 | db_conn = create_engine(conn_info) 16 | try: 17 | yield db_conn 18 | except Exception: 19 | raise 20 | 21 | 22 | class MySQLIOManager(IOManager): 23 | 24 | def __init__(self, config): 25 | self._config = config 26 | 27 | def handle_output(self, context: OutputContext, obj: pl.DataFrame): 28 | pass 29 | 30 | def load_input(self, context: InputContext) -> pl.DataFrame: 31 | pass 32 | 33 | def extract_data(self, sql: str) -> pl.DataFrame: 34 | with connect_mysql(self._config) as db_conn: 35 | pd_data = pl.read_database(query=sql, connection=db_conn) 36 | return pd_data -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | include .env 2 | 3 | 4 | build: 5 | docker-compose build 6 | 7 | up: 8 | docker-compose --env-file .env up -d 9 | 10 | down: 11 | docker-compose --env-file .env down 12 | 13 | restart: 14 | make down && make up 15 | 16 | to_psql: 17 | docker exec -ti de_psql psql postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB} 18 | 19 | to_mysql: 20 | docker exec -it de_mysql mysql --local-infile=1 -u"${MYSQL_USER}" -p"${MYSQL_PASSWORD}" ${MYSQL_DATABASE} 21 | 22 | to_mysql_root: 23 | docker exec -it de_mysql mysql -u"root" -p"${MYSQL_ROOT_PASSWORD}" ${MYSQL_DATABASE} 24 | 25 | mysql_create: 26 | docker exec -it de_mysql mysql --local_infile -u"${MYSQL_USER}" -p"${MYSQL_PASSWORD}" ${MYSQL_DATABASE} -e"source /tmp/load_dataset/mysql_schemas.sql" 27 | 28 | mysql_load: 29 | docker exec -it de_mysql mysql --local_infile=1 -u"${MYSQL_USER}" -p"${MYSQL_PASSWORD}" ${MYSQL_DATABASE} -e"source /tmp/load_dataset/mysql_load.sql" 30 | 31 | psql_create: 32 | docker exec -it de_psql psql postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB} -f /tmp/load_dataset/psql_schemas.sql -a -------------------------------------------------------------------------------- /load_dataset/mysql_load.sql: -------------------------------------------------------------------------------- 1 | LOAD DATA LOCAL INFILE '/tmp/youTube_trending_video/DE_youtube_trending_data.csv' 2 | INTO TABLE DE_youtube_trending_data 3 | FIELDS TERMINATED BY ',' 4 | ENCLOSED BY '"' 5 | LINES TERMINATED BY '\n' 6 | IGNORE 1 ROWS; 7 | 8 | LOAD DATA LOCAL INFILE '/tmp/youTube_trending_video/JP_youtube_trending_data.csv' 9 | INTO 
TABLE JP_youtube_trending_data 10 | FIELDS TERMINATED BY ',' 11 | ENCLOSED BY '"' 12 | LINES TERMINATED BY '\n' 13 | IGNORE 1 ROWS; 14 | 15 | LOAD DATA LOCAL INFILE '/tmp/youTube_trending_video/RU_youtube_trending_data.csv' 16 | INTO TABLE RU_youtube_trending_data 17 | FIELDS TERMINATED BY ',' 18 | ENCLOSED BY '"' 19 | LINES TERMINATED BY '\n' 20 | IGNORE 1 ROWS; 21 | 22 | LOAD DATA LOCAL INFILE '/tmp/youTube_trending_video/CA_youtube_trending_data.csv' 23 | INTO TABLE CA_youtube_trending_data 24 | FIELDS TERMINATED BY ',' 25 | ENCLOSED BY '"' 26 | LINES TERMINATED BY '\n' 27 | IGNORE 1 ROWS; 28 | 29 | LOAD DATA LOCAL INFILE '/tmp/youTube_trending_video/IN_youtube_trending_data.csv' 30 | INTO TABLE IN_youtube_trending_data 31 | FIELDS TERMINATED BY ',' 32 | ENCLOSED BY '"' 33 | LINES TERMINATED BY '\n' 34 | IGNORE 1 ROWS; -------------------------------------------------------------------------------- /load_dataset/psql_schemas.sql: -------------------------------------------------------------------------------- 1 | DROP SCHEMA IF EXISTS gold CASCADE; 2 | CREATE SCHEMA gold; 3 | 4 | DROP TABLE IF EXISTS gold.videoCategory; 5 | CREATE TABLE gold.videoCategory ( 6 | categoryId VARCHAR(5), 7 | categoryName VARCHAR(50) 8 | ); 9 | 10 | DROP TABLE IF EXISTS gold.linkVideos; 11 | CREATE TABLE gold.linkVideos ( 12 | video_id VARCHAR(20), 13 | link_video VARCHAR(50) 14 | ); 15 | 16 | DROP TABLE IF EXISTS gold.metricVideos; 17 | CREATE TABLE gold.metricVideos ( 18 | video_id VARCHAR(20), 19 | -- country_code, 20 | publishedAt TIMESTAMP, 21 | trending_date TIMESTAMP, 22 | channelId VARCHAR(27), 23 | categoryId VARCHAR(5), 24 | view_count INTEGER, 25 | likes INTEGER, 26 | dislikes INTEGER, 27 | comment_count INTEGER 28 | ); 29 | 30 | DROP TABLE IF EXISTS gold.informationVideos; 31 | CREATE TABLE gold.informationVideos ( 32 | video_id VARCHAR(20), 33 | -- country_code, 34 | title TEXT, 35 | channelId VARCHAR(27), 36 | channelTitle TEXT, 37 | categoryId VARCHAR(5), 38 | tags TEXT, 39 | thumbnail_link TEXT, 40 | comments_disabled VARCHAR(5), 41 | ratings_disabled VARCHAR(5) 42 | ); -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'dbt_tranform' 6 | version: '1.0.0' 7 | 8 | # This setting configures which "profile" dbt uses for this project. 9 | profile: 'dbt_tranform' 10 | 11 | # These configurations specify where dbt should look for different types of files. 12 | # The `model-paths` config, for example, states that models in this project can be 13 | # found in the "models/" directory. You probably won't need to change these! 14 | model-paths: ["models"] 15 | analysis-paths: ["analyses"] 16 | test-paths: ["tests"] 17 | seed-paths: ["seeds"] 18 | macro-paths: ["macros"] 19 | snapshot-paths: ["snapshots"] 20 | 21 | clean-targets: # directories to be removed by `dbt clean` 22 | - "target" 23 | - "dbt_packages" 24 | 25 | 26 | # Configuring models 27 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 28 | 29 | # In this example config, we tell dbt to build all models in the example/ 30 | # directory as views. These settings can be overridden in the individual model 31 | # files using the `{{ config(...) }}` macro. 
32 | models: 33 | dbt_tranform: 34 | # Config indicated by + and applies to all files under models/example/ 35 | youtube_trending: 36 | +materialized: table 37 | +schema: youtube_trending 38 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/models/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: search_videocategory 5 | description: "" 6 | columns: 7 | - name: categoryid 8 | description: "The primary key for this table" 9 | tests: 10 | - unique 11 | - not_null 12 | - name: categoryname 13 | description: "" 14 | tests: 15 | - unique 16 | - not_null 17 | - accepted_values: 18 | values: ['Film & Animation', 'Autos & Vehicles', 'Music', 'Pets & Animals', 'Sports', 'Travel & Events', 'Gaming', 'People & Blogs', 'Comedy', 'Entertainment', 'News & Politics', 'Howto & Style', 'Education', 'Science & Technology', 'Nonprofits & Activism'] 19 | 20 | - name: search_linkvideo 21 | description: "" 22 | columns: 23 | - name: video_id 24 | description: "The primary key for this table" 25 | tests: 26 | - unique 27 | - not_null 28 | - name: link_video 29 | description: "" 30 | tests: 31 | - unique 32 | - not_null 33 | post-hook: 34 | - "CREATE INDEX IF NOT EXISTS idx_video_id ON {{ this }} (video_id)" 35 | 36 | 37 | - name: search_information 38 | description: "" 39 | columns: 40 | - name: video_id 41 | description: "The primary key for this table" 42 | tests: 43 | - not_null 44 | - name: categoryname 45 | description: "" 46 | tests: 47 | - not_null 48 | - accepted_values: 49 | values: ['Film & Animation', 'Autos & Vehicles', 'Music', 'Pets & Animals', 'Sports', 'Travel & Events', 'Gaming', 'People & Blogs', 'Comedy', 'Entertainment', 'News & Politics', 'Howto & Style', 'Education', 'Science & Technology', 'Nonprofits & Activism'] 50 | 51 | post-hook: 52 | - "CREATE INDEX IF NOT EXISTS idx_video_id ON {{ this }} (video_id, categoryname, tags)" -------------------------------------------------------------------------------- /etl_pipeline/README.md: -------------------------------------------------------------------------------- 1 | # etl_pipeline 2 | 3 | This is a [Dagster](https://dagster.io/) project scaffolded with [`dagster project scaffold`](https://docs.dagster.io/getting-started/create-new-project). 4 | 5 | ## Getting started 6 | 7 | First, install your Dagster code location as a Python package. By using the --editable flag, pip will install your Python package in ["editable mode"](https://pip.pypa.io/en/latest/topics/local-project-installs/#editable-installs) so that as you develop, local code changes will automatically apply. 8 | 9 | ```bash 10 | pip install -e ".[dev]" 11 | ``` 12 | 13 | Then, start the Dagster UI web server: 14 | 15 | ```bash 16 | dagster dev 17 | ``` 18 | 19 | Open http://localhost:3000 with your browser to see the project. 20 | 21 | You can start writing assets in `etl_pipeline/assets.py`. The assets are automatically loaded into the Dagster code location as you define them. 22 | 23 | ## Development 24 | 25 | ### Adding new Python dependencies 26 | 27 | You can specify new Python dependencies in `setup.py`. 
28 | 29 | ### Unit testing 30 | 31 | Tests are in the `etl_pipeline_tests` directory and you can run tests using `pytest`: 32 | 33 | ```bash 34 | pytest etl_pipeline_tests 35 | ``` 36 | 37 | ### Schedules and sensors 38 | 39 | If you want to enable Dagster [Schedules](https://docs.dagster.io/concepts/partitions-schedules-sensors/schedules) or [Sensors](https://docs.dagster.io/concepts/partitions-schedules-sensors/sensors) for your jobs, the [Dagster Daemon](https://docs.dagster.io/deployment/dagster-daemon) process must be running. This is done automatically when you run `dagster dev`. 40 | 41 | Once your Dagster Daemon is running, you can start turning on schedules and sensors for your jobs. 42 | 43 | ## Deploy on Dagster Cloud 44 | 45 | The easiest way to deploy your Dagster project is to use Dagster Cloud. 46 | 47 | Check out the [Dagster Cloud Documentation](https://docs.dagster.cloud) to learn more. 48 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/resources/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .mysql_io_manager import MySQLIOManager 3 | from .minio_io_manager import MinIOIOManager 4 | from .psql_io_manager import PostgreSQLIOManager 5 | from .youtube_io_manager import YoutubeIOManager 6 | from .spark_io_manager import SparkIOManager 7 | 8 | 9 | mysql = MySQLIOManager( 10 | { 11 | "host": os.getenv("MYSQL_HOST"), 12 | "port": 3306, 13 | "database": os.getenv("MYSQL_DATABASE"), 14 | "user": os.getenv("MYSQL_USER"), 15 | "password": os.getenv("MYSQL_PASSWORD"), 16 | } 17 | ) 18 | 19 | minio = MinIOIOManager( 20 | { 21 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 22 | "bucket": os.getenv("DATALAKE_BUCKET"), 23 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 24 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 25 | } 26 | ) 27 | 28 | postgres = PostgreSQLIOManager( 29 | { 30 | "host": os.getenv("POSTGRES_HOST"), 31 | "port": os.getenv("POSTGRES_PORT"), 32 | "database": os.getenv("POSTGRES_DB"), 33 | "user": os.getenv("POSTGRES_USER"), 34 | "password": os.getenv("POSTGRES_PASSWORD"), 35 | } 36 | ) 37 | 38 | youtube = YoutubeIOManager( 39 | { 40 | "api_service_name": os.getenv("API_SERVICE_NAME"), 41 | "api_version": os.getenv("API_VERSION"), 42 | "api_key": os.getenv("API_KEY"), 43 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 44 | "bucket": os.getenv("DATALAKE_BUCKET"), 45 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 46 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 47 | } 48 | ) 49 | 50 | spark = SparkIOManager( 51 | { 52 | "spark_master_url": os.getenv("SPARK_MASTER_URL"), 53 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 54 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 55 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 56 | "bucket": os.getenv("DATALAKE_BUCKET"), 57 | } 58 | ) -------------------------------------------------------------------------------- /etl_pipeline/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | ARG openjdk_version="17" 4 | 5 | # USER root 6 | 7 | RUN apt-get update --yes && \ 8 | apt-get install --yes \ 9 | curl \ 10 | "openjdk-${openjdk_version}-jre-headless" ca-certificates-java procps && \ 11 | apt-get clean && rm -rf /var/lib/apt/lists/* 12 | 13 | 14 | RUN curl -O -L https://dlcdn.apache.org/spark/spark-3.4.3/spark-3.4.3-bin-hadoop3.tgz \ 15 | && tar -zxvf 
spark-3.4.3-bin-hadoop3.tgz \ 16 | && rm -rf spark-3.4.3-bin-hadoop3.tgz \ 17 | && mv spark-3.4.3-bin-hadoop3/ /usr/local/ \ 18 | && rm -rf /usr/local/spark \ 19 | && rm -rf /usr/local/spark-3.3.0-bin-hadoop3 \ 20 | && ln -s /usr/local/spark-3.4.3-bin-hadoop3 /usr/local/spark 21 | 22 | 23 | RUN curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \ 24 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \ 25 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar \ 26 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \ 27 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \ 28 | && curl -O https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.2/hadoop-aws-3.3.2.jar \ 29 | # && mkdir -p /usr/local/spark/jars \ 30 | && mv s3-2.18.41.jar /usr/local/spark/jars \ 31 | && mv aws-java-sdk-1.12.367.jar /usr/local/spark/jars \ 32 | && mv aws-java-sdk-bundle-1.11.1026.jar /usr/local/spark/jars \ 33 | && mv delta-core_2.12-2.2.0.jar /usr/local/spark/jars \ 34 | && mv delta-storage-2.2.0.jar /usr/local/spark/jars \ 35 | && mv hadoop-aws-3.3.2.jar /usr/local/spark/jars 36 | 37 | 38 | WORKDIR /opt/dagster/app 39 | COPY requirements.txt /opt/dagster/app 40 | RUN pip install --upgrade pip && pip install -r requirements.txt 41 | COPY . /opt/dagster/app 42 | 43 | CMD ["dagster", "api", "grpc", "-h", "0.0.0.0", "-p", "4000", "-m", "etl_pipeline"] -------------------------------------------------------------------------------- /load_dataset/mysql_schemas.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS youtube_trending_data; 2 | 3 | 4 | DROP TABLE IF EXISTS CA_youtube_trending_data; 5 | CREATE TABLE CA_youtube_trending_data ( 6 | video_id VARCHAR(20), 7 | title TEXT, 8 | publishedAt VARCHAR(27), 9 | channelId VARCHAR(27), 10 | channelTitle TEXT, 11 | categoryId VARCHAR(5), 12 | trending_date VARCHAR(27), 13 | tags TEXT, 14 | view_count TEXT, 15 | likes TEXT, 16 | dislikes TEXT, 17 | comment_count TEXT, 18 | thumbnail_link TEXT, 19 | comments_disabled VARCHAR(6), 20 | ratings_disabled VARCHAR(6) 21 | ); 22 | 23 | DROP TABLE IF EXISTS DE_youtube_trending_data; 24 | CREATE TABLE DE_youtube_trending_data ( 25 | video_id VARCHAR(20), 26 | title TEXT, 27 | publishedAt VARCHAR(27), 28 | channelId VARCHAR(27), 29 | channelTitle TEXT, 30 | categoryId VARCHAR(5), 31 | trending_date VARCHAR(27), 32 | tags TEXT, 33 | view_count TEXT, 34 | likes TEXT, 35 | dislikes TEXT, 36 | comment_count TEXT, 37 | thumbnail_link TEXT, 38 | comments_disabled VARCHAR(6), 39 | ratings_disabled VARCHAR(6) 40 | ); 41 | 42 | DROP TABLE IF EXISTS IN_youtube_trending_data; 43 | CREATE TABLE IN_youtube_trending_data ( 44 | video_id VARCHAR(20), 45 | title TEXT, 46 | publishedAt VARCHAR(27), 47 | channelId VARCHAR(27), 48 | channelTitle TEXT, 49 | categoryId VARCHAR(5), 50 | trending_date VARCHAR(27), 51 | tags TEXT, 52 | view_count TEXT, 53 | likes TEXT, 54 | dislikes TEXT, 55 | comment_count TEXT, 56 | thumbnail_link TEXT, 57 | comments_disabled VARCHAR(6), 58 | ratings_disabled VARCHAR(6) 59 | ); 60 | 61 | DROP TABLE IF EXISTS JP_youtube_trending_data; 62 | CREATE TABLE JP_youtube_trending_data ( 63 | video_id VARCHAR(20), 64 | title TEXT, 65 | publishedAt VARCHAR(27), 66 | channelId VARCHAR(27), 67 | channelTitle TEXT, 68 | 
categoryId VARCHAR(5), 69 | trending_date VARCHAR(27), 70 | tags TEXT, 71 | view_count TEXT, 72 | likes TEXT, 73 | dislikes TEXT, 74 | comment_count TEXT, 75 | thumbnail_link TEXT, 76 | comments_disabled VARCHAR(6), 77 | ratings_disabled VARCHAR(6) 78 | ); 79 | 80 | DROP TABLE IF EXISTS RU_youtube_trending_data; 81 | CREATE TABLE RU_youtube_trending_data ( 82 | video_id VARCHAR(20), 83 | title TEXT, 84 | publishedAt VARCHAR(27), 85 | channelId VARCHAR(27), 86 | channelTitle TEXT, 87 | categoryId VARCHAR(5), 88 | trending_date VARCHAR(27), 89 | tags TEXT, 90 | view_count TEXT, 91 | likes TEXT, 92 | dislikes TEXT, 93 | comment_count TEXT, 94 | thumbnail_link TEXT, 95 | comments_disabled VARCHAR(6), 96 | ratings_disabled VARCHAR(6) 97 | ); -------------------------------------------------------------------------------- /app/pages/search_video.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import psycopg2 3 | import polars as pl 4 | import pandas as pd 5 | from PIL import Image 6 | from io import BytesIO 7 | import requests 8 | 9 | 10 | icon = Image.open("./icons/youtube_v2.png", mode="r") 11 | 12 | st.set_page_config( 13 | page_title="YouTube RecoMaster", 14 | page_icon=icon, 15 | layout="centered", 16 | initial_sidebar_state="expanded" 17 | ) 18 | 19 | @st.cache_resource 20 | def init_connection(): 21 | return psycopg2.connect(**st.secrets["postgres"]) 22 | 23 | conn = init_connection() 24 | 25 | @st.cache_data(ttl=600) 26 | def run_query(query): 27 | with conn.cursor() as cur: 28 | cur.execute(query) 29 | return cur.fetchall() 30 | 31 | 32 | title, logo = st.columns([4,2.91]) 33 | with title: 34 | st.title("YouTube RecoMaster") 35 | with logo: 36 | st.write("") 37 | st.image(icon, width=70) 38 | 39 | st.slider("Size") 40 | video_name = st.text_input("Enter a video name") 41 | st.write(f"You entered: {video_name}") 42 | 43 | 44 | data = run_query( 45 | f""" 46 | SELECT DISTINCT 47 | video_id, 48 | title, 49 | channeltitle, 50 | thumbnail_link, 51 | link_video, 52 | categoryname, 53 | view 54 | FROM youtube_trending.search_information 55 | WHERE title LIKE '%{video_name}%' 56 | LIMIT 10; 57 | """ 58 | ) 59 | 60 | videos = { 61 | "video_id": [e[0] for e in data], 62 | "title": [e[1] for e in data], 63 | "channeltitle": [e[2] for e in data], 64 | "thumbnail_link": [e[3] for e in data], 65 | "link_video": [e[4] for e in data], 66 | "categoryname": [e[5] for e in data], 67 | "view_count": [e[6] for e in data] 68 | } 69 | video_url = "https://www.youtube.com/embed/J78aPJ3VyNs" 70 | 71 | recommended_videos = [] 72 | recommended_videos += videos['link_video'] 73 | 74 | st.subheader(f"Have {len(videos['video_id'])} results for keyword: {video_name}") 75 | for video_id,title,channeltitle,thumbnail_link,link_video,categoryname,view_count in zip( 76 | videos['video_id'],videos['title'],videos['channeltitle'], 77 | videos['thumbnail_link'],videos['link_video'],videos['categoryname'],videos['view_count']): 78 | 79 | col1, col2 = st.columns([1, 1]) 80 | 81 | with col1: 82 | img = Image.open(BytesIO(requests.get(thumbnail_link).content)) 83 | st.markdown( 84 | f'', 85 | unsafe_allow_html=True, 86 | ) 87 | st.image(img, use_column_width=True) 88 | 89 | with col2: 90 | st.write("") 91 | st.markdown(f""" 92 |
93 | {title}
94 | channel: {channeltitle}
95 | category: {categoryname}
96 | views: {view_count} 97 |
98 | """, unsafe_allow_html=True) 99 | st.write("") 100 | is_clicked = st.button("Watch", key=video_id) 101 | 102 | if is_clicked: 103 | st.experimental_set_query_params(video_id=video_id) 104 | # st.experimental_rerun() 105 | st.switch_page("./pages/video_detail.py") 106 | 107 | 108 | st.write("---") 109 | 110 | 111 | # df = pl.DataFrame(videos) 112 | # st.table(df) -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/resources/minio_io_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | from contextlib import contextmanager 3 | from datetime import datetime 4 | from typing import Union 5 | 6 | import polars as pl 7 | import pyarrow as pa 8 | import pyarrow.parquet as pq 9 | from dagster import IOManager, OutputContext, InputContext 10 | from minio import Minio 11 | 12 | 13 | @contextmanager 14 | def connect_minio(config: dict): 15 | client = Minio( 16 | endpoint=config.get("endpoint_url"), 17 | access_key=config.get("aws_access_key_id"), 18 | secret_key=config.get("aws_secret_access_key"), 19 | secure=False, 20 | ) 21 | try: 22 | yield client 23 | except Exception as e: 24 | raise e 25 | 26 | 27 | class MinIOIOManager(IOManager): 28 | 29 | def __init__(self, config): 30 | self._config = config 31 | 32 | def _get_path(self, context: Union[InputContext, OutputContext]): 33 | layer, schema, table = context.asset_key.path 34 | key = "/".join([layer, schema, table.replace(f"{layer}_", "")]) 35 | tmp_file_path = "/tmp/file-{}-{}.parquet".format( 36 | datetime.today().strftime("%Y%m%d%H%M%S"), 37 | "-".join(context.asset_key.path) 38 | ) 39 | 40 | if context.has_asset_partitions: 41 | start, end = context.asset_partitions_time_window 42 | # partition_str = context.asset_partition_key 43 | partition_str = start.strftime("%Y%m") 44 | context.log.info(f"INFO: {os.path.join(key, partition_str)}.pq, {tmp_file_path}") 45 | return os.path.join(key, f"{partition_str}.pq"), tmp_file_path 46 | else: 47 | context.log.info(f"INFO: {key}.pq, {tmp_file_path}") 48 | return f"{key}.pq", tmp_file_path 49 | 50 | def handle_output(self, context: OutputContext, obj: pl.DataFrame): 51 | # convert to parquet format 52 | key_name, tmp_file_path = self._get_path(context) 53 | obj.write_parquet(tmp_file_path) 54 | 55 | # upload to MinIO 56 | try: 57 | bucket_name = self._config.get("bucket") 58 | with connect_minio(self._config) as client: 59 | # Make bucket if not exist. 60 | found = client.bucket_exists(bucket_name) 61 | if not found: 62 | client.make_bucket(bucket_name) 63 | else: 64 | print(f"Bucket {bucket_name} already exists") 65 | client.fput_object(bucket_name, key_name, tmp_file_path) 66 | row_count = len(obj) 67 | context.add_output_metadata( 68 | { 69 | "path": key_name, 70 | "records": row_count, 71 | "tmp": tmp_file_path 72 | } 73 | ) 74 | # clean up tmp file 75 | os.remove(tmp_file_path) 76 | 77 | except Exception as e: 78 | raise e 79 | 80 | def load_input(self, context: InputContext) -> pl.DataFrame: 81 | bucket_name = self._config.get("bucket") 82 | key_name, tmp_file_path = self._get_path(context) 83 | 84 | try: 85 | with connect_minio(self._config) as client: 86 | # Make bucket if not exist. 
87 | found = client.bucket_exists(bucket_name) 88 | if not found: 89 | client.make_bucket(bucket_name) 90 | else: 91 | print(f"Bucket {bucket_name} already exists") 92 | 93 | client.fget_object(bucket_name, key_name, tmp_file_path) 94 | pd_data = pl.read_parquet(tmp_file_path) 95 | return pd_data 96 | 97 | except Exception as e: 98 | raise e -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/resources/psql_io_manager.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from datetime import datetime 3 | from psycopg2 import sql 4 | import psycopg2.extras 5 | import psycopg2 6 | 7 | import polars as pl 8 | from dagster import IOManager, OutputContext, InputContext 9 | from sqlalchemy import create_engine 10 | 11 | 12 | @contextmanager 13 | def connect_psql(config: dict): 14 | try: 15 | yield psycopg2.connect( 16 | host=config["host"], 17 | port=config["port"], 18 | database=config["database"], 19 | user=config["user"], 20 | password=config["password"], 21 | ) 22 | 23 | except Exception as e: 24 | raise e 25 | 26 | 27 | class PostgreSQLIOManager(IOManager): 28 | 29 | def __init__(self, config): 30 | self._config = config 31 | 32 | def load_input(self, context: InputContext) -> pl.DataFrame: 33 | pass 34 | 35 | def handle_output(self, context: OutputContext, obj: pl.DataFrame): 36 | schema, table = context.asset_key.path[-2], context.asset_key.path[-1] 37 | tmp_tbl = f"{table}_tmp_{datetime.now().strftime('%Y_%m_%d')}" 38 | 39 | with connect_psql(self._config) as db_conn: 40 | primary_keys = (context.metadata or {}).get("primary_keys", []) 41 | ls_columns = (context.metadata or {}).get("columns", []) 42 | 43 | with db_conn.cursor() as cursor: 44 | # create temp table 45 | cursor.execute( 46 | f'CREATE TEMP TABLE IF NOT EXISTS "{tmp_tbl}" (LIKE {schema}.{table})' 47 | ) 48 | cursor.execute(f'SELECT COUNT(*) FROM "{tmp_tbl}"') 49 | context.log.debug( 50 | f"Log for creating temp table: {cursor.fetchall()}" 51 | ) 52 | # cursor.execute( 53 | # sql.SQL("CREATE TEMP TABLE IF NOT EXISTS {} (LIKE {}.{});").format( 54 | # sql.Identifier(tmp_tbl), 55 | # sql.Identifier(schema), 56 | # sql.Identifier(table), 57 | # ) 58 | # ) 59 | 60 | # insert new data 61 | try: 62 | columns = sql.SQL(",").join( 63 | sql.Identifier(name.lower()) for name in obj.columns 64 | ) 65 | context.log.info(f"Table {table} with columns: {columns}") 66 | values = sql.SQL(",").join(sql.Placeholder() for _ in obj.columns) 67 | 68 | context.log.debug("Inserting data into temp table") 69 | insert_query = sql.SQL('INSERT INTO {} ({}) VALUES({});').format( 70 | sql.Identifier(tmp_tbl), columns, values 71 | ) 72 | psycopg2.extras.execute_batch(cursor, insert_query, obj.rows()) 73 | context.log.info(f"Insert into data for table {table} Success !!!") 74 | 75 | db_conn.commit() 76 | 77 | except Exception as e: 78 | raise e 79 | 80 | with db_conn.cursor() as cursor: 81 | # check data inserted 82 | cursor.execute(f'SELECT COUNT(*) FROM "{tmp_tbl}"') 83 | context.log.info(f"Number of rows inserted: {cursor.fetchone()}") 84 | 85 | # upsert data 86 | if len(primary_keys) > 0: 87 | conditions = " AND ".join( 88 | [ 89 | f""" {schema}.{table}."{k.lower()}" = "{tmp_tbl}"."{k.lower()}" """ 90 | for k in primary_keys 91 | ] 92 | ) 93 | command = f""" 94 | BEGIN TRANSACTION; 95 | DELETE FROM {schema}.{table} 96 | USING "{tmp_tbl}" 97 | WHERE {conditions}; 98 | 99 | INSERT INTO {schema}.{table} 100 | SELECT * FROM 
"{tmp_tbl}"; 101 | 102 | END TRANSACTION; 103 | """ 104 | else: 105 | command = f""" 106 | BEGIN TRANSACTION; 107 | TRUNCATE TABLE {schema}.{table}; 108 | 109 | INSERT INTO {schema}.{table} 110 | SELECT * FROM "{tmp_tbl}"; 111 | 112 | END TRANSACTION; 113 | """ 114 | 115 | cursor.execute(command) 116 | # drop temp table 117 | cursor.execute(f'DROP TABLE IF EXISTS "{tmp_tbl}"') 118 | db_conn.commit() -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | 4 | services: 5 | 6 | # MySQL 7 | de_mysql: 8 | image: mysql:8.0 9 | container_name: de_mysql 10 | volumes: 11 | - ./mysql:/var/lib/mysql 12 | - ./dataset/youTube_trending_video:/tmp/youTube_trending_video 13 | - ./load_dataset:/tmp/load_dataset 14 | ports: 15 | - 3306:3306 16 | env_file: .env 17 | networks: 18 | - de_network 19 | 20 | # MinIO 21 | minio: 22 | hostname: minio 23 | image: minio/minio 24 | container_name: minio 25 | ports: 26 | - 9001:9001 27 | - 9000:9000 28 | command: [ "server", "/data", "--console-address", ":9001" ] 29 | volumes: 30 | - ./minio:/data 31 | env_file: .env 32 | networks: 33 | - de_network 34 | 35 | mc: 36 | image: minio/mc 37 | container_name: mc 38 | hostname: mc 39 | env_file: .env 40 | entrypoint: > 41 | /bin/sh -c " until (/usr/bin/mc config host add minio 42 | http://minio:9000 minio minio123) do echo '...waiting...' && sleep 1; 43 | done; /usr/bin/mc mb minio/lakehouse; /usr/bin/mc policy set public 44 | minio/lakehouse; exit 0; " 45 | depends_on: 46 | - minio 47 | networks: 48 | - de_network 49 | 50 | # Pipeline 51 | etl_pipeline: 52 | build: 53 | context: ./etl_pipeline 54 | dockerfile: Dockerfile 55 | container_name: etl_pipeline 56 | image: etl_pipeline:latest 57 | restart: always 58 | volumes: 59 | - ./etl_pipeline:/opt/dagster/app 60 | - ./docker-images/spark/spark-defaults.conf:/usr/local/spark/conf/spark-defaults.conf 61 | ports: 62 | - 4041:4040 63 | env_file: .env 64 | networks: 65 | - de_network 66 | 67 | # PostgreSQL 68 | de_psql: 69 | image: postgres:15 70 | container_name: de_psql 71 | volumes: 72 | - ./postgresql:/var/lib/postgresql/data 73 | - ./load_dataset:/tmp/load_dataset 74 | ports: 75 | - 5432:5432 76 | env_file: .env 77 | networks: 78 | - de_network 79 | 80 | # Dagster 81 | de_dagster: 82 | build: 83 | context: ./docker-images/dagster/ 84 | container_name: de_dagster 85 | image: de_dagster 86 | 87 | de_dagster_dagit: 88 | image: de_dagster:latest 89 | entrypoint: 90 | - dagit 91 | - -h 92 | - "0.0.0.0" 93 | - -p 94 | - "3001" 95 | - -w 96 | - workspace.yaml 97 | container_name: de_dagster_dagit 98 | expose: 99 | - "3001" 100 | ports: 101 | - 3001:3001 102 | volumes: 103 | - /var/run/docker.sock:/var/run/docker.sock 104 | - ./dagster_home:/opt/dagster/dagster_home 105 | env_file: .env 106 | networks: 107 | - de_network 108 | 109 | de_dagster_daemon: 110 | image: de_dagster:latest 111 | entrypoint: 112 | - dagster-daemon 113 | - run 114 | container_name: de_dagster_daemon 115 | volumes: 116 | - /var/run/docker.sock:/var/run/docker.sock 117 | - ./dagster_home:/opt/dagster/dagster_home 118 | env_file: .env 119 | networks: 120 | - de_network 121 | 122 | # Streamlit 123 | de_streamlit: 124 | build: 125 | context: ./docker-images/streamlit 126 | dockerfile: Dockerfile 127 | image: de_streamlit:latest 128 | container_name: de_streamlit 129 | volumes: 130 | - ./app:/app 131 | env_file: .env 132 | ports: 133 | - "8501:8501" 134 | 
networks: 135 | - de_network 136 | 137 | # Metabase 138 | de_metabase: 139 | image: metabase/metabase:latest 140 | container_name: de_metabase 141 | volumes: 142 | - ./storage/metabase_data:/metabase_data 143 | ports: 144 | - "3030:3000" 145 | env_file: .env 146 | networks: 147 | - de_network 148 | 149 | # Jupyter 150 | # de_notebook: 151 | # image: jupyter/all-spark-notebook:python-3.9 152 | # container_name: de_notebook 153 | # command: [ "start-notebook.sh", "--NotebookApp.token=" ] 154 | # ports: 155 | # - 8888:8888 156 | # volumes: 157 | # - ./notebooks/work:/home/jovyan/work 158 | # env_file: .env 159 | # networks: 160 | # - de_network 161 | 162 | # # Spark 163 | # spark-master: 164 | # build: 165 | # context: ./docker-images/spark 166 | # dockerfile: Dockerfile 167 | # image: spark-master:latest 168 | # container_name: spark-master 169 | # hostname: spark_master 170 | # volumes: 171 | # - ./docker-images/spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf 172 | # - ./data:/opt/spark-data 173 | # env_file: .env.spark_master 174 | # expose: 175 | # - "7077" 176 | # ports: 177 | # - "7077:7077" 178 | # - "8080:8080" 179 | # networks: 180 | # - de_network 181 | 182 | # spark-worker: 183 | # image: docker.io/bitnami/spark:3.4.3 184 | # depends_on: 185 | # - spark-master 186 | # deploy: 187 | # replicas: 3 188 | # env_file: .env.spark_worker 189 | # volumes: 190 | # - ./docker-images/spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf 191 | # - ./data:/opt/spark-data 192 | # networks: 193 | # - de_network 194 | 195 | networks: 196 | de_network: 197 | driver: bridge 198 | name: de_network -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🌄 Youtube-ETL-Pipeline 2 | In this project, I build a simple data pipeline following the ETL (extract - transform - load) model on the Youtube-Trending-Video dataset, and use the Apache Spark big data technology for data processing, transformation and calculation to serve a video search and recommendation system 3 | 4 | ## 🔦 About Project 5 | 6 | 7 | - **Data Source**: This project uses two main `data sources`: [Youtube Trending Video](https://www.kaggle.com/datasets/rsrishav/youtube-trending-video-dataset) data and the [Youtube API](https://developers.google.com/youtube/v3) 8 | - `Youtube Trending Video` data is downloaded from [Kaggle.com](https://www.kaggle.com) as `.csv` files and then loaded into `MySQL`, which is treated as a `data source` 9 | - Using the `Video ID` and `Category ID` from the `Youtube Trending Video` data, we collect additional fields from the `Youtube API`, such as `Video Link` and `Video Category` 10 | - **Extract Data**: Extract the above `data sources` into `Polars` `DataFrame`s to form the `raw` layer, then load the data into the `MinIO` `datalake` (a short sketch of this step follows this section) 11 | - **Transform Data**: From `MinIO`, we use `Apache Spark`, specifically `PySpark` 12 | - convert from `Polars` `DataFrame` to `PySpark` `DataFrame` for processing and calculation, producing the `silver` and `gold` layers 13 | - Data stored in `MinIO` is in `.parquet` format, which gives better processing performance 14 | - **Load Data**: Load the `gold` layer into the `PostgreSQL` data warehouse, then perform an additional transform with `dbt` to create an `index`, making video search faster 15 | - **Serving**: The data is visualized with `Metabase` and powers a video recommendation application built with `Streamlit` 16 | - **Package and orchestration**: Use `Docker` to containerize and package the project and `Dagster` to orchestrate `assets` across the different tasks
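Below is a minimal, self-contained sketch of the raw/bronze step described above: read a table into a `Polars` `DataFrame`, write it as `.parquet`, and push it to `MinIO`. It is only an illustration of the idea; the env-var defaults, bucket and object key are assumptions for the example, not the project's exact configuration (the real logic lives in `etl_pipeline/resources/minio_io_manager.py` and the `bronze` assets).

~~~python
import os

import polars as pl
from minio import Minio

# Illustrative sketch only: upload one extracted table into the MinIO raw layer.
# The endpoint/credential defaults, bucket and key names are assumptions.
client = Minio(
    os.getenv("MINIO_ENDPOINT", "localhost:9000"),
    access_key=os.getenv("AWS_ACCESS_KEY_ID", "minio"),
    secret_key=os.getenv("AWS_SECRET_ACCESS_KEY", "minio123"),
    secure=False,
)

# Stand-in for a table extracted from MySQL with Polars
df = pl.DataFrame({"video_id": ["example_id"], "view_count": [1000]})

tmp_path = "/tmp/CA_youtube_trending.parquet"
df.write_parquet(tmp_path)  # parquet gives better read performance downstream

if not client.bucket_exists("lakehouse"):
    client.make_bucket("lakehouse")
client.fput_object("lakehouse", "bronze/youtube/CA_youtube_trending.parquet", tmp_path)
~~~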
17 | 18 | ## ⚡ Workflow 19 | 20 | 21 | ## 📦 Technologies 22 | - `MySQL` 23 | - `Youtube API` 24 | - `Polars` 25 | - `MinIO` 26 | - `Apache Spark` 27 | - `PostgreSQL` 28 | - `Dbt` 29 | - `Metabase` 30 | - `Streamlit` 31 | - `Dagster` 32 | - `Docker` 33 | - `Apache Superset` 34 | - `Unittest` 35 | - `Pytest` 36 | 37 | ## 🦄 Features 38 | Here's what you can do with this project: 39 | - You can change the logic or create new `assets` in the `data pipeline` as you wish, and perform `aggregate` `calculations` on the `assets` in the `pipeline` for your own purposes. 40 | - You can create new `data charts` or change the existing `charts` as you like, using the very diverse `chart types` in `Metabase` and `Apache Superset`. 41 | - You can also create new `dashboards` or change my existing ones as you like 42 | - `Search` videos quickly with any `keyword`, for the `Video Recommendation` app 43 | - `Search` in many different languages, not just `English`, since the dataset covers countries such as `Canada`, `Germany`, `India`, `Japan` and `Russia` 44 | - Recommend videos based on video `category` and `tags` 45 | 46 | ## 👩🏽‍🍳 The Process 47 | 48 | 49 | ## 📚 What I Learned 50 | 51 | During this project, I learned important skills, understood complex ideas, and learned how to install and set up popular, useful tools, all of which brought me closer to becoming a `Data Engineer`. 52 | - **Logical thinking**: I learned how to think like a data person: find the cause of a data `problem` and then come up with the most `reasonable solution` to achieve high data `accuracy`. 53 | - **Architecture**: I understand the `ideas` and `architecture` behind `Apache Spark`, one of today's most popular big data processing tools. 54 | - **Installation**: I learned how to install popular data processing, visualization and storage tools such as `Metabase`, `Streamlit` and `MinIO` with `Docker` 55 | - **Setup**: I know how to set up a `Spark Standalone Cluster` with three `Worker Nodes` on my local machine using `Docker` 56 | 57 | ### 📈 Overall Growth: 58 | Each part of this project has helped me understand more about how to build a data engineering and data management project, learn new things and improve my skills for future work 59 | 60 | ## 💭 How can it be improved? 61 | - Add more `data sources` to increase data richness. 62 | - Consider other `data warehouses` besides `PostgreSQL`, such as `Amazon Redshift` or `Snowflake`. 63 | - Perform more `cleaning` and `optimization` of the data. 64 | - Perform more advanced `statistics`, `analysis` and `calculations` with `Apache Spark`. 65 | - Check out other popular `data orchestration` tools like `Apache Airflow`. 66 | - Separate `dbt` into its own service (a separate `container`) in `docker` when the project expands 67 | - Set up the `Spark Cluster` on `cloud platforms` instead of on `local machines` 68 | - Consider `cloud computing` services if the project grows larger 69 | - Learn about `dbt packages` like `dbt-labs/dbt_utils` to make the `transformation` process faster and more optimal. 70 | 71 | ## 🚦 Running the Project 72 | To run the project in your local environment, follow these steps: 73 | 1. Run the following command to clone the `repository` to your `local machine`. 74 | ~~~bash 75 | git clone https://github.com/longNguyen010203/Youtube-ETL-Pipeline.git 76 | ~~~ 77 | 78 | 2.
Run the following commands to build the images from the `Dockerfile`, pull images from `docker hub` and launch services 79 | ~~~bash 80 | make build 81 | make up 82 | ~~~ 83 | 84 | 3. Run the following commands to access the `SQL editor` on the `terminal` and Check if `local_infile` was turned on 85 | ~~~python 86 | make to_mysql_root 87 | 88 | SET GLOBAL local_infile=TRUE; 89 | SHOW VARIABLES LIKE "local_infile"; 90 | exit 91 | ~~~ 92 | 93 | 4. Run the following commands to create tables with schema for `MySQL`, load data from `CSV` file to `MySQL` and create tables with schema for `PostgreSQL` 94 | ~~~bash 95 | make mysql_create 96 | make mysql_load 97 | make psql_create 98 | ~~~ 99 | 100 | 5. Open [http://localhost:3001](http://localhost:3001) to view `Dagster UI` and click `Materialize all` button to run the Pipeline 101 | 6. Open [http://localhost:9001](http://localhost:9001) to view `MinIO UI` and check the data to be loaded 102 | 7. Open [http://localhost:8080](http://localhost:8080) to view `Spark UI` and three `workers` are running 103 | 8. Open [http://localhost:3030](http://localhost:3030) to see charts and `dashboards` on `Metabase` 104 | 9. Open [http://localhost:8501](http://localhost:8501) to try out the `video recommendation` app on `Streamlit` 105 | 106 | ## 🍿 Video -------------------------------------------------------------------------------- /app/pages/video_detail.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import streamlit as st 3 | import psycopg2 4 | from PIL import Image 5 | from io import BytesIO 6 | import requests 7 | 8 | 9 | icon = Image.open("./icons/youtube_v2.png", mode="r") 10 | 11 | st.set_page_config( 12 | page_title="Video Recommender", 13 | page_icon=icon, 14 | layout="centered", 15 | initial_sidebar_state="expanded" 16 | ) 17 | 18 | title, logo = st.columns([4,2.91]) 19 | with title: 20 | st.title("YouTube RecoMaster") 21 | with logo: 22 | st.write("") 23 | st.image(icon, width=70) 24 | 25 | def display_video(url, recommended_videos=[]): 26 | if url not in recommended_videos: 27 | st.markdown( 28 | f'''''', 31 | unsafe_allow_html=True 32 | ) 33 | else: 34 | st.markdown( 35 | f'''''', 38 | unsafe_allow_html=True 39 | ) 40 | 41 | @st.cache_resource 42 | def init_connection(): 43 | return psycopg2.connect(**st.secrets["postgres"]) 44 | 45 | conn = init_connection() 46 | 47 | @st.cache_data(ttl=600) 48 | def run_query(query): 49 | with conn.cursor() as cur: 50 | cur.execute(query) 51 | return cur.fetchall() 52 | 53 | query_params = st.experimental_get_query_params() 54 | video_id = query_params.get('video_id', [None])[0] 55 | 56 | data = run_query(f""" 57 | select distinct 58 | title 59 | , channeltitle 60 | , categoryname 61 | , view 62 | , likes 63 | , dislike 64 | , publishedat 65 | , link_video 66 | , tags 67 | from youtube_trending.search_information si 68 | where video_id = '{video_id}'; 69 | """) 70 | 71 | videos = { 72 | "title": data[0][0], 73 | "channeltitle": data[0][1], 74 | "categoryname": data[0][2], 75 | "view": data[0][3], 76 | "like": data[0][4], 77 | "dislike": data[0][5], 78 | "publishedat": data[0][6], 79 | "link_video": data[0][7], 80 | "tags": data[0][8] 81 | } 82 | 83 | display_video(videos['link_video']) 84 | st.markdown(f"### {videos['title']}") 85 | view_icon = Image.open("./icons/icons8-view-48.png", mode="r") 86 | like_icon = Image.open("./icons/icons8-like-48.png", mode="r") 87 | dislike_icon = Image.open("./icons/icons8-thumbs-down-skin-type-4-48.png", 
mode="r") 88 | category_icon = Image.open("./icons/icons8-category-48.png", mode="r") 89 | channel_icon = Image.open("./icons/icons8-channel-48.png", mode="r") 90 | # st.write(f"{videos['tags']}") 91 | st.write(f"{videos['tags']}", unsafe_allow_html=True) 92 | 93 | title, view, like, dislike, category = st.columns([4,1,1,1,1.3]) 94 | with title: 95 | st.image(channel_icon, width=40) 96 | st.write(f"{videos['channeltitle']}") 97 | with view: 98 | st.image(view_icon, width=30) 99 | st.write(f"{videos['view']}") 100 | with like: 101 | st.image(like_icon, width=30) 102 | st.write(f"{videos['like']}") 103 | with dislike: 104 | st.image(dislike_icon, width=30) 105 | st.write(f"{videos['dislike']}") 106 | with category: 107 | st.image(category_icon, width=30) 108 | st.write(f"{videos['categoryname']}") 109 | 110 | 111 | st.subheader("Recommended Videos:") 112 | tags = "" 113 | tag_list = videos['tags'].split(' ') 114 | for tag in tag_list: tags += f"tags LIKE '%{tag}%' OR " 115 | tags = tags[:-3] 116 | 117 | query = f""" 118 | select distinct 119 | video_id 120 | , title 121 | , channeltitle 122 | , categoryname 123 | , view 124 | , likes 125 | , dislike 126 | , publishedat 127 | , link_video 128 | , tags 129 | , thumbnail_link 130 | from youtube_trending.search_information 131 | where (categoryname = '{videos['categoryname']}') AND 132 | ({tags}) AND video_id <> '{video_id}' 133 | limit 10; 134 | """ 135 | data2 = run_query(query) 136 | 137 | if data2 is not None: 138 | videos2 = { 139 | "video_id": [e[0] for e in data2], 140 | "title": [e[1] for e in data2], 141 | "channeltitle": [e[2] for e in data2], 142 | "categoryname": [e[3] for e in data2], 143 | "view": [e[4] for e in data2], 144 | "like": [e[5] for e in data2], 145 | "dislike": [e[6] for e in data2], 146 | "publishedat": [e[7] for e in data2], 147 | "link_video": [e[8] for e in data2], 148 | "tags": [e[9] for e in data2], 149 | 'thumbnail_link': [e[10] for e in data2] 150 | } 151 | 152 | 153 | recommended_videos = [] 154 | recommended_videos += videos2['link_video'] 155 | 156 | for video_id,title,channeltitle,categoryname,view,like,dislike,publishedat,link_video,tags,thumbnail_link in zip( 157 | videos2['video_id'],videos2['title'],videos2['channeltitle'],videos2['categoryname'], 158 | videos2['view'],videos2['like'], videos2['dislike'],videos2['publishedat'], 159 | videos2['link_video'],videos2['tags'],videos2['thumbnail_link']): 160 | 161 | col1, col2 = st.columns([1, 1]) 162 | 163 | with col1: 164 | img = Image.open(BytesIO(requests.get(thumbnail_link).content)) 165 | st.markdown( 166 | f'', 167 | unsafe_allow_html=True, 168 | ) 169 | st.image(img, use_column_width=True) 170 | 171 | with col2: 172 | st.write("") 173 | st.markdown(f""" 174 |
175 | {title}
176 | channel: {channeltitle}
177 | category: {categoryname}
178 | views: {view} 179 |
180 | """, unsafe_allow_html=True) 181 | st.write("") 182 | st.button("Detail", key=video_id) 183 | 184 | st.write("---") 185 | 186 | else: st.write(f"Not found") 187 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/assets/warehouse.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from dagster import AssetExecutionContext 3 | 4 | from dagster import ( 5 | multi_asset, 6 | AssetIn, 7 | AssetOut, 8 | MetadataValue, 9 | AssetExecutionContext, 10 | Output 11 | ) 12 | 13 | from ..partitions import monthly_partitions 14 | 15 | 16 | GROUP_NAME = "warehouse" 17 | 18 | @multi_asset( 19 | ins={ 20 | "gold_videoCategory": AssetIn( 21 | key_prefix=["gold", "youtube"], 22 | ) 23 | }, 24 | outs={ 25 | "videoCategory": AssetOut( 26 | key_prefix=["warehouse", "gold"], 27 | io_manager_key="psql_io_manager", 28 | metadata={ 29 | "primary_keys": [ 30 | "categoryId" 31 | ], 32 | "columns": [ 33 | "categoryId", 34 | "categoryName" 35 | ] 36 | }, 37 | group_name=GROUP_NAME 38 | ) 39 | }, 40 | name="videoCategory", 41 | required_resource_keys={"psql_io_manager"}, 42 | compute_kind="postgres", 43 | ) 44 | def videoCategory(context: AssetExecutionContext, 45 | gold_videoCategory: pl.DataFrame 46 | ) -> Output[pl.DataFrame]: 47 | """ 48 | Load videoCategory data from gold to PostgreSQL warehouse 49 | """ 50 | pl_data: pl.DataFrame = gold_videoCategory 51 | context.log.info(f"Load videoCategory data Success with shape {pl_data.shape}") 52 | 53 | return Output( 54 | value=pl_data, 55 | metadata={ 56 | "table name": MetadataValue.text("videoCategory"), 57 | "record count": MetadataValue.int(pl_data.shape[0]), 58 | "column count": MetadataValue.int(pl_data.shape[1]), 59 | "columns": pl_data.columns 60 | } 61 | ) 62 | 63 | 64 | @multi_asset( 65 | ins={ 66 | "gold_linkVideos": AssetIn( 67 | key_prefix=["gold", "youtube"], 68 | ) 69 | }, 70 | outs={ 71 | "linkVideos": AssetOut( 72 | key_prefix=["warehouse", "gold"], 73 | io_manager_key="psql_io_manager", 74 | metadata={ 75 | "primary_keys": [ 76 | "video_id" 77 | ], 78 | "columns": [ 79 | "video_id", 80 | "link_video" 81 | ] 82 | }, 83 | group_name=GROUP_NAME 84 | ) 85 | }, 86 | name="linkVideos", 87 | required_resource_keys={"psql_io_manager"}, 88 | compute_kind="postgres" 89 | ) 90 | def linkVideos(context: AssetExecutionContext, 91 | gold_linkVideos: pl.DataFrame 92 | ) -> Output[pl.DataFrame]: 93 | """ 94 | Load linkVideos data from gold to PostgreSQL warehouse 95 | """ 96 | pl_data: pl.DataFrame = gold_linkVideos 97 | context.log.info(f"Load linkVideos data Success with shape {pl_data.shape}") 98 | 99 | return Output( 100 | value=pl_data, 101 | metadata={ 102 | "table name": MetadataValue.text("linkVideos"), 103 | "record count": MetadataValue.int(pl_data.shape[0]), 104 | "column count": MetadataValue.int(pl_data.shape[1]), 105 | "columns": pl_data.columns 106 | } 107 | ) 108 | 109 | 110 | @multi_asset( 111 | ins={ 112 | "gold_metric_trending": AssetIn( 113 | key_prefix=["gold", "youtube"] 114 | ) 115 | }, 116 | outs={ 117 | "metricVideos": AssetOut( 118 | key_prefix=["warehouse", "gold"], 119 | io_manager_key="psql_io_manager", 120 | metadata={ 121 | "primary_keys": [ 122 | "video_id" 123 | ], 124 | "columns": [ 125 | "video_id", 126 | "publishedAt", 127 | "trending_date", 128 | "channelId", 129 | "categoryId", 130 | "view_count", 131 | "likes", 132 | "dislikes", 133 | "comment_count" 134 | ] 135 | }, 136 | group_name=GROUP_NAME 137 | ) 138 | 
}, 139 | name="metricVideos", 140 | required_resource_keys={"psql_io_manager"}, 141 | partitions_def=monthly_partitions, 142 | compute_kind="postgres" 143 | ) 144 | def metricVideos(context: AssetExecutionContext, 145 | gold_metric_trending: pl.DataFrame 146 | ) -> Output[pl.DataFrame]: 147 | """ 148 | Load metricVideos data from gold to PostgreSQL warehouse 149 | """ 150 | pl_data: pl.DataFrame = gold_metric_trending 151 | context.log.info(f"Load metricVideos data Success with shape {pl_data.shape}") 152 | 153 | return Output( 154 | value=pl_data, 155 | metadata={ 156 | "table name": MetadataValue.text("metricVideos"), 157 | "record count": MetadataValue.int(pl_data.shape[0]), 158 | "column count": MetadataValue.int(pl_data.shape[1]), 159 | "columns": pl_data.columns 160 | } 161 | ) 162 | 163 | 164 | @multi_asset( 165 | ins={ 166 | "gold_information_trending": AssetIn( 167 | key_prefix=["gold", "youtube"] 168 | ) 169 | }, 170 | outs={ 171 | "informationVideos": AssetOut( 172 | key_prefix=["warehouse", "gold"], 173 | io_manager_key="psql_io_manager", 174 | metadata={ 175 | "primary_keys": [ 176 | "video_id" 177 | ], 178 | "columns": [ 179 | "video_id", 180 | "title", 181 | "channelId", 182 | "channelTitle", 183 | "categoryId", 184 | "tags", 185 | "thumbnail_link", 186 | "comments_disabled", 187 | "ratings_disabled", 188 | ] 189 | }, 190 | group_name=GROUP_NAME 191 | ) 192 | }, 193 | name="informationVideos", 194 | required_resource_keys={"psql_io_manager"}, 195 | partitions_def=monthly_partitions, 196 | compute_kind="postgres" 197 | ) 198 | def informationVideos(context: AssetExecutionContext, 199 | gold_information_trending: pl.DataFrame 200 | ) -> Output[pl.DataFrame]: 201 | """ 202 | Load informationVideos data from gold to PostgreSQL warehouse 203 | """ 204 | pl_data: pl.DataFrame = gold_information_trending 205 | context.log.info(f"Load informationVideos data Success with shape {pl_data.shape}") 206 | 207 | return Output( 208 | value=pl_data, 209 | metadata={ 210 | "table name": MetadataValue.text("informationVideos"), 211 | "record count": MetadataValue.int(pl_data.shape[0]), 212 | "column count": MetadataValue.int(pl_data.shape[1]), 213 | "columns": pl_data.columns 214 | } 215 | ) -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/resources/spark_io_manager.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Union 2 | from datetime import datetime 3 | from dagster import IOManager, InputContext, OutputContext 4 | 5 | import os 6 | import polars as pl 7 | import pandas as pd 8 | from contextlib import contextmanager 9 | from pyspark.sql import SparkSession, DataFrame 10 | from pyspark import SparkConf 11 | from .minio_io_manager import connect_minio 12 | 13 | 14 | @contextmanager 15 | def create_spark_session(config, appName=None): 16 | spark = ( 17 | SparkSession.builder.appName(appName) 18 | .master("spark://spark-master:7077") 19 | .config("spark.driver.memory", "4g") 20 | .config("spark.executor.memory", "4g") 21 | # .config("spark.cores.max", "4") 22 | # .config("spark.executor.cores", "4") 23 | .config( 24 | "spark.jars", 25 | "/usr/local/spark/jars/delta-core_2.12-2.2.0.jar,/usr/local/spark/jars/hadoop-aws-3.3.2.jar,/usr/local/spark/jars/delta-storage-2.2.0.jar,/usr/local/spark/jars/aws-java-sdk-1.12.367.jar,/usr/local/spark/jars/s3-2.18.41.jar,/usr/local/spark/jars/aws-java-sdk-bundle-1.11.1026.jar", 26 | ) 27 | .config("spark.sql.catalog.spark_catalog", 
"org.apache.spark.sql.delta.catalog.DeltaCatalog") 28 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 29 | .config("spark.hadoop.fs.s3a.endpoint", "http://" + config["endpoint_url"]) 30 | .config("spark.hadoop.fs.s3a.access.key", str(config["aws_access_key_id"])) 31 | .config("spark.hadoop.fs.s3a.secret.key", str(config["aws_secret_access_key"])) 32 | .config("spark.hadoop.fs.s3a.path.style.access", "true") 33 | .config("spark.hadoop.fs.connection.ssl.enabled", "false") 34 | .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 35 | .config("spark.sql.execution.arrow.pyspark.enabled", "true") 36 | .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true") 37 | .getOrCreate() 38 | ) 39 | 40 | try: 41 | yield spark 42 | except Exception as e: 43 | raise f"Error Pyspark: {e}" 44 | 45 | 46 | class SparkIOManager(IOManager): 47 | 48 | def __init__(self, config) -> None: 49 | self._config = config 50 | 51 | 52 | def get_spark_session(self, context, appName=None) -> SparkSession: 53 | with create_spark_session(self._config, appName) as spark: 54 | context.log.info("Return Object SparkSession") 55 | return spark 56 | 57 | 58 | def _get_path(self, context: Union[InputContext, OutputContext]): 59 | layer, schema, table = context.asset_key.path 60 | key = "/".join([layer, schema, table.replace(f"{layer}_", "")]) 61 | tmp_file_path = "/tmp/file-{}-{}.parquet".format( 62 | datetime.today().strftime("%Y%m%d%H%M%S"), 63 | "-".join(context.asset_key.path) 64 | ) 65 | return key, tmp_file_path 66 | 67 | 68 | def handle_output(self, context: OutputContext, obj: DataFrame): 69 | key_name, tmp_file_path = self._get_path(context) 70 | bucket_name = self._config.get("bucket") 71 | ## ====> 72 | file_path = "s3a://lakehouse/" + key_name 73 | context.log.info(f"file_path: {file_path}") 74 | context.log.info(f"key_name: {key_name}") 75 | 76 | if context.has_partition_key: 77 | start, end = context.asset_partitions_time_window 78 | # partition_str = context.asset_partition_key 79 | partition_str = start.strftime("%Y%m") 80 | context.log.info(f"INFO: {os.path.join(key_name, partition_str)}.parquet, {tmp_file_path}") 81 | key_name, tmp_file_path = os.path.join(key_name, f"{partition_str}.parquet"), tmp_file_path 82 | else: 83 | context.log.info(f"INFO: {key_name}.parquet, {tmp_file_path}") 84 | key_name, tmp_file_path = f"{key_name}.parquet", tmp_file_path 85 | 86 | 87 | obj.write.mode('overwrite').parquet(tmp_file_path) 88 | 89 | with connect_minio(self._config) as client: 90 | try: 91 | bucket_name = self._config.get("bucket") 92 | with connect_minio(self._config) as client: 93 | # Make bucket if not exist. 
94 | found = client.bucket_exists(bucket_name) 95 | if not found: 96 | client.make_bucket(bucket_name) 97 | else: 98 | print(f"Bucket {bucket_name} already exists") 99 | client.fput_object(bucket_name, key_name, tmp_file_path) 100 | row_count = obj.count() 101 | context.add_output_metadata( 102 | { 103 | "path": key_name, 104 | "records": row_count, 105 | "tmp": tmp_file_path 106 | } 107 | ) 108 | # clean up tmp file 109 | os.remove(tmp_file_path) 110 | 111 | except Exception as e: 112 | raise e 113 | 114 | 115 | def load_input(self, context: InputContext) -> DataFrame: 116 | key_name, tmp_file_path = self._get_path(context) 117 | bucket_name = self._config.get("bucket") 118 | 119 | if context.has_asset_partitions: 120 | start, end = context.asset_partitions_time_window 121 | # partition_str = context.asset_partition_key 122 | partition_str = start.strftime("%Y%m") 123 | context.log.info(f"INFO: {os.path.join(key_name, partition_str)}.parquet, {tmp_file_path}") 124 | key_name, tmp_file_path = os.path.join(key_name, f"{partition_str}.parquet"), tmp_file_path 125 | else: 126 | context.log.info(f"INFO: {key_name}.parquet, {tmp_file_path}") 127 | key_name, tmp_file_path = f"{key_name}.parquet", tmp_file_path 128 | 129 | with connect_minio(self._config) as client: 130 | try: 131 | with connect_minio(self._config) as client: 132 | # Make bucket if not exist. 133 | found = client.bucket_exists(bucket_name) 134 | if not found: 135 | client.make_bucket(bucket_name) 136 | else: 137 | print(f"Bucket {bucket_name} already exists") 138 | 139 | context.log.info(f"INFO -> bucket_name: {bucket_name}") 140 | context.log.info(f"INFO -> key_name: {key_name}") 141 | context.log.info(f"INFO -> tmp_file_path: {tmp_file_path}") 142 | 143 | client.fget_object(bucket_name, key_name, tmp_file_path) 144 | 145 | spark: SparkSession = self.get_spark_session(self, appName="Read-Parquet") 146 | df = spark.read.parquet(tmp_file_path) 147 | 148 | return df 149 | 150 | except Exception as e: 151 | raise e -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/resources/youtube_io_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from contextlib import contextmanager 4 | from datetime import datetime, timedelta 5 | from typing import Union, List 6 | 7 | import polars as pl 8 | from googleapiclient.discovery import build 9 | from googleapiclient.errors import HttpError 10 | from dagster import IOManager, InputContext, OutputContext 11 | from .minio_io_manager import connect_minio 12 | from .. 
import constants 13 | 14 | 15 | @contextmanager 16 | def youtube_client(config: dict): 17 | api_service_name = config["api_service_name"] 18 | api_version = config["api_version"] 19 | api_key = config["api_key"] 20 | 21 | youtube = build( 22 | serviceName=api_service_name, 23 | version=api_version, 24 | developerKey=api_key 25 | ) 26 | try: 27 | yield youtube 28 | except HttpError as e: 29 | raise 'An HTTP error %d occurred:\n%s' % (e.resp.status, e.content) 30 | 31 | 32 | class YoutubeIOManager(IOManager): 33 | 34 | def __init__(self, config) -> None: 35 | self._config = config 36 | 37 | 38 | def _get_path(self, context: Union[InputContext, OutputContext]): 39 | 40 | start = constants.START_DATE 41 | end = constants.END_DATE 42 | start_date = datetime.strptime(start, "%Y-%m-%d") 43 | end_date = datetime.strptime(end, "%Y-%m-%d") 44 | 45 | layer, schema, table = context.asset_key.path 46 | table = "youtube_trending_data" 47 | layer = "bronze" 48 | key = "/".join([layer, schema, table.replace(f"{layer}_", "")]) 49 | 50 | key_names: list[str] = [] 51 | tmp_file_paths: list[str] = [] 52 | 53 | for date in range((end_date - start_date).days + 1): 54 | partition_date = start_date + timedelta(days=date) 55 | partition_date.strftime("%Y-%m") 56 | key_name = f"{key}/" + str(partition_date)[:7].replace("-", "") + ".pq" 57 | # key_name = "bronze/youtube/youtube_trending_data/202011.pq" 58 | tmp_file_path = "/tmp/file-{}-{}.parquet".format( 59 | datetime.today().strftime("%Y%m%d%H%M%S"), 60 | str(partition_date)[:7].replace("-", "") 61 | ) 62 | # tmp_file_path = "/tmp/file-2020-11.parquet" 63 | context.log.info(f"INFO -> key_name: {key_name}") 64 | context.log.info(f"INFO -> tmp_file_path: {tmp_file_path}") 65 | 66 | key_names.append(key_name) 67 | tmp_file_paths.append(tmp_file_path) 68 | 69 | return key_names, tmp_file_paths 70 | 71 | 72 | def list_of_list(self, obj: pl.Series) -> list[list[str]]: 73 | start = 0 74 | end = 50 75 | lists: List[List] = [] 76 | for lst in range(len(obj) // 50 + 1): 77 | lists.append(list(obj)[start:end]) 78 | start += 50 79 | end += 50 80 | return lists 81 | 82 | 83 | def get_DataFrame(self, context, field: str) -> pl.DataFrame: 84 | bucket_name = self._config.get("bucket") 85 | key_names, tmp_file_paths = self._get_path(context) 86 | 87 | try: 88 | with connect_minio(self._config) as client: 89 | # Make bucket if not exist. 
90 | found = client.bucket_exists(bucket_name) 91 | if not found: 92 | client.make_bucket(bucket_name) 93 | else: 94 | print(f"Bucket {bucket_name} already exists") 95 | 96 | except Exception as e: 97 | raise e 98 | 99 | list_dfs: list[pl.DataFrame] = [] 100 | for key_name, tmp_file_path in zip(key_names, tmp_file_paths): 101 | client.fget_object(bucket_name, key_name, tmp_file_path) 102 | df = pl.read_parquet(tmp_file_path)[field].unique() 103 | list_dfs.append(df) 104 | time.sleep(0.5) 105 | 106 | context.log.info(f"INFO -> key_name: {key_name}, tmp_file_path: {tmp_file_path}") 107 | os.remove(tmp_file_path) 108 | 109 | pl_data = pl.concat(list_dfs).unique() 110 | return pl_data 111 | 112 | 113 | def downLoad_videoCategories(self, context, obj: pl.DataFrame) -> pl.DataFrame: 114 | 115 | # pl_data = self.get_DataFrame(context, "categoryId") 116 | pl_data = obj["categoryId"].unique() 117 | 118 | with youtube_client(self._config) as service: 119 | categoryNames: list[str] = [] 120 | categoryIds: list[str] = [] 121 | 122 | categoryId_list: pl.Series = pl_data 123 | context.log.info("Divide categoryIds to multiple list categoryIds") 124 | 125 | for categoryId in list(categoryId_list.unique()): 126 | request = service.videoCategories().list( 127 | part="snippet", 128 | id=categoryId 129 | ) 130 | response = request.execute() 131 | 132 | try: 133 | categoryIds.append(str(response["items"][0]["id"])) 134 | categoryNames.append(str(response["items"][0]["snippet"]["title"])) 135 | except IndexError: 136 | categoryNames.append(response["items"]["snippet"]["title"]) 137 | 138 | 139 | return pl.DataFrame( 140 | { 141 | "categoryId": categoryIds, 142 | "categoryName": categoryNames 143 | } 144 | ) 145 | 146 | 147 | def downLoad_linkVideos(self, context, obj: pl.DataFrame) -> pl.DataFrame: 148 | 149 | pl_data = obj["video_id"].unique() 150 | 151 | with youtube_client(self._config) as service: 152 | link_videos: list[str] = [] 153 | videoIds: list[str] = [] 154 | 155 | video_id_list: pl.Series = pl_data 156 | context.log.info("Divide videoId to multiple list videoId") 157 | 158 | for videoId in self.list_of_list(video_id_list.unique()): 159 | # videoId = list(map(lambda id: id[1:-1], videoId)) 160 | # context.log.info(",".join(videoId)[:20]) 161 | request = service.videos().list( 162 | part="player", 163 | id=",".join(videoId) 164 | ) 165 | response = request.execute() 166 | 167 | for data in response["items"]: 168 | try: 169 | videoIds.append(str(data["id"])) 170 | link_videos.append(str(data["player"]["embedHtml"][40:74])) 171 | except IndexError as e: 172 | link_videos.append(response["items"]["snippet"]["title"]) 173 | raise e 174 | 175 | return pl.DataFrame( 176 | { 177 | "videoId": videoIds, 178 | "link_video": link_videos 179 | } 180 | ) 181 | 182 | 183 | def handle_output(self, context: OutputContext, obj: pl.DataFrame): 184 | pass 185 | 186 | 187 | def load_input(self, context: InputContext) -> pl.DataFrame: 188 | bucket_name = self._config.get("bucket") 189 | key_name, tmp_file_path = self._get_path(context) 190 | 191 | try: 192 | with connect_minio(self._config) as client: 193 | # Make bucket if not exist. 
194 | found = client.bucket_exists(bucket_name) 195 | if not found: 196 | client.make_bucket(bucket_name) 197 | else: 198 | print(f"Bucket {bucket_name} already exists") 199 | 200 | client.fget_object(bucket_name, key_name, tmp_file_path) 201 | pd_data = pl.read_parquet(tmp_file_path) 202 | return pd_data 203 | 204 | except Exception as e: 205 | raise e 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/assets/gold.py: -------------------------------------------------------------------------------- 1 | import os 2 | import polars as pl 3 | from datetime import datetime 4 | from pyspark.sql import DataFrame 5 | 6 | from dagster import ( 7 | multi_asset, 8 | AssetIn, 9 | AssetOut, 10 | MetadataValue, 11 | AssetExecutionContext, 12 | Output 13 | ) 14 | 15 | from ..partitions import monthly_partitions 16 | from ..resources.spark_io_manager import create_spark_session 17 | 18 | 19 | GROUP_NAME = "gold" 20 | 21 | @multi_asset( 22 | ins={ 23 | "silver_videoCategory_cleaned": AssetIn( 24 | key_prefix=["silver", "youtube"], 25 | input_manager_key="spark_io_manager" 26 | ) 27 | }, 28 | outs={ 29 | "gold_videoCategory": AssetOut( 30 | key_prefix=["gold", "youtube"], 31 | io_manager_key="spark_io_manager", 32 | metadata={ 33 | "primary_keys": [ 34 | "categoryId" 35 | ], 36 | "columns": [ 37 | "categoryId", 38 | "categoryName" 39 | ] 40 | }, 41 | group_name=GROUP_NAME 42 | ) 43 | }, 44 | name="gold_videoCategory", 45 | required_resource_keys={"spark_io_manager"}, 46 | compute_kind="PySpark", 47 | ) 48 | def gold_videoCategory(context: AssetExecutionContext, 49 | silver_videoCategory_cleaned: pl.DataFrame 50 | ) -> Output[DataFrame]: 51 | """ 52 | Compute and Load videoCategory data from silver to gold layer in MinIO 53 | """ 54 | CONFIG = { 55 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 56 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 57 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 58 | } 59 | 60 | with create_spark_session( 61 | CONFIG, "gold_videoCategory-{}".format(datetime.today()) 62 | ) as spark: 63 | spark_df: DataFrame = spark.createDataFrame(silver_videoCategory_cleaned.to_pandas()) 64 | context.log.info(f"Load {context.asset_key.path[-1]} to gold layer success 🙂") 65 | 66 | return Output( 67 | value=spark_df, 68 | metadata={ 69 | "file name": MetadataValue.text("videoCategory.pq"), 70 | "record count": MetadataValue.int(spark_df.count()), 71 | "column count": MetadataValue.int(len(spark_df.columns)), 72 | "columns": spark_df.columns 73 | } 74 | ) 75 | 76 | 77 | @multi_asset( 78 | ins={ 79 | "silver_linkVideos_cleaned": AssetIn( 80 | key_prefix=["silver", "youtube"], 81 | input_manager_key="spark_io_manager" 82 | ) 83 | }, 84 | outs={ 85 | "gold_linkVideos": AssetOut( 86 | key_prefix=["gold", "youtube"], 87 | io_manager_key="spark_io_manager", 88 | metadata={ 89 | "primary_keys": [ 90 | "video_id" 91 | ], 92 | "columns": [ 93 | "video_id", 94 | "link_video" 95 | ] 96 | }, 97 | group_name=GROUP_NAME 98 | ) 99 | }, 100 | name="gold_linkVideos", 101 | required_resource_keys={"spark_io_manager"}, 102 | compute_kind="PySpark" 103 | ) 104 | def gold_linkVideos(context: AssetExecutionContext, 105 | silver_linkVideos_cleaned: pl.DataFrame 106 | ) -> Output[DataFrame]: 107 | """ 108 | Compute and Load linkVideos data from silver to gold layer in MinIO 109 | """ 110 | 111 | CONFIG = { 112 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 113 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 
114 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 115 | } 116 | 117 | with create_spark_session( 118 | CONFIG, "gold_linkVideos-{}".format(datetime.today()) 119 | ) as spark: 120 | spark_df: DataFrame = spark.createDataFrame(silver_linkVideos_cleaned.to_pandas()) 121 | context.log.info(f"Load {context.asset_key.path[-1]} to gold layer success 🙂") 122 | 123 | return Output( 124 | value=spark_df, 125 | metadata={ 126 | "file name": MetadataValue.text("linkVideos.pq"), 127 | "record count": MetadataValue.int(spark_df.count()), 128 | "column count": MetadataValue.int(len(spark_df.columns)), 129 | "columns": spark_df.columns 130 | } 131 | ) 132 | 133 | 134 | @multi_asset( 135 | ins={ 136 | "silver_trending_cleaned": AssetIn( 137 | key_prefix=["silver", "youtube"], 138 | input_manager_key="spark_io_manager" 139 | ) 140 | }, 141 | outs={ 142 | "gold_metric_trending": AssetOut( 143 | key_prefix=["gold", "youtube"], 144 | io_manager_key="spark_io_manager", 145 | metadata={ 146 | "primary_keys": [ 147 | "video_id" 148 | ], 149 | "columns": [ 150 | "video_id", 151 | "publishedAt", 152 | "trending_date", 153 | "channelId", 154 | "categoryId", 155 | "view_count", 156 | "likes", 157 | "dislikes", 158 | "comment_count" 159 | ] 160 | }, 161 | group_name=GROUP_NAME 162 | ), 163 | "gold_information_trending": AssetOut( 164 | key_prefix=["gold", "youtube"], 165 | io_manager_key="spark_io_manager", 166 | metadata={ 167 | "primary_keys": [ 168 | "video_id" 169 | ], 170 | "columns": [ 171 | "video_id", 172 | "title", 173 | "channelId", 174 | "channelTitle", 175 | "categoryId", 176 | "tags", 177 | "thumbnail_link", 178 | "comments_disabled", 179 | "ratings_disabled", 180 | ] 181 | }, 182 | group_name=GROUP_NAME 183 | ), 184 | }, 185 | name="gold_metric_trending", 186 | required_resource_keys={"spark_io_manager"}, 187 | partitions_def=monthly_partitions, 188 | compute_kind="pyspark" 189 | ) 190 | def gold_metric_trending(context: AssetExecutionContext, 191 | silver_trending_cleaned: pl.DataFrame 192 | ): 193 | """ 194 | Compute and Load trending data from silver to gold layer in MinIO 195 | """ 196 | 197 | CONFIG = { 198 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 199 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 200 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 201 | } 202 | 203 | with create_spark_session( 204 | CONFIG, "gold_metric_trending-{}".format(datetime.today()) 205 | ) as spark: 206 | 207 | metric: DataFrame = spark.createDataFrame(silver_trending_cleaned.select([ 208 | "video_id", 209 | "publishedAt", 210 | "trending_date", 211 | "channelId", 212 | "categoryId", 213 | "view_count", 214 | "likes", 215 | "dislikes", 216 | "comment_count" 217 | ])) 218 | information: DataFrame = spark.createDataFrame(silver_trending_cleaned.select([ 219 | "video_id", 220 | "title", 221 | "channelId", 222 | "channelTitle", 223 | "categoryId", 224 | "tags", 225 | "thumbnail_link", 226 | "comments_disabled", 227 | "ratings_disabled", 228 | ])) 229 | context.log.info(f"Load {context.asset_key.path[-1]} to gold layer success 🙂") 230 | 231 | return Output( 232 | value=metric, 233 | output_name="gold_metric_trending", 234 | metadata={ 235 | "folder name": MetadataValue.text("metric_trending"), 236 | "record count": MetadataValue.int(metric.count()), 237 | "column count": MetadataValue.int(len(metric.columns)), 238 | "columns": metric.columns 239 | } 240 | ), Output( 241 | value=information, 242 | output_name="gold_information_trending", 243 | metadata={ 244 | "folder name": 
MetadataValue.text("information_trending"), 245 | "record count": MetadataValue.int(information.count()), 246 | "column count": MetadataValue.int(len(information.columns)), 247 | "columns": information.columns 248 | } 249 | ), -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/assets/bronze.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from ..partitions import monthly_partitions 3 | 4 | from dagster import ( 5 | asset, 6 | Output, 7 | AssetIn, 8 | AssetOut, 9 | multi_asset, 10 | MetadataValue, 11 | AssetExecutionContext 12 | ) 13 | 14 | 15 | GROUP_NAME = "bronze" 16 | 17 | @asset( 18 | name="bronze_CA_youtube_trending", 19 | required_resource_keys={"mysql_io_manager"}, 20 | io_manager_key="minio_io_manager", 21 | key_prefix=["bronze", "youtube"], 22 | compute_kind="SQL", 23 | group_name=GROUP_NAME 24 | ) 25 | def bronze_CA_youtube_trending(context: AssetExecutionContext) -> Output[pl.DataFrame]: 26 | """ 27 | Load table 'CA_youtube_trending_data' 28 | from MySQL database as polars DataFrame and save to MinIO 29 | """ 30 | query = """ SELECT * FROM CA_youtube_trending_data; """ 31 | 32 | pl_data: pl.DataFrame = context.resources.mysql_io_manager.extract_data(query) 33 | context.log.info(f"Extract table 'CA_youtube_trending_data' from MySQL Success") 34 | pl_data = pl_data.with_columns(pl.lit("CA").alias("country_code")) 35 | 36 | return Output( 37 | value=pl_data, 38 | metadata={ 39 | "file name": MetadataValue.text("CA_youtube_trending.pq"), 40 | "number columns": MetadataValue.int(pl_data.shape[1]), 41 | "number records": MetadataValue.int(pl_data.shape[0]) 42 | } 43 | ) 44 | 45 | 46 | @asset( 47 | name="bronze_DE_youtube_trending", 48 | required_resource_keys={"mysql_io_manager"}, 49 | io_manager_key="minio_io_manager", 50 | key_prefix=["bronze", "youtube"], 51 | compute_kind="SQL", 52 | group_name=GROUP_NAME 53 | ) 54 | def bronze_DE_youtube_trending(context: AssetExecutionContext) -> Output[pl.DataFrame]: 55 | """ 56 | Load table 'DE_youtube_trending_data' 57 | from MySQL database as polars DataFrame and save to MinIO 58 | """ 59 | query = """ SELECT * FROM DE_youtube_trending_data; """ 60 | 61 | pl_data: pl.DataFrame = context.resources.mysql_io_manager.extract_data(query) 62 | context.log.info(f"Extract table 'DE_youtube_trending_data' from MySQL Success") 63 | pl_data = pl_data.with_columns(pl.lit("DE").alias("country_code")) 64 | 65 | return Output( 66 | value=pl_data, 67 | metadata={ 68 | "file name": MetadataValue.text("DE_youtube_trending.pq"), 69 | "number columns": MetadataValue.int(pl_data.shape[1]), 70 | "number records": MetadataValue.int(pl_data.shape[0]) 71 | } 72 | ) 73 | 74 | 75 | @asset( 76 | name="bronze_IN_youtube_trending", 77 | required_resource_keys={"mysql_io_manager"}, 78 | io_manager_key="minio_io_manager", 79 | key_prefix=["bronze", "youtube"], 80 | compute_kind="SQL", 81 | group_name=GROUP_NAME 82 | ) 83 | def bronze_IN_youtube_trending(context: AssetExecutionContext) -> Output[pl.DataFrame]: 84 | """ 85 | Load table 'IN_youtube_trending_data' 86 | from MySQL database as polars DataFrame and save to MinIO 87 | """ 88 | query = """ SELECT * FROM IN_youtube_trending_data; """ 89 | 90 | pl_data: pl.DataFrame = context.resources.mysql_io_manager.extract_data(query) 91 | context.log.info(f"Extract table 'IN_youtube_trending_data' from MySQL Success") 92 | pl_data = pl_data.with_columns(pl.lit("IN").alias("country_code")) 93 | 94 | return 
Output( 95 | value=pl_data, 96 | metadata={ 97 | "file name": MetadataValue.text("IN_youtube_trending.pq"), 98 | "number columns": MetadataValue.int(pl_data.shape[1]), 99 | "number records": MetadataValue.int(pl_data.shape[0]) 100 | } 101 | ) 102 | 103 | 104 | @asset( 105 | name="bronze_JP_youtube_trending", 106 | required_resource_keys={"mysql_io_manager"}, 107 | io_manager_key="minio_io_manager", 108 | key_prefix=["bronze", "youtube"], 109 | compute_kind="SQL", 110 | group_name=GROUP_NAME 111 | ) 112 | def bronze_JP_youtube_trending(context: AssetExecutionContext) -> Output[pl.DataFrame]: 113 | """ 114 | Load table 'JP_youtube_trending_data' 115 | from MySQL database as polars DataFrame and save to MinIO 116 | """ 117 | query = """ SELECT * FROM JP_youtube_trending_data; """ 118 | 119 | pl_data: pl.DataFrame = context.resources.mysql_io_manager.extract_data(query) 120 | context.log.info(f"Extract table 'JP_youtube_trending_data' from MySQL Success") 121 | pl_data = pl_data.with_columns(pl.lit("JP").alias("country_code")) 122 | 123 | return Output( 124 | value=pl_data, 125 | metadata={ 126 | "file name": MetadataValue.text("JP_youtube_trending.pq"), 127 | "number columns": MetadataValue.int(pl_data.shape[1]), 128 | "number records": MetadataValue.int(pl_data.shape[0]) 129 | } 130 | ) 131 | 132 | 133 | @asset( 134 | name="bronze_RU_youtube_trending", 135 | required_resource_keys={"mysql_io_manager"}, 136 | io_manager_key="minio_io_manager", 137 | key_prefix=["bronze", "youtube"], 138 | compute_kind="SQL", 139 | group_name=GROUP_NAME 140 | ) 141 | def bronze_RU_youtube_trending(context: AssetExecutionContext) -> Output[pl.DataFrame]: 142 | """ 143 | Load table 'RU_youtube_trending_data' 144 | from MySQL database as polars DataFrame and save to MinIO 145 | """ 146 | query = """ SELECT * FROM RU_youtube_trending_data; """ 147 | 148 | pl_data: pl.DataFrame = context.resources.mysql_io_manager.extract_data(query) 149 | context.log.info(f"Extract table 'RU_youtube_trending_data' from MySQL Success") 150 | pl_data = pl_data.with_columns(pl.lit("RU").alias("country_code")) 151 | 152 | return Output( 153 | value=pl_data, 154 | metadata={ 155 | "file name": MetadataValue.text("RU_youtube_trending.pq"), 156 | "number columns": MetadataValue.int(pl_data.shape[1]), 157 | "number records": MetadataValue.int(pl_data.shape[0]) 158 | } 159 | ) 160 | 161 | 162 | @asset( 163 | ins={ 164 | "bronze_CA_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 165 | "bronze_DE_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 166 | "bronze_IN_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]) 167 | }, 168 | name="bronze_linkVideos_trending", 169 | required_resource_keys={"youtube_io_manager"}, 170 | io_manager_key="minio_io_manager", 171 | key_prefix=["bronze", "youtube"], 172 | group_name=GROUP_NAME, 173 | compute_kind="Youtube API" 174 | ) 175 | def bronze_linkVideos_trending(context: AssetExecutionContext, 176 | bronze_CA_youtube_trending: pl.DataFrame, 177 | bronze_DE_youtube_trending: pl.DataFrame, 178 | bronze_IN_youtube_trending: pl.DataFrame 179 | ) -> Output[pl.DataFrame]: 180 | """ 181 | Download Link Video from Youtube API by VideoId 182 | """ 183 | data = pl.concat( 184 | [ 185 | bronze_CA_youtube_trending, 186 | bronze_DE_youtube_trending, 187 | bronze_IN_youtube_trending 188 | ] 189 | ) 190 | 191 | pl_data: pl.DataFrame = context \ 192 | .resources \ 193 | .youtube_io_manager \ 194 | .downLoad_linkVideos( 195 | context, data 196 | ) 197 | context.log.info("Download links 
video from youtube api success") 198 | 199 | return Output( 200 | value=pl_data, 201 | metadata={ 202 | "File Name": MetadataValue.text("linkVideos_trending.pq"), 203 | "Number Columns": MetadataValue.int(pl_data.shape[1]), 204 | "Number Records": MetadataValue.int(pl_data.shape[0]) 205 | } 206 | ) 207 | 208 | 209 | @asset( 210 | ins={ 211 | "bronze_JP_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 212 | "bronze_RU_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]) 213 | }, 214 | name="bronze_videoCategory_trending", 215 | required_resource_keys={"youtube_io_manager"}, 216 | io_manager_key="minio_io_manager", 217 | key_prefix=["bronze", "youtube"], 218 | compute_kind="Youtube API", 219 | group_name=GROUP_NAME, 220 | ) 221 | def bronze_videoCategory_trending(context: AssetExecutionContext, 222 | bronze_JP_youtube_trending: pl.DataFrame, 223 | bronze_RU_youtube_trending: pl.DataFrame 224 | ) -> Output[pl.DataFrame]: 225 | """ 226 | Download Video Category from Youtube API by categoryId 227 | """ 228 | data = pl.concat( 229 | [ 230 | bronze_JP_youtube_trending, 231 | bronze_RU_youtube_trending 232 | ] 233 | ) 234 | 235 | pl_data: pl.DataFrame = context \ 236 | .resources \ 237 | .youtube_io_manager \ 238 | .downLoad_videoCategories( 239 | context, data 240 | ) 241 | context.log.info("Download video category from youtube api success") 242 | 243 | return Output( 244 | value=pl_data, 245 | metadata={ 246 | "File Name": MetadataValue.text("videoCategory_trending.pq"), 247 | "Number Columns": MetadataValue.int(pl_data.shape[1]), 248 | "Number Records": MetadataValue.int(pl_data.shape[0]), 249 | } 250 | ) 251 | 252 | 253 | @asset( 254 | ins={ 255 | "bronze_CA_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 256 | "bronze_DE_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 257 | "bronze_IN_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 258 | "bronze_JP_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 259 | "bronze_RU_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]) 260 | }, 261 | name="bronze_youtube_trending", 262 | required_resource_keys={"youtube_io_manager"}, 263 | io_manager_key="minio_io_manager", 264 | key_prefix=["bronze", "youtube"], 265 | compute_kind="Polars", 266 | group_name=GROUP_NAME, 267 | ) 268 | def bronze_youtube_trending(context: AssetExecutionContext, 269 | bronze_CA_youtube_trending: pl.DataFrame, 270 | bronze_DE_youtube_trending: pl.DataFrame, 271 | bronze_IN_youtube_trending: pl.DataFrame, 272 | bronze_JP_youtube_trending: pl.DataFrame, 273 | bronze_RU_youtube_trending: pl.DataFrame 274 | ) -> Output[pl.DataFrame]: 275 | """ """ 276 | 277 | pl_data = pl.concat( 278 | [ 279 | bronze_CA_youtube_trending, 280 | bronze_DE_youtube_trending, 281 | bronze_IN_youtube_trending, 282 | bronze_JP_youtube_trending, 283 | bronze_RU_youtube_trending 284 | ] 285 | ) 286 | 287 | # 2020-08-11T16:34:06Z 288 | pl_data = pl_data.with_columns(pl.col('publishedAt').apply(lambda e: e.replace('T', ' ').replace('Z', ''))) 289 | pl_data = pl_data.with_columns(pl.col("publishedAt").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S")) 290 | 291 | return Output( 292 | value=pl_data, 293 | metadata={ 294 | "File Name": MetadataValue.text("youtube_trending.pq"), 295 | "Number Columns": MetadataValue.int(pl_data.shape[1]), 296 | "Number Records": MetadataValue.int(pl_data.shape[0]), 297 | } 298 | ) -------------------------------------------------------------------------------- 
/etl_pipeline/etl_pipeline/assets/silver.py: -------------------------------------------------------------------------------- 1 | import os 2 | import polars as pl 3 | import pandas as pd 4 | from datetime import datetime 5 | 6 | from pyspark.sql import SparkSession, DataFrame 7 | from pyspark.sql.types import IntegerType, StringType 8 | from pyspark.sql.functions import udf, to_timestamp, count 9 | from pyspark.sql.functions import when, col, concat, lit 10 | 11 | from ..partitions import monthly_partitions 12 | from ..func_process import replace_str, format_date, convert 13 | from ..resources.spark_io_manager import create_spark_session 14 | 15 | from dagster import ( 16 | AssetExecutionContext, 17 | MetadataValue, 18 | AssetIn, 19 | AssetIn, 20 | Output, 21 | asset 22 | ) 23 | 24 | 25 | GROUP_NAME = "silver" 26 | 27 | @asset( 28 | ins={ 29 | "bronze_videoCategory_trending": AssetIn( 30 | key_prefix=["bronze", "youtube"] 31 | ) 32 | }, 33 | name="silver_videoCategory_cleaned", 34 | required_resource_keys={"spark_io_manager"}, 35 | io_manager_key="spark_io_manager", 36 | key_prefix=["silver", "youtube"], 37 | compute_kind="PySpark", 38 | group_name=GROUP_NAME 39 | ) 40 | def silver_videoCategory_cleaned(context: AssetExecutionContext, 41 | bronze_videoCategory_trending: pl.DataFrame 42 | ) -> Output[DataFrame]: 43 | """ 44 | Clean 'videoCategory_trending_data' and load to silver layer in MinIO 45 | """ 46 | # spark: SparkSession = context.resources.spark_io_manager.get_spark_session( 47 | # context, "silver_videoCategory_cleaned-{}".format(datetime.today()) 48 | # ) 49 | CONFIG = { 50 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 51 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 52 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 53 | } 54 | 55 | with create_spark_session( 56 | CONFIG, "silver_videoCategory_cleaned-{}".format(datetime.today()) 57 | ) as spark: 58 | 59 | # Convert from polars dataframe to pyspark dataframe 60 | spark_df: DataFrame = spark.createDataFrame(bronze_videoCategory_trending.to_pandas()) 61 | # Convert data type from string to integer of categoryId column 62 | spark_df = spark_df.withColumn("categoryId", spark_df["categoryId"].cast(IntegerType())) 63 | # Sorted dataframe by categoryId column 64 | spark_df = spark_df.orderBy(spark_df["categoryId"]) 65 | # polars_df = pl.DataFrame(spark_df.toPandas()) 66 | context.log.info(f"Cleaning for {context.asset_key.path[-1]} success 🙂") 67 | 68 | return Output( 69 | value=spark_df, 70 | metadata={ 71 | "File Name": MetadataValue.text("videoCategory_cleaned.pq"), 72 | "Number Columns": MetadataValue.int(len(spark_df.columns)), 73 | "Number Records": MetadataValue.int(spark_df.count()) 74 | } 75 | ) 76 | 77 | 78 | @asset( 79 | ins={ 80 | "bronze_linkVideos_trending": AssetIn(key_prefix=["bronze", "youtube"]), 81 | "bronze_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 82 | }, 83 | name="silver_linkVideos_cleaned", 84 | required_resource_keys={"spark_io_manager", "youtube_io_manager"}, 85 | io_manager_key="spark_io_manager", 86 | key_prefix=["silver", "youtube"], 87 | compute_kind="PySpark", 88 | group_name=GROUP_NAME 89 | ) 90 | def silver_linkVideos_cleaned(context: AssetExecutionContext, 91 | bronze_linkVideos_trending: pl.DataFrame, 92 | bronze_youtube_trending: pl.DataFrame 93 | ) -> Output[DataFrame]: 94 | """ 95 | Clean 'linkVideos_trending_data' and load to silver layer in MinIO 96 | """ 97 | # spark: SparkSession = context.resources.spark_io_manager.get_spark_session( 98 | # 
context, "silver_linkVideos_cleaned-{}".format(datetime.today()) 99 | # ) 100 | 101 | CONFIG = { 102 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 103 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 104 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 105 | } 106 | 107 | with create_spark_session( 108 | CONFIG, "silver_linkVideos_cleaned-{}".format(datetime.today()) 109 | ) as spark: 110 | 111 | # Convert from polars dataframe to pyspark dataframe for linkVideos 112 | linkVideos: DataFrame = spark.createDataFrame(bronze_linkVideos_trending.to_pandas()) 113 | # Convert from polars dataframe to pyspark dataframe for trending 114 | trending: DataFrame = spark.createDataFrame(bronze_youtube_trending.to_pandas()) 115 | # Drop duplicates by video_id for trending 116 | trending = trending.dropDuplicates(["video_id"]) 117 | # Convert the link to the correct format 118 | link_format = udf(convert, StringType()) 119 | linkVideos = linkVideos.withColumn("link_video", link_format(linkVideos['link_video'])) 120 | # Join two dataframe by video_id 121 | spark_df = linkVideos.join( 122 | trending, 123 | linkVideos["videoId"] == trending["video_id"], 124 | how="outer", 125 | ).select(trending.video_id, linkVideos.link_video) 126 | spark_df.cache() 127 | 128 | # fill NA for link video 129 | spark_df = spark_df.withColumn("link_video",when( 130 | col("link_video").isNull(), 131 | concat(lit("www.youtube.com/embed/"), 132 | col("video_id"))).otherwise(col("link_video")) 133 | ) 134 | context.log.info(f"Cleaning for {context.asset_key.path[-1]} success 🙂") 135 | 136 | spark_df.unpersist() 137 | 138 | # trending = pl.concat( 139 | # [ 140 | # silver_youtube_trending_01, 141 | # silver_youtube_trending_02 142 | # ] 143 | # ) 144 | # bronze_linkVideos_trending = bronze_linkVideos_trending.with_columns( 145 | # pl.col('link_video').apply(lambda e: e.replace('"', '')) 146 | # ) 147 | # bronze_youtube_trending = bronze_youtube_trending.unique(subset=["video_id"]) 148 | # polars_df = bronze_linkVideos_trending.join( 149 | # bronze_youtube_trending, 150 | # left_on="videoId", 151 | # right_on="video_id", 152 | # how="outer" 153 | # ).select(["video_id", "link_video"]) 154 | 155 | # polars_df = polars_df.with_columns( 156 | # pl.when(pl.col("link_video").is_null()).then(pl.format("www.youtube.com/embed/{}", pl.col("video_id"))) 157 | # .otherwise(pl.col("link_video")).alias("link_video") 158 | # ) 159 | # context.log.info(f"Cleaning for {context.asset_key.path[-1]} success 🙂") 160 | 161 | return Output( 162 | value=spark_df, 163 | metadata={ 164 | "File Name": MetadataValue.text("linkVideos_cleaned.pq"), 165 | "Number Columns": MetadataValue.int(len(spark_df.columns)), 166 | "Number Records": MetadataValue.int(spark_df.count()) 167 | } 168 | ) 169 | 170 | 171 | @asset( 172 | ins={ 173 | "bronze_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 174 | }, 175 | name="silver_trending_cleaned", 176 | required_resource_keys={"spark_io_manager"}, 177 | io_manager_key="spark_io_manager", 178 | key_prefix=["silver", "youtube"], 179 | partitions_def=monthly_partitions, 180 | compute_kind="PySpark", 181 | group_name=GROUP_NAME 182 | ) 183 | def silver_trending_cleaned(context: AssetExecutionContext, 184 | bronze_youtube_trending: pl.DataFrame, 185 | ) -> Output[DataFrame]: 186 | """ 187 | Clean 'bronze_youtube_trending_data' and load to silver layer in MinIO 188 | """ 189 | 190 | try: 191 | partition_date_str = context.asset_partition_key_for_output() 192 | data_by_publishedAt = 
bronze_youtube_trending.filter( 193 | (pl.col("publishedAt").dt.year() == int(partition_date_str[:4])) & 194 | (pl.col("publishedAt").dt.month() == int(partition_date_str[5:7])) 195 | ) 196 | except Exception as e: 197 | raise Exception(f"{e}") 198 | 199 | # data_by_publishedAt = data_by_publishedAt.with_columns( 200 | # pl.col('trending_date').apply(lambda e: e.replace('T', ' ').replace('Z', '')) 201 | # ) 202 | # data_by_publishedAt = data_by_publishedAt.with_columns( 203 | # pl.col("trending_date").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S") 204 | # ) 205 | # data_by_publishedAt = data_by_publishedAt.with_columns( 206 | # pl.when(pl.col("thumbnail_link").is_not_null()) 207 | # .then(pl.col("thumbnail_link").str.replace("default.jpg", "maxresdefault.jpg")) 208 | # .otherwise(pl.col("thumbnail_link")).alias("thumbnail_link") 209 | # ) 210 | 211 | # data_by_publishedAt = data_by_publishedAt.with_columns( 212 | # pl.col("comment_count").str.parse_int(10, strict=False) 213 | # ) 214 | # data_by_publishedAt = data_by_publishedAt.filter(pl.col("comment_count").is_not_null()) 215 | 216 | # data_by_publishedAt = data_by_publishedAt.with_columns( 217 | # pl.col('tags').apply(lambda e: e.replace('|', ' #').replace('Z', '')) 218 | # ) #Squeezie arnaque #Squeezie tableau #Squeezie thread #Squeezie art #Squeezie arnaqueur 219 | 220 | # data_by_publishedAt = data_by_publishedAt.with_columns( 221 | # (pl.col('tags').apply(lambda x: f"#{x}")) 222 | # ) 223 | 224 | # data_by_publishedAt = data_by_publishedAt.with_columns([ 225 | # pl.col("categoryId").cast(pl.Int64), 226 | # pl.col("view_count").cast(pl.Int64), 227 | # pl.col("likes").cast(pl.Int64), 228 | # pl.col("dislikes").cast(pl.Int64), 229 | # pl.col("comment_count").cast(pl.Int64) 230 | # ]) 231 | # context.log.info(f"Cleaning for {context.asset_key.path[-1]} success 🙂") 232 | 233 | # polars_df: pl.DataFrame = data_by_publishedAt 234 | 235 | # spark: SparkSession = context.resources.spark_io_manager.get_spark_session( 236 | # context, "silver_trending_cleaned-{}".format(datetime.today()) 237 | # ) 238 | 239 | CONFIG = { 240 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 241 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 242 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 243 | } 244 | 245 | with create_spark_session( 246 | CONFIG, "silver_trending_cleaned-{}".format(datetime.today()) 247 | ) as spark: 248 | 249 | spark_df: DataFrame = spark.createDataFrame(data_by_publishedAt.to_pandas()) 250 | # publishedAt replace to format date 251 | date_format = udf(format_date, StringType()) 252 | # spark_df = spark_df.withColumn("publishedAt", date_format(spark_df["publishedAt"])) 253 | # Convert date type of column publishedAt to datetime data type 254 | spark_df = spark_df.withColumn("publishedAt", to_timestamp("publishedAt")) 255 | # Convert date type of column categoryId to integer data type 256 | spark_df = spark_df.withColumn("categoryId", spark_df["categoryId"].cast(IntegerType())) 257 | # trending_date replace to format date 258 | spark_df = spark_df.withColumn("trending_date", date_format(spark_df["trending_date"])) 259 | # Convert date type of column trending_date to datetime data type 260 | spark_df = spark_df.withColumn("trending_date", to_timestamp("trending_date")) 261 | # Convert date type of column view_count to integer data type 262 | spark_df = spark_df.withColumn("view_count", spark_df["view_count"].cast(IntegerType())) 263 | # Convert date type of column likes to integer data type 264 | spark_df = 
spark_df.withColumn("likes", spark_df["likes"].cast(IntegerType())) 265 | # Convert date type of column dislikes to integer data type 266 | spark_df = spark_df.withColumn("dislikes", spark_df["dislikes"].cast(IntegerType())) 267 | # Convert date type of column comment_count to integer data type 268 | spark_df = spark_df.withColumn("comment_count", spark_df["comment_count"].cast(IntegerType())) 269 | # thumbnail_link replace from default to maxresdefault 270 | link_convert = udf(replace_str, StringType()) 271 | spark_df = spark_df.withColumn("thumbnail_link", link_convert(spark_df["thumbnail_link"])) 272 | # context.log.info(f"Data: {spark_df.show(5)}") 273 | spark_df.unpersist() 274 | context.log.info(f"Cleaning for {context.asset_key.path[-1]} success 🙂") 275 | # polars_df = pl.DataFrame(spark_df.toPandas()) 276 | 277 | return Output( 278 | value=spark_df, 279 | metadata={ 280 | "file name": MetadataValue.text(f"{partition_date_str[:7]}.pq"), 281 | "Records": MetadataValue.int(spark_df.count()), 282 | "Columns": MetadataValue.int(len(spark_df.columns)) 283 | } 284 | ) -------------------------------------------------------------------------------- /public/notebooks/Preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "0wCyW8k45E-a", 6 | "metadata": { 7 | "id": "0wCyW8k45E-a" 8 | }, 9 | "source": [ 10 | "# Xử lý trước khi đưa vào Asset" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "6880fc6d", 16 | "metadata": { 17 | "id": "6880fc6d" 18 | }, 19 | "source": [ 20 | "# Import Library" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "id": "8kaoHMK_lY1U", 27 | "metadata": { 28 | "colab": { 29 | "base_uri": "https://localhost:8080/" 30 | }, 31 | "id": "8kaoHMK_lY1U", 32 | "outputId": "10480db2-cc8d-44ef-8fda-0c3bba66971f" 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Mounted at /content/drive\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "from google.colab import drive\n", 45 | "drive.mount('/content/drive')" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "id": "6-BAYjQS2NzN", 52 | "metadata": { 53 | "id": "6-BAYjQS2NzN" 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "from pyspark.sql import SparkSession, DataFrame\n", 58 | "from pyspark.sql.types import IntegerType, StringType\n", 59 | "from pyspark.sql.functions import udf, to_timestamp" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 6, 65 | "id": "45adf8f6-0480-49af-966b-d1dfefa29ab3", 66 | "metadata": { 67 | "id": "45adf8f6-0480-49af-966b-d1dfefa29ab3" 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "# Create SparkSession object\n", 72 | "spark = SparkSession.builder \\\n", 73 | " .appName(\"HomeWork-W6\") \\\n", 74 | " .getOrCreate()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 8, 80 | "id": "dhgY_ug22qoX", 81 | "metadata": { 82 | "id": "dhgY_ug22qoX" 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "spark_df = spark.read.parquet(\"/content/drive/MyDrive/Colab Notebooks/202204.pq\")" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 9, 92 | "id": "dY1y4YBf2zox", 93 | "metadata": { 94 | "colab": { 95 | "base_uri": "https://localhost:8080/" 96 | }, 97 | "id": "dY1y4YBf2zox", 98 | "outputId": "329f8ff8-632c-4018-e840-94cf7f8da43d" 99 | }, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": 
"stream", 104 | "text": [ 105 | "+-----------+--------------------+--------------------+--------------------+---------------+----------+--------------------+--------------------+----------+-----+--------+-------------+--------------------+-----------------+----------------+\n", 106 | "| video_id| title| publishedAt| channelId| channelTitle|categoryId| trending_date| tags|view_count|likes|dislikes|comment_count| thumbnail_link|comments_disabled|ratings_disabled|\n", 107 | "+-----------+--------------------+--------------------+--------------------+---------------+----------+--------------------+--------------------+----------+-----+--------+-------------+--------------------+-----------------+----------------+\n", 108 | "|zoHGxJKjC_Y|Heiratsantrag, di...|2022-04-01T16:36:48Z|UCm3_j4RLEzgMovQT...| Drachen Lord| 24|2022-04-02T00:00:00Z|drachenlord origi...| 126194| 4922| 0| 2069|https://i.ytimg.c...| False| False|\n", 109 | "|s38-OigKoIU|Nachgefragt: Panz...|2022-04-01T11:03:23Z|UClCZul-nK9h8eVo7...| Bundeswehr| 25|2022-04-02T00:00:00Z|Bundeswehr|Soldat...| 345217|10056| 0| 1927|https://i.ytimg.c...| False| False|\n", 110 | "|fn_DBhbEscA|Aprilscherze in d...|2022-04-01T12:00:31Z|UC6UrlhHQXm9tWhZc...| How2Shirli| 22|2022-04-02T00:00:00Z| [None]| 353375|47638| 0| 517|https://i.ytimg.c...| False| False|\n", 111 | "|JpiJT7lLuAE| MOIN GERHARD!|2022-04-01T12:33:48Z|UC3oj6YrK6Tj3tR6-...| Tom Stein| 24|2022-04-02T00:00:00Z| [None]| 67361| 6114| 0| 435|https://i.ytimg.c...| False| False|\n", 112 | "|u_D9tg3cK1w|Saltatio Mortis f...|2022-04-01T10:01:34Z|UCDGhwUyQMvcNqz15...|Saltatio Mortis| 24|2022-04-02T00:00:00Z|Saltatio Morits|H...| 34958| 572| 0| 163|https://i.ytimg.c...| False| False|\n", 113 | "+-----------+--------------------+--------------------+--------------------+---------------+----------+--------------------+--------------------+----------+-----+--------+-------------+--------------------+-----------------+----------------+\n", 114 | "only showing top 5 rows\n", 115 | "\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "spark_df.show(5)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 10, 126 | "id": "v7aGpy5328MN", 127 | "metadata": { 128 | "colab": { 129 | "base_uri": "https://localhost:8080/" 130 | }, 131 | "id": "v7aGpy5328MN", 132 | "outputId": "fed2732f-d0c5-4e72-88e7-53360849b9e5" 133 | }, 134 | "outputs": [ 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "root\n", 140 | " |-- video_id: string (nullable = true)\n", 141 | " |-- title: string (nullable = true)\n", 142 | " |-- publishedAt: string (nullable = true)\n", 143 | " |-- channelId: string (nullable = true)\n", 144 | " |-- channelTitle: string (nullable = true)\n", 145 | " |-- categoryId: string (nullable = true)\n", 146 | " |-- trending_date: string (nullable = true)\n", 147 | " |-- tags: string (nullable = true)\n", 148 | " |-- view_count: string (nullable = true)\n", 149 | " |-- likes: string (nullable = true)\n", 150 | " |-- dislikes: string (nullable = true)\n", 151 | " |-- comment_count: string (nullable = true)\n", 152 | " |-- thumbnail_link: string (nullable = true)\n", 153 | " |-- comments_disabled: string (nullable = true)\n", 154 | " |-- ratings_disabled: string (nullable = true)\n", 155 | "\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "spark_df.printSchema()" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 7, 166 | "id": "tQW9x4yi2Fta", 167 | "metadata": { 168 | "id": "tQW9x4yi2Fta" 169 | }, 170 | "outputs": [], 
171 | "source": [ 172 | "def replace_str(value: str):\n", 173 | " return value.replace(\"default\", \"maxresdefault\")\n", 174 | "\n", 175 | "def format_date(value: str):\n", 176 | " return value.replace(\"T\", \" \").replace(\"Z\", \"\")" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 11, 182 | "id": "Wcl-xS4r2Fwq", 183 | "metadata": { 184 | "id": "Wcl-xS4r2Fwq" 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "date_format = udf(format_date, StringType())\n", 189 | "spark_df = spark_df.withColumn(\"publishedAt\", date_format(spark_df[\"publishedAt\"]))" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 12, 195 | "id": "r6hpuRoT3he6", 196 | "metadata": { 197 | "colab": { 198 | "base_uri": "https://localhost:8080/" 199 | }, 200 | "id": "r6hpuRoT3he6", 201 | "outputId": "581b6d63-cf30-4e86-a472-e8844d928326" 202 | }, 203 | "outputs": [ 204 | { 205 | "name": "stdout", 206 | "output_type": "stream", 207 | "text": [ 208 | "+-------------------+\n", 209 | "| publishedAt|\n", 210 | "+-------------------+\n", 211 | "|2022-04-01 16:36:48|\n", 212 | "|2022-04-01 11:03:23|\n", 213 | "+-------------------+\n", 214 | "only showing top 2 rows\n", 215 | "\n" 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "spark_df.select(\"publishedAt\").show(2)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 13, 226 | "id": "AN2oaZJm2F5E", 227 | "metadata": { 228 | "id": "AN2oaZJm2F5E" 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "# Convert date type of column publishedAt to datetime data type\n", 233 | "spark_df = spark_df.withColumn(\"publishedAt\", to_timestamp(\"publishedAt\"))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 14, 239 | "id": "wOsobTLN2F7Q", 240 | "metadata": { 241 | "id": "wOsobTLN2F7Q" 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "# Convert date type of column categoryId to integer data type\n", 246 | "spark_df = spark_df.withColumn(\"categoryId\", spark_df[\"categoryId\"].cast(IntegerType()))" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 15, 252 | "id": "g6FcJJ1N2F-O", 253 | "metadata": { 254 | "id": "g6FcJJ1N2F-O" 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "# trending_date replace to format date\n", 259 | "spark_df = spark_df.withColumn(\"trending_date\", date_format(spark_df[\"trending_date\"]))" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 16, 265 | "id": "JSNG_4MW4ELz", 266 | "metadata": { 267 | "colab": { 268 | "base_uri": "https://localhost:8080/" 269 | }, 270 | "id": "JSNG_4MW4ELz", 271 | "outputId": "c16f96a7-9267-40da-c2d1-7b0f429fc15c" 272 | }, 273 | "outputs": [ 274 | { 275 | "name": "stdout", 276 | "output_type": "stream", 277 | "text": [ 278 | "+-------------------+\n", 279 | "| trending_date|\n", 280 | "+-------------------+\n", 281 | "|2022-04-02 00:00:00|\n", 282 | "|2022-04-02 00:00:00|\n", 283 | "+-------------------+\n", 284 | "only showing top 2 rows\n", 285 | "\n" 286 | ] 287 | } 288 | ], 289 | "source": [ 290 | "spark_df.select(\"trending_date\").show(2)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 17, 296 | "id": "4bovaYHZ2GBQ", 297 | "metadata": { 298 | "id": "4bovaYHZ2GBQ" 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "# Convert date type of column trending_date to datetime data type\n", 303 | "spark_df = spark_df.withColumn(\"trending_date\", to_timestamp(\"trending_date\"))" 304 | ] 305 | }, 306 | { 307 | "cell_type": 
"code", 308 | "execution_count": 18, 309 | "id": "xlQ7QkYJ2GEA", 310 | "metadata": { 311 | "id": "xlQ7QkYJ2GEA" 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "# Convert date type of column view_count to integer data type\n", 316 | "spark_df = spark_df.withColumn(\"view_count\", spark_df[\"view_count\"].cast(IntegerType()))" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 19, 322 | "id": "Inflh60t2GGj", 323 | "metadata": { 324 | "id": "Inflh60t2GGj" 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "# Convert date type of column likes to integer data type\n", 329 | "spark_df = spark_df.withColumn(\"likes\", spark_df[\"likes\"].cast(IntegerType()))" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 20, 335 | "id": "jxJWX5ox2GLU", 336 | "metadata": { 337 | "id": "jxJWX5ox2GLU" 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "# Convert date type of column dislikes to integer data type\n", 342 | "spark_df = spark_df.withColumn(\"dislikes\", spark_df[\"dislikes\"].cast(IntegerType()))" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 21, 348 | "id": "ZcSxAoyQ3ZTo", 349 | "metadata": { 350 | "id": "ZcSxAoyQ3ZTo" 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "# Convert date type of column comment_count to integer data type\n", 355 | "spark_df = spark_df.withColumn(\"comment_count\", spark_df[\"comment_count\"].cast(IntegerType()))" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 22, 361 | "id": "sR2x4HY83ZWz", 362 | "metadata": { 363 | "id": "sR2x4HY83ZWz" 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "# thumbnail_link replace from default to maxresdefault\n", 368 | "link_convert = udf(replace_str, StringType())\n", 369 | "spark_df = spark_df.withColumn(\"thumbnail_link\", link_convert(spark_df[\"thumbnail_link\"]))" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 27, 375 | "id": "m8REaetN3ZZU", 376 | "metadata": { 377 | "colab": { 378 | "base_uri": "https://localhost:8080/", 379 | "height": 36 380 | }, 381 | "id": "m8REaetN3ZZU", 382 | "outputId": "893811ff-3c5a-410a-ea90-96d637af1ba2" 383 | }, 384 | "outputs": [ 385 | { 386 | "data": { 387 | "application/vnd.google.colaboratory.intrinsic+json": { 388 | "type": "string" 389 | }, 390 | "text/plain": [ 391 | "'https://i.ytimg.com/vi/EfP1h_3u0Lk/maxresdefault.jpg'" 392 | ] 393 | }, 394 | "execution_count": 27, 395 | "metadata": {}, 396 | "output_type": "execute_result" 397 | } 398 | ], 399 | "source": [ 400 | "spark_df.select(\"thumbnail_link\").collect()[17][0]" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 28, 406 | "id": "PlOr7qhb3Zbh", 407 | "metadata": { 408 | "colab": { 409 | "base_uri": "https://localhost:8080/" 410 | }, 411 | "id": "PlOr7qhb3Zbh", 412 | "outputId": "f49d0243-e8c7-4d80-eca1-154313a60f3f" 413 | }, 414 | "outputs": [ 415 | { 416 | "name": "stdout", 417 | "output_type": "stream", 418 | "text": [ 419 | "root\n", 420 | " |-- video_id: string (nullable = true)\n", 421 | " |-- title: string (nullable = true)\n", 422 | " |-- publishedAt: timestamp (nullable = true)\n", 423 | " |-- channelId: string (nullable = true)\n", 424 | " |-- channelTitle: string (nullable = true)\n", 425 | " |-- categoryId: integer (nullable = true)\n", 426 | " |-- trending_date: timestamp (nullable = true)\n", 427 | " |-- tags: string (nullable = true)\n", 428 | " |-- view_count: integer (nullable = true)\n", 429 | " |-- likes: integer (nullable = 
true)\n", 430 | " |-- dislikes: integer (nullable = true)\n", 431 | " |-- comment_count: integer (nullable = true)\n", 432 | " |-- thumbnail_link: string (nullable = true)\n", 433 | " |-- comments_disabled: string (nullable = true)\n", 434 | " |-- ratings_disabled: string (nullable = true)\n", 435 | "\n" 436 | ] 437 | } 438 | ], 439 | "source": [ 440 | "# Check\n", 441 | "spark_df.printSchema()" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "id": "0npxdqo03Zdr", 448 | "metadata": { 449 | "id": "0npxdqo03Zdr" 450 | }, 451 | "outputs": [], 452 | "source": [] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "id": "vJWbPWHg3ZgW", 458 | "metadata": { 459 | "id": "vJWbPWHg3ZgW" 460 | }, 461 | "outputs": [], 462 | "source": [] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "id": "lrfACvfa2GO2", 468 | "metadata": { 469 | "id": "lrfACvfa2GO2" 470 | }, 471 | "outputs": [], 472 | "source": [] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "id": "oMo9Hmry2GXW", 478 | "metadata": { 479 | "id": "oMo9Hmry2GXW" 480 | }, 481 | "outputs": [], 482 | "source": [] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "id": "bb609589-83c8-45c5-b4ce-371458647e8c", 488 | "metadata": { 489 | "id": "bb609589-83c8-45c5-b4ce-371458647e8c" 490 | }, 491 | "outputs": [], 492 | "source": [ 493 | "from pyspark.sql.functions import *\n", 494 | "from pyspark.sql import SparkSession\n", 495 | "from pyspark.sql.functions import round\n", 496 | "from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Bucketizer\n", 497 | "from pyspark.ml.regression import LinearRegression" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "id": "c9fe1d06-d632-401b-b98b-c59d1198685b", 504 | "metadata": { 505 | "id": "c9fe1d06-d632-401b-b98b-c59d1198685b" 506 | }, 507 | "outputs": [], 508 | "source": [ 509 | "import seaborn as sns\n", 510 | "import matplotlib.pyplot as plt\n", 511 | "\n", 512 | "%matplotlib inline\n", 513 | "import warnings\n", 514 | "warnings.filterwarnings('ignore')" 515 | ] 516 | }, 517 | { 518 | "cell_type": "markdown", 519 | "id": "c1271c1f", 520 | "metadata": { 521 | "id": "c1271c1f" 522 | }, 523 | "source": [ 524 | "# Read Data" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": null, 530 | "id": "801e7626", 531 | "metadata": { 532 | "id": "801e7626" 533 | }, 534 | "outputs": [], 535 | "source": [ 536 | "# Read data from CSV file\n", 537 | "df = spark.read.csv('/content/drive/MyDrive/Colab Notebooks/properties_2016.csv', sep=',', header=True, inferSchema=True, nullValue='NA')" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "id": "968d018d", 544 | "metadata": { 545 | "colab": { 546 | "base_uri": "https://localhost:8080/" 547 | }, 548 | "id": "968d018d", 549 | "outputId": "c0050aba-eb23-482d-b4d9-ad85826b8ddf" 550 | }, 551 | "outputs": [ 552 | { 553 | "name": "stdout", 554 | "output_type": "stream", 555 | "text": [ 556 | "Dataset have 2985217 records\n" 557 | ] 558 | } 559 | ], 560 | "source": [ 561 | "# Get number of records\n", 562 | "print(f\"Dataset have {df.count()} records\")" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "id": "ea7b3db3-8682-4bac-a2ec-cd0324cb19cd", 569 | "metadata": { 570 | "colab": { 571 | "base_uri": "https://localhost:8080/" 572 | }, 573 | "id": 
"ea7b3db3-8682-4bac-a2ec-cd0324cb19cd", 574 | "outputId": "c71a1c3a-6004-4e52-9cbd-b05ed8b71e61" 575 | }, 576 | "outputs": [ 577 | { 578 | "name": "stdout", 579 | "output_type": "stream", 580 | "text": [ 581 | "column: 58\n", 582 | "row: 2985217\n" 583 | ] 584 | } 585 | ], 586 | "source": [ 587 | "# Get Shape dataset\n", 588 | "print(f\"column: {len(df.columns)}\\nrow: {df.count()}\")" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "id": "7ec68403", 595 | "metadata": { 596 | "colab": { 597 | "base_uri": "https://localhost:8080/" 598 | }, 599 | "id": "7ec68403", 600 | "outputId": "e729d51d-deed-49b5-da2c-cfe3701d5f53" 601 | }, 602 | "outputs": [ 603 | { 604 | "name": "stdout", 605 | "output_type": "stream", 606 | "text": [ 607 | "+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+----+------------+-----------+------------+---------------+--------------+---------------------+--------+----------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 608 | "|parcelid|airconditioningtypeid|architecturalstyletypeid|basementsqft|bathroomcnt|bedroomcnt|buildingclasstypeid|buildingqualitytypeid|calculatedbathnbr|decktypeid|finishedfloor1squarefeet|calculatedfinishedsquarefeet|finishedsquarefeet12|finishedsquarefeet13|finishedsquarefeet15|finishedsquarefeet50|finishedsquarefeet6|fips|fireplacecnt|fullbathcnt|garagecarcnt|garagetotalsqft|hashottuborspa|heatingorsystemtypeid|latitude| longitude|lotsizesquarefeet|poolcnt|poolsizesum|pooltypeid10|pooltypeid2|pooltypeid7|propertycountylandusecode|propertylandusetypeid|propertyzoningdesc|rawcensustractandblock|regionidcity|regionidcounty|regionidneighborhood|regionidzip|roomcnt|storytypeid|threequarterbathnbr|typeconstructiontypeid|unitcnt|yardbuildingsqft17|yardbuildingsqft26|yearbuilt|numberofstories|fireplaceflag|structuretaxvaluedollarcnt|taxvaluedollarcnt|assessmentyear|landtaxvaluedollarcnt|taxamount|taxdelinquencyflag|taxdelinquencyyear|censustractandblock|\n", 609 | 
"+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+----+------------+-----------+------------+---------------+--------------+---------------------+--------+----------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 610 | "|10754147| NULL| NULL| NULL| 0.0| 0.0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|6037| NULL| NULL| NULL| NULL| NULL| NULL|34144442|-118654084| 85768.0| NULL| NULL| NULL| NULL| NULL| 010D| 269| NULL| 6.0378002041E7| 37688| 3101| NULL| 96337| 0.0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| 9.0| 2015| 9.0| NULL| NULL| NULL| NULL|\n", 611 | "|10759547| NULL| NULL| NULL| 0.0| 0.0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|6037| NULL| NULL| NULL| NULL| NULL| NULL|34140430|-118625364| 4083.0| NULL| NULL| NULL| NULL| NULL| 0109| 261| LCA11*| 6.0378001011002E7| 37688| 3101| NULL| 96337| 0.0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| 27516.0| 2015| 27516.0| NULL| NULL| NULL| NULL|\n", 612 | "|10843547| NULL| NULL| NULL| 0.0| 0.0| NULL| NULL| NULL| NULL| NULL| 73026.0| NULL| NULL| 73026| NULL| NULL|6037| NULL| NULL| NULL| NULL| NULL| NULL|33989359|-118394633| 63085.0| NULL| NULL| NULL| NULL| NULL| 1200| 47| LAC2| 6.0377030012017E7| 51617| 3101| NULL| 96095| 0.0| NULL| NULL| NULL| 2| NULL| NULL| NULL| NULL| NULL| 650756.0| 1413387.0| 2015| 762631.0| 20800.37| NULL| NULL| NULL|\n", 613 | "|10859147| NULL| NULL| NULL| 0.0| 0.0| 3| 7| NULL| NULL| NULL| 5068.0| NULL| NULL| 5068| NULL| NULL|6037| NULL| NULL| NULL| NULL| NULL| NULL|34148863|-118437206| 7521.0| NULL| NULL| NULL| NULL| NULL| 1200| 47| LAC2| 6.0371412023001E7| 12447| 3101| 27080| 96424| 0.0| NULL| NULL| NULL| NULL| NULL| NULL| 1948.0| 1| NULL| 571346.0| 1156834.0| 2015| 585488.0| 14557.57| NULL| NULL| NULL|\n", 614 | "|10879947| NULL| NULL| NULL| 0.0| 0.0| 4| NULL| NULL| NULL| NULL| 1776.0| NULL| NULL| 1776| NULL| NULL|6037| NULL| NULL| NULL| NULL| NULL| NULL|34194168|-118385816| 8512.0| NULL| NULL| NULL| NULL| NULL| 1210| 31| LAM1| 6.0371232052003E7| 12447| 3101| 46795| 96450| 0.0| NULL| NULL| NULL| 1| NULL| NULL| 1947.0| NULL| NULL| 193796.0| 433491.0| 2015| 239695.0| 5725.17| NULL| NULL| NULL|\n", 615 | 
"+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+----+------------+-----------+------------+---------------+--------------+---------------------+--------+----------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 616 | "only showing top 5 rows\n", 617 | "\n" 618 | ] 619 | } 620 | ], 621 | "source": [ 622 | "# View five records data\n", 623 | "df.show(5)" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "id": "32d76054", 630 | "metadata": { 631 | "colab": { 632 | "base_uri": "https://localhost:8080/" 633 | }, 634 | "id": "32d76054", 635 | "outputId": "cf3b6c89-2ef3-47b6-f4e3-a50eb579ae5d" 636 | }, 637 | "outputs": [ 638 | { 639 | "name": "stdout", 640 | "output_type": "stream", 641 | "text": [ 642 | "root\n", 643 | " |-- parcelid: integer (nullable = true)\n", 644 | " |-- airconditioningtypeid: integer (nullable = true)\n", 645 | " |-- architecturalstyletypeid: integer (nullable = true)\n", 646 | " |-- basementsqft: integer (nullable = true)\n", 647 | " |-- bathroomcnt: double (nullable = true)\n", 648 | " |-- bedroomcnt: double (nullable = true)\n", 649 | " |-- buildingclasstypeid: integer (nullable = true)\n", 650 | " |-- buildingqualitytypeid: integer (nullable = true)\n", 651 | " |-- calculatedbathnbr: double (nullable = true)\n", 652 | " |-- decktypeid: integer (nullable = true)\n", 653 | " |-- finishedfloor1squarefeet: integer (nullable = true)\n", 654 | " |-- calculatedfinishedsquarefeet: double (nullable = true)\n", 655 | " |-- finishedsquarefeet12: integer (nullable = true)\n", 656 | " |-- finishedsquarefeet13: integer (nullable = true)\n", 657 | " |-- finishedsquarefeet15: integer (nullable = true)\n", 658 | " |-- finishedsquarefeet50: integer (nullable = true)\n", 659 | " |-- finishedsquarefeet6: integer (nullable = true)\n", 660 | " |-- fips: integer (nullable = true)\n", 661 | " |-- fireplacecnt: integer (nullable = true)\n", 662 | " |-- fullbathcnt: integer (nullable = true)\n", 663 | " |-- garagecarcnt: integer (nullable = true)\n", 664 | " |-- garagetotalsqft: integer (nullable = true)\n", 665 | " |-- hashottuborspa: boolean (nullable = true)\n", 666 | " |-- heatingorsystemtypeid: integer (nullable = true)\n", 667 | " |-- latitude: integer (nullable = true)\n", 668 | " |-- longitude: integer (nullable = true)\n", 669 | " |-- lotsizesquarefeet: double (nullable = true)\n", 670 | " |-- poolcnt: integer (nullable = true)\n", 671 | " |-- poolsizesum: integer (nullable = true)\n", 672 | " |-- pooltypeid10: integer (nullable = true)\n", 673 | " |-- pooltypeid2: integer (nullable = true)\n", 674 | " |-- pooltypeid7: integer (nullable = true)\n", 675 | " |-- propertycountylandusecode: string (nullable = true)\n", 676 | " |-- propertylandusetypeid: integer (nullable = true)\n", 677 | " |-- 
propertyzoningdesc: string (nullable = true)\n", 678 | " |-- rawcensustractandblock: double (nullable = true)\n", 679 | " |-- regionidcity: integer (nullable = true)\n", 680 | " |-- regionidcounty: integer (nullable = true)\n", 681 | " |-- regionidneighborhood: integer (nullable = true)\n", 682 | " |-- regionidzip: integer (nullable = true)\n", 683 | " |-- roomcnt: double (nullable = true)\n", 684 | " |-- storytypeid: integer (nullable = true)\n", 685 | " |-- threequarterbathnbr: integer (nullable = true)\n", 686 | " |-- typeconstructiontypeid: integer (nullable = true)\n", 687 | " |-- unitcnt: integer (nullable = true)\n", 688 | " |-- yardbuildingsqft17: integer (nullable = true)\n", 689 | " |-- yardbuildingsqft26: integer (nullable = true)\n", 690 | " |-- yearbuilt: double (nullable = true)\n", 691 | " |-- numberofstories: integer (nullable = true)\n", 692 | " |-- fireplaceflag: boolean (nullable = true)\n", 693 | " |-- structuretaxvaluedollarcnt: double (nullable = true)\n", 694 | " |-- taxvaluedollarcnt: double (nullable = true)\n", 695 | " |-- assessmentyear: integer (nullable = true)\n", 696 | " |-- landtaxvaluedollarcnt: double (nullable = true)\n", 697 | " |-- taxamount: double (nullable = true)\n", 698 | " |-- taxdelinquencyflag: string (nullable = true)\n", 699 | " |-- taxdelinquencyyear: integer (nullable = true)\n", 700 | " |-- censustractandblock: long (nullable = true)\n", 701 | "\n" 702 | ] 703 | } 704 | ], 705 | "source": [ 706 | "df.printSchema()" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": null, 712 | "id": "65634354", 713 | "metadata": { 714 | "colab": { 715 | "base_uri": "https://localhost:8080/" 716 | }, 717 | "id": "65634354", 718 | "outputId": "5ad24bc2-fef7-4898-bcd0-f401ec43b6c3" 719 | }, 720 | "outputs": [ 721 | { 722 | "name": "stdout", 723 | "output_type": "stream", 724 | "text": [ 725 | "+-------+--------------------+---------------------+------------------------+-----------------+------------------+------------------+-------------------+---------------------+------------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+---------------------+-------------------+--------------------+------------------+-------+------------------+------------+-----------+-----------+-------------------------+---------------------+--------------------+----------------------+-----------------+------------------+--------------------+-----------------+------------------+-----------+-------------------+----------------------+------------------+------------------+------------------+------------------+------------------+--------------------------+------------------+-------------------+---------------------+-----------------+------------------+------------------+--------------------+\n", 726 | "|summary| parcelid|airconditioningtypeid|architecturalstyletypeid| basementsqft| bathroomcnt| bedroomcnt|buildingclasstypeid|buildingqualitytypeid| calculatedbathnbr|decktypeid|finishedfloor1squarefeet|calculatedfinishedsquarefeet|finishedsquarefeet12|finishedsquarefeet13|finishedsquarefeet15|finishedsquarefeet50|finishedsquarefeet6| fips| fireplacecnt| fullbathcnt| garagecarcnt| garagetotalsqft|heatingorsystemtypeid| latitude| longitude| lotsizesquarefeet|poolcnt| 
poolsizesum|pooltypeid10|pooltypeid2|pooltypeid7|propertycountylandusecode|propertylandusetypeid| propertyzoningdesc|rawcensustractandblock| regionidcity| regionidcounty|regionidneighborhood| regionidzip| roomcnt|storytypeid|threequarterbathnbr|typeconstructiontypeid| unitcnt|yardbuildingsqft17|yardbuildingsqft26| yearbuilt| numberofstories|structuretaxvaluedollarcnt| taxvaluedollarcnt| assessmentyear|landtaxvaluedollarcnt| taxamount|taxdelinquencyflag|taxdelinquencyyear| censustractandblock|\n", 727 | "+-------+--------------------+---------------------+------------------------+-----------------+------------------+------------------+-------------------+---------------------+------------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+---------------------+-------------------+--------------------+------------------+-------+------------------+------------+-----------+-----------+-------------------------+---------------------+--------------------+----------------------+-----------------+------------------+--------------------+-----------------+------------------+-----------+-------------------+----------------------+------------------+------------------+------------------+------------------+------------------+--------------------------+------------------+-------------------+---------------------+-----------------+------------------+------------------+--------------------+\n", 728 | "| count| 2985217| 811519| 6061| 1628| 2973755| 2973767| 12629| 1938488| 2856305| 17096| 202717| 2929652| 2709184| 7672| 190798| 202717| 22001| 2973780| 312637| 2856305| 883267| 883267| 1806401| 2973780| 2973780| 2709118| 517534| 27960| 36939| 32075| 485459| 2972940| 2973780| 1978629| 2973780| 2922372| 2973780| 1156402| 2971237| 2973742| 1624| 311631| 6747| 1977490| 80355| 2647| 2925289| 682069| 2930235| 2942667| 2973778| 2917484| 2953967| 56462| 56464| 2910091|\n", 729 | "| mean|1.3325858360229759E7| 1.9311661218036793| 7.202606830556014|646.8832923832924|2.2091427336818263| 3.088948797938776| 3.7259482144271123| 5.784786906083505|2.2992625087306853| 66.0| 1380.6303960693972| 1827.1621236925068| 1760.0006079321302| 1178.900677789364| 2739.18723466703| 1388.9445779091047| 2414.339439116404| 6048.031600185623| 1.1687100375195514|2.2441651014159905|1.8235165584132544| 383.7693573970272| 4.012053248420478|3.400146865372119E7|-1.18201934159426...|22822.805527748147| 1.0| 519.7109799713877| 1.0| 1.0| 1.0| 199.5320980966015| 260.0484285992911| 5.46084705882353E8| 6.048344961635102E7|34993.35022406456|2570.4605535715486| 193476.4074145496|96552.67280025121|1.4750183438912992| 7.0| 1.0100086320038764| 5.999555357936861| 1.181171080511153| 319.8033974239313| 278.2965621458255|1964.2616411575061|1.4014637815235702| 170883.57716599523|420478.99067852396| 2014.999458937419| 252478.02946854208|5377.607139338332| NULL|13.892409322754322|6.048431221257243E13|\n", 730 | "| stddev| 7909966.389233432| 3.148587394577264| 2.436290490710878|538.7934732127098|1.0777537772255268|1.2758587961101613| 0.5017002111297728| 1.8053515795599582|1.0007362395982085| 0.0| 632.8685428862445| 1819.7804693000555| 971.0610103785792| 357.07303551809184| 5447.428327204328| 664.4887085429802| 7695.302951762993|20.232784692561076|0.46127285457915057|0.9912053996708714|0.6100353832595409|245.44341897378612| 3.293732688713869| 
243381.17831128882| 345317.10127200687| 337592.366407657| 0.0|191.32328381052514| 0.0| 0.0| 0.0| 302.825330209084| 15.908166600884176|2.0614819081932812E9| 200811.6754489488| 50727.4653888151| 788.071140066596| 165713.25431675857|3673.175037540778| 2.840402806614331| 0.0|0.11770930082463944| 0.38405027536042613|2.4478959553912745|233.08631396807013| 369.7315077596207|23.441319348584372|0.5390757507737586| 402068.3420150093| 726346.6517993591|0.03683161097766099| 445013.16961781326|9183.107127994226| NULL|2.5810057224984697|3.249034547374049E11|\n", 731 | "| min| 10711725| 1| 2| 20| 0.0| 0.0| 1| 1| 1.0| 66| 3| 1.0| 1| 120| 112| 3| 117| 6037| 1| 1| 0| 0| 1| 33324388| -119475780| 100.0| 1| 19| 1| 1| 1| 0| 31| #12| 6.0371011101E7| 3491| 1286| 6952| 95982| 0.0| 7| 1| 4| 1| 10| 10| 1801.0| 1| 1.0| 1.0| 2000| 1.0| 1.34| Y| 0| -1|\n", 732 | "| max| 169601949| 13| 27| 8516| 20.0| 20.0| 5| 12| 20.0| 66| 31303| 952576.0| 290345| 2688| 820242| 31303| 952576| 6111| 9| 20| 25| 7749| 24| 34819650| -117554316| 3.28263808E8| 1| 17410| 1| 1| 1| SFR| 275| ZONE LCC3| 6.1110091003011E7| 396556| 3101| 764167| 399675| 96.0| 7| 7| 13| 997| 7983| 6141| 2015.0| 41| 2.51486E8| 2.82786E8| 2016| 9.0246219E7| 3458861.12| Y| 99| 483030105084015|\n", 733 | "+-------+--------------------+---------------------+------------------------+-----------------+------------------+------------------+-------------------+---------------------+------------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+---------------------+-------------------+--------------------+------------------+-------+------------------+------------+-----------+-----------+-------------------------+---------------------+--------------------+----------------------+-----------------+------------------+--------------------+-----------------+------------------+-----------+-------------------+----------------------+------------------+------------------+------------------+------------------+------------------+--------------------------+------------------+-------------------+---------------------+-----------------+------------------+------------------+--------------------+\n", 734 | "\n" 735 | ] 736 | } 737 | ], 738 | "source": [ 739 | "df.describe().show()" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "id": "e7f4d289", 745 | "metadata": { 746 | "id": "e7f4d289" 747 | }, 748 | "source": [ 749 | "# Clean Data" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": null, 755 | "id": "eddda1f7", 756 | "metadata": { 757 | "id": "eddda1f7" 758 | }, 759 | "outputs": [], 760 | "source": [ 761 | "# Drop duplicates\n", 762 | "df = df.dropDuplicates()" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "id": "2d651e5b", 769 | "metadata": { 770 | "colab": { 771 | "base_uri": "https://localhost:8080/" 772 | }, 773 | "id": "2d651e5b", 774 | "outputId": "c59bf422-2e51-410e-fa39-b18c726e4e46" 775 | }, 776 | "outputs": [ 777 | { 778 | "name": "stdout", 779 | "output_type": "stream", 780 | "text": [ 781 | 
"+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----+------------+-----------+------------+---------------+--------------+---------------------+--------+---------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 782 | "|parcelid|airconditioningtypeid|architecturalstyletypeid|basementsqft|bathroomcnt|bedroomcnt|buildingclasstypeid|buildingqualitytypeid|calculatedbathnbr|decktypeid|finishedfloor1squarefeet|calculatedfinishedsquarefeet|finishedsquarefeet12|finishedsquarefeet13|finishedsquarefeet15|finishedsquarefeet50|finishedsquarefeet6| fips|fireplacecnt|fullbathcnt|garagecarcnt|garagetotalsqft|hashottuborspa|heatingorsystemtypeid|latitude|longitude|lotsizesquarefeet|poolcnt|poolsizesum|pooltypeid10|pooltypeid2|pooltypeid7|propertycountylandusecode|propertylandusetypeid|propertyzoningdesc|rawcensustractandblock|regionidcity|regionidcounty|regionidneighborhood|regionidzip|roomcnt|storytypeid|threequarterbathnbr|typeconstructiontypeid|unitcnt|yardbuildingsqft17|yardbuildingsqft26|yearbuilt|numberofstories|fireplaceflag|structuretaxvaluedollarcnt|taxvaluedollarcnt|assessmentyear|landtaxvaluedollarcnt|taxamount|taxdelinquencyflag|taxdelinquencyyear|censustractandblock|\n", 783 | "+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----+------------+-----------+------------+---------------+--------------+---------------------+--------+---------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 784 | "| 0| 2173698| 2979156| 2983589| 11462| 11450| 2972588| 1046729| 128912| 2968121| 2782500| 55565| 276033| 2977545| 2794419| 2782500| 2963216|11437| 2672580| 128912| 2101950| 2101950| 2916203| 1178816| 11437| 11437| 276099|2467683| 2957257| 2948278| 2953142| 2499758| 12277| 11437| 1006588| 11437| 62845| 11437| 1828815| 13980| 11475| 2983593| 2673586| 2978470|1007727| 2904862| 2982570| 59928| 2303148| 2980054| 54982| 42550| 11439| 67733| 31250| 2928755| 2928753| 75126|\n", 785 | 
"+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----+------------+-----------+------------+---------------+--------------+---------------------+--------+---------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 786 | "\n" 787 | ] 788 | } 789 | ], 790 | "source": [ 791 | "# Get the missing value of each column\n", 792 | "null_counts = df.select([sum(col(column).isNull().cast(\"int\")).alias(column) for column in df.columns])\n", 793 | "null_counts.show()" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": null, 799 | "id": "c158a9c4-c639-47f5-9e29-6ac483daced6", 800 | "metadata": { 801 | "id": "c158a9c4-c639-47f5-9e29-6ac483daced6" 802 | }, 803 | "outputs": [], 804 | "source": [ 805 | "# Visualize missing value on each column\n", 806 | "pandas_df = df.toPandas()\n", 807 | "missing_count = pandas_df.isna().sum()\n", 808 | "sns.barplot(x=missing_count.index, y=missing_count.values)\n", 809 | "plt.title('Numbers Missing Value on each column')\n", 810 | "plt.xlabel('Column')\n", 811 | "plt.ylabel('Numbers Missing')\n", 812 | "plt.show()" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "id": "4b1b542f", 819 | "metadata": { 820 | "colab": { 821 | "background_save": true 822 | }, 823 | "id": "4b1b542f" 824 | }, 825 | "outputs": [], 826 | "source": [ 827 | "# Drop columns that are more than 60% missing\n", 828 | "def column_dropper(df, threshold):\n", 829 | " total_records = df.count()\n", 830 | " for col in df.columns:\n", 831 | " missing = df.filter(df[col].isNull()).count()\n", 832 | " missing_percent = missing / total_records\n", 833 | " if missing_percent > threshold:\n", 834 | " df = df.drop(col)\n", 835 | " return df\n", 836 | "\n", 837 | "df = column_dropper(df, 0.6)" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "id": "41bd87bb", 844 | "metadata": { 845 | "colab": { 846 | "base_uri": "https://localhost:8080/" 847 | }, 848 | "id": "41bd87bb", 849 | "outputId": "ea48b121-fe86-4c50-cc77-c284beaec8c7" 850 | }, 851 | "outputs": [ 852 | { 853 | "name": "stdout", 854 | "output_type": "stream", 855 | "text": [ 856 | 
"+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----+------------+-----------+------------+---------------+--------------+---------------------+--------+---------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 857 | "|parcelid|airconditioningtypeid|architecturalstyletypeid|basementsqft|bathroomcnt|bedroomcnt|buildingclasstypeid|buildingqualitytypeid|calculatedbathnbr|decktypeid|finishedfloor1squarefeet|calculatedfinishedsquarefeet|finishedsquarefeet12|finishedsquarefeet13|finishedsquarefeet15|finishedsquarefeet50|finishedsquarefeet6| fips|fireplacecnt|fullbathcnt|garagecarcnt|garagetotalsqft|hashottuborspa|heatingorsystemtypeid|latitude|longitude|lotsizesquarefeet|poolcnt|poolsizesum|pooltypeid10|pooltypeid2|pooltypeid7|propertycountylandusecode|propertylandusetypeid|propertyzoningdesc|rawcensustractandblock|regionidcity|regionidcounty|regionidneighborhood|regionidzip|roomcnt|storytypeid|threequarterbathnbr|typeconstructiontypeid|unitcnt|yardbuildingsqft17|yardbuildingsqft26|yearbuilt|numberofstories|fireplaceflag|structuretaxvaluedollarcnt|taxvaluedollarcnt|assessmentyear|landtaxvaluedollarcnt|taxamount|taxdelinquencyflag|taxdelinquencyyear|censustractandblock|\n", 858 | "+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----+------------+-----------+------------+---------------+--------------+---------------------+--------+---------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 859 | "| 0| 2173698| 2979156| 2983589| 11462| 11450| 2972588| 1046729| 128912| 2968121| 2782500| 55565| 276033| 2977545| 2794419| 2782500| 2963216|11437| 2672580| 128912| 2101950| 2101950| 2916203| 1178816| 11437| 11437| 276099|2467683| 2957257| 2948278| 2953142| 2499758| 12277| 11437| 1006588| 11437| 62845| 11437| 1828815| 13980| 11475| 2983593| 2673586| 2978470|1007727| 2904862| 2982570| 59928| 2303148| 2980054| 54982| 42550| 11439| 67733| 31250| 2928755| 2928753| 75126|\n", 860 | 
"+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----+------------+-----------+------------+---------------+--------------+---------------------+--------+---------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 861 | "\n" 862 | ] 863 | } 864 | ], 865 | "source": [ 866 | "# columns remaining after deletion\n", 867 | "null_counts = df.select([sum(col(column).isNull().cast(\"int\")).alias(column) for column in df.columns])\n", 868 | "null_counts.show()" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": null, 874 | "id": "I_Mx_6U2MJ93", 875 | "metadata": { 876 | "id": "I_Mx_6U2MJ93" 877 | }, 878 | "outputs": [], 879 | "source": [ 880 | "df.show()" 881 | ] 882 | }, 883 | { 884 | "cell_type": "code", 885 | "execution_count": null, 886 | "id": "KsG0VYVsUFAt", 887 | "metadata": { 888 | "id": "KsG0VYVsUFAt" 889 | }, 890 | "outputs": [], 891 | "source": [ 892 | "PARCELID: 0\n", 893 | "BATHROOMCNT: 11462\n", 894 | "BEDROOMCNT: 11450\n", 895 | "BUILDINGQUALITYTYPEID: 1046729\n", 896 | "CALCULATEDBATHNBR: 128912\n", 897 | "CALCULATEDFINISHEDSQUAREFEET: 55565\n", 898 | "FINISHEDSQUAREFEET12: 276033\n", 899 | "FIPS: 11437\n", 900 | "FULLBATHCNT: 128912\n", 901 | "HEATINGORSYSTEMTYPEID: 1178816\n", 902 | "LATITUDE: 11437\n", 903 | "LONGITUDE: 11437\n", 904 | "LOTSIZESQUAREFEET: 276099\n", 905 | "PROPERTYCOUNTYLANDUSECODE: 12277\n", 906 | "PROPERTYLANDUSETYPEID: 11437\n", 907 | "PROPERTYZONINGDESC: 1006588\n", 908 | "RAWCENSUSTRACTANDBLOCK: 11437\n", 909 | "REGIONIDCITY: 62845\n", 910 | "REGIONIDCOUNTY: 11437\n", 911 | "REGIONIDZIP: 13980\n", 912 | "ROOMCNT: 11475\n", 913 | "UNITCNT: 1007727\n", 914 | "YEARBUILT: 59928\n", 915 | "STRUCTURETAXVALUEDOLLARCNT: 54982\n", 916 | "TAXVALUEDOLLARCNT: 42550\n", 917 | "ASSESSMENTYEAR: 11439\n", 918 | "LANDTAXVALUEDOLLARCNT: 67733\n", 919 | "TAXAMOUNT: 31250\n", 920 | "CENSUSTRACTANDBLOCK: 75126" 921 | ] 922 | }, 923 | { 924 | "cell_type": "code", 925 | "execution_count": null, 926 | "id": "f958f97a", 927 | "metadata": { 928 | "colab": { 929 | "base_uri": "https://localhost:8080/" 930 | }, 931 | "id": "f958f97a", 932 | "outputId": "3abae1d5-c133-4199-c41c-065c365b9c9f" 933 | }, 934 | "outputs": [ 935 | { 936 | "data": { 937 | "text/plain": [ 938 | "58" 939 | ] 940 | }, 941 | "execution_count": 23, 942 | "metadata": {}, 943 | "output_type": "execute_result" 944 | } 945 | ], 946 | "source": [ 947 | "# Fill miss value\n", 948 | "values = {\n", 949 | " 'bathroomcnt': 'value1',\n", 950 | " 'bedroomcnt': 'value2',\n", 951 | " 'buildingqualitytypeid': \"\",\n", 952 | " \"CALCULATEDBATHNBR\": 128912,\n", 953 | " \"CALCULATEDFINISHEDSQUAREFEET\": 55565,\n", 954 | " \"FINISHEDSQUAREFEET12\": 276033,\n", 955 | " \"FIPS\": 11437,\n", 956 | " \"FULLBATHCNT\": 128912,\n", 957 | " \"HEATINGORSYSTEMTYPEID\": 
1178816,\n", 958 | " \"LATITUDE\": 11437,\n", 959 | " \"LONGITUDE\": 11437,\n", 960 | " \"LOTSIZESQUAREFEET\": 276099,\n", 961 | " \"PROPERTYCOUNTYLANDUSECODE\": 12277,\n", 962 | " \"PROPERTYLANDUSETYPEID\": 11437,\n", 963 | " \"PROPERTYZONINGDESC\": 1006588,\n", 964 | " \"RAWCENSUSTRACTANDBLOCK\": 11437,\n", 965 | " \"REGIONIDCITY\": 62845,\n", 966 | " \"REGIONIDCOUNTY\": 11437,\n", 967 | " \"REGIONIDZIP\": 13980,\n", 968 | " \"ROOMCNT\": 11475,\n", 969 | " \"UNITCNT\": 1007727,\n", 970 | " \"YEARBUILT\": 59928,\n", 971 | " \"STRUCTURETAXVALUEDOLLARCNT\": 54982,\n", 972 | " \"TAXVALUEDOLLARCNT\": 42550,\n", 973 | " \"ASSESSMENTYEAR\": 11439,\n", 974 | " \"LANDTAXVALUEDOLLARCNT\": 67733,\n", 975 | " \"TAXAMOUNT\": 31250,\n", 976 | " \"CENSUSTRACTANDBLOCK\": 75126\n", 977 | "}\n", 978 | "filled_df = df.fillna(values)" 979 | ] 980 | }, 981 | { 982 | "cell_type": "code", 983 | "execution_count": null, 984 | "id": "OTkG-6cMOxEt", 985 | "metadata": { 986 | "id": "OTkG-6cMOxEt" 987 | }, 988 | "outputs": [], 989 | "source": [ 990 | "null_counts = df.select([sum(col(column).isNull().cast(\"int\")).alias(column) for column in df.columns])\n", 991 | "null_counts.show()" 992 | ] 993 | }, 994 | { 995 | "cell_type": "markdown", 996 | "id": "dnypT2myKfsh", 997 | "metadata": { 998 | "id": "dnypT2myKfsh" 999 | }, 1000 | "source": [ 1001 | "# Feature Engineering" 1002 | ] 1003 | }, 1004 | { 1005 | "cell_type": "code", 1006 | "execution_count": null, 1007 | "id": "3f5d77f7", 1008 | "metadata": { 1009 | "id": "3f5d77f7" 1010 | }, 1011 | "outputs": [], 1012 | "source": [ 1013 | "# One-hot encoding for 'bathroomcnt'\n", 1014 | "encoder_bathroomcnt = OneHotEncoder(inputCols=['bathroomcnt'], outputCols=['bathroomcnt_dummy'])\n", 1015 | "df = encoder_bathroomcnt.fit(df).transform(df)\n" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "code", 1020 | "execution_count": null, 1021 | "id": "5c50d08f", 1022 | "metadata": { 1023 | "id": "5c50d08f" 1024 | }, 1025 | "outputs": [], 1026 | "source": [ 1027 | "# One-hot encoding for 'bedroomcnt'\n", 1028 | "encoder_bedroomcnt = OneHotEncoder(inputCols=['bedroomcnt'], outputCols=['bedroomcnt_dummy'])\n", 1029 | "df = encoder_bedroomcnt.fit(df).transform(df)" 1030 | ] 1031 | }, 1032 | { 1033 | "cell_type": "code", 1034 | "execution_count": null, 1035 | "id": "Z2pj086_RJFl", 1036 | "metadata": { 1037 | "id": "Z2pj086_RJFl" 1038 | }, 1039 | "outputs": [], 1040 | "source": [ 1041 | "# Assemble features into a single vector column\n", 1042 | "assembler = VectorAssembler(inputCols=['roomcnt', 'latitude', 'longitude', 'bathroomcnt_dummy', 'bedroomcnt_dummy'], outputCol='features')\n", 1043 | "df = assembler.transform(df)" 1044 | ] 1045 | }, 1046 | { 1047 | "cell_type": "markdown", 1048 | "id": "ZScPc9wxREPs", 1049 | "metadata": { 1050 | "id": "ZScPc9wxREPs" 1051 | }, 1052 | "source": [ 1053 | "# Build Linear Regression Model" 1054 | ] 1055 | }, 1056 | { 1057 | "cell_type": "code", 1058 | "execution_count": null, 1059 | "id": "3ef37703", 1060 | "metadata": { 1061 | "id": "3ef37703" 1062 | }, 1063 | "outputs": [], 1064 | "source": [ 1065 | "# Split the data\n", 1066 | "train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)" 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "code", 1071 | "execution_count": null, 1072 | "id": "cadb50de", 1073 | "metadata": { 1074 | "id": "cadb50de" 1075 | }, 1076 | "outputs": [], 1077 | "source": [ 1078 | "# Build the model\n", 1079 | "regression = LinearRegression(featuresCol='features', labelCol='duration')\n", 1080 | "model = 
regression.fit(train_data)" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "code", 1085 | "execution_count": null, 1086 | "id": "b56f0ff4", 1087 | "metadata": { 1088 | "id": "b56f0ff4" 1089 | }, 1090 | "outputs": [], 1091 | "source": [ 1092 | "# Make predictions\n", 1093 | "predictions = model.transform(test_data)" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "markdown", 1098 | "id": "FeP14XfNRgzS", 1099 | "metadata": { 1100 | "id": "FeP14XfNRgzS" 1101 | }, 1102 | "source": [ 1103 | "# Evaluate Model" 1104 | ] 1105 | }, 1106 | { 1107 | "cell_type": "code", 1108 | "execution_count": null, 1109 | "id": "9e490664", 1110 | "metadata": { 1111 | "id": "9e490664" 1112 | }, 1113 | "outputs": [], 1114 | "source": [ 1115 | "# Evaluate the model\n", 1116 | "evaluator = RegressionEvaluator(labelCol='duration', metricName='rmse')\n", 1117 | "rmse = evaluator.evaluate(predictions)\n", 1118 | "print(\"Root Mean Square Error (RMSE) on test data =\", rmse)\n", 1119 | "\n", 1120 | "# Print coefficients and intercept for interpretation\n", 1121 | "print(\"Coefficients:\", model.coefficients)\n", 1122 | "print(\"Intercept:\", model.intercept)" 1123 | ] 1124 | }, 1125 | { 1126 | "cell_type": "code", 1127 | "execution_count": null, 1128 | "id": "c2f0476d", 1129 | "metadata": { 1130 | "id": "c2f0476d" 1131 | }, 1132 | "outputs": [], 1133 | "source": [] 1134 | }, 1135 | { 1136 | "cell_type": "code", 1137 | "execution_count": null, 1138 | "id": "25938f77", 1139 | "metadata": { 1140 | "id": "25938f77" 1141 | }, 1142 | "outputs": [], 1143 | "source": [] 1144 | }, 1145 | { 1146 | "cell_type": "code", 1147 | "execution_count": null, 1148 | "id": "a029e08d", 1149 | "metadata": { 1150 | "id": "a029e08d" 1151 | }, 1152 | "outputs": [], 1153 | "source": [] 1154 | }, 1155 | { 1156 | "cell_type": "code", 1157 | "execution_count": null, 1158 | "id": "4a4ad2d6", 1159 | "metadata": { 1160 | "id": "4a4ad2d6" 1161 | }, 1162 | "outputs": [], 1163 | "source": [] 1164 | }, 1165 | { 1166 | "cell_type": "code", 1167 | "execution_count": null, 1168 | "id": "31873dac", 1169 | "metadata": { 1170 | "id": "31873dac" 1171 | }, 1172 | "outputs": [], 1173 | "source": [] 1174 | }, 1175 | { 1176 | "cell_type": "code", 1177 | "execution_count": null, 1178 | "id": "0d65e838", 1179 | "metadata": { 1180 | "id": "0d65e838" 1181 | }, 1182 | "outputs": [], 1183 | "source": [] 1184 | }, 1185 | { 1186 | "cell_type": "code", 1187 | "execution_count": null, 1188 | "id": "041add92", 1189 | "metadata": { 1190 | "id": "041add92" 1191 | }, 1192 | "outputs": [], 1193 | "source": [] 1194 | }, 1195 | { 1196 | "cell_type": "code", 1197 | "execution_count": null, 1198 | "id": "fa5fb063-3ed4-40bb-a566-f9eab2ca520e", 1199 | "metadata": { 1200 | "id": "fa5fb063-3ed4-40bb-a566-f9eab2ca520e" 1201 | }, 1202 | "outputs": [], 1203 | "source": [] 1204 | }, 1205 | { 1206 | "cell_type": "code", 1207 | "execution_count": null, 1208 | "id": "8d651e75-dc8a-4bf4-ba9f-de4e27ddfa04", 1209 | "metadata": { 1210 | "id": "8d651e75-dc8a-4bf4-ba9f-de4e27ddfa04" 1211 | }, 1212 | "outputs": [], 1213 | "source": [] 1214 | }, 1215 | { 1216 | "cell_type": "code", 1217 | "execution_count": null, 1218 | "id": "a4916a6a-f1e8-49c9-850d-f94639c337e7", 1219 | "metadata": { 1220 | "id": "a4916a6a-f1e8-49c9-850d-f94639c337e7" 1221 | }, 1222 | "outputs": [], 1223 | "source": [] 1224 | }, 1225 | { 1226 | "cell_type": "code", 1227 | "execution_count": null, 1228 | "id": "cd1eb575-3b8b-49c2-943f-3f79a908d626", 1229 | "metadata": { 1230 | "id": "cd1eb575-3b8b-49c2-943f-3f79a908d626" 1231 | }, 1232 | 
"outputs": [], 1233 | "source": [] 1234 | }, 1235 | { 1236 | "cell_type": "code", 1237 | "execution_count": null, 1238 | "id": "0907696c-cc0c-4d33-9707-89696752a4d1", 1239 | "metadata": { 1240 | "id": "0907696c-cc0c-4d33-9707-89696752a4d1" 1241 | }, 1242 | "outputs": [], 1243 | "source": [] 1244 | }, 1245 | { 1246 | "cell_type": "code", 1247 | "execution_count": null, 1248 | "id": "29b355ad", 1249 | "metadata": { 1250 | "id": "29b355ad" 1251 | }, 1252 | "outputs": [], 1253 | "source": [] 1254 | }, 1255 | { 1256 | "cell_type": "markdown", 1257 | "id": "6d5349b1-4b09-4ace-9b02-a6eed61843bd", 1258 | "metadata": { 1259 | "id": "6d5349b1-4b09-4ace-9b02-a6eed61843bd" 1260 | }, 1261 | "source": [ 1262 | "# Pre-Processing Data" 1263 | ] 1264 | }, 1265 | { 1266 | "cell_type": "markdown", 1267 | "id": "8788823a-5c40-481c-8dea-97ce436899bc", 1268 | "metadata": { 1269 | "id": "8788823a-5c40-481c-8dea-97ce436899bc" 1270 | }, 1271 | "source": [ 1272 | "### 1. Check Data" 1273 | ] 1274 | }, 1275 | { 1276 | "cell_type": "code", 1277 | "execution_count": null, 1278 | "id": "7c962808-b082-48cd-8b04-11b8d821f32c", 1279 | "metadata": { 1280 | "id": "7c962808-b082-48cd-8b04-11b8d821f32c" 1281 | }, 1282 | "outputs": [], 1283 | "source": [ 1284 | "df = spark.read.csv(\"properties_2016.csv\", header=True, inferSchema=True)" 1285 | ] 1286 | }, 1287 | { 1288 | "cell_type": "code", 1289 | "execution_count": null, 1290 | "id": "bd61df3c-936d-4edf-8cc4-0d7012bee2ce", 1291 | "metadata": { 1292 | "id": "bd61df3c-936d-4edf-8cc4-0d7012bee2ce" 1293 | }, 1294 | "outputs": [], 1295 | "source": [ 1296 | "df.limit(10)" 1297 | ] 1298 | }, 1299 | { 1300 | "cell_type": "code", 1301 | "execution_count": null, 1302 | "id": "1578e940-ec6f-4867-a345-bc1c9d144727", 1303 | "metadata": { 1304 | "id": "1578e940-ec6f-4867-a345-bc1c9d144727" 1305 | }, 1306 | "outputs": [], 1307 | "source": [ 1308 | "# convert all column names to uppercase\n", 1309 | "for col in df.columns:\n", 1310 | " df = df.withColumnRenamed(col, col.upper())" 1311 | ] 1312 | }, 1313 | { 1314 | "cell_type": "markdown", 1315 | "id": "b0a960fc-0f60-4093-b135-04fb779d33cc", 1316 | "metadata": { 1317 | "id": "b0a960fc-0f60-4093-b135-04fb779d33cc" 1318 | }, 1319 | "source": [ 1320 | "### 2. check descriptive statistics" 1321 | ] 1322 | }, 1323 | { 1324 | "cell_type": "code", 1325 | "execution_count": null, 1326 | "id": "7dc0f07a-6b10-4cf5-a15b-2469b0907a9b", 1327 | "metadata": { 1328 | "id": "7dc0f07a-6b10-4cf5-a15b-2469b0907a9b" 1329 | }, 1330 | "outputs": [], 1331 | "source": [ 1332 | "df.describe().limit(20)" 1333 | ] 1334 | }, 1335 | { 1336 | "cell_type": "markdown", 1337 | "id": "80beb77e-de1e-4ea3-8a73-e3c4619e614b", 1338 | "metadata": { 1339 | "id": "80beb77e-de1e-4ea3-8a73-e3c4619e614b" 1340 | }, 1341 | "source": [ 1342 | "### 3. Check DataType" 1343 | ] 1344 | }, 1345 | { 1346 | "cell_type": "code", 1347 | "execution_count": null, 1348 | "id": "d048bb60-008b-4c79-87dc-6ae9703bcbea", 1349 | "metadata": { 1350 | "id": "d048bb60-008b-4c79-87dc-6ae9703bcbea" 1351 | }, 1352 | "outputs": [], 1353 | "source": [ 1354 | "df.printSchema()" 1355 | ] 1356 | }, 1357 | { 1358 | "cell_type": "markdown", 1359 | "id": "3b3d485b-303e-482f-966c-9692f60c315c", 1360 | "metadata": { 1361 | "id": "3b3d485b-303e-482f-966c-9692f60c315c" 1362 | }, 1363 | "source": [ 1364 | "### 4. 
Check the Current Number of Columns and Rows" 1365 | ] 1366 | }, 1367 | { 1368 | "cell_type": "code", 1369 | "execution_count": null, 1370 | "id": "1869a2ac-0579-44b0-b7ea-d0f24852bbc2", 1371 | "metadata": { 1372 | "id": "1869a2ac-0579-44b0-b7ea-d0f24852bbc2" 1373 | }, 1374 | "outputs": [], 1375 | "source": [ 1376 | "# Columns\n", 1377 | "len(df.columns)" 1378 | ] 1379 | }, 1380 | { 1381 | "cell_type": "code", 1382 | "execution_count": null, 1383 | "id": "c63758b3-a367-4848-b6ae-f492f60ce9f8", 1384 | "metadata": { 1385 | "id": "c63758b3-a367-4848-b6ae-f492f60ce9f8" 1386 | }, 1387 | "outputs": [], 1388 | "source": [ 1389 | "# Rows\n", 1390 | "df.count()" 1391 | ] 1392 | }, 1393 | { 1394 | "cell_type": "markdown", 1395 | "id": "337dffc1-edd9-454d-b07a-87e02566685f", 1396 | "metadata": { 1397 | "id": "337dffc1-edd9-454d-b07a-87e02566685f" 1398 | }, 1399 | "source": [ 1400 | "### 5. Drop Duplicates" 1401 | ] 1402 | }, 1403 | { 1404 | "cell_type": "code", 1405 | "execution_count": null, 1406 | "id": "fb672b6c-e6be-4d3c-b0bb-0b4a4ab29cab", 1407 | "metadata": { 1408 | "id": "fb672b6c-e6be-4d3c-b0bb-0b4a4ab29cab" 1409 | }, 1410 | "outputs": [], 1411 | "source": [ 1412 | "df = df.dropDuplicates()" 1413 | ] 1414 | }, 1415 | { 1416 | "cell_type": "markdown", 1417 | "id": "1c16498a-6da2-4fc1-8403-437795fdff61", 1418 | "metadata": { 1419 | "id": "1c16498a-6da2-4fc1-8403-437795fdff61" 1420 | }, 1421 | "source": [ 1422 | "### 6. Check Missing Values" 1423 | ] 1424 | }, 1425 | { 1426 | "cell_type": "code", 1427 | "execution_count": null, 1428 | "id": "68e60e46-6a73-4f7d-87ee-014e7ddbef1c", 1429 | "metadata": { 1430 | "id": "68e60e46-6a73-4f7d-87ee-014e7ddbef1c" 1431 | }, 1432 | "outputs": [], 1433 | "source": [ 1434 | "def check_null_count():\n", 1435 | " for column in df.columns:\n", 1436 | " null_count = df.filter(df[column].isNull()).count()\n", 1437 | " print(f\"{column}: {null_count}\")" 1438 | ] 1439 | }, 1440 | { 1441 | "cell_type": "code", 1442 | "execution_count": null, 1443 | "id": "ddaf99dd-78e8-4ece-82ed-6081bee69b60", 1444 | "metadata": { 1445 | "id": "ddaf99dd-78e8-4ece-82ed-6081bee69b60" 1446 | }, 1447 | "outputs": [], 1448 | "source": [ 1449 | "# Number of missing values in each column\n", 1450 | "check_null_count()" 1451 | ] 1452 | }, 1453 | { 1454 | "cell_type": "markdown", 1455 | "id": "42fbdaaa-1895-4d31-9ed8-2153261108e5", 1456 | "metadata": { 1457 | "id": "42fbdaaa-1895-4d31-9ed8-2153261108e5" 1458 | }, 1459 | "source": [ 1460 | "##### Drop columns with more than 60% missing" 1461 | ] 1462 | }, 1463 | { 1464 | "cell_type": "code", 1465 | "execution_count": null, 1466 | "id": "59979bc8-5f48-4697-857e-b0bd65b00472", 1467 | "metadata": { 1468 | "id": "59979bc8-5f48-4697-857e-b0bd65b00472" 1469 | }, 1470 | "outputs": [], 1471 | "source": [ 1472 | "def column_dropper(df, threshold):\n", 1473 | " # Takes a dataframe and threshold for missing values. 
Returns a dataframe.\n", 1474 | " total_records = df.count()\n", 1475 | " for col in df.columns:\n", 1476 | " # Calculate the percentage of missing values\n", 1477 | " missing = df.where(df[col].isNull()).count()\n", 1478 | " missing_percent = missing / total_records\n", 1479 | " # Drop column if percent of missing is more than threshold\n", 1480 | " if missing_percent > threshold:\n", 1481 | " df = df.drop(col)\n", 1482 | " return df\n", 1483 | "\n", 1484 | "# Drop columns that are more than 60% missing\n", 1485 | "df = column_dropper(df, 0.6)" 1486 | ] 1487 | }, 1488 | { 1489 | "cell_type": "code", 1490 | "execution_count": null, 1491 | "id": "dfd6df2b-4a42-4a62-ae85-0d882c85405f", 1492 | "metadata": { 1493 | "id": "dfd6df2b-4a42-4a62-ae85-0d882c85405f" 1494 | }, 1495 | "outputs": [], 1496 | "source": [ 1497 | "check_null_count()" 1498 | ] 1499 | }, 1500 | { 1501 | "cell_type": "markdown", 1502 | "id": "b7526960-0c18-4577-ac67-b6542f6ff17e", 1503 | "metadata": { 1504 | "id": "b7526960-0c18-4577-ac67-b6542f6ff17e" 1505 | }, 1506 | "source": [ 1507 | "### 7. Outlier Filtering" 1508 | ] 1509 | }, 1510 | { 1511 | "cell_type": "code", 1512 | "execution_count": null, 1513 | "id": "af033c41-07ea-4474-bcce-9d388e150aef", 1514 | "metadata": { 1515 | "id": "af033c41-07ea-4474-bcce-9d388e150aef" 1516 | }, 1517 | "outputs": [], 1518 | "source": [ 1519 | "mean_val = df.agg({'BATHROOMCNT': 'mean'}).collect()[0][0]\n", 1520 | "stddev_val = df.agg({'BATHROOMCNT': 'stddev'}).collect()[0][0]\n", 1521 | "\n", 1522 | "low_bound = mean_val - (3 * stddev_val)\n", 1523 | "hi_bound = mean_val + (3 * stddev_val)\n", 1524 | "\n", 1525 | "df = df.where((df['BATHROOMCNT'] < hi_bound) & (df['BATHROOMCNT'] > low_bound))" 1526 | ] 1527 | }, 1528 | { 1529 | "cell_type": "markdown", 1530 | "id": "cb41849c-1339-4926-9c6c-3a23507d52ff", 1531 | "metadata": { 1532 | "id": "cb41849c-1339-4926-9c6c-3a23507d52ff" 1533 | }, 1534 | "source": [ 1535 | "### 8. Standardize Data (Z-score)" 1536 | ] 1537 | }, 1538 | { 1539 | "cell_type": "code", 1540 | "execution_count": null, 1541 | "id": "5e57b8b8-3ce2-401d-8dda-2402a608f5ee", 1542 | "metadata": { 1543 | "id": "5e57b8b8-3ce2-401d-8dda-2402a608f5ee" 1544 | }, 1545 | "outputs": [], 1546 | "source": [ 1547 | "mean = df.agg({'BATHROOMCNT': 'mean'}).collect()[0][0]\n", 1548 | "stddev = df.agg({'BATHROOMCNT': 'stddev'}).collect()[0][0]\n", 1549 | "# Create a new column with the scaled data\n", 1550 | "df = df.withColumn(\"BATHROOMCNT_Z\", (df['BATHROOMCNT'] - mean) / stddev)\n", 1551 | "print(\"mean:\", df.agg({'BATHROOMCNT_Z': 'mean'}).collect()[0][0])\n", 1552 | "print(\"stddev:\", df.agg({'BATHROOMCNT_Z': 'stddev'}).collect()[0][0])" 1553 | ] 1554 | }, 1555 | { 1556 | "cell_type": "markdown", 1557 | "id": "56d5a156-5472-4c77-b72f-173974721375", 1558 | "metadata": { 1559 | "id": "56d5a156-5472-4c77-b72f-173974721375" 1560 | }, 1561 | "source": [ 1562 | "# Feature Engineering" 1563 | ] 1564 | }, 1565 | { 1566 | "cell_type": "markdown", 1567 | "id": "6865adc7-b638-4b5a-b650-d57a415ed110", 1568 | "metadata": { 1569 | "id": "6865adc7-b638-4b5a-b650-d57a415ed110" 1570 | }, 1571 | "source": [ 1572 | "### 1. 
Bucketing" 1573 | ] 1574 | }, 1575 | { 1576 | "cell_type": "code", 1577 | "execution_count": null, 1578 | "id": "5a695ed2-0573-4fa6-93c8-e123f26042a7", 1579 | "metadata": { 1580 | "id": "5a695ed2-0573-4fa6-93c8-e123f26042a7" 1581 | }, 1582 | "outputs": [], 1583 | "source": [ 1584 | "splits = [0, 1, 2, 3, 4, float('Inf')]\n", 1585 | "\n", 1586 | "# Create bucketing transformer\n", 1587 | "buck = Bucketizer(splits=splits, inputCol='TAXAMOUNT', outputCol='TAXA')\n", 1588 | "\n", 1589 | "# Apply transformer\n", 1590 | "df = buck.transform(df)\n", 1591 | "\n", 1592 | "# Inspect results\n", 1593 | "df[['TAXAMOUNT', 'TAXA']].show()" 1594 | ] 1595 | }, 1596 | { 1597 | "cell_type": "markdown", 1598 | "id": "c487737f-7e48-42b1-83c9-8c5ef75951aa", 1599 | "metadata": { 1600 | "id": "c487737f-7e48-42b1-83c9-8c5ef75951aa" 1601 | }, 1602 | "source": [ 1603 | "### 2. One-hot Encoding" 1604 | ] 1605 | }, 1606 | { 1607 | "cell_type": "code", 1608 | "execution_count": null, 1609 | "id": "cf27659d-29ba-4fab-b8c5-30a979c6a445", 1610 | "metadata": { 1611 | "id": "cf27659d-29ba-4fab-b8c5-30a979c6a445" 1612 | }, 1613 | "outputs": [], 1614 | "source": [ 1615 | "from pyspark.ml.feature import OneHotEncoder, StringIndexer\n", 1616 | "\n", 1617 | "# Map strings to numbers with string indexer\n", 1618 | "string_indexer = StringIndexer(inputCol='ROOMCNT', outputCol='ROOM_Index')\n", 1619 | "indexed_df = string_indexer.fit(df).transform(df)\n", 1620 | "\n", 1621 | "# Onehot encode indexed values\n", 1622 | "encoder = OneHotEncoder(inputCol='ROOM_Index', outputCol='ROOM_Vec')\n", 1623 | "encoded_df = encoder.fit(indexed_df).transform(indexed_df)\n", 1624 | "\n", 1625 | "# Inspect the transformation steps\n", 1626 | "encoded_df[['ROOMCNT', 'ROOM_Index', 'ROOM_Vec']].show(truncate=100)" 1627 | ] 1628 | } 1629 | ], 1630 | "metadata": { 1631 | "colab": { 1632 | "provenance": [] 1633 | }, 1634 | "kernelspec": { 1635 | "display_name": "Python 3 (ipykernel)", 1636 | "language": "python", 1637 | "name": "python3" 1638 | }, 1639 | "language_info": { 1640 | "codemirror_mode": { 1641 | "name": "ipython", 1642 | "version": 3 1643 | }, 1644 | "file_extension": ".py", 1645 | "mimetype": "text/x-python", 1646 | "name": "python", 1647 | "nbconvert_exporter": "python", 1648 | "pygments_lexer": "ipython3", 1649 | "version": "3.11.6" 1650 | } 1651 | }, 1652 | "nbformat": 4, 1653 | "nbformat_minor": 5 1654 | } 1655 | --------------------------------------------------------------------------------