├── public
│   ├── notebooks
│   │   ├── EDA.ipynb
│   │   └── Preprocessing.ipynb
│   └── images
│       ├── DataFlow.png
│       ├── DataLineage.png
│       ├── data_flow.png
│       ├── youtube_icon.png
│       ├── Data_flow_youtube.png
│       └── Data_flow_youtube2.png
├── etl_pipeline
│   ├── dbt_tranform
│   │   ├── analyses
│   │   │   └── .gitkeep
│   │   ├── macros
│   │   │   ├── .gitkeep
│   │   │   └── generate_schema_name.sql
│   │   ├── seeds
│   │   │   └── .gitkeep
│   │   ├── tests
│   │   │   └── .gitkeep
│   │   ├── snapshots
│   │   │   └── .gitkeep
│   │   ├── .gitignore
│   │   ├── .user.yml
│   │   ├── packages.yml
│   │   ├── package-lock.yml
│   │   ├── models
│   │   │   ├── youtube_trending
│   │   │   │   ├── search_linkvideo.sql
│   │   │   │   ├── search_videocategory.sql
│   │   │   │   └── search_information.sql
│   │   │   ├── sources.yml
│   │   │   └── schema.yml
│   │   ├── profiles.yml
│   │   ├── README.md
│   │   └── dbt_project.yml
│   ├── etl_pipeline
│   │   ├── jobs
│   │   │   └── __init__.py
│   │   ├── assets
│   │   │   ├── __init__.py
│   │   │   ├── dbt.py
│   │   │   ├── warehouse.py
│   │   │   ├── gold.py
│   │   │   ├── bronze.py
│   │   │   └── silver.py
│   │   ├── schedules
│   │   │   └── __init__.py
│   │   ├── constants.py
│   │   ├── partitions
│   │   │   └── __init__.py
│   │   ├── func_process.py
│   │   ├── __init__.py
│   │   └── resources
│   │       ├── mysql_io_manager.py
│   │       ├── __init__.py
│   │       ├── minio_io_manager.py
│   │       ├── psql_io_manager.py
│   │       ├── spark_io_manager.py
│   │       └── youtube_io_manager.py
│   ├── etl_pipeline_tests
│   │   ├── __init__.py
│   │   └── test_assets.py
│   ├── setup.cfg
│   ├── pyproject.toml
│   ├── setup.py
│   ├── requirements.txt
│   ├── README.md
│   └── Dockerfile
├── app
│   ├── icons
│   │   ├── video.png
│   │   ├── youtube.png
│   │   ├── youtube_v2.png
│   │   ├── icons8-like-48.png
│   │   ├── icons8-view-48.png
│   │   ├── icons8-channel-48.png
│   │   ├── icons8-category-48.png
│   │   └── icons8-thumbs-down-skin-type-4-48.png
│   ├── streamlit_app.py
│   └── pages
│       ├── search_video.py
│       └── video_detail.py
├── docker-images
│   ├── dagster
│   │   ├── requirements.txt
│   │   └── Dockerfile
│   ├── streamlit
│   │   ├── requirements.txt
│   │   └── Dockerfile
│   └── spark
│       ├── Dockerfile
│       └── spark-defaults.conf
├── dagster_home
│   ├── workspace.yaml
│   └── dagster.yaml
├── .gitignore
├── LICENSE
├── Makefile
├── load_dataset
│   ├── mysql_load.sql
│   ├── psql_schemas.sql
│   └── mysql_schemas.sql
├── docker-compose.yaml
└── README.md
/public/notebooks/EDA.ipynb:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/analyses/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/macros/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/seeds/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/tests/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/jobs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/snapshots/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/assets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/schedules/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline_tests/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline_tests/test_assets.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/etl_pipeline/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = etl_pipeline
3 |
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | target/
3 | dbt_packages/
4 | logs/
5 |
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/.user.yml:
--------------------------------------------------------------------------------
1 | id: 34ac7379-38f0-4235-94d4-210cae8a4832
2 |
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/constants.py:
--------------------------------------------------------------------------------
1 | START_DATE = "2020-06-15"
2 | END_DATE = "2024-05-13"
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/packages.yml:
--------------------------------------------------------------------------------
1 | packages:
2 | - package: dbt-labs/dbt_utils
3 | version: 1.1.1
--------------------------------------------------------------------------------
/app/icons/video.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/video.png
--------------------------------------------------------------------------------
/app/icons/youtube.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/youtube.png
--------------------------------------------------------------------------------
/app/icons/youtube_v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/youtube_v2.png
--------------------------------------------------------------------------------
/docker-images/dagster/requirements.txt:
--------------------------------------------------------------------------------
1 | dagster==1.7.3
2 | dagit==1.7.3
3 | dagster-postgres
4 | dagster-dbt==0.23.3
5 | dagster-spark==0.23.3
--------------------------------------------------------------------------------
/public/images/DataFlow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/DataFlow.png
--------------------------------------------------------------------------------
/app/icons/icons8-like-48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/icons8-like-48.png
--------------------------------------------------------------------------------
/app/icons/icons8-view-48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/icons8-view-48.png
--------------------------------------------------------------------------------
/dagster_home/workspace.yaml:
--------------------------------------------------------------------------------
1 | load_from:
2 | - grpc_server:
3 | host: etl_pipeline
4 | port: 4000
5 | location_name: "etl_pipeline"
--------------------------------------------------------------------------------
/public/images/DataLineage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/DataLineage.png
--------------------------------------------------------------------------------
/public/images/data_flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/data_flow.png
--------------------------------------------------------------------------------
/app/icons/icons8-channel-48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/icons8-channel-48.png
--------------------------------------------------------------------------------
/public/images/youtube_icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/youtube_icon.png
--------------------------------------------------------------------------------
/app/icons/icons8-category-48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/icons8-category-48.png
--------------------------------------------------------------------------------
/public/images/Data_flow_youtube.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/Data_flow_youtube.png
--------------------------------------------------------------------------------
/public/images/Data_flow_youtube2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/Data_flow_youtube2.png
--------------------------------------------------------------------------------
/docker-images/streamlit/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit==1.34.0
2 | psycopg2-binary==2.9.9
3 | pandas==2.2.2
4 | polars==0.20.23
5 | # scikit-learn==1.5.0
6 | # surprise==0.1
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/package-lock.yml:
--------------------------------------------------------------------------------
1 | packages:
2 | - package: dbt-labs/dbt_utils
3 | version: 1.1.1
4 | sha1_hash: a158c48c59c2bb7d729d2a4e215aabe5bb4f3353
5 |
--------------------------------------------------------------------------------
/etl_pipeline/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [tool.dagster]
6 | module_name = "etl_pipeline"
7 |
--------------------------------------------------------------------------------
/app/icons/icons8-thumbs-down-skin-type-4-48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/icons8-thumbs-down-skin-type-4-48.png
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/models/youtube_trending/search_linkvideo.sql:
--------------------------------------------------------------------------------
1 |
2 |
3 | {{ config(materialized="table") }}
4 |
5 | SELECT
6 | video_id,
7 | link_video
8 | FROM {{ source("gold", "linkvideos") }}
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/models/youtube_trending/search_videocategory.sql:
--------------------------------------------------------------------------------
1 |
2 | {{ config(materialized="table") }}
3 |
4 | SELECT
5 | categoryid,
6 | categoryname
7 | FROM {{ source("gold", "videocategory") }}
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/partitions/__init__.py:
--------------------------------------------------------------------------------
1 | from dagster import MonthlyPartitionsDefinition
2 | from .. import constants
3 |
4 |
# Monthly partition set whose bounds come from the shared constants module
# (constants.START_DATE and constants.END_DATE).
_partition_bounds = {
    "start_date": constants.START_DATE,
    "end_date": constants.END_DATE,
}
monthly_partitions = MonthlyPartitionsDefinition(**_partition_bounds)
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/func_process.py:
--------------------------------------------------------------------------------
def replace_str(value: str):
    """Rewrite ``default`` to ``maxresdefault`` inside ``value``.

    Every occurrence of ``default`` that is not already preceded by
    ``maxres`` is replaced. Because existing ``maxresdefault`` substrings
    are left untouched, applying the function twice gives the same result
    as applying it once (the plain ``str.replace`` version produced
    ``maxresmaxresdefault`` on the second pass).

    Args:
        value: Text to rewrite -- presumably a thumbnail URL; confirm
            against the callers.

    Returns:
        The rewritten string.
    """
    import re  # function-scope import: this module has no import section

    # Negative look-behind: skip any "default" that is already part of
    # "maxresdefault", which is what makes the rewrite idempotent.
    return re.sub(r"(?<!maxres)default", "maxresdefault", value)
3 |
def format_date(value: str):
    """Return ``value`` with every ``T`` turned into a space and every ``Z`` removed.

    For example ``"2020-08-11T19:20:14Z"`` becomes ``"2020-08-11 19:20:14"``.
    """
    # One translation table handles both substitutions in a single pass.
    substitutions = {ord("T"): " ", ord("Z"): None}
    return value.translate(substitutions)
6 |
def convert(value: str):
    """Return ``value`` with all double-quote characters stripped out."""
    # Splitting on the quote and re-joining drops every occurrence.
    pieces = value.split('"')
    return "".join(pieces)
--------------------------------------------------------------------------------
/etl_pipeline/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
3 | setup(
4 | name="etl_pipeline",
5 | packages=find_packages(exclude=["etl_pipeline_tests"]),
6 | install_requires=[
7 | "dagster",
8 | "dagster-cloud"
9 | ],
10 | extras_require={"dev": ["dagster-webserver", "pytest"]},
11 | )
12 |
--------------------------------------------------------------------------------
/etl_pipeline/requirements.txt:
--------------------------------------------------------------------------------
1 | dagster==1.7.3
2 | dagit==1.7.3
3 | pandas==2.2.2
4 | polars==0.20.23
5 | pyarrow==16.0.0
6 | minio==7.2.7
7 | pymysql==1.1.0
8 | cryptography==42.0.5
9 | psycopg2-binary==2.9.9
10 | dagster-postgres
11 | google-api-python-client==2.127.0
12 | pyspark==3.4.3
13 | dbt-postgres==1.7.13
14 | dagster-dbt==0.23.3
15 | dagster-spark==0.23.3
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .etl-pipeline/
2 | .idea/
3 | .pytest_cache/
4 |
5 | .logs_queue/
6 | .nux/
7 | .telemetry/
8 | history/
9 | logs/
10 | dagster_home/schedules/
11 |
12 | dataset/
13 |
14 | .mypy_cache/
15 | __pycache__/
16 |
17 | minio/
18 | mysql/
19 | postgresql/
20 | venv/
21 |
22 | .env/
23 | data/
24 | .streamlit/
25 |
26 | .env.spark_master
27 | .env.spark_worker
28 | .env
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/macros/generate_schema_name.sql:
--------------------------------------------------------------------------------
1 | {% macro generate_schema_name(custom_schema_name, node) -%}
2 |
3 | {%- set default_schema = target.schema -%}
4 | {%- if custom_schema_name is none -%}
5 |
6 | {{ default_schema }}
7 |
8 | {%- else -%}
9 |
10 | {{ custom_schema_name | trim }}
11 |
12 | {%- endif -%}
13 |
14 | {%- endmacro %}
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/profiles.yml:
--------------------------------------------------------------------------------
1 | dbt_tranform:
2 | outputs:
3 | dev:
4 | dbname: youTube_trending_video
5 | host: "{{ env_var('POSTGRES_HOST') }}"
6 | pass: "{{ env_var('POSTGRES_PASSWORD') }}"
7 | port: 5432
8 | schema: gold
9 | threads: 1
10 | type: postgres
11 | user: "{{ env_var('POSTGRES_USER') }}"
12 | target: dev
13 |
--------------------------------------------------------------------------------
/docker-images/dagster/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11-slim
2 |
3 | ENV DAGSTER_HOME=/opt/dagster/dagster_home
4 |
5 | RUN mkdir -p $DAGSTER_HOME && \
6 | mkdir -p $DAGSTER_HOME/storage && \
7 | mkdir -p $DAGSTER_HOME/compute_logs && \
8 | mkdir -p $DAGSTER_HOME/local_artifact_storage
9 |
10 | WORKDIR $DAGSTER_HOME
11 |
12 | COPY requirements.txt $DAGSTER_HOME
13 |
14 | RUN pip install --upgrade pip && pip install -r requirements.txt
--------------------------------------------------------------------------------
/docker-images/streamlit/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11-slim
2 |
3 | WORKDIR /app
4 |
5 | RUN apt-get update && apt-get install -y \
6 | build-essential \
7 | curl \
8 | software-properties-common \
9 | git \
10 | && rm -rf /var/lib/apt/lists/*
11 |
12 | COPY . .
13 |
14 | RUN pip install --upgrade pip && pip install -r requirements.txt
15 |
16 | EXPOSE 8501
17 |
18 | HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19 |
20 | ENTRYPOINT ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/README.md:
--------------------------------------------------------------------------------
1 | Welcome to your new dbt project!
2 |
3 | ### Using the starter project
4 |
5 | Try running the following commands:
6 | - dbt run
7 | - dbt test
8 |
9 |
10 | ### Resources:
11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction)
12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers
13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support
14 | - Find [dbt events](https://events.getdbt.com) near you
15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices
16 |
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/models/sources.yml:
--------------------------------------------------------------------------------
1 |
2 | version: 2
3 |
4 | sources:
5 | - name: gold
6 | tables:
7 | - name: videocategory
8 | meta:
9 | dagster:
10 | asset_key: ["warehouse", "gold", "videoCategory"]
11 | - name: linkvideos
12 | meta:
13 | dagster:
14 | asset_key: ["warehouse", "gold", "linkVideos"]
15 | - name: metricvideos
16 | meta:
17 | dagster:
18 | asset_key: ["warehouse", "gold", "metricVideos"]
19 | - name: informationvideos
20 | meta:
21 | dagster:
22 | asset_key: ["warehouse", "gold", "informationVideos"]
--------------------------------------------------------------------------------
/docker-images/spark/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM docker.io/bitnami/spark:3.4.3
2 |
3 | USER root
4 |
5 | # Install prerequisites
6 | RUN apt-get update && apt-get install -y curl
7 |
8 | RUN curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \
9 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \
10 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \
11 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \
12 | && mv s3-2.18.41.jar /opt/bitnami/spark/jars \
13 | && mv aws-java-sdk-1.12.367.jar /opt/bitnami/spark/jars \
14 | && mv delta-core_2.12-2.2.0.jar /opt/bitnami/spark/jars \
15 | && mv delta-storage-2.2.0.jar /opt/bitnami/spark/jars
--------------------------------------------------------------------------------
/docker-images/spark/spark-defaults.conf:
--------------------------------------------------------------------------------
1 | spark.jars jars/delta-core_2.12-2.2.0.jar,jars/hadoop-aws-3.3.2.jar,jars/delta-storage-2.2.0.jar,jars/aws-java-sdk-1.12.367.jar,jars/s3-2.18.41.jar,jars/aws-java-sdk-bundle-1.11.1026.jar
2 | spark.sql.extensions io.delta.sql.DeltaSparkSessionExtension
3 | spark.sql.catalog.spark_catalog org.apache.spark.sql.delta.catalog.DeltaCatalog
4 | spark.hadoop.fs.s3a.endpoint http://minio:9000
5 | spark.hadoop.fs.s3a.access.key minio
6 | spark.hadoop.fs.s3a.secret.key minio123
7 | # spark.hadoop.fs.s3a.awsAccessKeyId minio
8 | # spark.hadoop.fs.s3a.awsSecretAccessKey minio123
9 | spark.hadoop.fs.s3a.path.style.access true
10 | spark.hadoop.fs.s3a.connection.ssl.enabled false
11 | spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
12 | spark.driver.memory 4g
13 | spark.executor.memory 4g
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from dagster import Definitions, load_assets_from_modules
4 | from dagster_dbt import DbtCliResource
5 |
6 | from .assets import bronze, gold, silver, warehouse, dbt
7 | from .resources import mysql, minio, postgres, youtube, spark
8 |
9 |
# Collect every asset declared in the layer modules (bronze -> silver -> gold
# -> warehouse) together with the dbt-backed assets.
_asset_modules = [bronze, silver, gold, warehouse, dbt]
all_assets = load_assets_from_modules(_asset_modules)

# The dbt CLI resource reads both the project and the profiles location from
# the paths defined in the dbt asset module.
_dbt_cli = DbtCliResource(
    project_dir=os.fspath(dbt.DBT_PROJECT_DIR),
    profiles_dir=os.fspath(dbt.DBT_PROFILE_DIR),
)

# Resource keys exactly as they are registered with Dagster.
_resources = {
    "mysql_io_manager": mysql,
    "minio_io_manager": minio,
    "psql_io_manager": postgres,
    "youtube_io_manager": youtube,
    "spark_io_manager": spark,
    "dbt": _dbt_cli,
}

defs = Definitions(assets=all_assets, resources=_resources)
27 |
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/models/youtube_trending/search_information.sql:
--------------------------------------------------------------------------------
1 |
2 |
3 | {{ config(materialized="table") }}
4 |
5 | select distinct
6 | i.video_id
7 | , i.title
8 | , i.channeltitle
9 | , v.categoryname
10 | , m.view
11 | , m.like as likes
12 | , m.dislike
13 | , m.publishedat
14 | , l.link_video
15 | , i.tags
16 | , i.thumbnail_link
17 |
18 | from {{ source('gold', 'informationvideos') }} i
19 | inner join {{ source('gold', 'linkvideos') }} l on i.video_id = l.video_id
20 | inner join {{ source('gold', 'videocategory') }} v on i.categoryid = v.categoryid
21 | inner join (
22 | SELECT
23 | video_id
24 | , MAX(view_count) AS view
25 | , MAX(likes) as like
26 | , MAX(dislikes) as dislike
27 | , MAX(publishedat) as publishedat
28 | FROM {{ source('gold', 'metricvideos') }}
29 | GROUP BY video_id
30 | ) AS m on i.video_id = m.video_id
31 |
32 |
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/assets/dbt.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | from dagster import AssetExecutionContext
5 | from dagster_dbt import DbtCliResource, dbt_assets
6 | from dagster_dbt import DagsterDbtTranslator
7 |
8 | from typing import Mapping, Optional, Any
9 |
10 |
11 |
# The dbt project sits three levels above this file, in a sibling directory
# named "dbt_tranform"; the profiles directory is that same directory.
DBT_PROJECT_DIR = (Path(__file__) / ".." / ".." / ".." / "dbt_tranform").resolve()
DBT_PROFILE_DIR = (Path(__file__) / ".." / ".." / ".." / "dbt_tranform").resolve()
# Manifest location inside the project's "target" directory.
DBT_MANIFEST_PATH = DBT_PROJECT_DIR / "target" / "manifest.json"
15 |
class CustomDagsterDbtTranslator(DagsterDbtTranslator):
    """Translator that assigns one fixed group name to every dbt resource."""

    # Group name returned for all dbt resources.
    _GROUP_NAME = "warehouse"

    def get_group_name(
        self, dbt_resource_props: Mapping[str, Any]
    ) -> Optional[str]:
        """Return ``"warehouse"`` regardless of ``dbt_resource_props``."""
        return self._GROUP_NAME
21 |
22 |
@dbt_assets(
    manifest=DBT_MANIFEST_PATH,
    dagster_dbt_translator=CustomDagsterDbtTranslator(),
)
def Brazilian_ECommerce_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):
    """Invoke ``dbt build`` through the ``dbt`` resource and stream its events.

    NOTE(review): the function name looks like a leftover from another
    project; renaming it may change the name Dagster shows for this
    definition -- confirm before renaming.
    """
    build_invocation = dbt.cli(["build"], context=context)
    yield from build_invocation.stream()
29 |
--------------------------------------------------------------------------------
/app/streamlit_app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import psycopg2
3 | import polars as pl
4 | import pandas as pd
5 | from PIL import Image
6 | from io import BytesIO
7 | import requests
8 |
9 | icon = Image.open("./icons/youtube_v2.png", mode="r")
10 |
11 | st.set_page_config(
12 | page_title="YouTube RecoMaster",
13 | page_icon=icon,
14 | layout="centered",
15 | initial_sidebar_state="expanded"
16 | )
17 |
18 | title, logo = st.columns([4,2.91])
19 | with title:
20 | st.title("YouTube RecoMaster")
21 | with logo:
22 | st.write("")
23 | st.image(icon, width=70)
24 |
25 |
26 | st.markdown(
27 | f'''''',
30 | unsafe_allow_html=True
31 | )
--------------------------------------------------------------------------------
/dagster_home/dagster.yaml:
--------------------------------------------------------------------------------
1 | run_coordinator:
2 | module: dagster.core.run_coordinator
3 | class: QueuedRunCoordinator
4 | config:
5 | max_concurrent_runs: 3
6 |
7 | scheduler:
8 | module: dagster.core.scheduler
9 | class: DagsterDaemonScheduler
10 | config:
11 | max_catchup_runs: 5
12 |
13 | storage:
14 | postgres:
15 | postgres_db:
16 | username:
17 | env: DAGSTER_PG_USERNAME
18 | password:
19 | env: DAGSTER_PG_PASSWORD
20 | hostname:
21 | env: DAGSTER_PG_HOSTNAME
22 | db_name:
23 | env: DAGSTER_PG_DB
24 | port: 5432
25 |
26 | run_launcher:
27 | module: dagster.core.launcher
28 | class: DefaultRunLauncher
29 |
30 | compute_logs:
31 | module: dagster.core.storage.local_compute_log_manager
32 | class: LocalComputeLogManager
33 | config:
34 | base_dir: /opt/dagster/dagster_home/compute_logs
35 |
36 | local_artifact_storage:
37 | module: dagster.core.storage.root
38 | class: LocalArtifactStorage
39 | config:
40 | base_dir: /opt/dagster/dagster_home/local_artifact_storage
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Long Nguyen
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/resources/mysql_io_manager.py:
--------------------------------------------------------------------------------
1 | import polars as pl
2 | from contextlib import contextmanager
3 | from sqlalchemy import create_engine
4 |
5 | from dagster import IOManager, OutputContext, InputContext
6 |
7 |
@contextmanager
def connect_mysql(config: dict):
    """Yield a SQLAlchemy engine for the MySQL database described by ``config``.

    Args:
        config: Mapping that provides the keys ``user``, ``password``,
            ``host``, ``port`` and ``database``.

    Yields:
        The engine created for the ``mysql+pymysql`` URL built from ``config``.

    Exceptions raised inside the ``with`` body propagate unchanged. The
    engine is disposed when the block exits, whether it succeeded or failed.
    """
    conn_info = (
        f"mysql+pymysql://{config['user']}:{config['password']}"
        f"@{config['host']}:{config['port']}"
        f"/{config['database']}"
    )
    db_conn = create_engine(conn_info)
    try:
        yield db_conn
    finally:
        # A new engine is created on every call, so close its connection pool
        # here; otherwise pooled connections stay open after each query.
        db_conn.dispose()
20 |
21 |
class MySQLIOManager(IOManager):
    """IO manager whose only working operation is running queries on MySQL.

    ``handle_output`` and ``load_input`` are no-ops; callers use
    ``extract_data`` to fetch query results.
    """

    def __init__(self, config):
        # Connection settings handed to connect_mysql() for every query.
        self._config = config

    def handle_output(self, context: OutputContext, obj: pl.DataFrame):
        """Do nothing; ``obj`` is not persisted."""
        return None

    def load_input(self, context: InputContext) -> pl.DataFrame:
        """Do nothing and return ``None``."""
        return None

    def extract_data(self, sql: str) -> pl.DataFrame:
        """Run ``sql`` through ``pl.read_database`` and return its result.

        Args:
            sql: Query text passed to ``pl.read_database`` unchanged.
        """
        with connect_mysql(self._config) as engine:
            frame = pl.read_database(query=sql, connection=engine)
        return frame
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Load the variables (database credentials, hosts, ports) used by the recipes below.
include .env

# None of these targets produce a file of the same name. Declaring them phony
# keeps make from skipping a recipe when a file or directory with that name
# (for example "build") happens to exist.
.PHONY: build up down restart to_psql to_mysql to_mysql_root mysql_create mysql_load psql_create

# Build the images declared in the compose file.
build:
	docker-compose build

# Start all services in the background.
up:
	docker-compose --env-file .env up -d

# Stop and remove the services.
down:
	docker-compose --env-file .env down

# Use $(MAKE) so the sub-invocations inherit the flags of the current make run.
restart:
	$(MAKE) down && $(MAKE) up

# Open a psql shell inside the de_psql container.
to_psql:
	docker exec -ti de_psql psql postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}

# Open a mysql shell inside the de_mysql container as the application user.
to_mysql:
	docker exec -it de_mysql mysql --local-infile=1 -u"${MYSQL_USER}" -p"${MYSQL_PASSWORD}" ${MYSQL_DATABASE}

# Open a mysql shell inside the de_mysql container as root.
to_mysql_root:
	docker exec -it de_mysql mysql -u"root" -p"${MYSQL_ROOT_PASSWORD}" ${MYSQL_DATABASE}

# Run the MySQL schema script mounted at /tmp/load_dataset.
mysql_create:
	docker exec -it de_mysql mysql --local_infile -u"${MYSQL_USER}" -p"${MYSQL_PASSWORD}" ${MYSQL_DATABASE} -e"source /tmp/load_dataset/mysql_schemas.sql"

# Run the MySQL load script mounted at /tmp/load_dataset.
mysql_load:
	docker exec -it de_mysql mysql --local_infile=1 -u"${MYSQL_USER}" -p"${MYSQL_PASSWORD}" ${MYSQL_DATABASE} -e"source /tmp/load_dataset/mysql_load.sql"

# Run the PostgreSQL schema script mounted at /tmp/load_dataset.
psql_create:
	docker exec -it de_psql psql postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB} -f /tmp/load_dataset/psql_schemas.sql -a
--------------------------------------------------------------------------------
/load_dataset/mysql_load.sql:
--------------------------------------------------------------------------------
1 | LOAD DATA LOCAL INFILE '/tmp/youTube_trending_video/DE_youtube_trending_data.csv'
2 | INTO TABLE DE_youtube_trending_data
3 | FIELDS TERMINATED BY ','
4 | ENCLOSED BY '"'
5 | LINES TERMINATED BY '\n'
6 | IGNORE 1 ROWS;
7 |
8 | LOAD DATA LOCAL INFILE '/tmp/youTube_trending_video/JP_youtube_trending_data.csv'
9 | INTO TABLE JP_youtube_trending_data
10 | FIELDS TERMINATED BY ','
11 | ENCLOSED BY '"'
12 | LINES TERMINATED BY '\n'
13 | IGNORE 1 ROWS;
14 |
15 | LOAD DATA LOCAL INFILE '/tmp/youTube_trending_video/RU_youtube_trending_data.csv'
16 | INTO TABLE RU_youtube_trending_data
17 | FIELDS TERMINATED BY ','
18 | ENCLOSED BY '"'
19 | LINES TERMINATED BY '\n'
20 | IGNORE 1 ROWS;
21 |
22 | LOAD DATA LOCAL INFILE '/tmp/youTube_trending_video/CA_youtube_trending_data.csv'
23 | INTO TABLE CA_youtube_trending_data
24 | FIELDS TERMINATED BY ','
25 | ENCLOSED BY '"'
26 | LINES TERMINATED BY '\n'
27 | IGNORE 1 ROWS;
28 |
29 | LOAD DATA LOCAL INFILE '/tmp/youTube_trending_video/IN_youtube_trending_data.csv'
30 | INTO TABLE IN_youtube_trending_data
31 | FIELDS TERMINATED BY ','
32 | ENCLOSED BY '"'
33 | LINES TERMINATED BY '\n'
34 | IGNORE 1 ROWS;
--------------------------------------------------------------------------------
/load_dataset/psql_schemas.sql:
--------------------------------------------------------------------------------
-- Gold-layer warehouse tables (PostgreSQL).
-- Re-running this script rebuilds the `gold` schema from scratch:
-- DROP SCHEMA ... CASCADE removes every object the schema contains.
DROP SCHEMA IF EXISTS gold CASCADE;
CREATE SCHEMA gold;

-- Category id -> category display name.
DROP TABLE IF EXISTS gold.videoCategory;
CREATE TABLE gold.videoCategory (
    categoryId VARCHAR(5),
    categoryName VARCHAR(50)
);

-- Video id -> link to the video.
DROP TABLE IF EXISTS gold.linkVideos;
CREATE TABLE gold.linkVideos (
    video_id VARCHAR(20),
    link_video VARCHAR(50)
);

-- Per-video engagement counters.
-- The counters are BIGINT: a 32-bit INTEGER tops out at 2,147,483,647 and
-- YouTube view counts can exceed that, which would make the load fail.
DROP TABLE IF EXISTS gold.metricVideos;
CREATE TABLE gold.metricVideos (
    video_id VARCHAR(20),
    -- country_code,
    publishedAt TIMESTAMP,
    trending_date TIMESTAMP,
    channelId VARCHAR(27),
    categoryId VARCHAR(5),
    view_count BIGINT,
    likes BIGINT,
    dislikes BIGINT,
    comment_count BIGINT
);

-- Descriptive (non-numeric) attributes of each video.
DROP TABLE IF EXISTS gold.informationVideos;
CREATE TABLE gold.informationVideos (
    video_id VARCHAR(20),
    -- country_code,
    title TEXT,
    channelId VARCHAR(27),
    channelTitle TEXT,
    categoryId VARCHAR(5),
    tags TEXT,
    thumbnail_link TEXT,
    comments_disabled VARCHAR(5),
    ratings_disabled VARCHAR(5)
);
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/dbt_project.yml:
--------------------------------------------------------------------------------
1 |
2 | # Name your project! Project names should contain only lowercase characters
3 | # and underscores. A good package name should reflect your organization's
4 | # name or the intended use of these models
5 | name: 'dbt_tranform'
6 | version: '1.0.0'
7 |
8 | # This setting configures which "profile" dbt uses for this project.
9 | profile: 'dbt_tranform'
10 |
11 | # These configurations specify where dbt should look for different types of files.
12 | # The `model-paths` config, for example, states that models in this project can be
13 | # found in the "models/" directory. You probably won't need to change these!
14 | model-paths: ["models"]
15 | analysis-paths: ["analyses"]
16 | test-paths: ["tests"]
17 | seed-paths: ["seeds"]
18 | macro-paths: ["macros"]
19 | snapshot-paths: ["snapshots"]
20 |
21 | clean-targets: # directories to be removed by `dbt clean`
22 | - "target"
23 | - "dbt_packages"
24 |
25 |
26 | # Configuring models
27 | # Full documentation: https://docs.getdbt.com/docs/configuring-models
28 |
29 | # In this project config, we tell dbt to build all models in the youtube_trending/
30 | # directory as tables, using the custom schema `youtube_trending`. These settings can be overridden in the individual model
31 | # files using the `{{ config(...) }}` macro.
32 | models:
33 | dbt_tranform:
34 | # Config indicated by + applies to all files under models/youtube_trending/
35 | youtube_trending:
36 | +materialized: table
37 | +schema: youtube_trending
38 |
--------------------------------------------------------------------------------
/etl_pipeline/dbt_tranform/models/schema.yml:
--------------------------------------------------------------------------------
version: 2

# Column tests and post-build index hooks for the youtube_trending models.
# NOTE(review): dbt documents model hooks under a `config:` block; confirm that
# the model-level `post-hook` key used here is honored by the dbt version in use.

models:
  - name: search_videocategory
    description: ""
    columns:
      - name: categoryid
        description: "The primary key for this table"
        tests:
          - unique
          - not_null
      - name: categoryname
        description: ""
        tests:
          - unique
          - not_null
          - accepted_values:
              values: ['Film & Animation', 'Autos & Vehicles', 'Music', 'Pets & Animals', 'Sports', 'Travel & Events', 'Gaming', 'People & Blogs', 'Comedy', 'Entertainment', 'News & Politics', 'Howto & Style', 'Education', 'Science & Technology', 'Nonprofits & Activism']

  - name: search_linkvideo
    description: ""
    columns:
      - name: video_id
        description: "The primary key for this table"
        tests:
          - unique
          - not_null
      - name: link_video
        description: ""
        tests:
          - unique
          - not_null
    post-hook:
      - "CREATE INDEX IF NOT EXISTS idx_video_id ON {{ this }} (video_id)"


  - name: search_information
    description: ""
    columns:
      - name: video_id
        description: "The primary key for this table"
        tests:
          - not_null
      - name: categoryname
        description: ""
        tests:
          - not_null
          - accepted_values:
              values: ['Film & Animation', 'Autos & Vehicles', 'Music', 'Pets & Animals', 'Sports', 'Travel & Events', 'Gaming', 'People & Blogs', 'Comedy', 'Entertainment', 'News & Politics', 'Howto & Style', 'Education', 'Science & Technology', 'Nonprofits & Activism']

    post-hook:
      # The index name must differ from the one created for search_linkvideo:
      # PostgreSQL index names are unique per schema, and CREATE INDEX IF NOT EXISTS
      # silently skips creation when a relation with that name already exists, so
      # reusing `idx_video_id` here meant this index was never built.
      - "CREATE INDEX IF NOT EXISTS idx_search_information_lookup ON {{ this }} (video_id, categoryname, tags)"
--------------------------------------------------------------------------------
/etl_pipeline/README.md:
--------------------------------------------------------------------------------
1 | # etl_pipeline
2 |
3 | This is a [Dagster](https://dagster.io/) project scaffolded with [`dagster project scaffold`](https://docs.dagster.io/getting-started/create-new-project).
4 |
5 | ## Getting started
6 |
7 | First, install your Dagster code location as a Python package. By using the --editable flag, pip will install your Python package in ["editable mode"](https://pip.pypa.io/en/latest/topics/local-project-installs/#editable-installs) so that as you develop, local code changes will automatically apply.
8 |
9 | ```bash
10 | pip install -e ".[dev]"
11 | ```
12 |
13 | Then, start the Dagster UI web server:
14 |
15 | ```bash
16 | dagster dev
17 | ```
18 |
19 | Open http://localhost:3000 with your browser to see the project.
20 |
21 | You can start writing assets in the `etl_pipeline/assets/` package (for example `bronze.py`, `silver.py` and `gold.py`). The assets are automatically loaded into the Dagster code location as you define them.
22 |
23 | ## Development
24 |
25 | ### Adding new Python dependencies
26 |
27 | You can specify new Python dependencies in `setup.py`.
28 |
29 | ### Unit testing
30 |
31 | Tests are in the `etl_pipeline_tests` directory and you can run tests using `pytest`:
32 |
33 | ```bash
34 | pytest etl_pipeline_tests
35 | ```
36 |
37 | ### Schedules and sensors
38 |
39 | If you want to enable Dagster [Schedules](https://docs.dagster.io/concepts/partitions-schedules-sensors/schedules) or [Sensors](https://docs.dagster.io/concepts/partitions-schedules-sensors/sensors) for your jobs, the [Dagster Daemon](https://docs.dagster.io/deployment/dagster-daemon) process must be running. This is done automatically when you run `dagster dev`.
40 |
41 | Once your Dagster Daemon is running, you can start turning on schedules and sensors for your jobs.
42 |
43 | ## Deploy on Dagster Cloud
44 |
45 | The easiest way to deploy your Dagster project is to use Dagster Cloud.
46 |
47 | Check out the [Dagster Cloud Documentation](https://docs.dagster.cloud) to learn more.
48 |
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/resources/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | from .mysql_io_manager import MySQLIOManager
3 | from .minio_io_manager import MinIOIOManager
4 | from .psql_io_manager import PostgreSQLIOManager
5 | from .youtube_io_manager import YoutubeIOManager
6 | from .spark_io_manager import SparkIOManager
7 |
8 |
def _object_store_settings():
    """Build a fresh dict of the MinIO settings read from the environment.

    A new dict is returned on every call so each IO manager owns its own
    configuration object, exactly as when the settings were written inline.
    """
    return {
        "endpoint_url": os.getenv("MINIO_ENDPOINT"),
        "bucket": os.getenv("DATALAKE_BUCKET"),
        "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
    }


# Source database. The port is the literal 3306, not read from the environment.
_mysql_settings = {
    "host": os.getenv("MYSQL_HOST"),
    "port": 3306,
    "database": os.getenv("MYSQL_DATABASE"),
    "user": os.getenv("MYSQL_USER"),
    "password": os.getenv("MYSQL_PASSWORD"),
}
mysql = MySQLIOManager(_mysql_settings)

# Object storage.
minio = MinIOIOManager(_object_store_settings())

# Warehouse. Every value, including the port, comes from the environment.
_postgres_settings = {
    "host": os.getenv("POSTGRES_HOST"),
    "port": os.getenv("POSTGRES_PORT"),
    "database": os.getenv("POSTGRES_DB"),
    "user": os.getenv("POSTGRES_USER"),
    "password": os.getenv("POSTGRES_PASSWORD"),
}
postgres = PostgreSQLIOManager(_postgres_settings)

# YouTube API settings combined with the object-storage settings.
_youtube_settings = {
    "api_service_name": os.getenv("API_SERVICE_NAME"),
    "api_version": os.getenv("API_VERSION"),
    "api_key": os.getenv("API_KEY"),
}
_youtube_settings.update(_object_store_settings())
youtube = YoutubeIOManager(_youtube_settings)

# Spark master URL combined with the object-storage settings.
_spark_settings = {"spark_master_url": os.getenv("SPARK_MASTER_URL")}
_spark_settings.update(_object_store_settings())
spark = SparkIOManager(_spark_settings)
--------------------------------------------------------------------------------
/etl_pipeline/Dockerfile:
--------------------------------------------------------------------------------
# Image for the Dagster code location: Python + a JRE + a local Spark install
# with the S3/Delta jars placed in Spark's jars directory.
# The Debian release is pinned so the distro package set (including the requested
# OpenJDK package) does not change underneath the build.
FROM python:3.11-slim-bookworm

ARG openjdk_version="17"
# Spark release to install; used for the download URL, archive name and install dir.
ARG spark_version="3.4.3"

# USER root

RUN apt-get update --yes && \
    apt-get install --yes \
    curl \
    "openjdk-${openjdk_version}-jre-headless" ca-certificates-java procps && \
    apt-get clean && rm -rf /var/lib/apt/lists/*


# Download from archive.apache.org: dlcdn.apache.org only serves current releases,
# so a pinned older version eventually disappears from it.
# `-f` makes curl fail on an HTTP error instead of saving the error page as the tarball.
RUN curl -fSL -O "https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop3.tgz" \
    && tar -zxvf "spark-${spark_version}-bin-hadoop3.tgz" \
    && rm -rf "spark-${spark_version}-bin-hadoop3.tgz" \
    && mv "spark-${spark_version}-bin-hadoop3/" /usr/local/ \
    && rm -rf /usr/local/spark \
    && ln -s "/usr/local/spark-${spark_version}-bin-hadoop3" /usr/local/spark


# NOTE(review): the Delta (2.2.0) and hadoop-aws (3.3.2) versions below are kept as-is;
# confirm they are compatible with the Spark/Hadoop versions installed above.
RUN curl -fSL -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \
    && curl -fSL -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \
    && curl -fSL -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar \
    && curl -fSL -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \
    && curl -fSL -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \
    && curl -fSL -O https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.2/hadoop-aws-3.3.2.jar \
    # && mkdir -p /usr/local/spark/jars \
    && mv s3-2.18.41.jar /usr/local/spark/jars \
    && mv aws-java-sdk-1.12.367.jar /usr/local/spark/jars \
    && mv aws-java-sdk-bundle-1.11.1026.jar /usr/local/spark/jars \
    && mv delta-core_2.12-2.2.0.jar /usr/local/spark/jars \
    && mv delta-storage-2.2.0.jar /usr/local/spark/jars \
    && mv hadoop-aws-3.3.2.jar /usr/local/spark/jars


# Install dependencies before copying the sources so the pip layer is reused
# when only application code changes.
WORKDIR /opt/dagster/app
COPY requirements.txt /opt/dagster/app
RUN pip install --upgrade pip && pip install -r requirements.txt
COPY . /opt/dagster/app

CMD ["dagster", "api", "grpc", "-h", "0.0.0.0", "-p", "4000", "-m", "etl_pipeline"]
--------------------------------------------------------------------------------
/load_dataset/mysql_schemas.sql:
--------------------------------------------------------------------------------
1 | DROP TABLE IF EXISTS youtube_trending_data;
2 |
3 |
4 | DROP TABLE IF EXISTS CA_youtube_trending_data;
5 | CREATE TABLE CA_youtube_trending_data (
6 | video_id VARCHAR(20),
7 | title TEXT,
8 | publishedAt VARCHAR(27),
9 | channelId VARCHAR(27),
10 | channelTitle TEXT,
11 | categoryId VARCHAR(5),
12 | trending_date VARCHAR(27),
13 | tags TEXT,
14 | view_count TEXT,
15 | likes TEXT,
16 | dislikes TEXT,
17 | comment_count TEXT,
18 | thumbnail_link TEXT,
19 | comments_disabled VARCHAR(6),
20 | ratings_disabled VARCHAR(6)
21 | );
22 |
23 | DROP TABLE IF EXISTS DE_youtube_trending_data;
24 | CREATE TABLE DE_youtube_trending_data (
25 | video_id VARCHAR(20),
26 | title TEXT,
27 | publishedAt VARCHAR(27),
28 | channelId VARCHAR(27),
29 | channelTitle TEXT,
30 | categoryId VARCHAR(5),
31 | trending_date VARCHAR(27),
32 | tags TEXT,
33 | view_count TEXT,
34 | likes TEXT,
35 | dislikes TEXT,
36 | comment_count TEXT,
37 | thumbnail_link TEXT,
38 | comments_disabled VARCHAR(6),
39 | ratings_disabled VARCHAR(6)
40 | );
41 |
42 | DROP TABLE IF EXISTS IN_youtube_trending_data;
43 | CREATE TABLE IN_youtube_trending_data (
44 | video_id VARCHAR(20),
45 | title TEXT,
46 | publishedAt VARCHAR(27),
47 | channelId VARCHAR(27),
48 | channelTitle TEXT,
49 | categoryId VARCHAR(5),
50 | trending_date VARCHAR(27),
51 | tags TEXT,
52 | view_count TEXT,
53 | likes TEXT,
54 | dislikes TEXT,
55 | comment_count TEXT,
56 | thumbnail_link TEXT,
57 | comments_disabled VARCHAR(6),
58 | ratings_disabled VARCHAR(6)
59 | );
60 |
61 | DROP TABLE IF EXISTS JP_youtube_trending_data;
62 | CREATE TABLE JP_youtube_trending_data (
63 | video_id VARCHAR(20),
64 | title TEXT,
65 | publishedAt VARCHAR(27),
66 | channelId VARCHAR(27),
67 | channelTitle TEXT,
68 | categoryId VARCHAR(5),
69 | trending_date VARCHAR(27),
70 | tags TEXT,
71 | view_count TEXT,
72 | likes TEXT,
73 | dislikes TEXT,
74 | comment_count TEXT,
75 | thumbnail_link TEXT,
76 | comments_disabled VARCHAR(6),
77 | ratings_disabled VARCHAR(6)
78 | );
79 |
80 | DROP TABLE IF EXISTS RU_youtube_trending_data;
81 | CREATE TABLE RU_youtube_trending_data (
82 | video_id VARCHAR(20),
83 | title TEXT,
84 | publishedAt VARCHAR(27),
85 | channelId VARCHAR(27),
86 | channelTitle TEXT,
87 | categoryId VARCHAR(5),
88 | trending_date VARCHAR(27),
89 | tags TEXT,
90 | view_count TEXT,
91 | likes TEXT,
92 | dislikes TEXT,
93 | comment_count TEXT,
94 | thumbnail_link TEXT,
95 | comments_disabled VARCHAR(6),
96 | ratings_disabled VARCHAR(6)
97 | );
--------------------------------------------------------------------------------
/app/pages/search_video.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import psycopg2
3 | import polars as pl
4 | import pandas as pd
5 | from PIL import Image
6 | from io import BytesIO
7 | import requests
8 |
9 |
10 | icon = Image.open("./icons/youtube_v2.png", mode="r")
11 |
12 | st.set_page_config(
13 | page_title="YouTube RecoMaster",
14 | page_icon=icon,
15 | layout="centered",
16 | initial_sidebar_state="expanded"
17 | )
18 |
@st.cache_resource
def init_connection():
    """Open the PostgreSQL connection described by the ``postgres`` Streamlit secrets.

    Decorated with ``st.cache_resource`` so the result is cached by Streamlit.
    """
    connection_settings = st.secrets["postgres"]
    return psycopg2.connect(**connection_settings)
22 |
23 | conn = init_connection()
24 |
@st.cache_data(ttl=600)
def run_query(query, params=None):
    """Run ``query`` on the shared connection and return all rows.

    Args:
        query: SQL text. Use ``%s`` placeholders for values instead of
            formatting them into the string.
        params: Optional sequence of values bound to the placeholders by the
            driver. When omitted, ``query`` is executed as-is.

    Returns:
        The list of row tuples from ``cursor.fetchall()``.

    Raises:
        psycopg2.Error: if the query fails. The transaction is rolled back
            first so the cached connection stays usable for later queries.
    """
    try:
        with conn.cursor() as cur:
            cur.execute(query, params)
            return cur.fetchall()
    except psycopg2.Error:
        # Without a rollback the connection stays in an aborted transaction
        # and every later query on it fails.
        conn.rollback()
        raise


title, logo = st.columns([4,2.91])
with title:
    st.title("YouTube RecoMaster")
with logo:
    st.write("")
    st.image(icon, width=70)

st.slider("Size")
video_name = st.text_input("Enter a video name")
st.write(f"You entered: {video_name}")


# The search text comes straight from the user, so it is passed as a bound
# parameter rather than interpolated into the SQL (which allowed SQL injection).
data = run_query(
    """
    SELECT DISTINCT
        video_id,
        title,
        channeltitle,
        thumbnail_link,
        link_video,
        categoryname,
        view
    FROM youtube_trending.search_information
    WHERE title LIKE %s
    LIMIT 10;
    """,
    (f"%{video_name}%",),
)
59 |
60 | videos = {
61 | "video_id": [e[0] for e in data],
62 | "title": [e[1] for e in data],
63 | "channeltitle": [e[2] for e in data],
64 | "thumbnail_link": [e[3] for e in data],
65 | "link_video": [e[4] for e in data],
66 | "categoryname": [e[5] for e in data],
67 | "view_count": [e[6] for e in data]
68 | }
69 | video_url = "https://www.youtube.com/embed/J78aPJ3VyNs"
70 |
71 | recommended_videos = []
72 | recommended_videos += videos['link_video']
73 |
74 | st.subheader(f"Have {len(videos['video_id'])} results for keyword: {video_name}")
75 | for video_id,title,channeltitle,thumbnail_link,link_video,categoryname,view_count in zip(
76 | videos['video_id'],videos['title'],videos['channeltitle'],
77 | videos['thumbnail_link'],videos['link_video'],videos['categoryname'],videos['view_count']):
78 |
79 | col1, col2 = st.columns([1, 1])
80 |
81 | with col1:
82 | img = Image.open(BytesIO(requests.get(thumbnail_link).content))
83 | st.markdown(
84 | f'',
85 | unsafe_allow_html=True,
86 | )
87 | st.image(img, use_column_width=True)
88 |
89 | with col2:
90 | st.write("")
91 | st.markdown(f"""
92 |
93 | {title}
94 | channel: {channeltitle}
95 | category: {categoryname}
96 | views: {view_count}
97 |
98 | """, unsafe_allow_html=True)
99 | st.write("")
100 | is_clicked = st.button("Watch", key=video_id)
101 |
102 | if is_clicked:
103 | st.experimental_set_query_params(video_id=video_id)
104 | # st.experimental_rerun()
105 | st.switch_page("./pages/video_detail.py")
106 |
107 |
108 | st.write("---")
109 |
110 |
111 | # df = pl.DataFrame(videos)
112 | # st.table(df)
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/resources/minio_io_manager.py:
--------------------------------------------------------------------------------
1 | import os
2 | from contextlib import contextmanager
3 | from datetime import datetime
4 | from typing import Union
5 |
6 | import polars as pl
7 | import pyarrow as pa
8 | import pyarrow.parquet as pq
9 | from dagster import IOManager, OutputContext, InputContext
10 | from minio import Minio
11 |
12 |
@contextmanager
def connect_minio(config: dict):
    """Context manager that yields a MinIO client built from ``config``.

    Reads ``endpoint_url``, ``aws_access_key_id`` and ``aws_secret_access_key``
    from ``config``; the client is created with ``secure=False``. Exceptions
    raised inside the ``with`` body propagate to the caller unchanged.
    """
    minio_client = Minio(
        endpoint=config.get("endpoint_url"),
        access_key=config.get("aws_access_key_id"),
        secret_key=config.get("aws_secret_access_key"),
        secure=False,
    )
    yield minio_client
25 |
26 |
class MinIOIOManager(IOManager):
    """Dagster IO manager that stores polars DataFrames as parquet objects in MinIO.

    Each DataFrame is written to (or downloaded into) a temporary local parquet
    file, which is always removed afterwards, including when the transfer fails.
    """

    def __init__(self, config):
        # Connection settings; `bucket` plus the keys read by connect_minio.
        self._config = config

    def _get_path(self, context: Union[InputContext, OutputContext]):
        """Return ``(object_key, tmp_file_path)`` for the asset in ``context``.

        The asset key must have exactly three parts: layer, schema, table.
        Partitioned assets get one object per partition, named after the
        year and month (``YYYYMM``) of the partition window start.
        """
        layer, schema, table = context.asset_key.path
        key = "/".join([layer, schema, table.replace(f"{layer}_", "")])
        tmp_file_path = "/tmp/file-{}-{}.parquet".format(
            datetime.today().strftime("%Y%m%d%H%M%S"),
            "-".join(context.asset_key.path)
        )

        if context.has_asset_partitions:
            start, _ = context.asset_partitions_time_window
            # partition_str = context.asset_partition_key
            partition_str = start.strftime("%Y%m")
            # Object keys are "/"-separated, so the key is joined with "/"
            # explicitly instead of the OS path separator.
            object_key = f"{key}/{partition_str}.pq"
        else:
            object_key = f"{key}.pq"

        context.log.info(f"INFO: {object_key}, {tmp_file_path}")
        return object_key, tmp_file_path

    @staticmethod
    def _ensure_bucket(client, bucket_name):
        """Create ``bucket_name`` when it does not exist yet."""
        if client.bucket_exists(bucket_name):
            print(f"Bucket {bucket_name} already exists")
        else:
            client.make_bucket(bucket_name)

    @staticmethod
    def _remove_quietly(path):
        """Delete ``path`` if present; a missing file is not an error."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    def handle_output(self, context: OutputContext, obj: pl.DataFrame):
        """Write ``obj`` as parquet and upload it to the configured bucket.

        Records the object key, row count and temp path as output metadata.
        Any error from writing or uploading propagates to the caller.
        """
        key_name, tmp_file_path = self._get_path(context)
        bucket_name = self._config.get("bucket")

        try:
            # convert to parquet format
            obj.write_parquet(tmp_file_path)

            # upload to MinIO
            with connect_minio(self._config) as client:
                self._ensure_bucket(client, bucket_name)
                client.fput_object(bucket_name, key_name, tmp_file_path)

            context.add_output_metadata(
                {
                    "path": key_name,
                    "records": len(obj),
                    "tmp": tmp_file_path
                }
            )
        finally:
            # Clean up even when the upload fails, so temp files do not pile up.
            self._remove_quietly(tmp_file_path)

    def load_input(self, context: InputContext) -> pl.DataFrame:
        """Download the asset's parquet object and return it as a DataFrame.

        Any error from the download or the parquet read propagates to the caller.
        """
        bucket_name = self._config.get("bucket")
        key_name, tmp_file_path = self._get_path(context)

        try:
            with connect_minio(self._config) as client:
                # Kept from the original behavior: the bucket is created if missing.
                self._ensure_bucket(client, bucket_name)
                client.fget_object(bucket_name, key_name, tmp_file_path)
                return pl.read_parquet(tmp_file_path)
        finally:
            # The downloaded copy used to be left behind on every load.
            self._remove_quietly(tmp_file_path)
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/resources/psql_io_manager.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | from datetime import datetime
3 | from psycopg2 import sql
4 | import psycopg2.extras
5 | import psycopg2
6 |
7 | import polars as pl
8 | from dagster import IOManager, OutputContext, InputContext
9 | from sqlalchemy import create_engine
10 |
11 |
@contextmanager
def connect_psql(config: dict):
    """Context manager that yields a psycopg2 connection and closes it on exit.

    Args:
        config: mapping with ``host``, ``port``, ``database``, ``user`` and
            ``password`` keys.

    Raises:
        KeyError: if one of the required keys is missing.
        psycopg2.Error: if the connection cannot be established.

    The connection is closed when the ``with`` block ends, whether it finished
    normally or raised; it used to be left open. Closing does not commit, so
    callers must commit their own work before leaving the block.
    """
    connection = psycopg2.connect(
        host=config["host"],
        port=config["port"],
        database=config["database"],
        user=config["user"],
        password=config["password"],
    )
    try:
        yield connection
    finally:
        connection.close()
25 |
26 |
class PostgreSQLIOManager(IOManager):
    """Dagster IO manager that loads polars DataFrames into PostgreSQL tables.

    Rows are staged in a temporary table shaped like the target table, then
    merged into the target: an upsert when the asset metadata declares
    ``primary_keys``, a full replace otherwise.
    """

    def __init__(self, config):
        # Connection settings passed to connect_psql.
        self._config = config

    def load_input(self, context: InputContext) -> pl.DataFrame:
        # Reading back from PostgreSQL is not implemented; this returns None.
        pass

    def handle_output(self, context: OutputContext, obj: pl.DataFrame):
        """Write ``obj`` into the table named by the last two asset key parts.

        Staging, merge and temp-table cleanup run in a single transaction that
        is committed at the end, and rolled back if any step fails. The
        transaction is the one psycopg2 opens implicitly, so no explicit
        BEGIN/END statements are sent.
        """
        schema, table = context.asset_key.path[-2], context.asset_key.path[-1]
        tmp_tbl = f"{table}_tmp_{datetime.now().strftime('%Y_%m_%d')}"
        primary_keys = (context.metadata or {}).get("primary_keys", [])

        with connect_psql(self._config) as db_conn:
            try:
                with db_conn.cursor() as cursor:
                    self._stage_rows(context, cursor, schema, table, tmp_tbl, obj)
                    self._merge_into_target(cursor, schema, table, tmp_tbl, primary_keys)
                    # drop temp table
                    cursor.execute(f'DROP TABLE IF EXISTS "{tmp_tbl}"')
                db_conn.commit()
            except Exception:
                # Leave the target table untouched when any step fails.
                if not db_conn.closed:
                    db_conn.rollback()
                raise

    @staticmethod
    def _stage_rows(context, cursor, schema, table, tmp_tbl, obj):
        """Create the temp table and batch-insert every row of ``obj`` into it."""
        # Schema and table are left unquoted (as before) so PostgreSQL folds
        # them to lower case.
        cursor.execute(
            f'CREATE TEMP TABLE IF NOT EXISTS "{tmp_tbl}" (LIKE {schema}.{table})'
        )
        cursor.execute(f'SELECT COUNT(*) FROM "{tmp_tbl}"')
        context.log.debug(f"Log for creating temp table: {cursor.fetchall()}")

        # insert new data; column names are lower-cased before being quoted
        columns = sql.SQL(",").join(
            sql.Identifier(name.lower()) for name in obj.columns
        )
        context.log.info(f"Table {table} with columns: {columns}")
        values = sql.SQL(",").join(sql.Placeholder() for _ in obj.columns)

        context.log.debug("Inserting data into temp table")
        insert_query = sql.SQL('INSERT INTO {} ({}) VALUES({});').format(
            sql.Identifier(tmp_tbl), columns, values
        )
        psycopg2.extras.execute_batch(cursor, insert_query, obj.rows())
        context.log.info(f"Insert into data for table {table} Success !!!")

        # check data inserted
        cursor.execute(f'SELECT COUNT(*) FROM "{tmp_tbl}"')
        context.log.info(f"Number of rows inserted: {cursor.fetchone()}")

    @staticmethod
    def _merge_into_target(cursor, schema, table, tmp_tbl, primary_keys):
        """Move staged rows into the target table (upsert or full replace)."""
        if len(primary_keys) > 0:
            # upsert: delete target rows that share a key with a staged row
            conditions = " AND ".join(
                [
                    f""" {schema}.{table}."{k.lower()}" = "{tmp_tbl}"."{k.lower()}" """
                    for k in primary_keys
                ]
            )
            cursor.execute(
                f"""
                DELETE FROM {schema}.{table}
                USING "{tmp_tbl}"
                WHERE {conditions};
                """
            )
        else:
            # no keys declared: replace the table content
            cursor.execute(f"TRUNCATE TABLE {schema}.{table};")

        cursor.execute(
            f"""
            INSERT INTO {schema}.{table}
            SELECT * FROM "{tmp_tbl}";
            """
        )
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: "3.9"
2 |
3 |
4 | services:
5 |
6 | # MySQL
7 | de_mysql:
8 | image: mysql:8.0
9 | container_name: de_mysql
10 | volumes:
11 | - ./mysql:/var/lib/mysql
12 | - ./dataset/youTube_trending_video:/tmp/youTube_trending_video
13 | - ./load_dataset:/tmp/load_dataset
14 | ports:
15 | - 3306:3306
16 | env_file: .env
17 | networks:
18 | - de_network
19 |
20 | # MinIO
21 | minio:
22 | hostname: minio
23 | image: minio/minio
24 | container_name: minio
25 | ports:
26 | - 9001:9001
27 | - 9000:9000
28 | command: [ "server", "/data", "--console-address", ":9001" ]
29 | volumes:
30 | - ./minio:/data
31 | env_file: .env
32 | networks:
33 | - de_network
34 |
35 | mc:
36 | image: minio/mc
37 | container_name: mc
38 | hostname: mc
39 | env_file: .env
40 | entrypoint: >
41 | /bin/sh -c " until (/usr/bin/mc config host add minio
42 | http://minio:9000 minio minio123) do echo '...waiting...' && sleep 1;
43 | done; /usr/bin/mc mb minio/lakehouse; /usr/bin/mc policy set public
44 | minio/lakehouse; exit 0; "
45 | depends_on:
46 | - minio
47 | networks:
48 | - de_network
49 |
50 | # Pipeline
51 | etl_pipeline:
52 | build:
53 | context: ./etl_pipeline
54 | dockerfile: Dockerfile
55 | container_name: etl_pipeline
56 | image: etl_pipeline:latest
57 | restart: always
58 | volumes:
59 | - ./etl_pipeline:/opt/dagster/app
60 | - ./docker-images/spark/spark-defaults.conf:/usr/local/spark/conf/spark-defaults.conf
61 | ports:
62 | - 4041:4040
63 | env_file: .env
64 | networks:
65 | - de_network
66 |
67 | # PostgreSQL
68 | de_psql:
69 | image: postgres:15
70 | container_name: de_psql
71 | volumes:
72 | - ./postgresql:/var/lib/postgresql/data
73 | - ./load_dataset:/tmp/load_dataset
74 | ports:
75 | - 5432:5432
76 | env_file: .env
77 | networks:
78 | - de_network
79 |
80 | # Dagster
81 | de_dagster:
82 | build:
83 | context: ./docker-images/dagster/
84 | container_name: de_dagster
85 | image: de_dagster
86 |
87 | de_dagster_dagit:
88 | image: de_dagster:latest
89 | entrypoint:
90 | - dagit
91 | - -h
92 | - "0.0.0.0"
93 | - -p
94 | - "3001"
95 | - -w
96 | - workspace.yaml
97 | container_name: de_dagster_dagit
98 | expose:
99 | - "3001"
100 | ports:
101 | - 3001:3001
102 | volumes:
103 | - /var/run/docker.sock:/var/run/docker.sock
104 | - ./dagster_home:/opt/dagster/dagster_home
105 | env_file: .env
106 | networks:
107 | - de_network
108 |
109 | de_dagster_daemon:
110 | image: de_dagster:latest
111 | entrypoint:
112 | - dagster-daemon
113 | - run
114 | container_name: de_dagster_daemon
115 | volumes:
116 | - /var/run/docker.sock:/var/run/docker.sock
117 | - ./dagster_home:/opt/dagster/dagster_home
118 | env_file: .env
119 | networks:
120 | - de_network
121 |
122 | # Streamlit
123 | de_streamlit:
124 | build:
125 | context: ./docker-images/streamlit
126 | dockerfile: Dockerfile
127 | image: de_streamlit:latest
128 | container_name: de_streamlit
129 | volumes:
130 | - ./app:/app
131 | env_file: .env
132 | ports:
133 | - "8501:8501"
134 | networks:
135 | - de_network
136 |
137 | # Metabase
138 | de_metabase:
139 | image: metabase/metabase:latest
140 | container_name: de_metabase
141 | volumes:
142 | - ./storage/metabase_data:/metabase_data
143 | ports:
144 | - "3030:3000"
145 | env_file: .env
146 | networks:
147 | - de_network
148 |
149 | # Jupyter
150 | # de_notebook:
151 | # image: jupyter/all-spark-notebook:python-3.9
152 | # container_name: de_notebook
153 | # command: [ "start-notebook.sh", "--NotebookApp.token=" ]
154 | # ports:
155 | # - 8888:8888
156 | # volumes:
157 | # - ./notebooks/work:/home/jovyan/work
158 | # env_file: .env
159 | # networks:
160 | # - de_network
161 |
162 | # # Spark
163 | # spark-master:
164 | # build:
165 | # context: ./docker-images/spark
166 | # dockerfile: Dockerfile
167 | # image: spark-master:latest
168 | # container_name: spark-master
169 | # hostname: spark_master
170 | # volumes:
171 | # - ./docker-images/spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
172 | # - ./data:/opt/spark-data
173 | # env_file: .env.spark_master
174 | # expose:
175 | # - "7077"
176 | # ports:
177 | # - "7077:7077"
178 | # - "8080:8080"
179 | # networks:
180 | # - de_network
181 |
182 | # spark-worker:
183 | # image: docker.io/bitnami/spark:3.4.3
184 | # depends_on:
185 | # - spark-master
186 | # deploy:
187 | # replicas: 3
188 | # env_file: .env.spark_worker
189 | # volumes:
190 | # - ./docker-images/spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
191 | # - ./data:/opt/spark-data
192 | # networks:
193 | # - de_network
194 |
195 | networks:
196 | de_network:
197 | driver: bridge
198 | name: de_network
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🌄 Youtube-ETL-Pipeline
2 | In this project, I build a simple data pipeline following the ETL (extract, transform, load) model using the Youtube-Trending-Video dataset. Data processing, transformation and calculation are performed with Apache Spark, and the results serve a video search and recommendation system.
3 |
4 | ## 🔦 About Project
5 |
6 |
7 | - **Data Source**: This project uses two main `data sources`: [Youtube Trending Video](https://www.kaggle.com/datasets/rsrishav/youtube-trending-video-dataset) data and [Youtube API](https://developers.google.com/youtube/v3)
8 | - `Youtube Trending Video` data is downloaded from [Kaggle.com](https://www.kaggle.com) with `.csv` file format, then loaded into `MySQL`, considered as a `data source`
9 | - Using `Video ID` and `Category ID` from `Youtube Trending Video` data, we collect some additional information fields from `Youtube API` such as `Video Link` and `Video Category`
10 | - **Extract Data**: Extract the above `data sources` using `Polars` `DataFrame`, now we have the `raw` layer, then load the data into `MinIO` `datalake`
11 | - **Transform Data**: From `MinIO`, we use `Apache Spark`, specifically `PySpark`
12 | - convert from `Polars` `DataFrame` to `PySpark` `DataFrame` for processing and calculation, we get `silver` and `gold` layers
13 | - Data stored in `MinIO` is in `.parquet` format, providing better processing performance
14 | - **Load Data**: Load the `gold` layer into the `PostgreSQL` data warehouse, perform additional transform with `dbt` to create an `index`, making video searching faster
15 | - **Serving**: The data was used for visualization using `Metabase` and creating a video recommendation application using `Streamlit`
16 | - **package and orchestrator**: Use `Docker` to containerize and package projects and `Dagster` to coordinate `assets` across different tasks
17 |
18 | ## ⚡ Workflow
19 |
20 |
21 | ## 📦 Technologies
22 | - `MySQL`
23 | - `Youtube API`
24 | - `Polars`
25 | - `MinIO`
26 | - `Apache Spark`
27 | - `PostgreSQL`
28 | - `Dbt`
29 | - `Metabase`
30 | - `Streamlit`
31 | - `Dagster`
32 | - `Docker`
33 | - `Apache Superset`
34 | - `Unittest`
35 | - `Pytest`
36 |
37 | ## 🦄 Features
38 | Here's what you can do with this project:
39 | - You can completely change the logic or create new `assets` in the `data pipeline` as you wish, perform `aggregate` `calculations` on the `assets` in the `pipeline` according to your purposes.
40 | - You can also create new `data charts` as well as change existing `charts` as you like with extremely diverse `chart types` on `Metabase` and `Apache Superset`.
41 | - You can also create new or change my existing `dashboards` as you like
42 | - `Search` videos quickly with any `keyword`, for `Video Recommendation` Apps
43 | - `Search` videos from several regions and languages, not just `English`: `Japan`, `Canada`, `Germany`, `India`, `Russia`
44 | - Recommend videos based on a video's `category` and `tags`
45 |
46 | ## 👩🏽🍳 The Process
47 |
48 |
49 | ## 📚 What I Learned
50 |
51 | During this project, I learned important skills, understood complex ideas, knew how to install and set up popular and useful tools, which brought me closer to becoming a `Data Engineer`.
52 | - **Logical thinking**: I learned how to think like a data person, find the cause of the data `problem` and then come up with the most `reasonable solution` to achieve high data `accuracy`.
53 | - **Architecture**: I understand and grasp the `ideas` and `architecture` of today's popular big data processing tool, `Apache Spark`.
54 | - **Installation**: I learned how to install popular data processing, visualization and storage tools such as: `Metabase`, `Streamlit`, `MinIO`,... with `Docker`
55 | - **Setup**: I know how to setup `Spark Standalone Cluster` using `Docker` with three `Worker Nodes` on my local machine
56 |
57 | ### 📈 Overall Growth:
58 | Each part of this project has helped me understand more about how to build a data engineering and data management project, learn new knowledge, and improve my skills for future work.
59 |
60 | ## 💭 How can it be improved?
61 | - Add more `data sources` to increase data richness.
62 | - Refer to other `data warehouses` besides `PostgreSQL` such as `Amazon Redshift` or `Snowflake`.
63 | - Perform more `cleaning` and `optimization` `processing` of the data.
64 | - Perform more advanced `statistics`, `analysis` and `calculations` with `Apache Spark`.
65 | - Check out other popular `data orchestration` tools like `Apache Airflow`.
66 | - Separate `dbt` into a separate service (separate `container`) in `docker` when the project expands
67 | - Setup `Spark Cluster` on `cloud platforms` instead of on `local machines`
68 | - Refer to `cloud computing` services if the project is more extensive
69 | - Learn about `dbt packages` like `dbt-labs/dbt_utils` to help make the `transformation` process faster and more optimal.
70 |
71 | ## 🚦 Running the Project
72 | To run the project in your local environment, follow these steps:
73 | 1. Run the following command to clone the `repository` to your `local machine`.
74 | ~~~bash
75 | git clone https://github.com/longNguyen010203/Youtube-ETL-Pipeline.git
76 | ~~~
77 |
78 | 2. Run the following commands to build the images from the `Dockerfile`, pull images from `docker hub` and launch services
79 | ~~~bash
80 | make build
81 | make up
82 | ~~~
83 |
84 | 3. Run the following commands to open the `SQL editor` in the `terminal`, turn on `local_infile`, and check that it is enabled
85 | ~~~python
86 | make to_mysql_root
87 |
88 | SET GLOBAL local_infile=TRUE;
89 | SHOW VARIABLES LIKE "local_infile";
90 | exit
91 | ~~~
92 |
93 | 4. Run the following commands to create tables with schema for `MySQL`, load data from `CSV` file to `MySQL` and create tables with schema for `PostgreSQL`
94 | ~~~bash
95 | make mysql_create
96 | make mysql_load
97 | make psql_create
98 | ~~~
99 |
100 | 5. Open [http://localhost:3001](http://localhost:3001) to view `Dagster UI` and click `Materialize all` button to run the Pipeline
101 | 6. Open [http://localhost:9001](http://localhost:9001) to view `MinIO UI` and check the data to be loaded
102 | 7. Open [http://localhost:8080](http://localhost:8080) to view `Spark UI` and check that three `workers` are running
103 | 8. Open [http://localhost:3030](http://localhost:3030) to see charts and `dashboards` on `Metabase`
104 | 9. Open [http://localhost:8501](http://localhost:8501) to try out the `video recommendation` app on `Streamlit`
105 |
106 | ## 🍿 Video
--------------------------------------------------------------------------------
/app/pages/video_detail.py:
--------------------------------------------------------------------------------
1 | from PIL import Image
2 | import streamlit as st
3 | import psycopg2
4 | from PIL import Image
5 | from io import BytesIO
6 | import requests
7 |
8 |
# Page chrome: browser-tab settings followed by a header row (title + logo).
icon = Image.open("./icons/youtube_v2.png", mode="r")

_page_settings = {
    "page_title": "Video Recommender",
    "page_icon": icon,
    "layout": "centered",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_settings)

# Two header columns with relative widths 4 : 2.91.
header_columns = st.columns([4, 2.91])
title, logo = header_columns

with title:
    st.title("YouTube RecoMaster")

with logo:
    # Empty write, kept from the original, emitted before the logo.
    st.write("")
    st.image(icon, width=70)
24 |
def display_video(url, recommended_videos=None):
    """Render the player markup for a video through ``st.markdown``.

    Args:
        url: Link of the video to display.
        recommended_videos: Optional collection of links that are already
            listed as recommendations. Defaults to ``None`` (treated as an
            empty collection) rather than a shared mutable ``[]`` default.

    NOTE(review): the markup strings passed to ``st.markdown`` are empty in
    the text this review was made from — the HTML template was most likely
    lost in extraction. Restore the original template in both branches.
    """
    known_links = recommended_videos if recommended_videos is not None else []

    if url not in known_links:
        st.markdown(
            f'''''',
            unsafe_allow_html=True
        )
    else:
        st.markdown(
            f'''''',
            unsafe_allow_html=True
        )
40 |
@st.cache_resource
def init_connection():
    """Open the PostgreSQL connection described by ``st.secrets["postgres"]``.

    Decorated with ``st.cache_resource`` so the connection object is cached
    by Streamlit instead of being rebuilt on each call.
    """
    postgres_settings = st.secrets["postgres"]
    return psycopg2.connect(**postgres_settings)


# Module-level connection handle used by the query helper in this page.
conn = init_connection()
46 |
@st.cache_data(ttl=600)
def run_query(query, params=None):
    """Execute a SQL statement on the shared connection and return all rows.

    Args:
        query: SQL text; may contain ``%s`` placeholders.
        params: Optional sequence of values bound to the placeholders by the
            database driver (never interpolated into the SQL text by hand).

    Returns:
        The list of row tuples from ``cursor.fetchall()``. Streamlit caches
        the result for 600 seconds.
    """
    with conn.cursor() as cur:
        cur.execute(query, params)
        return cur.fetchall()


query_params = st.experimental_get_query_params()
video_id = query_params.get('video_id', [None])[0]

# Without a video id there is nothing to look up; stop instead of querying.
if video_id is None:
    st.error("No video selected: the `video_id` query parameter is missing.")
    st.stop()

# The id comes from the URL, so it is passed as a bound parameter.
data = run_query(
    """
    select distinct
        title
        , channeltitle
        , categoryname
        , view
        , likes
        , dislike
        , publishedat
        , link_video
        , tags
    from youtube_trending.search_information si
    where video_id = %s;
    """,
    (video_id,),
)

# An unknown id returns no rows; previously this crashed on data[0].
if not data:
    st.error("Video not found.")
    st.stop()

_detail_fields = (
    "title", "channeltitle", "categoryname", "view", "like",
    "dislike", "publishedat", "link_video", "tags",
)
videos = dict(zip(_detail_fields, data[0]))

display_video(videos['link_video'])
st.markdown(f"### {videos['title']}")
view_icon = Image.open("./icons/icons8-view-48.png", mode="r")
like_icon = Image.open("./icons/icons8-like-48.png", mode="r")
dislike_icon = Image.open("./icons/icons8-thumbs-down-skin-type-4-48.png", mode="r")
category_icon = Image.open("./icons/icons8-category-48.png", mode="r")
channel_icon = Image.open("./icons/icons8-channel-48.png", mode="r")
st.write(f"{videos['tags']}", unsafe_allow_html=True)

# One column per statistic: icon on top, value underneath.
channel_col, view_col, like_col, dislike_col, category_col = st.columns([4, 1, 1, 1, 1.3])
with channel_col:
    st.image(channel_icon, width=40)
    st.write(f"{videos['channeltitle']}")
with view_col:
    st.image(view_icon, width=30)
    st.write(f"{videos['view']}")
with like_col:
    st.image(like_icon, width=30)
    st.write(f"{videos['like']}")
with dislike_col:
    st.image(dislike_icon, width=30)
    st.write(f"{videos['dislike']}")
with category_col:
    st.image(category_icon, width=30)
    st.write(f"{videos['categoryname']}")


st.subheader("Recommended Videos:")

# Build the WHERE clause from fixed fragments only; every value is bound.
# Empty tokens are dropped: an empty tag would become LIKE '%%' and match all.
tag_terms = [term for term in (videos['tags'] or "").split(' ') if term]
conditions = ["categoryname = %s", "video_id <> %s"]
bind_values = [videos['categoryname'], video_id]
if tag_terms:
    conditions.append("(" + " OR ".join(["tags LIKE %s"] * len(tag_terms)) + ")")
    bind_values.extend(f"%{term}%" for term in tag_terms)

query = f"""
    select distinct
        video_id
        , title
        , channeltitle
        , categoryname
        , view
        , likes
        , dislike
        , publishedat
        , link_video
        , tags
        , thumbnail_link
    from youtube_trending.search_information
    where {' AND '.join(conditions)}
    limit 10;
"""
data2 = run_query(query, tuple(bind_values))

# fetchall() returns a list (never None), so test for emptiness.
if data2:
    for row_index, row in enumerate(data2):
        (rec_id, rec_title, rec_channel, rec_category, rec_views,
         _likes, _dislikes, _published, _link, _tags, rec_thumbnail) = row

        col1, col2 = st.columns([1, 1])

        with col1:
            # NOTE(review): this markup string is empty in the reviewed text;
            # the original HTML was probably lost in extraction.
            st.markdown(
                f'',
                unsafe_allow_html=True,
            )
            try:
                reply = requests.get(rec_thumbnail, timeout=10)
                reply.raise_for_status()
                st.image(Image.open(BytesIO(reply.content)), use_column_width=True)
            except (requests.RequestException, OSError):
                # A missing/broken thumbnail should not take the page down.
                st.write("Thumbnail unavailable")

        with col2:
            st.write("")
            st.markdown(f"""

{rec_title}
channel: {rec_channel}
category: {rec_category}
views: {rec_views}

""", unsafe_allow_html=True)
            st.write("")
            # The row index keeps widget keys unique even if a video id repeats.
            st.button("Detail", key=f"detail-{row_index}-{rec_id}")

        st.write("---")

else:
    st.write("Not found")
187 |
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/assets/warehouse.py:
--------------------------------------------------------------------------------
1 | import polars as pl
2 | from dagster import AssetExecutionContext
3 |
4 | from dagster import (
5 | multi_asset,
6 | AssetIn,
7 | AssetOut,
8 | MetadataValue,
9 | AssetExecutionContext,
10 | Output
11 | )
12 |
13 | from ..partitions import monthly_partitions
14 |
15 |
GROUP_NAME = "warehouse"


@multi_asset(
    name="videoCategory",
    compute_kind="postgres",
    required_resource_keys={"psql_io_manager"},
    ins={
        "gold_videoCategory": AssetIn(key_prefix=["gold", "youtube"]),
    },
    outs={
        "videoCategory": AssetOut(
            io_manager_key="psql_io_manager",
            key_prefix=["warehouse", "gold"],
            group_name=GROUP_NAME,
            metadata={
                "primary_keys": ["categoryId"],
                "columns": ["categoryId", "categoryName"],
            },
        )
    },
)
def videoCategory(context: AssetExecutionContext,
                  gold_videoCategory: pl.DataFrame
                  ) -> Output[pl.DataFrame]:
    """
    Forward the gold videoCategory frame unchanged to the output handled by
    ``psql_io_manager``, logging its shape and attaching summary metadata.
    """
    row_total, column_total = gold_videoCategory.shape
    context.log.info(f"Load videoCategory data Success with shape {gold_videoCategory.shape}")

    summary = {
        "table name": MetadataValue.text("videoCategory"),
        "record count": MetadataValue.int(row_total),
        "column count": MetadataValue.int(column_total),
        "columns": gold_videoCategory.columns,
    }
    return Output(value=gold_videoCategory, metadata=summary)
62 |
63 |
@multi_asset(
    name="linkVideos",
    compute_kind="postgres",
    required_resource_keys={"psql_io_manager"},
    ins={
        "gold_linkVideos": AssetIn(key_prefix=["gold", "youtube"]),
    },
    outs={
        "linkVideos": AssetOut(
            io_manager_key="psql_io_manager",
            key_prefix=["warehouse", "gold"],
            group_name=GROUP_NAME,
            metadata={
                "primary_keys": ["video_id"],
                "columns": ["video_id", "link_video"],
            },
        )
    },
)
def linkVideos(context: AssetExecutionContext,
               gold_linkVideos: pl.DataFrame
               ) -> Output[pl.DataFrame]:
    """
    Forward the gold linkVideos frame unchanged to the output handled by
    ``psql_io_manager``, logging its shape and attaching summary metadata.
    """
    row_total, column_total = gold_linkVideos.shape
    context.log.info(f"Load linkVideos data Success with shape {gold_linkVideos.shape}")

    summary = {
        "table name": MetadataValue.text("linkVideos"),
        "record count": MetadataValue.int(row_total),
        "column count": MetadataValue.int(column_total),
        "columns": gold_linkVideos.columns,
    }
    return Output(value=gold_linkVideos, metadata=summary)
108 |
109 |
@multi_asset(
    name="metricVideos",
    compute_kind="postgres",
    partitions_def=monthly_partitions,
    required_resource_keys={"psql_io_manager"},
    ins={
        "gold_metric_trending": AssetIn(key_prefix=["gold", "youtube"]),
    },
    outs={
        "metricVideos": AssetOut(
            io_manager_key="psql_io_manager",
            key_prefix=["warehouse", "gold"],
            group_name=GROUP_NAME,
            metadata={
                "primary_keys": ["video_id"],
                "columns": [
                    "video_id",
                    "publishedAt",
                    "trending_date",
                    "channelId",
                    "categoryId",
                    "view_count",
                    "likes",
                    "dislikes",
                    "comment_count",
                ],
            },
        )
    },
)
def metricVideos(context: AssetExecutionContext,
                 gold_metric_trending: pl.DataFrame
                 ) -> Output[pl.DataFrame]:
    """
    Forward the gold metric_trending frame unchanged to the output handled by
    ``psql_io_manager``, logging its shape and attaching summary metadata.
    The asset is partitioned with ``monthly_partitions``.
    """
    row_total, column_total = gold_metric_trending.shape
    context.log.info(f"Load metricVideos data Success with shape {gold_metric_trending.shape}")

    summary = {
        "table name": MetadataValue.text("metricVideos"),
        "record count": MetadataValue.int(row_total),
        "column count": MetadataValue.int(column_total),
        "columns": gold_metric_trending.columns,
    }
    return Output(value=gold_metric_trending, metadata=summary)
162 |
163 |
@multi_asset(
    name="informationVideos",
    compute_kind="postgres",
    partitions_def=monthly_partitions,
    required_resource_keys={"psql_io_manager"},
    ins={
        "gold_information_trending": AssetIn(key_prefix=["gold", "youtube"]),
    },
    outs={
        "informationVideos": AssetOut(
            io_manager_key="psql_io_manager",
            key_prefix=["warehouse", "gold"],
            group_name=GROUP_NAME,
            metadata={
                "primary_keys": ["video_id"],
                "columns": [
                    "video_id",
                    "title",
                    "channelId",
                    "channelTitle",
                    "categoryId",
                    "tags",
                    "thumbnail_link",
                    "comments_disabled",
                    "ratings_disabled",
                ],
            },
        )
    },
)
def informationVideos(context: AssetExecutionContext,
                      gold_information_trending: pl.DataFrame
                      ) -> Output[pl.DataFrame]:
    """
    Forward the gold information_trending frame unchanged to the output
    handled by ``psql_io_manager``, logging its shape and attaching summary
    metadata. The asset is partitioned with ``monthly_partitions``.
    """
    row_total, column_total = gold_information_trending.shape
    context.log.info(f"Load informationVideos data Success with shape {gold_information_trending.shape}")

    summary = {
        "table name": MetadataValue.text("informationVideos"),
        "record count": MetadataValue.int(row_total),
        "column count": MetadataValue.int(column_total),
        "columns": gold_information_trending.columns,
    }
    return Output(value=gold_information_trending, metadata=summary)
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/resources/spark_io_manager.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Union
2 | from datetime import datetime
3 | from dagster import IOManager, InputContext, OutputContext
4 |
5 | import os
6 | import polars as pl
7 | import pandas as pd
8 | from contextlib import contextmanager
9 | from pyspark.sql import SparkSession, DataFrame
10 | from pyspark import SparkConf
11 | from .minio_io_manager import connect_minio
12 |
13 |
@contextmanager
def create_spark_session(config, appName=None):
    """Yield a SparkSession configured for the standalone master and S3A storage.

    Args:
        config: Mapping that provides ``endpoint_url``, ``aws_access_key_id``
            and ``aws_secret_access_key`` for the S3A settings.
        appName: Application name passed to the session builder.

    Yields:
        The session returned by ``getOrCreate()``. The session is not stopped
        when the context exits.

    Raises:
        RuntimeError: if the body of the ``with`` block raises; the original
            exception is attached as ``__cause__``. (The previous code raised
            a plain string, which is itself a ``TypeError`` in Python 3 and
            hid the real error.)
    """
    spark = (
        SparkSession.builder.appName(appName)
        .master("spark://spark-master:7077")
        .config("spark.driver.memory", "4g")
        .config("spark.executor.memory", "4g")
        # .config("spark.cores.max", "4")
        # .config("spark.executor.cores", "4")
        .config(
            "spark.jars",
            "/usr/local/spark/jars/delta-core_2.12-2.2.0.jar,/usr/local/spark/jars/hadoop-aws-3.3.2.jar,/usr/local/spark/jars/delta-storage-2.2.0.jar,/usr/local/spark/jars/aws-java-sdk-1.12.367.jar,/usr/local/spark/jars/s3-2.18.41.jar,/usr/local/spark/jars/aws-java-sdk-bundle-1.11.1026.jar",
        )
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.hadoop.fs.s3a.endpoint", "http://" + config["endpoint_url"])
        .config("spark.hadoop.fs.s3a.access.key", str(config["aws_access_key_id"]))
        .config("spark.hadoop.fs.s3a.secret.key", str(config["aws_secret_access_key"]))
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.hadoop.fs.connection.ssl.enabled", "false")
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")
        .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true")
        .getOrCreate()
    )

    try:
        yield spark
    except Exception as e:
        # Raise a real exception and keep the original one chained to it.
        raise RuntimeError(f"Error Pyspark: {e}") from e
45 |
class SparkIOManager(IOManager):
    """Dagster IO manager that moves Spark DataFrames through MinIO as parquet.

    ``handle_output`` writes the frame to a temporary path and uploads it;
    ``load_input`` downloads the object and reads it back with Spark.
    """

    def __init__(self, config) -> None:
        # Mapping with the MinIO / S3A settings, including the "bucket" name.
        self._config = config

    def get_spark_session(self, context, appName=None) -> SparkSession:
        """Return a SparkSession built from this manager's config.

        ``context`` is only used for logging and must expose ``.log``.
        """
        with create_spark_session(self._config, appName) as spark:
            context.log.info("Return Object SparkSession")
            return spark

    def _get_path(self, context: Union[InputContext, OutputContext]):
        """Derive the object key and a timestamped temp path from the asset key.

        The asset key must have exactly three parts (layer, schema, table);
        a leading ``"<layer>_"`` is stripped from the table name.
        """
        layer, schema, table = context.asset_key.path
        key = "/".join([layer, schema, table.replace(f"{layer}_", "")])
        tmp_file_path = "/tmp/file-{}-{}.parquet".format(
            datetime.today().strftime("%Y%m%d%H%M%S"),
            "-".join(context.asset_key.path)
        )
        return key, tmp_file_path

    def _object_key(self, context, key_name, tmp_file_path, partitioned) -> str:
        """Return the final object key, adding the ``YYYYMM`` partition if any."""
        if partitioned:
            start, _ = context.asset_partitions_time_window
            object_key = os.path.join(key_name, f"{start.strftime('%Y%m')}.parquet")
        else:
            object_key = f"{key_name}.parquet"
        context.log.info(f"INFO: {object_key}, {tmp_file_path}")
        return object_key

    @staticmethod
    def _ensure_bucket(client, bucket_name) -> None:
        """Create the bucket when it does not exist yet."""
        if not client.bucket_exists(bucket_name):
            client.make_bucket(bucket_name)
        else:
            print(f"Bucket {bucket_name} already exists")

    def handle_output(self, context: OutputContext, obj: DataFrame):
        """Write ``obj`` as parquet to a temp path and upload it to MinIO."""
        key_name, tmp_file_path = self._get_path(context)
        bucket_name = self._config.get("bucket")
        ## ====>
        file_path = "s3a://lakehouse/" + key_name
        context.log.info(f"file_path: {file_path}")
        context.log.info(f"key_name: {key_name}")

        key_name = self._object_key(
            context, key_name, tmp_file_path, context.has_partition_key
        )

        # NOTE(review): Spark's DataFrameWriter.parquet normally produces a
        # directory of part files, while fput_object/os.remove below treat
        # tmp_file_path as a single file — confirm this works as deployed.
        obj.write.mode('overwrite').parquet(tmp_file_path)

        with connect_minio(self._config) as client:
            self._ensure_bucket(client, bucket_name)
            client.fput_object(bucket_name, key_name, tmp_file_path)
            row_count = obj.count()
            context.add_output_metadata(
                {
                    "path": key_name,
                    "records": row_count,
                    "tmp": tmp_file_path
                }
            )
            # clean up tmp file
            os.remove(tmp_file_path)

    def load_input(self, context: InputContext) -> DataFrame:
        """Download the upstream parquet object and read it as a Spark DataFrame."""
        key_name, tmp_file_path = self._get_path(context)
        bucket_name = self._config.get("bucket")

        key_name = self._object_key(
            context, key_name, tmp_file_path, context.has_asset_partitions
        )

        with connect_minio(self._config) as client:
            self._ensure_bucket(client, bucket_name)

            context.log.info(f"INFO -> bucket_name: {bucket_name}")
            context.log.info(f"INFO -> key_name: {key_name}")
            context.log.info(f"INFO -> tmp_file_path: {tmp_file_path}")

            client.fget_object(bucket_name, key_name, tmp_file_path)

            # Pass the Dagster context (not ``self``): get_spark_session logs
            # through ``context.log``, which this manager does not have.
            spark: SparkSession = self.get_spark_session(context, appName="Read-Parquet")
            # The temp file is left in place because Spark reads lazily.
            return spark.read.parquet(tmp_file_path)
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/resources/youtube_io_manager.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | from contextlib import contextmanager
4 | from datetime import datetime, timedelta
5 | from typing import Union, List
6 |
7 | import polars as pl
8 | from googleapiclient.discovery import build
9 | from googleapiclient.errors import HttpError
10 | from dagster import IOManager, InputContext, OutputContext
11 | from .minio_io_manager import connect_minio
12 | from .. import constants
13 |
14 |
@contextmanager
def youtube_client(config: dict):
    """Yield a Google API client built from ``config``.

    Args:
        config: Mapping with ``api_service_name``, ``api_version`` and
            ``api_key``; they are passed to ``build`` as ``serviceName``,
            ``version`` and ``developerKey``.

    Raises:
        RuntimeError: if the ``with`` body raises ``HttpError``; the message
            carries the HTTP status and response content and the original
            error is chained as ``__cause__``. (Raising a plain string, as the
            previous code did, is a ``TypeError`` in Python 3.)
    """
    youtube = build(
        serviceName=config["api_service_name"],
        version=config["api_version"],
        developerKey=config["api_key"]
    )
    try:
        yield youtube
    except HttpError as e:
        raise RuntimeError(
            'An HTTP error %d occurred:\n%s' % (e.resp.status, e.content)
        ) from e
30 |
31 |
class YoutubeIOManager(IOManager):
    """IO manager that reads monthly bronze parquet files from MinIO and
    enriches them through the YouTube Data API client."""

    def __init__(self, config) -> None:
        # Mapping with the MinIO settings ("bucket", ...) and the API settings.
        self._config = config

    def _get_path(self, context: Union[InputContext, OutputContext]):
        """Build one object key and one temp path per month in the date range.

        The range comes from ``constants.START_DATE``/``END_DATE``; only the
        schema part of the asset key is used, and layer/table are fixed to
        ``bronze``/``youtube_trending_data``.

        Returns:
            ``(key_names, tmp_file_paths)``: two parallel lists with exactly
            one entry per calendar month (previously one per day, so every
            month was repeated for each of its days).
        """
        start_date = datetime.strptime(constants.START_DATE, "%Y-%m-%d")
        end_date = datetime.strptime(constants.END_DATE, "%Y-%m-%d")

        # The asset key must have three parts; only the schema is kept.
        _, schema, _ = context.asset_key.path
        key = "/".join(["bronze", schema, "youtube_trending_data"])

        key_names: list[str] = []
        tmp_file_paths: list[str] = []
        seen_months: set[str] = set()

        for offset in range((end_date - start_date).days + 1):
            month = (start_date + timedelta(days=offset)).strftime("%Y%m")
            if month in seen_months:
                continue
            seen_months.add(month)

            # e.g. "bronze/youtube/youtube_trending_data/202011.pq"
            key_name = f"{key}/{month}.pq"
            tmp_file_path = "/tmp/file-{}-{}.parquet".format(
                datetime.today().strftime("%Y%m%d%H%M%S"),
                month
            )
            context.log.info(f"INFO -> key_name: {key_name}")
            context.log.info(f"INFO -> tmp_file_path: {tmp_file_path}")

            key_names.append(key_name)
            tmp_file_paths.append(tmp_file_path)

        return key_names, tmp_file_paths

    def list_of_list(self, obj: pl.Series) -> list[list[str]]:
        """Split ``obj`` into consecutive chunks of at most 50 items.

        No empty chunk is produced: an empty input yields ``[]`` and a length
        that is a multiple of 50 no longer gets a trailing ``[]``.
        """
        items = list(obj)
        return [items[start:start + 50] for start in range(0, len(items), 50)]

    @staticmethod
    def _ensure_bucket(client, bucket_name) -> None:
        """Create the bucket when it does not exist yet."""
        if not client.bucket_exists(bucket_name):
            client.make_bucket(bucket_name)
        else:
            print(f"Bucket {bucket_name} already exists")

    def get_DataFrame(self, context, field: str) -> pl.Series:
        """Return the unique values of column ``field`` across all monthly files."""
        bucket_name = self._config.get("bucket")
        key_names, tmp_file_paths = self._get_path(context)

        list_dfs: list[pl.Series] = []
        # Keep every download inside the client's context manager.
        with connect_minio(self._config) as client:
            self._ensure_bucket(client, bucket_name)

            for key_name, tmp_file_path in zip(key_names, tmp_file_paths):
                client.fget_object(bucket_name, key_name, tmp_file_path)
                try:
                    list_dfs.append(pl.read_parquet(tmp_file_path)[field].unique())
                finally:
                    # Remove each temp file, not only the last one.
                    os.remove(tmp_file_path)
                context.log.info(f"INFO -> key_name: {key_name}, tmp_file_path: {tmp_file_path}")
                time.sleep(0.5)

        return pl.concat(list_dfs).unique()

    def downLoad_videoCategories(self, context, obj: pl.DataFrame) -> pl.DataFrame:
        """Look up the category name of every distinct ``categoryId`` in ``obj``.

        Returns:
            DataFrame with columns ``categoryId`` and ``categoryName``. Ids
            for which the API returns no item are skipped with a warning, so
            both columns always have the same length.
        """
        category_ids = obj["categoryId"].unique()

        categoryNames: list[str] = []
        categoryIds: list[str] = []

        with youtube_client(self._config) as service:
            context.log.info("Divide categoryIds to multiple list categoryIds")

            for categoryId in list(category_ids):
                request = service.videoCategories().list(
                    part="snippet",
                    id=categoryId
                )
                response = request.execute()

                items = response.get("items") or []
                if not items:
                    context.log.warning(
                        f"No videoCategory returned for id {categoryId}; skipped"
                    )
                    continue

                categoryIds.append(str(items[0]["id"]))
                categoryNames.append(str(items[0]["snippet"]["title"]))

        return pl.DataFrame(
            {
                "categoryId": categoryIds,
                "categoryName": categoryNames
            }
        )

    def downLoad_linkVideos(self, context, obj: pl.DataFrame) -> pl.DataFrame:
        """Fetch the embed link of every distinct ``video_id`` in ``obj``.

        Returns:
            DataFrame with columns ``videoId`` and ``link_video``.
        """
        video_ids = obj["video_id"].unique()

        link_videos: list[str] = []
        videoIds: list[str] = []

        with youtube_client(self._config) as service:
            context.log.info("Divide videoId to multiple list videoId")

            # Ids are requested in batches of at most 50 —
            # presumably the API's per-request limit; TODO confirm.
            for batch in self.list_of_list(video_ids):
                request = service.videos().list(
                    part="player",
                    id=",".join(batch)
                )
                response = request.execute()

                for data in response["items"]:
                    videoIds.append(str(data["id"]))
                    # Fixed-offset slice of the embed HTML; assumes the markup
                    # layout is stable — TODO confirm.
                    link_videos.append(str(data["player"]["embedHtml"][40:74]))

        return pl.DataFrame(
            {
                "videoId": videoIds,
                "link_video": link_videos
            }
        )

    def handle_output(self, context: OutputContext, obj: pl.DataFrame):
        """Intentionally a no-op: this manager does not persist outputs."""
        pass

    def load_input(self, context: InputContext) -> pl.DataFrame:
        """Download every monthly parquet file and return them concatenated.

        ``_get_path`` returns lists, so each (key, temp path) pair is fetched
        on its own; the previous code passed the lists themselves to
        ``fget_object``.
        """
        bucket_name = self._config.get("bucket")
        key_names, tmp_file_paths = self._get_path(context)

        frames: list[pl.DataFrame] = []
        with connect_minio(self._config) as client:
            self._ensure_bucket(client, bucket_name)

            for key_name, tmp_file_path in zip(key_names, tmp_file_paths):
                client.fget_object(bucket_name, key_name, tmp_file_path)
                try:
                    frames.append(pl.read_parquet(tmp_file_path))
                finally:
                    os.remove(tmp_file_path)

        return pl.concat(frames)
206 |
207 |
208 |
209 |
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/assets/gold.py:
--------------------------------------------------------------------------------
1 | import os
2 | import polars as pl
3 | from datetime import datetime
4 | from pyspark.sql import DataFrame
5 |
6 | from dagster import (
7 | multi_asset,
8 | AssetIn,
9 | AssetOut,
10 | MetadataValue,
11 | AssetExecutionContext,
12 | Output
13 | )
14 |
15 | from ..partitions import monthly_partitions
16 | from ..resources.spark_io_manager import create_spark_session
17 |
18 |
GROUP_NAME = "gold"


@multi_asset(
    name="gold_videoCategory",
    compute_kind="PySpark",
    required_resource_keys={"spark_io_manager"},
    ins={
        "silver_videoCategory_cleaned": AssetIn(
            key_prefix=["silver", "youtube"],
            input_manager_key="spark_io_manager",
        )
    },
    outs={
        "gold_videoCategory": AssetOut(
            io_manager_key="spark_io_manager",
            key_prefix=["gold", "youtube"],
            group_name=GROUP_NAME,
            metadata={
                "primary_keys": ["categoryId"],
                "columns": ["categoryId", "categoryName"],
            },
        )
    },
)
def gold_videoCategory(context: AssetExecutionContext,
                       silver_videoCategory_cleaned: pl.DataFrame
                       ) -> Output[DataFrame]:
    """
    Convert the cleaned silver videoCategory frame into a Spark DataFrame and
    emit it, with summary metadata, through ``spark_io_manager``.
    """
    # MinIO/S3A settings are read from the environment for the Spark session.
    session_settings = {
        "endpoint_url": os.getenv("MINIO_ENDPOINT"),
        "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
    }
    app_name = f"gold_videoCategory-{datetime.today()}"

    with create_spark_session(session_settings, app_name) as spark:
        as_pandas = silver_videoCategory_cleaned.to_pandas()
        spark_df: DataFrame = spark.createDataFrame(as_pandas)
        context.log.info(f"Load {context.asset_key.path[-1]} to gold layer success 🙂")

        summary = {
            "file name": MetadataValue.text("videoCategory.pq"),
            "record count": MetadataValue.int(spark_df.count()),
            "column count": MetadataValue.int(len(spark_df.columns)),
            "columns": spark_df.columns,
        }
        return Output(value=spark_df, metadata=summary)
75 |
76 |
@multi_asset(
    name="gold_linkVideos",
    compute_kind="PySpark",
    required_resource_keys={"spark_io_manager"},
    ins={
        "silver_linkVideos_cleaned": AssetIn(
            key_prefix=["silver", "youtube"],
            input_manager_key="spark_io_manager",
        )
    },
    outs={
        "gold_linkVideos": AssetOut(
            io_manager_key="spark_io_manager",
            key_prefix=["gold", "youtube"],
            group_name=GROUP_NAME,
            metadata={
                "primary_keys": ["video_id"],
                "columns": ["video_id", "link_video"],
            },
        )
    },
)
def gold_linkVideos(context: AssetExecutionContext,
                    silver_linkVideos_cleaned: pl.DataFrame
                    ) -> Output[DataFrame]:
    """
    Convert the cleaned silver linkVideos frame into a Spark DataFrame and
    emit it, with summary metadata, through ``spark_io_manager``.
    """
    # MinIO/S3A settings are read from the environment for the Spark session.
    session_settings = {
        "endpoint_url": os.getenv("MINIO_ENDPOINT"),
        "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
    }
    app_name = f"gold_linkVideos-{datetime.today()}"

    with create_spark_session(session_settings, app_name) as spark:
        as_pandas = silver_linkVideos_cleaned.to_pandas()
        spark_df: DataFrame = spark.createDataFrame(as_pandas)
        context.log.info(f"Load {context.asset_key.path[-1]} to gold layer success 🙂")

        summary = {
            "file name": MetadataValue.text("linkVideos.pq"),
            "record count": MetadataValue.int(spark_df.count()),
            "column count": MetadataValue.int(len(spark_df.columns)),
            "columns": spark_df.columns,
        }
        return Output(value=spark_df, metadata=summary)
132 |
133 |
@multi_asset(
    ins={
        "silver_trending_cleaned": AssetIn(
            key_prefix=["silver", "youtube"],
            input_manager_key="spark_io_manager"
        )
    },
    outs={
        "gold_metric_trending": AssetOut(
            key_prefix=["gold", "youtube"],
            io_manager_key="spark_io_manager",
            metadata={
                "primary_keys": [
                    "video_id"
                ],
                "columns": [
                    "video_id",
                    "publishedAt",
                    "trending_date",
                    "channelId",
                    "categoryId",
                    "view_count",
                    "likes",
                    "dislikes",
                    "comment_count"
                ]
            },
            group_name=GROUP_NAME
        ),
        "gold_information_trending": AssetOut(
            key_prefix=["gold", "youtube"],
            io_manager_key="spark_io_manager",
            metadata={
                "primary_keys": [
                    "video_id"
                ],
                "columns": [
                    "video_id",
                    "title",
                    "channelId",
                    "channelTitle",
                    "categoryId",
                    "tags",
                    "thumbnail_link",
                    "comments_disabled",
                    "ratings_disabled",
                ]
            },
            group_name=GROUP_NAME
        ),
    },
    name="gold_metric_trending",
    required_resource_keys={"spark_io_manager"},
    partitions_def=monthly_partitions,
    compute_kind="pyspark"
)
def gold_metric_trending(context: AssetExecutionContext,
                         silver_trending_cleaned: pl.DataFrame
                         ):
    """
    Split the cleaned silver trending frame into a metric frame and an
    information frame and emit both as Spark DataFrames.

    Returns:
        A tuple of two ``Output`` objects, one for ``gold_metric_trending``
        and one for ``gold_information_trending``.
    """
    metric_columns = [
        "video_id",
        "publishedAt",
        "trending_date",
        "channelId",
        "categoryId",
        "view_count",
        "likes",
        "dislikes",
        "comment_count",
    ]
    information_columns = [
        "video_id",
        "title",
        "channelId",
        "channelTitle",
        "categoryId",
        "tags",
        "thumbnail_link",
        "comments_disabled",
        "ratings_disabled",
    ]

    CONFIG = {
        "endpoint_url": os.getenv("MINIO_ENDPOINT"),
        "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
    }

    with create_spark_session(
        CONFIG, "gold_metric_trending-{}".format(datetime.today())
    ) as spark:
        # Convert to pandas first, as the sibling gold assets do, instead of
        # handing the polars frame to createDataFrame directly.
        metric: DataFrame = spark.createDataFrame(
            silver_trending_cleaned.select(metric_columns).to_pandas()
        )
        information: DataFrame = spark.createDataFrame(
            silver_trending_cleaned.select(information_columns).to_pandas()
        )
        # This asset has two outputs, so name them explicitly rather than
        # relying on a single context.asset_key.
        context.log.info(
            "Load gold_metric_trending and gold_information_trending to gold layer success 🙂"
        )

        return Output(
            value=metric,
            output_name="gold_metric_trending",
            metadata={
                "folder name": MetadataValue.text("metric_trending"),
                "record count": MetadataValue.int(metric.count()),
                "column count": MetadataValue.int(len(metric.columns)),
                "columns": metric.columns
            }
        ), Output(
            value=information,
            output_name="gold_information_trending",
            metadata={
                "folder name": MetadataValue.text("information_trending"),
                "record count": MetadataValue.int(information.count()),
                "column count": MetadataValue.int(len(information.columns)),
                "columns": information.columns
            }
        ),
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/assets/bronze.py:
--------------------------------------------------------------------------------
1 | import polars as pl
2 | from ..partitions import monthly_partitions
3 |
4 | from dagster import (
5 | asset,
6 | Output,
7 | AssetIn,
8 | AssetOut,
9 | multi_asset,
10 | MetadataValue,
11 | AssetExecutionContext
12 | )
13 |
14 |
15 | GROUP_NAME = "bronze"
16 |
@asset(
    name="bronze_CA_youtube_trending",
    key_prefix=["bronze", "youtube"],
    group_name=GROUP_NAME,
    compute_kind="SQL",
    io_manager_key="minio_io_manager",
    required_resource_keys={"mysql_io_manager"},
)
def bronze_CA_youtube_trending(context: AssetExecutionContext) -> Output[pl.DataFrame]:
    """
    Extract table 'CA_youtube_trending_data' from MySQL for the bronze layer.

    The whole table is read through the ``mysql_io_manager`` resource, a
    constant ``country_code`` column holding "CA" is appended, and the frame
    is returned wrapped in an ``Output``. The asset is declared with
    ``io_manager_key="minio_io_manager"``, so that IO manager handles the
    returned value.

    Args:
        context: Dagster execution context; supplies the resource and logger.

    Returns:
        Output carrying the tagged frame plus metadata (file name, number of
        columns, number of records).
    """
    mysql = context.resources.mysql_io_manager
    extracted: pl.DataFrame = mysql.extract_data(
        """ SELECT * FROM CA_youtube_trending_data; """
    )
    context.log.info("Extract table 'CA_youtube_trending_data' from MySQL Success")

    # Tag every row with its origin so the regional tables can be merged later.
    tagged = extracted.with_columns(pl.lit("CA").alias("country_code"))
    n_records, n_columns = tagged.shape

    return Output(
        value=tagged,
        metadata={
            "file name": MetadataValue.text("CA_youtube_trending.pq"),
            "number columns": MetadataValue.int(n_columns),
            "number records": MetadataValue.int(n_records),
        },
    )
44 |
45 |
@asset(
    name="bronze_DE_youtube_trending",
    key_prefix=["bronze", "youtube"],
    group_name=GROUP_NAME,
    compute_kind="SQL",
    io_manager_key="minio_io_manager",
    required_resource_keys={"mysql_io_manager"},
)
def bronze_DE_youtube_trending(context: AssetExecutionContext) -> Output[pl.DataFrame]:
    """
    Extract table 'DE_youtube_trending_data' from MySQL for the bronze layer.

    The whole table is read through the ``mysql_io_manager`` resource, a
    constant ``country_code`` column holding "DE" is appended, and the frame
    is returned wrapped in an ``Output``. The asset is declared with
    ``io_manager_key="minio_io_manager"``, so that IO manager handles the
    returned value.

    Args:
        context: Dagster execution context; supplies the resource and logger.

    Returns:
        Output carrying the tagged frame plus metadata (file name, number of
        columns, number of records).
    """
    mysql = context.resources.mysql_io_manager
    extracted: pl.DataFrame = mysql.extract_data(
        """ SELECT * FROM DE_youtube_trending_data; """
    )
    context.log.info("Extract table 'DE_youtube_trending_data' from MySQL Success")

    # Tag every row with its origin so the regional tables can be merged later.
    tagged = extracted.with_columns(pl.lit("DE").alias("country_code"))
    n_records, n_columns = tagged.shape

    return Output(
        value=tagged,
        metadata={
            "file name": MetadataValue.text("DE_youtube_trending.pq"),
            "number columns": MetadataValue.int(n_columns),
            "number records": MetadataValue.int(n_records),
        },
    )
73 |
74 |
@asset(
    name="bronze_IN_youtube_trending",
    key_prefix=["bronze", "youtube"],
    group_name=GROUP_NAME,
    compute_kind="SQL",
    io_manager_key="minio_io_manager",
    required_resource_keys={"mysql_io_manager"},
)
def bronze_IN_youtube_trending(context: AssetExecutionContext) -> Output[pl.DataFrame]:
    """
    Extract table 'IN_youtube_trending_data' from MySQL for the bronze layer.

    The whole table is read through the ``mysql_io_manager`` resource, a
    constant ``country_code`` column holding "IN" is appended, and the frame
    is returned wrapped in an ``Output``. The asset is declared with
    ``io_manager_key="minio_io_manager"``, so that IO manager handles the
    returned value.

    Args:
        context: Dagster execution context; supplies the resource and logger.

    Returns:
        Output carrying the tagged frame plus metadata (file name, number of
        columns, number of records).
    """
    mysql = context.resources.mysql_io_manager
    extracted: pl.DataFrame = mysql.extract_data(
        """ SELECT * FROM IN_youtube_trending_data; """
    )
    context.log.info("Extract table 'IN_youtube_trending_data' from MySQL Success")

    # Tag every row with its origin so the regional tables can be merged later.
    tagged = extracted.with_columns(pl.lit("IN").alias("country_code"))
    n_records, n_columns = tagged.shape

    return Output(
        value=tagged,
        metadata={
            "file name": MetadataValue.text("IN_youtube_trending.pq"),
            "number columns": MetadataValue.int(n_columns),
            "number records": MetadataValue.int(n_records),
        },
    )
102 |
103 |
@asset(
    name="bronze_JP_youtube_trending",
    key_prefix=["bronze", "youtube"],
    group_name=GROUP_NAME,
    compute_kind="SQL",
    io_manager_key="minio_io_manager",
    required_resource_keys={"mysql_io_manager"},
)
def bronze_JP_youtube_trending(context: AssetExecutionContext) -> Output[pl.DataFrame]:
    """
    Extract table 'JP_youtube_trending_data' from MySQL for the bronze layer.

    The whole table is read through the ``mysql_io_manager`` resource, a
    constant ``country_code`` column holding "JP" is appended, and the frame
    is returned wrapped in an ``Output``. The asset is declared with
    ``io_manager_key="minio_io_manager"``, so that IO manager handles the
    returned value.

    Args:
        context: Dagster execution context; supplies the resource and logger.

    Returns:
        Output carrying the tagged frame plus metadata (file name, number of
        columns, number of records).
    """
    mysql = context.resources.mysql_io_manager
    extracted: pl.DataFrame = mysql.extract_data(
        """ SELECT * FROM JP_youtube_trending_data; """
    )
    context.log.info("Extract table 'JP_youtube_trending_data' from MySQL Success")

    # Tag every row with its origin so the regional tables can be merged later.
    tagged = extracted.with_columns(pl.lit("JP").alias("country_code"))
    n_records, n_columns = tagged.shape

    return Output(
        value=tagged,
        metadata={
            "file name": MetadataValue.text("JP_youtube_trending.pq"),
            "number columns": MetadataValue.int(n_columns),
            "number records": MetadataValue.int(n_records),
        },
    )
131 |
132 |
@asset(
    name="bronze_RU_youtube_trending",
    key_prefix=["bronze", "youtube"],
    group_name=GROUP_NAME,
    compute_kind="SQL",
    io_manager_key="minio_io_manager",
    required_resource_keys={"mysql_io_manager"},
)
def bronze_RU_youtube_trending(context: AssetExecutionContext) -> Output[pl.DataFrame]:
    """
    Extract table 'RU_youtube_trending_data' from MySQL for the bronze layer.

    The whole table is read through the ``mysql_io_manager`` resource, a
    constant ``country_code`` column holding "RU" is appended, and the frame
    is returned wrapped in an ``Output``. The asset is declared with
    ``io_manager_key="minio_io_manager"``, so that IO manager handles the
    returned value.

    Args:
        context: Dagster execution context; supplies the resource and logger.

    Returns:
        Output carrying the tagged frame plus metadata (file name, number of
        columns, number of records).
    """
    mysql = context.resources.mysql_io_manager
    extracted: pl.DataFrame = mysql.extract_data(
        """ SELECT * FROM RU_youtube_trending_data; """
    )
    context.log.info("Extract table 'RU_youtube_trending_data' from MySQL Success")

    # Tag every row with its origin so the regional tables can be merged later.
    tagged = extracted.with_columns(pl.lit("RU").alias("country_code"))
    n_records, n_columns = tagged.shape

    return Output(
        value=tagged,
        metadata={
            "file name": MetadataValue.text("RU_youtube_trending.pq"),
            "number columns": MetadataValue.int(n_columns),
            "number records": MetadataValue.int(n_records),
        },
    )
160 |
161 |
@asset(
    name="bronze_linkVideos_trending",
    key_prefix=["bronze", "youtube"],
    ins={
        "bronze_CA_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]),
        "bronze_DE_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]),
        "bronze_IN_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]),
    },
    group_name=GROUP_NAME,
    compute_kind="Youtube API",
    io_manager_key="minio_io_manager",
    required_resource_keys={"youtube_io_manager"},
)
def bronze_linkVideos_trending(
    context: AssetExecutionContext,
    bronze_CA_youtube_trending: pl.DataFrame,
    bronze_DE_youtube_trending: pl.DataFrame,
    bronze_IN_youtube_trending: pl.DataFrame,
) -> Output[pl.DataFrame]:
    """
    Download video links from the Youtube API by video id.

    The CA, DE and IN bronze frames are stacked with ``pl.concat`` and passed,
    together with the execution context, to
    ``youtube_io_manager.downLoad_linkVideos``; whatever that call returns is
    emitted as this asset's value.

    Args:
        context: Dagster execution context; supplies the resource and logger.
        bronze_CA_youtube_trending: Bronze trending frame for CA.
        bronze_DE_youtube_trending: Bronze trending frame for DE.
        bronze_IN_youtube_trending: Bronze trending frame for IN.

    Returns:
        Output carrying the downloaded frame plus metadata (file name, number
        of columns, number of records).
    """
    regional_frames = [
        bronze_CA_youtube_trending,
        bronze_DE_youtube_trending,
        bronze_IN_youtube_trending,
    ]
    combined = pl.concat(regional_frames)

    youtube = context.resources.youtube_io_manager
    links: pl.DataFrame = youtube.downLoad_linkVideos(context, combined)
    context.log.info("Download links video from youtube api success")

    n_records, n_columns = links.shape
    return Output(
        value=links,
        metadata={
            "File Name": MetadataValue.text("linkVideos_trending.pq"),
            "Number Columns": MetadataValue.int(n_columns),
            "Number Records": MetadataValue.int(n_records),
        },
    )
207 |
208 |
@asset(
    name="bronze_videoCategory_trending",
    key_prefix=["bronze", "youtube"],
    ins={
        "bronze_JP_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]),
        "bronze_RU_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]),
    },
    group_name=GROUP_NAME,
    compute_kind="Youtube API",
    io_manager_key="minio_io_manager",
    required_resource_keys={"youtube_io_manager"},
)
def bronze_videoCategory_trending(
    context: AssetExecutionContext,
    bronze_JP_youtube_trending: pl.DataFrame,
    bronze_RU_youtube_trending: pl.DataFrame,
) -> Output[pl.DataFrame]:
    """
    Download video categories from the Youtube API by categoryId.

    The JP and RU bronze frames are stacked with ``pl.concat`` and passed,
    together with the execution context, to
    ``youtube_io_manager.downLoad_videoCategories``; whatever that call
    returns is emitted as this asset's value.

    Args:
        context: Dagster execution context; supplies the resource and logger.
        bronze_JP_youtube_trending: Bronze trending frame for JP.
        bronze_RU_youtube_trending: Bronze trending frame for RU.

    Returns:
        Output carrying the downloaded frame plus metadata (file name, number
        of columns, number of records).
    """
    # NOTE(review): only JP and RU feed the category lookup — confirm that
    # these two regions cover every categoryId used by the other tables.
    combined = pl.concat([bronze_JP_youtube_trending, bronze_RU_youtube_trending])

    youtube = context.resources.youtube_io_manager
    categories: pl.DataFrame = youtube.downLoad_videoCategories(context, combined)
    context.log.info("Download video category from youtube api success")

    n_records, n_columns = categories.shape
    return Output(
        value=categories,
        metadata={
            "File Name": MetadataValue.text("videoCategory_trending.pq"),
            "Number Columns": MetadataValue.int(n_columns),
            "Number Records": MetadataValue.int(n_records),
        },
    )
251 |
252 |
@asset(
    ins={
        "bronze_CA_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]),
        "bronze_DE_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]),
        "bronze_IN_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]),
        "bronze_JP_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]),
        "bronze_RU_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"])
    },
    name="bronze_youtube_trending",
    # NOTE(review): the body never reads context.resources; the requirement is
    # kept only so the asset's declared resources stay exactly as before.
    required_resource_keys={"youtube_io_manager"},
    io_manager_key="minio_io_manager",
    key_prefix=["bronze", "youtube"],
    compute_kind="Polars",
    group_name=GROUP_NAME,
)
def bronze_youtube_trending(context: AssetExecutionContext,
                            bronze_CA_youtube_trending: pl.DataFrame,
                            bronze_DE_youtube_trending: pl.DataFrame,
                            bronze_IN_youtube_trending: pl.DataFrame,
                            bronze_JP_youtube_trending: pl.DataFrame,
                            bronze_RU_youtube_trending: pl.DataFrame
                            ) -> Output[pl.DataFrame]:
    """
    Merge the five regional bronze tables into one frame and parse 'publishedAt'.

    The CA, DE, IN, JP and RU frames are stacked with ``pl.concat`` in that
    order. The ``publishedAt`` strings (e.g. ``2020-08-11T16:34:06Z``) have
    their ``T`` replaced by a space and their ``Z`` removed, then are parsed
    into ``pl.Datetime`` with the format ``%Y-%m-%d %H:%M:%S``.

    Args:
        context: Dagster execution context (not used by the body).
        bronze_CA_youtube_trending: Bronze trending frame for CA.
        bronze_DE_youtube_trending: Bronze trending frame for DE.
        bronze_IN_youtube_trending: Bronze trending frame for IN.
        bronze_JP_youtube_trending: Bronze trending frame for JP.
        bronze_RU_youtube_trending: Bronze trending frame for RU.

    Returns:
        Output carrying the merged frame plus metadata (file name, number of
        columns, number of records).

    Raises:
        Whatever polars raises when the frames cannot be concatenated or a
        ``publishedAt`` value does not match the expected format.
    """
    pl_data = pl.concat(
        [
            bronze_CA_youtube_trending,
            bronze_DE_youtube_trending,
            bronze_IN_youtube_trending,
            bronze_JP_youtube_trending,
            bronze_RU_youtube_trending
        ]
    )

    # 2020-08-11T16:34:06Z -> 2020-08-11 16:34:06 -> Datetime.
    # Native string expressions replace the former per-row Python lambda:
    # same result, no Python call per row, and no dtype inference on the
    # mapped values.
    published_at = (
        pl.col("publishedAt")
        .str.replace_all("T", " ", literal=True)
        .str.replace_all("Z", "", literal=True)
        .str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S")
    )
    pl_data = pl_data.with_columns(published_at)

    return Output(
        value=pl_data,
        metadata={
            "File Name": MetadataValue.text("youtube_trending.pq"),
            "Number Columns": MetadataValue.int(pl_data.shape[1]),
            "Number Records": MetadataValue.int(pl_data.shape[0]),
        }
    )
--------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/assets/silver.py:
--------------------------------------------------------------------------------
1 | import os
2 | import polars as pl
3 | import pandas as pd
4 | from datetime import datetime
5 |
6 | from pyspark.sql import SparkSession, DataFrame
7 | from pyspark.sql.types import IntegerType, StringType
8 | from pyspark.sql.functions import udf, to_timestamp, count
9 | from pyspark.sql.functions import when, col, concat, lit
10 |
11 | from ..partitions import monthly_partitions
12 | from ..func_process import replace_str, format_date, convert
13 | from ..resources.spark_io_manager import create_spark_session
14 |
15 | from dagster import (
16 | AssetExecutionContext,
17 | MetadataValue,
18 | AssetIn,
19 | AssetIn,
20 | Output,
21 | asset
22 | )
23 |
24 |
25 | GROUP_NAME = "silver"
26 |
@asset(
    name="silver_videoCategory_cleaned",
    key_prefix=["silver", "youtube"],
    ins={
        "bronze_videoCategory_trending": AssetIn(key_prefix=["bronze", "youtube"]),
    },
    group_name=GROUP_NAME,
    compute_kind="PySpark",
    io_manager_key="spark_io_manager",
    required_resource_keys={"spark_io_manager"},
)
def silver_videoCategory_cleaned(
    context: AssetExecutionContext,
    bronze_videoCategory_trending: pl.DataFrame,
) -> Output[DataFrame]:
    """
    Clean 'videoCategory_trending_data' for the silver layer.

    The bronze polars frame is converted (via pandas) into a Spark DataFrame,
    ``categoryId`` is cast to an integer column and the rows are ordered by
    it. The asset is declared with ``io_manager_key="spark_io_manager"``, so
    that IO manager handles the returned value.

    Args:
        context: Dagster execution context; supplies the logger and asset key.
        bronze_videoCategory_trending: Bronze video-category frame.

    Returns:
        Output carrying the Spark DataFrame plus metadata (file name, number
        of columns, number of records).
    """
    minio_settings = {
        "endpoint_url": os.getenv("MINIO_ENDPOINT"),
        "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
    }
    session_name = f"silver_videoCategory_cleaned-{datetime.today()}"

    with create_spark_session(minio_settings, session_name) as spark:
        # polars -> pandas -> Spark
        categories: DataFrame = spark.createDataFrame(
            bronze_videoCategory_trending.to_pandas()
        )
        # categoryId: cast to integer, then sort on the cast values
        categories = categories.withColumn(
            "categoryId", col("categoryId").cast(IntegerType())
        ).orderBy(col("categoryId"))
        context.log.info(f"Cleaning for {context.asset_key.path[-1]} success 🙂")

        # count() is a Spark action, so the Output is built inside the
        # session context.
        return Output(
            value=categories,
            metadata={
                "File Name": MetadataValue.text("videoCategory_cleaned.pq"),
                "Number Columns": MetadataValue.int(len(categories.columns)),
                "Number Records": MetadataValue.int(categories.count()),
            },
        )
76 |
77 |
@asset(
    name="silver_linkVideos_cleaned",
    key_prefix=["silver", "youtube"],
    ins={
        "bronze_linkVideos_trending": AssetIn(key_prefix=["bronze", "youtube"]),
        "bronze_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]),
    },
    group_name=GROUP_NAME,
    compute_kind="PySpark",
    io_manager_key="spark_io_manager",
    required_resource_keys={"spark_io_manager", "youtube_io_manager"},
)
def silver_linkVideos_cleaned(
    context: AssetExecutionContext,
    bronze_linkVideos_trending: pl.DataFrame,
    bronze_youtube_trending: pl.DataFrame,
) -> Output[DataFrame]:
    """
    Clean 'linkVideos_trending_data' for the silver layer.

    Both bronze frames are converted (via pandas) into Spark DataFrames. The
    trending frame is de-duplicated on ``video_id``; ``link_video`` is passed
    through the ``convert`` helper as a string UDF. The two frames are
    outer-joined on ``videoId == video_id`` and reduced to the columns
    ``video_id`` and ``link_video``; a missing link is replaced by
    ``"www.youtube.com/embed/" + video_id``.

    Args:
        context: Dagster execution context; supplies the logger and asset key.
        bronze_linkVideos_trending: Bronze frame with ``videoId`` / ``link_video``.
        bronze_youtube_trending: Merged bronze trending frame.

    Returns:
        Output carrying the Spark DataFrame plus metadata (file name, number
        of columns, number of records).
    """
    minio_settings = {
        "endpoint_url": os.getenv("MINIO_ENDPOINT"),
        "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
    }
    session_name = f"silver_linkVideos_cleaned-{datetime.today()}"

    with create_spark_session(minio_settings, session_name) as spark:
        # polars -> pandas -> Spark, links first and trending second
        links: DataFrame = spark.createDataFrame(bronze_linkVideos_trending.to_pandas())
        videos: DataFrame = spark.createDataFrame(bronze_youtube_trending.to_pandas())

        # One row per video on the trending side of the join
        videos = videos.dropDuplicates(["video_id"])

        # Run every link through the project's `convert` helper
        normalize_link = udf(convert, StringType())
        links = links.withColumn("link_video", normalize_link(links["link_video"]))

        joined = links.join(
            videos,
            links["videoId"] == videos["video_id"],
            how="outer",
        ).select(videos.video_id, links.link_video)
        joined.cache()

        # Videos without a downloaded link get one derived from their id
        fallback_link = concat(lit("www.youtube.com/embed/"), col("video_id"))
        spark_df = joined.withColumn(
            "link_video",
            when(col("link_video").isNull(), fallback_link).otherwise(col("link_video")),
        )
        context.log.info(f"Cleaning for {context.asset_key.path[-1]} success 🙂")

        # NOTE(review): this unpersists the derived frame, not `joined`, which
        # is the frame that was cached above — confirm which one was meant.
        spark_df.unpersist()

        return Output(
            value=spark_df,
            metadata={
                "File Name": MetadataValue.text("linkVideos_cleaned.pq"),
                "Number Columns": MetadataValue.int(len(spark_df.columns)),
                "Number Records": MetadataValue.int(spark_df.count()),
            },
        )
169 |
170 |
@asset(
    ins={
        "bronze_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]),
    },
    name="silver_trending_cleaned",
    required_resource_keys={"spark_io_manager"},
    io_manager_key="spark_io_manager",
    key_prefix=["silver", "youtube"],
    partitions_def=monthly_partitions,
    compute_kind="PySpark",
    group_name=GROUP_NAME
)
def silver_trending_cleaned(context: AssetExecutionContext,
                            bronze_youtube_trending: pl.DataFrame,
                            ) -> Output[DataFrame]:
    """
    Clean one monthly slice of 'bronze_youtube_trending' for the silver layer.

    The output partition key is sliced as ``key[:4]`` (year) and ``key[5:7]``
    (month); rows whose ``publishedAt`` falls in that year and month are kept.
    The slice is converted (via pandas) into a Spark DataFrame and cleaned:

    * ``publishedAt`` is converted with ``to_timestamp``;
    * ``trending_date`` goes through the ``format_date`` helper (string UDF)
      and is then converted with ``to_timestamp``;
    * ``categoryId``, ``view_count``, ``likes``, ``dislikes`` and
      ``comment_count`` are cast to ``IntegerType``;
    * ``thumbnail_link`` goes through the ``replace_str`` helper (string UDF).

    Args:
        context: Dagster execution context; supplies the partition key, the
            logger and the asset key.
        bronze_youtube_trending: Merged bronze trending frame whose
            ``publishedAt`` column supports the ``.dt`` namespace.

    Returns:
        Output carrying the Spark DataFrame plus metadata (file name derived
        from the first seven characters of the partition key, record count,
        column count).

    Raises:
        Any error raised while resolving the partition key or filtering the
        frame propagates unchanged, keeping its original type and traceback
        (it was previously re-wrapped in a bare ``Exception``).
    """
    partition_date_str = context.asset_partition_key_for_output()
    # assumes the key starts with "YYYY-MM" — TODO confirm against monthly_partitions
    year, month = int(partition_date_str[:4]), int(partition_date_str[5:7])
    data_by_publishedAt = bronze_youtube_trending.filter(
        (pl.col("publishedAt").dt.year() == year) &
        (pl.col("publishedAt").dt.month() == month)
    )

    CONFIG = {
        "endpoint_url": os.getenv("MINIO_ENDPOINT"),
        "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
    }

    with create_spark_session(
        CONFIG, "silver_trending_cleaned-{}".format(datetime.today())
    ) as spark:

        spark_df: DataFrame = spark.createDataFrame(data_by_publishedAt.to_pandas())

        # Convert the data type of column publishedAt to timestamp
        spark_df = spark_df.withColumn("publishedAt", to_timestamp("publishedAt"))

        # Normalise trending_date with the format_date helper, then convert
        # it to timestamp as well
        normalize_date = udf(format_date, StringType())
        spark_df = spark_df.withColumn("trending_date", normalize_date(spark_df["trending_date"]))
        spark_df = spark_df.withColumn("trending_date", to_timestamp("trending_date"))

        # Convert the data type of the id / counter columns to integer.
        # NOTE(review): IntegerType is 32-bit; confirm that view_count can
        # never exceed 2,147,483,647.
        for column_name in ("categoryId", "view_count", "likes", "dislikes", "comment_count"):
            spark_df = spark_df.withColumn(
                column_name, spark_df[column_name].cast(IntegerType())
            )

        # thumbnail_link: rewritten by the replace_str helper
        # (per the original comment: from default to maxresdefault)
        link_convert = udf(replace_str, StringType())
        spark_df = spark_df.withColumn("thumbnail_link", link_convert(spark_df["thumbnail_link"]))

        # The former spark_df.unpersist() was dropped: the frame is never
        # cached, so the call did nothing.
        context.log.info(f"Cleaning for {context.asset_key.path[-1]} success 🙂")

        # count() is a Spark action, so the Output is built inside the
        # session context.
        return Output(
            value=spark_df,
            metadata={
                "file name": MetadataValue.text(f"{partition_date_str[:7]}.pq"),
                "Records": MetadataValue.int(spark_df.count()),
                "Columns": MetadataValue.int(len(spark_df.columns))
            }
        )
--------------------------------------------------------------------------------
/public/notebooks/Preprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "0wCyW8k45E-a",
6 | "metadata": {
7 | "id": "0wCyW8k45E-a"
8 | },
9 | "source": [
10 | "# Xử lý trước khi đưa vào Asset"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "id": "6880fc6d",
16 | "metadata": {
17 | "id": "6880fc6d"
18 | },
19 | "source": [
20 | "# Import Library"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 1,
26 | "id": "8kaoHMK_lY1U",
27 | "metadata": {
28 | "colab": {
29 | "base_uri": "https://localhost:8080/"
30 | },
31 | "id": "8kaoHMK_lY1U",
32 | "outputId": "10480db2-cc8d-44ef-8fda-0c3bba66971f"
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "Mounted at /content/drive\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "from google.colab import drive\n",
45 | "drive.mount('/content/drive')"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 4,
51 | "id": "6-BAYjQS2NzN",
52 | "metadata": {
53 | "id": "6-BAYjQS2NzN"
54 | },
55 | "outputs": [],
56 | "source": [
57 | "from pyspark.sql import SparkSession, DataFrame\n",
58 | "from pyspark.sql.types import IntegerType, StringType\n",
59 | "from pyspark.sql.functions import udf, to_timestamp"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 6,
65 | "id": "45adf8f6-0480-49af-966b-d1dfefa29ab3",
66 | "metadata": {
67 | "id": "45adf8f6-0480-49af-966b-d1dfefa29ab3"
68 | },
69 | "outputs": [],
70 | "source": [
71 | "# Create SparkSession object\n",
72 | "spark = SparkSession.builder \\\n",
73 | " .appName(\"HomeWork-W6\") \\\n",
74 | " .getOrCreate()"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 8,
80 | "id": "dhgY_ug22qoX",
81 | "metadata": {
82 | "id": "dhgY_ug22qoX"
83 | },
84 | "outputs": [],
85 | "source": [
86 | "spark_df = spark.read.parquet(\"/content/drive/MyDrive/Colab Notebooks/202204.pq\")"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 9,
92 | "id": "dY1y4YBf2zox",
93 | "metadata": {
94 | "colab": {
95 | "base_uri": "https://localhost:8080/"
96 | },
97 | "id": "dY1y4YBf2zox",
98 | "outputId": "329f8ff8-632c-4018-e840-94cf7f8da43d"
99 | },
100 | "outputs": [
101 | {
102 | "name": "stdout",
103 | "output_type": "stream",
104 | "text": [
105 | "+-----------+--------------------+--------------------+--------------------+---------------+----------+--------------------+--------------------+----------+-----+--------+-------------+--------------------+-----------------+----------------+\n",
106 | "| video_id| title| publishedAt| channelId| channelTitle|categoryId| trending_date| tags|view_count|likes|dislikes|comment_count| thumbnail_link|comments_disabled|ratings_disabled|\n",
107 | "+-----------+--------------------+--------------------+--------------------+---------------+----------+--------------------+--------------------+----------+-----+--------+-------------+--------------------+-----------------+----------------+\n",
108 | "|zoHGxJKjC_Y|Heiratsantrag, di...|2022-04-01T16:36:48Z|UCm3_j4RLEzgMovQT...| Drachen Lord| 24|2022-04-02T00:00:00Z|drachenlord origi...| 126194| 4922| 0| 2069|https://i.ytimg.c...| False| False|\n",
109 | "|s38-OigKoIU|Nachgefragt: Panz...|2022-04-01T11:03:23Z|UClCZul-nK9h8eVo7...| Bundeswehr| 25|2022-04-02T00:00:00Z|Bundeswehr|Soldat...| 345217|10056| 0| 1927|https://i.ytimg.c...| False| False|\n",
110 | "|fn_DBhbEscA|Aprilscherze in d...|2022-04-01T12:00:31Z|UC6UrlhHQXm9tWhZc...| How2Shirli| 22|2022-04-02T00:00:00Z| [None]| 353375|47638| 0| 517|https://i.ytimg.c...| False| False|\n",
111 | "|JpiJT7lLuAE| MOIN GERHARD!|2022-04-01T12:33:48Z|UC3oj6YrK6Tj3tR6-...| Tom Stein| 24|2022-04-02T00:00:00Z| [None]| 67361| 6114| 0| 435|https://i.ytimg.c...| False| False|\n",
112 | "|u_D9tg3cK1w|Saltatio Mortis f...|2022-04-01T10:01:34Z|UCDGhwUyQMvcNqz15...|Saltatio Mortis| 24|2022-04-02T00:00:00Z|Saltatio Morits|H...| 34958| 572| 0| 163|https://i.ytimg.c...| False| False|\n",
113 | "+-----------+--------------------+--------------------+--------------------+---------------+----------+--------------------+--------------------+----------+-----+--------+-------------+--------------------+-----------------+----------------+\n",
114 | "only showing top 5 rows\n",
115 | "\n"
116 | ]
117 | }
118 | ],
119 | "source": [
120 | "spark_df.show(5)"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 10,
126 | "id": "v7aGpy5328MN",
127 | "metadata": {
128 | "colab": {
129 | "base_uri": "https://localhost:8080/"
130 | },
131 | "id": "v7aGpy5328MN",
132 | "outputId": "fed2732f-d0c5-4e72-88e7-53360849b9e5"
133 | },
134 | "outputs": [
135 | {
136 | "name": "stdout",
137 | "output_type": "stream",
138 | "text": [
139 | "root\n",
140 | " |-- video_id: string (nullable = true)\n",
141 | " |-- title: string (nullable = true)\n",
142 | " |-- publishedAt: string (nullable = true)\n",
143 | " |-- channelId: string (nullable = true)\n",
144 | " |-- channelTitle: string (nullable = true)\n",
145 | " |-- categoryId: string (nullable = true)\n",
146 | " |-- trending_date: string (nullable = true)\n",
147 | " |-- tags: string (nullable = true)\n",
148 | " |-- view_count: string (nullable = true)\n",
149 | " |-- likes: string (nullable = true)\n",
150 | " |-- dislikes: string (nullable = true)\n",
151 | " |-- comment_count: string (nullable = true)\n",
152 | " |-- thumbnail_link: string (nullable = true)\n",
153 | " |-- comments_disabled: string (nullable = true)\n",
154 | " |-- ratings_disabled: string (nullable = true)\n",
155 | "\n"
156 | ]
157 | }
158 | ],
159 | "source": [
160 | "spark_df.printSchema()"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 7,
166 | "id": "tQW9x4yi2Fta",
167 | "metadata": {
168 | "id": "tQW9x4yi2Fta"
169 | },
170 | "outputs": [],
171 | "source": [
172 | "def replace_str(value: str):\n",
173 | " return value.replace(\"default\", \"maxresdefault\")\n",
174 | "\n",
175 | "def format_date(value: str):\n",
176 | " return value.replace(\"T\", \" \").replace(\"Z\", \"\")"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 11,
182 | "id": "Wcl-xS4r2Fwq",
183 | "metadata": {
184 | "id": "Wcl-xS4r2Fwq"
185 | },
186 | "outputs": [],
187 | "source": [
188 | "date_format = udf(format_date, StringType())\n",
189 | "spark_df = spark_df.withColumn(\"publishedAt\", date_format(spark_df[\"publishedAt\"]))"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 12,
195 | "id": "r6hpuRoT3he6",
196 | "metadata": {
197 | "colab": {
198 | "base_uri": "https://localhost:8080/"
199 | },
200 | "id": "r6hpuRoT3he6",
201 | "outputId": "581b6d63-cf30-4e86-a472-e8844d928326"
202 | },
203 | "outputs": [
204 | {
205 | "name": "stdout",
206 | "output_type": "stream",
207 | "text": [
208 | "+-------------------+\n",
209 | "| publishedAt|\n",
210 | "+-------------------+\n",
211 | "|2022-04-01 16:36:48|\n",
212 | "|2022-04-01 11:03:23|\n",
213 | "+-------------------+\n",
214 | "only showing top 2 rows\n",
215 | "\n"
216 | ]
217 | }
218 | ],
219 | "source": [
220 | "spark_df.select(\"publishedAt\").show(2)"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 13,
226 | "id": "AN2oaZJm2F5E",
227 | "metadata": {
228 | "id": "AN2oaZJm2F5E"
229 | },
230 | "outputs": [],
231 | "source": [
232 | "# Convert the data type of column publishedAt to datetime\n",
233 | "spark_df = spark_df.withColumn(\"publishedAt\", to_timestamp(\"publishedAt\"))"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 14,
239 | "id": "wOsobTLN2F7Q",
240 | "metadata": {
241 | "id": "wOsobTLN2F7Q"
242 | },
243 | "outputs": [],
244 | "source": [
245 | "# Convert the data type of column categoryId to integer\n",
246 | "spark_df = spark_df.withColumn(\"categoryId\", spark_df[\"categoryId\"].cast(IntegerType()))"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 15,
252 | "id": "g6FcJJ1N2F-O",
253 | "metadata": {
254 | "id": "g6FcJJ1N2F-O"
255 | },
256 | "outputs": [],
257 | "source": [
258 | "# Reformat trending_date into a parseable date string\n",
259 | "spark_df = spark_df.withColumn(\"trending_date\", date_format(spark_df[\"trending_date\"]))"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": 16,
265 | "id": "JSNG_4MW4ELz",
266 | "metadata": {
267 | "colab": {
268 | "base_uri": "https://localhost:8080/"
269 | },
270 | "id": "JSNG_4MW4ELz",
271 | "outputId": "c16f96a7-9267-40da-c2d1-7b0f429fc15c"
272 | },
273 | "outputs": [
274 | {
275 | "name": "stdout",
276 | "output_type": "stream",
277 | "text": [
278 | "+-------------------+\n",
279 | "| trending_date|\n",
280 | "+-------------------+\n",
281 | "|2022-04-02 00:00:00|\n",
282 | "|2022-04-02 00:00:00|\n",
283 | "+-------------------+\n",
284 | "only showing top 2 rows\n",
285 | "\n"
286 | ]
287 | }
288 | ],
289 | "source": [
290 | "spark_df.select(\"trending_date\").show(2)"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 17,
296 | "id": "4bovaYHZ2GBQ",
297 | "metadata": {
298 | "id": "4bovaYHZ2GBQ"
299 | },
300 | "outputs": [],
301 | "source": [
302 | "# Convert the data type of column trending_date to timestamp\n",
303 | "spark_df = spark_df.withColumn(\"trending_date\", to_timestamp(\"trending_date\"))"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 18,
309 | "id": "xlQ7QkYJ2GEA",
310 | "metadata": {
311 | "id": "xlQ7QkYJ2GEA"
312 | },
313 | "outputs": [],
314 | "source": [
315 | "# Convert the data type of column view_count to integer\n",
316 | "spark_df = spark_df.withColumn(\"view_count\", spark_df[\"view_count\"].cast(IntegerType()))"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 19,
322 | "id": "Inflh60t2GGj",
323 | "metadata": {
324 | "id": "Inflh60t2GGj"
325 | },
326 | "outputs": [],
327 | "source": [
328 | "# Convert the data type of column likes to integer\n",
329 | "spark_df = spark_df.withColumn(\"likes\", spark_df[\"likes\"].cast(IntegerType()))"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": 20,
335 | "id": "jxJWX5ox2GLU",
336 | "metadata": {
337 | "id": "jxJWX5ox2GLU"
338 | },
339 | "outputs": [],
340 | "source": [
341 | "# Convert the data type of column dislikes to integer\n",
342 | "spark_df = spark_df.withColumn(\"dislikes\", spark_df[\"dislikes\"].cast(IntegerType()))"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": 21,
348 | "id": "ZcSxAoyQ3ZTo",
349 | "metadata": {
350 | "id": "ZcSxAoyQ3ZTo"
351 | },
352 | "outputs": [],
353 | "source": [
354 | "# Convert the data type of column comment_count to integer\n",
355 | "spark_df = spark_df.withColumn(\"comment_count\", spark_df[\"comment_count\"].cast(IntegerType()))"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": 22,
361 | "id": "sR2x4HY83ZWz",
362 | "metadata": {
363 | "id": "sR2x4HY83ZWz"
364 | },
365 | "outputs": [],
366 | "source": [
367 | "# In thumbnail_link, replace default with maxresdefault\n",
368 | "link_convert = udf(replace_str, StringType())\n",
369 | "spark_df = spark_df.withColumn(\"thumbnail_link\", link_convert(spark_df[\"thumbnail_link\"]))"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": 27,
375 | "id": "m8REaetN3ZZU",
376 | "metadata": {
377 | "colab": {
378 | "base_uri": "https://localhost:8080/",
379 | "height": 36
380 | },
381 | "id": "m8REaetN3ZZU",
382 | "outputId": "893811ff-3c5a-410a-ea90-96d637af1ba2"
383 | },
384 | "outputs": [
385 | {
386 | "data": {
387 | "application/vnd.google.colaboratory.intrinsic+json": {
388 | "type": "string"
389 | },
390 | "text/plain": [
391 | "'https://i.ytimg.com/vi/EfP1h_3u0Lk/maxresdefault.jpg'"
392 | ]
393 | },
394 | "execution_count": 27,
395 | "metadata": {},
396 | "output_type": "execute_result"
397 | }
398 | ],
399 | "source": [
400 | "spark_df.select(\"thumbnail_link\").collect()[17][0]"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": 28,
406 | "id": "PlOr7qhb3Zbh",
407 | "metadata": {
408 | "colab": {
409 | "base_uri": "https://localhost:8080/"
410 | },
411 | "id": "PlOr7qhb3Zbh",
412 | "outputId": "f49d0243-e8c7-4d80-eca1-154313a60f3f"
413 | },
414 | "outputs": [
415 | {
416 | "name": "stdout",
417 | "output_type": "stream",
418 | "text": [
419 | "root\n",
420 | " |-- video_id: string (nullable = true)\n",
421 | " |-- title: string (nullable = true)\n",
422 | " |-- publishedAt: timestamp (nullable = true)\n",
423 | " |-- channelId: string (nullable = true)\n",
424 | " |-- channelTitle: string (nullable = true)\n",
425 | " |-- categoryId: integer (nullable = true)\n",
426 | " |-- trending_date: timestamp (nullable = true)\n",
427 | " |-- tags: string (nullable = true)\n",
428 | " |-- view_count: integer (nullable = true)\n",
429 | " |-- likes: integer (nullable = true)\n",
430 | " |-- dislikes: integer (nullable = true)\n",
431 | " |-- comment_count: integer (nullable = true)\n",
432 | " |-- thumbnail_link: string (nullable = true)\n",
433 | " |-- comments_disabled: string (nullable = true)\n",
434 | " |-- ratings_disabled: string (nullable = true)\n",
435 | "\n"
436 | ]
437 | }
438 | ],
439 | "source": [
440 | "# Check the resulting schema\n",
441 | "spark_df.printSchema()"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": null,
447 | "id": "0npxdqo03Zdr",
448 | "metadata": {
449 | "id": "0npxdqo03Zdr"
450 | },
451 | "outputs": [],
452 | "source": []
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": null,
457 | "id": "vJWbPWHg3ZgW",
458 | "metadata": {
459 | "id": "vJWbPWHg3ZgW"
460 | },
461 | "outputs": [],
462 | "source": []
463 | },
464 | {
465 | "cell_type": "code",
466 | "execution_count": null,
467 | "id": "lrfACvfa2GO2",
468 | "metadata": {
469 | "id": "lrfACvfa2GO2"
470 | },
471 | "outputs": [],
472 | "source": []
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": null,
477 | "id": "oMo9Hmry2GXW",
478 | "metadata": {
479 | "id": "oMo9Hmry2GXW"
480 | },
481 | "outputs": [],
482 | "source": []
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": null,
487 | "id": "bb609589-83c8-45c5-b4ce-371458647e8c",
488 | "metadata": {
489 | "id": "bb609589-83c8-45c5-b4ce-371458647e8c"
490 | },
491 | "outputs": [],
492 | "source": [
493 | "from pyspark.sql.functions import *\n",
494 | "from pyspark.sql import SparkSession\n",
495 | "from pyspark.sql.functions import round\n",
496 | "from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Bucketizer\n",
497 | "from pyspark.ml.regression import LinearRegression"
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": null,
503 | "id": "c9fe1d06-d632-401b-b98b-c59d1198685b",
504 | "metadata": {
505 | "id": "c9fe1d06-d632-401b-b98b-c59d1198685b"
506 | },
507 | "outputs": [],
508 | "source": [
509 | "import seaborn as sns\n",
510 | "import matplotlib.pyplot as plt\n",
511 | "\n",
512 | "%matplotlib inline\n",
513 | "import warnings\n",
514 | "warnings.filterwarnings('ignore')"
515 | ]
516 | },
517 | {
518 | "cell_type": "markdown",
519 | "id": "c1271c1f",
520 | "metadata": {
521 | "id": "c1271c1f"
522 | },
523 | "source": [
524 | "# Read Data"
525 | ]
526 | },
527 | {
528 | "cell_type": "code",
529 | "execution_count": null,
530 | "id": "801e7626",
531 | "metadata": {
532 | "id": "801e7626"
533 | },
534 | "outputs": [],
535 | "source": [
536 | "# Read data from CSV file\n",
537 | "df = spark.read.csv('/content/drive/MyDrive/Colab Notebooks/properties_2016.csv', sep=',', header=True, inferSchema=True, nullValue='NA')"
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": null,
543 | "id": "968d018d",
544 | "metadata": {
545 | "colab": {
546 | "base_uri": "https://localhost:8080/"
547 | },
548 | "id": "968d018d",
549 | "outputId": "c0050aba-eb23-482d-b4d9-ad85826b8ddf"
550 | },
551 | "outputs": [
552 | {
553 | "name": "stdout",
554 | "output_type": "stream",
555 | "text": [
556 | "Dataset have 2985217 records\n"
557 | ]
558 | }
559 | ],
560 | "source": [
561 | "# Get number of records\n",
562 | "print(f\"Dataset have {df.count()} records\")"
563 | ]
564 | },
565 | {
566 | "cell_type": "code",
567 | "execution_count": null,
568 | "id": "ea7b3db3-8682-4bac-a2ec-cd0324cb19cd",
569 | "metadata": {
570 | "colab": {
571 | "base_uri": "https://localhost:8080/"
572 | },
573 | "id": "ea7b3db3-8682-4bac-a2ec-cd0324cb19cd",
574 | "outputId": "c71a1c3a-6004-4e52-9cbd-b05ed8b71e61"
575 | },
576 | "outputs": [
577 | {
578 | "name": "stdout",
579 | "output_type": "stream",
580 | "text": [
581 | "column: 58\n",
582 | "row: 2985217\n"
583 | ]
584 | }
585 | ],
586 | "source": [
587 | "# Get the shape of the dataset\n",
588 | "print(f\"column: {len(df.columns)}\\nrow: {df.count()}\")"
589 | ]
590 | },
591 | {
592 | "cell_type": "code",
593 | "execution_count": null,
594 | "id": "7ec68403",
595 | "metadata": {
596 | "colab": {
597 | "base_uri": "https://localhost:8080/"
598 | },
599 | "id": "7ec68403",
600 | "outputId": "e729d51d-deed-49b5-da2c-cfe3701d5f53"
601 | },
602 | "outputs": [
603 | {
604 | "name": "stdout",
605 | "output_type": "stream",
606 | "text": [
607 |n",
608 | "|parcelid|airconditioningtypeid|architecturalstyletypeid|basementsqft|bathroomcnt|bedroomcnt|buildingclasstypeid|buildingqualitytypeid|calculatedbathnbr|decktypeid|finishedfloor1squarefeet|calculatedfinishedsquarefeet|finishedsquarefeet12|finishedsquarefeet13|finishedsquarefeet15|finishedsquarefeet50|finishedsquarefeet6|fips|fireplacecnt|fullbathcnt|garagecarcnt|garagetotalsqft|hashottuborspa|heatingorsystemtypeid|latitude| longitude|lotsizesquarefeet|poolcnt|poolsizesum|pooltypeid10|pooltypeid2|pooltypeid7|propertycountylandusecode|propertylandusetypeid|propertyzoningdesc|rawcensustractandblock|regionidcity|regionidcounty|regionidneighborhood|regionidzip|roomcnt|storytypeid|threequarterbathnbr|typeconstructiontypeid|unitcnt|yardbuildingsqft17|yardbuildingsqft26|yearbuilt|numberofstories|fireplaceflag|structuretaxvaluedollarcnt|taxvaluedollarcnt|assessmentyear|landtaxvaluedollarcnt|taxamount|taxdelinquencyflag|taxdelinquencyyear|censustractandblock|\n",
609 |n",
610 | "|10754147| NULL| NULL| NULL| 0.0| 0.0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|6037| NULL| NULL| NULL| NULL| NULL| NULL|34144442|-118654084| 85768.0| NULL| NULL| NULL| NULL| NULL| 010D| 269| NULL| 6.0378002041E7| 37688| 3101| NULL| 96337| 0.0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| 9.0| 2015| 9.0| NULL| NULL| NULL| NULL|\n",
611 | "|10759547| NULL| NULL| NULL| 0.0| 0.0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|6037| NULL| NULL| NULL| NULL| NULL| NULL|34140430|-118625364| 4083.0| NULL| NULL| NULL| NULL| NULL| 0109| 261| LCA11*| 6.0378001011002E7| 37688| 3101| NULL| 96337| 0.0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| 27516.0| 2015| 27516.0| NULL| NULL| NULL| NULL|\n",
612 | "|10843547| NULL| NULL| NULL| 0.0| 0.0| NULL| NULL| NULL| NULL| NULL| 73026.0| NULL| NULL| 73026| NULL| NULL|6037| NULL| NULL| NULL| NULL| NULL| NULL|33989359|-118394633| 63085.0| NULL| NULL| NULL| NULL| NULL| 1200| 47| LAC2| 6.0377030012017E7| 51617| 3101| NULL| 96095| 0.0| NULL| NULL| NULL| 2| NULL| NULL| NULL| NULL| NULL| 650756.0| 1413387.0| 2015| 762631.0| 20800.37| NULL| NULL| NULL|\n",
613 | "|10859147| NULL| NULL| NULL| 0.0| 0.0| 3| 7| NULL| NULL| NULL| 5068.0| NULL| NULL| 5068| NULL| NULL|6037| NULL| NULL| NULL| NULL| NULL| NULL|34148863|-118437206| 7521.0| NULL| NULL| NULL| NULL| NULL| 1200| 47| LAC2| 6.0371412023001E7| 12447| 3101| 27080| 96424| 0.0| NULL| NULL| NULL| NULL| NULL| NULL| 1948.0| 1| NULL| 571346.0| 1156834.0| 2015| 585488.0| 14557.57| NULL| NULL| NULL|\n",
614 | "|10879947| NULL| NULL| NULL| 0.0| 0.0| 4| NULL| NULL| NULL| NULL| 1776.0| NULL| NULL| 1776| NULL| NULL|6037| NULL| NULL| NULL| NULL| NULL| NULL|34194168|-118385816| 8512.0| NULL| NULL| NULL| NULL| NULL| 1210| 31| LAM1| 6.0371232052003E7| 12447| 3101| 46795| 96450| 0.0| NULL| NULL| NULL| 1| NULL| NULL| 1947.0| NULL| NULL| 193796.0| 433491.0| 2015| 239695.0| 5725.17| NULL| NULL| NULL|\n",
615 |n",
616 | "only showing top 5 rows\n",
617 | "\n"
618 | ]
619 | }
620 | ],
621 | "source": [
622 | "# View the first five records\n",
623 | "df.show(5)"
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": null,
629 | "id": "32d76054",
630 | "metadata": {
631 | "colab": {
632 | "base_uri": "https://localhost:8080/"
633 | },
634 | "id": "32d76054",
635 | "outputId": "cf3b6c89-2ef3-47b6-f4e3-a50eb579ae5d"
636 | },
637 | "outputs": [
638 | {
639 | "name": "stdout",
640 | "output_type": "stream",
641 | "text": [
642 | "root\n",
643 | " |-- parcelid: integer (nullable = true)\n",
644 | " |-- airconditioningtypeid: integer (nullable = true)\n",
645 | " |-- architecturalstyletypeid: integer (nullable = true)\n",
646 | " |-- basementsqft: integer (nullable = true)\n",
647 | " |-- bathroomcnt: double (nullable = true)\n",
648 | " |-- bedroomcnt: double (nullable = true)\n",
649 | " |-- buildingclasstypeid: integer (nullable = true)\n",
650 | " |-- buildingqualitytypeid: integer (nullable = true)\n",
651 | " |-- calculatedbathnbr: double (nullable = true)\n",
652 | " |-- decktypeid: integer (nullable = true)\n",
653 | " |-- finishedfloor1squarefeet: integer (nullable = true)\n",
654 | " |-- calculatedfinishedsquarefeet: double (nullable = true)\n",
655 | " |-- finishedsquarefeet12: integer (nullable = true)\n",
656 | " |-- finishedsquarefeet13: integer (nullable = true)\n",
657 | " |-- finishedsquarefeet15: integer (nullable = true)\n",
658 | " |-- finishedsquarefeet50: integer (nullable = true)\n",
659 | " |-- finishedsquarefeet6: integer (nullable = true)\n",
660 | " |-- fips: integer (nullable = true)\n",
661 | " |-- fireplacecnt: integer (nullable = true)\n",
662 | " |-- fullbathcnt: integer (nullable = true)\n",
663 | " |-- garagecarcnt: integer (nullable = true)\n",
664 | " |-- garagetotalsqft: integer (nullable = true)\n",
665 | " |-- hashottuborspa: boolean (nullable = true)\n",
666 | " |-- heatingorsystemtypeid: integer (nullable = true)\n",
667 | " |-- latitude: integer (nullable = true)\n",
668 | " |-- longitude: integer (nullable = true)\n",
669 | " |-- lotsizesquarefeet: double (nullable = true)\n",
670 | " |-- poolcnt: integer (nullable = true)\n",
671 | " |-- poolsizesum: integer (nullable = true)\n",
672 | " |-- pooltypeid10: integer (nullable = true)\n",
673 | " |-- pooltypeid2: integer (nullable = true)\n",
674 | " |-- pooltypeid7: integer (nullable = true)\n",
675 | " |-- propertycountylandusecode: string (nullable = true)\n",
676 | " |-- propertylandusetypeid: integer (nullable = true)\n",
677 | " |-- propertyzoningdesc: string (nullable = true)\n",
678 | " |-- rawcensustractandblock: double (nullable = true)\n",
679 | " |-- regionidcity: integer (nullable = true)\n",
680 | " |-- regionidcounty: integer (nullable = true)\n",
681 | " |-- regionidneighborhood: integer (nullable = true)\n",
682 | " |-- regionidzip: integer (nullable = true)\n",
683 | " |-- roomcnt: double (nullable = true)\n",
684 | " |-- storytypeid: integer (nullable = true)\n",
685 | " |-- threequarterbathnbr: integer (nullable = true)\n",
686 | " |-- typeconstructiontypeid: integer (nullable = true)\n",
687 | " |-- unitcnt: integer (nullable = true)\n",
688 | " |-- yardbuildingsqft17: integer (nullable = true)\n",
689 | " |-- yardbuildingsqft26: integer (nullable = true)\n",
690 | " |-- yearbuilt: double (nullable = true)\n",
691 | " |-- numberofstories: integer (nullable = true)\n",
692 | " |-- fireplaceflag: boolean (nullable = true)\n",
693 | " |-- structuretaxvaluedollarcnt: double (nullable = true)\n",
694 | " |-- taxvaluedollarcnt: double (nullable = true)\n",
695 | " |-- assessmentyear: integer (nullable = true)\n",
696 | " |-- landtaxvaluedollarcnt: double (nullable = true)\n",
697 | " |-- taxamount: double (nullable = true)\n",
698 | " |-- taxdelinquencyflag: string (nullable = true)\n",
699 | " |-- taxdelinquencyyear: integer (nullable = true)\n",
700 | " |-- censustractandblock: long (nullable = true)\n",
701 | "\n"
702 | ]
703 | }
704 | ],
705 | "source": [
706 | "df.printSchema()"
707 | ]
708 | },
709 | {
710 | "cell_type": "code",
711 | "execution_count": null,
712 | "id": "65634354",
713 | "metadata": {
714 | "colab": {
715 | "base_uri": "https://localhost:8080/"
716 | },
717 | "id": "65634354",
718 | "outputId": "5ad24bc2-fef7-4898-bcd0-f401ec43b6c3"
719 | },
720 | "outputs": [
721 | {
722 | "name": "stdout",
723 | "output_type": "stream",
724 | "text": [
725 |n",
726 | "|summary| parcelid|airconditioningtypeid|architecturalstyletypeid| basementsqft| bathroomcnt| bedroomcnt|buildingclasstypeid|buildingqualitytypeid| calculatedbathnbr|decktypeid|finishedfloor1squarefeet|calculatedfinishedsquarefeet|finishedsquarefeet12|finishedsquarefeet13|finishedsquarefeet15|finishedsquarefeet50|finishedsquarefeet6| fips| fireplacecnt| fullbathcnt| garagecarcnt| garagetotalsqft|heatingorsystemtypeid| latitude| longitude| lotsizesquarefeet|poolcnt| poolsizesum|pooltypeid10|pooltypeid2|pooltypeid7|propertycountylandusecode|propertylandusetypeid| propertyzoningdesc|rawcensustractandblock| regionidcity| regionidcounty|regionidneighborhood| regionidzip| roomcnt|storytypeid|threequarterbathnbr|typeconstructiontypeid| unitcnt|yardbuildingsqft17|yardbuildingsqft26| yearbuilt| numberofstories|structuretaxvaluedollarcnt| taxvaluedollarcnt| assessmentyear|landtaxvaluedollarcnt| taxamount|taxdelinquencyflag|taxdelinquencyyear| censustractandblock|\n",
727 |n",
728 | "| count| 2985217| 811519| 6061| 1628| 2973755| 2973767| 12629| 1938488| 2856305| 17096| 202717| 2929652| 2709184| 7672| 190798| 202717| 22001| 2973780| 312637| 2856305| 883267| 883267| 1806401| 2973780| 2973780| 2709118| 517534| 27960| 36939| 32075| 485459| 2972940| 2973780| 1978629| 2973780| 2922372| 2973780| 1156402| 2971237| 2973742| 1624| 311631| 6747| 1977490| 80355| 2647| 2925289| 682069| 2930235| 2942667| 2973778| 2917484| 2953967| 56462| 56464| 2910091|\n",
729 | "| mean|1.3325858360229759E7| 1.9311661218036793| 7.202606830556014|646.8832923832924|2.2091427336818263| 3.088948797938776| 3.7259482144271123| 5.784786906083505|2.2992625087306853| 66.0| 1380.6303960693972| 1827.1621236925068| 1760.0006079321302| 1178.900677789364| 2739.18723466703| 1388.9445779091047| 2414.339439116404| 6048.031600185623| 1.1687100375195514|2.2441651014159905|1.8235165584132544| 383.7693573970272| 4.012053248420478|3.400146865372119E7|-1.18201934159426...|22822.805527748147| 1.0| 519.7109799713877| 1.0| 1.0| 1.0| 199.5320980966015| 260.0484285992911| 5.46084705882353E8| 6.048344961635102E7|34993.35022406456|2570.4605535715486| 193476.4074145496|96552.67280025121|1.4750183438912992| 7.0| 1.0100086320038764| 5.999555357936861| 1.181171080511153| 319.8033974239313| 278.2965621458255|1964.2616411575061|1.4014637815235702| 170883.57716599523|420478.99067852396| 2014.999458937419| 252478.02946854208|5377.607139338332| NULL|13.892409322754322|6.048431221257243E13|\n",
730 | "| stddev| 7909966.389233432| 3.148587394577264| 2.436290490710878|538.7934732127098|1.0777537772255268|1.2758587961101613| 0.5017002111297728| 1.8053515795599582|1.0007362395982085| 0.0| 632.8685428862445| 1819.7804693000555| 971.0610103785792| 357.07303551809184| 5447.428327204328| 664.4887085429802| 7695.302951762993|20.232784692561076|0.46127285457915057|0.9912053996708714|0.6100353832595409|245.44341897378612| 3.293732688713869| 243381.17831128882| 345317.10127200687| 337592.366407657| 0.0|191.32328381052514| 0.0| 0.0| 0.0| 302.825330209084| 15.908166600884176|2.0614819081932812E9| 200811.6754489488| 50727.4653888151| 788.071140066596| 165713.25431675857|3673.175037540778| 2.840402806614331| 0.0|0.11770930082463944| 0.38405027536042613|2.4478959553912745|233.08631396807013| 369.7315077596207|23.441319348584372|0.5390757507737586| 402068.3420150093| 726346.6517993591|0.03683161097766099| 445013.16961781326|9183.107127994226| NULL|2.5810057224984697|3.249034547374049E11|\n",
731 | "| min| 10711725| 1| 2| 20| 0.0| 0.0| 1| 1| 1.0| 66| 3| 1.0| 1| 120| 112| 3| 117| 6037| 1| 1| 0| 0| 1| 33324388| -119475780| 100.0| 1| 19| 1| 1| 1| 0| 31| #12| 6.0371011101E7| 3491| 1286| 6952| 95982| 0.0| 7| 1| 4| 1| 10| 10| 1801.0| 1| 1.0| 1.0| 2000| 1.0| 1.34| Y| 0| -1|\n",
732 | "| max| 169601949| 13| 27| 8516| 20.0| 20.0| 5| 12| 20.0| 66| 31303| 952576.0| 290345| 2688| 820242| 31303| 952576| 6111| 9| 20| 25| 7749| 24| 34819650| -117554316| 3.28263808E8| 1| 17410| 1| 1| 1| SFR| 275| ZONE LCC3| 6.1110091003011E7| 396556| 3101| 764167| 399675| 96.0| 7| 7| 13| 997| 7983| 6141| 2015.0| 41| 2.51486E8| 2.82786E8| 2016| 9.0246219E7| 3458861.12| Y| 99| 483030105084015|\n",
733 |n",
734 | "\n"
735 | ]
736 | }
737 | ],
738 | "source": [
739 | "df.describe().show()"
740 | ]
741 | },
742 | {
743 | "cell_type": "markdown",
744 | "id": "e7f4d289",
745 | "metadata": {
746 | "id": "e7f4d289"
747 | },
748 | "source": [
749 | "# Clean Data"
750 | ]
751 | },
752 | {
753 | "cell_type": "code",
754 | "execution_count": null,
755 | "id": "eddda1f7",
756 | "metadata": {
757 | "id": "eddda1f7"
758 | },
759 | "outputs": [],
760 | "source": [
761 | "# Drop duplicates\n",
762 | "df = df.dropDuplicates()"
763 | ]
764 | },
765 | {
766 | "cell_type": "code",
767 | "execution_count": null,
768 | "id": "2d651e5b",
769 | "metadata": {
770 | "colab": {
771 | "base_uri": "https://localhost:8080/"
772 | },
773 | "id": "2d651e5b",
774 | "outputId": "c59bf422-2e51-410e-fa39-b18c726e4e46"
775 | },
776 | "outputs": [
777 | {
778 | "name": "stdout",
779 | "output_type": "stream",
780 | "text": [
781 |n",
782 | "|parcelid|airconditioningtypeid|architecturalstyletypeid|basementsqft|bathroomcnt|bedroomcnt|buildingclasstypeid|buildingqualitytypeid|calculatedbathnbr|decktypeid|finishedfloor1squarefeet|calculatedfinishedsquarefeet|finishedsquarefeet12|finishedsquarefeet13|finishedsquarefeet15|finishedsquarefeet50|finishedsquarefeet6| fips|fireplacecnt|fullbathcnt|garagecarcnt|garagetotalsqft|hashottuborspa|heatingorsystemtypeid|latitude|longitude|lotsizesquarefeet|poolcnt|poolsizesum|pooltypeid10|pooltypeid2|pooltypeid7|propertycountylandusecode|propertylandusetypeid|propertyzoningdesc|rawcensustractandblock|regionidcity|regionidcounty|regionidneighborhood|regionidzip|roomcnt|storytypeid|threequarterbathnbr|typeconstructiontypeid|unitcnt|yardbuildingsqft17|yardbuildingsqft26|yearbuilt|numberofstories|fireplaceflag|structuretaxvaluedollarcnt|taxvaluedollarcnt|assessmentyear|landtaxvaluedollarcnt|taxamount|taxdelinquencyflag|taxdelinquencyyear|censustractandblock|\n",
783 |n",
784 | "| 0| 2173698| 2979156| 2983589| 11462| 11450| 2972588| 1046729| 128912| 2968121| 2782500| 55565| 276033| 2977545| 2794419| 2782500| 2963216|11437| 2672580| 128912| 2101950| 2101950| 2916203| 1178816| 11437| 11437| 276099|2467683| 2957257| 2948278| 2953142| 2499758| 12277| 11437| 1006588| 11437| 62845| 11437| 1828815| 13980| 11475| 2983593| 2673586| 2978470|1007727| 2904862| 2982570| 59928| 2303148| 2980054| 54982| 42550| 11439| 67733| 31250| 2928755| 2928753| 75126|\n",
785 |n",
786 | "\n"
787 | ]
788 | }
789 | ],
790 | "source": [
791 | "# Count the missing values in each column\n",
792 | "null_counts = df.select([sum(col(column).isNull().cast(\"int\")).alias(column) for column in df.columns])\n",
793 | "null_counts.show()"
794 | ]
795 | },
796 | {
797 | "cell_type": "code",
798 | "execution_count": null,
799 | "id": "c158a9c4-c639-47f5-9e29-6ac483daced6",
800 | "metadata": {
801 | "id": "c158a9c4-c639-47f5-9e29-6ac483daced6"
802 | },
803 | "outputs": [],
804 | "source": [
805 | "# Visualize the number of missing values in each column\n",
806 | "pandas_df = df.toPandas()\n",
807 | "missing_count = pandas_df.isna().sum()\n",
808 | "sns.barplot(x=missing_count.index, y=missing_count.values)\n",
809 | "plt.title('Numbers Missing Value on each column')\n",
810 | "plt.xlabel('Column')\n",
811 | "plt.ylabel('Numbers Missing')\n",
812 | "plt.show()"
813 | ]
814 | },
815 | {
816 | "cell_type": "code",
817 | "execution_count": null,
818 | "id": "4b1b542f",
819 | "metadata": {
820 | "colab": {
821 | "background_save": true
822 | },
823 | "id": "4b1b542f"
824 | },
825 | "outputs": [],
826 | "source": [
827 | "# Drop columns with more than 60% missing values\n",
828 | "def column_dropper(df, threshold):\n",
829 | " total_records = df.count()\n",
830 | " for col in df.columns:\n",
831 | " missing = df.filter(df[col].isNull()).count()\n",
832 | " missing_percent = missing / total_records\n",
833 | " if missing_percent > threshold:\n",
834 | " df = df.drop(col)\n",
835 | " return df\n",
836 | "\n",
837 | "df = column_dropper(df, 0.6)"
838 | ]
839 | },
840 | {
841 | "cell_type": "code",
842 | "execution_count": null,
843 | "id": "41bd87bb",
844 | "metadata": {
845 | "colab": {
846 | "base_uri": "https://localhost:8080/"
847 | },
848 | "id": "41bd87bb",
849 | "outputId": "ea48b121-fe86-4c50-cc77-c284beaec8c7"
850 | },
851 | "outputs": [
852 | {
853 | "name": "stdout",
854 | "output_type": "stream",
855 | "text": [
856 |n",
857 | "|parcelid|airconditioningtypeid|architecturalstyletypeid|basementsqft|bathroomcnt|bedroomcnt|buildingclasstypeid|buildingqualitytypeid|calculatedbathnbr|decktypeid|finishedfloor1squarefeet|calculatedfinishedsquarefeet|finishedsquarefeet12|finishedsquarefeet13|finishedsquarefeet15|finishedsquarefeet50|finishedsquarefeet6| fips|fireplacecnt|fullbathcnt|garagecarcnt|garagetotalsqft|hashottuborspa|heatingorsystemtypeid|latitude|longitude|lotsizesquarefeet|poolcnt|poolsizesum|pooltypeid10|pooltypeid2|pooltypeid7|propertycountylandusecode|propertylandusetypeid|propertyzoningdesc|rawcensustractandblock|regionidcity|regionidcounty|regionidneighborhood|regionidzip|roomcnt|storytypeid|threequarterbathnbr|typeconstructiontypeid|unitcnt|yardbuildingsqft17|yardbuildingsqft26|yearbuilt|numberofstories|fireplaceflag|structuretaxvaluedollarcnt|taxvaluedollarcnt|assessmentyear|landtaxvaluedollarcnt|taxamount|taxdelinquencyflag|taxdelinquencyyear|censustractandblock|\n",
858 |n",
859 | "| 0| 2173698| 2979156| 2983589| 11462| 11450| 2972588| 1046729| 128912| 2968121| 2782500| 55565| 276033| 2977545| 2794419| 2782500| 2963216|11437| 2672580| 128912| 2101950| 2101950| 2916203| 1178816| 11437| 11437| 276099|2467683| 2957257| 2948278| 2953142| 2499758| 12277| 11437| 1006588| 11437| 62845| 11437| 1828815| 13980| 11475| 2983593| 2673586| 2978470|1007727| 2904862| 2982570| 59928| 2303148| 2980054| 54982| 42550| 11439| 67733| 31250| 2928755| 2928753| 75126|\n",
860 |n",
861 | "\n"
862 | ]
863 | }
864 | ],
865 | "source": [
866 | "# Missing-value counts for the columns remaining after the drop\n",
867 | "null_counts = df.select([sum(col(column).isNull().cast(\"int\")).alias(column) for column in df.columns])\n",
868 | "null_counts.show()"
869 | ]
870 | },
871 | {
872 | "cell_type": "code",
873 | "execution_count": null,
874 | "id": "I_Mx_6U2MJ93",
875 | "metadata": {
876 | "id": "I_Mx_6U2MJ93"
877 | },
878 | "outputs": [],
879 | "source": [
880 | "df.show()"
881 | ]
882 | },
883 | {
884 | "cell_type": "code",
885 | "execution_count": null,
886 | "id": "KsG0VYVsUFAt",
887 | "metadata": {
888 | "id": "KsG0VYVsUFAt"
889 | },
890 | "outputs": [],
891 | "source": [
892 | "PARCELID: 0\n",
893 | "BATHROOMCNT: 11462\n",
894 | "BEDROOMCNT: 11450\n",
895 | "BUILDINGQUALITYTYPEID: 1046729\n",
896 | "CALCULATEDBATHNBR: 128912\n",
897 | "CALCULATEDFINISHEDSQUAREFEET: 55565\n",
898 | "FINISHEDSQUAREFEET12: 276033\n",
899 | "FIPS: 11437\n",
900 | "FULLBATHCNT: 128912\n",
901 | "HEATINGORSYSTEMTYPEID: 1178816\n",
902 | "LATITUDE: 11437\n",
903 | "LONGITUDE: 11437\n",
904 | "LOTSIZESQUAREFEET: 276099\n",
905 | "PROPERTYCOUNTYLANDUSECODE: 12277\n",
906 | "PROPERTYLANDUSETYPEID: 11437\n",
907 | "PROPERTYZONINGDESC: 1006588\n",
908 | "RAWCENSUSTRACTANDBLOCK: 11437\n",
909 | "REGIONIDCITY: 62845\n",
910 | "REGIONIDCOUNTY: 11437\n",
911 | "REGIONIDZIP: 13980\n",
912 | "ROOMCNT: 11475\n",
913 | "UNITCNT: 1007727\n",
914 | "YEARBUILT: 59928\n",
915 | "STRUCTURETAXVALUEDOLLARCNT: 54982\n",
916 | "TAXVALUEDOLLARCNT: 42550\n",
917 | "ASSESSMENTYEAR: 11439\n",
918 | "LANDTAXVALUEDOLLARCNT: 67733\n",
919 | "TAXAMOUNT: 31250\n",
920 | "CENSUSTRACTANDBLOCK: 75126"
921 | ]
922 | },
923 | {
924 | "cell_type": "code",
925 | "execution_count": null,
926 | "id": "f958f97a",
927 | "metadata": {
928 | "colab": {
929 | "base_uri": "https://localhost:8080/"
930 | },
931 | "id": "f958f97a",
932 | "outputId": "3abae1d5-c133-4199-c41c-065c365b9c9f"
933 | },
934 | "outputs": [
935 | {
936 | "data": {
937 | "text/plain": [
938 | "58"
939 | ]
940 | },
941 | "execution_count": 23,
942 | "metadata": {},
943 | "output_type": "execute_result"
944 | }
945 | ],
946 | "source": [
947 | "# Fill missing values\n",
948 | "values = {\n",
949 | " 'bathroomcnt': 'value1',\n",
950 | " 'bedroomcnt': 'value2',\n",
951 | " 'buildingqualitytypeid': \"\",\n",
952 | " \"CALCULATEDBATHNBR\": 128912,\n",
953 | " \"CALCULATEDFINISHEDSQUAREFEET\": 55565,\n",
954 | " \"FINISHEDSQUAREFEET12\": 276033,\n",
955 | " \"FIPS\": 11437,\n",
956 | " \"FULLBATHCNT\": 128912,\n",
957 | " \"HEATINGORSYSTEMTYPEID\": 1178816,\n",
958 | " \"LATITUDE\": 11437,\n",
959 | " \"LONGITUDE\": 11437,\n",
960 | " \"LOTSIZESQUAREFEET\": 276099,\n",
961 | " \"PROPERTYCOUNTYLANDUSECODE\": 12277,\n",
962 | " \"PROPERTYLANDUSETYPEID\": 11437,\n",
963 | " \"PROPERTYZONINGDESC\": 1006588,\n",
964 | " \"RAWCENSUSTRACTANDBLOCK\": 11437,\n",
965 | " \"REGIONIDCITY\": 62845,\n",
966 | " \"REGIONIDCOUNTY\": 11437,\n",
967 | " \"REGIONIDZIP\": 13980,\n",
968 | " \"ROOMCNT\": 11475,\n",
969 | " \"UNITCNT\": 1007727,\n",
970 | " \"YEARBUILT\": 59928,\n",
971 | " \"STRUCTURETAXVALUEDOLLARCNT\": 54982,\n",
972 | " \"TAXVALUEDOLLARCNT\": 42550,\n",
973 | " \"ASSESSMENTYEAR\": 11439,\n",
974 | " \"LANDTAXVALUEDOLLARCNT\": 67733,\n",
975 | " \"TAXAMOUNT\": 31250,\n",
976 | " \"CENSUSTRACTANDBLOCK\": 75126\n",
977 | "}\n",
978 | "filled_df = df.fillna(values)"
979 | ]
980 | },
981 | {
982 | "cell_type": "code",
983 | "execution_count": null,
984 | "id": "OTkG-6cMOxEt",
985 | "metadata": {
986 | "id": "OTkG-6cMOxEt"
987 | },
988 | "outputs": [],
989 | "source": [
990 | "null_counts = df.select([sum(col(column).isNull().cast(\"int\")).alias(column) for column in df.columns])\n",
991 | "null_counts.show()"
992 | ]
993 | },
994 | {
995 | "cell_type": "markdown",
996 | "id": "dnypT2myKfsh",
997 | "metadata": {
998 | "id": "dnypT2myKfsh"
999 | },
1000 | "source": [
1001 | "# Feature Engineering"
1002 | ]
1003 | },
1004 | {
1005 | "cell_type": "code",
1006 | "execution_count": null,
1007 | "id": "3f5d77f7",
1008 | "metadata": {
1009 | "id": "3f5d77f7"
1010 | },
1011 | "outputs": [],
1012 | "source": [
1013 | "# One-hot encoding for 'bathroomcnt'\n",
1014 | "encoder_bathroomcnt = OneHotEncoder(inputCols=['bathroomcnt'], outputCols=['bathroomcnt_dummy'])\n",
1015 | "df = encoder_bathroomcnt.fit(df).transform(df)\n"
1016 | ]
1017 | },
1018 | {
1019 | "cell_type": "code",
1020 | "execution_count": null,
1021 | "id": "5c50d08f",
1022 | "metadata": {
1023 | "id": "5c50d08f"
1024 | },
1025 | "outputs": [],
1026 | "source": [
1027 | "# One-hot encoding for 'bedroomcnt'\n",
1028 | "encoder_bedroomcnt = OneHotEncoder(inputCols=['bedroomcnt'], outputCols=['bedroomcnt_dummy'])\n",
1029 | "df = encoder_bedroomcnt.fit(df).transform(df)"
1030 | ]
1031 | },
1032 | {
1033 | "cell_type": "code",
1034 | "execution_count": null,
1035 | "id": "Z2pj086_RJFl",
1036 | "metadata": {
1037 | "id": "Z2pj086_RJFl"
1038 | },
1039 | "outputs": [],
1040 | "source": [
1041 | "# Assemble features into a single vector column\n",
1042 | "assembler = VectorAssembler(inputCols=['roomcnt', 'latitude', 'longitude', 'bathroomcnt_dummy', 'bedroomcnt_dummy'], outputCol='features')\n",
1043 | "df = assembler.transform(df)"
1044 | ]
1045 | },
1046 | {
1047 | "cell_type": "markdown",
1048 | "id": "ZScPc9wxREPs",
1049 | "metadata": {
1050 | "id": "ZScPc9wxREPs"
1051 | },
1052 | "source": [
1053 | "# Build Linear Regression Model"
1054 | ]
1055 | },
1056 | {
1057 | "cell_type": "code",
1058 | "execution_count": null,
1059 | "id": "3ef37703",
1060 | "metadata": {
1061 | "id": "3ef37703"
1062 | },
1063 | "outputs": [],
1064 | "source": [
1065 | "# Split the data\n",
1066 | "train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)"
1067 | ]
1068 | },
1069 | {
1070 | "cell_type": "code",
1071 | "execution_count": null,
1072 | "id": "cadb50de",
1073 | "metadata": {
1074 | "id": "cadb50de"
1075 | },
1076 | "outputs": [],
1077 | "source": [
1078 | "# Build the model\n",
1079 | "regression = LinearRegression(featuresCol='features', labelCol='duration')\n",
1080 | "model = regression.fit(train_data)"
1081 | ]
1082 | },
1083 | {
1084 | "cell_type": "code",
1085 | "execution_count": null,
1086 | "id": "b56f0ff4",
1087 | "metadata": {
1088 | "id": "b56f0ff4"
1089 | },
1090 | "outputs": [],
1091 | "source": [
1092 | "# Make predictions\n",
1093 | "predictions = model.transform(test_data)"
1094 | ]
1095 | },
1096 | {
1097 | "cell_type": "markdown",
1098 | "id": "FeP14XfNRgzS",
1099 | "metadata": {
1100 | "id": "FeP14XfNRgzS"
1101 | },
1102 | "source": [
1103 | "# Evaluate Model"
1104 | ]
1105 | },
1106 | {
1107 | "cell_type": "code",
1108 | "execution_count": null,
1109 | "id": "9e490664",
1110 | "metadata": {
1111 | "id": "9e490664"
1112 | },
1113 | "outputs": [],
1114 | "source": [
1115 | "# Evaluate the model\n",
1116 | "evaluator = RegressionEvaluator(labelCol='duration', metricName='rmse')\n",
1117 | "rmse = evaluator.evaluate(predictions)\n",
1118 | "print(\"Root Mean Square Error (RMSE) on test data =\", rmse)\n",
1119 | "\n",
1120 | "# Print coefficients and intercept for interpretation\n",
1121 | "print(\"Coefficients:\", model.coefficients)\n",
1122 | "print(\"Intercept:\", model.intercept)"
1123 | ]
1124 | },
1125 | {
1126 | "cell_type": "code",
1127 | "execution_count": null,
1128 | "id": "c2f0476d",
1129 | "metadata": {
1130 | "id": "c2f0476d"
1131 | },
1132 | "outputs": [],
1133 | "source": []
1134 | },
1135 | {
1136 | "cell_type": "code",
1137 | "execution_count": null,
1138 | "id": "25938f77",
1139 | "metadata": {
1140 | "id": "25938f77"
1141 | },
1142 | "outputs": [],
1143 | "source": []
1144 | },
1145 | {
1146 | "cell_type": "code",
1147 | "execution_count": null,
1148 | "id": "a029e08d",
1149 | "metadata": {
1150 | "id": "a029e08d"
1151 | },
1152 | "outputs": [],
1153 | "source": []
1154 | },
1155 | {
1156 | "cell_type": "code",
1157 | "execution_count": null,
1158 | "id": "4a4ad2d6",
1159 | "metadata": {
1160 | "id": "4a4ad2d6"
1161 | },
1162 | "outputs": [],
1163 | "source": []
1164 | },
1165 | {
1166 | "cell_type": "code",
1167 | "execution_count": null,
1168 | "id": "31873dac",
1169 | "metadata": {
1170 | "id": "31873dac"
1171 | },
1172 | "outputs": [],
1173 | "source": []
1174 | },
1175 | {
1176 | "cell_type": "code",
1177 | "execution_count": null,
1178 | "id": "0d65e838",
1179 | "metadata": {
1180 | "id": "0d65e838"
1181 | },
1182 | "outputs": [],
1183 | "source": []
1184 | },
1185 | {
1186 | "cell_type": "code",
1187 | "execution_count": null,
1188 | "id": "041add92",
1189 | "metadata": {
1190 | "id": "041add92"
1191 | },
1192 | "outputs": [],
1193 | "source": []
1194 | },
1195 | {
1196 | "cell_type": "code",
1197 | "execution_count": null,
1198 | "id": "fa5fb063-3ed4-40bb-a566-f9eab2ca520e",
1199 | "metadata": {
1200 | "id": "fa5fb063-3ed4-40bb-a566-f9eab2ca520e"
1201 | },
1202 | "outputs": [],
1203 | "source": []
1204 | },
1205 | {
1206 | "cell_type": "code",
1207 | "execution_count": null,
1208 | "id": "8d651e75-dc8a-4bf4-ba9f-de4e27ddfa04",
1209 | "metadata": {
1210 | "id": "8d651e75-dc8a-4bf4-ba9f-de4e27ddfa04"
1211 | },
1212 | "outputs": [],
1213 | "source": []
1214 | },
1215 | {
1216 | "cell_type": "code",
1217 | "execution_count": null,
1218 | "id": "a4916a6a-f1e8-49c9-850d-f94639c337e7",
1219 | "metadata": {
1220 | "id": "a4916a6a-f1e8-49c9-850d-f94639c337e7"
1221 | },
1222 | "outputs": [],
1223 | "source": []
1224 | },
1225 | {
1226 | "cell_type": "code",
1227 | "execution_count": null,
1228 | "id": "cd1eb575-3b8b-49c2-943f-3f79a908d626",
1229 | "metadata": {
1230 | "id": "cd1eb575-3b8b-49c2-943f-3f79a908d626"
1231 | },
1232 | "outputs": [],
1233 | "source": []
1234 | },
1235 | {
1236 | "cell_type": "code",
1237 | "execution_count": null,
1238 | "id": "0907696c-cc0c-4d33-9707-89696752a4d1",
1239 | "metadata": {
1240 | "id": "0907696c-cc0c-4d33-9707-89696752a4d1"
1241 | },
1242 | "outputs": [],
1243 | "source": []
1244 | },
1245 | {
1246 | "cell_type": "code",
1247 | "execution_count": null,
1248 | "id": "29b355ad",
1249 | "metadata": {
1250 | "id": "29b355ad"
1251 | },
1252 | "outputs": [],
1253 | "source": []
1254 | },
1255 | {
1256 | "cell_type": "markdown",
1257 | "id": "6d5349b1-4b09-4ace-9b02-a6eed61843bd",
1258 | "metadata": {
1259 | "id": "6d5349b1-4b09-4ace-9b02-a6eed61843bd"
1260 | },
1261 | "source": [
1262 | "# Pre-Processing Data"
1263 | ]
1264 | },
1265 | {
1266 | "cell_type": "markdown",
1267 | "id": "8788823a-5c40-481c-8dea-97ce436899bc",
1268 | "metadata": {
1269 | "id": "8788823a-5c40-481c-8dea-97ce436899bc"
1270 | },
1271 | "source": [
1272 | "### 1. Check Data"
1273 | ]
1274 | },
1275 | {
1276 | "cell_type": "code",
1277 | "execution_count": null,
1278 | "id": "7c962808-b082-48cd-8b04-11b8d821f32c",
1279 | "metadata": {
1280 | "id": "7c962808-b082-48cd-8b04-11b8d821f32c"
1281 | },
1282 | "outputs": [],
1283 | "source": [
1284 | "df = spark.read.csv(\"properties_2016.csv\", header=True, inferSchema=True)"
1285 | ]
1286 | },
1287 | {
1288 | "cell_type": "code",
1289 | "execution_count": null,
1290 | "id": "bd61df3c-936d-4edf-8cc4-0d7012bee2ce",
1291 | "metadata": {
1292 | "id": "bd61df3c-936d-4edf-8cc4-0d7012bee2ce"
1293 | },
1294 | "outputs": [],
1295 | "source": [
1296 | "df.limit(10)"
1297 | ]
1298 | },
1299 | {
1300 | "cell_type": "code",
1301 | "execution_count": null,
1302 | "id": "1578e940-ec6f-4867-a345-bc1c9d144727",
1303 | "metadata": {
1304 | "id": "1578e940-ec6f-4867-a345-bc1c9d144727"
1305 | },
1306 | "outputs": [],
1307 | "source": [
1308 | "# convert all column names to uppercase\n",
1309 | "for col in df.columns:\n",
1310 | " df = df.withColumnRenamed(col, col.upper())"
1311 | ]
1312 | },
1313 | {
1314 | "cell_type": "markdown",
1315 | "id": "b0a960fc-0f60-4093-b135-04fb779d33cc",
1316 | "metadata": {
1317 | "id": "b0a960fc-0f60-4093-b135-04fb779d33cc"
1318 | },
1319 | "source": [
1320 | "### 2. Check Descriptive Statistics"
1321 | ]
1322 | },
1323 | {
1324 | "cell_type": "code",
1325 | "execution_count": null,
1326 | "id": "7dc0f07a-6b10-4cf5-a15b-2469b0907a9b",
1327 | "metadata": {
1328 | "id": "7dc0f07a-6b10-4cf5-a15b-2469b0907a9b"
1329 | },
1330 | "outputs": [],
1331 | "source": [
1332 | "df.describe().limit(20)"
1333 | ]
1334 | },
1335 | {
1336 | "cell_type": "markdown",
1337 | "id": "80beb77e-de1e-4ea3-8a73-e3c4619e614b",
1338 | "metadata": {
1339 | "id": "80beb77e-de1e-4ea3-8a73-e3c4619e614b"
1340 | },
1341 | "source": [
1342 | "### 3. Check Data Types"
1343 | ]
1344 | },
1345 | {
1346 | "cell_type": "code",
1347 | "execution_count": null,
1348 | "id": "d048bb60-008b-4c79-87dc-6ae9703bcbea",
1349 | "metadata": {
1350 | "id": "d048bb60-008b-4c79-87dc-6ae9703bcbea"
1351 | },
1352 | "outputs": [],
1353 | "source": [
1354 | "df.printSchema()"
1355 | ]
1356 | },
1357 | {
1358 | "cell_type": "markdown",
1359 | "id": "3b3d485b-303e-482f-966c-9692f60c315c",
1360 | "metadata": {
1361 | "id": "3b3d485b-303e-482f-966c-9692f60c315c"
1362 | },
1363 | "source": [
1364 | "### 4. Check the Current Number of Columns and Rows"
1365 | ]
1366 | },
1367 | {
1368 | "cell_type": "code",
1369 | "execution_count": null,
1370 | "id": "1869a2ac-0579-44b0-b7ea-d0f24852bbc2",
1371 | "metadata": {
1372 | "id": "1869a2ac-0579-44b0-b7ea-d0f24852bbc2"
1373 | },
1374 | "outputs": [],
1375 | "source": [
1376 | "# Columns\n",
1377 | "len(df.columns)"
1378 | ]
1379 | },
1380 | {
1381 | "cell_type": "code",
1382 | "execution_count": null,
1383 | "id": "c63758b3-a367-4848-b6ae-f492f60ce9f8",
1384 | "metadata": {
1385 | "id": "c63758b3-a367-4848-b6ae-f492f60ce9f8"
1386 | },
1387 | "outputs": [],
1388 | "source": [
1389 | "# Rows\n",
1390 | "df.count()"
1391 | ]
1392 | },
1393 | {
1394 | "cell_type": "markdown",
1395 | "id": "337dffc1-edd9-454d-b07a-87e02566685f",
1396 | "metadata": {
1397 | "id": "337dffc1-edd9-454d-b07a-87e02566685f"
1398 | },
1399 | "source": [
1400 | "### 5. Drop Duplicates"
1401 | ]
1402 | },
1403 | {
1404 | "cell_type": "code",
1405 | "execution_count": null,
1406 | "id": "fb672b6c-e6be-4d3c-b0bb-0b4a4ab29cab",
1407 | "metadata": {
1408 | "id": "fb672b6c-e6be-4d3c-b0bb-0b4a4ab29cab"
1409 | },
1410 | "outputs": [],
1411 | "source": [
1412 | "df = df.dropDuplicates()"
1413 | ]
1414 | },
1415 | {
1416 | "cell_type": "markdown",
1417 | "id": "1c16498a-6da2-4fc1-8403-437795fdff61",
1418 | "metadata": {
1419 | "id": "1c16498a-6da2-4fc1-8403-437795fdff61"
1420 | },
1421 | "source": [
1422 | "### 6. Check Missing Values"
1423 | ]
1424 | },
1425 | {
1426 | "cell_type": "code",
1427 | "execution_count": null,
1428 | "id": "68e60e46-6a73-4f7d-87ee-014e7ddbef1c",
1429 | "metadata": {
1430 | "id": "68e60e46-6a73-4f7d-87ee-014e7ddbef1c"
1431 | },
1432 | "outputs": [],
1433 | "source": [
1434 | "def check_null_count():\n",
1435 | " for column in df.columns:\n",
1436 | " null_count = df.filter(df[column].isNull()).count()\n",
1437 | " print(f\"{column}: {null_count}\")"
1438 | ]
1439 | },
1440 | {
1441 | "cell_type": "code",
1442 | "execution_count": null,
1443 | "id": "ddaf99dd-78e8-4ece-82ed-6081bee69b60",
1444 | "metadata": {
1445 | "id": "ddaf99dd-78e8-4ece-82ed-6081bee69b60"
1446 | },
1447 | "outputs": [],
1448 | "source": [
1449 | "# Number of missing values in each column\n",
1450 | "check_null_count()"
1451 | ]
1452 | },
1453 | {
1454 | "cell_type": "markdown",
1455 | "id": "42fbdaaa-1895-4d31-9ed8-2153261108e5",
1456 | "metadata": {
1457 | "id": "42fbdaaa-1895-4d31-9ed8-2153261108e5"
1458 | },
1459 | "source": [
1460 | "##### Drop columns with more than 60% missing values"
1461 | ]
1462 | },
1463 | {
1464 | "cell_type": "code",
1465 | "execution_count": null,
1466 | "id": "59979bc8-5f48-4697-857e-b0bd65b00472",
1467 | "metadata": {
1468 | "id": "59979bc8-5f48-4697-857e-b0bd65b00472"
1469 | },
1470 | "outputs": [],
1471 | "source": [
1472 | "def column_dropper(df, threshold):\n",
1473 | " # Takes a DataFrame and a missing-value threshold expressed as a fraction (0-1). Returns the DataFrame without the columns whose missing fraction exceeds it.\n",
1474 | " total_records = df.count()\n",
1475 | " for col in df.columns:\n",
1476 | " # Calculate the fraction of missing values in this column\n",
1477 | " missing = df.where(df[col].isNull()).count()\n",
1478 | " missing_percent = missing / total_records\n",
1479 | " # Drop the column if its fraction of missing values exceeds the threshold\n",
1480 | " if missing_percent > threshold:\n",
1481 | " df = df.drop(col)\n",
1482 | " return df\n",
1483 | "\n",
1484 | "# Drop columns that are more than 60% missing\n",
1485 | "df = column_dropper(df, 0.6)"
1486 | ]
1487 | },
1488 | {
1489 | "cell_type": "code",
1490 | "execution_count": null,
1491 | "id": "dfd6df2b-4a42-4a62-ae85-0d882c85405f",
1492 | "metadata": {
1493 | "id": "dfd6df2b-4a42-4a62-ae85-0d882c85405f"
1494 | },
1495 | "outputs": [],
1496 | "source": [
1497 | "check_null_count()"
1498 | ]
1499 | },
1500 | {
1501 | "cell_type": "markdown",
1502 | "id": "b7526960-0c18-4577-ac67-b6542f6ff17e",
1503 | "metadata": {
1504 | "id": "b7526960-0c18-4577-ac67-b6542f6ff17e"
1505 | },
1506 | "source": [
1507 | "### 7. Outlier Filtering"
1508 | ]
1509 | },
1510 | {
1511 | "cell_type": "code",
1512 | "execution_count": null,
1513 | "id": "af033c41-07ea-4474-bcce-9d388e150aef",
1514 | "metadata": {
1515 | "id": "af033c41-07ea-4474-bcce-9d388e150aef"
1516 | },
1517 | "outputs": [],
1518 | "source": [
1519 | "mean_val = df.agg({'BATHROOMCNT': 'mean'}).collect()[0][0]\n",
1520 | "stddev_val = df.agg({'BATHROOMCNT': 'stddev'}).collect()[0][0]\n",
1521 | "\n",
1522 | "low_bound = mean_val - (3 * stddev_val)\n",
1523 | "hi_bound = mean_val + (3 * stddev_val)\n",
1524 | "\n",
1525 | "df = df.where((df['BATHROOMCNT'] < hi_bound) & (df['BATHROOMCNT'] > low_bound))"
1526 | ]
1527 | },
1528 | {
1529 | "cell_type": "markdown",
1530 | "id": "cb41849c-1339-4926-9c6c-3a23507d52ff",
1531 | "metadata": {
1532 | "id": "cb41849c-1339-4926-9c6c-3a23507d52ff"
1533 | },
1534 | "source": [
1535 | "### 8. Standardize Data (Z-score)"
1536 | ]
1537 | },
1538 | {
1539 | "cell_type": "code",
1540 | "execution_count": null,
1541 | "id": "5e57b8b8-3ce2-401d-8dda-2402a608f5ee",
1542 | "metadata": {
1543 | "id": "5e57b8b8-3ce2-401d-8dda-2402a608f5ee"
1544 | },
1545 | "outputs": [],
1546 | "source": [
1547 | "mean = df.agg({'BATHROOMCNT': 'mean'}).collect()[0][0]\n",
1548 | "stddev = df.agg({'BATHROOMCNT': 'stddev'}).collect()[0][0]\n",
1549 | "# Create a new column holding the z-score standardized BATHROOMCNT values\n",
1550 | "df = df.withColumn(\"ztrans_days\", (df['BATHROOMCNT'] - mean) / stddev)\n",
1551 | "df.agg({'ztrans_days': 'mean'}).collect()\n",
1552 | "df.agg({'ztrans_days': 'stddev'}).collect()"
1553 | ]
1554 | },
1555 | {
1556 | "cell_type": "markdown",
1557 | "id": "56d5a156-5472-4c77-b72f-173974721375",
1558 | "metadata": {
1559 | "id": "56d5a156-5472-4c77-b72f-173974721375"
1560 | },
1561 | "source": [
1562 | "# Feature Engineering"
1563 | ]
1564 | },
1565 | {
1566 | "cell_type": "markdown",
1567 | "id": "6865adc7-b638-4b5a-b650-d57a415ed110",
1568 | "metadata": {
1569 | "id": "6865adc7-b638-4b5a-b650-d57a415ed110"
1570 | },
1571 | "source": [
1572 | "### 1. Bucketing"
1573 | ]
1574 | },
1575 | {
1576 | "cell_type": "code",
1577 | "execution_count": null,
1578 | "id": "5a695ed2-0573-4fa6-93c8-e123f26042a7",
1579 | "metadata": {
1580 | "id": "5a695ed2-0573-4fa6-93c8-e123f26042a7"
1581 | },
1582 | "outputs": [],
1583 | "source": [
1584 | "splits = [0, 1, 2, 3, 4, float('Inf')]\n",
1585 | "\n",
1586 | "# Create bucketing transformer\n",
1587 | "buck = Bucketizer(splits=splits, inputCol='TAXAMOUNT', outputCol='TAXA')\n",
1588 | "\n",
1589 | "# Apply transformer\n",
1590 | "df = buck.transform(df)\n",
1591 | "\n",
1592 | "# Inspect results\n",
1593 | "df[['TAXAMOUNT', 'TAXA']].show()"
1594 | ]
1595 | },
1596 | {
1597 | "cell_type": "markdown",
1598 | "id": "c487737f-7e48-42b1-83c9-8c5ef75951aa",
1599 | "metadata": {
1600 | "id": "c487737f-7e48-42b1-83c9-8c5ef75951aa"
1601 | },
1602 | "source": [
1603 | "### 2. One-hot Encoding"
1604 | ]
1605 | },
1606 | {
1607 | "cell_type": "code",
1608 | "execution_count": null,
1609 | "id": "cf27659d-29ba-4fab-b8c5-30a979c6a445",
1610 | "metadata": {
1611 | "id": "cf27659d-29ba-4fab-b8c5-30a979c6a445"
1612 | },
1613 | "outputs": [],
1614 | "source": [
1615 | "from pyspark.ml.feature import OneHotEncoder, StringIndexer\n",
1616 | "\n",
1617 | "# Map strings to numbers with string indexer\n",
1618 | "string_indexer = StringIndexer(inputCol='ROOMCNT', outputCol='ROOM_Index')\n",
1619 | "indexed_df = string_indexer.fit(df).transform(df)\n",
1620 | "\n",
1621 | "# Onehot encode indexed values\n",
1622 | "encoder = OneHotEncoder(inputCol='ROOM_Index', outputCol='ROOM_Vec')\n",
1623 | "encoded_df = encoder.fit(indexed_df).transform(indexed_df)\n",
1624 | "\n",
1625 | "# Inspect the transformation steps\n",
1626 | "encoded_df[['ROOMCNT', 'ROOM_Index', 'ROOM_Vec']].show(truncate=100)"
1627 | ]
1628 | }
1629 | ],
1630 | "metadata": {
1631 | "colab": {
1632 | "provenance": []
1633 | },
1634 | "kernelspec": {
1635 | "display_name": "Python 3 (ipykernel)",
1636 | "language": "python",
1637 | "name": "python3"
1638 | },
1639 | "language_info": {
1640 | "codemirror_mode": {
1641 | "name": "ipython",
1642 | "version": 3
1643 | },
1644 | "file_extension": ".py",
1645 | "mimetype": "text/x-python",
1646 | "name": "python",
1647 | "nbconvert_exporter": "python",
1648 | "pygments_lexer": "ipython3",
1649 | "version": "3.11.6"
1650 | }
1651 | },
1652 | "nbformat": 4,
1653 | "nbformat_minor": 5
1654 | }
1655 |
--------------------------------------------------------------------------------