├── public
│   ├── notebooks
│   │   ├── EDA.ipynb
│   │   └── Preprocessing.ipynb
│   └── images
│       ├── DataFlow.png
│       ├── DataLineage.png
│       ├── data_flow.png
│       ├── youtube_icon.png
│       ├── Data_flow_youtube.png
│       └── Data_flow_youtube2.png
├── etl_pipeline
│   ├── dbt_tranform
│   │   ├── analyses
│   │   │   └── .gitkeep
│   │   ├── macros
│   │   │   ├── .gitkeep
│   │   │   └── generate_schema_name.sql
│   │   ├── seeds
│   │   │   └── .gitkeep
│   │   ├── tests
│   │   │   └── .gitkeep
│   │   ├── snapshots
│   │   │   └── .gitkeep
│   │   ├── .gitignore
│   │   ├── .user.yml
│   │   ├── packages.yml
│   │   ├── package-lock.yml
│   │   ├── models
│   │   │   ├── youtube_trending
│   │   │   │   ├── search_linkvideo.sql
│   │   │   │   ├── search_videocategory.sql
│   │   │   │   └── search_information.sql
│   │   │   ├── sources.yml
│   │   │   └── schema.yml
│   │   ├── profiles.yml
│   │   ├── README.md
│   │   └── dbt_project.yml
│   ├── etl_pipeline
│   │   ├── jobs
│   │   │   └── __init__.py
│   │   ├── assets
│   │   │   ├── __init__.py
│   │   │   ├── dbt.py
│   │   │   ├── warehouse.py
│   │   │   ├── gold.py
│   │   │   ├── bronze.py
│   │   │   └── silver.py
│   │   ├── schedules
│   │   │   └── __init__.py
│   │   ├── constants.py
│   │   ├── partitions
│   │   │   └── __init__.py
│   │   ├── func_process.py
│   │   ├── __init__.py
│   │   └── resources
│   │       ├── mysql_io_manager.py
│   │       ├── __init__.py
│   │       ├── minio_io_manager.py
│   │       ├── psql_io_manager.py
│   │       ├── spark_io_manager.py
│   │       └── youtube_io_manager.py
│   ├── etl_pipeline_tests
│   │   ├── __init__.py
│   │   └── test_assets.py
│   ├── setup.cfg
│   ├── pyproject.toml
│   ├── setup.py
│   ├── requirements.txt
│   ├── README.md
│   └── Dockerfile
├── app
│   ├── icons
│   │   ├── video.png
│   │   ├── youtube.png
│   │   ├── youtube_v2.png
│   │   ├── icons8-like-48.png
│   │   ├── icons8-view-48.png
│   │   ├── icons8-channel-48.png
│   │   ├── icons8-category-48.png
│   │   └── icons8-thumbs-down-skin-type-4-48.png
│   ├── streamlit_app.py
│   └── pages
│       ├── search_video.py
│       └── video_detail.py
├── docker-images
│   ├── dagster
│   │   ├── requirements.txt
│   │   └── Dockerfile
│   ├── streamlit
│   │   ├── requirements.txt
│   │   └── Dockerfile
│   └── spark
│       ├── Dockerfile
│       └── spark-defaults.conf
├── dagster_home
│   ├── workspace.yaml
│   └── dagster.yaml
├── .gitignore
├── LICENSE
├── Makefile
├── load_dataset
│   ├── mysql_load.sql
│   ├── psql_schemas.sql
│   └── mysql_schemas.sql
├── docker-compose.yaml
└── README.md
/public/notebooks/EDA.ipynb: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/jobs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/etl_pipeline/etl_pipeline/assets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/schedules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline_tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline_tests/test_assets.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /etl_pipeline/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = etl_pipeline 3 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/.user.yml: -------------------------------------------------------------------------------- 1 | id: 34ac7379-38f0-4235-94d4-210cae8a4832 2 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/constants.py: -------------------------------------------------------------------------------- 1 | START_DATE = "2020-06-15" 2 | END_DATE = "2024-05-13" -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 -------------------------------------------------------------------------------- /app/icons/video.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/video.png -------------------------------------------------------------------------------- /app/icons/youtube.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/youtube.png -------------------------------------------------------------------------------- /app/icons/youtube_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/youtube_v2.png -------------------------------------------------------------------------------- /docker-images/dagster/requirements.txt: -------------------------------------------------------------------------------- 1 | dagster==1.7.3 2 | dagit==1.7.3 3 | dagster-postgres 4 | dagster-dbt==0.23.3 5 | dagster-spark==0.23.3 -------------------------------------------------------------------------------- /public/images/DataFlow.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/DataFlow.png -------------------------------------------------------------------------------- /app/icons/icons8-like-48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/icons8-like-48.png -------------------------------------------------------------------------------- /app/icons/icons8-view-48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/icons8-view-48.png -------------------------------------------------------------------------------- /dagster_home/workspace.yaml: -------------------------------------------------------------------------------- 1 | load_from: 2 | - grpc_server: 3 | host: etl_pipeline 4 | port: 4000 5 | location_name: "etl_pipeline" -------------------------------------------------------------------------------- /public/images/DataLineage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/DataLineage.png -------------------------------------------------------------------------------- /public/images/data_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/data_flow.png -------------------------------------------------------------------------------- /app/icons/icons8-channel-48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/icons8-channel-48.png -------------------------------------------------------------------------------- /public/images/youtube_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/youtube_icon.png -------------------------------------------------------------------------------- /app/icons/icons8-category-48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/icons8-category-48.png -------------------------------------------------------------------------------- /public/images/Data_flow_youtube.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/Data_flow_youtube.png -------------------------------------------------------------------------------- /public/images/Data_flow_youtube2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/public/images/Data_flow_youtube2.png -------------------------------------------------------------------------------- /docker-images/streamlit/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.34.0 2 | 
psycopg2-binary==2.9.9 3 | pandas==2.2.2 4 | polars==0.20.23 5 | # scikit-learn==1.5.0 6 | # surprise==0.1 -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | sha1_hash: a158c48c59c2bb7d729d2a4e215aabe5bb4f3353 5 | -------------------------------------------------------------------------------- /etl_pipeline/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.dagster] 6 | module_name = "etl_pipeline" 7 | -------------------------------------------------------------------------------- /app/icons/icons8-thumbs-down-skin-type-4-48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/longNguyen010203/Youtube-Recommend-Master-ETL-Pipeline/HEAD/app/icons/icons8-thumbs-down-skin-type-4-48.png -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/models/youtube_trending/search_linkvideo.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | {{ config(materialized="table") }} 4 | 5 | SELECT 6 | video_id, 7 | link_video 8 | FROM {{ source("gold", "linkvideos") }} -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/models/youtube_trending/search_videocategory.sql: -------------------------------------------------------------------------------- 1 | 2 | {{ config(materialized="table") }} 3 | 4 | SELECT 5 | categoryid, 6 | categoryname 7 | FROM {{ source("gold", "videocategory") }} -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/partitions/__init__.py: -------------------------------------------------------------------------------- 1 | from dagster import MonthlyPartitionsDefinition 2 | from .. 
import constants 3 | 4 | 5 | monthly_partitions = MonthlyPartitionsDefinition( 6 | start_date=constants.START_DATE, 7 | end_date=constants.END_DATE 8 | ) -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/func_process.py: -------------------------------------------------------------------------------- 1 | def replace_str(value: str): 2 | return value.replace("default", "maxresdefault") 3 | 4 | def format_date(value: str): 5 | return value.replace("T", " ").replace("Z", "") 6 | 7 | def convert(value: str): 8 | return value.replace('"', '') -------------------------------------------------------------------------------- /etl_pipeline/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="etl_pipeline", 5 | packages=find_packages(exclude=["etl_pipeline_tests"]), 6 | install_requires=[ 7 | "dagster", 8 | "dagster-cloud" 9 | ], 10 | extras_require={"dev": ["dagster-webserver", "pytest"]}, 11 | ) 12 | -------------------------------------------------------------------------------- /etl_pipeline/requirements.txt: -------------------------------------------------------------------------------- 1 | dagster==1.7.3 2 | dagit==1.7.3 3 | pandas==2.2.2 4 | polars==0.20.23 5 | pyarrow==16.0.0 6 | minio==7.2.7 7 | pymysql==1.1.0 8 | cryptography==42.0.5 9 | psycopg2-binary==2.9.9 10 | dagster-postgres 11 | google-api-python-client==2.127.0 12 | pyspark==3.4.3 13 | dbt-postgres==1.7.13 14 | dagster-dbt==0.23.3 15 | dagster-spark==0.23.3 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .etl-pipeline/ 2 | .idea/ 3 | .pytest_cache/ 4 | 5 | .logs_queue/ 6 | .nux/ 7 | .telemetry/ 8 | history/ 9 | logs/ 10 | dagster_home/schedules/ 11 | 12 | dataset/ 13 | 14 | .mypy_cache/ 15 | __pycache__/ 16 | 17 | minio/ 18 | mysql/ 19 | postgresql/ 20 | venv/ 21 | 22 | .env/ 23 | data/ 24 | .streamlit/ 25 | 26 | .env.spark_master 27 | .env.spark_worker 28 | .env -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/macros/generate_schema_name.sql: -------------------------------------------------------------------------------- 1 | {% macro generate_schema_name(custom_schema_name, node) -%} 2 | 3 | {%- set default_schema = target.schema -%} 4 | {%- if custom_schema_name is none -%} 5 | 6 | {{ default_schema }} 7 | 8 | {%- else -%} 9 | 10 | {{ custom_schema_name | trim }} 11 | 12 | {%- endif -%} 13 | 14 | {%- endmacro %} -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/profiles.yml: -------------------------------------------------------------------------------- 1 | dbt_tranform: 2 | outputs: 3 | dev: 4 | dbname: youTube_trending_video 5 | host: "{{ env_var('POSTGRES_HOST') }}" 6 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 7 | port: 5432 8 | schema: gold 9 | threads: 1 10 | type: postgres 11 | user: "{{ env_var('POSTGRES_USER') }}" 12 | target: dev 13 | -------------------------------------------------------------------------------- /docker-images/dagster/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | ENV DAGSTER_HOME=/opt/dagster/dagster_home 4 | 5 | RUN mkdir -p $DAGSTER_HOME && \ 6 | mkdir -p $DAGSTER_HOME/storage && \ 7 | mkdir -p 
$DAGSTER_HOME/compute_logs && \ 8 | mkdir -p $DAGSTER_HOME/local_artifact_storage 9 | 10 | WORKDIR $DAGSTER_HOME 11 | 12 | COPY requirements.txt $DAGSTER_HOME 13 | 14 | RUN pip install --upgrade pip && pip install -r requirements.txt -------------------------------------------------------------------------------- /docker-images/streamlit/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | WORKDIR /app 4 | 5 | RUN apt-get update && apt-get install -y \ 6 | build-essential \ 7 | curl \ 8 | software-properties-common \ 9 | git \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | COPY . . 13 | 14 | RUN pip install --upgrade pip && pip install -r requirements.txt 15 | 16 | EXPOSE 8501 17 | 18 | HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health 19 | 20 | ENTRYPOINT ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"] -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/models/sources.yml: -------------------------------------------------------------------------------- 1 | 2 | version: 2 3 | 4 | sources: 5 | - name: gold 6 | tables: 7 | - name: videocategory 8 | meta: 9 | dagster: 10 | asset_key: ["warehouse", "gold", "videoCategory"] 11 | - name: linkvideos 12 | meta: 13 | dagster: 14 | asset_key: ["warehouse", "gold", "linkVideos"] 15 | - name: metricvideos 16 | meta: 17 | dagster: 18 | asset_key: ["warehouse", "gold", "metricVideos"] 19 | - name: informationvideos 20 | meta: 21 | dagster: 22 | asset_key: ["warehouse", "gold", "informationVideos"] -------------------------------------------------------------------------------- /docker-images/spark/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/bitnami/spark:3.4.3 2 | 3 | USER root 4 | 5 | # Install prerequisites 6 | RUN apt-get update && apt-get install -y curl 7 | 8 | RUN curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \ 9 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \ 10 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \ 11 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \ 12 | && mv s3-2.18.41.jar /opt/bitnami/spark/jars \ 13 | && mv aws-java-sdk-1.12.367.jar /opt/bitnami/spark/jars \ 14 | && mv delta-core_2.12-2.2.0.jar /opt/bitnami/spark/jars \ 15 | && mv delta-storage-2.2.0.jar /opt/bitnami/spark/jars -------------------------------------------------------------------------------- 
/docker-images/spark/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.jars jars/delta-core_2.12-2.2.0.jar,jars/hadoop-aws-3.3.2.jar,jars/delta-storage-2.2.0.jar,jars/aws-java-sdk-1.12.367.jar,jars/s3-2.18.41.jar,jars/aws-java-sdk-bundle-1.11.1026.jar 2 | spark.sql.extensions io.delta.sql.DeltaSparkSessionExtension 3 | spark.sql.catalog.spark_catalog org.apache.spark.sql.delta.catalog.DeltaCatalog 4 | spark.hadoop.fs.s3a.endpoint http://minio:9000 5 | spark.hadoop.fs.s3a.access.key minio 6 | spark.hadoop.fs.s3a.secret.key minio123 7 | ; spark.hadoop.fs.s3a.awsAccessKeyId minio 8 | ; spark.hadoop.fs.s3a.awsSecretAccessKey minio123 9 | spark.hadoop.fs.s3a.path.style.access true 10 | spark.hadoop.fs.s3a.connection.ssl.enabled false 11 | spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem 12 | spark.driver.memory 4g 13 | spark.executor.memory 4g -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dagster import Definitions, load_assets_from_modules 4 | from dagster_dbt import DbtCliResource 5 | 6 | from .assets import bronze, gold, silver, warehouse, dbt 7 | from .resources import mysql, minio, postgres, youtube, spark 8 | 9 | 10 | all_assets = load_assets_from_modules( 11 | [bronze, silver, gold, warehouse, dbt]) 12 | 13 | defs = Definitions( 14 | assets=all_assets, 15 | resources={ 16 | "mysql_io_manager": mysql, 17 | "minio_io_manager": minio, 18 | "psql_io_manager": postgres, 19 | "youtube_io_manager": youtube, 20 | "spark_io_manager": spark, 21 | "dbt": DbtCliResource( 22 | project_dir=os.fspath(dbt.DBT_PROJECT_DIR), 23 | profiles_dir=os.fspath(dbt.DBT_PROFILE_DIR) 24 | ), 25 | }, 26 | ) 27 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/models/youtube_trending/search_information.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | {{ config(materialized="table") }} 4 | 5 | select distinct 6 | i.video_id 7 | , i.title 8 | , i.channeltitle 9 | , v.categoryname 10 | , m.view 11 | , m.like as likes 12 | , m.dislike 13 | , m.publishedat 14 | , l.link_video 15 | , i.tags 16 | , i.thumbnail_link 17 | 18 | from {{ source('gold', 'informationvideos') }} i 19 | inner join {{ source('gold', 'linkvideos') }} l on i.video_id = l.video_id 20 | inner join {{ source('gold', 'videocategory') }} v on i.categoryid = v.categoryid 21 | inner join ( 22 | SELECT 23 | video_id 24 | , MAX(view_count) AS view 25 | , MAX(likes) as like 26 | , MAX(dislikes) as dislike 27 | , MAX(publishedat) as publishedat 28 | FROM {{ source('gold', 'metricvideos') }} 29 | GROUP BY video_id 30 | ) AS m on i.video_id = m.video_id 31 | 32 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/assets/dbt.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | from dagster import AssetExecutionContext 5 | from dagster_dbt import DbtCliResource, dbt_assets 6 | from dagster_dbt import DagsterDbtTranslator 7 | 8 | from typing import Mapping, Optional, Any 9 | 10 | 11 | 12 | DBT_PROJECT_DIR = Path(__file__).joinpath("..", "..", "..", "dbt_tranform").resolve() 13 | DBT_PROFILE_DIR = Path(__file__).joinpath("..", "..", "..", "dbt_tranform").resolve() 14 | 
DBT_MANIFEST_PATH = DBT_PROJECT_DIR.joinpath("target", "manifest.json") 15 | 16 | class CustomDagsterDbtTranslator(DagsterDbtTranslator): 17 | def get_group_name( 18 | self, dbt_resource_props: Mapping[str, Any] 19 | ) -> Optional[str]: 20 | return "warehouse" 21 | 22 | 23 | @dbt_assets( 24 | manifest=DBT_MANIFEST_PATH, 25 | dagster_dbt_translator=CustomDagsterDbtTranslator() 26 | ) 27 | def Brazilian_ECommerce_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource): 28 | yield from dbt.cli(["build"], context=context).stream() 29 | -------------------------------------------------------------------------------- /app/streamlit_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import psycopg2 3 | import polars as pl 4 | import pandas as pd 5 | from PIL import Image 6 | from io import BytesIO 7 | import requests 8 | 9 | icon = Image.open("./icons/youtube_v2.png", mode="r") 10 | 11 | st.set_page_config( 12 | page_title="YouTube RecoMaster", 13 | page_icon=icon, 14 | layout="centered", 15 | initial_sidebar_state="expanded" 16 | ) 17 | 18 | title, logo = st.columns([4,2.91]) 19 | with title: 20 | st.title("YouTube RecoMaster") 21 | with logo: 22 | st.write("") 23 | st.image(icon, width=70) 24 | 25 | 26 | st.markdown( 27 | f'''''', 30 | unsafe_allow_html=True 31 | ) -------------------------------------------------------------------------------- /dagster_home/dagster.yaml: -------------------------------------------------------------------------------- 1 | run_coordinator: 2 | module: dagster.core.run_coordinator 3 | class: QueuedRunCoordinator 4 | config: 5 | max_concurrent_runs: 3 6 | 7 | scheduler: 8 | module: dagster.core.scheduler 9 | class: DagsterDaemonScheduler 10 | config: 11 | max_catchup_runs: 5 12 | 13 | storage: 14 | postgres: 15 | postgres_db: 16 | username: 17 | env: DAGSTER_PG_USERNAME 18 | password: 19 | env: DAGSTER_PG_PASSWORD 20 | hostname: 21 | env: DAGSTER_PG_HOSTNAME 22 | db_name: 23 | env: DAGSTER_PG_DB 24 | port: 5432 25 | 26 | run_launcher: 27 | module: dagster.core.launcher 28 | class: DefaultRunLauncher 29 | 30 | compute_logs: 31 | module: dagster.core.storage.local_compute_log_manager 32 | class: LocalComputeLogManager 33 | config: 34 | base_dir: /opt/dagster/dagster_home/compute_logs 35 | 36 | local_artifact_storage: 37 | module: dagster.core.storage.root 38 | class: LocalArtifactStorage 39 | config: 40 | base_dir: /opt/dagster/dagster_home/local_artifact_storage -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Long Nguyen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/resources/mysql_io_manager.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from contextlib import contextmanager 3 | from sqlalchemy import create_engine 4 | 5 | from dagster import IOManager, OutputContext, InputContext 6 | 7 | 8 | @contextmanager 9 | def connect_mysql(config: dict): 10 | conn_info = ( 11 | f"mysql+pymysql://{config['user']}:{config['password']}" 12 | + f"@{config['host']}:{config['port']}" 13 | + f"/{config['database']}" 14 | ) 15 | db_conn = create_engine(conn_info) 16 | try: 17 | yield db_conn 18 | except Exception: 19 | raise 20 | 21 | 22 | class MySQLIOManager(IOManager): 23 | 24 | def __init__(self, config): 25 | self._config = config 26 | 27 | def handle_output(self, context: OutputContext, obj: pl.DataFrame): 28 | pass 29 | 30 | def load_input(self, context: InputContext) -> pl.DataFrame: 31 | pass 32 | 33 | def extract_data(self, sql: str) -> pl.DataFrame: 34 | with connect_mysql(self._config) as db_conn: 35 | pd_data = pl.read_database(query=sql, connection=db_conn) 36 | return pd_data -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | include .env 2 | 3 | 4 | build: 5 | docker-compose build 6 | 7 | up: 8 | docker-compose --env-file .env up -d 9 | 10 | down: 11 | docker-compose --env-file .env down 12 | 13 | restart: 14 | make down && make up 15 | 16 | to_psql: 17 | docker exec -ti de_psql psql postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB} 18 | 19 | to_mysql: 20 | docker exec -it de_mysql mysql --local-infile=1 -u"${MYSQL_USER}" -p"${MYSQL_PASSWORD}" ${MYSQL_DATABASE} 21 | 22 | to_mysql_root: 23 | docker exec -it de_mysql mysql -u"root" -p"${MYSQL_ROOT_PASSWORD}" ${MYSQL_DATABASE} 24 | 25 | mysql_create: 26 | docker exec -it de_mysql mysql --local_infile -u"${MYSQL_USER}" -p"${MYSQL_PASSWORD}" ${MYSQL_DATABASE} -e"source /tmp/load_dataset/mysql_schemas.sql" 27 | 28 | mysql_load: 29 | docker exec -it de_mysql mysql --local_infile=1 -u"${MYSQL_USER}" -p"${MYSQL_PASSWORD}" ${MYSQL_DATABASE} -e"source /tmp/load_dataset/mysql_load.sql" 30 | 31 | psql_create: 32 | docker exec -it de_psql psql postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB} -f /tmp/load_dataset/psql_schemas.sql -a -------------------------------------------------------------------------------- /load_dataset/mysql_load.sql: -------------------------------------------------------------------------------- 1 | LOAD DATA LOCAL INFILE '/tmp/youTube_trending_video/DE_youtube_trending_data.csv' 2 | INTO TABLE DE_youtube_trending_data 3 | FIELDS TERMINATED BY ',' 4 | ENCLOSED BY '"' 5 | LINES TERMINATED BY '\n' 6 | IGNORE 1 ROWS; 7 | 8 | LOAD DATA LOCAL INFILE '/tmp/youTube_trending_video/JP_youtube_trending_data.csv' 9 | INTO 
TABLE JP_youtube_trending_data 10 | FIELDS TERMINATED BY ',' 11 | ENCLOSED BY '"' 12 | LINES TERMINATED BY '\n' 13 | IGNORE 1 ROWS; 14 | 15 | LOAD DATA LOCAL INFILE '/tmp/youTube_trending_video/RU_youtube_trending_data.csv' 16 | INTO TABLE RU_youtube_trending_data 17 | FIELDS TERMINATED BY ',' 18 | ENCLOSED BY '"' 19 | LINES TERMINATED BY '\n' 20 | IGNORE 1 ROWS; 21 | 22 | LOAD DATA LOCAL INFILE '/tmp/youTube_trending_video/CA_youtube_trending_data.csv' 23 | INTO TABLE CA_youtube_trending_data 24 | FIELDS TERMINATED BY ',' 25 | ENCLOSED BY '"' 26 | LINES TERMINATED BY '\n' 27 | IGNORE 1 ROWS; 28 | 29 | LOAD DATA LOCAL INFILE '/tmp/youTube_trending_video/IN_youtube_trending_data.csv' 30 | INTO TABLE IN_youtube_trending_data 31 | FIELDS TERMINATED BY ',' 32 | ENCLOSED BY '"' 33 | LINES TERMINATED BY '\n' 34 | IGNORE 1 ROWS; -------------------------------------------------------------------------------- /load_dataset/psql_schemas.sql: -------------------------------------------------------------------------------- 1 | DROP SCHEMA IF EXISTS gold CASCADE; 2 | CREATE SCHEMA gold; 3 | 4 | DROP TABLE IF EXISTS gold.videoCategory; 5 | CREATE TABLE gold.videoCategory ( 6 | categoryId VARCHAR(5), 7 | categoryName VARCHAR(50) 8 | ); 9 | 10 | DROP TABLE IF EXISTS gold.linkVideos; 11 | CREATE TABLE gold.linkVideos ( 12 | video_id VARCHAR(20), 13 | link_video VARCHAR(50) 14 | ); 15 | 16 | DROP TABLE IF EXISTS gold.metricVideos; 17 | CREATE TABLE gold.metricVideos ( 18 | video_id VARCHAR(20), 19 | -- country_code, 20 | publishedAt TIMESTAMP, 21 | trending_date TIMESTAMP, 22 | channelId VARCHAR(27), 23 | categoryId VARCHAR(5), 24 | view_count INTEGER, 25 | likes INTEGER, 26 | dislikes INTEGER, 27 | comment_count INTEGER 28 | ); 29 | 30 | DROP TABLE IF EXISTS gold.informationVideos; 31 | CREATE TABLE gold.informationVideos ( 32 | video_id VARCHAR(20), 33 | -- country_code, 34 | title TEXT, 35 | channelId VARCHAR(27), 36 | channelTitle TEXT, 37 | categoryId VARCHAR(5), 38 | tags TEXT, 39 | thumbnail_link TEXT, 40 | comments_disabled VARCHAR(5), 41 | ratings_disabled VARCHAR(5) 42 | ); -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'dbt_tranform' 6 | version: '1.0.0' 7 | 8 | # This setting configures which "profile" dbt uses for this project. 9 | profile: 'dbt_tranform' 10 | 11 | # These configurations specify where dbt should look for different types of files. 12 | # The `model-paths` config, for example, states that models in this project can be 13 | # found in the "models/" directory. You probably won't need to change these! 14 | model-paths: ["models"] 15 | analysis-paths: ["analyses"] 16 | test-paths: ["tests"] 17 | seed-paths: ["seeds"] 18 | macro-paths: ["macros"] 19 | snapshot-paths: ["snapshots"] 20 | 21 | clean-targets: # directories to be removed by `dbt clean` 22 | - "target" 23 | - "dbt_packages" 24 | 25 | 26 | # Configuring models 27 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 28 | 29 | # In this example config, we tell dbt to build all models in the example/ 30 | # directory as views. These settings can be overridden in the individual model 31 | # files using the `{{ config(...) }}` macro. 
32 | models: 33 | dbt_tranform: 34 | # Config indicated by + and applies to all files under models/example/ 35 | youtube_trending: 36 | +materialized: table 37 | +schema: youtube_trending 38 | -------------------------------------------------------------------------------- /etl_pipeline/dbt_tranform/models/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: search_videocategory 5 | description: "" 6 | columns: 7 | - name: categoryid 8 | description: "The primary key for this table" 9 | tests: 10 | - unique 11 | - not_null 12 | - name: categoryname 13 | description: "" 14 | tests: 15 | - unique 16 | - not_null 17 | - accepted_values: 18 | values: ['Film & Animation', 'Autos & Vehicles', 'Music', 'Pets & Animals', 'Sports', 'Travel & Events', 'Gaming', 'People & Blogs', 'Comedy', 'Entertainment', 'News & Politics', 'Howto & Style', 'Education', 'Science & Technology', 'Nonprofits & Activism'] 19 | 20 | - name: search_linkvideo 21 | description: "" 22 | columns: 23 | - name: video_id 24 | description: "The primary key for this table" 25 | tests: 26 | - unique 27 | - not_null 28 | - name: link_video 29 | description: "" 30 | tests: 31 | - unique 32 | - not_null 33 | post-hook: 34 | - "CREATE INDEX IF NOT EXISTS idx_video_id ON {{ this }} (video_id)" 35 | 36 | 37 | - name: search_information 38 | description: "" 39 | columns: 40 | - name: video_id 41 | description: "The primary key for this table" 42 | tests: 43 | - not_null 44 | - name: categoryname 45 | description: "" 46 | tests: 47 | - not_null 48 | - accepted_values: 49 | values: ['Film & Animation', 'Autos & Vehicles', 'Music', 'Pets & Animals', 'Sports', 'Travel & Events', 'Gaming', 'People & Blogs', 'Comedy', 'Entertainment', 'News & Politics', 'Howto & Style', 'Education', 'Science & Technology', 'Nonprofits & Activism'] 50 | 51 | post-hook: 52 | - "CREATE INDEX IF NOT EXISTS idx_video_id ON {{ this }} (video_id, categoryname, tags)" -------------------------------------------------------------------------------- /etl_pipeline/README.md: -------------------------------------------------------------------------------- 1 | # etl_pipeline 2 | 3 | This is a [Dagster](https://dagster.io/) project scaffolded with [`dagster project scaffold`](https://docs.dagster.io/getting-started/create-new-project). 4 | 5 | ## Getting started 6 | 7 | First, install your Dagster code location as a Python package. By using the --editable flag, pip will install your Python package in ["editable mode"](https://pip.pypa.io/en/latest/topics/local-project-installs/#editable-installs) so that as you develop, local code changes will automatically apply. 8 | 9 | ```bash 10 | pip install -e ".[dev]" 11 | ``` 12 | 13 | Then, start the Dagster UI web server: 14 | 15 | ```bash 16 | dagster dev 17 | ``` 18 | 19 | Open http://localhost:3000 with your browser to see the project. 20 | 21 | You can start writing assets in `etl_pipeline/assets.py`. The assets are automatically loaded into the Dagster code location as you define them. 22 | 23 | ## Development 24 | 25 | ### Adding new Python dependencies 26 | 27 | You can specify new Python dependencies in `setup.py`. 
28 | 29 | ### Unit testing 30 | 31 | Tests are in the `etl_pipeline_tests` directory and you can run tests using `pytest`: 32 | 33 | ```bash 34 | pytest etl_pipeline_tests 35 | ``` 36 | 37 | ### Schedules and sensors 38 | 39 | If you want to enable Dagster [Schedules](https://docs.dagster.io/concepts/partitions-schedules-sensors/schedules) or [Sensors](https://docs.dagster.io/concepts/partitions-schedules-sensors/sensors) for your jobs, the [Dagster Daemon](https://docs.dagster.io/deployment/dagster-daemon) process must be running. This is done automatically when you run `dagster dev`. 40 | 41 | Once your Dagster Daemon is running, you can start turning on schedules and sensors for your jobs. 42 | 43 | ## Deploy on Dagster Cloud 44 | 45 | The easiest way to deploy your Dagster project is to use Dagster Cloud. 46 | 47 | Check out the [Dagster Cloud Documentation](https://docs.dagster.cloud) to learn more. 48 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/resources/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .mysql_io_manager import MySQLIOManager 3 | from .minio_io_manager import MinIOIOManager 4 | from .psql_io_manager import PostgreSQLIOManager 5 | from .youtube_io_manager import YoutubeIOManager 6 | from .spark_io_manager import SparkIOManager 7 | 8 | 9 | mysql = MySQLIOManager( 10 | { 11 | "host": os.getenv("MYSQL_HOST"), 12 | "port": 3306, 13 | "database": os.getenv("MYSQL_DATABASE"), 14 | "user": os.getenv("MYSQL_USER"), 15 | "password": os.getenv("MYSQL_PASSWORD"), 16 | } 17 | ) 18 | 19 | minio = MinIOIOManager( 20 | { 21 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 22 | "bucket": os.getenv("DATALAKE_BUCKET"), 23 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 24 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 25 | } 26 | ) 27 | 28 | postgres = PostgreSQLIOManager( 29 | { 30 | "host": os.getenv("POSTGRES_HOST"), 31 | "port": os.getenv("POSTGRES_PORT"), 32 | "database": os.getenv("POSTGRES_DB"), 33 | "user": os.getenv("POSTGRES_USER"), 34 | "password": os.getenv("POSTGRES_PASSWORD"), 35 | } 36 | ) 37 | 38 | youtube = YoutubeIOManager( 39 | { 40 | "api_service_name": os.getenv("API_SERVICE_NAME"), 41 | "api_version": os.getenv("API_VERSION"), 42 | "api_key": os.getenv("API_KEY"), 43 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 44 | "bucket": os.getenv("DATALAKE_BUCKET"), 45 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 46 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 47 | } 48 | ) 49 | 50 | spark = SparkIOManager( 51 | { 52 | "spark_master_url": os.getenv("SPARK_MASTER_URL"), 53 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 54 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 55 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 56 | "bucket": os.getenv("DATALAKE_BUCKET"), 57 | } 58 | ) -------------------------------------------------------------------------------- /etl_pipeline/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | ARG openjdk_version="17" 4 | 5 | # USER root 6 | 7 | RUN apt-get update --yes && \ 8 | apt-get install --yes \ 9 | curl \ 10 | "openjdk-${openjdk_version}-jre-headless" ca-certificates-java procps && \ 11 | apt-get clean && rm -rf /var/lib/apt/lists/* 12 | 13 | 14 | RUN curl -O -L https://dlcdn.apache.org/spark/spark-3.4.3/spark-3.4.3-bin-hadoop3.tgz \ 15 | && tar -zxvf 
spark-3.4.3-bin-hadoop3.tgz \ 16 | && rm -rf spark-3.4.3-bin-hadoop3.tgz \ 17 | && mv spark-3.4.3-bin-hadoop3/ /usr/local/ \ 18 | && rm -rf /usr/local/spark \ 19 | && rm -rf /usr/local/spark-3.3.0-bin-hadoop3 \ 20 | && ln -s /usr/local/spark-3.4.3-bin-hadoop3 /usr/local/spark 21 | 22 | 23 | RUN curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \ 24 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \ 25 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar \ 26 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \ 27 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \ 28 | && curl -O https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.2/hadoop-aws-3.3.2.jar \ 29 | # && mkdir -p /usr/local/spark/jars \ 30 | && mv s3-2.18.41.jar /usr/local/spark/jars \ 31 | && mv aws-java-sdk-1.12.367.jar /usr/local/spark/jars \ 32 | && mv aws-java-sdk-bundle-1.11.1026.jar /usr/local/spark/jars \ 33 | && mv delta-core_2.12-2.2.0.jar /usr/local/spark/jars \ 34 | && mv delta-storage-2.2.0.jar /usr/local/spark/jars \ 35 | && mv hadoop-aws-3.3.2.jar /usr/local/spark/jars 36 | 37 | 38 | WORKDIR /opt/dagster/app 39 | COPY requirements.txt /opt/dagster/app 40 | RUN pip install --upgrade pip && pip install -r requirements.txt 41 | COPY . /opt/dagster/app 42 | 43 | CMD ["dagster", "api", "grpc", "-h", "0.0.0.0", "-p", "4000", "-m", "etl_pipeline"] -------------------------------------------------------------------------------- /load_dataset/mysql_schemas.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS youtube_trending_data; 2 | 3 | 4 | DROP TABLE IF EXISTS CA_youtube_trending_data; 5 | CREATE TABLE CA_youtube_trending_data ( 6 | video_id VARCHAR(20), 7 | title TEXT, 8 | publishedAt VARCHAR(27), 9 | channelId VARCHAR(27), 10 | channelTitle TEXT, 11 | categoryId VARCHAR(5), 12 | trending_date VARCHAR(27), 13 | tags TEXT, 14 | view_count TEXT, 15 | likes TEXT, 16 | dislikes TEXT, 17 | comment_count TEXT, 18 | thumbnail_link TEXT, 19 | comments_disabled VARCHAR(6), 20 | ratings_disabled VARCHAR(6) 21 | ); 22 | 23 | DROP TABLE IF EXISTS DE_youtube_trending_data; 24 | CREATE TABLE DE_youtube_trending_data ( 25 | video_id VARCHAR(20), 26 | title TEXT, 27 | publishedAt VARCHAR(27), 28 | channelId VARCHAR(27), 29 | channelTitle TEXT, 30 | categoryId VARCHAR(5), 31 | trending_date VARCHAR(27), 32 | tags TEXT, 33 | view_count TEXT, 34 | likes TEXT, 35 | dislikes TEXT, 36 | comment_count TEXT, 37 | thumbnail_link TEXT, 38 | comments_disabled VARCHAR(6), 39 | ratings_disabled VARCHAR(6) 40 | ); 41 | 42 | DROP TABLE IF EXISTS IN_youtube_trending_data; 43 | CREATE TABLE IN_youtube_trending_data ( 44 | video_id VARCHAR(20), 45 | title TEXT, 46 | publishedAt VARCHAR(27), 47 | channelId VARCHAR(27), 48 | channelTitle TEXT, 49 | categoryId VARCHAR(5), 50 | trending_date VARCHAR(27), 51 | tags TEXT, 52 | view_count TEXT, 53 | likes TEXT, 54 | dislikes TEXT, 55 | comment_count TEXT, 56 | thumbnail_link TEXT, 57 | comments_disabled VARCHAR(6), 58 | ratings_disabled VARCHAR(6) 59 | ); 60 | 61 | DROP TABLE IF EXISTS JP_youtube_trending_data; 62 | CREATE TABLE JP_youtube_trending_data ( 63 | video_id VARCHAR(20), 64 | title TEXT, 65 | publishedAt VARCHAR(27), 66 | channelId VARCHAR(27), 67 | channelTitle TEXT, 68 | 
categoryId VARCHAR(5), 69 | trending_date VARCHAR(27), 70 | tags TEXT, 71 | view_count TEXT, 72 | likes TEXT, 73 | dislikes TEXT, 74 | comment_count TEXT, 75 | thumbnail_link TEXT, 76 | comments_disabled VARCHAR(6), 77 | ratings_disabled VARCHAR(6) 78 | ); 79 | 80 | DROP TABLE IF EXISTS RU_youtube_trending_data; 81 | CREATE TABLE RU_youtube_trending_data ( 82 | video_id VARCHAR(20), 83 | title TEXT, 84 | publishedAt VARCHAR(27), 85 | channelId VARCHAR(27), 86 | channelTitle TEXT, 87 | categoryId VARCHAR(5), 88 | trending_date VARCHAR(27), 89 | tags TEXT, 90 | view_count TEXT, 91 | likes TEXT, 92 | dislikes TEXT, 93 | comment_count TEXT, 94 | thumbnail_link TEXT, 95 | comments_disabled VARCHAR(6), 96 | ratings_disabled VARCHAR(6) 97 | ); -------------------------------------------------------------------------------- /app/pages/search_video.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import psycopg2 3 | import polars as pl 4 | import pandas as pd 5 | from PIL import Image 6 | from io import BytesIO 7 | import requests 8 | 9 | 10 | icon = Image.open("./icons/youtube_v2.png", mode="r") 11 | 12 | st.set_page_config( 13 | page_title="YouTube RecoMaster", 14 | page_icon=icon, 15 | layout="centered", 16 | initial_sidebar_state="expanded" 17 | ) 18 | 19 | @st.cache_resource 20 | def init_connection(): 21 | return psycopg2.connect(**st.secrets["postgres"]) 22 | 23 | conn = init_connection() 24 | 25 | @st.cache_data(ttl=600) 26 | def run_query(query): 27 | with conn.cursor() as cur: 28 | cur.execute(query) 29 | return cur.fetchall() 30 | 31 | 32 | title, logo = st.columns([4,2.91]) 33 | with title: 34 | st.title("YouTube RecoMaster") 35 | with logo: 36 | st.write("") 37 | st.image(icon, width=70) 38 | 39 | st.slider("Size") 40 | video_name = st.text_input("Enter a video name") 41 | st.write(f"You entered: {video_name}") 42 | 43 | 44 | data = run_query( 45 | f""" 46 | SELECT DISTINCT 47 | video_id, 48 | title, 49 | channeltitle, 50 | thumbnail_link, 51 | link_video, 52 | categoryname, 53 | view 54 | FROM youtube_trending.search_information 55 | WHERE title LIKE '%{video_name}%' 56 | LIMIT 10; 57 | """ 58 | ) 59 | 60 | videos = { 61 | "video_id": [e[0] for e in data], 62 | "title": [e[1] for e in data], 63 | "channeltitle": [e[2] for e in data], 64 | "thumbnail_link": [e[3] for e in data], 65 | "link_video": [e[4] for e in data], 66 | "categoryname": [e[5] for e in data], 67 | "view_count": [e[6] for e in data] 68 | } 69 | video_url = "https://www.youtube.com/embed/J78aPJ3VyNs" 70 | 71 | recommended_videos = [] 72 | recommended_videos += videos['link_video'] 73 | 74 | st.subheader(f"Have {len(videos['video_id'])} results for keyword: {video_name}") 75 | for video_id,title,channeltitle,thumbnail_link,link_video,categoryname,view_count in zip( 76 | videos['video_id'],videos['title'],videos['channeltitle'], 77 | videos['thumbnail_link'],videos['link_video'],videos['categoryname'],videos['view_count']): 78 | 79 | col1, col2 = st.columns([1, 1]) 80 | 81 | with col1: 82 | img = Image.open(BytesIO(requests.get(thumbnail_link).content)) 83 | st.markdown( 84 | f'', 85 | unsafe_allow_html=True, 86 | ) 87 | st.image(img, use_column_width=True) 88 | 89 | with col2: 90 | st.write("") 91 | st.markdown(f""" 92 |
93 | {title}
94 | channel: {channeltitle}
95 | category: {categoryname}
96 | views: {view_count} 97 |
98 | """, unsafe_allow_html=True) 99 | st.write("") 100 | is_clicked = st.button("Watch", key=video_id) 101 | 102 | if is_clicked: 103 | st.experimental_set_query_params(video_id=video_id) 104 | # st.experimental_rerun() 105 | st.switch_page("./pages/video_detail.py") 106 | 107 | 108 | st.write("---") 109 | 110 | 111 | # df = pl.DataFrame(videos) 112 | # st.table(df) -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/resources/minio_io_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | from contextlib import contextmanager 3 | from datetime import datetime 4 | from typing import Union 5 | 6 | import polars as pl 7 | import pyarrow as pa 8 | import pyarrow.parquet as pq 9 | from dagster import IOManager, OutputContext, InputContext 10 | from minio import Minio 11 | 12 | 13 | @contextmanager 14 | def connect_minio(config: dict): 15 | client = Minio( 16 | endpoint=config.get("endpoint_url"), 17 | access_key=config.get("aws_access_key_id"), 18 | secret_key=config.get("aws_secret_access_key"), 19 | secure=False, 20 | ) 21 | try: 22 | yield client 23 | except Exception as e: 24 | raise e 25 | 26 | 27 | class MinIOIOManager(IOManager): 28 | 29 | def __init__(self, config): 30 | self._config = config 31 | 32 | def _get_path(self, context: Union[InputContext, OutputContext]): 33 | layer, schema, table = context.asset_key.path 34 | key = "/".join([layer, schema, table.replace(f"{layer}_", "")]) 35 | tmp_file_path = "/tmp/file-{}-{}.parquet".format( 36 | datetime.today().strftime("%Y%m%d%H%M%S"), 37 | "-".join(context.asset_key.path) 38 | ) 39 | 40 | if context.has_asset_partitions: 41 | start, end = context.asset_partitions_time_window 42 | # partition_str = context.asset_partition_key 43 | partition_str = start.strftime("%Y%m") 44 | context.log.info(f"INFO: {os.path.join(key, partition_str)}.pq, {tmp_file_path}") 45 | return os.path.join(key, f"{partition_str}.pq"), tmp_file_path 46 | else: 47 | context.log.info(f"INFO: {key}.pq, {tmp_file_path}") 48 | return f"{key}.pq", tmp_file_path 49 | 50 | def handle_output(self, context: OutputContext, obj: pl.DataFrame): 51 | # convert to parquet format 52 | key_name, tmp_file_path = self._get_path(context) 53 | obj.write_parquet(tmp_file_path) 54 | 55 | # upload to MinIO 56 | try: 57 | bucket_name = self._config.get("bucket") 58 | with connect_minio(self._config) as client: 59 | # Make bucket if not exist. 60 | found = client.bucket_exists(bucket_name) 61 | if not found: 62 | client.make_bucket(bucket_name) 63 | else: 64 | print(f"Bucket {bucket_name} already exists") 65 | client.fput_object(bucket_name, key_name, tmp_file_path) 66 | row_count = len(obj) 67 | context.add_output_metadata( 68 | { 69 | "path": key_name, 70 | "records": row_count, 71 | "tmp": tmp_file_path 72 | } 73 | ) 74 | # clean up tmp file 75 | os.remove(tmp_file_path) 76 | 77 | except Exception as e: 78 | raise e 79 | 80 | def load_input(self, context: InputContext) -> pl.DataFrame: 81 | bucket_name = self._config.get("bucket") 82 | key_name, tmp_file_path = self._get_path(context) 83 | 84 | try: 85 | with connect_minio(self._config) as client: 86 | # Make bucket if not exist. 
87 | found = client.bucket_exists(bucket_name) 88 | if not found: 89 | client.make_bucket(bucket_name) 90 | else: 91 | print(f"Bucket {bucket_name} already exists") 92 | 93 | client.fget_object(bucket_name, key_name, tmp_file_path) 94 | pd_data = pl.read_parquet(tmp_file_path) 95 | return pd_data 96 | 97 | except Exception as e: 98 | raise e -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/resources/psql_io_manager.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from datetime import datetime 3 | from psycopg2 import sql 4 | import psycopg2.extras 5 | import psycopg2 6 | 7 | import polars as pl 8 | from dagster import IOManager, OutputContext, InputContext 9 | from sqlalchemy import create_engine 10 | 11 | 12 | @contextmanager 13 | def connect_psql(config: dict): 14 | try: 15 | yield psycopg2.connect( 16 | host=config["host"], 17 | port=config["port"], 18 | database=config["database"], 19 | user=config["user"], 20 | password=config["password"], 21 | ) 22 | 23 | except Exception as e: 24 | raise e 25 | 26 | 27 | class PostgreSQLIOManager(IOManager): 28 | 29 | def __init__(self, config): 30 | self._config = config 31 | 32 | def load_input(self, context: InputContext) -> pl.DataFrame: 33 | pass 34 | 35 | def handle_output(self, context: OutputContext, obj: pl.DataFrame): 36 | schema, table = context.asset_key.path[-2], context.asset_key.path[-1] 37 | tmp_tbl = f"{table}_tmp_{datetime.now().strftime('%Y_%m_%d')}" 38 | 39 | with connect_psql(self._config) as db_conn: 40 | primary_keys = (context.metadata or {}).get("primary_keys", []) 41 | ls_columns = (context.metadata or {}).get("columns", []) 42 | 43 | with db_conn.cursor() as cursor: 44 | # create temp table 45 | cursor.execute( 46 | f'CREATE TEMP TABLE IF NOT EXISTS "{tmp_tbl}" (LIKE {schema}.{table})' 47 | ) 48 | cursor.execute(f'SELECT COUNT(*) FROM "{tmp_tbl}"') 49 | context.log.debug( 50 | f"Log for creating temp table: {cursor.fetchall()}" 51 | ) 52 | # cursor.execute( 53 | # sql.SQL("CREATE TEMP TABLE IF NOT EXISTS {} (LIKE {}.{});").format( 54 | # sql.Identifier(tmp_tbl), 55 | # sql.Identifier(schema), 56 | # sql.Identifier(table), 57 | # ) 58 | # ) 59 | 60 | # insert new data 61 | try: 62 | columns = sql.SQL(",").join( 63 | sql.Identifier(name.lower()) for name in obj.columns 64 | ) 65 | context.log.info(f"Table {table} with columns: {columns}") 66 | values = sql.SQL(",").join(sql.Placeholder() for _ in obj.columns) 67 | 68 | context.log.debug("Inserting data into temp table") 69 | insert_query = sql.SQL('INSERT INTO {} ({}) VALUES({});').format( 70 | sql.Identifier(tmp_tbl), columns, values 71 | ) 72 | psycopg2.extras.execute_batch(cursor, insert_query, obj.rows()) 73 | context.log.info(f"Insert into data for table {table} Success !!!") 74 | 75 | db_conn.commit() 76 | 77 | except Exception as e: 78 | raise e 79 | 80 | with db_conn.cursor() as cursor: 81 | # check data inserted 82 | cursor.execute(f'SELECT COUNT(*) FROM "{tmp_tbl}"') 83 | context.log.info(f"Number of rows inserted: {cursor.fetchone()}") 84 | 85 | # upsert data 86 | if len(primary_keys) > 0: 87 | conditions = " AND ".join( 88 | [ 89 | f""" {schema}.{table}."{k.lower()}" = "{tmp_tbl}"."{k.lower()}" """ 90 | for k in primary_keys 91 | ] 92 | ) 93 | command = f""" 94 | BEGIN TRANSACTION; 95 | DELETE FROM {schema}.{table} 96 | USING "{tmp_tbl}" 97 | WHERE {conditions}; 98 | 99 | INSERT INTO {schema}.{table} 100 | SELECT * FROM 
"{tmp_tbl}"; 101 | 102 | END TRANSACTION; 103 | """ 104 | else: 105 | command = f""" 106 | BEGIN TRANSACTION; 107 | TRUNCATE TABLE {schema}.{table}; 108 | 109 | INSERT INTO {schema}.{table} 110 | SELECT * FROM "{tmp_tbl}"; 111 | 112 | END TRANSACTION; 113 | """ 114 | 115 | cursor.execute(command) 116 | # drop temp table 117 | cursor.execute(f'DROP TABLE IF EXISTS "{tmp_tbl}"') 118 | db_conn.commit() -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | 4 | services: 5 | 6 | # MySQL 7 | de_mysql: 8 | image: mysql:8.0 9 | container_name: de_mysql 10 | volumes: 11 | - ./mysql:/var/lib/mysql 12 | - ./dataset/youTube_trending_video:/tmp/youTube_trending_video 13 | - ./load_dataset:/tmp/load_dataset 14 | ports: 15 | - 3306:3306 16 | env_file: .env 17 | networks: 18 | - de_network 19 | 20 | # MinIO 21 | minio: 22 | hostname: minio 23 | image: minio/minio 24 | container_name: minio 25 | ports: 26 | - 9001:9001 27 | - 9000:9000 28 | command: [ "server", "/data", "--console-address", ":9001" ] 29 | volumes: 30 | - ./minio:/data 31 | env_file: .env 32 | networks: 33 | - de_network 34 | 35 | mc: 36 | image: minio/mc 37 | container_name: mc 38 | hostname: mc 39 | env_file: .env 40 | entrypoint: > 41 | /bin/sh -c " until (/usr/bin/mc config host add minio 42 | http://minio:9000 minio minio123) do echo '...waiting...' && sleep 1; 43 | done; /usr/bin/mc mb minio/lakehouse; /usr/bin/mc policy set public 44 | minio/lakehouse; exit 0; " 45 | depends_on: 46 | - minio 47 | networks: 48 | - de_network 49 | 50 | # Pipeline 51 | etl_pipeline: 52 | build: 53 | context: ./etl_pipeline 54 | dockerfile: Dockerfile 55 | container_name: etl_pipeline 56 | image: etl_pipeline:latest 57 | restart: always 58 | volumes: 59 | - ./etl_pipeline:/opt/dagster/app 60 | - ./docker-images/spark/spark-defaults.conf:/usr/local/spark/conf/spark-defaults.conf 61 | ports: 62 | - 4041:4040 63 | env_file: .env 64 | networks: 65 | - de_network 66 | 67 | # PostgreSQL 68 | de_psql: 69 | image: postgres:15 70 | container_name: de_psql 71 | volumes: 72 | - ./postgresql:/var/lib/postgresql/data 73 | - ./load_dataset:/tmp/load_dataset 74 | ports: 75 | - 5432:5432 76 | env_file: .env 77 | networks: 78 | - de_network 79 | 80 | # Dagster 81 | de_dagster: 82 | build: 83 | context: ./docker-images/dagster/ 84 | container_name: de_dagster 85 | image: de_dagster 86 | 87 | de_dagster_dagit: 88 | image: de_dagster:latest 89 | entrypoint: 90 | - dagit 91 | - -h 92 | - "0.0.0.0" 93 | - -p 94 | - "3001" 95 | - -w 96 | - workspace.yaml 97 | container_name: de_dagster_dagit 98 | expose: 99 | - "3001" 100 | ports: 101 | - 3001:3001 102 | volumes: 103 | - /var/run/docker.sock:/var/run/docker.sock 104 | - ./dagster_home:/opt/dagster/dagster_home 105 | env_file: .env 106 | networks: 107 | - de_network 108 | 109 | de_dagster_daemon: 110 | image: de_dagster:latest 111 | entrypoint: 112 | - dagster-daemon 113 | - run 114 | container_name: de_dagster_daemon 115 | volumes: 116 | - /var/run/docker.sock:/var/run/docker.sock 117 | - ./dagster_home:/opt/dagster/dagster_home 118 | env_file: .env 119 | networks: 120 | - de_network 121 | 122 | # Streamlit 123 | de_streamlit: 124 | build: 125 | context: ./docker-images/streamlit 126 | dockerfile: Dockerfile 127 | image: de_streamlit:latest 128 | container_name: de_streamlit 129 | volumes: 130 | - ./app:/app 131 | env_file: .env 132 | ports: 133 | - "8501:8501" 134 | 
networks: 135 | - de_network 136 | 137 | # Metabase 138 | de_metabase: 139 | image: metabase/metabase:latest 140 | container_name: de_metabase 141 | volumes: 142 | - ./storage/metabase_data:/metabase_data 143 | ports: 144 | - "3030:3000" 145 | env_file: .env 146 | networks: 147 | - de_network 148 | 149 | # Jupyter 150 | # de_notebook: 151 | # image: jupyter/all-spark-notebook:python-3.9 152 | # container_name: de_notebook 153 | # command: [ "start-notebook.sh", "--NotebookApp.token=" ] 154 | # ports: 155 | # - 8888:8888 156 | # volumes: 157 | # - ./notebooks/work:/home/jovyan/work 158 | # env_file: .env 159 | # networks: 160 | # - de_network 161 | 162 | # # Spark 163 | # spark-master: 164 | # build: 165 | # context: ./docker-images/spark 166 | # dockerfile: Dockerfile 167 | # image: spark-master:latest 168 | # container_name: spark-master 169 | # hostname: spark_master 170 | # volumes: 171 | # - ./docker-images/spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf 172 | # - ./data:/opt/spark-data 173 | # env_file: .env.spark_master 174 | # expose: 175 | # - "7077" 176 | # ports: 177 | # - "7077:7077" 178 | # - "8080:8080" 179 | # networks: 180 | # - de_network 181 | 182 | # spark-worker: 183 | # image: docker.io/bitnami/spark:3.4.3 184 | # depends_on: 185 | # - spark-master 186 | # deploy: 187 | # replicas: 3 188 | # env_file: .env.spark_worker 189 | # volumes: 190 | # - ./docker-images/spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf 191 | # - ./data:/opt/spark-data 192 | # networks: 193 | # - de_network 194 | 195 | networks: 196 | de_network: 197 | driver: bridge 198 | name: de_network -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🌄 Youtube-ETL-Pipeline 2 | In this project, I build a simple data pipeline following the ETL (extract - transform - load) model on the Youtube-Trending-Video dataset, and use the Apache Spark big data technology for data processing, transformation and calculation to serve a video search and recommendation system 3 | 4 | ## 🔦 About Project 5 | 6 | 7 | - **Data Source**: This project uses two main `data sources`: [Youtube Trending Video](https://www.kaggle.com/datasets/rsrishav/youtube-trending-video-dataset) data and the [Youtube API](https://developers.google.com/youtube/v3) 8 | - `Youtube Trending Video` data is downloaded from [Kaggle.com](https://www.kaggle.com) as `.csv` files and then loaded into `MySQL`, which is treated as a `data source` 9 | - Using the `Video ID` and `Category ID` from the `Youtube Trending Video` data, we collect additional fields from the `Youtube API`, such as `Video Link` and `Video Category` 10 | - **Extract Data**: Extract the above `data sources` into `Polars` `DataFrame`s to form the `raw` layer, then load the data into the `MinIO` `datalake` (a short sketch of this step follows this section) 11 | - **Transform Data**: From `MinIO`, we use `Apache Spark`, specifically `PySpark` 12 | - convert from `Polars` `DataFrame` to `PySpark` `DataFrame` for processing and calculation, producing the `silver` and `gold` layers 13 | - Data stored in `MinIO` is in `.parquet` format, which gives better processing performance 14 | - **Load Data**: Load the `gold` layer into the `PostgreSQL` data warehouse, then perform an additional transform with `dbt` to create an `index`, making video search faster 15 | - **Serving**: The data is visualized with `Metabase` and powers a video recommendation application built with `Streamlit` 16 | - **Package and orchestration**: Use `Docker` to containerize and package the project and `Dagster` to orchestrate `assets` across the different tasks
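Below is a minimal, self-contained sketch of the raw/bronze step described above: read a table into a `Polars` `DataFrame`, write it as `.parquet`, and push it to `MinIO`. It is only an illustration of the idea; the env-var defaults, bucket and object key are assumptions for the example, not the project's exact configuration (the real logic lives in `etl_pipeline/resources/minio_io_manager.py` and the `bronze` assets).

~~~python
import os

import polars as pl
from minio import Minio

# Illustrative sketch only: upload one extracted table into the MinIO raw layer.
# The endpoint/credential defaults, bucket and key names are assumptions.
client = Minio(
    os.getenv("MINIO_ENDPOINT", "localhost:9000"),
    access_key=os.getenv("AWS_ACCESS_KEY_ID", "minio"),
    secret_key=os.getenv("AWS_SECRET_ACCESS_KEY", "minio123"),
    secure=False,
)

# Stand-in for a table extracted from MySQL with Polars
df = pl.DataFrame({"video_id": ["example_id"], "view_count": [1000]})

tmp_path = "/tmp/CA_youtube_trending.parquet"
df.write_parquet(tmp_path)  # parquet gives better read performance downstream

if not client.bucket_exists("lakehouse"):
    client.make_bucket("lakehouse")
client.fput_object("lakehouse", "bronze/youtube/CA_youtube_trending.parquet", tmp_path)
~~~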
17 | 18 | ## ⚡ Workflow 19 | 20 | 21 | ## 📦 Technologies 22 | - `MySQL` 23 | - `Youtube API` 24 | - `Polars` 25 | - `MinIO` 26 | - `Apache Spark` 27 | - `PostgreSQL` 28 | - `Dbt` 29 | - `Metabase` 30 | - `Streamlit` 31 | - `Dagster` 32 | - `Docker` 33 | - `Apache Superset` 34 | - `Unittest` 35 | - `Pytest` 36 | 37 | ## 🦄 Features 38 | Here's what you can do with this project: 39 | - You can change the logic or create new `assets` in the `data pipeline` as you wish, and perform `aggregate` `calculations` on the `assets` in the `pipeline` for your own purposes. 40 | - You can create new `data charts` or change the existing `charts` as you like, using the very diverse `chart types` in `Metabase` and `Apache Superset`. 41 | - You can also create new `dashboards` or change my existing ones as you like 42 | - `Search` videos quickly with any `keyword`, for the `Video Recommendation` app 43 | - `Search` in many different languages, not just `English`, since the dataset covers countries such as `Canada`, `Germany`, `India`, `Japan` and `Russia` 44 | - Recommend videos based on video `category` and `tags` 45 | 46 | ## 👩🏽‍🍳 The Process 47 | 48 | 49 | ## 📚 What I Learned 50 | 51 | During this project, I learned important skills, understood complex ideas, and learned how to install and set up popular, useful tools, all of which brought me closer to becoming a `Data Engineer`. 52 | - **Logical thinking**: I learned how to think like a data person: find the cause of a data `problem` and then come up with the most `reasonable solution` to achieve high data `accuracy`. 53 | - **Architecture**: I understand the `ideas` and `architecture` behind `Apache Spark`, one of today's most popular big data processing tools. 54 | - **Installation**: I learned how to install popular data processing, visualization and storage tools such as `Metabase`, `Streamlit` and `MinIO` with `Docker` 55 | - **Setup**: I know how to set up a `Spark Standalone Cluster` with three `Worker Nodes` on my local machine using `Docker` 56 | 57 | ### 📈 Overall Growth: 58 | Each part of this project has helped me understand more about how to build a data engineering and data management project, learn new things and improve my skills for future work 59 | 60 | ## 💭 How can it be improved? 61 | - Add more `data sources` to increase data richness. 62 | - Consider other `data warehouses` besides `PostgreSQL`, such as `Amazon Redshift` or `Snowflake`. 63 | - Perform more `cleaning` and `optimization` of the data. 64 | - Perform more advanced `statistics`, `analysis` and `calculations` with `Apache Spark`. 65 | - Check out other popular `data orchestration` tools like `Apache Airflow`. 66 | - Separate `dbt` into its own service (a separate `container`) in `docker` when the project expands 67 | - Set up the `Spark Cluster` on `cloud platforms` instead of on `local machines` 68 | - Consider `cloud computing` services if the project grows larger 69 | - Learn about `dbt packages` like `dbt-labs/dbt_utils` to make the `transformation` process faster and more optimal. 70 | 71 | ## 🚦 Running the Project 72 | To run the project in your local environment, follow these steps: 73 | 1. Run the following command to clone the `repository` to your `local machine`. 74 | ~~~bash 75 | git clone https://github.com/longNguyen010203/Youtube-ETL-Pipeline.git 76 | ~~~ 77 | 78 | 2.
Run the following commands to build the images from the `Dockerfile`, pull images from `docker hub` and launch services 79 | ~~~bash 80 | make build 81 | make up 82 | ~~~ 83 | 84 | 3. Run the following commands to access the `SQL editor` on the `terminal` and Check if `local_infile` was turned on 85 | ~~~python 86 | make to_mysql_root 87 | 88 | SET GLOBAL local_infile=TRUE; 89 | SHOW VARIABLES LIKE "local_infile"; 90 | exit 91 | ~~~ 92 | 93 | 4. Run the following commands to create tables with schema for `MySQL`, load data from `CSV` file to `MySQL` and create tables with schema for `PostgreSQL` 94 | ~~~bash 95 | make mysql_create 96 | make mysql_load 97 | make psql_create 98 | ~~~ 99 | 100 | 5. Open [http://localhost:3001](http://localhost:3001) to view `Dagster UI` and click `Materialize all` button to run the Pipeline 101 | 6. Open [http://localhost:9001](http://localhost:9001) to view `MinIO UI` and check the data to be loaded 102 | 7. Open [http://localhost:8080](http://localhost:8080) to view `Spark UI` and three `workers` are running 103 | 8. Open [http://localhost:3030](http://localhost:3030) to see charts and `dashboards` on `Metabase` 104 | 9. Open [http://localhost:8501](http://localhost:8501) to try out the `video recommendation` app on `Streamlit` 105 | 106 | ## 🍿 Video -------------------------------------------------------------------------------- /app/pages/video_detail.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import streamlit as st 3 | import psycopg2 4 | from PIL import Image 5 | from io import BytesIO 6 | import requests 7 | 8 | 9 | icon = Image.open("./icons/youtube_v2.png", mode="r") 10 | 11 | st.set_page_config( 12 | page_title="Video Recommender", 13 | page_icon=icon, 14 | layout="centered", 15 | initial_sidebar_state="expanded" 16 | ) 17 | 18 | title, logo = st.columns([4,2.91]) 19 | with title: 20 | st.title("YouTube RecoMaster") 21 | with logo: 22 | st.write("") 23 | st.image(icon, width=70) 24 | 25 | def display_video(url, recommended_videos=[]): 26 | if url not in recommended_videos: 27 | st.markdown( 28 | f'''''', 31 | unsafe_allow_html=True 32 | ) 33 | else: 34 | st.markdown( 35 | f'''''', 38 | unsafe_allow_html=True 39 | ) 40 | 41 | @st.cache_resource 42 | def init_connection(): 43 | return psycopg2.connect(**st.secrets["postgres"]) 44 | 45 | conn = init_connection() 46 | 47 | @st.cache_data(ttl=600) 48 | def run_query(query): 49 | with conn.cursor() as cur: 50 | cur.execute(query) 51 | return cur.fetchall() 52 | 53 | query_params = st.experimental_get_query_params() 54 | video_id = query_params.get('video_id', [None])[0] 55 | 56 | data = run_query(f""" 57 | select distinct 58 | title 59 | , channeltitle 60 | , categoryname 61 | , view 62 | , likes 63 | , dislike 64 | , publishedat 65 | , link_video 66 | , tags 67 | from youtube_trending.search_information si 68 | where video_id = '{video_id}'; 69 | """) 70 | 71 | videos = { 72 | "title": data[0][0], 73 | "channeltitle": data[0][1], 74 | "categoryname": data[0][2], 75 | "view": data[0][3], 76 | "like": data[0][4], 77 | "dislike": data[0][5], 78 | "publishedat": data[0][6], 79 | "link_video": data[0][7], 80 | "tags": data[0][8] 81 | } 82 | 83 | display_video(videos['link_video']) 84 | st.markdown(f"### {videos['title']}") 85 | view_icon = Image.open("./icons/icons8-view-48.png", mode="r") 86 | like_icon = Image.open("./icons/icons8-like-48.png", mode="r") 87 | dislike_icon = Image.open("./icons/icons8-thumbs-down-skin-type-4-48.png", 
mode="r") 88 | category_icon = Image.open("./icons/icons8-category-48.png", mode="r") 89 | channel_icon = Image.open("./icons/icons8-channel-48.png", mode="r") 90 | # st.write(f"{videos['tags']}") 91 | st.write(f"{videos['tags']}", unsafe_allow_html=True) 92 | 93 | title, view, like, dislike, category = st.columns([4,1,1,1,1.3]) 94 | with title: 95 | st.image(channel_icon, width=40) 96 | st.write(f"{videos['channeltitle']}") 97 | with view: 98 | st.image(view_icon, width=30) 99 | st.write(f"{videos['view']}") 100 | with like: 101 | st.image(like_icon, width=30) 102 | st.write(f"{videos['like']}") 103 | with dislike: 104 | st.image(dislike_icon, width=30) 105 | st.write(f"{videos['dislike']}") 106 | with category: 107 | st.image(category_icon, width=30) 108 | st.write(f"{videos['categoryname']}") 109 | 110 | 111 | st.subheader("Recommended Videos:") 112 | tags = "" 113 | tag_list = videos['tags'].split(' ') 114 | for tag in tag_list: tags += f"tags LIKE '%{tag}%' OR " 115 | tags = tags[:-3] 116 | 117 | query = f""" 118 | select distinct 119 | video_id 120 | , title 121 | , channeltitle 122 | , categoryname 123 | , view 124 | , likes 125 | , dislike 126 | , publishedat 127 | , link_video 128 | , tags 129 | , thumbnail_link 130 | from youtube_trending.search_information 131 | where (categoryname = '{videos['categoryname']}') AND 132 | ({tags}) AND video_id <> '{video_id}' 133 | limit 10; 134 | """ 135 | data2 = run_query(query) 136 | 137 | if data2 is not None: 138 | videos2 = { 139 | "video_id": [e[0] for e in data2], 140 | "title": [e[1] for e in data2], 141 | "channeltitle": [e[2] for e in data2], 142 | "categoryname": [e[3] for e in data2], 143 | "view": [e[4] for e in data2], 144 | "like": [e[5] for e in data2], 145 | "dislike": [e[6] for e in data2], 146 | "publishedat": [e[7] for e in data2], 147 | "link_video": [e[8] for e in data2], 148 | "tags": [e[9] for e in data2], 149 | 'thumbnail_link': [e[10] for e in data2] 150 | } 151 | 152 | 153 | recommended_videos = [] 154 | recommended_videos += videos2['link_video'] 155 | 156 | for video_id,title,channeltitle,categoryname,view,like,dislike,publishedat,link_video,tags,thumbnail_link in zip( 157 | videos2['video_id'],videos2['title'],videos2['channeltitle'],videos2['categoryname'], 158 | videos2['view'],videos2['like'], videos2['dislike'],videos2['publishedat'], 159 | videos2['link_video'],videos2['tags'],videos2['thumbnail_link']): 160 | 161 | col1, col2 = st.columns([1, 1]) 162 | 163 | with col1: 164 | img = Image.open(BytesIO(requests.get(thumbnail_link).content)) 165 | st.markdown( 166 | f'', 167 | unsafe_allow_html=True, 168 | ) 169 | st.image(img, use_column_width=True) 170 | 171 | with col2: 172 | st.write("") 173 | st.markdown(f""" 174 |
175 | {title}
176 | channel: {channeltitle}
177 | category: {categoryname}
178 | views: {view} 179 |
180 | """, unsafe_allow_html=True) 181 | st.write("") 182 | st.button("Detail", key=video_id) 183 | 184 | st.write("---") 185 | 186 | else: st.write(f"Not found") 187 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/assets/warehouse.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from dagster import AssetExecutionContext 3 | 4 | from dagster import ( 5 | multi_asset, 6 | AssetIn, 7 | AssetOut, 8 | MetadataValue, 9 | AssetExecutionContext, 10 | Output 11 | ) 12 | 13 | from ..partitions import monthly_partitions 14 | 15 | 16 | GROUP_NAME = "warehouse" 17 | 18 | @multi_asset( 19 | ins={ 20 | "gold_videoCategory": AssetIn( 21 | key_prefix=["gold", "youtube"], 22 | ) 23 | }, 24 | outs={ 25 | "videoCategory": AssetOut( 26 | key_prefix=["warehouse", "gold"], 27 | io_manager_key="psql_io_manager", 28 | metadata={ 29 | "primary_keys": [ 30 | "categoryId" 31 | ], 32 | "columns": [ 33 | "categoryId", 34 | "categoryName" 35 | ] 36 | }, 37 | group_name=GROUP_NAME 38 | ) 39 | }, 40 | name="videoCategory", 41 | required_resource_keys={"psql_io_manager"}, 42 | compute_kind="postgres", 43 | ) 44 | def videoCategory(context: AssetExecutionContext, 45 | gold_videoCategory: pl.DataFrame 46 | ) -> Output[pl.DataFrame]: 47 | """ 48 | Load videoCategory data from gold to PostgreSQL warehouse 49 | """ 50 | pl_data: pl.DataFrame = gold_videoCategory 51 | context.log.info(f"Load videoCategory data Success with shape {pl_data.shape}") 52 | 53 | return Output( 54 | value=pl_data, 55 | metadata={ 56 | "table name": MetadataValue.text("videoCategory"), 57 | "record count": MetadataValue.int(pl_data.shape[0]), 58 | "column count": MetadataValue.int(pl_data.shape[1]), 59 | "columns": pl_data.columns 60 | } 61 | ) 62 | 63 | 64 | @multi_asset( 65 | ins={ 66 | "gold_linkVideos": AssetIn( 67 | key_prefix=["gold", "youtube"], 68 | ) 69 | }, 70 | outs={ 71 | "linkVideos": AssetOut( 72 | key_prefix=["warehouse", "gold"], 73 | io_manager_key="psql_io_manager", 74 | metadata={ 75 | "primary_keys": [ 76 | "video_id" 77 | ], 78 | "columns": [ 79 | "video_id", 80 | "link_video" 81 | ] 82 | }, 83 | group_name=GROUP_NAME 84 | ) 85 | }, 86 | name="linkVideos", 87 | required_resource_keys={"psql_io_manager"}, 88 | compute_kind="postgres" 89 | ) 90 | def linkVideos(context: AssetExecutionContext, 91 | gold_linkVideos: pl.DataFrame 92 | ) -> Output[pl.DataFrame]: 93 | """ 94 | Load linkVideos data from gold to PostgreSQL warehouse 95 | """ 96 | pl_data: pl.DataFrame = gold_linkVideos 97 | context.log.info(f"Load linkVideos data Success with shape {pl_data.shape}") 98 | 99 | return Output( 100 | value=pl_data, 101 | metadata={ 102 | "table name": MetadataValue.text("linkVideos"), 103 | "record count": MetadataValue.int(pl_data.shape[0]), 104 | "column count": MetadataValue.int(pl_data.shape[1]), 105 | "columns": pl_data.columns 106 | } 107 | ) 108 | 109 | 110 | @multi_asset( 111 | ins={ 112 | "gold_metric_trending": AssetIn( 113 | key_prefix=["gold", "youtube"] 114 | ) 115 | }, 116 | outs={ 117 | "metricVideos": AssetOut( 118 | key_prefix=["warehouse", "gold"], 119 | io_manager_key="psql_io_manager", 120 | metadata={ 121 | "primary_keys": [ 122 | "video_id" 123 | ], 124 | "columns": [ 125 | "video_id", 126 | "publishedAt", 127 | "trending_date", 128 | "channelId", 129 | "categoryId", 130 | "view_count", 131 | "likes", 132 | "dislikes", 133 | "comment_count" 134 | ] 135 | }, 136 | group_name=GROUP_NAME 137 | ) 138 | 
}, 139 | name="metricVideos", 140 | required_resource_keys={"psql_io_manager"}, 141 | partitions_def=monthly_partitions, 142 | compute_kind="postgres" 143 | ) 144 | def metricVideos(context: AssetExecutionContext, 145 | gold_metric_trending: pl.DataFrame 146 | ) -> Output[pl.DataFrame]: 147 | """ 148 | Load metricVideos data from gold to PostgreSQL warehouse 149 | """ 150 | pl_data: pl.DataFrame = gold_metric_trending 151 | context.log.info(f"Load metricVideos data Success with shape {pl_data.shape}") 152 | 153 | return Output( 154 | value=pl_data, 155 | metadata={ 156 | "table name": MetadataValue.text("metricVideos"), 157 | "record count": MetadataValue.int(pl_data.shape[0]), 158 | "column count": MetadataValue.int(pl_data.shape[1]), 159 | "columns": pl_data.columns 160 | } 161 | ) 162 | 163 | 164 | @multi_asset( 165 | ins={ 166 | "gold_information_trending": AssetIn( 167 | key_prefix=["gold", "youtube"] 168 | ) 169 | }, 170 | outs={ 171 | "informationVideos": AssetOut( 172 | key_prefix=["warehouse", "gold"], 173 | io_manager_key="psql_io_manager", 174 | metadata={ 175 | "primary_keys": [ 176 | "video_id" 177 | ], 178 | "columns": [ 179 | "video_id", 180 | "title", 181 | "channelId", 182 | "channelTitle", 183 | "categoryId", 184 | "tags", 185 | "thumbnail_link", 186 | "comments_disabled", 187 | "ratings_disabled", 188 | ] 189 | }, 190 | group_name=GROUP_NAME 191 | ) 192 | }, 193 | name="informationVideos", 194 | required_resource_keys={"psql_io_manager"}, 195 | partitions_def=monthly_partitions, 196 | compute_kind="postgres" 197 | ) 198 | def informationVideos(context: AssetExecutionContext, 199 | gold_information_trending: pl.DataFrame 200 | ) -> Output[pl.DataFrame]: 201 | """ 202 | Load informationVideos data from gold to PostgreSQL warehouse 203 | """ 204 | pl_data: pl.DataFrame = gold_information_trending 205 | context.log.info(f"Load informationVideos data Success with shape {pl_data.shape}") 206 | 207 | return Output( 208 | value=pl_data, 209 | metadata={ 210 | "table name": MetadataValue.text("informationVideos"), 211 | "record count": MetadataValue.int(pl_data.shape[0]), 212 | "column count": MetadataValue.int(pl_data.shape[1]), 213 | "columns": pl_data.columns 214 | } 215 | ) -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/resources/spark_io_manager.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Union 2 | from datetime import datetime 3 | from dagster import IOManager, InputContext, OutputContext 4 | 5 | import os 6 | import polars as pl 7 | import pandas as pd 8 | from contextlib import contextmanager 9 | from pyspark.sql import SparkSession, DataFrame 10 | from pyspark import SparkConf 11 | from .minio_io_manager import connect_minio 12 | 13 | 14 | @contextmanager 15 | def create_spark_session(config, appName=None): 16 | spark = ( 17 | SparkSession.builder.appName(appName) 18 | .master("spark://spark-master:7077") 19 | .config("spark.driver.memory", "4g") 20 | .config("spark.executor.memory", "4g") 21 | # .config("spark.cores.max", "4") 22 | # .config("spark.executor.cores", "4") 23 | .config( 24 | "spark.jars", 25 | "/usr/local/spark/jars/delta-core_2.12-2.2.0.jar,/usr/local/spark/jars/hadoop-aws-3.3.2.jar,/usr/local/spark/jars/delta-storage-2.2.0.jar,/usr/local/spark/jars/aws-java-sdk-1.12.367.jar,/usr/local/spark/jars/s3-2.18.41.jar,/usr/local/spark/jars/aws-java-sdk-bundle-1.11.1026.jar", 26 | ) 27 | .config("spark.sql.catalog.spark_catalog", 
"org.apache.spark.sql.delta.catalog.DeltaCatalog") 28 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 29 | .config("spark.hadoop.fs.s3a.endpoint", "http://" + config["endpoint_url"]) 30 | .config("spark.hadoop.fs.s3a.access.key", str(config["aws_access_key_id"])) 31 | .config("spark.hadoop.fs.s3a.secret.key", str(config["aws_secret_access_key"])) 32 | .config("spark.hadoop.fs.s3a.path.style.access", "true") 33 | .config("spark.hadoop.fs.connection.ssl.enabled", "false") 34 | .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 35 | .config("spark.sql.execution.arrow.pyspark.enabled", "true") 36 | .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true") 37 | .getOrCreate() 38 | ) 39 | 40 | try: 41 | yield spark 42 | except Exception as e: 43 | raise f"Error Pyspark: {e}" 44 | 45 | 46 | class SparkIOManager(IOManager): 47 | 48 | def __init__(self, config) -> None: 49 | self._config = config 50 | 51 | 52 | def get_spark_session(self, context, appName=None) -> SparkSession: 53 | with create_spark_session(self._config, appName) as spark: 54 | context.log.info("Return Object SparkSession") 55 | return spark 56 | 57 | 58 | def _get_path(self, context: Union[InputContext, OutputContext]): 59 | layer, schema, table = context.asset_key.path 60 | key = "/".join([layer, schema, table.replace(f"{layer}_", "")]) 61 | tmp_file_path = "/tmp/file-{}-{}.parquet".format( 62 | datetime.today().strftime("%Y%m%d%H%M%S"), 63 | "-".join(context.asset_key.path) 64 | ) 65 | return key, tmp_file_path 66 | 67 | 68 | def handle_output(self, context: OutputContext, obj: DataFrame): 69 | key_name, tmp_file_path = self._get_path(context) 70 | bucket_name = self._config.get("bucket") 71 | ## ====> 72 | file_path = "s3a://lakehouse/" + key_name 73 | context.log.info(f"file_path: {file_path}") 74 | context.log.info(f"key_name: {key_name}") 75 | 76 | if context.has_partition_key: 77 | start, end = context.asset_partitions_time_window 78 | # partition_str = context.asset_partition_key 79 | partition_str = start.strftime("%Y%m") 80 | context.log.info(f"INFO: {os.path.join(key_name, partition_str)}.parquet, {tmp_file_path}") 81 | key_name, tmp_file_path = os.path.join(key_name, f"{partition_str}.parquet"), tmp_file_path 82 | else: 83 | context.log.info(f"INFO: {key_name}.parquet, {tmp_file_path}") 84 | key_name, tmp_file_path = f"{key_name}.parquet", tmp_file_path 85 | 86 | 87 | obj.write.mode('overwrite').parquet(tmp_file_path) 88 | 89 | with connect_minio(self._config) as client: 90 | try: 91 | bucket_name = self._config.get("bucket") 92 | with connect_minio(self._config) as client: 93 | # Make bucket if not exist. 
94 | found = client.bucket_exists(bucket_name) 95 | if not found: 96 | client.make_bucket(bucket_name) 97 | else: 98 | print(f"Bucket {bucket_name} already exists") 99 | client.fput_object(bucket_name, key_name, tmp_file_path) 100 | row_count = obj.count() 101 | context.add_output_metadata( 102 | { 103 | "path": key_name, 104 | "records": row_count, 105 | "tmp": tmp_file_path 106 | } 107 | ) 108 | # clean up tmp file 109 | os.remove(tmp_file_path) 110 | 111 | except Exception as e: 112 | raise e 113 | 114 | 115 | def load_input(self, context: InputContext) -> DataFrame: 116 | key_name, tmp_file_path = self._get_path(context) 117 | bucket_name = self._config.get("bucket") 118 | 119 | if context.has_asset_partitions: 120 | start, end = context.asset_partitions_time_window 121 | # partition_str = context.asset_partition_key 122 | partition_str = start.strftime("%Y%m") 123 | context.log.info(f"INFO: {os.path.join(key_name, partition_str)}.parquet, {tmp_file_path}") 124 | key_name, tmp_file_path = os.path.join(key_name, f"{partition_str}.parquet"), tmp_file_path 125 | else: 126 | context.log.info(f"INFO: {key_name}.parquet, {tmp_file_path}") 127 | key_name, tmp_file_path = f"{key_name}.parquet", tmp_file_path 128 | 129 | with connect_minio(self._config) as client: 130 | try: 131 | with connect_minio(self._config) as client: 132 | # Make bucket if not exist. 133 | found = client.bucket_exists(bucket_name) 134 | if not found: 135 | client.make_bucket(bucket_name) 136 | else: 137 | print(f"Bucket {bucket_name} already exists") 138 | 139 | context.log.info(f"INFO -> bucket_name: {bucket_name}") 140 | context.log.info(f"INFO -> key_name: {key_name}") 141 | context.log.info(f"INFO -> tmp_file_path: {tmp_file_path}") 142 | 143 | client.fget_object(bucket_name, key_name, tmp_file_path) 144 | 145 | spark: SparkSession = self.get_spark_session(self, appName="Read-Parquet") 146 | df = spark.read.parquet(tmp_file_path) 147 | 148 | return df 149 | 150 | except Exception as e: 151 | raise e -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/resources/youtube_io_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from contextlib import contextmanager 4 | from datetime import datetime, timedelta 5 | from typing import Union, List 6 | 7 | import polars as pl 8 | from googleapiclient.discovery import build 9 | from googleapiclient.errors import HttpError 10 | from dagster import IOManager, InputContext, OutputContext 11 | from .minio_io_manager import connect_minio 12 | from .. 
import constants 13 | 14 | 15 | @contextmanager 16 | def youtube_client(config: dict): 17 | api_service_name = config["api_service_name"] 18 | api_version = config["api_version"] 19 | api_key = config["api_key"] 20 | 21 | youtube = build( 22 | serviceName=api_service_name, 23 | version=api_version, 24 | developerKey=api_key 25 | ) 26 | try: 27 | yield youtube 28 | except HttpError as e: 29 | raise 'An HTTP error %d occurred:\n%s' % (e.resp.status, e.content) 30 | 31 | 32 | class YoutubeIOManager(IOManager): 33 | 34 | def __init__(self, config) -> None: 35 | self._config = config 36 | 37 | 38 | def _get_path(self, context: Union[InputContext, OutputContext]): 39 | 40 | start = constants.START_DATE 41 | end = constants.END_DATE 42 | start_date = datetime.strptime(start, "%Y-%m-%d") 43 | end_date = datetime.strptime(end, "%Y-%m-%d") 44 | 45 | layer, schema, table = context.asset_key.path 46 | table = "youtube_trending_data" 47 | layer = "bronze" 48 | key = "/".join([layer, schema, table.replace(f"{layer}_", "")]) 49 | 50 | key_names: list[str] = [] 51 | tmp_file_paths: list[str] = [] 52 | 53 | for date in range((end_date - start_date).days + 1): 54 | partition_date = start_date + timedelta(days=date) 55 | partition_date.strftime("%Y-%m") 56 | key_name = f"{key}/" + str(partition_date)[:7].replace("-", "") + ".pq" 57 | # key_name = "bronze/youtube/youtube_trending_data/202011.pq" 58 | tmp_file_path = "/tmp/file-{}-{}.parquet".format( 59 | datetime.today().strftime("%Y%m%d%H%M%S"), 60 | str(partition_date)[:7].replace("-", "") 61 | ) 62 | # tmp_file_path = "/tmp/file-2020-11.parquet" 63 | context.log.info(f"INFO -> key_name: {key_name}") 64 | context.log.info(f"INFO -> tmp_file_path: {tmp_file_path}") 65 | 66 | key_names.append(key_name) 67 | tmp_file_paths.append(tmp_file_path) 68 | 69 | return key_names, tmp_file_paths 70 | 71 | 72 | def list_of_list(self, obj: pl.Series) -> list[list[str]]: 73 | start = 0 74 | end = 50 75 | lists: List[List] = [] 76 | for lst in range(len(obj) // 50 + 1): 77 | lists.append(list(obj)[start:end]) 78 | start += 50 79 | end += 50 80 | return lists 81 | 82 | 83 | def get_DataFrame(self, context, field: str) -> pl.DataFrame: 84 | bucket_name = self._config.get("bucket") 85 | key_names, tmp_file_paths = self._get_path(context) 86 | 87 | try: 88 | with connect_minio(self._config) as client: 89 | # Make bucket if not exist. 
90 | found = client.bucket_exists(bucket_name) 91 | if not found: 92 | client.make_bucket(bucket_name) 93 | else: 94 | print(f"Bucket {bucket_name} already exists") 95 | 96 | except Exception as e: 97 | raise e 98 | 99 | list_dfs: list[pl.DataFrame] = [] 100 | for key_name, tmp_file_path in zip(key_names, tmp_file_paths): 101 | client.fget_object(bucket_name, key_name, tmp_file_path) 102 | df = pl.read_parquet(tmp_file_path)[field].unique() 103 | list_dfs.append(df) 104 | time.sleep(0.5) 105 | 106 | context.log.info(f"INFO -> key_name: {key_name}, tmp_file_path: {tmp_file_path}") 107 | os.remove(tmp_file_path) 108 | 109 | pl_data = pl.concat(list_dfs).unique() 110 | return pl_data 111 | 112 | 113 | def downLoad_videoCategories(self, context, obj: pl.DataFrame) -> pl.DataFrame: 114 | 115 | # pl_data = self.get_DataFrame(context, "categoryId") 116 | pl_data = obj["categoryId"].unique() 117 | 118 | with youtube_client(self._config) as service: 119 | categoryNames: list[str] = [] 120 | categoryIds: list[str] = [] 121 | 122 | categoryId_list: pl.Series = pl_data 123 | context.log.info("Divide categoryIds to multiple list categoryIds") 124 | 125 | for categoryId in list(categoryId_list.unique()): 126 | request = service.videoCategories().list( 127 | part="snippet", 128 | id=categoryId 129 | ) 130 | response = request.execute() 131 | 132 | try: 133 | categoryIds.append(str(response["items"][0]["id"])) 134 | categoryNames.append(str(response["items"][0]["snippet"]["title"])) 135 | except IndexError: 136 | categoryNames.append(response["items"]["snippet"]["title"]) 137 | 138 | 139 | return pl.DataFrame( 140 | { 141 | "categoryId": categoryIds, 142 | "categoryName": categoryNames 143 | } 144 | ) 145 | 146 | 147 | def downLoad_linkVideos(self, context, obj: pl.DataFrame) -> pl.DataFrame: 148 | 149 | pl_data = obj["video_id"].unique() 150 | 151 | with youtube_client(self._config) as service: 152 | link_videos: list[str] = [] 153 | videoIds: list[str] = [] 154 | 155 | video_id_list: pl.Series = pl_data 156 | context.log.info("Divide videoId to multiple list videoId") 157 | 158 | for videoId in self.list_of_list(video_id_list.unique()): 159 | # videoId = list(map(lambda id: id[1:-1], videoId)) 160 | # context.log.info(",".join(videoId)[:20]) 161 | request = service.videos().list( 162 | part="player", 163 | id=",".join(videoId) 164 | ) 165 | response = request.execute() 166 | 167 | for data in response["items"]: 168 | try: 169 | videoIds.append(str(data["id"])) 170 | link_videos.append(str(data["player"]["embedHtml"][40:74])) 171 | except IndexError as e: 172 | link_videos.append(response["items"]["snippet"]["title"]) 173 | raise e 174 | 175 | return pl.DataFrame( 176 | { 177 | "videoId": videoIds, 178 | "link_video": link_videos 179 | } 180 | ) 181 | 182 | 183 | def handle_output(self, context: OutputContext, obj: pl.DataFrame): 184 | pass 185 | 186 | 187 | def load_input(self, context: InputContext) -> pl.DataFrame: 188 | bucket_name = self._config.get("bucket") 189 | key_name, tmp_file_path = self._get_path(context) 190 | 191 | try: 192 | with connect_minio(self._config) as client: 193 | # Make bucket if not exist. 
194 | found = client.bucket_exists(bucket_name) 195 | if not found: 196 | client.make_bucket(bucket_name) 197 | else: 198 | print(f"Bucket {bucket_name} already exists") 199 | 200 | client.fget_object(bucket_name, key_name, tmp_file_path) 201 | pd_data = pl.read_parquet(tmp_file_path) 202 | return pd_data 203 | 204 | except Exception as e: 205 | raise e 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/assets/gold.py: -------------------------------------------------------------------------------- 1 | import os 2 | import polars as pl 3 | from datetime import datetime 4 | from pyspark.sql import DataFrame 5 | 6 | from dagster import ( 7 | multi_asset, 8 | AssetIn, 9 | AssetOut, 10 | MetadataValue, 11 | AssetExecutionContext, 12 | Output 13 | ) 14 | 15 | from ..partitions import monthly_partitions 16 | from ..resources.spark_io_manager import create_spark_session 17 | 18 | 19 | GROUP_NAME = "gold" 20 | 21 | @multi_asset( 22 | ins={ 23 | "silver_videoCategory_cleaned": AssetIn( 24 | key_prefix=["silver", "youtube"], 25 | input_manager_key="spark_io_manager" 26 | ) 27 | }, 28 | outs={ 29 | "gold_videoCategory": AssetOut( 30 | key_prefix=["gold", "youtube"], 31 | io_manager_key="spark_io_manager", 32 | metadata={ 33 | "primary_keys": [ 34 | "categoryId" 35 | ], 36 | "columns": [ 37 | "categoryId", 38 | "categoryName" 39 | ] 40 | }, 41 | group_name=GROUP_NAME 42 | ) 43 | }, 44 | name="gold_videoCategory", 45 | required_resource_keys={"spark_io_manager"}, 46 | compute_kind="PySpark", 47 | ) 48 | def gold_videoCategory(context: AssetExecutionContext, 49 | silver_videoCategory_cleaned: pl.DataFrame 50 | ) -> Output[DataFrame]: 51 | """ 52 | Compute and Load videoCategory data from silver to gold layer in MinIO 53 | """ 54 | CONFIG = { 55 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 56 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 57 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 58 | } 59 | 60 | with create_spark_session( 61 | CONFIG, "gold_videoCategory-{}".format(datetime.today()) 62 | ) as spark: 63 | spark_df: DataFrame = spark.createDataFrame(silver_videoCategory_cleaned.to_pandas()) 64 | context.log.info(f"Load {context.asset_key.path[-1]} to gold layer success 🙂") 65 | 66 | return Output( 67 | value=spark_df, 68 | metadata={ 69 | "file name": MetadataValue.text("videoCategory.pq"), 70 | "record count": MetadataValue.int(spark_df.count()), 71 | "column count": MetadataValue.int(len(spark_df.columns)), 72 | "columns": spark_df.columns 73 | } 74 | ) 75 | 76 | 77 | @multi_asset( 78 | ins={ 79 | "silver_linkVideos_cleaned": AssetIn( 80 | key_prefix=["silver", "youtube"], 81 | input_manager_key="spark_io_manager" 82 | ) 83 | }, 84 | outs={ 85 | "gold_linkVideos": AssetOut( 86 | key_prefix=["gold", "youtube"], 87 | io_manager_key="spark_io_manager", 88 | metadata={ 89 | "primary_keys": [ 90 | "video_id" 91 | ], 92 | "columns": [ 93 | "video_id", 94 | "link_video" 95 | ] 96 | }, 97 | group_name=GROUP_NAME 98 | ) 99 | }, 100 | name="gold_linkVideos", 101 | required_resource_keys={"spark_io_manager"}, 102 | compute_kind="PySpark" 103 | ) 104 | def gold_linkVideos(context: AssetExecutionContext, 105 | silver_linkVideos_cleaned: pl.DataFrame 106 | ) -> Output[DataFrame]: 107 | """ 108 | Compute and Load linkVideos data from silver to gold layer in MinIO 109 | """ 110 | 111 | CONFIG = { 112 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 113 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 
114 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 115 | } 116 | 117 | with create_spark_session( 118 | CONFIG, "gold_linkVideos-{}".format(datetime.today()) 119 | ) as spark: 120 | spark_df: DataFrame = spark.createDataFrame(silver_linkVideos_cleaned.to_pandas()) 121 | context.log.info(f"Load {context.asset_key.path[-1]} to gold layer success 🙂") 122 | 123 | return Output( 124 | value=spark_df, 125 | metadata={ 126 | "file name": MetadataValue.text("linkVideos.pq"), 127 | "record count": MetadataValue.int(spark_df.count()), 128 | "column count": MetadataValue.int(len(spark_df.columns)), 129 | "columns": spark_df.columns 130 | } 131 | ) 132 | 133 | 134 | @multi_asset( 135 | ins={ 136 | "silver_trending_cleaned": AssetIn( 137 | key_prefix=["silver", "youtube"], 138 | input_manager_key="spark_io_manager" 139 | ) 140 | }, 141 | outs={ 142 | "gold_metric_trending": AssetOut( 143 | key_prefix=["gold", "youtube"], 144 | io_manager_key="spark_io_manager", 145 | metadata={ 146 | "primary_keys": [ 147 | "video_id" 148 | ], 149 | "columns": [ 150 | "video_id", 151 | "publishedAt", 152 | "trending_date", 153 | "channelId", 154 | "categoryId", 155 | "view_count", 156 | "likes", 157 | "dislikes", 158 | "comment_count" 159 | ] 160 | }, 161 | group_name=GROUP_NAME 162 | ), 163 | "gold_information_trending": AssetOut( 164 | key_prefix=["gold", "youtube"], 165 | io_manager_key="spark_io_manager", 166 | metadata={ 167 | "primary_keys": [ 168 | "video_id" 169 | ], 170 | "columns": [ 171 | "video_id", 172 | "title", 173 | "channelId", 174 | "channelTitle", 175 | "categoryId", 176 | "tags", 177 | "thumbnail_link", 178 | "comments_disabled", 179 | "ratings_disabled", 180 | ] 181 | }, 182 | group_name=GROUP_NAME 183 | ), 184 | }, 185 | name="gold_metric_trending", 186 | required_resource_keys={"spark_io_manager"}, 187 | partitions_def=monthly_partitions, 188 | compute_kind="pyspark" 189 | ) 190 | def gold_metric_trending(context: AssetExecutionContext, 191 | silver_trending_cleaned: pl.DataFrame 192 | ): 193 | """ 194 | Compute and Load trending data from silver to gold layer in MinIO 195 | """ 196 | 197 | CONFIG = { 198 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 199 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 200 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 201 | } 202 | 203 | with create_spark_session( 204 | CONFIG, "gold_metric_trending-{}".format(datetime.today()) 205 | ) as spark: 206 | 207 | metric: DataFrame = spark.createDataFrame(silver_trending_cleaned.select([ 208 | "video_id", 209 | "publishedAt", 210 | "trending_date", 211 | "channelId", 212 | "categoryId", 213 | "view_count", 214 | "likes", 215 | "dislikes", 216 | "comment_count" 217 | ])) 218 | information: DataFrame = spark.createDataFrame(silver_trending_cleaned.select([ 219 | "video_id", 220 | "title", 221 | "channelId", 222 | "channelTitle", 223 | "categoryId", 224 | "tags", 225 | "thumbnail_link", 226 | "comments_disabled", 227 | "ratings_disabled", 228 | ])) 229 | context.log.info(f"Load {context.asset_key.path[-1]} to gold layer success 🙂") 230 | 231 | return Output( 232 | value=metric, 233 | output_name="gold_metric_trending", 234 | metadata={ 235 | "folder name": MetadataValue.text("metric_trending"), 236 | "record count": MetadataValue.int(metric.count()), 237 | "column count": MetadataValue.int(len(metric.columns)), 238 | "columns": metric.columns 239 | } 240 | ), Output( 241 | value=information, 242 | output_name="gold_information_trending", 243 | metadata={ 244 | "folder name": 
MetadataValue.text("information_trending"), 245 | "record count": MetadataValue.int(information.count()), 246 | "column count": MetadataValue.int(len(information.columns)), 247 | "columns": information.columns 248 | } 249 | ), -------------------------------------------------------------------------------- /etl_pipeline/etl_pipeline/assets/bronze.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from ..partitions import monthly_partitions 3 | 4 | from dagster import ( 5 | asset, 6 | Output, 7 | AssetIn, 8 | AssetOut, 9 | multi_asset, 10 | MetadataValue, 11 | AssetExecutionContext 12 | ) 13 | 14 | 15 | GROUP_NAME = "bronze" 16 | 17 | @asset( 18 | name="bronze_CA_youtube_trending", 19 | required_resource_keys={"mysql_io_manager"}, 20 | io_manager_key="minio_io_manager", 21 | key_prefix=["bronze", "youtube"], 22 | compute_kind="SQL", 23 | group_name=GROUP_NAME 24 | ) 25 | def bronze_CA_youtube_trending(context: AssetExecutionContext) -> Output[pl.DataFrame]: 26 | """ 27 | Load table 'CA_youtube_trending_data' 28 | from MySQL database as polars DataFrame and save to MinIO 29 | """ 30 | query = """ SELECT * FROM CA_youtube_trending_data; """ 31 | 32 | pl_data: pl.DataFrame = context.resources.mysql_io_manager.extract_data(query) 33 | context.log.info(f"Extract table 'CA_youtube_trending_data' from MySQL Success") 34 | pl_data = pl_data.with_columns(pl.lit("CA").alias("country_code")) 35 | 36 | return Output( 37 | value=pl_data, 38 | metadata={ 39 | "file name": MetadataValue.text("CA_youtube_trending.pq"), 40 | "number columns": MetadataValue.int(pl_data.shape[1]), 41 | "number records": MetadataValue.int(pl_data.shape[0]) 42 | } 43 | ) 44 | 45 | 46 | @asset( 47 | name="bronze_DE_youtube_trending", 48 | required_resource_keys={"mysql_io_manager"}, 49 | io_manager_key="minio_io_manager", 50 | key_prefix=["bronze", "youtube"], 51 | compute_kind="SQL", 52 | group_name=GROUP_NAME 53 | ) 54 | def bronze_DE_youtube_trending(context: AssetExecutionContext) -> Output[pl.DataFrame]: 55 | """ 56 | Load table 'DE_youtube_trending_data' 57 | from MySQL database as polars DataFrame and save to MinIO 58 | """ 59 | query = """ SELECT * FROM DE_youtube_trending_data; """ 60 | 61 | pl_data: pl.DataFrame = context.resources.mysql_io_manager.extract_data(query) 62 | context.log.info(f"Extract table 'DE_youtube_trending_data' from MySQL Success") 63 | pl_data = pl_data.with_columns(pl.lit("DE").alias("country_code")) 64 | 65 | return Output( 66 | value=pl_data, 67 | metadata={ 68 | "file name": MetadataValue.text("DE_youtube_trending.pq"), 69 | "number columns": MetadataValue.int(pl_data.shape[1]), 70 | "number records": MetadataValue.int(pl_data.shape[0]) 71 | } 72 | ) 73 | 74 | 75 | @asset( 76 | name="bronze_IN_youtube_trending", 77 | required_resource_keys={"mysql_io_manager"}, 78 | io_manager_key="minio_io_manager", 79 | key_prefix=["bronze", "youtube"], 80 | compute_kind="SQL", 81 | group_name=GROUP_NAME 82 | ) 83 | def bronze_IN_youtube_trending(context: AssetExecutionContext) -> Output[pl.DataFrame]: 84 | """ 85 | Load table 'IN_youtube_trending_data' 86 | from MySQL database as polars DataFrame and save to MinIO 87 | """ 88 | query = """ SELECT * FROM IN_youtube_trending_data; """ 89 | 90 | pl_data: pl.DataFrame = context.resources.mysql_io_manager.extract_data(query) 91 | context.log.info(f"Extract table 'IN_youtube_trending_data' from MySQL Success") 92 | pl_data = pl_data.with_columns(pl.lit("IN").alias("country_code")) 93 | 94 | return 
Output( 95 | value=pl_data, 96 | metadata={ 97 | "file name": MetadataValue.text("IN_youtube_trending.pq"), 98 | "number columns": MetadataValue.int(pl_data.shape[1]), 99 | "number records": MetadataValue.int(pl_data.shape[0]) 100 | } 101 | ) 102 | 103 | 104 | @asset( 105 | name="bronze_JP_youtube_trending", 106 | required_resource_keys={"mysql_io_manager"}, 107 | io_manager_key="minio_io_manager", 108 | key_prefix=["bronze", "youtube"], 109 | compute_kind="SQL", 110 | group_name=GROUP_NAME 111 | ) 112 | def bronze_JP_youtube_trending(context: AssetExecutionContext) -> Output[pl.DataFrame]: 113 | """ 114 | Load table 'JP_youtube_trending_data' 115 | from MySQL database as polars DataFrame and save to MinIO 116 | """ 117 | query = """ SELECT * FROM JP_youtube_trending_data; """ 118 | 119 | pl_data: pl.DataFrame = context.resources.mysql_io_manager.extract_data(query) 120 | context.log.info(f"Extract table 'JP_youtube_trending_data' from MySQL Success") 121 | pl_data = pl_data.with_columns(pl.lit("JP").alias("country_code")) 122 | 123 | return Output( 124 | value=pl_data, 125 | metadata={ 126 | "file name": MetadataValue.text("JP_youtube_trending.pq"), 127 | "number columns": MetadataValue.int(pl_data.shape[1]), 128 | "number records": MetadataValue.int(pl_data.shape[0]) 129 | } 130 | ) 131 | 132 | 133 | @asset( 134 | name="bronze_RU_youtube_trending", 135 | required_resource_keys={"mysql_io_manager"}, 136 | io_manager_key="minio_io_manager", 137 | key_prefix=["bronze", "youtube"], 138 | compute_kind="SQL", 139 | group_name=GROUP_NAME 140 | ) 141 | def bronze_RU_youtube_trending(context: AssetExecutionContext) -> Output[pl.DataFrame]: 142 | """ 143 | Load table 'RU_youtube_trending_data' 144 | from MySQL database as polars DataFrame and save to MinIO 145 | """ 146 | query = """ SELECT * FROM RU_youtube_trending_data; """ 147 | 148 | pl_data: pl.DataFrame = context.resources.mysql_io_manager.extract_data(query) 149 | context.log.info(f"Extract table 'RU_youtube_trending_data' from MySQL Success") 150 | pl_data = pl_data.with_columns(pl.lit("RU").alias("country_code")) 151 | 152 | return Output( 153 | value=pl_data, 154 | metadata={ 155 | "file name": MetadataValue.text("RU_youtube_trending.pq"), 156 | "number columns": MetadataValue.int(pl_data.shape[1]), 157 | "number records": MetadataValue.int(pl_data.shape[0]) 158 | } 159 | ) 160 | 161 | 162 | @asset( 163 | ins={ 164 | "bronze_CA_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 165 | "bronze_DE_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 166 | "bronze_IN_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]) 167 | }, 168 | name="bronze_linkVideos_trending", 169 | required_resource_keys={"youtube_io_manager"}, 170 | io_manager_key="minio_io_manager", 171 | key_prefix=["bronze", "youtube"], 172 | group_name=GROUP_NAME, 173 | compute_kind="Youtube API" 174 | ) 175 | def bronze_linkVideos_trending(context: AssetExecutionContext, 176 | bronze_CA_youtube_trending: pl.DataFrame, 177 | bronze_DE_youtube_trending: pl.DataFrame, 178 | bronze_IN_youtube_trending: pl.DataFrame 179 | ) -> Output[pl.DataFrame]: 180 | """ 181 | Download Link Video from Youtube API by VideoId 182 | """ 183 | data = pl.concat( 184 | [ 185 | bronze_CA_youtube_trending, 186 | bronze_DE_youtube_trending, 187 | bronze_IN_youtube_trending 188 | ] 189 | ) 190 | 191 | pl_data: pl.DataFrame = context \ 192 | .resources \ 193 | .youtube_io_manager \ 194 | .downLoad_linkVideos( 195 | context, data 196 | ) 197 | context.log.info("Download links 
video from youtube api success") 198 | 199 | return Output( 200 | value=pl_data, 201 | metadata={ 202 | "File Name": MetadataValue.text("linkVideos_trending.pq"), 203 | "Number Columns": MetadataValue.int(pl_data.shape[1]), 204 | "Number Records": MetadataValue.int(pl_data.shape[0]) 205 | } 206 | ) 207 | 208 | 209 | @asset( 210 | ins={ 211 | "bronze_JP_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 212 | "bronze_RU_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]) 213 | }, 214 | name="bronze_videoCategory_trending", 215 | required_resource_keys={"youtube_io_manager"}, 216 | io_manager_key="minio_io_manager", 217 | key_prefix=["bronze", "youtube"], 218 | compute_kind="Youtube API", 219 | group_name=GROUP_NAME, 220 | ) 221 | def bronze_videoCategory_trending(context: AssetExecutionContext, 222 | bronze_JP_youtube_trending: pl.DataFrame, 223 | bronze_RU_youtube_trending: pl.DataFrame 224 | ) -> Output[pl.DataFrame]: 225 | """ 226 | Download Video Category from Youtube API by categoryId 227 | """ 228 | data = pl.concat( 229 | [ 230 | bronze_JP_youtube_trending, 231 | bronze_RU_youtube_trending 232 | ] 233 | ) 234 | 235 | pl_data: pl.DataFrame = context \ 236 | .resources \ 237 | .youtube_io_manager \ 238 | .downLoad_videoCategories( 239 | context, data 240 | ) 241 | context.log.info("Download video category from youtube api success") 242 | 243 | return Output( 244 | value=pl_data, 245 | metadata={ 246 | "File Name": MetadataValue.text("videoCategory_trending.pq"), 247 | "Number Columns": MetadataValue.int(pl_data.shape[1]), 248 | "Number Records": MetadataValue.int(pl_data.shape[0]), 249 | } 250 | ) 251 | 252 | 253 | @asset( 254 | ins={ 255 | "bronze_CA_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 256 | "bronze_DE_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 257 | "bronze_IN_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 258 | "bronze_JP_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 259 | "bronze_RU_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]) 260 | }, 261 | name="bronze_youtube_trending", 262 | required_resource_keys={"youtube_io_manager"}, 263 | io_manager_key="minio_io_manager", 264 | key_prefix=["bronze", "youtube"], 265 | compute_kind="Polars", 266 | group_name=GROUP_NAME, 267 | ) 268 | def bronze_youtube_trending(context: AssetExecutionContext, 269 | bronze_CA_youtube_trending: pl.DataFrame, 270 | bronze_DE_youtube_trending: pl.DataFrame, 271 | bronze_IN_youtube_trending: pl.DataFrame, 272 | bronze_JP_youtube_trending: pl.DataFrame, 273 | bronze_RU_youtube_trending: pl.DataFrame 274 | ) -> Output[pl.DataFrame]: 275 | """ """ 276 | 277 | pl_data = pl.concat( 278 | [ 279 | bronze_CA_youtube_trending, 280 | bronze_DE_youtube_trending, 281 | bronze_IN_youtube_trending, 282 | bronze_JP_youtube_trending, 283 | bronze_RU_youtube_trending 284 | ] 285 | ) 286 | 287 | # 2020-08-11T16:34:06Z 288 | pl_data = pl_data.with_columns(pl.col('publishedAt').apply(lambda e: e.replace('T', ' ').replace('Z', ''))) 289 | pl_data = pl_data.with_columns(pl.col("publishedAt").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S")) 290 | 291 | return Output( 292 | value=pl_data, 293 | metadata={ 294 | "File Name": MetadataValue.text("youtube_trending.pq"), 295 | "Number Columns": MetadataValue.int(pl_data.shape[1]), 296 | "Number Records": MetadataValue.int(pl_data.shape[0]), 297 | } 298 | ) -------------------------------------------------------------------------------- 
/etl_pipeline/etl_pipeline/assets/silver.py: -------------------------------------------------------------------------------- 1 | import os 2 | import polars as pl 3 | import pandas as pd 4 | from datetime import datetime 5 | 6 | from pyspark.sql import SparkSession, DataFrame 7 | from pyspark.sql.types import IntegerType, StringType 8 | from pyspark.sql.functions import udf, to_timestamp, count 9 | from pyspark.sql.functions import when, col, concat, lit 10 | 11 | from ..partitions import monthly_partitions 12 | from ..func_process import replace_str, format_date, convert 13 | from ..resources.spark_io_manager import create_spark_session 14 | 15 | from dagster import ( 16 | AssetExecutionContext, 17 | MetadataValue, 18 | AssetIn, 19 | AssetIn, 20 | Output, 21 | asset 22 | ) 23 | 24 | 25 | GROUP_NAME = "silver" 26 | 27 | @asset( 28 | ins={ 29 | "bronze_videoCategory_trending": AssetIn( 30 | key_prefix=["bronze", "youtube"] 31 | ) 32 | }, 33 | name="silver_videoCategory_cleaned", 34 | required_resource_keys={"spark_io_manager"}, 35 | io_manager_key="spark_io_manager", 36 | key_prefix=["silver", "youtube"], 37 | compute_kind="PySpark", 38 | group_name=GROUP_NAME 39 | ) 40 | def silver_videoCategory_cleaned(context: AssetExecutionContext, 41 | bronze_videoCategory_trending: pl.DataFrame 42 | ) -> Output[DataFrame]: 43 | """ 44 | Clean 'videoCategory_trending_data' and load to silver layer in MinIO 45 | """ 46 | # spark: SparkSession = context.resources.spark_io_manager.get_spark_session( 47 | # context, "silver_videoCategory_cleaned-{}".format(datetime.today()) 48 | # ) 49 | CONFIG = { 50 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 51 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 52 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 53 | } 54 | 55 | with create_spark_session( 56 | CONFIG, "silver_videoCategory_cleaned-{}".format(datetime.today()) 57 | ) as spark: 58 | 59 | # Convert from polars dataframe to pyspark dataframe 60 | spark_df: DataFrame = spark.createDataFrame(bronze_videoCategory_trending.to_pandas()) 61 | # Convert data type from string to integer of categoryId column 62 | spark_df = spark_df.withColumn("categoryId", spark_df["categoryId"].cast(IntegerType())) 63 | # Sorted dataframe by categoryId column 64 | spark_df = spark_df.orderBy(spark_df["categoryId"]) 65 | # polars_df = pl.DataFrame(spark_df.toPandas()) 66 | context.log.info(f"Cleaning for {context.asset_key.path[-1]} success 🙂") 67 | 68 | return Output( 69 | value=spark_df, 70 | metadata={ 71 | "File Name": MetadataValue.text("videoCategory_cleaned.pq"), 72 | "Number Columns": MetadataValue.int(len(spark_df.columns)), 73 | "Number Records": MetadataValue.int(spark_df.count()) 74 | } 75 | ) 76 | 77 | 78 | @asset( 79 | ins={ 80 | "bronze_linkVideos_trending": AssetIn(key_prefix=["bronze", "youtube"]), 81 | "bronze_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 82 | }, 83 | name="silver_linkVideos_cleaned", 84 | required_resource_keys={"spark_io_manager", "youtube_io_manager"}, 85 | io_manager_key="spark_io_manager", 86 | key_prefix=["silver", "youtube"], 87 | compute_kind="PySpark", 88 | group_name=GROUP_NAME 89 | ) 90 | def silver_linkVideos_cleaned(context: AssetExecutionContext, 91 | bronze_linkVideos_trending: pl.DataFrame, 92 | bronze_youtube_trending: pl.DataFrame 93 | ) -> Output[DataFrame]: 94 | """ 95 | Clean 'linkVideos_trending_data' and load to silver layer in MinIO 96 | """ 97 | # spark: SparkSession = context.resources.spark_io_manager.get_spark_session( 98 | # 
context, "silver_linkVideos_cleaned-{}".format(datetime.today()) 99 | # ) 100 | 101 | CONFIG = { 102 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 103 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 104 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 105 | } 106 | 107 | with create_spark_session( 108 | CONFIG, "silver_linkVideos_cleaned-{}".format(datetime.today()) 109 | ) as spark: 110 | 111 | # Convert from polars dataframe to pyspark dataframe for linkVideos 112 | linkVideos: DataFrame = spark.createDataFrame(bronze_linkVideos_trending.to_pandas()) 113 | # Convert from polars dataframe to pyspark dataframe for trending 114 | trending: DataFrame = spark.createDataFrame(bronze_youtube_trending.to_pandas()) 115 | # Drop duplicates by video_id for trending 116 | trending = trending.dropDuplicates(["video_id"]) 117 | # Convert the link to the correct format 118 | link_format = udf(convert, StringType()) 119 | linkVideos = linkVideos.withColumn("link_video", link_format(linkVideos['link_video'])) 120 | # Join two dataframe by video_id 121 | spark_df = linkVideos.join( 122 | trending, 123 | linkVideos["videoId"] == trending["video_id"], 124 | how="outer", 125 | ).select(trending.video_id, linkVideos.link_video) 126 | spark_df.cache() 127 | 128 | # fill NA for link video 129 | spark_df = spark_df.withColumn("link_video",when( 130 | col("link_video").isNull(), 131 | concat(lit("www.youtube.com/embed/"), 132 | col("video_id"))).otherwise(col("link_video")) 133 | ) 134 | context.log.info(f"Cleaning for {context.asset_key.path[-1]} success 🙂") 135 | 136 | spark_df.unpersist() 137 | 138 | # trending = pl.concat( 139 | # [ 140 | # silver_youtube_trending_01, 141 | # silver_youtube_trending_02 142 | # ] 143 | # ) 144 | # bronze_linkVideos_trending = bronze_linkVideos_trending.with_columns( 145 | # pl.col('link_video').apply(lambda e: e.replace('"', '')) 146 | # ) 147 | # bronze_youtube_trending = bronze_youtube_trending.unique(subset=["video_id"]) 148 | # polars_df = bronze_linkVideos_trending.join( 149 | # bronze_youtube_trending, 150 | # left_on="videoId", 151 | # right_on="video_id", 152 | # how="outer" 153 | # ).select(["video_id", "link_video"]) 154 | 155 | # polars_df = polars_df.with_columns( 156 | # pl.when(pl.col("link_video").is_null()).then(pl.format("www.youtube.com/embed/{}", pl.col("video_id"))) 157 | # .otherwise(pl.col("link_video")).alias("link_video") 158 | # ) 159 | # context.log.info(f"Cleaning for {context.asset_key.path[-1]} success 🙂") 160 | 161 | return Output( 162 | value=spark_df, 163 | metadata={ 164 | "File Name": MetadataValue.text("linkVideos_cleaned.pq"), 165 | "Number Columns": MetadataValue.int(len(spark_df.columns)), 166 | "Number Records": MetadataValue.int(spark_df.count()) 167 | } 168 | ) 169 | 170 | 171 | @asset( 172 | ins={ 173 | "bronze_youtube_trending": AssetIn(key_prefix=["bronze", "youtube"]), 174 | }, 175 | name="silver_trending_cleaned", 176 | required_resource_keys={"spark_io_manager"}, 177 | io_manager_key="spark_io_manager", 178 | key_prefix=["silver", "youtube"], 179 | partitions_def=monthly_partitions, 180 | compute_kind="PySpark", 181 | group_name=GROUP_NAME 182 | ) 183 | def silver_trending_cleaned(context: AssetExecutionContext, 184 | bronze_youtube_trending: pl.DataFrame, 185 | ) -> Output[DataFrame]: 186 | """ 187 | Clean 'bronze_youtube_trending_data' and load to silver layer in MinIO 188 | """ 189 | 190 | try: 191 | partition_date_str = context.asset_partition_key_for_output() 192 | data_by_publishedAt = 
bronze_youtube_trending.filter( 193 | (pl.col("publishedAt").dt.year() == int(partition_date_str[:4])) & 194 | (pl.col("publishedAt").dt.month() == int(partition_date_str[5:7])) 195 | ) 196 | except Exception as e: 197 | raise Exception(f"{e}") 198 | 199 | # data_by_publishedAt = data_by_publishedAt.with_columns( 200 | # pl.col('trending_date').apply(lambda e: e.replace('T', ' ').replace('Z', '')) 201 | # ) 202 | # data_by_publishedAt = data_by_publishedAt.with_columns( 203 | # pl.col("trending_date").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S") 204 | # ) 205 | # data_by_publishedAt = data_by_publishedAt.with_columns( 206 | # pl.when(pl.col("thumbnail_link").is_not_null()) 207 | # .then(pl.col("thumbnail_link").str.replace("default.jpg", "maxresdefault.jpg")) 208 | # .otherwise(pl.col("thumbnail_link")).alias("thumbnail_link") 209 | # ) 210 | 211 | # data_by_publishedAt = data_by_publishedAt.with_columns( 212 | # pl.col("comment_count").str.parse_int(10, strict=False) 213 | # ) 214 | # data_by_publishedAt = data_by_publishedAt.filter(pl.col("comment_count").is_not_null()) 215 | 216 | # data_by_publishedAt = data_by_publishedAt.with_columns( 217 | # pl.col('tags').apply(lambda e: e.replace('|', ' #').replace('Z', '')) 218 | # ) #Squeezie arnaque #Squeezie tableau #Squeezie thread #Squeezie art #Squeezie arnaqueur 219 | 220 | # data_by_publishedAt = data_by_publishedAt.with_columns( 221 | # (pl.col('tags').apply(lambda x: f"#{x}")) 222 | # ) 223 | 224 | # data_by_publishedAt = data_by_publishedAt.with_columns([ 225 | # pl.col("categoryId").cast(pl.Int64), 226 | # pl.col("view_count").cast(pl.Int64), 227 | # pl.col("likes").cast(pl.Int64), 228 | # pl.col("dislikes").cast(pl.Int64), 229 | # pl.col("comment_count").cast(pl.Int64) 230 | # ]) 231 | # context.log.info(f"Cleaning for {context.asset_key.path[-1]} success 🙂") 232 | 233 | # polars_df: pl.DataFrame = data_by_publishedAt 234 | 235 | # spark: SparkSession = context.resources.spark_io_manager.get_spark_session( 236 | # context, "silver_trending_cleaned-{}".format(datetime.today()) 237 | # ) 238 | 239 | CONFIG = { 240 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 241 | "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), 242 | "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), 243 | } 244 | 245 | with create_spark_session( 246 | CONFIG, "silver_trending_cleaned-{}".format(datetime.today()) 247 | ) as spark: 248 | 249 | spark_df: DataFrame = spark.createDataFrame(data_by_publishedAt.to_pandas()) 250 | # publishedAt replace to format date 251 | date_format = udf(format_date, StringType()) 252 | # spark_df = spark_df.withColumn("publishedAt", date_format(spark_df["publishedAt"])) 253 | # Convert date type of column publishedAt to datetime data type 254 | spark_df = spark_df.withColumn("publishedAt", to_timestamp("publishedAt")) 255 | # Convert date type of column categoryId to integer data type 256 | spark_df = spark_df.withColumn("categoryId", spark_df["categoryId"].cast(IntegerType())) 257 | # trending_date replace to format date 258 | spark_df = spark_df.withColumn("trending_date", date_format(spark_df["trending_date"])) 259 | # Convert date type of column trending_date to datetime data type 260 | spark_df = spark_df.withColumn("trending_date", to_timestamp("trending_date")) 261 | # Convert date type of column view_count to integer data type 262 | spark_df = spark_df.withColumn("view_count", spark_df["view_count"].cast(IntegerType())) 263 | # Convert date type of column likes to integer data type 264 | spark_df = 
spark_df.withColumn("likes", spark_df["likes"].cast(IntegerType())) 265 | # Convert date type of column dislikes to integer data type 266 | spark_df = spark_df.withColumn("dislikes", spark_df["dislikes"].cast(IntegerType())) 267 | # Convert date type of column comment_count to integer data type 268 | spark_df = spark_df.withColumn("comment_count", spark_df["comment_count"].cast(IntegerType())) 269 | # thumbnail_link replace from default to maxresdefault 270 | link_convert = udf(replace_str, StringType()) 271 | spark_df = spark_df.withColumn("thumbnail_link", link_convert(spark_df["thumbnail_link"])) 272 | # context.log.info(f"Data: {spark_df.show(5)}") 273 | spark_df.unpersist() 274 | context.log.info(f"Cleaning for {context.asset_key.path[-1]} success 🙂") 275 | # polars_df = pl.DataFrame(spark_df.toPandas()) 276 | 277 | return Output( 278 | value=spark_df, 279 | metadata={ 280 | "file name": MetadataValue.text(f"{partition_date_str[:7]}.pq"), 281 | "Records": MetadataValue.int(spark_df.count()), 282 | "Columns": MetadataValue.int(len(spark_df.columns)) 283 | } 284 | ) -------------------------------------------------------------------------------- /public/notebooks/Preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "0wCyW8k45E-a", 6 | "metadata": { 7 | "id": "0wCyW8k45E-a" 8 | }, 9 | "source": [ 10 | "# Xử lý trước khi đưa vào Asset" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "6880fc6d", 16 | "metadata": { 17 | "id": "6880fc6d" 18 | }, 19 | "source": [ 20 | "# Import Library" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "id": "8kaoHMK_lY1U", 27 | "metadata": { 28 | "colab": { 29 | "base_uri": "https://localhost:8080/" 30 | }, 31 | "id": "8kaoHMK_lY1U", 32 | "outputId": "10480db2-cc8d-44ef-8fda-0c3bba66971f" 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Mounted at /content/drive\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "from google.colab import drive\n", 45 | "drive.mount('/content/drive')" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "id": "6-BAYjQS2NzN", 52 | "metadata": { 53 | "id": "6-BAYjQS2NzN" 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "from pyspark.sql import SparkSession, DataFrame\n", 58 | "from pyspark.sql.types import IntegerType, StringType\n", 59 | "from pyspark.sql.functions import udf, to_timestamp" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 6, 65 | "id": "45adf8f6-0480-49af-966b-d1dfefa29ab3", 66 | "metadata": { 67 | "id": "45adf8f6-0480-49af-966b-d1dfefa29ab3" 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "# Create SparkSession object\n", 72 | "spark = SparkSession.builder \\\n", 73 | " .appName(\"HomeWork-W6\") \\\n", 74 | " .getOrCreate()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 8, 80 | "id": "dhgY_ug22qoX", 81 | "metadata": { 82 | "id": "dhgY_ug22qoX" 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "spark_df = spark.read.parquet(\"/content/drive/MyDrive/Colab Notebooks/202204.pq\")" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 9, 92 | "id": "dY1y4YBf2zox", 93 | "metadata": { 94 | "colab": { 95 | "base_uri": "https://localhost:8080/" 96 | }, 97 | "id": "dY1y4YBf2zox", 98 | "outputId": "329f8ff8-632c-4018-e840-94cf7f8da43d" 99 | }, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": 
"stream", 104 | "text": [ 105 | "+-----------+--------------------+--------------------+--------------------+---------------+----------+--------------------+--------------------+----------+-----+--------+-------------+--------------------+-----------------+----------------+\n", 106 | "| video_id| title| publishedAt| channelId| channelTitle|categoryId| trending_date| tags|view_count|likes|dislikes|comment_count| thumbnail_link|comments_disabled|ratings_disabled|\n", 107 | "+-----------+--------------------+--------------------+--------------------+---------------+----------+--------------------+--------------------+----------+-----+--------+-------------+--------------------+-----------------+----------------+\n", 108 | "|zoHGxJKjC_Y|Heiratsantrag, di...|2022-04-01T16:36:48Z|UCm3_j4RLEzgMovQT...| Drachen Lord| 24|2022-04-02T00:00:00Z|drachenlord origi...| 126194| 4922| 0| 2069|https://i.ytimg.c...| False| False|\n", 109 | "|s38-OigKoIU|Nachgefragt: Panz...|2022-04-01T11:03:23Z|UClCZul-nK9h8eVo7...| Bundeswehr| 25|2022-04-02T00:00:00Z|Bundeswehr|Soldat...| 345217|10056| 0| 1927|https://i.ytimg.c...| False| False|\n", 110 | "|fn_DBhbEscA|Aprilscherze in d...|2022-04-01T12:00:31Z|UC6UrlhHQXm9tWhZc...| How2Shirli| 22|2022-04-02T00:00:00Z| [None]| 353375|47638| 0| 517|https://i.ytimg.c...| False| False|\n", 111 | "|JpiJT7lLuAE| MOIN GERHARD!|2022-04-01T12:33:48Z|UC3oj6YrK6Tj3tR6-...| Tom Stein| 24|2022-04-02T00:00:00Z| [None]| 67361| 6114| 0| 435|https://i.ytimg.c...| False| False|\n", 112 | "|u_D9tg3cK1w|Saltatio Mortis f...|2022-04-01T10:01:34Z|UCDGhwUyQMvcNqz15...|Saltatio Mortis| 24|2022-04-02T00:00:00Z|Saltatio Morits|H...| 34958| 572| 0| 163|https://i.ytimg.c...| False| False|\n", 113 | "+-----------+--------------------+--------------------+--------------------+---------------+----------+--------------------+--------------------+----------+-----+--------+-------------+--------------------+-----------------+----------------+\n", 114 | "only showing top 5 rows\n", 115 | "\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "spark_df.show(5)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 10, 126 | "id": "v7aGpy5328MN", 127 | "metadata": { 128 | "colab": { 129 | "base_uri": "https://localhost:8080/" 130 | }, 131 | "id": "v7aGpy5328MN", 132 | "outputId": "fed2732f-d0c5-4e72-88e7-53360849b9e5" 133 | }, 134 | "outputs": [ 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "root\n", 140 | " |-- video_id: string (nullable = true)\n", 141 | " |-- title: string (nullable = true)\n", 142 | " |-- publishedAt: string (nullable = true)\n", 143 | " |-- channelId: string (nullable = true)\n", 144 | " |-- channelTitle: string (nullable = true)\n", 145 | " |-- categoryId: string (nullable = true)\n", 146 | " |-- trending_date: string (nullable = true)\n", 147 | " |-- tags: string (nullable = true)\n", 148 | " |-- view_count: string (nullable = true)\n", 149 | " |-- likes: string (nullable = true)\n", 150 | " |-- dislikes: string (nullable = true)\n", 151 | " |-- comment_count: string (nullable = true)\n", 152 | " |-- thumbnail_link: string (nullable = true)\n", 153 | " |-- comments_disabled: string (nullable = true)\n", 154 | " |-- ratings_disabled: string (nullable = true)\n", 155 | "\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "spark_df.printSchema()" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 7, 166 | "id": "tQW9x4yi2Fta", 167 | "metadata": { 168 | "id": "tQW9x4yi2Fta" 169 | }, 170 | "outputs": [], 
171 | "source": [ 172 | "def replace_str(value: str):\n", 173 | " return value.replace(\"default\", \"maxresdefault\")\n", 174 | "\n", 175 | "def format_date(value: str):\n", 176 | " return value.replace(\"T\", \" \").replace(\"Z\", \"\")" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 11, 182 | "id": "Wcl-xS4r2Fwq", 183 | "metadata": { 184 | "id": "Wcl-xS4r2Fwq" 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "date_format = udf(format_date, StringType())\n", 189 | "spark_df = spark_df.withColumn(\"publishedAt\", date_format(spark_df[\"publishedAt\"]))" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 12, 195 | "id": "r6hpuRoT3he6", 196 | "metadata": { 197 | "colab": { 198 | "base_uri": "https://localhost:8080/" 199 | }, 200 | "id": "r6hpuRoT3he6", 201 | "outputId": "581b6d63-cf30-4e86-a472-e8844d928326" 202 | }, 203 | "outputs": [ 204 | { 205 | "name": "stdout", 206 | "output_type": "stream", 207 | "text": [ 208 | "+-------------------+\n", 209 | "| publishedAt|\n", 210 | "+-------------------+\n", 211 | "|2022-04-01 16:36:48|\n", 212 | "|2022-04-01 11:03:23|\n", 213 | "+-------------------+\n", 214 | "only showing top 2 rows\n", 215 | "\n" 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "spark_df.select(\"publishedAt\").show(2)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 13, 226 | "id": "AN2oaZJm2F5E", 227 | "metadata": { 228 | "id": "AN2oaZJm2F5E" 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "# Convert date type of column publishedAt to datetime data type\n", 233 | "spark_df = spark_df.withColumn(\"publishedAt\", to_timestamp(\"publishedAt\"))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 14, 239 | "id": "wOsobTLN2F7Q", 240 | "metadata": { 241 | "id": "wOsobTLN2F7Q" 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "# Convert date type of column categoryId to integer data type\n", 246 | "spark_df = spark_df.withColumn(\"categoryId\", spark_df[\"categoryId\"].cast(IntegerType()))" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 15, 252 | "id": "g6FcJJ1N2F-O", 253 | "metadata": { 254 | "id": "g6FcJJ1N2F-O" 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "# trending_date replace to format date\n", 259 | "spark_df = spark_df.withColumn(\"trending_date\", date_format(spark_df[\"trending_date\"]))" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 16, 265 | "id": "JSNG_4MW4ELz", 266 | "metadata": { 267 | "colab": { 268 | "base_uri": "https://localhost:8080/" 269 | }, 270 | "id": "JSNG_4MW4ELz", 271 | "outputId": "c16f96a7-9267-40da-c2d1-7b0f429fc15c" 272 | }, 273 | "outputs": [ 274 | { 275 | "name": "stdout", 276 | "output_type": "stream", 277 | "text": [ 278 | "+-------------------+\n", 279 | "| trending_date|\n", 280 | "+-------------------+\n", 281 | "|2022-04-02 00:00:00|\n", 282 | "|2022-04-02 00:00:00|\n", 283 | "+-------------------+\n", 284 | "only showing top 2 rows\n", 285 | "\n" 286 | ] 287 | } 288 | ], 289 | "source": [ 290 | "spark_df.select(\"trending_date\").show(2)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 17, 296 | "id": "4bovaYHZ2GBQ", 297 | "metadata": { 298 | "id": "4bovaYHZ2GBQ" 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "# Convert date type of column trending_date to datetime data type\n", 303 | "spark_df = spark_df.withColumn(\"trending_date\", to_timestamp(\"trending_date\"))" 304 | ] 305 | }, 306 | { 307 | "cell_type": 
"code", 308 | "execution_count": 18, 309 | "id": "xlQ7QkYJ2GEA", 310 | "metadata": { 311 | "id": "xlQ7QkYJ2GEA" 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "# Convert date type of column view_count to integer data type\n", 316 | "spark_df = spark_df.withColumn(\"view_count\", spark_df[\"view_count\"].cast(IntegerType()))" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 19, 322 | "id": "Inflh60t2GGj", 323 | "metadata": { 324 | "id": "Inflh60t2GGj" 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "# Convert date type of column likes to integer data type\n", 329 | "spark_df = spark_df.withColumn(\"likes\", spark_df[\"likes\"].cast(IntegerType()))" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 20, 335 | "id": "jxJWX5ox2GLU", 336 | "metadata": { 337 | "id": "jxJWX5ox2GLU" 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "# Convert date type of column dislikes to integer data type\n", 342 | "spark_df = spark_df.withColumn(\"dislikes\", spark_df[\"dislikes\"].cast(IntegerType()))" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 21, 348 | "id": "ZcSxAoyQ3ZTo", 349 | "metadata": { 350 | "id": "ZcSxAoyQ3ZTo" 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "# Convert date type of column comment_count to integer data type\n", 355 | "spark_df = spark_df.withColumn(\"comment_count\", spark_df[\"comment_count\"].cast(IntegerType()))" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 22, 361 | "id": "sR2x4HY83ZWz", 362 | "metadata": { 363 | "id": "sR2x4HY83ZWz" 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "# thumbnail_link replace from default to maxresdefault\n", 368 | "link_convert = udf(replace_str, StringType())\n", 369 | "spark_df = spark_df.withColumn(\"thumbnail_link\", link_convert(spark_df[\"thumbnail_link\"]))" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 27, 375 | "id": "m8REaetN3ZZU", 376 | "metadata": { 377 | "colab": { 378 | "base_uri": "https://localhost:8080/", 379 | "height": 36 380 | }, 381 | "id": "m8REaetN3ZZU", 382 | "outputId": "893811ff-3c5a-410a-ea90-96d637af1ba2" 383 | }, 384 | "outputs": [ 385 | { 386 | "data": { 387 | "application/vnd.google.colaboratory.intrinsic+json": { 388 | "type": "string" 389 | }, 390 | "text/plain": [ 391 | "'https://i.ytimg.com/vi/EfP1h_3u0Lk/maxresdefault.jpg'" 392 | ] 393 | }, 394 | "execution_count": 27, 395 | "metadata": {}, 396 | "output_type": "execute_result" 397 | } 398 | ], 399 | "source": [ 400 | "spark_df.select(\"thumbnail_link\").collect()[17][0]" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 28, 406 | "id": "PlOr7qhb3Zbh", 407 | "metadata": { 408 | "colab": { 409 | "base_uri": "https://localhost:8080/" 410 | }, 411 | "id": "PlOr7qhb3Zbh", 412 | "outputId": "f49d0243-e8c7-4d80-eca1-154313a60f3f" 413 | }, 414 | "outputs": [ 415 | { 416 | "name": "stdout", 417 | "output_type": "stream", 418 | "text": [ 419 | "root\n", 420 | " |-- video_id: string (nullable = true)\n", 421 | " |-- title: string (nullable = true)\n", 422 | " |-- publishedAt: timestamp (nullable = true)\n", 423 | " |-- channelId: string (nullable = true)\n", 424 | " |-- channelTitle: string (nullable = true)\n", 425 | " |-- categoryId: integer (nullable = true)\n", 426 | " |-- trending_date: timestamp (nullable = true)\n", 427 | " |-- tags: string (nullable = true)\n", 428 | " |-- view_count: integer (nullable = true)\n", 429 | " |-- likes: integer (nullable = 
true)\n", 430 | " |-- dislikes: integer (nullable = true)\n", 431 | " |-- comment_count: integer (nullable = true)\n", 432 | " |-- thumbnail_link: string (nullable = true)\n", 433 | " |-- comments_disabled: string (nullable = true)\n", 434 | " |-- ratings_disabled: string (nullable = true)\n", 435 | "\n" 436 | ] 437 | } 438 | ], 439 | "source": [ 440 | "# Check\n", 441 | "spark_df.printSchema()" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "id": "0npxdqo03Zdr", 448 | "metadata": { 449 | "id": "0npxdqo03Zdr" 450 | }, 451 | "outputs": [], 452 | "source": [] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "id": "vJWbPWHg3ZgW", 458 | "metadata": { 459 | "id": "vJWbPWHg3ZgW" 460 | }, 461 | "outputs": [], 462 | "source": [] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "id": "lrfACvfa2GO2", 468 | "metadata": { 469 | "id": "lrfACvfa2GO2" 470 | }, 471 | "outputs": [], 472 | "source": [] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "id": "oMo9Hmry2GXW", 478 | "metadata": { 479 | "id": "oMo9Hmry2GXW" 480 | }, 481 | "outputs": [], 482 | "source": [] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "id": "bb609589-83c8-45c5-b4ce-371458647e8c", 488 | "metadata": { 489 | "id": "bb609589-83c8-45c5-b4ce-371458647e8c" 490 | }, 491 | "outputs": [], 492 | "source": [ 493 | "from pyspark.sql.functions import *\n", 494 | "from pyspark.sql import SparkSession\n", 495 | "from pyspark.sql.functions import round\n", 496 | "from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Bucketizer\n", 497 | "from pyspark.ml.regression import LinearRegression" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "id": "c9fe1d06-d632-401b-b98b-c59d1198685b", 504 | "metadata": { 505 | "id": "c9fe1d06-d632-401b-b98b-c59d1198685b" 506 | }, 507 | "outputs": [], 508 | "source": [ 509 | "import seaborn as sns\n", 510 | "import matplotlib.pyplot as plt\n", 511 | "\n", 512 | "%matplotlib inline\n", 513 | "import warnings\n", 514 | "warnings.filterwarnings('ignore')" 515 | ] 516 | }, 517 | { 518 | "cell_type": "markdown", 519 | "id": "c1271c1f", 520 | "metadata": { 521 | "id": "c1271c1f" 522 | }, 523 | "source": [ 524 | "# Read Data" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": null, 530 | "id": "801e7626", 531 | "metadata": { 532 | "id": "801e7626" 533 | }, 534 | "outputs": [], 535 | "source": [ 536 | "# Read data from CSV file\n", 537 | "df = spark.read.csv('/content/drive/MyDrive/Colab Notebooks/properties_2016.csv', sep=',', header=True, inferSchema=True, nullValue='NA')" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "id": "968d018d", 544 | "metadata": { 545 | "colab": { 546 | "base_uri": "https://localhost:8080/" 547 | }, 548 | "id": "968d018d", 549 | "outputId": "c0050aba-eb23-482d-b4d9-ad85826b8ddf" 550 | }, 551 | "outputs": [ 552 | { 553 | "name": "stdout", 554 | "output_type": "stream", 555 | "text": [ 556 | "Dataset have 2985217 records\n" 557 | ] 558 | } 559 | ], 560 | "source": [ 561 | "# Get number of records\n", 562 | "print(f\"Dataset have {df.count()} records\")" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "id": "ea7b3db3-8682-4bac-a2ec-cd0324cb19cd", 569 | "metadata": { 570 | "colab": { 571 | "base_uri": "https://localhost:8080/" 572 | }, 573 | "id": 
"ea7b3db3-8682-4bac-a2ec-cd0324cb19cd", 574 | "outputId": "c71a1c3a-6004-4e52-9cbd-b05ed8b71e61" 575 | }, 576 | "outputs": [ 577 | { 578 | "name": "stdout", 579 | "output_type": "stream", 580 | "text": [ 581 | "column: 58\n", 582 | "row: 2985217\n" 583 | ] 584 | } 585 | ], 586 | "source": [ 587 | "# Get Shape dataset\n", 588 | "print(f\"column: {len(df.columns)}\\nrow: {df.count()}\")" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "id": "7ec68403", 595 | "metadata": { 596 | "colab": { 597 | "base_uri": "https://localhost:8080/" 598 | }, 599 | "id": "7ec68403", 600 | "outputId": "e729d51d-deed-49b5-da2c-cfe3701d5f53" 601 | }, 602 | "outputs": [ 603 | { 604 | "name": "stdout", 605 | "output_type": "stream", 606 | "text": [ 607 | "+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+----+------------+-----------+------------+---------------+--------------+---------------------+--------+----------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 608 | "|parcelid|airconditioningtypeid|architecturalstyletypeid|basementsqft|bathroomcnt|bedroomcnt|buildingclasstypeid|buildingqualitytypeid|calculatedbathnbr|decktypeid|finishedfloor1squarefeet|calculatedfinishedsquarefeet|finishedsquarefeet12|finishedsquarefeet13|finishedsquarefeet15|finishedsquarefeet50|finishedsquarefeet6|fips|fireplacecnt|fullbathcnt|garagecarcnt|garagetotalsqft|hashottuborspa|heatingorsystemtypeid|latitude| longitude|lotsizesquarefeet|poolcnt|poolsizesum|pooltypeid10|pooltypeid2|pooltypeid7|propertycountylandusecode|propertylandusetypeid|propertyzoningdesc|rawcensustractandblock|regionidcity|regionidcounty|regionidneighborhood|regionidzip|roomcnt|storytypeid|threequarterbathnbr|typeconstructiontypeid|unitcnt|yardbuildingsqft17|yardbuildingsqft26|yearbuilt|numberofstories|fireplaceflag|structuretaxvaluedollarcnt|taxvaluedollarcnt|assessmentyear|landtaxvaluedollarcnt|taxamount|taxdelinquencyflag|taxdelinquencyyear|censustractandblock|\n", 609 | 
"+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+----+------------+-----------+------------+---------------+--------------+---------------------+--------+----------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 610 | "|10754147| NULL| NULL| NULL| 0.0| 0.0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|6037| NULL| NULL| NULL| NULL| NULL| NULL|34144442|-118654084| 85768.0| NULL| NULL| NULL| NULL| NULL| 010D| 269| NULL| 6.0378002041E7| 37688| 3101| NULL| 96337| 0.0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| 9.0| 2015| 9.0| NULL| NULL| NULL| NULL|\n", 611 | "|10759547| NULL| NULL| NULL| 0.0| 0.0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|6037| NULL| NULL| NULL| NULL| NULL| NULL|34140430|-118625364| 4083.0| NULL| NULL| NULL| NULL| NULL| 0109| 261| LCA11*| 6.0378001011002E7| 37688| 3101| NULL| 96337| 0.0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| 27516.0| 2015| 27516.0| NULL| NULL| NULL| NULL|\n", 612 | "|10843547| NULL| NULL| NULL| 0.0| 0.0| NULL| NULL| NULL| NULL| NULL| 73026.0| NULL| NULL| 73026| NULL| NULL|6037| NULL| NULL| NULL| NULL| NULL| NULL|33989359|-118394633| 63085.0| NULL| NULL| NULL| NULL| NULL| 1200| 47| LAC2| 6.0377030012017E7| 51617| 3101| NULL| 96095| 0.0| NULL| NULL| NULL| 2| NULL| NULL| NULL| NULL| NULL| 650756.0| 1413387.0| 2015| 762631.0| 20800.37| NULL| NULL| NULL|\n", 613 | "|10859147| NULL| NULL| NULL| 0.0| 0.0| 3| 7| NULL| NULL| NULL| 5068.0| NULL| NULL| 5068| NULL| NULL|6037| NULL| NULL| NULL| NULL| NULL| NULL|34148863|-118437206| 7521.0| NULL| NULL| NULL| NULL| NULL| 1200| 47| LAC2| 6.0371412023001E7| 12447| 3101| 27080| 96424| 0.0| NULL| NULL| NULL| NULL| NULL| NULL| 1948.0| 1| NULL| 571346.0| 1156834.0| 2015| 585488.0| 14557.57| NULL| NULL| NULL|\n", 614 | "|10879947| NULL| NULL| NULL| 0.0| 0.0| 4| NULL| NULL| NULL| NULL| 1776.0| NULL| NULL| 1776| NULL| NULL|6037| NULL| NULL| NULL| NULL| NULL| NULL|34194168|-118385816| 8512.0| NULL| NULL| NULL| NULL| NULL| 1210| 31| LAM1| 6.0371232052003E7| 12447| 3101| 46795| 96450| 0.0| NULL| NULL| NULL| 1| NULL| NULL| 1947.0| NULL| NULL| 193796.0| 433491.0| 2015| 239695.0| 5725.17| NULL| NULL| NULL|\n", 615 | 
"+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+----+------------+-----------+------------+---------------+--------------+---------------------+--------+----------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 616 | "only showing top 5 rows\n", 617 | "\n" 618 | ] 619 | } 620 | ], 621 | "source": [ 622 | "# View five records data\n", 623 | "df.show(5)" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "id": "32d76054", 630 | "metadata": { 631 | "colab": { 632 | "base_uri": "https://localhost:8080/" 633 | }, 634 | "id": "32d76054", 635 | "outputId": "cf3b6c89-2ef3-47b6-f4e3-a50eb579ae5d" 636 | }, 637 | "outputs": [ 638 | { 639 | "name": "stdout", 640 | "output_type": "stream", 641 | "text": [ 642 | "root\n", 643 | " |-- parcelid: integer (nullable = true)\n", 644 | " |-- airconditioningtypeid: integer (nullable = true)\n", 645 | " |-- architecturalstyletypeid: integer (nullable = true)\n", 646 | " |-- basementsqft: integer (nullable = true)\n", 647 | " |-- bathroomcnt: double (nullable = true)\n", 648 | " |-- bedroomcnt: double (nullable = true)\n", 649 | " |-- buildingclasstypeid: integer (nullable = true)\n", 650 | " |-- buildingqualitytypeid: integer (nullable = true)\n", 651 | " |-- calculatedbathnbr: double (nullable = true)\n", 652 | " |-- decktypeid: integer (nullable = true)\n", 653 | " |-- finishedfloor1squarefeet: integer (nullable = true)\n", 654 | " |-- calculatedfinishedsquarefeet: double (nullable = true)\n", 655 | " |-- finishedsquarefeet12: integer (nullable = true)\n", 656 | " |-- finishedsquarefeet13: integer (nullable = true)\n", 657 | " |-- finishedsquarefeet15: integer (nullable = true)\n", 658 | " |-- finishedsquarefeet50: integer (nullable = true)\n", 659 | " |-- finishedsquarefeet6: integer (nullable = true)\n", 660 | " |-- fips: integer (nullable = true)\n", 661 | " |-- fireplacecnt: integer (nullable = true)\n", 662 | " |-- fullbathcnt: integer (nullable = true)\n", 663 | " |-- garagecarcnt: integer (nullable = true)\n", 664 | " |-- garagetotalsqft: integer (nullable = true)\n", 665 | " |-- hashottuborspa: boolean (nullable = true)\n", 666 | " |-- heatingorsystemtypeid: integer (nullable = true)\n", 667 | " |-- latitude: integer (nullable = true)\n", 668 | " |-- longitude: integer (nullable = true)\n", 669 | " |-- lotsizesquarefeet: double (nullable = true)\n", 670 | " |-- poolcnt: integer (nullable = true)\n", 671 | " |-- poolsizesum: integer (nullable = true)\n", 672 | " |-- pooltypeid10: integer (nullable = true)\n", 673 | " |-- pooltypeid2: integer (nullable = true)\n", 674 | " |-- pooltypeid7: integer (nullable = true)\n", 675 | " |-- propertycountylandusecode: string (nullable = true)\n", 676 | " |-- propertylandusetypeid: integer (nullable = true)\n", 677 | " |-- 
propertyzoningdesc: string (nullable = true)\n", 678 | " |-- rawcensustractandblock: double (nullable = true)\n", 679 | " |-- regionidcity: integer (nullable = true)\n", 680 | " |-- regionidcounty: integer (nullable = true)\n", 681 | " |-- regionidneighborhood: integer (nullable = true)\n", 682 | " |-- regionidzip: integer (nullable = true)\n", 683 | " |-- roomcnt: double (nullable = true)\n", 684 | " |-- storytypeid: integer (nullable = true)\n", 685 | " |-- threequarterbathnbr: integer (nullable = true)\n", 686 | " |-- typeconstructiontypeid: integer (nullable = true)\n", 687 | " |-- unitcnt: integer (nullable = true)\n", 688 | " |-- yardbuildingsqft17: integer (nullable = true)\n", 689 | " |-- yardbuildingsqft26: integer (nullable = true)\n", 690 | " |-- yearbuilt: double (nullable = true)\n", 691 | " |-- numberofstories: integer (nullable = true)\n", 692 | " |-- fireplaceflag: boolean (nullable = true)\n", 693 | " |-- structuretaxvaluedollarcnt: double (nullable = true)\n", 694 | " |-- taxvaluedollarcnt: double (nullable = true)\n", 695 | " |-- assessmentyear: integer (nullable = true)\n", 696 | " |-- landtaxvaluedollarcnt: double (nullable = true)\n", 697 | " |-- taxamount: double (nullable = true)\n", 698 | " |-- taxdelinquencyflag: string (nullable = true)\n", 699 | " |-- taxdelinquencyyear: integer (nullable = true)\n", 700 | " |-- censustractandblock: long (nullable = true)\n", 701 | "\n" 702 | ] 703 | } 704 | ], 705 | "source": [ 706 | "df.printSchema()" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": null, 712 | "id": "65634354", 713 | "metadata": { 714 | "colab": { 715 | "base_uri": "https://localhost:8080/" 716 | }, 717 | "id": "65634354", 718 | "outputId": "5ad24bc2-fef7-4898-bcd0-f401ec43b6c3" 719 | }, 720 | "outputs": [ 721 | { 722 | "name": "stdout", 723 | "output_type": "stream", 724 | "text": [ 725 | "+-------+--------------------+---------------------+------------------------+-----------------+------------------+------------------+-------------------+---------------------+------------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+---------------------+-------------------+--------------------+------------------+-------+------------------+------------+-----------+-----------+-------------------------+---------------------+--------------------+----------------------+-----------------+------------------+--------------------+-----------------+------------------+-----------+-------------------+----------------------+------------------+------------------+------------------+------------------+------------------+--------------------------+------------------+-------------------+---------------------+-----------------+------------------+------------------+--------------------+\n", 726 | "|summary| parcelid|airconditioningtypeid|architecturalstyletypeid| basementsqft| bathroomcnt| bedroomcnt|buildingclasstypeid|buildingqualitytypeid| calculatedbathnbr|decktypeid|finishedfloor1squarefeet|calculatedfinishedsquarefeet|finishedsquarefeet12|finishedsquarefeet13|finishedsquarefeet15|finishedsquarefeet50|finishedsquarefeet6| fips| fireplacecnt| fullbathcnt| garagecarcnt| garagetotalsqft|heatingorsystemtypeid| latitude| longitude| lotsizesquarefeet|poolcnt| 
poolsizesum|pooltypeid10|pooltypeid2|pooltypeid7|propertycountylandusecode|propertylandusetypeid| propertyzoningdesc|rawcensustractandblock| regionidcity| regionidcounty|regionidneighborhood| regionidzip| roomcnt|storytypeid|threequarterbathnbr|typeconstructiontypeid| unitcnt|yardbuildingsqft17|yardbuildingsqft26| yearbuilt| numberofstories|structuretaxvaluedollarcnt| taxvaluedollarcnt| assessmentyear|landtaxvaluedollarcnt| taxamount|taxdelinquencyflag|taxdelinquencyyear| censustractandblock|\n", 727 | "+-------+--------------------+---------------------+------------------------+-----------------+------------------+------------------+-------------------+---------------------+------------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+---------------------+-------------------+--------------------+------------------+-------+------------------+------------+-----------+-----------+-------------------------+---------------------+--------------------+----------------------+-----------------+------------------+--------------------+-----------------+------------------+-----------+-------------------+----------------------+------------------+------------------+------------------+------------------+------------------+--------------------------+------------------+-------------------+---------------------+-----------------+------------------+------------------+--------------------+\n", 728 | "| count| 2985217| 811519| 6061| 1628| 2973755| 2973767| 12629| 1938488| 2856305| 17096| 202717| 2929652| 2709184| 7672| 190798| 202717| 22001| 2973780| 312637| 2856305| 883267| 883267| 1806401| 2973780| 2973780| 2709118| 517534| 27960| 36939| 32075| 485459| 2972940| 2973780| 1978629| 2973780| 2922372| 2973780| 1156402| 2971237| 2973742| 1624| 311631| 6747| 1977490| 80355| 2647| 2925289| 682069| 2930235| 2942667| 2973778| 2917484| 2953967| 56462| 56464| 2910091|\n", 729 | "| mean|1.3325858360229759E7| 1.9311661218036793| 7.202606830556014|646.8832923832924|2.2091427336818263| 3.088948797938776| 3.7259482144271123| 5.784786906083505|2.2992625087306853| 66.0| 1380.6303960693972| 1827.1621236925068| 1760.0006079321302| 1178.900677789364| 2739.18723466703| 1388.9445779091047| 2414.339439116404| 6048.031600185623| 1.1687100375195514|2.2441651014159905|1.8235165584132544| 383.7693573970272| 4.012053248420478|3.400146865372119E7|-1.18201934159426...|22822.805527748147| 1.0| 519.7109799713877| 1.0| 1.0| 1.0| 199.5320980966015| 260.0484285992911| 5.46084705882353E8| 6.048344961635102E7|34993.35022406456|2570.4605535715486| 193476.4074145496|96552.67280025121|1.4750183438912992| 7.0| 1.0100086320038764| 5.999555357936861| 1.181171080511153| 319.8033974239313| 278.2965621458255|1964.2616411575061|1.4014637815235702| 170883.57716599523|420478.99067852396| 2014.999458937419| 252478.02946854208|5377.607139338332| NULL|13.892409322754322|6.048431221257243E13|\n", 730 | "| stddev| 7909966.389233432| 3.148587394577264| 2.436290490710878|538.7934732127098|1.0777537772255268|1.2758587961101613| 0.5017002111297728| 1.8053515795599582|1.0007362395982085| 0.0| 632.8685428862445| 1819.7804693000555| 971.0610103785792| 357.07303551809184| 5447.428327204328| 664.4887085429802| 7695.302951762993|20.232784692561076|0.46127285457915057|0.9912053996708714|0.6100353832595409|245.44341897378612| 3.293732688713869| 
243381.17831128882| 345317.10127200687| 337592.366407657| 0.0|191.32328381052514| 0.0| 0.0| 0.0| 302.825330209084| 15.908166600884176|2.0614819081932812E9| 200811.6754489488| 50727.4653888151| 788.071140066596| 165713.25431675857|3673.175037540778| 2.840402806614331| 0.0|0.11770930082463944| 0.38405027536042613|2.4478959553912745|233.08631396807013| 369.7315077596207|23.441319348584372|0.5390757507737586| 402068.3420150093| 726346.6517993591|0.03683161097766099| 445013.16961781326|9183.107127994226| NULL|2.5810057224984697|3.249034547374049E11|\n", 731 | "| min| 10711725| 1| 2| 20| 0.0| 0.0| 1| 1| 1.0| 66| 3| 1.0| 1| 120| 112| 3| 117| 6037| 1| 1| 0| 0| 1| 33324388| -119475780| 100.0| 1| 19| 1| 1| 1| 0| 31| #12| 6.0371011101E7| 3491| 1286| 6952| 95982| 0.0| 7| 1| 4| 1| 10| 10| 1801.0| 1| 1.0| 1.0| 2000| 1.0| 1.34| Y| 0| -1|\n", 732 | "| max| 169601949| 13| 27| 8516| 20.0| 20.0| 5| 12| 20.0| 66| 31303| 952576.0| 290345| 2688| 820242| 31303| 952576| 6111| 9| 20| 25| 7749| 24| 34819650| -117554316| 3.28263808E8| 1| 17410| 1| 1| 1| SFR| 275| ZONE LCC3| 6.1110091003011E7| 396556| 3101| 764167| 399675| 96.0| 7| 7| 13| 997| 7983| 6141| 2015.0| 41| 2.51486E8| 2.82786E8| 2016| 9.0246219E7| 3458861.12| Y| 99| 483030105084015|\n", 733 | "+-------+--------------------+---------------------+------------------------+-----------------+------------------+------------------+-------------------+---------------------+------------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+---------------------+-------------------+--------------------+------------------+-------+------------------+------------+-----------+-----------+-------------------------+---------------------+--------------------+----------------------+-----------------+------------------+--------------------+-----------------+------------------+-----------+-------------------+----------------------+------------------+------------------+------------------+------------------+------------------+--------------------------+------------------+-------------------+---------------------+-----------------+------------------+------------------+--------------------+\n", 734 | "\n" 735 | ] 736 | } 737 | ], 738 | "source": [ 739 | "df.describe().show()" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "id": "e7f4d289", 745 | "metadata": { 746 | "id": "e7f4d289" 747 | }, 748 | "source": [ 749 | "# Clean Data" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": null, 755 | "id": "eddda1f7", 756 | "metadata": { 757 | "id": "eddda1f7" 758 | }, 759 | "outputs": [], 760 | "source": [ 761 | "# Drop duplicates\n", 762 | "df = df.dropDuplicates()" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "id": "2d651e5b", 769 | "metadata": { 770 | "colab": { 771 | "base_uri": "https://localhost:8080/" 772 | }, 773 | "id": "2d651e5b", 774 | "outputId": "c59bf422-2e51-410e-fa39-b18c726e4e46" 775 | }, 776 | "outputs": [ 777 | { 778 | "name": "stdout", 779 | "output_type": "stream", 780 | "text": [ 781 | 
"+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----+------------+-----------+------------+---------------+--------------+---------------------+--------+---------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 782 | "|parcelid|airconditioningtypeid|architecturalstyletypeid|basementsqft|bathroomcnt|bedroomcnt|buildingclasstypeid|buildingqualitytypeid|calculatedbathnbr|decktypeid|finishedfloor1squarefeet|calculatedfinishedsquarefeet|finishedsquarefeet12|finishedsquarefeet13|finishedsquarefeet15|finishedsquarefeet50|finishedsquarefeet6| fips|fireplacecnt|fullbathcnt|garagecarcnt|garagetotalsqft|hashottuborspa|heatingorsystemtypeid|latitude|longitude|lotsizesquarefeet|poolcnt|poolsizesum|pooltypeid10|pooltypeid2|pooltypeid7|propertycountylandusecode|propertylandusetypeid|propertyzoningdesc|rawcensustractandblock|regionidcity|regionidcounty|regionidneighborhood|regionidzip|roomcnt|storytypeid|threequarterbathnbr|typeconstructiontypeid|unitcnt|yardbuildingsqft17|yardbuildingsqft26|yearbuilt|numberofstories|fireplaceflag|structuretaxvaluedollarcnt|taxvaluedollarcnt|assessmentyear|landtaxvaluedollarcnt|taxamount|taxdelinquencyflag|taxdelinquencyyear|censustractandblock|\n", 783 | "+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----+------------+-----------+------------+---------------+--------------+---------------------+--------+---------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 784 | "| 0| 2173698| 2979156| 2983589| 11462| 11450| 2972588| 1046729| 128912| 2968121| 2782500| 55565| 276033| 2977545| 2794419| 2782500| 2963216|11437| 2672580| 128912| 2101950| 2101950| 2916203| 1178816| 11437| 11437| 276099|2467683| 2957257| 2948278| 2953142| 2499758| 12277| 11437| 1006588| 11437| 62845| 11437| 1828815| 13980| 11475| 2983593| 2673586| 2978470|1007727| 2904862| 2982570| 59928| 2303148| 2980054| 54982| 42550| 11439| 67733| 31250| 2928755| 2928753| 75126|\n", 785 | 
"+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----+------------+-----------+------------+---------------+--------------+---------------------+--------+---------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 786 | "\n" 787 | ] 788 | } 789 | ], 790 | "source": [ 791 | "# Get the missing value of each column\n", 792 | "null_counts = df.select([sum(col(column).isNull().cast(\"int\")).alias(column) for column in df.columns])\n", 793 | "null_counts.show()" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": null, 799 | "id": "c158a9c4-c639-47f5-9e29-6ac483daced6", 800 | "metadata": { 801 | "id": "c158a9c4-c639-47f5-9e29-6ac483daced6" 802 | }, 803 | "outputs": [], 804 | "source": [ 805 | "# Visualize missing value on each column\n", 806 | "pandas_df = df.toPandas()\n", 807 | "missing_count = pandas_df.isna().sum()\n", 808 | "sns.barplot(x=missing_count.index, y=missing_count.values)\n", 809 | "plt.title('Numbers Missing Value on each column')\n", 810 | "plt.xlabel('Column')\n", 811 | "plt.ylabel('Numbers Missing')\n", 812 | "plt.show()" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "id": "4b1b542f", 819 | "metadata": { 820 | "colab": { 821 | "background_save": true 822 | }, 823 | "id": "4b1b542f" 824 | }, 825 | "outputs": [], 826 | "source": [ 827 | "# Drop columns that are more than 60% missing\n", 828 | "def column_dropper(df, threshold):\n", 829 | " total_records = df.count()\n", 830 | " for col in df.columns:\n", 831 | " missing = df.filter(df[col].isNull()).count()\n", 832 | " missing_percent = missing / total_records\n", 833 | " if missing_percent > threshold:\n", 834 | " df = df.drop(col)\n", 835 | " return df\n", 836 | "\n", 837 | "df = column_dropper(df, 0.6)" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "id": "41bd87bb", 844 | "metadata": { 845 | "colab": { 846 | "base_uri": "https://localhost:8080/" 847 | }, 848 | "id": "41bd87bb", 849 | "outputId": "ea48b121-fe86-4c50-cc77-c284beaec8c7" 850 | }, 851 | "outputs": [ 852 | { 853 | "name": "stdout", 854 | "output_type": "stream", 855 | "text": [ 856 | 
"+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----+------------+-----------+------------+---------------+--------------+---------------------+--------+---------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 857 | "|parcelid|airconditioningtypeid|architecturalstyletypeid|basementsqft|bathroomcnt|bedroomcnt|buildingclasstypeid|buildingqualitytypeid|calculatedbathnbr|decktypeid|finishedfloor1squarefeet|calculatedfinishedsquarefeet|finishedsquarefeet12|finishedsquarefeet13|finishedsquarefeet15|finishedsquarefeet50|finishedsquarefeet6| fips|fireplacecnt|fullbathcnt|garagecarcnt|garagetotalsqft|hashottuborspa|heatingorsystemtypeid|latitude|longitude|lotsizesquarefeet|poolcnt|poolsizesum|pooltypeid10|pooltypeid2|pooltypeid7|propertycountylandusecode|propertylandusetypeid|propertyzoningdesc|rawcensustractandblock|regionidcity|regionidcounty|regionidneighborhood|regionidzip|roomcnt|storytypeid|threequarterbathnbr|typeconstructiontypeid|unitcnt|yardbuildingsqft17|yardbuildingsqft26|yearbuilt|numberofstories|fireplaceflag|structuretaxvaluedollarcnt|taxvaluedollarcnt|assessmentyear|landtaxvaluedollarcnt|taxamount|taxdelinquencyflag|taxdelinquencyyear|censustractandblock|\n", 858 | "+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----+------------+-----------+------------+---------------+--------------+---------------------+--------+---------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 859 | "| 0| 2173698| 2979156| 2983589| 11462| 11450| 2972588| 1046729| 128912| 2968121| 2782500| 55565| 276033| 2977545| 2794419| 2782500| 2963216|11437| 2672580| 128912| 2101950| 2101950| 2916203| 1178816| 11437| 11437| 276099|2467683| 2957257| 2948278| 2953142| 2499758| 12277| 11437| 1006588| 11437| 62845| 11437| 1828815| 13980| 11475| 2983593| 2673586| 2978470|1007727| 2904862| 2982570| 59928| 2303148| 2980054| 54982| 42550| 11439| 67733| 31250| 2928755| 2928753| 75126|\n", 860 | 
"+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----+------------+-----------+------------+---------------+--------------+---------------------+--------+---------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+\n", 861 | "\n" 862 | ] 863 | } 864 | ], 865 | "source": [ 866 | "# columns remaining after deletion\n", 867 | "null_counts = df.select([sum(col(column).isNull().cast(\"int\")).alias(column) for column in df.columns])\n", 868 | "null_counts.show()" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": null, 874 | "id": "I_Mx_6U2MJ93", 875 | "metadata": { 876 | "id": "I_Mx_6U2MJ93" 877 | }, 878 | "outputs": [], 879 | "source": [ 880 | "df.show()" 881 | ] 882 | }, 883 | { 884 | "cell_type": "code", 885 | "execution_count": null, 886 | "id": "KsG0VYVsUFAt", 887 | "metadata": { 888 | "id": "KsG0VYVsUFAt" 889 | }, 890 | "outputs": [], 891 | "source": [ 892 | "PARCELID: 0\n", 893 | "BATHROOMCNT: 11462\n", 894 | "BEDROOMCNT: 11450\n", 895 | "BUILDINGQUALITYTYPEID: 1046729\n", 896 | "CALCULATEDBATHNBR: 128912\n", 897 | "CALCULATEDFINISHEDSQUAREFEET: 55565\n", 898 | "FINISHEDSQUAREFEET12: 276033\n", 899 | "FIPS: 11437\n", 900 | "FULLBATHCNT: 128912\n", 901 | "HEATINGORSYSTEMTYPEID: 1178816\n", 902 | "LATITUDE: 11437\n", 903 | "LONGITUDE: 11437\n", 904 | "LOTSIZESQUAREFEET: 276099\n", 905 | "PROPERTYCOUNTYLANDUSECODE: 12277\n", 906 | "PROPERTYLANDUSETYPEID: 11437\n", 907 | "PROPERTYZONINGDESC: 1006588\n", 908 | "RAWCENSUSTRACTANDBLOCK: 11437\n", 909 | "REGIONIDCITY: 62845\n", 910 | "REGIONIDCOUNTY: 11437\n", 911 | "REGIONIDZIP: 13980\n", 912 | "ROOMCNT: 11475\n", 913 | "UNITCNT: 1007727\n", 914 | "YEARBUILT: 59928\n", 915 | "STRUCTURETAXVALUEDOLLARCNT: 54982\n", 916 | "TAXVALUEDOLLARCNT: 42550\n", 917 | "ASSESSMENTYEAR: 11439\n", 918 | "LANDTAXVALUEDOLLARCNT: 67733\n", 919 | "TAXAMOUNT: 31250\n", 920 | "CENSUSTRACTANDBLOCK: 75126" 921 | ] 922 | }, 923 | { 924 | "cell_type": "code", 925 | "execution_count": null, 926 | "id": "f958f97a", 927 | "metadata": { 928 | "colab": { 929 | "base_uri": "https://localhost:8080/" 930 | }, 931 | "id": "f958f97a", 932 | "outputId": "3abae1d5-c133-4199-c41c-065c365b9c9f" 933 | }, 934 | "outputs": [ 935 | { 936 | "data": { 937 | "text/plain": [ 938 | "58" 939 | ] 940 | }, 941 | "execution_count": 23, 942 | "metadata": {}, 943 | "output_type": "execute_result" 944 | } 945 | ], 946 | "source": [ 947 | "# Fill miss value\n", 948 | "values = {\n", 949 | " 'bathroomcnt': 'value1',\n", 950 | " 'bedroomcnt': 'value2',\n", 951 | " 'buildingqualitytypeid': \"\",\n", 952 | " \"CALCULATEDBATHNBR\": 128912,\n", 953 | " \"CALCULATEDFINISHEDSQUAREFEET\": 55565,\n", 954 | " \"FINISHEDSQUAREFEET12\": 276033,\n", 955 | " \"FIPS\": 11437,\n", 956 | " \"FULLBATHCNT\": 128912,\n", 957 | " \"HEATINGORSYSTEMTYPEID\": 
1178816,\n", 958 | " \"LATITUDE\": 11437,\n", 959 | " \"LONGITUDE\": 11437,\n", 960 | " \"LOTSIZESQUAREFEET\": 276099,\n", 961 | " \"PROPERTYCOUNTYLANDUSECODE\": 12277,\n", 962 | " \"PROPERTYLANDUSETYPEID\": 11437,\n", 963 | " \"PROPERTYZONINGDESC\": 1006588,\n", 964 | " \"RAWCENSUSTRACTANDBLOCK\": 11437,\n", 965 | " \"REGIONIDCITY\": 62845,\n", 966 | " \"REGIONIDCOUNTY\": 11437,\n", 967 | " \"REGIONIDZIP\": 13980,\n", 968 | " \"ROOMCNT\": 11475,\n", 969 | " \"UNITCNT\": 1007727,\n", 970 | " \"YEARBUILT\": 59928,\n", 971 | " \"STRUCTURETAXVALUEDOLLARCNT\": 54982,\n", 972 | " \"TAXVALUEDOLLARCNT\": 42550,\n", 973 | " \"ASSESSMENTYEAR\": 11439,\n", 974 | " \"LANDTAXVALUEDOLLARCNT\": 67733,\n", 975 | " \"TAXAMOUNT\": 31250,\n", 976 | " \"CENSUSTRACTANDBLOCK\": 75126\n", 977 | "}\n", 978 | "filled_df = df.fillna(values)" 979 | ] 980 | }, 981 | { 982 | "cell_type": "code", 983 | "execution_count": null, 984 | "id": "OTkG-6cMOxEt", 985 | "metadata": { 986 | "id": "OTkG-6cMOxEt" 987 | }, 988 | "outputs": [], 989 | "source": [ 990 | "null_counts = df.select([sum(col(column).isNull().cast(\"int\")).alias(column) for column in df.columns])\n", 991 | "null_counts.show()" 992 | ] 993 | }, 994 | { 995 | "cell_type": "markdown", 996 | "id": "dnypT2myKfsh", 997 | "metadata": { 998 | "id": "dnypT2myKfsh" 999 | }, 1000 | "source": [ 1001 | "# Feature Engineering" 1002 | ] 1003 | }, 1004 | { 1005 | "cell_type": "code", 1006 | "execution_count": null, 1007 | "id": "3f5d77f7", 1008 | "metadata": { 1009 | "id": "3f5d77f7" 1010 | }, 1011 | "outputs": [], 1012 | "source": [ 1013 | "# One-hot encoding for 'bathroomcnt'\n", 1014 | "encoder_bathroomcnt = OneHotEncoder(inputCols=['bathroomcnt'], outputCols=['bathroomcnt_dummy'])\n", 1015 | "df = encoder_bathroomcnt.fit(df).transform(df)\n" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "code", 1020 | "execution_count": null, 1021 | "id": "5c50d08f", 1022 | "metadata": { 1023 | "id": "5c50d08f" 1024 | }, 1025 | "outputs": [], 1026 | "source": [ 1027 | "# One-hot encoding for 'bedroomcnt'\n", 1028 | "encoder_bedroomcnt = OneHotEncoder(inputCols=['bedroomcnt'], outputCols=['bedroomcnt_dummy'])\n", 1029 | "df = encoder_bedroomcnt.fit(df).transform(df)" 1030 | ] 1031 | }, 1032 | { 1033 | "cell_type": "code", 1034 | "execution_count": null, 1035 | "id": "Z2pj086_RJFl", 1036 | "metadata": { 1037 | "id": "Z2pj086_RJFl" 1038 | }, 1039 | "outputs": [], 1040 | "source": [ 1041 | "# Assemble features into a single vector column\n", 1042 | "assembler = VectorAssembler(inputCols=['roomcnt', 'latitude', 'longitude', 'bathroomcnt_dummy', 'bedroomcnt_dummy'], outputCol='features')\n", 1043 | "df = assembler.transform(df)" 1044 | ] 1045 | }, 1046 | { 1047 | "cell_type": "markdown", 1048 | "id": "ZScPc9wxREPs", 1049 | "metadata": { 1050 | "id": "ZScPc9wxREPs" 1051 | }, 1052 | "source": [ 1053 | "# Build Linear Regression Model" 1054 | ] 1055 | }, 1056 | { 1057 | "cell_type": "code", 1058 | "execution_count": null, 1059 | "id": "3ef37703", 1060 | "metadata": { 1061 | "id": "3ef37703" 1062 | }, 1063 | "outputs": [], 1064 | "source": [ 1065 | "# Split the data\n", 1066 | "train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)" 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "code", 1071 | "execution_count": null, 1072 | "id": "cadb50de", 1073 | "metadata": { 1074 | "id": "cadb50de" 1075 | }, 1076 | "outputs": [], 1077 | "source": [ 1078 | "# Build the model\n", 1079 | "regression = LinearRegression(featuresCol='features', labelCol='duration')\n", 1080 | "model = 
regression.fit(train_data)" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "code", 1085 | "execution_count": null, 1086 | "id": "b56f0ff4", 1087 | "metadata": { 1088 | "id": "b56f0ff4" 1089 | }, 1090 | "outputs": [], 1091 | "source": [ 1092 | "# Make predictions\n", 1093 | "predictions = model.transform(test_data)" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "markdown", 1098 | "id": "FeP14XfNRgzS", 1099 | "metadata": { 1100 | "id": "FeP14XfNRgzS" 1101 | }, 1102 | "source": [ 1103 | "# Evaluate Model" 1104 | ] 1105 | }, 1106 | { 1107 | "cell_type": "code", 1108 | "execution_count": null, 1109 | "id": "9e490664", 1110 | "metadata": { 1111 | "id": "9e490664" 1112 | }, 1113 | "outputs": [], 1114 | "source": [ 1115 | "# Evaluate the model\n", 1116 | "evaluator = RegressionEvaluator(labelCol='duration', metricName='rmse')\n", 1117 | "rmse = evaluator.evaluate(predictions)\n", 1118 | "print(\"Root Mean Square Error (RMSE) on test data =\", rmse)\n", 1119 | "\n", 1120 | "# Print coefficients and intercept for interpretation\n", 1121 | "print(\"Coefficients:\", model.coefficients)\n", 1122 | "print(\"Intercept:\", model.intercept)" 1123 | ] 1124 | }, 1125 | { 1126 | "cell_type": "code", 1127 | "execution_count": null, 1128 | "id": "c2f0476d", 1129 | "metadata": { 1130 | "id": "c2f0476d" 1131 | }, 1132 | "outputs": [], 1133 | "source": [] 1134 | }, 1135 | { 1136 | "cell_type": "code", 1137 | "execution_count": null, 1138 | "id": "25938f77", 1139 | "metadata": { 1140 | "id": "25938f77" 1141 | }, 1142 | "outputs": [], 1143 | "source": [] 1144 | }, 1145 | { 1146 | "cell_type": "code", 1147 | "execution_count": null, 1148 | "id": "a029e08d", 1149 | "metadata": { 1150 | "id": "a029e08d" 1151 | }, 1152 | "outputs": [], 1153 | "source": [] 1154 | }, 1155 | { 1156 | "cell_type": "code", 1157 | "execution_count": null, 1158 | "id": "4a4ad2d6", 1159 | "metadata": { 1160 | "id": "4a4ad2d6" 1161 | }, 1162 | "outputs": [], 1163 | "source": [] 1164 | }, 1165 | { 1166 | "cell_type": "code", 1167 | "execution_count": null, 1168 | "id": "31873dac", 1169 | "metadata": { 1170 | "id": "31873dac" 1171 | }, 1172 | "outputs": [], 1173 | "source": [] 1174 | }, 1175 | { 1176 | "cell_type": "code", 1177 | "execution_count": null, 1178 | "id": "0d65e838", 1179 | "metadata": { 1180 | "id": "0d65e838" 1181 | }, 1182 | "outputs": [], 1183 | "source": [] 1184 | }, 1185 | { 1186 | "cell_type": "code", 1187 | "execution_count": null, 1188 | "id": "041add92", 1189 | "metadata": { 1190 | "id": "041add92" 1191 | }, 1192 | "outputs": [], 1193 | "source": [] 1194 | }, 1195 | { 1196 | "cell_type": "code", 1197 | "execution_count": null, 1198 | "id": "fa5fb063-3ed4-40bb-a566-f9eab2ca520e", 1199 | "metadata": { 1200 | "id": "fa5fb063-3ed4-40bb-a566-f9eab2ca520e" 1201 | }, 1202 | "outputs": [], 1203 | "source": [] 1204 | }, 1205 | { 1206 | "cell_type": "code", 1207 | "execution_count": null, 1208 | "id": "8d651e75-dc8a-4bf4-ba9f-de4e27ddfa04", 1209 | "metadata": { 1210 | "id": "8d651e75-dc8a-4bf4-ba9f-de4e27ddfa04" 1211 | }, 1212 | "outputs": [], 1213 | "source": [] 1214 | }, 1215 | { 1216 | "cell_type": "code", 1217 | "execution_count": null, 1218 | "id": "a4916a6a-f1e8-49c9-850d-f94639c337e7", 1219 | "metadata": { 1220 | "id": "a4916a6a-f1e8-49c9-850d-f94639c337e7" 1221 | }, 1222 | "outputs": [], 1223 | "source": [] 1224 | }, 1225 | { 1226 | "cell_type": "code", 1227 | "execution_count": null, 1228 | "id": "cd1eb575-3b8b-49c2-943f-3f79a908d626", 1229 | "metadata": { 1230 | "id": "cd1eb575-3b8b-49c2-943f-3f79a908d626" 1231 | }, 1232 | 
"outputs": [], 1233 | "source": [] 1234 | }, 1235 | { 1236 | "cell_type": "code", 1237 | "execution_count": null, 1238 | "id": "0907696c-cc0c-4d33-9707-89696752a4d1", 1239 | "metadata": { 1240 | "id": "0907696c-cc0c-4d33-9707-89696752a4d1" 1241 | }, 1242 | "outputs": [], 1243 | "source": [] 1244 | }, 1245 | { 1246 | "cell_type": "code", 1247 | "execution_count": null, 1248 | "id": "29b355ad", 1249 | "metadata": { 1250 | "id": "29b355ad" 1251 | }, 1252 | "outputs": [], 1253 | "source": [] 1254 | }, 1255 | { 1256 | "cell_type": "markdown", 1257 | "id": "6d5349b1-4b09-4ace-9b02-a6eed61843bd", 1258 | "metadata": { 1259 | "id": "6d5349b1-4b09-4ace-9b02-a6eed61843bd" 1260 | }, 1261 | "source": [ 1262 | "# Pre-Processing Data" 1263 | ] 1264 | }, 1265 | { 1266 | "cell_type": "markdown", 1267 | "id": "8788823a-5c40-481c-8dea-97ce436899bc", 1268 | "metadata": { 1269 | "id": "8788823a-5c40-481c-8dea-97ce436899bc" 1270 | }, 1271 | "source": [ 1272 | "### 1. Check Data" 1273 | ] 1274 | }, 1275 | { 1276 | "cell_type": "code", 1277 | "execution_count": null, 1278 | "id": "7c962808-b082-48cd-8b04-11b8d821f32c", 1279 | "metadata": { 1280 | "id": "7c962808-b082-48cd-8b04-11b8d821f32c" 1281 | }, 1282 | "outputs": [], 1283 | "source": [ 1284 | "df = spark.read.csv(\"properties_2016.csv\", header=True, inferSchema=True)" 1285 | ] 1286 | }, 1287 | { 1288 | "cell_type": "code", 1289 | "execution_count": null, 1290 | "id": "bd61df3c-936d-4edf-8cc4-0d7012bee2ce", 1291 | "metadata": { 1292 | "id": "bd61df3c-936d-4edf-8cc4-0d7012bee2ce" 1293 | }, 1294 | "outputs": [], 1295 | "source": [ 1296 | "df.limit(10)" 1297 | ] 1298 | }, 1299 | { 1300 | "cell_type": "code", 1301 | "execution_count": null, 1302 | "id": "1578e940-ec6f-4867-a345-bc1c9d144727", 1303 | "metadata": { 1304 | "id": "1578e940-ec6f-4867-a345-bc1c9d144727" 1305 | }, 1306 | "outputs": [], 1307 | "source": [ 1308 | "# convert all column names to uppercase\n", 1309 | "for col in df.columns:\n", 1310 | " df = df.withColumnRenamed(col, col.upper())" 1311 | ] 1312 | }, 1313 | { 1314 | "cell_type": "markdown", 1315 | "id": "b0a960fc-0f60-4093-b135-04fb779d33cc", 1316 | "metadata": { 1317 | "id": "b0a960fc-0f60-4093-b135-04fb779d33cc" 1318 | }, 1319 | "source": [ 1320 | "### 2. check descriptive statistics" 1321 | ] 1322 | }, 1323 | { 1324 | "cell_type": "code", 1325 | "execution_count": null, 1326 | "id": "7dc0f07a-6b10-4cf5-a15b-2469b0907a9b", 1327 | "metadata": { 1328 | "id": "7dc0f07a-6b10-4cf5-a15b-2469b0907a9b" 1329 | }, 1330 | "outputs": [], 1331 | "source": [ 1332 | "df.describe().limit(20)" 1333 | ] 1334 | }, 1335 | { 1336 | "cell_type": "markdown", 1337 | "id": "80beb77e-de1e-4ea3-8a73-e3c4619e614b", 1338 | "metadata": { 1339 | "id": "80beb77e-de1e-4ea3-8a73-e3c4619e614b" 1340 | }, 1341 | "source": [ 1342 | "### 3. Check DataType" 1343 | ] 1344 | }, 1345 | { 1346 | "cell_type": "code", 1347 | "execution_count": null, 1348 | "id": "d048bb60-008b-4c79-87dc-6ae9703bcbea", 1349 | "metadata": { 1350 | "id": "d048bb60-008b-4c79-87dc-6ae9703bcbea" 1351 | }, 1352 | "outputs": [], 1353 | "source": [ 1354 | "df.printSchema()" 1355 | ] 1356 | }, 1357 | { 1358 | "cell_type": "markdown", 1359 | "id": "3b3d485b-303e-482f-966c-9692f60c315c", 1360 | "metadata": { 1361 | "id": "3b3d485b-303e-482f-966c-9692f60c315c" 1362 | }, 1363 | "source": [ 1364 | "### 4. 
Check the Current Number of Columns and Rows" 1365 | ] 1366 | }, 1367 | { 1368 | "cell_type": "code", 1369 | "execution_count": null, 1370 | "id": "1869a2ac-0579-44b0-b7ea-d0f24852bbc2", 1371 | "metadata": { 1372 | "id": "1869a2ac-0579-44b0-b7ea-d0f24852bbc2" 1373 | }, 1374 | "outputs": [], 1375 | "source": [ 1376 | "# Columns\n", 1377 | "len(df.columns)" 1378 | ] 1379 | }, 1380 | { 1381 | "cell_type": "code", 1382 | "execution_count": null, 1383 | "id": "c63758b3-a367-4848-b6ae-f492f60ce9f8", 1384 | "metadata": { 1385 | "id": "c63758b3-a367-4848-b6ae-f492f60ce9f8" 1386 | }, 1387 | "outputs": [], 1388 | "source": [ 1389 | "# Rows\n", 1390 | "df.count()" 1391 | ] 1392 | }, 1393 | { 1394 | "cell_type": "markdown", 1395 | "id": "337dffc1-edd9-454d-b07a-87e02566685f", 1396 | "metadata": { 1397 | "id": "337dffc1-edd9-454d-b07a-87e02566685f" 1398 | }, 1399 | "source": [ 1400 | "### 5. Drop Duplicates" 1401 | ] 1402 | }, 1403 | { 1404 | "cell_type": "code", 1405 | "execution_count": null, 1406 | "id": "fb672b6c-e6be-4d3c-b0bb-0b4a4ab29cab", 1407 | "metadata": { 1408 | "id": "fb672b6c-e6be-4d3c-b0bb-0b4a4ab29cab" 1409 | }, 1410 | "outputs": [], 1411 | "source": [ 1412 | "df = df.dropDuplicates()" 1413 | ] 1414 | }, 1415 | { 1416 | "cell_type": "markdown", 1417 | "id": "1c16498a-6da2-4fc1-8403-437795fdff61", 1418 | "metadata": { 1419 | "id": "1c16498a-6da2-4fc1-8403-437795fdff61" 1420 | }, 1421 | "source": [ 1422 | "### 6. Check Missing Values" 1423 | ] 1424 | }, 1425 | { 1426 | "cell_type": "code", 1427 | "execution_count": null, 1428 | "id": "68e60e46-6a73-4f7d-87ee-014e7ddbef1c", 1429 | "metadata": { 1430 | "id": "68e60e46-6a73-4f7d-87ee-014e7ddbef1c" 1431 | }, 1432 | "outputs": [], 1433 | "source": [ 1434 | "def check_null_count():\n", 1435 | " for column in df.columns:\n", 1436 | " null_count = df.filter(df[column].isNull()).count()\n", 1437 | " print(f\"{column}: {null_count}\")" 1438 | ] 1439 | }, 1440 | { 1441 | "cell_type": "code", 1442 | "execution_count": null, 1443 | "id": "ddaf99dd-78e8-4ece-82ed-6081bee69b60", 1444 | "metadata": { 1445 | "id": "ddaf99dd-78e8-4ece-82ed-6081bee69b60" 1446 | }, 1447 | "outputs": [], 1448 | "source": [ 1449 | "# Number of missing values in each column\n", 1450 | "check_null_count()" 1451 | ] 1452 | }, 1453 | { 1454 | "cell_type": "markdown", 1455 | "id": "42fbdaaa-1895-4d31-9ed8-2153261108e5", 1456 | "metadata": { 1457 | "id": "42fbdaaa-1895-4d31-9ed8-2153261108e5" 1458 | }, 1459 | "source": [ 1460 | "##### Drop columns with more than 60% missing" 1461 | ] 1462 | }, 1463 | { 1464 | "cell_type": "code", 1465 | "execution_count": null, 1466 | "id": "59979bc8-5f48-4697-857e-b0bd65b00472", 1467 | "metadata": { 1468 | "id": "59979bc8-5f48-4697-857e-b0bd65b00472" 1469 | }, 1470 | "outputs": [], 1471 | "source": [ 1472 | "def column_dropper(df, threshold):\n", 1473 | " # Takes a dataframe and threshold for missing values. 
Returns a dataframe.\n", 1474 | " total_records = df.count()\n", 1475 | " for col in df.columns:\n", 1476 | " # Calculate the percentage of missing values\n", 1477 | " missing = df.where(df[col].isNull()).count()\n", 1478 | " missing_percent = missing / total_records\n", 1479 | " # Drop column if percent of missing is more than threshold\n", 1480 | " if missing_percent > threshold:\n", 1481 | " df = df.drop(col)\n", 1482 | " return df\n", 1483 | "\n", 1484 | "# Drop columns that are more than 60% missing\n", 1485 | "df = column_dropper(df, 0.6)" 1486 | ] 1487 | }, 1488 | { 1489 | "cell_type": "code", 1490 | "execution_count": null, 1491 | "id": "dfd6df2b-4a42-4a62-ae85-0d882c85405f", 1492 | "metadata": { 1493 | "id": "dfd6df2b-4a42-4a62-ae85-0d882c85405f" 1494 | }, 1495 | "outputs": [], 1496 | "source": [ 1497 | "check_null_count()" 1498 | ] 1499 | }, 1500 | { 1501 | "cell_type": "markdown", 1502 | "id": "b7526960-0c18-4577-ac67-b6542f6ff17e", 1503 | "metadata": { 1504 | "id": "b7526960-0c18-4577-ac67-b6542f6ff17e" 1505 | }, 1506 | "source": [ 1507 | "### 7. Outlier Filtering" 1508 | ] 1509 | }, 1510 | { 1511 | "cell_type": "code", 1512 | "execution_count": null, 1513 | "id": "af033c41-07ea-4474-bcce-9d388e150aef", 1514 | "metadata": { 1515 | "id": "af033c41-07ea-4474-bcce-9d388e150aef" 1516 | }, 1517 | "outputs": [], 1518 | "source": [ 1519 | "mean_val = df.agg({'BATHROOMCNT': 'mean'}).collect()[0][0]\n", 1520 | "stddev_val = df.agg({'BATHROOMCNT': 'stddev'}).collect()[0][0]\n", 1521 | "\n", 1522 | "low_bound = mean_val - (3 * stddev_val)\n", 1523 | "hi_bound = mean_val + (3 * stddev_val)\n", 1524 | "\n", 1525 | "df = df.where((df['BATHROOMCNT'] < hi_bound) & (df['BATHROOMCNT'] > low_bound))" 1526 | ] 1527 | }, 1528 | { 1529 | "cell_type": "markdown", 1530 | "id": "cb41849c-1339-4926-9c6c-3a23507d52ff", 1531 | "metadata": { 1532 | "id": "cb41849c-1339-4926-9c6c-3a23507d52ff" 1533 | }, 1534 | "source": [ 1535 | "### 8. Standardize Data (Z-score)" 1536 | ] 1537 | }, 1538 | { 1539 | "cell_type": "code", 1540 | "execution_count": null, 1541 | "id": "5e57b8b8-3ce2-401d-8dda-2402a608f5ee", 1542 | "metadata": { 1543 | "id": "5e57b8b8-3ce2-401d-8dda-2402a608f5ee" 1544 | }, 1545 | "outputs": [], 1546 | "source": [ 1547 | "mean = df.agg({'BATHROOMCNT': 'mean'}).collect()[0][0]\n", 1548 | "stddev = df.agg({'BATHROOMCNT': 'stddev'}).collect()[0][0]\n", 1549 | "# Create a new column with the scaled data\n", 1550 | "df = df.withColumn(\"BATHROOMCNT_Z\", (df['BATHROOMCNT'] - mean) / stddev)\n", 1551 | "print(\"mean:\", df.agg({'BATHROOMCNT_Z': 'mean'}).collect()[0][0])\n", 1552 | "print(\"stddev:\", df.agg({'BATHROOMCNT_Z': 'stddev'}).collect()[0][0])" 1553 | ] 1554 | }, 1555 | { 1556 | "cell_type": "markdown", 1557 | "id": "56d5a156-5472-4c77-b72f-173974721375", 1558 | "metadata": { 1559 | "id": "56d5a156-5472-4c77-b72f-173974721375" 1560 | }, 1561 | "source": [ 1562 | "# Feature Engineering" 1563 | ] 1564 | }, 1565 | { 1566 | "cell_type": "markdown", 1567 | "id": "6865adc7-b638-4b5a-b650-d57a415ed110", 1568 | "metadata": { 1569 | "id": "6865adc7-b638-4b5a-b650-d57a415ed110" 1570 | }, 1571 | "source": [ 1572 | "### 1. 
Bucketing" 1573 | ] 1574 | }, 1575 | { 1576 | "cell_type": "code", 1577 | "execution_count": null, 1578 | "id": "5a695ed2-0573-4fa6-93c8-e123f26042a7", 1579 | "metadata": { 1580 | "id": "5a695ed2-0573-4fa6-93c8-e123f26042a7" 1581 | }, 1582 | "outputs": [], 1583 | "source": [ 1584 | "splits = [0, 1, 2, 3, 4, float('Inf')]\n", 1585 | "\n", 1586 | "# Create bucketing transformer\n", 1587 | "buck = Bucketizer(splits=splits, inputCol='TAXAMOUNT', outputCol='TAXA')\n", 1588 | "\n", 1589 | "# Apply transformer\n", 1590 | "df = buck.transform(df)\n", 1591 | "\n", 1592 | "# Inspect results\n", 1593 | "df[['TAXAMOUNT', 'TAXA']].show()" 1594 | ] 1595 | }, 1596 | { 1597 | "cell_type": "markdown", 1598 | "id": "c487737f-7e48-42b1-83c9-8c5ef75951aa", 1599 | "metadata": { 1600 | "id": "c487737f-7e48-42b1-83c9-8c5ef75951aa" 1601 | }, 1602 | "source": [ 1603 | "### 2. One-hot Encoding" 1604 | ] 1605 | }, 1606 | { 1607 | "cell_type": "code", 1608 | "execution_count": null, 1609 | "id": "cf27659d-29ba-4fab-b8c5-30a979c6a445", 1610 | "metadata": { 1611 | "id": "cf27659d-29ba-4fab-b8c5-30a979c6a445" 1612 | }, 1613 | "outputs": [], 1614 | "source": [ 1615 | "from pyspark.ml.feature import OneHotEncoder, StringIndexer\n", 1616 | "\n", 1617 | "# Map strings to numbers with string indexer\n", 1618 | "string_indexer = StringIndexer(inputCol='ROOMCNT', outputCol='ROOM_Index')\n", 1619 | "indexed_df = string_indexer.fit(df).transform(df)\n", 1620 | "\n", 1621 | "# Onehot encode indexed values\n", 1622 | "encoder = OneHotEncoder(inputCol='ROOM_Index', outputCol='ROOM_Vec')\n", 1623 | "encoded_df = encoder.fit(indexed_df).transform(indexed_df)\n", 1624 | "\n", 1625 | "# Inspect the transformation steps\n", 1626 | "encoded_df[['ROOMCNT', 'ROOM_Index', 'ROOM_Vec']].show(truncate=100)" 1627 | ] 1628 | } 1629 | ], 1630 | "metadata": { 1631 | "colab": { 1632 | "provenance": [] 1633 | }, 1634 | "kernelspec": { 1635 | "display_name": "Python 3 (ipykernel)", 1636 | "language": "python", 1637 | "name": "python3" 1638 | }, 1639 | "language_info": { 1640 | "codemirror_mode": { 1641 | "name": "ipython", 1642 | "version": 3 1643 | }, 1644 | "file_extension": ".py", 1645 | "mimetype": "text/x-python", 1646 | "name": "python", 1647 | "nbconvert_exporter": "python", 1648 | "pygments_lexer": "ipython3", 1649 | "version": "3.11.6" 1650 | } 1651 | }, 1652 | "nbformat": 4, 1653 | "nbformat_minor": 5 1654 | } 1655 | --------------------------------------------------------------------------------