├── elt_pipeline ├── dbt_transform │ ├── macros │ │ └── .gitkeep │ ├── seeds │ │ └── .gitkeep │ ├── tests │ │ └── .gitkeep │ ├── analyses │ │ └── .gitkeep │ ├── snapshots │ │ └── .gitkeep │ ├── .gitignore │ ├── config │ │ ├── .user.yml │ │ └── profiles.yml │ ├── models │ │ ├── recommendations │ │ │ ├── search.sql │ │ │ ├── search_prior.sql │ │ │ └── criteria.sql │ │ ├── sources.yml │ │ └── schema.yml │ ├── README.md │ └── dbt_project.yml ├── elt_pipeline_tests │ ├── __init__.py │ └── test_assets.py ├── setup.cfg ├── pyproject.toml ├── setup.py ├── requirements.txt ├── elt_pipeline │ ├── assets │ │ ├── __init__.py │ │ ├── bronze.py │ │ ├── warehouse.py │ │ ├── gold.py │ │ └── silver.py │ ├── resources │ │ ├── mysql_io_manager.py │ │ ├── minio_io_manager.py │ │ ├── spark_io_manager.py │ │ ├── psql_io_manager.py │ │ └── gdrive_io_manager.py │ └── __init__.py ├── README.md ├── Dockerfile └── logs │ └── dbt.log ├── pg_hba.conf ├── images ├── docker_1.png ├── docker_2.png ├── gdrive_1.png ├── gdrive_2.png ├── gdrive_3.png ├── gdrive_4.png ├── gdrive_5.png ├── gdrive_6.png ├── gdrive_7.png ├── gdrive_8.png ├── gdrive_9.png ├── assets_dbt.png ├── assets_gold.png ├── gdrive_10.png ├── gdrive_11.png ├── gdrive_12.png ├── assets_bronze.png ├── assets_silver.png ├── design_schema.png ├── introduction.jpg ├── assets_general.png ├── assets_warehouse.png ├── design_pipeline.png ├── directory_tree.png ├── goodreads_logo.png └── datalake_structure.png ├── dockerimages ├── dagster │ ├── requirements.txt │ └── Dockerfile ├── streamlit │ ├── requirements.txt │ └── Dockerfile └── spark │ ├── spark-defaults.conf │ └── Dockerfile ├── app ├── .streamlit │ └── secrets.toml └── streamlit_app.py ├── dagster_home ├── workspace.yaml └── dagster.yaml ├── spark_master.env.template ├── spark_workder.env.template ├── .gitignore ├── tree_shorten.txt ├── requirements.txt ├── datalake_structure.txt ├── load_dataset ├── mysql_load.sql ├── psql_datasource.sql └── mysql_datasource.sql ├── Pipfile ├── LICENSE ├── env.template ├── tree.txt ├── Makefile ├── docker-compose.yml ├── README.md └── preprocess.ipynb /elt_pipeline/dbt_transform/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline_tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline_tests/test_assets.py: -------------------------------------------------------------------------------- 1 | 2 | 
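Note: the scaffolded test module elt_pipeline_tests/test_assets.py above is empty. A minimal smoke test, sketched here under the assumption that only the pure helper connect_mysql() from elt_pipeline/elt_pipeline/resources/mysql_io_manager.py is exercised (the test name and credential values are hypothetical), could look like:

from elt_pipeline.resources.mysql_io_manager import connect_mysql


def test_connect_mysql_builds_connection_uri():
    # connect_mysql() only formats a connection string, so it can be
    # verified without a running MySQL instance.
    config = {
        "user": "admin",            # hypothetical example values
        "password": "admin123",
        "host": "de_mysql",
        "port": 3306,
        "database": "goodreads",
    }
    assert connect_mysql(config) == "mysql://admin:admin123@de_mysql:3306/goodreads"

Such a test would be run with `pytest elt_pipeline_tests`, as described in elt_pipeline/README.md.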
-------------------------------------------------------------------------------- /elt_pipeline/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = elt_pipeline 3 | -------------------------------------------------------------------------------- /pg_hba.conf: -------------------------------------------------------------------------------- 1 | local all all trust 2 | host all all 0.0.0.0/0 trust 3 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/config/.user.yml: -------------------------------------------------------------------------------- 1 | id: 37d26752-a903-4570-88b2-7fc1505deb31 2 | -------------------------------------------------------------------------------- /images/docker_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/docker_1.png -------------------------------------------------------------------------------- /images/docker_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/docker_2.png -------------------------------------------------------------------------------- /images/gdrive_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_1.png -------------------------------------------------------------------------------- /images/gdrive_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_2.png -------------------------------------------------------------------------------- /images/gdrive_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_3.png -------------------------------------------------------------------------------- /images/gdrive_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_4.png -------------------------------------------------------------------------------- /images/gdrive_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_5.png -------------------------------------------------------------------------------- /images/gdrive_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_6.png -------------------------------------------------------------------------------- /images/gdrive_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_7.png -------------------------------------------------------------------------------- 
/images/gdrive_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_8.png -------------------------------------------------------------------------------- /images/gdrive_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_9.png -------------------------------------------------------------------------------- /images/assets_dbt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/assets_dbt.png -------------------------------------------------------------------------------- /images/assets_gold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/assets_gold.png -------------------------------------------------------------------------------- /images/gdrive_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_10.png -------------------------------------------------------------------------------- /images/gdrive_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_11.png -------------------------------------------------------------------------------- /images/gdrive_12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_12.png -------------------------------------------------------------------------------- /images/assets_bronze.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/assets_bronze.png -------------------------------------------------------------------------------- /images/assets_silver.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/assets_silver.png -------------------------------------------------------------------------------- /images/design_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/design_schema.png -------------------------------------------------------------------------------- /images/introduction.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/introduction.jpg -------------------------------------------------------------------------------- /images/assets_general.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/assets_general.png -------------------------------------------------------------------------------- /images/assets_warehouse.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/assets_warehouse.png -------------------------------------------------------------------------------- /images/design_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/design_pipeline.png -------------------------------------------------------------------------------- /images/directory_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/directory_tree.png -------------------------------------------------------------------------------- /images/goodreads_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/goodreads_logo.png -------------------------------------------------------------------------------- /dockerimages/dagster/requirements.txt: -------------------------------------------------------------------------------- 1 | dagster==1.2.7 2 | dagit==1.2.7 3 | dagster-postgres==0.18.7 4 | dagster-dbt==0.18.7 5 | -------------------------------------------------------------------------------- /images/datalake_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/datalake_structure.png -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/models/recommendations/search.sql: -------------------------------------------------------------------------------- 1 | select 2 | isbn, 3 | name 4 | from {{ source('gold', 'book_with_info') }} 5 | -------------------------------------------------------------------------------- /app/.streamlit/secrets.toml: -------------------------------------------------------------------------------- 1 | [postgres] 2 | host = "de_psql" 3 | port = 5432 4 | dbname = "goodreads" 5 | user = "admin" 6 | password = "admin123" 7 | -------------------------------------------------------------------------------- /dagster_home/workspace.yaml: -------------------------------------------------------------------------------- 1 | load_from: 2 | - grpc_server: 3 | host: elt_pipeline 4 | port: 4000 5 | location_name: "elt_pipeline" 6 | -------------------------------------------------------------------------------- /dockerimages/streamlit/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.21.0 2 | psycopg2-binary==2.9.6 3 | altair==4.2.2 4 | pandas==1.5.3 5 | polars==0.16.16 6 | minio==7.1.13 7 | -------------------------------------------------------------------------------- /elt_pipeline/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.dagster] 6 | module_name = "elt_pipeline" 7 | -------------------------------------------------------------------------------- /spark_master.env.template: -------------------------------------------------------------------------------- 1 | SPARK_MODE=master 2 | SPARK_LOCAL_IP=spark-master 3 | SPARK_RPC_AUTHENTICATION_ENABLED=no 4 | SPARK_RPC_ENCRYPTION_ENABLED=no 5 | SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no 6 | 
SPARK_SSL_ENABLED=no 7 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/models/recommendations/search_prior.sql: -------------------------------------------------------------------------------- 1 | select 2 | isbn, 3 | name 4 | from {{ source('gold', 'book_with_info') }} 5 | right join {{ source('recommendations', 'book_download_link') }} using (isbn) 6 | where link is not null 7 | -------------------------------------------------------------------------------- /spark_workder.env.template: -------------------------------------------------------------------------------- 1 | SPARK_MODE=worker 2 | SPARK_MASTER_URL=spark://spark-master:7077 3 | SPARK_WORKER_MEMORY=4G 4 | SPARK_WORKER_CORES=4 5 | SPARK_RPC_AUTHENTICATION_ENABLED=no 6 | SPARK_RPC_ENCRYPTION_ENABLED=no 7 | SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no 8 | SPARK_SSL_ENABLED=no 9 | -------------------------------------------------------------------------------- /elt_pipeline/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="elt_pipeline", 5 | packages=find_packages(exclude=["elt_pipeline_tests"]), 6 | install_requires=[ 7 | "dagster", 8 | "dagster-cloud" 9 | ], 10 | extras_require={"dev": ["dagit", "pytest"]}, 11 | ) 12 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/models/sources.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | sources: 3 | - name: gold 4 | tables: 5 | - name: genre 6 | - name: book_genre 7 | - name: book_with_info 8 | - name: book_with_publish 9 | - name: book_with_rating 10 | - name: recommendations 11 | tables: 12 | - name: book_download_link 13 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/config/profiles.yml: -------------------------------------------------------------------------------- 1 | dbt_transform: 2 | outputs: 3 | dev: 4 | type: postgres 5 | threads: 1 6 | host: "{{ env_var('DBT_HOST') }}" 7 | port: 5432 8 | user: "{{ env_var('DBT_USER') }}" 9 | pass: "{{ env_var('DBT_PASSWORD') }}" 10 | dbname: "{{ env_var('DBT_DATABASE') }}" 11 | schema: "{{ env_var('DBT_SCHEMA') }}" 12 | target: dev 13 | -------------------------------------------------------------------------------- /elt_pipeline/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.5.3 2 | polars==0.16.16 3 | dagit==1.2.7 4 | dagster==1.2.7 5 | dagster-postgres==0.18.7 6 | dagster-dbt==0.18.7 7 | SQLAlchemy==1.4.46 8 | pymysql==1.0.2 9 | cryptography==38.0.3 10 | pyarrow==11.0.0 11 | fsspec==2023.3.0 12 | minio==7.1.13 13 | connectorx==0.3.1 14 | google-api-python-client==2.84.0 15 | google-auth-httplib2==0.1.0 16 | google-auth-oauthlib==1.0.0 17 | pyspark==3.3.2 18 | dbt-core==1.4.5 19 | dbt-postgres==1.4.5 20 | pytz==2022.7.1 21 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/models/recommendations/criteria.sql: -------------------------------------------------------------------------------- 1 | with tmp_avg_rating as ( 2 | select 3 | isbn, 4 | rating 5 | from {{ source('gold', 'book_with_rating') }} 6 | ), 7 | tmp_download_link as ( 8 | select 9 | isbn, 10 | case 11 | when link is null then 0 12 | else 1 13 | end as hasdownloadlink, 14 | rating 15 | from {{ 
source('recommendations', 'book_download_link') }} 16 | right join tmp_avg_rating using (isbn) 17 | ) 18 | 19 | select * 20 | from tmp_download_link 21 | -------------------------------------------------------------------------------- /dockerimages/streamlit/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.16-slim 2 | 3 | WORKDIR /app 4 | 5 | RUN apt-get update && apt-get install -y \ 6 | build-essential \ 7 | curl \ 8 | software-properties-common \ 9 | git \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | COPY . . 13 | 14 | RUN pip install --upgrade pip && pip install -r requirements.txt 15 | 16 | EXPOSE 8501 17 | 18 | HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health 19 | 20 | ENTRYPOINT ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"] 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python env 2 | venv/ 3 | Pipfile.lock 4 | 5 | # Other env 6 | .env 7 | .spark_master.env 8 | .spark_worker.env 9 | .streamlit/* 10 | 11 | # Cache 12 | .cache* 13 | __pycache__/ 14 | storage/ 15 | dataset/ 16 | minio/ 17 | .coverage 18 | .spark_session.txt 19 | 20 | # Google secrets 21 | gdrive_client_secret.json 22 | client_secret.json 23 | token_drive_v3.pickle 24 | 25 | # dagster 26 | dagster_home/logs/ 27 | dagster_home/.logs_queue/ 28 | dagster_home/.telemetry/ 29 | dagster_home/history/ 30 | dagster_home/schedules/* 31 | 32 | # EDA 33 | ./EDA.ipynb 34 | -------------------------------------------------------------------------------- /dockerimages/dagster/Dockerfile: -------------------------------------------------------------------------------- 1 | # Dagster libraries to run both dagit and the dagster-daemon. Does not 2 | # need to have access to any pipeline code. 3 | FROM python:3.9.16-slim 4 | 5 | # Set $DAGSTER_HOME and copy dagster instance and workspace YAML there 6 | ENV DAGSTER_HOME=/opt/dagster/dagster_home 7 | RUN mkdir -p $DAGSTER_HOME && \ 8 | mkdir -p $DAGSTER_HOME/storage && \ 9 | mkdir -p $DAGSTER_HOME/compute_logs && \ 10 | mkdir -p $DAGSTER_HOME/local_artifact_storage 11 | 12 | WORKDIR $DAGSTER_HOME 13 | COPY requirements.txt $DAGSTER_HOME 14 | RUN pip install --upgrade pip && pip install -r requirements.txt 15 | -------------------------------------------------------------------------------- /tree_shorten.txt: -------------------------------------------------------------------------------- 1 | ├── app 2 | ├── dagster_home 3 | ├── dataset 4 | ├── docker-compose.yml 5 | ├── dockerimages 6 | ├── EDA.ipynb 7 | ├── elt_pipeline 8 | │   ├── dbt_transform 9 | │   ├── Dockerfile 10 | │   ├── elt_pipeline 11 | │   ├── requirements.txt 12 | ├── .env 13 | ├── env.template 14 | ├── .git 15 | ├── .gitignore 16 | ├── load_dataset 17 | ├── Makefile 18 | ├── pg_hba.conf 19 | ├── Pipfile 20 | ├── Pipfile.lock 21 | ├── README.md 22 | ├── requirements.txt 23 | ├── .spark_master.env 24 | ├── spark_master.env.template 25 | ├── .spark_session.txt 26 | ├── spark_workder.env.template 27 | ├── .spark_worker.env 28 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 
2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dagit==1.2.3 2 | dagster==1.2.3 3 | dagster-postgres==0.18.3 4 | dagster-dbt==0.18.3 5 | dagster-spark==0.18.3 6 | pandas==1.5.3 7 | polars==0.16.16 8 | pyspark==3.3.2 9 | SQLAlchemy==1.4.46 10 | pymysql==1.0.2 11 | cryptography==38.0.3 12 | pyarrow==11.0.0 13 | fsspec==2023.3.0 14 | minio==7.1.13 15 | dbt-core==1.4.5 16 | dbt-postgres==1.4.5 17 | pytest==7.2.2 18 | pylint==2.17.1 19 | pytest-cov==4.0.0 20 | autopep8==2.0.2 21 | ydata-profiling==4.1.2 22 | connectorx==0.3.1 23 | google-api-python-client==2.84.0 24 | google-auth-httplib2==0.1.0 25 | google-auth-oauthlib==1.0.0 26 | streamlit 27 | streamlit-pandas-profiling 28 | streamlit-elements 29 | -------------------------------------------------------------------------------- /datalake_structure.txt: -------------------------------------------------------------------------------- 1 | lakehouse 2 | ├── bronze 3 | │   └── goodreads 4 | │   │   ├── book 5 | │   │   ├── genre 6 | │   │   ├── book_genre 7 | │   │   ├── book_download_link 8 | ├── silver 9 | │   └── goodreads 10 | │   │   ├── cleaned_book 11 | │   │   ├── collected_book 12 | │   │   ├── cleaned_genre 13 | │   │   ├── collected_genre 14 | │   │   ├── collcted_book_genre 15 | │   │   ├── isbn 16 | ├── gold 17 | │   └── goodreads 18 | │   │   ├── genre 19 | │   │   ├── book_genre 20 | │   │   ├── book_with_info 21 | │   │   ├── book_with_publish 22 | │   │   ├── book_with_rating 23 | ├── files 24 | │   └── loremipsum.epub 25 | │   └── ... 26 | ├── images 27 | │   └── loremipsum.jpeg 28 | │   └── ... 
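The lake layout above is what the MinIO IO manager produces: each Dagster asset key (layer, schema, table) is mapped to an object key inside the DATALAKE_BUCKET. A rough sketch of that mapping, mirroring the comments in elt_pipeline/elt_pipeline/resources/minio_io_manager.py (the asset key and partition below are illustrative only):

# context.asset_key.path, e.g. for the bronze book asset
layer, schema, table = ["bronze", "goodreads", "book"]
key = "/".join([layer, schema, table.replace(f"{layer}_", "")])   # -> "bronze/goodreads/book"
# Unpartitioned asset:          stored as  bronze/goodreads/book.parquet
# Partitioned by year (2021):   stored as  bronze/goodreads/book/book_2021.parquet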
29 | -------------------------------------------------------------------------------- /dockerimages/spark/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.jars jars/delta-core_2.12-2.2.0.jar,jars/hadoop-aws-3.3.2.jar,jars/delta-storage-2.2.0.jar,jars/aws-java-sdk-1.12.367.jar,jars/s3-2.18.41.jar,jars/aws-java-sdk-bundle-1.11.1026.jar 2 | spark.sql.extensions io.delta.sql.DeltaSparkSessionExtension 3 | spark.sql.catalog.spark_catalog org.apache.spark.sql.delta.catalog.DeltaCatalog 4 | spark.hadoop.fs.s3a.endpoint http://minio:9000 5 | spark.hadoop.fs.s3a.access.key minio 6 | spark.hadoop.fs.s3a.secret.key minio123 7 | spark.hadoop.fs.s3a.path.style.access true 8 | spark.hadoop.fs.s3a.connection.ssl.enabled false 9 | spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem 10 | spark.driver.memory 4g 11 | spark.executor.memory 4g 12 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/assets/__init__.py: -------------------------------------------------------------------------------- 1 | from dagster import load_assets_from_modules, file_relative_path 2 | from dagster_dbt import load_assets_from_dbt_project 3 | 4 | from . import bronze, silver, gold, warehouse 5 | 6 | 7 | bronze_layer_assets = load_assets_from_modules([bronze]) 8 | silver_layer_assets = load_assets_from_modules([silver]) 9 | gold_layer_assets = load_assets_from_modules([gold]) 10 | warehouse_assets = load_assets_from_modules([warehouse]) 11 | 12 | DBT_PROJECT_PATH = file_relative_path(__file__, "../../dbt_transform") 13 | DBT_PROFILES = file_relative_path(__file__, "../../dbt_transform/config") 14 | 15 | dbt_assets = load_assets_from_dbt_project( 16 | project_dir=DBT_PROJECT_PATH, 17 | profiles_dir=DBT_PROFILES, 18 | key_prefix=["dbt"], 19 | ) 20 | -------------------------------------------------------------------------------- /dockerimages/spark/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/bitnami/spark:3.3.2 2 | 3 | USER root 4 | 5 | # Install prerequisites 6 | RUN apt-get update && apt-get install -y curl 7 | 8 | RUN curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \ 9 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \ 10 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \ 11 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \ 12 | && mv s3-2.18.41.jar /opt/bitnami/spark/jars \ 13 | && mv aws-java-sdk-1.12.367.jar /opt/bitnami/spark/jars \ 14 | && mv delta-core_2.12-2.2.0.jar /opt/bitnami/spark/jars \ 15 | && mv delta-storage-2.2.0.jar /opt/bitnami/spark/jars 16 | -------------------------------------------------------------------------------- /load_dataset/mysql_load.sql: -------------------------------------------------------------------------------- 1 | LOAD DATA LOCAL INFILE '/tmp/dataset/book_full.csv' 2 | INTO TABLE goodreads.book 3 | FIELDS TERMINATED BY ',' 4 | ENCLOSED BY '"' 5 | LINES TERMINATED BY '\n' 6 | IGNORE 1 ROWS; 7 | 8 | -- LOAD DATA LOCAL INFILE '/tmp/dataset/my_book.csv' 9 | -- INTO TABLE goodreads.book 10 | -- FIELDS TERMINATED BY ',' 11 | -- ENCLOSED BY '"' 12 | -- LINES TERMINATED BY '\n' 13 | -- IGNORE 1 ROWS; 14 | -- 15 | -- LOAD DATA LOCAL INFILE '/tmp/dataset/genre.csv' 16 | -- INTO TABLE goodreads.genre 17 | -- 
FIELDS TERMINATED BY ',' 18 | -- ENCLOSED BY '"' 19 | -- LINES TERMINATED BY '\n' 20 | -- IGNORE 1 ROWS; 21 | -- 22 | -- LOAD DATA LOCAL INFILE '/tmp/dataset/book_genre.csv' 23 | -- INTO TABLE goodreads.book_genre 24 | -- FIELDS TERMINATED BY ',' 25 | -- ENCLOSED BY '"' 26 | -- LINES TERMINATED BY '\n' 27 | -- IGNORE 1 ROWS; 28 | -- 29 | -- LOAD DATA LOCAL INFILE '/tmp/dataset/book_download_link.csv' 30 | -- INTO TABLE goodreads.book_download_link 31 | -- FIELDS TERMINATED BY ',' 32 | -- ENCLOSED BY '"' 33 | -- LINES TERMINATED BY '\n' 34 | -- IGNORE 1 ROWS; 35 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | pandas = "==1.5.3" 8 | polars = "==0.16.16" 9 | pyspark = "==3.3.2" 10 | dagster = "==1.2.3" 11 | dagit = "==1.2.3" 12 | dagster-postgres = "==0.18.3" 13 | dagster-dbt = "==0.18.3" 14 | dagster-spark = "==0.18.3" 15 | sqlalchemy = "==1.4.46" 16 | pymysql = "==1.0.2" 17 | cryptography = "==38.0.3" 18 | pyarrow = "==11.0.0" 19 | fsspec = "==2023.3.0" 20 | s3fs = "==0.4.2" 21 | minio = "==7.1.13" 22 | dbt-core = "==1.4.5" 23 | dbt-postgres = "==1.4.5" 24 | pytest = "==7.2.2" 25 | pylint = "==2.17.1" 26 | pytest-cov = "==4.0.0" 27 | autopep8 = "==2.0.2" 28 | streamlit = "*" 29 | streamlit-pandas-profiling = "*" 30 | streamlit-elements = "*" 31 | ipywidgets = "*" 32 | connectorx = "==0.3.1" 33 | google-api-python-client = "==2.84.0" 34 | google-auth-httplib2 = "==0.1.0" 35 | google-auth-oauthlib = "==1.0.0" 36 | ydata-profiling = "==4.1.2" 37 | 38 | [dev-packages] 39 | ipykernel = "*" 40 | 41 | [requires] 42 | python_version = "3.10" 43 | python_full_version = "3.10.6" 44 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/resources/mysql_io_manager.py: -------------------------------------------------------------------------------- 1 | from dagster import IOManager, InputContext, OutputContext 2 | from contextlib import contextmanager 3 | from sqlalchemy import create_engine 4 | import polars as pl 5 | 6 | 7 | def connect_mysql(config) -> str: 8 | conn_info = ( 9 | f"mysql://{config['user']}:{config['password']}" 10 | + f"@{config['host']}:{config['port']}" 11 | + f"/{config['database']}" 12 | ) 13 | return conn_info 14 | 15 | 16 | class MySQLIOManager(IOManager): 17 | def __init__(self, config): 18 | self._config = config 19 | 20 | def handle_output(self, context: "OutputContext", obj: pl.DataFrame): 21 | pass 22 | 23 | def load_input(self, context: "InputContext"): 24 | pass 25 | 26 | def extract_data(self, sql: str) -> pl.DataFrame: 27 | """ 28 | Extract data from MySQL database as polars DataFrame 29 | """ 30 | conn_info = connect_mysql(self._config) 31 | df_data = pl.read_database(query=sql, connection_uri=conn_info) 32 | return df_data 33 | -------------------------------------------------------------------------------- /dagster_home/dagster.yaml: -------------------------------------------------------------------------------- 1 | storage: 2 | postgres: 3 | postgres_db: 4 | username: 5 | env: DAGSTER_PG_USERNAME 6 | password: 7 | env: DAGSTER_PG_PASSWORD 8 | hostname: 9 | env: DAGSTER_PG_HOSTNAME 10 | db_name: 11 | env: DAGSTER_PG_DB 12 | port: 5432 13 | run_launcher: 14 | module: dagster.core.launcher 15 | class: DefaultRunLauncher 16 | run_coordinator: 17 | module: 
dagster.core.run_coordinator 18 | class: QueuedRunCoordinator 19 | config: 20 | max_concurrent_runs: 21 | env: DAGSTER_OVERALL_CONCURRENCY_LIMIT 22 | compute_logs: 23 | module: dagster.core.storage.local_compute_log_manager 24 | class: LocalComputeLogManager 25 | config: 26 | base_dir: /opt/dagster/dagster_home/compute_logs 27 | local_artifact_storage: 28 | module: dagster.core.storage.root 29 | class: LocalArtifactStorage 30 | config: 31 | base_dir: /opt/dagster/dagster_home/local_artifact_storage 32 | telemetry: 33 | enabled: true 34 | sensors: 35 | use_threads: true 36 | num_workers: 3 37 | schedules: 38 | use_threads: true 39 | num_workers: 3 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 lelouvincx 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/models/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | models: 3 | - name: search_prior 4 | description: Search index table for books with higher priority 5 | columns: 6 | - name: isbn 7 | description: Primary key 8 | tests: 9 | - unique 10 | - not_null 11 | - name: name 12 | description: Long varchar, can be null, duplicated, ... 13 | - name: search 14 | description: Search index table for books 15 | columns: 16 | - name: isbn 17 | description: Primary key 18 | tests: 19 | - unique 20 | - not_null 21 | - name: name 22 | description: Long varchar, can be null, duplicated, ... 23 | - name: criteria 24 | description: More criteria to filter books 25 | columns: 26 | - name: isbn 27 | description: Primary key 28 | tests: 29 | - unique 30 | - not_null 31 | - name: downloadlink 32 | description: Long varchar, can be null, must be unique, ... 
33 | tests: 34 | - unique 35 | - name: avgrating 36 | description: integer, from 0 -> 5 37 | tests: 38 | - not_null 39 | -------------------------------------------------------------------------------- /load_dataset/psql_datasource.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE IF NOT EXISTS goodreads; 2 | \c goodreads 3 | 4 | CREATE SCHEMA IF NOT EXISTS gold; 5 | CREATE SCHEMA IF NOT EXISTS analytics; 6 | CREATE SCHEMA IF NOT EXISTS recommendations; 7 | 8 | CREATE TABLE IF NOT EXISTS recommendations.book_download_link( 9 | ISBN VARCHAR(31) PRIMARY KEY, 10 | Link VARCHAR(255) 11 | ); 12 | 13 | CREATE TABLE IF NOT EXISTS gold.genre ( 14 | Id SERIAL PRIMARY KEY, 15 | Name VARCHAR(63) UNIQUE 16 | ); 17 | 18 | CREATE TABLE IF NOT EXISTS gold.book_genre ( 19 | BookISBN VARCHAR(31) NOT NULL, 20 | GenreId INT NOT NULL, 21 | PRIMARY KEY (BookISBN, GenreId) 22 | ); 23 | 24 | CREATE TABLE IF NOT EXISTS gold.book_with_info ( 25 | ISBN VARCHAR(31) PRIMARY KEY, 26 | Name VARCHAR(31), 27 | Authors VARCHAR(31), 28 | Language VARCHAR(7), 29 | Description TEXT, 30 | PagesNumber INT 31 | ); 32 | 33 | CREATE TABLE IF NOT EXISTS gold.book_with_publish ( 34 | ISBN VARCHAR(31) PRIMARY KEY, 35 | Publisher VARCHAR(31), 36 | PublishYear VARCHAR(31), 37 | PublishMonth INT, 38 | PublishDay INT 39 | ); 40 | 41 | CREATE TABLE IF NOT EXISTS gold.book_with_rating ( 42 | ISBN VARCHAR(31) PRIMARY KEY, 43 | Rating FLOAT, 44 | RatingDist5 INT, 45 | RatingDist4 INT, 46 | RatingDist3 INT, 47 | RatingDist2 INT, 48 | RatingDist1 INT, 49 | RatingDistTotal INT, 50 | CountOfTextReviews INT 51 | ); 52 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/dbt_project.yml: -------------------------------------------------------------------------------- 1 | # Name your project! Project names should contain only lowercase characters 2 | # and underscores. A good package name should reflect your organization's 3 | # name or the intended use of these models 4 | name: "dbt_transform" 5 | version: "1.0.0" 6 | config-version: 2 7 | # This setting configures which "profile" dbt uses for this project. 8 | profile: "dbt_transform" 9 | # These configurations specify where dbt should look for different types of files. 10 | # The `model-paths` config, for example, states that models in this project can be 11 | # found in the "models/" directory. You probably won't need to change these! 12 | model-paths: ["models"] 13 | analysis-paths: ["analyses"] 14 | test-paths: ["tests"] 15 | seed-paths: ["seeds"] 16 | macro-paths: ["macros"] 17 | snapshot-paths: ["snapshots"] 18 | target-path: "target" # directory which will store compiled SQL files 19 | clean-targets: # directories to be removed by `dbt clean` 20 | - "target" 21 | - "dbt_packages" 22 | # Configuring models 23 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 24 | 25 | # In this example config, we tell dbt to build all models in the example/ 26 | # directory as views. These settings can be overridden in the individual model 27 | # files using the `{{ config(...) }}` macro. 
28 | models: 29 | dbt_transform: 30 | # Config indicated by + and applies to all files under models/example/ 31 | recommendations: 32 | +materialized: table 33 | -------------------------------------------------------------------------------- /load_dataset/mysql_datasource.sql: -------------------------------------------------------------------------------- 1 | -- Full load reference database 2 | DROP DATABASE IF EXISTS goodreads; 3 | CREATE DATABASE goodreads; 4 | USE goodreads; 5 | 6 | -- Load books 7 | DROP TABLE IF EXISTS goodreads.book; 8 | CREATE TABLE goodreads.book ( 9 | Id INT NOT NULL, 10 | Name VARCHAR(31), 11 | Authors VARCHAR(31), 12 | ISBN VARCHAR(31), 13 | Rating FLOAT, 14 | PublishYear VARCHAR(31), 15 | PublishMonth INT, 16 | PublishDay INT, 17 | Publisher VARCHAR(31), 18 | RatingDist5 VARCHAR(31), 19 | RatingDist4 VARCHAR(31), 20 | RatingDist3 VARCHAR(31), 21 | RatingDist2 VARCHAR(31), 22 | RatingDist1 VARCHAR(31), 23 | RatingDistTotal VARCHAR(31), 24 | CountsOfReview INT, 25 | Language VARCHAR(7), 26 | Description TEXT, 27 | `Count of text reviews` INT, 28 | PagesNumber INT, 29 | PRIMARY KEY (Id) 30 | ); 31 | 32 | -- Load genre 33 | DROP TABLE IF EXISTS goodreads.genre; 34 | CREATE TABLE goodreads.genre ( 35 | Id INT NOT NULL AUTO_INCREMENT, 36 | Name VARCHAR(255) UNIQUE, 37 | PRIMARY KEY (Id) 38 | ); 39 | 40 | -- Load book_genre 41 | DROP TABLE IF EXISTS goodreads.book_genre; 42 | CREATE TABLE goodreads.book_genre ( 43 | BookISBN VARCHAR(31) NOT NULL, 44 | GenreId INT NOT NULL, 45 | PRIMARY KEY (BookISBN, GenreId) 46 | ); 47 | 48 | -- Load book_download_link 49 | DROP TABLE IF EXISTS goodreads.book_download_link; 50 | CREATE TABLE goodreads.book_download_link ( 51 | BookISBN VARCHAR(31) NOT NULL UNIQUE, 52 | Link VARCHAR(255) NOT NULL, 53 | PRIMARY KEY (BookISBN, Link) 54 | ); 55 | -------------------------------------------------------------------------------- /env.template: -------------------------------------------------------------------------------- 1 | # MySQL 2 | MYSQL_HOST=de_mysql 3 | MYSQL_PORT=3306 4 | MYSQL_DATABASE=goodreads 5 | MYSQL_USER= 6 | MYSQL_PASSWORD= 7 | MYSQL_ROOT_PASSWORD= 8 | 9 | # PostgreSQL 10 | POSTGRES_HOST=de_psql 11 | POSTGRES_PORT=5432 12 | POSTGRES_USER= 13 | POSTGRES_PASSWORD= 14 | POSTGRES_DB=goodreads 15 | POSTGRES_HOST_AUTH_METHOD=trust 16 | 17 | # Google Drive 18 | GDRIVE_CLIENT_SECRET_FILE=client_secret.json 19 | GDRIVE_PICKLE_FILE=token_drive_v3.pickle 20 | GDRIVE_API_NAME=drive 21 | GDRIVE_API_VERSION=v3 22 | GDRIVE_SCOPES=https://www.googleapis.com/auth/drive.readonly 23 | 24 | # Dagster 25 | DAGSTER_PG_HOSTNAME=de_psql 26 | DAGSTER_PG_USERNAME= 27 | DAGSTER_PG_PASSWORD= 28 | DAGSTER_PG_DB= 29 | DAGSTER_OVERALL_CONCURRENCY_LIMIT=1 30 | DAGSTER_HOME=/opt/dagster/dagster_home 31 | 32 | # dbt 33 | DBT_HOST=de_psql 34 | DBT_USER= 35 | DBT_PASSWORD= 36 | DBT_DATABASE=goodreads 37 | DBT_SCHEMA=recommendations 38 | 39 | # MinIO 40 | MINIO_ENDPOINT=minio:9000 41 | MINIO_ROOT_USER= 42 | MINIO_ROOT_PASSWORD= 43 | MINIO_ACCESS_KEY= 44 | MINIO_SECRET_KEY= 45 | DATALAKE_BUCKET=lakehouse 46 | AWS_ACCESS_KEY_ID= 47 | AWS_SECRET_ACCESS_KEY= 48 | AWS_REGION= 49 | 50 | # MinIO client (mc) 51 | AWS_ACCESS_KEY_ID= 52 | AWS_SECRET_ACCESS_KEY= 53 | AWS_REGION= 54 | 55 | # Spark 56 | SPARK_MASTER_URL=spark://spark-master:7077 57 | SPARK_VERSION=3.3.2 58 | HADOOP_VERSION=3 59 | 60 | # Metabase 61 | MB_DB_TYPE=postgres 62 | MB_DB_DBNAME=goodreads 63 | MB_DB_PORT=5432 64 | MB_DB_USER= 65 | MB_DB_PASS= 66 | MB_DB_HOST=de_psql 67 | 
MB_DB_FILE=/metabase_data/metabase.db 68 | -------------------------------------------------------------------------------- /elt_pipeline/README.md: -------------------------------------------------------------------------------- 1 | # elt_pipeline 2 | 3 | This is a [Dagster](https://dagster.io/) project scaffolded with [`dagster project scaffold`](https://docs.dagster.io/getting-started/create-new-project). 4 | 5 | ## Getting started 6 | 7 | First, install your Dagster code location as a Python package. By using the --editable flag, pip will install your Python package in ["editable mode"](https://pip.pypa.io/en/latest/topics/local-project-installs/#editable-installs) so that as you develop, local code changes will automatically apply. 8 | 9 | ```bash 10 | pip install -e ".[dev]" 11 | ``` 12 | 13 | Then, start the Dagster UI web server: 14 | 15 | ```bash 16 | dagster dev 17 | ``` 18 | 19 | Open http://localhost:3000 with your browser to see the project. 20 | 21 | You can start writing assets in the `elt_pipeline/assets/` package (`bronze.py`, `silver.py`, `gold.py`, `warehouse.py`). The assets are automatically loaded into the Dagster code location as you define them. 22 | 23 | ## Development 24 | 25 | 26 | ### Adding new Python dependencies 27 | 28 | You can specify new Python dependencies in `setup.py`. 29 | 30 | ### Unit testing 31 | 32 | Tests are in the `elt_pipeline_tests` directory and you can run tests using `pytest`: 33 | 34 | ```bash 35 | pytest elt_pipeline_tests 36 | ``` 37 | 38 | ### Schedules and sensors 39 | 40 | If you want to enable Dagster [Schedules](https://docs.dagster.io/concepts/partitions-schedules-sensors/schedules) or [Sensors](https://docs.dagster.io/concepts/partitions-schedules-sensors/sensors) for your jobs, the [Dagster Daemon](https://docs.dagster.io/deployment/dagster-daemon) process must be running. This is done automatically when you run `dagster dev`. 41 | 42 | Once your Dagster Daemon is running, you can start turning on schedules and sensors for your jobs. 43 | 44 | ## Deploy on Dagster Cloud 45 | 46 | The easiest way to deploy your Dagster project is to use Dagster Cloud. 47 | 48 | Check out the [Dagster Cloud Documentation](https://docs.dagster.cloud) to learn more.
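The "Schedules and sensors" section above only explains when the daemon is needed. As a hedged sketch (the job name, asset selection, and cron string below are hypothetical and not defined anywhere in this repository), a daily schedule over all assets could be registered roughly like this and handed to the Definitions object in elt_pipeline/__init__.py:

from dagster import AssetSelection, ScheduleDefinition, define_asset_job

# Hypothetical job covering every asset in the code location
daily_refresh_job = define_asset_job("daily_refresh_job", selection=AssetSelection.all())

# Trigger the job every day at 02:00; requires the dagster-daemon container to be running
daily_refresh_schedule = ScheduleDefinition(job=daily_refresh_job, cron_schedule="0 2 * * *")

# Then pass both to the existing Definitions(...) call:
# Definitions(assets=..., resources=..., jobs=[daily_refresh_job], schedules=[daily_refresh_schedule])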
49 | -------------------------------------------------------------------------------- /tree.txt: -------------------------------------------------------------------------------- 1 | ├── app 2 | │   ├── .streamlit 3 | │   │   └── secrets.toml 4 | │   └── streamlit_app.py 5 | ├── dagster_home 6 | │   ├── dagster.yaml 7 | │   └── workspace.yaml 8 | ├── dataset 9 | │   ├── book_download_link.csv 10 | │   ├── book_full.csv 11 | │   ├── book_genre.csv 12 | │   ├── genre.csv 13 | │   └── my_book.csv 14 | ├── docker-compose.yml 15 | ├── dockerimages 16 | │   ├── dagster 17 | │   │   ├── Dockerfile 18 | │   │   └── requirements.txt 19 | │   ├── spark 20 | │   │   ├── Dockerfile 21 | │   │   └── spark-defaults.conf 22 | │   └── streamlit 23 | │   ├── Dockerfile 24 | │   └── requirements.txt 25 | ├── EDA.ipynb 26 | ├── elt_pipeline 27 | │   ├── dbt_transform 28 | │   │   ├── config 29 | │   │   │   ├── profiles.yml 30 | │   │   ├── dbt_packages 31 | │   │   ├── dbt_project.yml 32 | │   │   ├── models 33 | │   │   │   ├── recommendations 34 | │   │   │   │   ├── criteria.sql 35 | │   │   │   │   ├── search_prior.sql 36 | │   │   │   │   └── search.sql 37 | │   │   │   ├── schema.yml 38 | │   │   │   └── sources.yml 39 | │   │   │   ├── manifest.json 40 | │   ├── Dockerfile 41 | │   ├── elt_pipeline 42 | │   │   ├── assets 43 | │   │   │   ├── bronze.py 44 | │   │   │   ├── gold.py 45 | │   │   │   ├── silver.py 46 | │   │   │   └── warehouse.py 47 | │   │   ├── client_secret.json 48 | │   │   ├── resources 49 | │   │   │   ├── gdrive_io_manager.py 50 | │   │   │   ├── minio_io_manager.py 51 | │   │   │   ├── mysql_io_manager.py 52 | │   │   │   ├── psql_io_manager.py 53 | │   │   │   └── spark_io_manager.py 54 | │   │   └── token_drive_v3.pickle 55 | │   ├── pyproject.toml 56 | │   ├── requirements.txt 57 | │   ├── setup.cfg 58 | │   └── setup.py 59 | ├── .env 60 | ├── env.template 61 | ├── .git 62 | ├── .gitignore 63 | ├── load_dataset 64 | │   ├── mysql_datasource.sql 65 | │   ├── mysql_load.sql 66 | │   └── psql_datasource.sql 67 | ├── Makefile 68 | ├── pg_hba.conf 69 | ├── Pipfile 70 | ├── Pipfile.lock 71 | ├── README.md 72 | ├── requirements.txt 73 | ├── .spark_master.env 74 | ├── spark_master.env.template 75 | ├── .spark_session.txt 76 | ├── spark_workder.env.template 77 | ├── .spark_worker.env 78 | -------------------------------------------------------------------------------- /elt_pipeline/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.16-slim 2 | 3 | # Install spark and java 4 | ARG openjdk_version="17" 5 | 6 | RUN apt-get update --yes && \ 7 | apt-get install --yes curl "openjdk-${openjdk_version}-jre-headless" ca-certificates-java procps && \ 8 | apt-get clean && rm -rf /var/lib/apt/lists/* 9 | 10 | # Download spark neccessary jars 11 | RUN curl -O https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz \ 12 | && tar zxvf spark-3.3.2-bin-hadoop3.tgz \ 13 | && rm -rf spark-3.3.2-bin-hadoop3.tgz \ 14 | && mv spark-3.3.2-bin-hadoop3/ /usr/local/ \ 15 | && rm -rf /usr/local/spark \ 16 | && rm -rf /usr/local/spark-3.3.0-bin-hadoop3 \ 17 | && ln -s /usr/local/spark-3.3.2-bin-hadoop3 /usr/local/spark 18 | 19 | RUN curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \ 20 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \ 21 | && curl -O 
https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar \ 22 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \ 23 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \ 24 | && curl -O https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.2/hadoop-aws-3.3.2.jar \ 25 | && mv s3-2.18.41.jar /usr/local/spark/jars \ 26 | && mv aws-java-sdk-1.12.367.jar /usr/local/spark/jars \ 27 | && mv aws-java-sdk-bundle-1.11.1026.jar /usr/local/spark/jars \ 28 | && mv delta-core_2.12-2.2.0.jar /usr/local/spark/jars \ 29 | && mv delta-storage-2.2.0.jar /usr/local/spark/jars \ 30 | && mv hadoop-aws-3.3.2.jar /usr/local/spark/jars 31 | 32 | # Add repository code 33 | WORKDIR /opt/dagster/app/elt_pipeline 34 | COPY requirements.txt /opt/dagster/app/elt_pipeline 35 | RUN pip install --upgrade pip && pip install -r requirements.txt 36 | COPY . /opt/dagster/app/elt_pipeline 37 | 38 | # CMD allows this to be overridden from run launchers or executors that want to run other commands against your repository 39 | CMD ["dagster", "api", "grpc", "-h", "0.0.0.0", "-p", "4000", "-m", "elt_pipeline"] 40 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | include .env 2 | 3 | install: 4 | python3 -V \ 5 | && python3 -m venv venv \ 6 | && . venv/bin/activate \ 7 | && pip install --upgrade pip && pip install -r requirements.txt 8 | 9 | check: 10 | black ./elt_pipeline --check 11 | 12 | lint: 13 | flake8 ./elt_pipeline 14 | 15 | test: 16 | docker exec elt_pipeline python -m pytest -vv --cov=utils tests/utils \ 17 | && docker exec elt_pipeline python -m pytest -vv --cov=ops tests/ops 18 | 19 | pull: 20 | docker compose pull 21 | 22 | build: 23 | docker compose build 24 | 25 | build-dagster: 26 | docker build -t de_dagster:latest ./dockerimages/dagster 27 | 28 | build-spark: 29 | docker build -t spark_master:latest ./dockerimages/spark 30 | 31 | build-pipeline: 32 | docker build -t elt_pipeline:latest ./elt_pipeline 33 | 34 | build-streamlit: 35 | docker build -t de_streamlit:latest ./dockerimages/streamlit 36 | 37 | up-bg: 38 | docker compose --env-file .env up -d 39 | 40 | up: 41 | docker compose --env-file .env up 42 | 43 | down: 44 | docker compose --env-file .env down 45 | 46 | restart-bg: 47 | docker compose --env-file .env down && docker compose --env-file .env up -d 48 | 49 | restart: 50 | docker compose --env-file .env down && docker compose --env-file .env up 51 | 52 | to_mysql: 53 | docker exec -it de_mysql mysql -u"${MYSQL_USER}" -p"${MYSQL_PASSWORD}" ${MYSQL_DATABASE} 54 | 55 | to_mysql_root: 56 | docker exec -it de_mysql mysql -u"root" -p"${MYSQL_ROOT_PASSWORD}" ${MYSQL_DATABASE} 57 | 58 | mysql_create: 59 | docker exec -it de_mysql mysql --local_infile -u"${MYSQL_USER}" -p"${MYSQL_PASSWORD}" ${MYSQL_DATABASE} -e"source /tmp/load_dataset/mysql_datasource.sql" 60 | 61 | mysql_load: 62 | docker exec -it de_mysql mysql --local_infile -u"${MYSQL_USER}" -p"${MYSQL_PASSWORD}" ${MYSQL_DATABASE} -e"source /tmp/load_dataset/mysql_load.sql" 63 | 64 | to_psql: 65 | docker exec -it de_psql psql postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB} 66 | 67 | to_psql_no_db: 68 | docker exec -it de_psql psql postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/postgres 69 | 
70 | psql_create: 71 | docker exec -it de_psql psql postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB} -f /tmp/load_dataset/psql_datasource.sql -a 72 | -------------------------------------------------------------------------------- /elt_pipeline/logs/dbt.log: -------------------------------------------------------------------------------- 1 | 2 | 3 | ============================== 2023-04-15 04:45:59.961967 | 6fcecef0-8da6-4cbb-8428-1f02f0660fa9 ============================== 4 | 04:45:59.961967 [info ] [MainThread]: Running with dbt=1.4.5 5 | 04:45:59.963549 [debug] [MainThread]: running dbt with arguments {'write_json': True, 'use_colors': True, 'printer_width': 80, 'version_check': True, 'partial_parse': True, 'static_parser': True, 'profiles_dir': '/home/lelouvincx/Documents/FDE02/project/elt_pipeline', 'send_anonymous_usage_stats': True, 'quiet': False, 'no_print': False, 'cache_selected_only': False, 'skip_profile_setup': False, 'which': 'init', 'indirect_selection': 'eager'} 6 | 04:45:59.963829 [debug] [MainThread]: Tracking: tracking 7 | 04:45:59.965225 [debug] [MainThread]: Sending event: {'category': 'dbt', 'action': 'invocation', 'label': 'start', 'context': [, , ]} 8 | 04:46:07.900335 [debug] [MainThread]: Starter project path: /home/lelouvincx/.local/share/virtualenvs/project-zhl6RxJh/lib/python3.10/site-packages/dbt/include/starter_project 9 | 04:46:13.514941 [info ] [MainThread]: Profile dbt_transform written to /home/lelouvincx/Documents/FDE02/project/elt_pipeline/profiles.yml using target's sample configuration. Once updated, you'll be able to start developing with dbt. 10 | 04:46:13.515348 [info ] [MainThread]: 11 | Your new dbt project "dbt_transform" was created! 12 | 13 | For more information on how to configure the profiles.yml file, 14 | please consult the dbt documentation here: 15 | 16 | https://docs.getdbt.com/docs/configure-your-profile 17 | 18 | One more thing: 19 | 20 | Need help? Don't hesitate to reach out to us via GitHub issues or on Slack: 21 | 22 | https://community.getdbt.com/ 23 | 24 | Happy modeling! 25 | 26 | 04:46:13.515755 [debug] [MainThread]: Sending event: {'category': 'dbt', 'action': 'invocation', 'label': 'end', 'context': [, , ]} 27 | 04:46:13.516131 [debug] [MainThread]: Flushing usage events 28 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from dagster import Definitions, load_assets_from_modules, file_relative_path 2 | from dagster_dbt import dbt_cli_resource 3 | import os 4 | 5 | from . 
import assets 6 | from .resources.mysql_io_manager import MySQLIOManager 7 | from .resources.minio_io_manager import MinIOIOManager 8 | from .resources.gdrive_io_manager import GDriveIOManager 9 | from .resources.spark_io_manager import SparkIOManager 10 | from .resources.psql_io_manager import PostgreSQLIOManager 11 | 12 | 13 | MYSQL_CONFIG = { 14 | "host": os.getenv("MYSQL_HOST"), 15 | "port": os.getenv("MYSQL_PORT"), 16 | "database": os.getenv("MYSQL_DATABASE"), 17 | "user": os.getenv("MYSQL_USER"), 18 | "password": os.getenv("MYSQL_PASSWORD"), 19 | } 20 | 21 | MINIO_CONFIG = { 22 | "bucket": os.getenv("DATALAKE_BUCKET"), 23 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 24 | "minio_access_key": os.getenv("MINIO_ACCESS_KEY"), 25 | "minio_secret_key": os.getenv("MINIO_SECRET_KEY"), 26 | } 27 | 28 | GDRIVE_CONFIG = { 29 | "client_secret_file": os.path.join( 30 | os.getcwd(), str(os.getenv("GDRIVE_CLIENT_SECRET_FILE")) 31 | ), 32 | "pickle_file": os.path.join( 33 | os.getcwd(), "elt_pipeline", str(os.getenv("GDRIVE_PICKLE_FILE")) 34 | ), 35 | "api_name": os.getenv("GDRIVE_API_NAME"), 36 | "api_version": os.getenv("GDRIVE_API_VERSION"), 37 | "scopes": os.getenv("GDRIVE_SCOPES"), 38 | "bucket": os.getenv("DATALAKE_BUCKET"), 39 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 40 | "minio_access_key": os.getenv("MINIO_ACCESS_KEY"), 41 | "minio_secret_key": os.getenv("MINIO_SECRET_KEY"), 42 | } 43 | 44 | SPARK_CONFIG = { 45 | "spark_master": os.getenv("SPARK_MASTER_URL"), 46 | "spark_version": os.getenv("SPARK_VERSION"), 47 | "hadoop_version": os.getenv("HADOOP_VERSION"), 48 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 49 | "minio_access_key": os.getenv("MINIO_ACCESS_KEY"), 50 | "minio_secret_key": os.getenv("MINIO_SECRET_KEY"), 51 | } 52 | 53 | PSQL_CONFIG = { 54 | "host": os.getenv("POSTGRES_HOST"), 55 | "port": os.getenv("POSTGRES_PORT"), 56 | "database": os.getenv("POSTGRES_DB"), 57 | "user": os.getenv("POSTGRES_USER"), 58 | "password": os.getenv("POSTGRES_PASSWORD"), 59 | } 60 | 61 | DBT_PROJECT_PATH = file_relative_path(__file__, "../dbt_transform") 62 | DBT_PROFILES = file_relative_path(__file__, "../dbt_transform/config") 63 | 64 | 65 | resources = { 66 | "mysql_io_manager": MySQLIOManager(MYSQL_CONFIG), 67 | "minio_io_manager": MinIOIOManager(MINIO_CONFIG), 68 | "gdrive_io_manager": GDriveIOManager(GDRIVE_CONFIG), 69 | "spark_io_manager": SparkIOManager(SPARK_CONFIG), 70 | "psql_io_manager": PostgreSQLIOManager(PSQL_CONFIG), 71 | "dbt": dbt_cli_resource.configured( 72 | { 73 | "project_dir": DBT_PROJECT_PATH, 74 | "profiles_dir": DBT_PROFILES, 75 | } 76 | ), 77 | } 78 | 79 | defs = Definitions( 80 | assets=load_assets_from_modules([assets]), 81 | resources=resources, 82 | ) 83 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | services: 3 | de_mysql: 4 | image: mysql:8.0 5 | container_name: de_mysql 6 | volumes: 7 | - ./storage/mysql_data:/var/lib/mysql 8 | - ./dataset:/tmp/dataset 9 | - ./load_dataset:/tmp/load_dataset 10 | ports: 11 | - "3306:3306" 12 | env_file: .env 13 | networks: 14 | - de_network 15 | # minio: 16 | # hostname: minio 17 | # image: quay.io/minio/minio:latest 18 | # container_name: minio 19 | # command: ["server", "/data", "--console-address", ":9001"] 20 | # volumes: 21 | # - minio:/data 22 | # ports: 23 | # - "9001:9001" 24 | # - "9000:9000" 25 | # env_file: .env 26 | # networks: 27 | # - de_network 28 | # mc: 
29 | # image: minio/mc 30 | # container_name: mc 31 | # hostname: mc 32 | # env_file: .env 33 | # entrypoint: /bin/sh -c " until (/usr/bin/mc config host add minio http://minio:9000 34 | # minio minio123) do echo '...waiting...' && sleep 1; done; /usr/bin/mc mb minio/lakehouse; 35 | # /usr/bin/mc policy set public minio/lakehouse; exit 0; " 36 | # depends_on: 37 | # - minio 38 | # networks: 39 | # - de_network 40 | # elt_pipeline: 41 | # build: 42 | # context: ./elt_pipeline 43 | # dockerfile: Dockerfile 44 | # image: elt_pipeline:latest 45 | # container_name: elt_pipeline 46 | # volumes: 47 | # - ./elt_pipeline:/opt/dagster/app/elt_pipeline 48 | # env_file: .env 49 | # ports: 50 | # - "4040:4040" 51 | # networks: 52 | # - de_network 53 | # de_dagster_dagit: 54 | # image: de_dagster:latest 55 | # entrypoint: 56 | # - dagit 57 | # - -h 58 | # - "0.0.0.0" 59 | # - -p 60 | # - "3001" 61 | # - -w 62 | # - workspace.yaml 63 | # container_name: de_dagster_dagit 64 | # volumes: 65 | # - /var/run/docker.sock:/var/run/docker.sock 66 | # - ./dagster_home:/opt/dagster/dagster_home 67 | # ports: 68 | # - "3001:3001" 69 | # env_file: .env 70 | # networks: 71 | # - de_network 72 | # de_dagster_daemon: 73 | # image: de_dagster:latest 74 | # entrypoint: 75 | # - dagster-daemon 76 | # - run 77 | # container_name: de_dagster_daemon 78 | # volumes: 79 | # - /var/run/docker.sock:/var/run/docker.sock 80 | # - ./dagster_home:/opt/dagster/dagster_home 81 | # env_file: .env 82 | # networks: 83 | # - de_network 84 | # spark-master: 85 | # build: 86 | # context: ./dockerimages/spark 87 | # dockerfile: Dockerfile 88 | # image: spark_master:latest 89 | # container_name: spark-master 90 | # volumes: 91 | # - ./dockerimages/spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf 92 | # expose: 93 | # - "7077" 94 | # ports: 95 | # - "7077:7077" 96 | # - "8080:8080" 97 | # env_file: .spark_master.env 98 | # networks: 99 | # - de_network 100 | # spark-worker: 101 | # image: docker.io/bitnami/spark:3.3.2 102 | # env_file: .spark_worker.env 103 | # deploy: 104 | # replicas: 2 105 | # networks: 106 | # - de_network 107 | # de_psql: 108 | # image: postgres:14-alpine 109 | # container_name: de_psql 110 | # volumes: 111 | # - ./storage/postgres_data:/var/lib/postgresql/data 112 | # - ./pg_hba.conf:/tmp/pg_hba.conf 113 | # - ./load_dataset:/tmp/load_dataset 114 | # command: ["postgres", "-c", "hba_file=/tmp/pg_hba.conf"] 115 | # ports: 116 | # - "5432:5432" 117 | # env_file: .env 118 | # networks: 119 | # - de_network 120 | # de_streamlit: 121 | # build: 122 | # context: ./dockerimages/streamlit 123 | # dockerfile: Dockerfile 124 | # image: de_streamlit:latest 125 | # container_name: de_streamlit 126 | # volumes: 127 | # - ./app:/app 128 | # env_file: .env 129 | # ports: 130 | # - "8501:8501" 131 | # networks: 132 | # - de_network 133 | # de_metabase: 134 | # image: metabase/metabase:latest 135 | # container_name: de_metabase 136 | # volumes: 137 | # - ./storage/metabase_data:/metabase_data 138 | # ports: 139 | # - "3030:3000" 140 | # env_file: .env 141 | # networks: 142 | # - de_network 143 | networks: 144 | de_network: 145 | driver: bridge 146 | name: de_network 147 | volumes: 148 | minio: {} 149 | storage: {} 150 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/resources/minio_io_manager.py: -------------------------------------------------------------------------------- 1 | from dagster import IOManager, OutputContext, InputContext 2 | from minio 
import Minio 3 | import polars as pl 4 | 5 | from contextlib import contextmanager 6 | from datetime import datetime 7 | from typing import Union 8 | import os 9 | 10 | 11 | @contextmanager 12 | def connect_minio(config): 13 | client = Minio( 14 | endpoint=config.get("endpoint_url"), 15 | access_key=config.get("minio_access_key"), 16 | secret_key=config.get("minio_secret_key"), 17 | secure=False, 18 | ) 19 | 20 | try: 21 | yield client 22 | except Exception as e: 23 | raise e 24 | 25 | 26 | # Make bucket if not exists 27 | def make_bucket(client: Minio, bucket_name): 28 | found = client.bucket_exists(bucket_name) 29 | if not found: 30 | client.make_bucket(bucket_name) 31 | else: 32 | print(f"Bucket {bucket_name} already exists.") 33 | 34 | 35 | class MinIOIOManager(IOManager): 36 | def __init__(self, config): 37 | self._config = config 38 | 39 | def _get_path(self, context: Union[InputContext, OutputContext]): 40 | """ 41 | Returns (key_name, tmp_file_path) where key_name is the path to the file in minIO 42 | and tmp_file_path is the path to the temp file in local disk, which will 43 | be uploaded to minIO and then deleted after the upload is done. 44 | """ 45 | # E.g context.asset_key.path: ['bronze', 'goodreads', 'book'] 46 | layer, schema, table = context.asset_key.path 47 | # NOTE: E.g: bronze/goodreads/book 48 | key = "/".join([layer, schema, table.replace(f"{layer}_", "")]) 49 | # E.g /tmp/file_bronze_goodreads_book_20210101000000.parquet 50 | tmp_file_path = "/tmp/file_{}_{}.parquet".format( 51 | "_".join(context.asset_key.path), datetime.today().strftime("%Y%m%d%H%M%S") 52 | ) # Partition by year 53 | 54 | if context.has_partition_key: 55 | # E.g partition_str: book_2021 56 | partition_str = str(table) + "_" + context.asset_partition_key 57 | # E.g key_name: bronze/goodreads/book/book_2021.parquet 58 | # tmp_file_path: /tmp/file_bronze_goodreads_book_20210101000000.parquet 59 | return os.path.join(key, f"{partition_str}.parquet"), tmp_file_path 60 | else: 61 | # E.g key_name: bronze/goodreads/book.parquet 62 | return f"{key}.parquet", tmp_file_path 63 | 64 | def handle_output(self, context: "OutputContext", obj: pl.DataFrame): 65 | """ 66 | Receives output from upstream asset, 67 | and converts to parquet format and upload to minIO. 
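Keys follow the layout <layer>/<schema>/<table>.parquet, or <layer>/<schema>/<table>/<table>_<partition>.parquet when the asset is partitioned (see _get_path above).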
68 | """ 69 | 70 | key_name, tmp_file_path = self._get_path(context) 71 | 72 | # Convert from polars DataFrame to parquet format 73 | obj.write_parquet(tmp_file_path) 74 | 75 | # Upload file to minIO 76 | try: 77 | bucket_name = self._config.get("bucket") 78 | with connect_minio(self._config) as client: 79 | # Make bucket if not exist 80 | make_bucket(client, bucket_name) 81 | 82 | # Upload file to minIO 83 | # E.g bucket_name: lakehouse, 84 | # key_name: bronze/goodreads/book/book_2021.parquet, 85 | # tmp_file_path: /tmp/file_bronze_goodreads_book_20210101000000.parquet 86 | client.fput_object(bucket_name, key_name, tmp_file_path) 87 | context.log.info( 88 | f"(MinIO handle_output) Number of rows and columns: {obj.shape}" 89 | ) 90 | context.add_output_metadata({"path": key_name, "tmp": tmp_file_path}) 91 | 92 | # Clean up tmp file 93 | os.remove(tmp_file_path) 94 | except Exception as e: 95 | raise e 96 | 97 | def load_input(self, context: "InputContext") -> pl.DataFrame: 98 | """ 99 | Prepares input for downstream asset, 100 | and downloads parquet file from minIO and converts to polars DataFrame 101 | """ 102 | 103 | bucket_name = self._config.get("bucket") 104 | key_name, tmp_file_path = self._get_path(context) 105 | 106 | try: 107 | with connect_minio(self._config) as client: 108 | # Make bucket if not exist 109 | make_bucket(client=client, bucket_name=bucket_name) 110 | 111 | # E.g bucket_name: lakehouse, 112 | # key_name: bronze/goodreads/book/book_2021.parquet, 113 | # tmp_file_path: /tmp/file_bronze_goodreads_book_20210101000000.parquet 114 | context.log.info(f"(MinIO load_input) from key_name: {key_name}") 115 | client.fget_object(bucket_name, key_name, tmp_file_path) 116 | df_data = pl.read_parquet(tmp_file_path) 117 | context.log.info( 118 | f"(MinIO load_input) Got polars dataframe with shape: {df_data.shape}" 119 | ) 120 | 121 | return df_data 122 | except Exception as e: 123 | raise e 124 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/resources/spark_io_manager.py: -------------------------------------------------------------------------------- 1 | from dagster import IOManager, InputContext, OutputContext 2 | from pyspark.sql import SparkSession, DataFrame 3 | 4 | from contextlib import contextmanager 5 | from datetime import datetime 6 | 7 | 8 | @contextmanager 9 | def get_spark_session(config, run_id="Spark IO Manager"): 10 | executor_memory = "1g" if run_id != "Spark IO Manager" else "1500m" 11 | try: 12 | spark = ( 13 | SparkSession.builder.master("spark://spark-master:7077") 14 | .appName(run_id) 15 | .config("spark.driver.memory", "4g") 16 | .config("spark.executor.memory", executor_memory) 17 | .config("spark.cores.max", "4") 18 | .config("spark.executor.cores", "2") 19 | .config( 20 | "spark.jars", 21 | "/usr/local/spark/jars/delta-core_2.12-2.2.0.jar,/usr/local/spark/jars/hadoop-aws-3.3.2.jar,/usr/local/spark/jars/delta-storage-2.2.0.jar,/usr/local/spark/jars/aws-java-sdk-1.12.367.jar,/usr/local/spark/jars/s3-2.18.41.jar,/usr/local/spark/jars/aws-java-sdk-bundle-1.11.1026.jar", 22 | ) 23 | .config( 24 | "spark.sql.catalog.spark_catalog", 25 | "org.apache.spark.sql.delta.catalog.DeltaCatalog", 26 | ) 27 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 28 | .config("spark.hadoop.fs.s3a.endpoint", f"http://{config['endpoint_url']}") 29 | .config("spark.hadoop.fs.s3a.access.key", str(config["minio_access_key"])) 30 | .config("spark.hadoop.fs.s3a.secret.key", 
str(config["minio_secret_key"])) 31 | .config("spark.hadoop.fs.s3a.path.style.access", "true") 32 | .config("spark.hadoop.fs.connection.ssl.enabled", "false") 33 | .config( 34 | "spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem" 35 | ) 36 | .config("spark.sql.execution.arrow.pyspark.enabled", "true") 37 | .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true") 38 | .getOrCreate() 39 | ) 40 | yield spark 41 | except Exception as e: 42 | raise Exception(f"Error while creating spark session: {e}") 43 | 44 | 45 | class SparkIOManager(IOManager): 46 | def __init__(self, config): 47 | self._config = config 48 | 49 | def handle_output(self, context: "OutputContext", obj: DataFrame): 50 | """ 51 | Write output to s3a (aka minIO) as parquet file 52 | """ 53 | 54 | context.log.debug("(Spark handle_output) Writing output to MinIO ...") 55 | 56 | # E.g file_path: s3a://lakehouse/silver/goodreads/book/book_2021.parquet 57 | # Or file_path: s3a://lakehouse/silver/goodreads/book.parquet if full load 58 | file_path = "s3a://lakehouse/" + "/".join(context.asset_key.path) 59 | if context.has_partition_key: 60 | file_path += f"/book_{context.partition_key}" 61 | file_path += ".parquet" 62 | context.log.debug(f"(Spark handle_output) File path: {file_path}") 63 | file_name = str(context.asset_key.path[-1]) 64 | context.log.debug(f"(Spark handle_output) File name: {file_name}") 65 | 66 | try: 67 | obj.write.mode("overwrite").parquet(file_path) 68 | context.log.debug(f"Saved {file_name} to {file_path}") 69 | except Exception as e: 70 | raise Exception(f"(Spark handle_output) Error while writing output: {e}") 71 | 72 | def load_input(self, context: "InputContext") -> DataFrame: 73 | """ 74 | Load input from s3a (aka minIO) from parquet file to spark.sql.DataFrame 75 | """ 76 | 77 | # E.g context.asset_key.path: ['silver', 'goodreads', 'book'] 78 | context.log.debug(f"Loading input from {context.asset_key.path}...") 79 | file_path = "s3a://lakehouse/" + "/".join(context.asset_key.path) 80 | if context.has_partition_key: 81 | file_path += f"/book_{context.partition_key}" 82 | full_load = (context.metadata or {}).get("full_load", False) 83 | if not full_load: 84 | file_path += ".parquet" 85 | # E.g file_path: s3a://lakehouse/silver/goodreads/book/book_2021.parquet 86 | # Or file_path: s3a://lakehouse/silver/goodreads/book if has partitions 87 | context.log.debug("File path: " + file_path) 88 | 89 | try: 90 | with get_spark_session(self._config) as spark: 91 | df = None 92 | if full_load: 93 | tmp_df = spark.read.parquet(file_path + "/book_2022.parquet") 94 | book_schema = tmp_df.schema 95 | df = ( 96 | spark.read.format("parquet") 97 | .options(header=True, inferSchema=False) 98 | .schema(book_schema) 99 | .load(file_path + "/*.parquet") 100 | ) 101 | else: 102 | df = spark.read.parquet(file_path) 103 | context.log.debug(f"Loaded {df.count()} rows from {file_path}") 104 | return df 105 | except Exception as e: 106 | raise Exception(f"Error while loading input: {e}") 107 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/resources/psql_io_manager.py: -------------------------------------------------------------------------------- 1 | from dagster import IOManager, InputContext, OutputContext 2 | from contextlib import contextmanager 3 | import polars as pl 4 | from datetime import datetime 5 | import psycopg2 6 | from psycopg2 import sql 7 | import psycopg2.extras 8 | 9 | 10 | @contextmanager 11 | def connect_psql(config): 12 | 
try: 13 | yield psycopg2.connect( 14 | host=config["host"], 15 | port=config["port"], 16 | database=config["database"], 17 | user=config["user"], 18 | password=config["password"], 19 | ) 20 | except (Exception) as e: 21 | print(f"Error while connecting to PostgreSQL: {e}") 22 | 23 | 24 | class PostgreSQLIOManager(IOManager): 25 | def __init__(self, config): 26 | self._config = config 27 | 28 | def handle_output(self, context: "OutputContext", obj: pl.DataFrame): 29 | # E.g context.asset_key.path = ['warehouse', 'gold', 'book_genre'] 30 | schema = context.asset_key.path[-2] 31 | # NOTE: Replace pattern is 'warehouse', not general 32 | table = str(context.asset_key.path[-1]).replace("warehouse_", "") 33 | context.log.debug(f"Schema: {schema}, Table: {table}") 34 | tmp_tbl = f"{table}_tmp_{datetime.now().strftime('%Y_%m_%d')}" 35 | try: 36 | with connect_psql(self._config) as conn: 37 | context.log.debug(f"Connected to PostgreSQL: {conn}") 38 | primary_keys = (context.metadata or {}).get("primary_keys", []) 39 | context.log.debug(f"Primary keys: {primary_keys}") 40 | 41 | with conn.cursor() as cursor: 42 | context.log.debug(f"Cursor info: {cursor}") 43 | cursor.execute("SELECT version()") 44 | context.log.info(f"PostgreSQL version: {cursor.fetchone()}") 45 | # Create temp file 46 | cursor.execute( 47 | f"CREATE TEMP TABLE IF NOT EXISTS {tmp_tbl} (LIKE {schema}.{table})" 48 | ) 49 | cursor.execute(f"SELECT COUNT(*) FROM {tmp_tbl}") 50 | context.log.debug( 51 | f"Log for creating temp table: {cursor.fetchone()}" 52 | ) 53 | # Create sql identifiers for the column names 54 | # Do this to safely insert into a sql query 55 | columns = sql.SQL(",").join( 56 | sql.Identifier(name.lower()) for name in obj.columns 57 | ) 58 | # Create a placeholder for the values. 
These will be filled later 59 | values = sql.SQL(",").join(sql.Placeholder() for _ in obj.columns) 60 | # Create the insert query 61 | context.log.debug("Inserting data into temp table") 62 | insert_query = sql.SQL("INSERT INTO {} ({}) VALUES({});").format( 63 | sql.Identifier(tmp_tbl), columns, values 64 | ) 65 | # Execute the insert query 66 | psycopg2.extras.execute_batch(cursor, insert_query, obj.rows()) 67 | conn.commit() 68 | 69 | # Check data inserted 70 | context.log.debug("Checking data inserted") 71 | cursor.execute(f"SELECT COUNT(*) FROM {tmp_tbl};") 72 | context.log.info(f"Number of rows inserted: {cursor.fetchone()}") 73 | # Upsert data 74 | if len(primary_keys) > 0: 75 | context.log.debug("Table has primary keys, upserting data") 76 | conditions = " AND ".join( 77 | [ 78 | f""" {schema}.{table}."{k}" = {tmp_tbl}."{k}" """ 79 | for k in primary_keys 80 | ] 81 | ) 82 | command = f""" 83 | BEGIN TRANSACTION; 84 | DELETE FROM {schema}.{table} 85 | USING {tmp_tbl} 86 | WHERE {conditions}; 87 | 88 | INSERT INTO {schema}.{table} 89 | SELECT * FROM {tmp_tbl}; 90 | 91 | END TRANSACTION; 92 | """ 93 | else: 94 | context.log.debug("Table has no primary keys, replacing data") 95 | command = f""" 96 | BEGIN TRANSACTION; 97 | DELETE FROM {schema}.{table}; 98 | 99 | INSERT INTO {schema}.{table} 100 | SELECT * FROM {tmp_tbl}; 101 | 102 | END TRANSACTION; 103 | """ 104 | 105 | # context.log.debug(f"Command: {command}") 106 | context.log.debug(f"Upserting data into {schema}.{table}") 107 | cursor.execute(command) 108 | context.log.debug(f"{cursor.statusmessage}") 109 | conn.commit() 110 | except (Exception) as e: 111 | print(f"Error while handling output to PostgreSQL: {e}") 112 | 113 | try: 114 | with connect_psql(self._config) as conn: 115 | with conn.cursor() as cursor: 116 | context.log.debug(f"{cursor.fetchone()}") 117 | cursor.execute(f"SELECT COUNT(*) FROM {schema}.{table};") 118 | context.log.info( 119 | f"Number of rows upserted in {schema}.{table}: {cursor.fetchone()}" 120 | ) 121 | 122 | # Drop temp table 123 | cursor.execute(f"DROP TABLE {tmp_tbl}") 124 | conn.commit() 125 | except (Exception) as e: 126 | print(f"Error while testing handle_output to PostgreSQL: {e}") 127 | 128 | def load_input(self, context: "InputContext"): 129 | pass 130 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/assets/bronze.py: -------------------------------------------------------------------------------- 1 | from dagster import asset, AssetIn, Output, StaticPartitionsDefinition 2 | from datetime import datetime 3 | import polars as pl 4 | 5 | 6 | COMPUTE_KIND = "SQL" 7 | LAYER = "bronze" 8 | YEARLY = StaticPartitionsDefinition( 9 | [str(year) for year in range(1975, datetime.today().year)] 10 | ) 11 | 12 | 13 | # genre from my_sql 14 | @asset( 15 | description="Load table 'genre' from MySQL database as polars DataFrame, and save to minIO", 16 | io_manager_key="minio_io_manager", 17 | required_resource_keys={"mysql_io_manager"}, 18 | key_prefix=["bronze", "goodreads"], 19 | compute_kind=COMPUTE_KIND, 20 | group_name=LAYER, 21 | ) 22 | def bronze_genre(context) -> Output[pl.DataFrame]: 23 | query = "SELECT * FROM genre;" 24 | df_data = context.resources.mysql_io_manager.extract_data(query) 25 | context.log.info(f"Table extracted with shape: {df_data.shape}") 26 | 27 | return Output( 28 | value=df_data, 29 | metadata={ 30 | "table": "genre", 31 | "row_count": df_data.shape[0], 32 | "column_count": df_data.shape[1], 33 | "columns": 
df_data.columns, 34 | }, 35 | ) 36 | 37 | 38 | # book from my_sql 39 | @asset( 40 | description="Load table 'book' from MySQL database as polars DataFrame, and save to minIO", 41 | partitions_def=YEARLY, 42 | io_manager_key="minio_io_manager", 43 | required_resource_keys={"mysql_io_manager"}, 44 | key_prefix=["bronze", "goodreads"], 45 | compute_kind=COMPUTE_KIND, 46 | group_name=LAYER, 47 | ) 48 | def bronze_book(context) -> Output[pl.DataFrame]: 49 | query = "SELECT * FROM book" 50 | try: 51 | partion_year_str = context.asset_partition_key_for_output() 52 | partition_by = "PublishYear" 53 | query += f" WHERE {partition_by} = {partion_year_str}" 54 | context.log.info(f"Partition by {partition_by} = {partion_year_str}") 55 | except Exception: 56 | context.log.info("No partition key found, full load data") 57 | 58 | df_data = context.resources.mysql_io_manager.extract_data(query) 59 | context.log.info(f"Table extracted with shape: {df_data.shape}") 60 | 61 | return Output( 62 | value=df_data, 63 | metadata={ 64 | "table": "book", 65 | "row_count": df_data.shape[0], 66 | "column_count": df_data.shape[1], 67 | "columns": df_data.columns, 68 | }, 69 | ) 70 | 71 | 72 | # book_genre from my_sql 73 | @asset( 74 | description="Load table 'book_genre' from MySQL database as polars DataFrame, and save to minIO", 75 | io_manager_key="minio_io_manager", 76 | required_resource_keys={"mysql_io_manager"}, 77 | non_argument_deps={"bronze_book", "bronze_genre"}, 78 | key_prefix=["bronze", "goodreads"], 79 | compute_kind=COMPUTE_KIND, 80 | group_name=LAYER, 81 | ) 82 | def bronze_book_genre(context) -> Output[pl.DataFrame]: 83 | query = "SELECT * FROM book_genre;" 84 | df_data = context.resources.mysql_io_manager.extract_data(query) 85 | context.log.info(f"Table extracted with shape: {df_data.shape}") 86 | 87 | return Output( 88 | value=df_data, 89 | metadata={ 90 | "table": "book_genre", 91 | "row_count": df_data.shape[0], 92 | "column_count": df_data.shape[1], 93 | "columns": df_data.columns, 94 | }, 95 | ) 96 | 97 | 98 | # book_download_link from my_sql 99 | @asset( 100 | description="Load table 'book_download_link' from MySQL database as polars DataFrame, and save to minIO", 101 | io_manager_key="minio_io_manager", 102 | required_resource_keys={"mysql_io_manager"}, 103 | non_argument_deps={"bronze_book"}, 104 | key_prefix=["bronze", "goodreads"], 105 | compute_kind=COMPUTE_KIND, 106 | group_name=LAYER, 107 | ) 108 | def bronze_book_download_link(context) -> Output[pl.DataFrame]: 109 | query = "SELECT * FROM book_download_link;" 110 | df_data = context.resources.mysql_io_manager.extract_data(query) 111 | context.log.info(f"Table extracted with shape: {df_data.shape}") 112 | 113 | return Output( 114 | value=df_data, 115 | metadata={ 116 | "table": "book_download_link", 117 | "row_count": df_data.shape[0], 118 | "column_count": df_data.shape[1], 119 | "columns": df_data.columns, 120 | }, 121 | ) 122 | 123 | 124 | # download files from gdrive, given a download link 125 | @asset( 126 | description="Download image and epub file for books from gdrive, given a download link", 127 | io_manager_key="gdrive_io_manager", 128 | ins={ 129 | "bronze_book_download_link": AssetIn( 130 | key_prefix=["bronze", "goodreads"], 131 | ) 132 | }, 133 | compute_kind="google drive", 134 | group_name=LAYER, 135 | ) 136 | def bronze_images_and_files_download( 137 | context, bronze_book_download_link: pl.DataFrame 138 | ) -> Output[dict]: 139 | """ 140 | From upstream table 'book_download_link', download files from google drive 141 
| with given download link, extract the images and files from it, 142 | then return the path to the folder containing the downloaded files. 143 | """ 144 | 145 | # Create temp folder path, e.g '/tmp/bronze/download/2021-08-01T00:00:00+00:00' -> images|files 146 | # WARN: If change the key_prefix above, also change the path here 147 | tmp_folder_path = f"/tmp/bronze/download/{datetime.now().isoformat()}" 148 | context.log.info(f"Path: {tmp_folder_path}") 149 | 150 | # Download folders by call download_folders function from gdrive_io_manager 151 | context.resources.gdrive_io_manager.download_folders( 152 | context=context, 153 | dataframe=bronze_book_download_link, 154 | tmp_folder_path=tmp_folder_path, 155 | ) 156 | 157 | return Output( 158 | value={ 159 | "tmp_folder_path": tmp_folder_path, 160 | "isbn": bronze_book_download_link["BookISBN"].to_list(), 161 | }, 162 | metadata={ 163 | "isbn": bronze_book_download_link["BookISBN"].to_list(), 164 | "download_link": bronze_book_download_link["Link"].to_list(), 165 | }, 166 | ) 167 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/assets/warehouse.py: -------------------------------------------------------------------------------- 1 | from dagster import ( 2 | asset, 3 | AssetIn, 4 | Output, 5 | StaticPartitionsDefinition, 6 | ) 7 | from pyspark.sql import DataFrame 8 | from datetime import datetime 9 | import pyarrow as pa 10 | import polars as pl 11 | 12 | 13 | COMPUTE_KIND = "Postgres" 14 | LAYER = "warehouse" 15 | YEARLY = StaticPartitionsDefinition( 16 | [str(year) for year in range(1975, datetime.today().year)] 17 | ) 18 | 19 | 20 | # Asset warehouse_book_with_info 21 | @asset( 22 | description="Load book_with_info data from spark to postgres", 23 | ins={ 24 | "gold_book_with_info": AssetIn( 25 | key_prefix=["gold", "goodreads"], 26 | ), 27 | }, 28 | metadata={ 29 | "primary_keys": ["isbn"], 30 | "columns": ["isbn", "name", "authors", "language", "pagesnumber"], 31 | }, 32 | io_manager_key="psql_io_manager", 33 | key_prefix=["gold"], # Database: goodreads, Schema: gold 34 | compute_kind=COMPUTE_KIND, 35 | group_name=LAYER, 36 | ) 37 | def book_with_info(context, gold_book_with_info: DataFrame): 38 | """ 39 | Load book_with_info data from spark to postgres 40 | """ 41 | 42 | context.log.info("Got spark DataFrame, loading to postgres") 43 | # Convert from spark DataFrame to polars DataFrame 44 | df = pl.from_arrow(pa.Table.from_batches(gold_book_with_info._collect_as_arrow())) 45 | context.log.debug(f"Got polars DataFrame with shape: {df.shape}") 46 | 47 | return Output( 48 | value=df, 49 | metadata={ 50 | "database": "goodreads", 51 | "schema": "gold", 52 | "table": "book_with_info", 53 | "primary_keys": ["isbn"], 54 | "columns": ["isbn", "name", "authors", "language", "pagesnumber"], 55 | }, 56 | ) 57 | 58 | 59 | # Asset warehouse_book_with_publish 60 | @asset( 61 | description="Load book_with_publish data from spark to postgres", 62 | ins={ 63 | "gold_book_with_publish": AssetIn( 64 | key_prefix=["gold", "goodreads"], 65 | ), 66 | }, 67 | metadata={ 68 | "primary_keys": ["isbn"], 69 | "columns": ["isbn", "publisher", "publishyear", "publishmonth", "publishday"], 70 | }, 71 | io_manager_key="psql_io_manager", 72 | key_prefix=["gold"], # Database: goodreads, Schema: gold 73 | compute_kind=COMPUTE_KIND, 74 | group_name=LAYER, 75 | ) 76 | def book_with_publish(context, gold_book_with_publish: DataFrame): 77 | """ 78 | Load book_with_publish data from spark to postgres 79 | """ 80 | 
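    # NOTE: _collect_as_arrow() gathers the whole Spark DataFrame on the driver as Arrow record batches, so the table must fit in driver memory before the polars conversion below.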
81 | context.log.info("Got spark DataFrame, loading to postgres") 82 | # Convert from spark DataFrame to polars DataFrame 83 | df = pl.from_arrow( 84 | pa.Table.from_batches(gold_book_with_publish._collect_as_arrow()) 85 | ) 86 | context.log.debug(f"Got polars DataFrame with shape: {df.shape}") 87 | 88 | return Output( 89 | value=df, 90 | metadata={ 91 | "database": "goodreads", 92 | "schema": "gold", 93 | "table": "book_with_publish", 94 | "primary_keys": ["isbn"], 95 | "columns": [ 96 | "isbn", 97 | "publisher", 98 | "publishyear", 99 | "publishmonth", 100 | "publishday", 101 | ], 102 | }, 103 | ) 104 | 105 | 106 | # Asset warehouse_book_with_rating 107 | @asset( 108 | description="Load book_with_rating data from spark to postgres", 109 | ins={ 110 | "gold_book_with_rating": AssetIn( 111 | key_prefix=["gold", "goodreads"], 112 | ), 113 | }, 114 | metadata={ 115 | "primary_keys": ["isbn"], 116 | "columns": [ 117 | "isbn", 118 | "rating", 119 | "ratingdist5", 120 | "ratingdist4", 121 | "ratingdist3", 122 | "ratingdist2", 123 | "ratingdist1", 124 | "ratingdisttotal", 125 | "countoftextreviews", 126 | ], 127 | }, 128 | io_manager_key="psql_io_manager", 129 | key_prefix=["gold"], # Database: goodreads, Schema: gold 130 | compute_kind=COMPUTE_KIND, 131 | group_name=LAYER, 132 | ) 133 | def book_with_rating(context, gold_book_with_rating: DataFrame): 134 | """ 135 | Load book_with_rating data from spark to postgres 136 | """ 137 | 138 | context.log.info("Got spark DataFrame, loading to postgres") 139 | # Convert from spark DataFrame to polars DataFrame 140 | df = pl.from_arrow(pa.Table.from_batches(gold_book_with_rating._collect_as_arrow())) 141 | context.log.debug(f"Got polars DataFrame with shape: {df.shape}") 142 | 143 | return Output( 144 | value=df, 145 | metadata={ 146 | "database": "goodreads", 147 | "schema": "gold", 148 | "table": "book_with_rating", 149 | "primary_keys": ["isbn"], 150 | "columns": [ 151 | "isbn", 152 | "rating", 153 | "ratingdist5", 154 | "ratingdist4", 155 | "ratingdist3", 156 | "ratingdist2", 157 | "ratingdist1", 158 | "ratingdisttotal", 159 | "countoftextreviews", 160 | ], 161 | }, 162 | ) 163 | 164 | 165 | # Asset warehouse_book_download_link 166 | @asset( 167 | description="Load book_download_link data from minio to postgres", 168 | ins={ 169 | "bronze_book_download_link": AssetIn( 170 | key_prefix=["bronze", "goodreads"], 171 | ), 172 | }, 173 | metadata={ 174 | "primary_keys": ["isbn"], 175 | "columns": ["isbn", "link"], 176 | }, 177 | io_manager_key="psql_io_manager", 178 | key_prefix=["recommendations"], 179 | compute_kind=COMPUTE_KIND, 180 | group_name=LAYER, 181 | ) 182 | def book_download_link(context, bronze_book_download_link: pl.DataFrame): 183 | """ 184 | Load book_download_link data from minio to postgres 185 | """ 186 | 187 | df = bronze_book_download_link 188 | # Rename column BookISBN to isbn 189 | df = df.rename({"BookISBN": "isbn"}) 190 | context.log.info(f"Columns: {df.columns}") 191 | 192 | return Output( 193 | value=df, 194 | metadata={ 195 | "database": "goodreads", 196 | "schema": "recommendations", 197 | "table": "book_download_link", 198 | "primary_keys": ["isbn"], 199 | "columns": ["isbn", "link"], 200 | }, 201 | ) 202 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/assets/gold.py: -------------------------------------------------------------------------------- 1 | from dagster import ( 2 | asset, 3 | multi_asset, 4 | AssetIn, 5 | AssetOut, 6 | Output, 7 | 
StaticPartitionsDefinition, 8 | ) 9 | from pyspark.sql import DataFrame 10 | from datetime import datetime 11 | import pyarrow as pa 12 | import polars as pl 13 | 14 | 15 | COMPUTE_KIND = "Python" 16 | YEARLY = StaticPartitionsDefinition( 17 | [str(year) for year in range(1975, datetime.today().year)] 18 | ) 19 | 20 | 21 | # genre to gold (minIO) and warehouse (postgres) 22 | @multi_asset( 23 | ins={ 24 | "silver_collected_genre": AssetIn( 25 | key_prefix=["silver", "goodreads"], 26 | ) 27 | }, 28 | outs={ 29 | "gold_genre": AssetOut( 30 | description="Load genre data from spark to minIO", 31 | io_manager_key="spark_io_manager", 32 | key_prefix=["gold", "goodreads"], 33 | group_name="gold", 34 | ), 35 | "genre": AssetOut( 36 | description="Load genre data from spark to postgres", 37 | io_manager_key="psql_io_manager", 38 | key_prefix=["gold"], # Database: goodreads, Schema: gold 39 | metadata={ 40 | "primary_keys": ["id", "name"], 41 | "columns": ["id", "name"], 42 | }, 43 | group_name="warehouse", 44 | ), 45 | }, 46 | compute_kind=COMPUTE_KIND, 47 | ) 48 | def genre(context, silver_collected_genre: DataFrame): 49 | """ 50 | Load genre data from spark to minIO and postgres 51 | """ 52 | 53 | spark_df = silver_collected_genre 54 | 55 | context.log.info("Got spark DataFrame, converting to polars DataFrame") 56 | # Convert from spark DataFrame to polars DataFrame 57 | df = pl.from_arrow( 58 | pa.Table.from_batches(silver_collected_genre._collect_as_arrow()) 59 | ) 60 | context.log.debug(f"Got polars DataFrame with shape: {df.shape}") 61 | 62 | return Output( 63 | value=spark_df, 64 | metadata={ 65 | "table": "gold_genre", 66 | "row_count": spark_df.count(), 67 | "column_count": len(spark_df.columns), 68 | "columns": spark_df.columns, 69 | }, 70 | ), Output( 71 | value=df, 72 | metadata={ 73 | "database": "goodreads", 74 | "schema": "gold", 75 | "table": "genre", 76 | "primary_keys": ["id", "name"], 77 | "columns": ["id", "name"], 78 | }, 79 | ) 80 | 81 | 82 | # book_genre to gold (minIO) and warehouse (postgres) 83 | @multi_asset( 84 | ins={ 85 | "silver_collected_book_genre": AssetIn( 86 | key_prefix=["silver", "goodreads"], 87 | ) 88 | }, 89 | outs={ 90 | "gold_book_genre": AssetOut( 91 | description="Load book_genre data from spark to minIO", 92 | io_manager_key="spark_io_manager", 93 | key_prefix=["gold", "goodreads"], 94 | group_name="gold", 95 | ), 96 | "book_genre": AssetOut( 97 | description="Load book_genre data from spark to postgres", 98 | io_manager_key="psql_io_manager", 99 | key_prefix=["gold"], # Database: goodreads, Schema: gold 100 | metadata={ 101 | "primary_keys": ["bookisbn", "genreid"], 102 | "columns": ["bookisbn", "genreid"], 103 | }, 104 | group_name="warehouse", 105 | ), 106 | }, 107 | compute_kind=COMPUTE_KIND, 108 | ) 109 | def book_genre(context, silver_collected_book_genre: DataFrame): 110 | """ 111 | Load book_genre data from spark to minIO and postgres 112 | """ 113 | 114 | spark_df = silver_collected_book_genre 115 | 116 | context.log.info("Got spark DataFrame, converting to polars DataFrame") 117 | # Convert from spark DataFrame to polars DataFrame 118 | df = pl.from_arrow( 119 | pa.Table.from_batches(silver_collected_book_genre._collect_as_arrow()) 120 | ) 121 | context.log.debug(f"Got polars DataFrame with shape: {df.shape}") 122 | 123 | return Output( 124 | value=spark_df, 125 | metadata={ 126 | "table": "gold_book_genre", 127 | "row_count": spark_df.count(), 128 | "column_count": len(spark_df.columns), 129 | "columns": spark_df.columns, 130 | }, 131 | ), 
Output( 132 | value=df, 133 | metadata={ 134 | "database": "goodreads", 135 | "schema": "gold", 136 | "table": "book_genre", 137 | "record_count": df.shape[0], 138 | }, 139 | ) 140 | 141 | 142 | # Asset book_with_info 143 | @asset( 144 | description="Split book table to get basic info", 145 | # partitions_def=YEARLY, 146 | ins={ 147 | "silver_collected_book": AssetIn( 148 | key_prefix=["silver", "goodreads"], 149 | metadata={"full_load": True}, 150 | ), 151 | }, 152 | io_manager_key="spark_io_manager", 153 | key_prefix=["gold", "goodreads"], 154 | compute_kind="PySpark", 155 | group_name="gold", 156 | ) 157 | def gold_book_with_info(context, silver_collected_book: DataFrame): 158 | """ 159 | Split book table to get basic info 160 | """ 161 | 162 | spark_df = silver_collected_book 163 | context.log.info("Got spark DataFrame, getting neccessary columns") 164 | 165 | # Drop rows with null value in Language column 166 | spark_df = spark_df.dropna(subset=["Language"]) 167 | 168 | # Select columns ISBN, Name, Authors, Language, Description, PagesNumber 169 | spark_df = spark_df.select( 170 | "ISBN", 171 | "Name", 172 | "Authors", 173 | "Language", 174 | "PagesNumber", 175 | ) 176 | spark_df.collect() 177 | 178 | return Output( 179 | value=spark_df, 180 | metadata={ 181 | "table": "gold_book_with_info", 182 | "row_count": spark_df.count(), 183 | "column_count": len(spark_df.columns), 184 | "columns": spark_df.columns, 185 | }, 186 | ) 187 | 188 | 189 | # Asset book_with_publish 190 | @asset( 191 | description="Split book table to get publishing info", 192 | ins={ 193 | "silver_collected_book": AssetIn( 194 | key_prefix=["silver", "goodreads"], 195 | metadata={"full_load": True}, 196 | ), 197 | }, 198 | io_manager_key="spark_io_manager", 199 | key_prefix=["gold", "goodreads"], 200 | compute_kind="PySpark", 201 | group_name="gold", 202 | ) 203 | def gold_book_with_publish(context, silver_collected_book: DataFrame): 204 | """ 205 | Split book table to get publishing info 206 | """ 207 | 208 | spark_df = silver_collected_book 209 | context.log.info("Got spark DataFrame, getting neccessary columns") 210 | 211 | # Drop rows with null value in Language column 212 | spark_df = spark_df.dropna(subset=["Language"]) 213 | 214 | # Select columns ISBN, Publisher, PublishYear, PublishMonth, PublishDay 215 | spark_df = spark_df.select( 216 | "ISBN", 217 | "Publisher", 218 | "PublishYear", 219 | "PublishMonth", 220 | "PublishDay", 221 | ) 222 | spark_df.collect() 223 | 224 | return Output( 225 | value=spark_df, 226 | metadata={ 227 | "table": "gold_book_with_publish", 228 | "row_count": spark_df.count(), 229 | "column_count": len(spark_df.columns), 230 | "columns": spark_df.columns, 231 | }, 232 | ) 233 | 234 | 235 | # Asset book_with_rating 236 | @asset( 237 | description="Split book table to get rating info", 238 | ins={ 239 | "silver_collected_book": AssetIn( 240 | key_prefix=["silver", "goodreads"], 241 | metadata={"full_load": True}, 242 | ), 243 | }, 244 | io_manager_key="spark_io_manager", 245 | key_prefix=["gold", "goodreads"], 246 | compute_kind="PySpark", 247 | group_name="gold", 248 | ) 249 | def gold_book_with_rating(context, silver_collected_book: DataFrame): 250 | """ 251 | Split book table to get rating info 252 | """ 253 | 254 | spark_df = silver_collected_book 255 | context.log.info("Got spark DataFrame, getting neccessary columns") 256 | 257 | # Drop rows with null value in Language column 258 | spark_df = spark_df.dropna(subset=["Language"]) 259 | 260 | # Select columns ISBN, Rating, 
RatingDist1, RatingDist2, RatingDist3, RatingDist4, RatingDist5, CountOfTextReviews 261 | spark_df = spark_df.select( 262 | "ISBN", 263 | "Rating", 264 | "RatingDist5", 265 | "RatingDist4", 266 | "RatingDist3", 267 | "RatingDist2", 268 | "RatingDist1", 269 | "RatingDistTotal", 270 | "CountOfTextReviews", 271 | ) 272 | spark_df.collect() 273 | 274 | return Output( 275 | value=spark_df, 276 | metadata={ 277 | "table": "gold_book_with_rating", 278 | "row_count": spark_df.count(), 279 | "column_count": len(spark_df.columns), 280 | "columns": spark_df.columns, 281 | }, 282 | ) 283 | -------------------------------------------------------------------------------- /app/streamlit_app.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from datetime import datetime 3 | import streamlit as st 4 | import polars as pl 5 | import psycopg2 6 | from minio import Minio 7 | import os 8 | from PIL import Image 9 | import requests 10 | 11 | 12 | @contextmanager 13 | def connect_minio(): 14 | client = Minio( 15 | endpoint=os.getenv("MINIO_ENDPOINT"), 16 | access_key=os.getenv("MINIO_ACCESS_KEY"), 17 | secret_key=os.getenv("MINIO_SECRET_KEY"), 18 | secure=False, 19 | ) 20 | 21 | try: 22 | yield client 23 | except Exception as e: 24 | raise e 25 | 26 | 27 | # Make bucket if not exists 28 | def make_bucket(client: Minio, bucket_name): 29 | found = client.bucket_exists(bucket_name) 30 | if not found: 31 | client.make_bucket(bucket_name) 32 | else: 33 | print(f"Bucket {bucket_name} already exists.") 34 | 35 | 36 | # Download a file from minio 37 | def download_image(book_isbn): 38 | tmp_file_path = f"/tmp/{book_isbn}_{datetime.today()}.jpeg" 39 | key_name = f"images/{book_isbn}.jpeg" 40 | bucket_name = os.getenv("DATALAKE_BUCKET") 41 | try: 42 | with connect_minio() as client: 43 | # Make bucket if not exist 44 | make_bucket(client=client, bucket_name=bucket_name) 45 | client.fget_object(bucket_name, key_name, tmp_file_path) 46 | return tmp_file_path 47 | except Exception as e: 48 | raise e 49 | 50 | 51 | st.set_page_config( 52 | page_title="Ultimate Goodreads Recommender", 53 | page_icon="📔", 54 | layout="centered", 55 | initial_sidebar_state="expanded", 56 | ) 57 | 58 | 59 | # Initialize connection. 60 | # Uses st.cache_resource to only run once. 61 | @st.cache_resource 62 | def init_connection(): 63 | return psycopg2.connect(**st.secrets["postgres"]) 64 | 65 | 66 | conn = init_connection() 67 | 68 | 69 | # Perform query. 70 | # Uses st.cache_data to only rerun when the query changes or after 10 min. 71 | @st.cache_data(ttl=600) 72 | def run_query(query): 73 | with conn.cursor() as cur: 74 | cur.execute(query) 75 | return cur.fetchall() 76 | 77 | 78 | st.title("Ultimate Goodreads recommender!") 79 | 80 | book_name = st.text_input("Enter a book name", "Hannibal") 81 | st.write(f"You entered: {book_name}") 82 | 83 | # Take isbn from search_prior if it exists, otherwise from search. 
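# search_prior holds prioritized books (for example those with a download link and higher ratings), so it is checked before the general search index.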
84 | isbn = "" 85 | try: 86 | isbn = run_query( 87 | f"SELECT isbn FROM recommendations.search_prior WHERE name LIKE '{book_name}' LIMIT 1" 88 | )[0][0] 89 | # st.write(f"ISBN: {isbn}") 90 | except IndexError: 91 | try: 92 | isbn = run_query( 93 | f"SELECT isbn FROM recommendations.search WHERE name LIKE '{book_name}' LIMIT 1" 94 | )[0][0] 95 | # st.write(f"ISBN: {isbn}") 96 | except IndexError: 97 | st.write(f"Book {book_name} not found") 98 | else: 99 | print("Error while querying book") 100 | 101 | # From isbn take list of genreid 102 | genreid = None 103 | if isbn != "": 104 | try: 105 | result = run_query( 106 | f"SELECT genreid FROM gold.book_genre WHERE bookisbn = '{isbn}'" 107 | ) 108 | genreid = [x[0] for x in result] 109 | # st.write(f"Genreid: {genreid}") 110 | except Exception as e: 111 | conn.commit() 112 | cursor = conn.cursor() 113 | 114 | # From genreid take books with most common genreid 115 | book_with_most_common_genreid = None 116 | if genreid: 117 | try: 118 | result = run_query( 119 | f""" 120 | WITH common_books AS ( 121 | SELECT bookisbn, COUNT(*) as count 122 | FROM gold.book_genre 123 | WHERE genreid IN {tuple(genreid)} 124 | GROUP BY bookisbn 125 | HAVING COUNT(*) > 3 126 | ORDER BY COUNT(*) DESC 127 | ) 128 | SELECT common_books.bookisbn, common_books.count, criteria.hasdownloadlink, criteria.rating 129 | FROM recommendations.criteria 130 | RIGHT JOIN common_books ON common_books.bookisbn = criteria.isbn 131 | ORDER BY common_books.count DESC, criteria.hasdownloadlink DESC, criteria.rating DESC 132 | """ 133 | ) 134 | # st.write(result) 135 | book_with_most_common_genreid = pl.DataFrame( 136 | { 137 | "bookisbn": [x[0] for x in result], 138 | "count": [x[1] for x in result], 139 | "hasdownloadlink": [x[2] for x in result], 140 | "rating": [x[3] for x in result], 141 | } 142 | ) 143 | book_with_most_common_genreid = book_with_most_common_genreid[:4] 144 | # st.write(book_with_most_common_genreid) 145 | except Exception as e: 146 | print(f"Error while querying: {e}") 147 | conn.commit() 148 | cursor = conn.cursor() 149 | 150 | # From book_with_most_common_genreid show books 151 | if book_with_most_common_genreid is not None: 152 | st.subheader(f"You seached for {book_name}, here's information about the book:") 153 | c1, c2 = st.columns([5, 5]) 154 | rating = book_with_most_common_genreid[0]["rating"][0] 155 | with c1: 156 | image = None 157 | try: 158 | image = Image.open(download_image(isbn)) 159 | st.image(image, caption=f"{book_name}", width=300) 160 | except Exception as e: 161 | req = requests.get( 162 | f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=data&format=json" 163 | ) 164 | json = req.json() 165 | image_url = json.get(f"ISBN:{isbn}", {}).get("cover", {}).get("large") 166 | if image_url: 167 | image = Image.open(requests.get(image_url, stream=True).raw) 168 | st.image(image, caption=f"{book_name}", width=300) 169 | with c2: 170 | author, language, pagesnumber = "", "", "" 171 | try: 172 | result = run_query( 173 | f"SELECT * FROM gold.book_with_info WHERE isbn = '{isbn}'" 174 | ) 175 | author = result[0][2] 176 | language = result[0][3] 177 | pagesnumber = result[0][5] 178 | except Exception as e: 179 | print(f"Error while querying: {e}") 180 | conn.commit() 181 | cursor = conn.cursor() 182 | st.write(f"**Book ISBN**: {isbn}") 183 | st.write(f"**Book name**: {book_name}") 184 | if author != "": 185 | st.write(f"**Author(s)**: {author}") 186 | if language != "": 187 | st.write(f"**Language**: {language}") 188 | if pagesnumber != 0: 189 | 
st.write(f"**Pages number**: {pagesnumber}") 190 | st.write("**Rating**: {:.2f}".format(rating)) 191 | hasdownloadlink = book_with_most_common_genreid[0]["hasdownloadlink"][0] 192 | if hasdownloadlink: 193 | if st.button(f"Send {book_name} to kindle", type="primary"): 194 | st.balloons() 195 | 196 | st.subheader(f"There's {len(book_with_most_common_genreid)-1} related book:") 197 | for i in range(1, len(book_with_most_common_genreid)): 198 | c1, c2 = st.columns([5, 5]) 199 | isbn = book_with_most_common_genreid[i]["bookisbn"][0] 200 | rating = book_with_most_common_genreid[i]["rating"][0] 201 | with c1: 202 | book_name, author, language, pagesnumber = "", "", "", "" 203 | try: 204 | result = run_query( 205 | f"SELECT * FROM gold.book_with_info WHERE isbn = '{isbn}'" 206 | ) 207 | book_name = result[0][1] 208 | author = result[0][2] 209 | language = result[0][3] 210 | pagesnumber = result[0][5] 211 | except Exception as e: 212 | print(f"Error while querying: {e}") 213 | conn.commit() 214 | cursor = conn.cursor() 215 | st.write(f"**Book ISBN**: {isbn}") 216 | st.write(f"**Book name**: {book_name}") 217 | if author != "": 218 | st.write(f"**Author(s)**: {author}") 219 | if language != "": 220 | st.write(f"**Language**: {language}") 221 | if pagesnumber != 0: 222 | st.write(f"**Pages number**: {pagesnumber}") 223 | st.write("**Rating**: {:.2f}".format(rating)) 224 | hasdownloadlink = book_with_most_common_genreid[0]["hasdownloadlink"][0] 225 | if hasdownloadlink: 226 | if st.button(f"Send {book_name} to kindle", type="primary"): 227 | st.balloons() 228 | with c2: 229 | image = None 230 | try: 231 | image = Image.open(download_image(isbn)) 232 | st.image(image, caption=f"{book_name}", width=300) 233 | except Exception as e: 234 | req = requests.get( 235 | f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=data&format=json" 236 | ) 237 | json = req.json() 238 | image_url = json.get(f"ISBN:{isbn}", {}).get("cover", {}).get("large") 239 | if image_url: 240 | image = Image.open(requests.get(image_url, stream=True).raw) 241 | st.image(image, caption=f"{book_name}", width=300) 242 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/resources/gdrive_io_manager.py: -------------------------------------------------------------------------------- 1 | from dagster import IOManager, InputContext, OutputContext 2 | from google_auth_oauthlib.flow import InstalledAppFlow 3 | from googleapiclient.discovery import build 4 | from googleapiclient.http import MediaIoBaseDownload 5 | from google.auth.transport.requests import Request 6 | 7 | import io 8 | import os 9 | import pickle 10 | import polars as pl 11 | from typing import Union 12 | from contextlib import contextmanager 13 | from .minio_io_manager import connect_minio, make_bucket 14 | 15 | 16 | @contextmanager 17 | def gdrive_client(config): 18 | client_secret_file = config["client_secret_file"] 19 | pickle_file = config["pickle_file"] 20 | api_name = config["api_name"] 21 | api_version = config["api_version"] 22 | scopes = config["scopes"] 23 | 24 | cred = None 25 | 26 | if os.path.exists(pickle_file): 27 | with open(pickle_file, "rb") as token: 28 | cred = pickle.load(token) 29 | else: 30 | raise Exception( 31 | f"Pickle not exists from this, pickle_file: {pickle_file} and cred file: {cred}" 32 | ) 33 | 34 | if not cred or not cred.valid: 35 | if cred and cred.expired and cred.refresh_token: 36 | cred.refresh(Request()) 37 | else: 38 | flow = 
InstalledAppFlow.from_client_secrets_file(client_secret_file, scopes) 39 | cred = flow.run_local_server() 40 | 41 | with open(pickle_file, "wb") as token: 42 | pickle.dump(cred, token) 43 | 44 | try: 45 | service = build(api_name, api_version, credentials=cred) 46 | yield service 47 | except Exception as e: 48 | raise Exception("Error while creating gdrive client: {}".format(e)) 49 | 50 | 51 | class GDriveIOManager(IOManager): 52 | def __init__(self, config): 53 | self._config = config 54 | 55 | def download_files( 56 | self, 57 | context, 58 | dowid, 59 | dfilespath, 60 | folder=None, 61 | ): 62 | """ 63 | Download files from gdrive with given id 64 | """ 65 | 66 | with gdrive_client(self._config) as service: 67 | request = service.files().get_media(fileId=dowid) 68 | fh = io.BytesIO() 69 | downloader = MediaIoBaseDownload(fh, request) 70 | done = False 71 | while done is False: 72 | status, done = downloader.next_chunk() 73 | context.log.debug("Download %d%%." % int(status.progress() * 100)) 74 | if folder: 75 | with io.open(folder + "/" + dfilespath, "wb") as f: 76 | fh.seek(0) 77 | f.write(fh.read()) 78 | else: 79 | with io.open(dfilespath, "wb") as f: 80 | fh.seek(0) 81 | f.write(fh.read()) 82 | 83 | def list_folders(self, context, filid, des): 84 | """ 85 | List all items (file/subfolder) in the folder with given id 86 | until all files are found 87 | """ 88 | 89 | with gdrive_client(self._config) as service: 90 | page_token = None 91 | while True: 92 | results = ( 93 | service.files() 94 | .list( 95 | pageSize=1000, 96 | q=f"'{filid}' in parents", 97 | fields="nextPageToken, files(id, name, mimeType)", 98 | ) 99 | .execute() 100 | ) 101 | page_token = results.get("nextPageToken", None) 102 | if page_token is None: 103 | folder = results.get("files", []) 104 | for item in folder: 105 | if ( 106 | str(item["mimeType"]) 107 | == "application/vnd.google-apps.folder" 108 | ): 109 | if not os.path.isdir(des + "/" + item["name"]): 110 | os.mkdir(des + "/" + item["name"]) 111 | self.list_folders( 112 | context, item["id"], des + "/" + item["name"] 113 | ) 114 | else: 115 | self.download_files(context, item["id"], item["name"], des) 116 | context.log.debug(f"Downloaded file {item['name']}") 117 | break 118 | return folder 119 | 120 | def download_folders(self, context, dataframe: pl.DataFrame, tmp_folder_path: str): 121 | """ 122 | Download all files in the folder with given id from gdrive 123 | """ 124 | 125 | with gdrive_client(self._config) as service: 126 | for row in dataframe.rows(named=True): 127 | isbn, folder_id = row["BookISBN"], row["Link"].split("/")[-1] 128 | 129 | folder = service.files().get(fileId=folder_id).execute() 130 | folder_name = folder["name"] 131 | page_token = None 132 | 133 | while True: 134 | results = ( 135 | service.files() 136 | .list( 137 | q=f"'{folder_id}' in parents", 138 | spaces="drive", 139 | fields="nextPageToken, files(id, name, mimeType)", 140 | ) 141 | .execute() 142 | ) 143 | page_token = results.get("nextPageToken", None) 144 | if page_token is None: 145 | items = results.get("files", []) 146 | # If no items in the folder, that means the id is a file -> download the file 147 | if not items: 148 | self.download_files(context, folder_id, folder_name) 149 | context.log.debug(f"Folder name: {folder_name}") 150 | # If there are items in the folder -> download files in folder 151 | else: 152 | context.log.info( 153 | f"Start downloading folder {folder_name} ..." 
154 | ) 155 | 156 | for item in items: 157 | tmp_file_path = os.path.join(tmp_folder_path, str(isbn)) 158 | if not os.path.isdir(tmp_file_path): 159 | os.makedirs(tmp_file_path) 160 | context.log.debug(f"Tmp file path: {tmp_file_path}") 161 | 162 | file_path = "" 163 | file_type = item["mimeType"] 164 | context.log.debug(f"File type: {file_type}") 165 | accept_files = ["jpeg", "epub"] 166 | for accept_file in accept_files: 167 | if accept_file in file_type: 168 | file_path = os.path.join( 169 | tmp_file_path, item["name"] 170 | ) 171 | 172 | context.log.debug(f"File path: {file_path}") 173 | self.download_files( 174 | context, item["id"], file_path 175 | ) 176 | context.log.info( 177 | f"Downloaded file {item['name']}" 178 | ) 179 | 180 | os.rename( 181 | file_path, 182 | os.path.join( 183 | tmp_file_path, f"{isbn}.{accept_file}" 184 | ), 185 | ) 186 | break 187 | 188 | def handle_output(self, context: "OutputContext", obj: dict): 189 | """ 190 | Handle returned temporary folder path, load all files in the folder to minIO 191 | /tmp/bronze/download/2021-08-01T00:00:00+00:00 192 | /images 193 | ...jpeg 194 | /files 195 | ...epub 196 | to minIO: 197 | /lakehouse/images 198 | /lakehouse/files 199 | """ 200 | 201 | tmp_folder_path = str(obj.get("tmp_folder_path")) 202 | isbn_list = obj.get("isbn") 203 | context.add_output_metadata({"tmp": tmp_folder_path}) 204 | 205 | try: 206 | bucket_name = self._config.get("bucket") 207 | with connect_minio(self._config) as client: 208 | # Make bucket if not exist 209 | make_bucket(client, bucket_name) 210 | for isbn in isbn_list: 211 | for filetype in ["epub", "jpeg"]: 212 | # Upload epub file to minIO 213 | # E.g /tmp/bronze/download/2021-08-01T00:00:00+00:00/123456/123456.epub/jpeg 214 | tmp_file_path = os.path.join( 215 | tmp_folder_path, str(isbn), f"{isbn}.{filetype}" 216 | ) 217 | key_name = "files" if filetype == "epub" else "images" 218 | key_name += f"/{isbn}.{filetype}" 219 | 220 | # E.g bucket_name: lakehouse, key_name: files or images, tmp_file_path: /tmp/bronze/download/2021-08-01T00:00:00+00:00/123456/123456.epub/jpeg 221 | client.fput_object(bucket_name, key_name, tmp_file_path) 222 | context.log.debug( 223 | f"(MinIO handle_output) Got book.{filetype} with isbn: {isbn}" 224 | ) 225 | 226 | # Clean up tmp file 227 | os.remove(tmp_file_path) 228 | except Exception as e: 229 | raise e 230 | 231 | def load_input(self, context: "InputContext"): 232 | """ 233 | Skip this function 234 | """ 235 | pass 236 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # goodreads-elt-pipeline 2 | 3 | For Vietnamese edition, please visit 4 | 5 | In this project, I will guide you on building a basic data pipeline using the ELT model (extract - load - transform), using the dataset from Goodreads to ingest and transform data to serve a book recommendation system for yourself. 6 | 7 | This project is completed based on the knowledge learned from the Fundamental Data Engineering 02 course of AIDE. Special thanks to Mr. Nguyen Thanh Binh, Mr. Ong Xuan Hong, and Mr. Hung Le. 8 | 9 | ## 1. Introduction 10 | 11 | I love reading books, and I have a Kindle e-reader for my daily reading. 12 | 13 | ![](./images/introduction.jpg) 14 | 15 | One thing I like about the Kindle is that it has a separate email address provided by Amazon. 
If I use my own email to send e-book files (in .epub/.mobi format), the Amazon system will automatically send the files to my Kindle as long as there is an internet connection. 16 | 17 | So why not build an app that can extract data from Goodreads (a social network for book lovers), process it, and provide recommendations for my next reads? And that's where the project begins :D 18 | 19 | ## 2. Objective 20 | 21 | Dataset is collected from [Kaggle](https://www.kaggle.com/datasets/bahramjannesarr/goodreads-book-datasets-10m), [OpenLibrary API](https://openlibrary.org), [Google Drive API](https://developers.google.com/drive) and [Notion API](https://www.notion.so/lelouvincx/9dca269701b44d6b944c51a7f63b5b5a?v=4852b4d5cf8440e4bc232b0e25327f93) 22 | 23 | The objective of the project is to provide book recommendations to users based on the processed data collected. When a user inputs information about a book they have read, the app will suggest potential next reads. If the book has an .epub file, the app will also have a feature to send the book to the user's Kindle. 24 | 25 | ## 3. Design 26 | 27 | ### 3.1 Directory tree 28 | 29 | ![](./images/directory_tree.png) 30 | 31 | - `app`: The UI's application written with streamlit 32 | - `dagster_home`: Dagit and dagster daemon's configurations 33 | - `dataset`: Dataset under .csv format, in order to load into MySQL 34 | - `docker-compose`: To compose docker containers 35 | - `dockerimages`: Include self-built docker images, such as dagster (for dagit + daemon), spark master, streamlit app, ... 36 | - `EDA.ipynb`: Exploratory Data Analysis, view directly [here](https://gist.github.com/lelouvincx/a88fa6caf59d7ff76086ab485ecc69bd) 37 | - `elt_pipeline`: The pipeline 38 | - `dbt_transform`: dbt's code location, used for the last transform step 39 | - `Dockerfile + requirements.txt`: Docker image 40 | - `elt_pipeline`: EL (Extract -> Transform) pipeline 41 | - `.env + .spark_master.env + .spark_worker.env`: Env variables (e.g POSTGRES_USER, MYSQL_USER, SPARK, ...) 42 | - `env.template`: Env variables template 43 | - `.git + .gitignore`: Code versioning 44 | - `Makefile`: Shortcut for terminal's commands 45 | - `load_dataset`: .sql scripts to create schema and load `dataset` into MySQL, Postgres 46 | - `requirements.txt + Pipfile + Pipfile.lock`: Python's dependencies 47 | 48 | In addition, the containers also have their own separate directories, which include: 49 | 50 | - `minio` 51 | - `storage` 52 | - `mysql_data` 53 | - `postgres_data` 54 | - `metabase_data` 55 | 56 | Visit file [tree.txt](https://github.com/lelouvincx/goodreads-elt-pipeline/blob/main/tree.txt) for more details. 57 | 58 | ### 3.2 Pipeline design 59 | 60 | ![](./images/design_pipeline.png "Pipeline Design") 61 | 62 | 0. We use docker to containerize the application and dagster to orchestrate assets (as defined in dagster's [documentation](https://docs.dagster.io/concepts/assets/software-defined-assets)). 63 | 1. Goodreads data is downloaded from Kaggle in `.csv` format, then imported into `MySQL` to simulate development data 64 | 2. After obtaining the book's ISBN (international standard book number), collect additional data from relevant APIs: 65 | - Genre, author, pages number, image, description from `OpenLibrary API` 66 | - Download link from `Notion API` 67 | - Epub file from `Google Drive API` 68 | - Image from `OpenLibrary API` or `Google Drive API` 69 | 3. Extract the table-formatted data above using `polars`, and load it into the datalake - `MinIO`. 70 | 4. 
From `MinIO`, load data into `spark` to transform the raw (bronze) data into the silver & gold layers 71 | 5. Convert `Spark DataFrame` to `.parquet`, and load back to `MinIO` 72 | 6. Load the gold layer into the data warehouse - PostgreSQL - creating the warehouse layer. 73 | 7. Transform as needed using `dbt` on `postgres` 74 | 8. Visualize the data using `metabase` 75 | 9. Create a book recommendation app using `streamlit` 76 | 77 | ### 3.3 Database schema 78 | 79 | ![](./images/design_schema.png) 80 | 81 | 1. `book`: OLTP table containing books' information (e.g. ISBN, Authors, Rating, Description...) 82 | 2. `genre`: table containing genres 83 | 3. `book_genre`: n-n relationship of `book` and `genre` 84 | 4. `book_download_link`: table containing Google Drive download links 85 | 5. `files`: object storage containing books' download files (.epub/.pdf/.mobi) 86 | 6. `images`: object storage containing books' images 87 | 88 | ### 3.4 Datalake structure 89 | 90 | ![](./images/datalake_structure.png "Datalake Structure") 91 | 92 | 1. The datalake is divided into three layers: bronze, silver, and gold. 93 | 2. All files are under the .parquet format for better reading performance than .csv. 94 | 3. There is also a `files` location that stores .epub files named abc.epub, where abc is the ISBN of the book. 95 | 4. Similarly, abc.jpeg stores the image of the book. 96 | 97 | ### 3.5 Data lineage 98 | 99 | 1. General 100 | 101 | ![](./images/assets_general.png) 102 | 103 | With a dense data lineage, Dagster is a big help when it comes to visualizing it in a clear way: 104 | 105 | - Data originates from MySQL and various APIs, and is loaded into the bronze layer. 106 | - From the bronze layer, data is deduped, cleaned, and missing values are filled in the silver layer. 107 | - Advanced computations and splitting are then performed in the gold layer. 108 | - The data is loaded into the data warehouse - Postgres - in the warehouse layer. 109 | - Finally, transformations are made according to needs in the recommendations layer using dbt. 110 | 111 | 2. Bronze layer 112 | 113 | ![](./images/assets_bronze.png) 114 | 115 | Includes these assets: 116 | 117 | - bronze_book: Table `book` from MySQL; because it is too large (over 1.2 million rows), it is partitioned by year from 1975 to 2022. 118 | - bronze_genre: Table `genre` from MySQL. 119 | - bronze_book_genre: Table `book_genre` from MySQL. 120 | - bronze_book_download_link: Table `book_download_link` from MySQL. 121 | - bronze_images_and_files_download: Responsible for connecting to the Google Drive API, pulling the .epub files and images, and storing them in the datalake. 122 | 123 | 124 | 3. Silver layer 125 | 126 | ![](./images/assets_silver.png) 127 | 128 | Includes these assets: 129 | 130 | - silver_cleaned_book: Cleans data from the upstream `bronze_book`, partitioned to ensure `spark standalone mode` can run efficiently. 131 | - silver_collected_book: Collects missing data from upstream, such as authors, pages number, and description, from the `OpenLibrary API`. 132 | - silver_isbn: Extracts the isbn column from book to serve as a dependency for assets related to genre. 133 | - silver_cleaned_genre: Similar to `silver_cleaned_book`, but doesn't need partitioning as its size is not very large. 134 | - silver_collected_genre: Based on `silver_isbn`, collects missing genres for each book. If a book has no genre, it cannot be used for recommendations in subsequent tasks.
- silver_collected_book_genre: Establishes the n-n relationship between book and genre. 136 | 137 | 4. Gold layer 138 | 139 | ![](./images/assets_gold.png) 140 | 141 | Includes these assets: 142 | 143 | - gold_genre: Computes and sorts genres from upstream `silver_collected_genre`, while also saving them to minIO. 144 | - gold_book_genre: Similarly, from upstream `silver_collected_book_genre`. 145 | - gold_book_with_info: A split of the book table containing only basic information about the book such as ISBN, Name, Authors, Language, PagesNumber. 146 | - gold_book_with_publish: A split containing information about the publisher and publication time. 147 | - gold_book_with_rating: A split computing the different types of ratings. 148 | 149 | 5. Warehouse layer 150 | 151 | ![](./images/assets_warehouse.png) 152 | 153 | Loads the assets from the gold layer into Postgres, plus one asset from the bronze layer, book_download_link. 154 | 155 | In the future, the assets will be updated to add download links automatically from the Notion API, and a schedule will be set up. 156 | 157 | 6. Transform layer 158 | 159 | ![](./images/assets_dbt.png) 160 | 161 | Includes these assets: 162 | 163 | - search: Transforms information to create an index table, which will be queried when users search for books. 164 | - search_prior: Also an index table, but contains books that are given priority based on factors such as availability of download links, functionality of the OpenLibrary API, high ratings, etc. 165 | - criteria: Criteria used to query related books when searching for a specific book. 166 | 167 | ## 4. Setup 168 | 169 | ### 4.1 Prerequisites 170 | 171 | To develop this pipeline, download and install the following software: 172 | 173 | 1. [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) 174 | 2. [Docker](https://docs.docker.com/engine/install/) with at least 4GB RAM, 6 core CPU, 2GB swap, 16GB disk 175 | 3. [CMake](https://cmake.org/install/) if using a UNIX-based system (Linux/MacOS); check that `make --version` is already installed 176 | 4. Python 3.x (3.9.16 recommended as the Spark image runs on this version, installing via asdf is recommended) and a virtual environment (pipenv recommended) 177 | 5. A local machine that has freed the following ports: 3306, 5432, 9000, 9001, 3001, 8501, 4040, 7077, 8080, 3030 178 | 6. DBeaver or any other DB client (if not available, you can use the command line) 179 | 180 | If using Windows, set up WSL2 and a local Ubuntu virtual machine, then install the above software for Ubuntu. 181 | 182 | Clone the repository: 183 | 184 | ```bash 185 | git clone https://github.com/lelouvincx/goodreads-elt-pipeline.git project 186 | cd project 187 | ``` 188 | 189 | Download the csv dataset [here](https://www.kaggle.com/datasets/lelouvincx/goodreads-elt-pipeline?select=book.csv), then place it in `project/dataset`. 190 | 191 | ### 4.2 Setup Google Drive API 192 | 193 | First, we need to create OAuth 2.0 credentials in the [Google API Console](https://console.developers.google.com/). 194 | 195 | Select `create new project`: 196 | 197 | ![](./images/gdrive_1.png) 198 | 199 | Fill in the project's name (goodreads-elt_pipeline) and choose a location (default `No organization`). 200 | 201 | ![](./images/gdrive_2.png) 202 | 203 | After creating the project, select the `Library` tab: 204 | 205 | ![](./images/gdrive_3.png) 206 | 207 | Search for `Google Drive API` and enable it.
208 | 209 | ![](./images/gdrive_4.png) 210 | 211 | ![](./images/gdrive_5.png) 212 | 213 | Next, select the `OAuth consent screen` tab: 214 | 215 | ![](./images/gdrive_6.png) 216 | 217 | Fill in the information below: 218 | 219 | ![](./images/gdrive_7.png) 220 | 221 | In `scopes`, select `add or remove scopes`, look for the `Google Drive API, readonly` scope and tick it, then `save and continue` until the end. 222 | 223 | ![](./images/gdrive_8.png) 224 | 225 | Select the `credentials` tab -> `create credentials` -> `OAuth client ID`. 226 | 227 | ![](./images/gdrive_9.png) 228 | 229 | Select `Desktop app` and name it as you like (default: goodreads-elt-pipeline). 230 | 231 | ![](./images/gdrive_10.png) 232 | 233 | Download the JSON credentials file and place it in `project/elt_pipeline/elt_pipeline`. 234 | 235 | ![](./images/gdrive_11.png) 236 | 237 | ### 4.3 Setup local infrastructure 238 | 239 | Create the environment files: 240 | 241 | ```bash 242 | # Create env files 243 | touch .env 244 | cp env.template .env 245 | touch .spark_master.env 246 | cp spark_master.env.template .spark_master.env 247 | touch .spark_worker.env 248 | cp spark_worker.env.template .spark_worker.env 249 | ``` 250 | 251 | Then fill in the information in the env files above, for example: 252 | 253 | ```env 254 | # MySQL 255 | MYSQL_HOST=de_mysql 256 | MYSQL_PORT=3306 257 | MYSQL_DATABASE=goodreads 258 | MYSQL_USER=admin 259 | MYSQL_PASSWORD=admin123 260 | MYSQL_ROOT_PASSWORD=root123 261 | 262 | # PostgreSQL 263 | POSTGRES_HOST=de_psql 264 | POSTGRES_PORT=5432 265 | POSTGRES_USER=admin 266 | POSTGRES_PASSWORD=admin123 267 | POSTGRES_DB=goodreads 268 | POSTGRES_HOST_AUTH_METHOD=trust 269 | 270 | # Google Drive 271 | GDRIVE_CLIENT_SECRET_FILE=client_secret.json 272 | GDRIVE_PICKLE_FILE=token_drive_v3.pickle 273 | GDRIVE_API_NAME=drive 274 | GDRIVE_API_VERSION=v3 275 | GDRIVE_SCOPES=https://www.googleapis.com/auth/drive.readonly 276 | 277 | # Dagster 278 | DAGSTER_PG_HOSTNAME=de_psql 279 | DAGSTER_PG_USERNAME=admin 280 | DAGSTER_PG_PASSWORD=admin123 281 | DAGSTER_PG_DB=postgres 282 | DAGSTER_OVERALL_CONCURRENCY_LIMIT=1 283 | DAGSTER_HOME=/opt/dagster/dagster_home 284 | 285 | # dbt 286 | DBT_HOST=de_psql 287 | DBT_USER=admin 288 | DBT_PASSWORD=admin123 289 | DBT_DATABASE=goodreads 290 | DBT_SCHEMA=recommendations 291 | # MinIO 292 | MINIO_ENDPOINT=minio:9000 293 | MINIO_ROOT_USER=minio 294 | MINIO_ROOT_PASSWORD=minio123 295 | MINIO_ACCESS_KEY=minio 296 | MINIO_SECRET_KEY=minio123 297 | DATALAKE_BUCKET=lakehouse 298 | AWS_ACCESS_KEY_ID=minio 299 | AWS_SECRET_ACCESS_KEY=minio123 300 | AWS_REGION=us-east-1 301 | 302 | # MinIO client (mc) 303 | AWS_ACCESS_KEY_ID=minio 304 | AWS_SECRET_ACCESS_KEY=minio123 305 | AWS_REGION=us-east-1 306 | 307 | # Spark 308 | SPARK_MASTER_URL=spark://spark-master:7077 309 | SPARK_VERSION=3.3.2 310 | HADOOP_VERSION=3 311 | 312 | # Metabase 313 | MB_DB_TYPE=postgres 314 | MB_DB_DBNAME=goodreads 315 | MB_DB_PORT=5432 316 | MB_DB_USER=admin 317 | MB_DB_PASS=admin123 318 | MB_DB_HOST=de_psql 319 | MB_DB_FILE=/metabase_data/metabase.db 320 | ``` 321 | 322 | You can replace the information about user, password, etc. with your own values.
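For reference, here is a minimal sketch of how the `GDRIVE_*` variables above can be turned into an authenticated Google Drive client. This is only an illustration of the approach; the actual `gdrive_io_manager.py` in this repo may be implemented differently, and the helper name `create_drive_service` is hypothetical:

```python
import os
import pickle

from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build


def create_drive_service():
    """Illustrative sketch: build a Drive v3 client from the GDRIVE_* variables in .env."""
    scopes = os.getenv("GDRIVE_SCOPES", "").split(",")
    pickle_file = os.getenv("GDRIVE_PICKLE_FILE", "token_drive_v3.pickle")
    creds = None

    # Reuse a previously cached OAuth token if it exists
    if os.path.exists(pickle_file):
        with open(pickle_file, "rb") as token:
            creds = pickle.load(token)

    # Otherwise run the OAuth flow with the client_secret.json downloaded in section 4.2
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                os.getenv("GDRIVE_CLIENT_SECRET_FILE", "client_secret.json"), scopes
            )
            creds = flow.run_local_server(port=0)
        # Cache the token so later runs don't have to re-authenticate
        with open(pickle_file, "wb") as token:
            pickle.dump(creds, token)

    return build(
        os.getenv("GDRIVE_API_NAME", "drive"),
        os.getenv("GDRIVE_API_VERSION", "v3"),
        credentials=creds,
    )
```

The MinIO variables are consumed the same way (via `os.getenv`) inside the assets, as seen in `elt_pipeline/assets/silver.py`.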
323 | 324 | **For development only, do not use for production.** 325 | 326 | ```bash 327 | # DO NOT RUN BOTH OF THE COMMANDS BELOW, CHOOSE ONLY ONE 328 | # Setup python environment 329 | pipenv install 330 | # Or create a virtualenv and install manually from requirements.txt 331 | make install 332 | 333 | # Build docker images 334 | make build-dagster 335 | make build-spark 336 | make build-pipeline 337 | make build-streamlit 338 | 339 | # Run containers detached 340 | make up-bg 341 | 342 | # Check running containers 343 | docker compose ps -a 344 | 345 | # Check code quality 346 | make check 347 | make lint 348 | 349 | # Format pipelines 350 | black ./elt_pipeline 351 | 352 | # Test coverage 353 | make test 354 | ``` 355 | 356 | Check that there are 11 running services: 357 | 358 | ![](./images/docker_1.png) 359 | 360 | ![](./images/docker_2.png) 361 | 362 | **Ports**: 363 | 364 | - MySQL: 3306 365 | - PostgreSQL: 5432 366 | - Dagit: 3001 367 | - MinIO 368 | - UI: 9001 369 | - API: 9000 370 | - Spark master: 371 | - UI: 8080 372 | - API: 7077 373 | - Pipeline: 374 | - Spark jobs running: 4040 375 | - Metabase: 3030 376 | - Streamlit: 8501 377 | 378 | ### 4.4 Import data into MySQL 379 | 380 | Now we import the Goodreads dataset (in CSV format) into MySQL: 381 | 382 | ```bash 383 | make to_mysql_root 384 | ``` 385 | 386 | ```sql 387 | SET GLOBAL local_infile=TRUE; 388 | -- Check if local_infile was turned on 389 | SHOW VARIABLES LIKE "local_infile"; 390 | exit 391 | ``` 392 | 393 | ```bash 394 | # Create tables with schema 395 | make mysql_create 396 | 397 | # Load csv into created tables 398 | make mysql_load 399 | ``` 400 | 401 | ### 4.5 Create schema in Postgres 402 | 403 | ```bash 404 | make psql_create 405 | ``` 406 | 407 | ### 4.6 User interfaces 408 | 409 | 1. localhost:3001 - Dagit 410 | 2. localhost:4040 - Spark jobs 411 | 3. localhost:8080 - Spark master 412 | 4. localhost:9001 - MinIO 413 | 5. localhost:3030 - Metabase 414 | 6. localhost:8501 - Streamlit 415 | 416 | ## 5. Considerations 417 | 418 | Evaluation of the project: 419 | 420 | 1. Speed: `spark` is installed in standalone mode, so it does not achieve high performance and sometimes crashes in the middle of shuffle/read/write tasks. 421 | 2. Development environment: currently there is only a development environment; in the future, testing, staging, and production environments will be considered. 422 | 3. `dbt` is currently a small project; in the future, if more transformations are needed, it should be split into separate services with different permissions. 423 | 4. Deployment: use one of the cloud computing services such as AWS, Azure, or GCP. 424 | 425 | ## 6. Further actions 426 | 427 | 1. Complete the recommender system 428 | 2. Integrate Jupyter Notebook for DS tasks - [dagstermill](https://docs.dagster.io/integrations/dagstermill) 429 | 3. Testing environment 430 | 4.
Continuous Integration with GitHub Actions 431 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/assets/silver.py: -------------------------------------------------------------------------------- 1 | from dagster import asset, AssetIn, Output, StaticPartitionsDefinition 2 | from datetime import datetime 3 | import polars as pl 4 | import requests 5 | import os 6 | 7 | from pyspark.sql import DataFrame 8 | 9 | from ..resources.spark_io_manager import get_spark_session 10 | from pyspark.sql.functions import udf, col, regexp_replace, lower, when 11 | 12 | 13 | COMPUTE_KIND = "PySpark" 14 | LAYER = "silver" 15 | YEARLY = StaticPartitionsDefinition( 16 | [str(year) for year in range(1975, datetime.today().year)] 17 | ) 18 | 19 | 20 | @udf 21 | def split_take_second(value): 22 | return value.split(":")[1] 23 | 24 | 25 | # Silver cleaned book 26 | @asset( 27 | description="Load book table from bronze layer in minIO, into a Spark dataframe, then clean data", 28 | partitions_def=YEARLY, 29 | ins={ 30 | "bronze_book": AssetIn( 31 | key_prefix=["bronze", "goodreads"], 32 | ), 33 | }, 34 | io_manager_key="spark_io_manager", 35 | key_prefix=["silver", "goodreads"], 36 | compute_kind=COMPUTE_KIND, 37 | group_name=LAYER, 38 | ) 39 | def silver_cleaned_book(context, bronze_book: pl.DataFrame): 40 | """ 41 | Load book table from bronze layer in minIO, into a Spark dataframe, then clean data 42 | """ 43 | 44 | config = { 45 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 46 | "minio_access_key": os.getenv("MINIO_ACCESS_KEY"), 47 | "minio_secret_key": os.getenv("MINIO_SECRET_KEY"), 48 | } 49 | 50 | context.log.debug("Start creating spark session") 51 | 52 | with get_spark_session(config, str(context.run.run_id).split("-")[0]) as spark: 53 | # Convert bronze_book from polars DataFrame to Spark DataFrame 54 | pandas_df = bronze_book.to_pandas() 55 | context.log.debug( 56 | f"Converted to pandas DataFrame with shape: {pandas_df.shape}" 57 | ) 58 | 59 | spark_df = spark.createDataFrame(pandas_df) 60 | spark_df.cache() 61 | context.log.info("Got Spark DataFrame") 62 | 63 | # Dedupe books 64 | spark_df = spark_df.dropDuplicates() 65 | # Drop rows with null value in column 'Name' 66 | spark_df = spark_df.na.drop(subset=["Name"]) 67 | # Drop rows with null values in column 'ISBN' 68 | spark_df = spark_df.na.drop(subset=["isbn"]) 69 | # Drop rows with null values in column 'Language' 70 | spark_df = spark_df.na.drop(subset=["Language"]) 71 | # Drop rows with value '--' in column 'Language' 72 | spark_df = spark_df.filter(spark_df.Language != "--") 73 | # Drop rows with value > 350 in column 'PagesNumber' 74 | spark_df = spark_df.filter(spark_df.PagesNumber <= 350) 75 | # Drop column 'CountsOfReview' (overlap with 'RatingDistTotal') 76 | spark_df = spark_df.drop("CountsOfReview") 77 | # Keep rows with 'PublishYear' from 1975 to datetime.today().year 78 | spark_df = spark_df.filter( 79 | (spark_df.PublishYear >= 1975) 80 | & (spark_df.PublishYear <= datetime.today().year) 81 | ) 82 | # Update value of column 'RatingDist...', splitting by ':' and taking the second value 83 | spark_df = spark_df.withColumn( 84 | "RatingDist5", split_take_second(col("RatingDist5")) 85 | ) 86 | spark_df = spark_df.withColumn( 87 | "RatingDist4", split_take_second(col("RatingDist4")) 88 | ) 89 | spark_df = spark_df.withColumn( 90 | "RatingDist3", split_take_second(col("RatingDist3")) 91 | ) 92 | spark_df = spark_df.withColumn( 93 | "RatingDist2", split_take_second(col("RatingDist2"))
94 | ) 95 | spark_df = spark_df.withColumn( 96 | "RatingDist1", split_take_second(col("RatingDist1")) 97 | ) 98 | spark_df = spark_df.withColumn( 99 | "RatingDistTotal", split_take_second(col("RatingDistTotal")) 100 | ) 101 | # Cast column 'RatingDist...' to Integer 102 | spark_df = spark_df.withColumn( 103 | "RatingDist5", spark_df.RatingDist5.cast("Integer") 104 | ) 105 | spark_df = spark_df.withColumn( 106 | "RatingDist4", spark_df.RatingDist4.cast("Integer") 107 | ) 108 | spark_df = spark_df.withColumn( 109 | "RatingDist3", spark_df.RatingDist3.cast("Integer") 110 | ) 111 | spark_df = spark_df.withColumn( 112 | "RatingDist2", spark_df.RatingDist2.cast("Integer") 113 | ) 114 | spark_df = spark_df.withColumn( 115 | "RatingDist1", spark_df.RatingDist1.cast("Integer") 116 | ) 117 | spark_df = spark_df.withColumn( 118 | "RatingDistTotal", spark_df.RatingDistTotal.cast("Integer") 119 | ) 120 | # Change column name 'Count of text reviews' to 'CountOfTextReviews' 121 | spark_df = spark_df.withColumnRenamed( 122 | "Count of text reviews", "CountOfTextReviews" 123 | ) 124 | # Change value of column 'Language' from ['en-US', 'en-GB', 'en-CA'], to 'eng', from 'nl' to 'nld' 125 | spark_df = spark_df.withColumn( 126 | "Language", regexp_replace("Language", "en-US", "eng") 127 | ) 128 | spark_df = spark_df.withColumn( 129 | "Language", regexp_replace("Language", "en-GB", "eng") 130 | ) 131 | spark_df = spark_df.withColumn( 132 | "Language", regexp_replace("Language", "en-CA", "eng") 133 | ) 134 | spark_df = spark_df.withColumn( 135 | "Language", regexp_replace("Language", "nl", "nld") 136 | ) 137 | 138 | spark_df.unpersist() 139 | 140 | return Output( 141 | value=spark_df, 142 | metadata={ 143 | "table": "silver_cleaned_book", 144 | "row_count": spark_df.count(), 145 | "column_count": len(spark_df.columns), 146 | "columns": spark_df.columns, 147 | }, 148 | ) 149 | 150 | 151 | # Silver cleaned genre 152 | @asset( 153 | description="Load genre table from bronze layer in minIO, into a Spark dataframe, then clean data", 154 | ins={ 155 | "bronze_genre": AssetIn( 156 | key_prefix=["bronze", "goodreads"], 157 | ), 158 | }, 159 | io_manager_key="spark_io_manager", 160 | key_prefix=["silver", "goodreads"], 161 | compute_kind=COMPUTE_KIND, 162 | group_name=LAYER, 163 | ) 164 | def silver_cleaned_genre(context, bronze_genre: pl.DataFrame): 165 | """ 166 | Load genre table from bronze layer in minIO, into a Spark dataframe, then clean data 167 | """ 168 | 169 | config = { 170 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 171 | "minio_access_key": os.getenv("MINIO_ACCESS_KEY"), 172 | "minio_secret_key": os.getenv("MINIO_SECRET_KEY"), 173 | } 174 | 175 | context.log.debug("Start creating spark session") 176 | 177 | with get_spark_session(config) as spark: 178 | pandas_df = bronze_genre.to_pandas() 179 | context.log.debug(f"Got pandas DataFrame with shape: {pandas_df.shape}") 180 | 181 | spark_df = spark.createDataFrame(pandas_df) 182 | spark_df.cache() 183 | context.log.info("Got Spark DataFrame") 184 | 185 | # Downcase the column 'Name' 186 | spark_df = spark_df.withColumn("Name", lower(col("Name"))) 187 | 188 | return Output( 189 | value=spark_df, 190 | metadata={ 191 | "table": "silver_cleaned_genre", 192 | "row_count": spark_df.count(), 193 | "column_count": len(spark_df.columns), 194 | "columns": spark_df.columns, 195 | }, 196 | ) 197 | 198 | 199 | # Silver collected book 200 | @asset( 201 | description="Collect more information about cleaned books, such as authors, number of pages", 202 |
partitions_def=YEARLY, 203 | ins={ 204 | "silver_cleaned_book": AssetIn( 205 | key_prefix=["silver", "goodreads"], 206 | metadata={"full_load": False}, 207 | ), 208 | }, 209 | io_manager_key="spark_io_manager", 210 | key_prefix=["silver", "goodreads"], 211 | compute_kind="OpenLibrary API", 212 | group_name=LAYER, 213 | ) 214 | def silver_collected_book(context, silver_cleaned_book: DataFrame) -> Output[DataFrame]: 215 | """ 216 | Collect more information about cleaned books 217 | - Authors: if missing 218 | - Number of pages: if missing 219 | """ 220 | 221 | spark_df = silver_cleaned_book 222 | context.log.debug("Caching spark_df ...") 223 | spark_df.cache() 224 | 225 | context.log.info("Starting filling missing data ...") 226 | null_authors_df = spark_df.filter( 227 | (spark_df.Authors.isNull()) | (spark_df.Authors == "") 228 | ) 229 | null_pages_number_df = spark_df.filter((spark_df.PagesNumber.isNull())) 230 | 231 | count = 0 232 | for row in null_authors_df.select("ISBN").collect(): 233 | isbn = row[0] 234 | context.log.debug(f"Got isbn: {isbn}") 235 | if isbn is not None: 236 | # Get request from OpenLibrary API 237 | req = requests.get( 238 | f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&format=json&jscmd=data" 239 | ) 240 | json = req.json() 241 | if len(json.keys()) > 0: 242 | context.log.debug("Got json with data") 243 | # Check if spark_df with column 'ISBN' = isbn has missing value in column 'Authors' 244 | row_from_df = spark_df.filter(spark_df.ISBN == isbn).collect()[0] 245 | if row_from_df.Authors is None or row_from_df.Authors == "": 246 | context.log.debug("Authors is missing, start filling ...") 247 | # Take the first author 248 | author = json.get(f"ISBN:{isbn}", {}).get("authors", []) 249 | author = author[0].get("name") if len(author) > 0 else None 250 | if author: 251 | count += 1 252 | # Update spark_df with column 'ISBN' = isbn and column 'Authors' = author 253 | spark_df = spark_df.withColumn( 254 | "Authors", 255 | when( 256 | (spark_df.ISBN == isbn) 257 | & ( 258 | (spark_df.Authors.isNull()) 259 | | (spark_df.Authors == "") 260 | ), 261 | author, 262 | ).otherwise(spark_df.Authors), 263 | ) 264 | context.log.info(f"Filled in {count} authors") 265 | 266 | count = 0 267 | for row in null_pages_number_df.select("ISBN").collect(): 268 | isbn = row[0] 269 | context.log.debug(f"Got isbn: {isbn}") 270 | if isbn is not None: 271 | # Get request from OpenLibrary API 272 | req = requests.get( 273 | f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&format=json&jscmd=data" 274 | ) 275 | json = req.json() 276 | if len(json.keys()) > 0: 277 | context.log.debug("Got json with real data") 278 | # Get the row of spark_df with column 'ISBN' = isbn 279 | row_from_df = spark_df.filter(spark_df.ISBN == isbn).collect()[0] 280 | # Check if spark_df with column 'ISBN' = isbn has missing value in column 'PagesNumber' 281 | if row_from_df.PagesNumber is None or row_from_df.PagesNumber == 0: 282 | context.log.debug("PagesNumber is missing, start filling ...") 283 | # Take the number of pages 284 | pages_number = json.get(f"ISBN:{isbn}", {}).get("number_of_pages") 285 | if pages_number: 286 | count += 1 287 | # Update spark_df with column 'ISBN' = isbn and column 'PagesNumber' = pages_number 288 | spark_df = spark_df.withColumn( 289 | "PagesNumber", 290 | when( 291 | (spark_df.ISBN == isbn) 292 | & (spark_df.PagesNumber.isNull()), 293 | pages_number, 294 | ).otherwise(spark_df.PagesNumber), 295 | ) 296 | context.log.info(f"Filled in
{count} page numbers") 297 | 298 | spark_df.unpersist() 299 | 300 | return Output( 301 | value=spark_df, 302 | metadata={ 303 | "table": "silver_collected_book", 304 | "row_count": spark_df.count(), 305 | "column_count": len(spark_df.columns), 306 | "columns": spark_df.columns, 307 | }, 308 | ) 309 | 310 | 311 | # ISBN 312 | @asset( 313 | description="Extract column 'ISBN' from silver_cleaned_book", 314 | ins={ 315 | "silver_cleaned_book": AssetIn( 316 | key_prefix=["silver", "goodreads"], 317 | metadata={"full_load": True}, 318 | ), 319 | }, 320 | io_manager_key="spark_io_manager", 321 | key_prefix=["silver", "goodreads"], 322 | compute_kind=COMPUTE_KIND, 323 | group_name=LAYER, 324 | ) 325 | def silver_isbn(context, silver_cleaned_book: DataFrame) -> Output[DataFrame]: 326 | """ 327 | Extract column 'ISBN' from cleaned book 328 | """ 329 | 330 | context.log.debug("Extracting ISBN ...") 331 | spark_df = silver_cleaned_book.select("ISBN") 332 | 333 | return Output( 334 | value=spark_df, 335 | metadata={ 336 | "table": "silver_isbn", 337 | "row_count": spark_df.count(), 338 | "column_count": len(spark_df.columns), 339 | "columns": spark_df.columns, 340 | }, 341 | ) 342 | 343 | 344 | # Silver collected genre 345 | @asset( 346 | description="Collect more information about cleaned genres", 347 | ins={ 348 | "silver_isbn": AssetIn( 349 | key_prefix=["silver", "goodreads"], 350 | ), 351 | "silver_cleaned_genre": AssetIn( 352 | key_prefix=["silver", "goodreads"], 353 | ), 354 | }, 355 | io_manager_key="spark_io_manager", 356 | key_prefix=["silver", "goodreads"], 357 | compute_kind="OpenLibrary API", 358 | group_name=LAYER, 359 | ) 360 | def silver_collected_genre( 361 | context, silver_isbn: DataFrame, silver_cleaned_genre: DataFrame 362 | ) -> Output[DataFrame]: 363 | """ 364 | Collect more information about cleaned genres, with upstream isbn 365 | Connect to OpenLibrary API to get more information about genre, 366 | and union to silver_cleaned_genre, unique by 'Name' 367 | """ 368 | # NOTE: the OpenLibrary enrichment described above is not implemented yet; the cleaned genres are passed through unchanged 369 | return Output( 370 | value=silver_cleaned_genre, 371 | metadata={ 372 | "table": "silver_collected_genre", 373 | "row_count": silver_cleaned_genre.count(), 374 | "column_count": len(silver_cleaned_genre.columns), 375 | "columns": silver_cleaned_genre.columns, 376 | }, 377 | ) 378 | 379 | 380 | # Silver collected book_genre 381 | @asset( 382 | description="Collect more relationships about books and genres", 383 | ins={ 384 | "silver_isbn": AssetIn( 385 | key_prefix=["silver", "goodreads"], 386 | ), 387 | "silver_collected_genre": AssetIn( 388 | key_prefix=["silver", "goodreads"], 389 | ), 390 | "bronze_book_genre": AssetIn( 391 | key_prefix=["bronze", "goodreads"], 392 | ), 393 | }, 394 | io_manager_key="spark_io_manager", 395 | key_prefix=["silver", "goodreads"], 396 | compute_kind="OpenLibrary API", 397 | group_name=LAYER, 398 | ) 399 | def silver_collected_book_genre( 400 | context, 401 | silver_isbn: DataFrame, 402 | silver_collected_genre: DataFrame, 403 | bronze_book_genre: pl.DataFrame, 404 | ) -> Output[DataFrame]: 405 | """ 406 | Collect more relationships about books and genres 407 | """ 408 | 409 | config = { 410 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 411 | "minio_access_key": os.getenv("MINIO_ACCESS_KEY"), 412 | "minio_secret_key": os.getenv("MINIO_SECRET_KEY"), 413 | } 414 | 415 | context.log.debug("Start creating spark session") 416 | 417 | # Convert bronze_book_genre from polars DataFrame to Spark DataFrame 418 | pandas_df = bronze_book_genre.to_pandas() 419 | context.log.debug(f"Converted to
pandas DataFrame with shape: {pandas_df.shape}") 420 | 421 | with get_spark_session(config) as spark: 422 | spark_df = spark.createDataFrame(pandas_df) 423 | spark_df.cache() 424 | context.log.info("Got Spark DataFrame") 425 | 426 | return Output( 427 | value=spark_df, 428 | metadata={ 429 | "table": "silver_collected_book_genre", 430 | "row_count": spark_df.count(), 431 | "column_count": len(spark_df.columns), 432 | "columns": spark_df.columns, 433 | }, 434 | ) 435 | -------------------------------------------------------------------------------- /preprocess.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import polars as pl" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "(1850310, 20)" 21 | ] 22 | }, 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "books_df = pl.read_csv(\"dataset/full_dataset.csv\")\n", 30 | "books_df.shape" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "(1850198, 20)" 42 | ] 43 | }, 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "output_type": "execute_result" 47 | } 48 | ], 49 | "source": [ 50 | "books_df = books_df.unique()\n", 51 | "books_df.shape # Reduce 224/2 = 112 rows" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "low = books_df.filter(books_df['PublishYear'] < 1700).shape[0]\n", 61 | "high = books_df.filter(books_df['PublishYear'] > 2021).shape[0]" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 5, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "(1850149, 20)" 73 | ] 74 | }, 75 | "execution_count": 5, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "books_df = books_df[low:len(books_df)-high]\n", 82 | "books_df.shape" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 6, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "books_df = books_df.with_columns(books_df['Language'].str.replace('en-US', 'eng')\n", 92 | " .str.replace('en-GB', 'eng')\n", 93 | " .str.replace('en-CA', 'eng')\n", 94 | " .str.replace('--', 'eng')\n", 95 | " .str.replace('nl', 'nld')\n", 96 | " )" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 7, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/html": [ 107 | "
\n", 113 | "shape: (3, 2)
Languagecounts
stru32
"vie"2
null1598356
"eng"209656
" 114 | ], 115 | "text/plain": [ 116 | "shape: (3, 2)\n", 117 | "┌──────────┬─────────┐\n", 118 | "│ Language ┆ counts │\n", 119 | "│ --- ┆ --- │\n", 120 | "│ str ┆ u32 │\n", 121 | "╞══════════╪═════════╡\n", 122 | "│ vie ┆ 2 │\n", 123 | "│ null ┆ 1598356 │\n", 124 | "│ eng ┆ 209656 │\n", 125 | "└──────────┴─────────┘" 126 | ] 127 | }, 128 | "execution_count": 7, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "books_df = books_df.filter((books_df['Language'] == 'eng') | (books_df['Language'] == 'vie') | books_df['Language'].is_null())\n", 135 | "books_df['Language'].value_counts()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 8, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "(1808014, 20)" 147 | ] 148 | }, 149 | "execution_count": 8, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "books_df.shape" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 9, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "(1803163, 20)" 167 | ] 168 | }, 169 | "execution_count": 9, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "books_df = books_df.drop_nulls(\"ISBN\")\n", 176 | "books_df.shape" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 10, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "419030" 188 | ] 189 | }, 190 | "execution_count": 10, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "books_df = books_df.sort('PagesNumber')\n", 197 | "n = books_df.filter(books_df['PagesNumber'] > 350).shape[0]\n", 198 | "n" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 11, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "(1384133, 20)" 210 | ] 211 | }, 212 | "execution_count": 11, 213 | "metadata": {}, 214 | "output_type": "execute_result" 215 | } 216 | ], 217 | "source": [ 218 | "books_df = books_df[:len(books_df)-n]\n", 219 | "books_df.shape" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 12, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "data": { 229 | "text/plain": [ 230 | "(1245688, 20)" 231 | ] 232 | }, 233 | "execution_count": 12, 234 | "metadata": {}, 235 | "output_type": "execute_result" 236 | } 237 | ], 238 | "source": [ 239 | "books_df = books_df.unique(subset=['Name'])\n", 240 | "books_df.shape" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 13, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/html": [ 251 | "
\n", 257 | "shape: (1245688, 20)
IdNameAuthorsISBNRatingPublishYearPublishMonthPublishDayPublisherRatingDist5RatingDist4RatingDist3RatingDist2RatingDist1RatingDistTotalCountsOfReviewLanguageDescriptionCount of text reviewsPagesNumber
i64strstrstrf64i64i64i64strstrstrstrstrstrstri64strstrstri64
1900656"Good In Bed""Jennifer Weine…"0743508467"3.73200151"Simon & Schust…"5:78557""4:92923""3:71496""2:21959""1:13187""total:278122"0null"Jennifer Weine…null0
1900780"Letters from N…"Christopher J.…"1400105390"3.622007925"Tantor Media""5:28""4:70""3:50""2:15""1:4""total:167"1null"Senator Christ…null0
1900805"Tuesdays With …"Mitch Albom""0739311115"4.11200461"Random House A…"5:333959""4:249424""3:131086""2:36787""1:14351""total:765607"1null"This true stor…null0
1902872"The Bicycle Bo…"Geoff Apps""051708743X"2.01993214"Crescent""5:0""4:0""3:0""2:1""1:0""total:1"1nullnullnull0
1903429"Fair-Weather L…"Carla Bracale""0804102406"3.611989130"Ivy Books""5:21""4:15""3:24""2:11""1:1""total:72"0null"Sink or swim..…null0
1903542"The Adversity …"Paul G. Stoltz…"1400103584"3.98200711"Tantor Media""5:55""4:66""3:36""2:7""1:3""total:167"0null"A <i>Wall Stre…null0
1904282"Tell The World…"Liu Binyan""0517088355"3.671992816"Random House V…"5:1""4:4""3:4""2:0""1:0""total:9"1nullnullnull0
1904406"Hh-Big Red Bar…"Happy House""0394823915"0.01989419"Random House B…"5:0""4:0""3:0""2:0""1:0""total:0"0nullnullnull0
1904941"The Fourth K""Mario Puzo""0679423427"3.58199136"Random House A…"5:623""4:1085""3:1045""2:341""1:77""total:3171"0null"<b>A PRESIDENT…null0
1905496"1988 Baseball …"Donruss, Fleer…"0517660431"0.01988530"Beekman House""5:0""4:0""3:0""2:0""1:0""total:0"0nullnullnull0
1905683"The Princess a…"George MacDona…"140015085X"4.07200361"Tantor Media""5:3196""4:2851""3:1669""2:349""1:97""total:8162"0null"In this sequel…null0
1905755"Our Only May A…"Jennifer L. Ho…"0807282340"3.82200052"Listening Libr…"5:2178""4:2928""3:2062""2:506""1:169""total:7843"3null"Twelve-year-ol…null0
1185341"The Illinois …"Michael P. Con…"0875801285"3.5198818"Northern Illin…"5:0""4:1""3:1""2:0""1:0""total:2"0nullnull"0"350
1185950"At the End of …"Carole Minard""141373801X"0.02004126"America Star B…"5:0""4:0""3:0""2:0""1:0""total:0"0null"After being hu…"0"350
1185963"Immigration In…"Donald J. Puch…"1855674513"0.0199714"Cassell""5:0""4:0""3:0""2:0""1:0""total:0"0null"The essays in …"0"350
1187409"Performance-Ba…"Joan Boykoff B…"0226038033"0.01996124"The National S…"5:0""4:0""3:0""2:0""1:0""total:0"0null"Reforming our …"0"350
1187807"Community Jour…"Jock Lauterer""0813802865"0.02000174"Wiley-Blackwel…"5:0""4:0""3:0""2:0""1:0""total:0"0nullnull"0"350
1187954"I Am Dracula""C. Dean Anders…"0821760254"3.891998110"Zebra""5:49""4:33""3:18""2:10""1:8""total:118"12null"From the Carpa…"12"350
1194027"The Male Parad…"John Munder""0671705172"3.331992110"Simon & Schust…"5:0""4:1""3:2""2:0""1:0""total:3"1null"In chapters dr…"1"350
1195221"The Lamentable…"Edgardo Vega Y…"1585676306"3.812004111"Overlook Press…"5:27""4:33""3:22""2:5""1:4""total:91"1null"<br />Writing …"1"350
1195839"Catalogue of P…"Lane Poole""1851240764"0.02008157"Bodleian Libra…"5:0""4:0""3:0""2:0""1:0""total:0"0null"Fully illustra…"0"350
1198445"The Lancastria…"Simon Walker""0198201745"4.7519903112"Oxford Univers…"5:3""4:1""3:0""2:0""1:0""total:4"1null"John of Gaunt …"1"350
1199215"Call to Arms (…"Livia Hallam""1581824793"0.02005110"Cumberland Hou…"5:0""4:0""3:0""2:0""1:0""total:0"0null"On December 20…"0"350
1199788"Hong Kong (Jak…"Stephen Coonts…"0312253397"3.872000129"St. Martin's P…"5:508""4:721""3:472""2:93""1:28""total:1822"49"eng"null"49"350
" 258 | ], 259 | "text/plain": [ 260 | "shape: (1245688, 20)\n", 261 | "┌─────────┬────────────┬───────────┬───────────┬───┬──────────┬───────────┬────────────┬───────────┐\n", 262 | "│ Id ┆ Name ┆ Authors ┆ ISBN ┆ … ┆ Language ┆ Descripti ┆ Count of ┆ PagesNumb │\n", 263 | "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ on ┆ text ┆ er │\n", 264 | "│ i64 ┆ str ┆ str ┆ str ┆ ┆ str ┆ --- ┆ reviews ┆ --- │\n", 265 | "│ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ --- ┆ i64 │\n", 266 | "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ │\n", 267 | "╞═════════╪════════════╪═══════════╪═══════════╪═══╪══════════╪═══════════╪════════════╪═══════════╡\n", 268 | "│ 1900656 ┆ Good In ┆ Jennifer ┆ 074350846 ┆ … ┆ null ┆ Jennifer ┆ null ┆ 0 │\n", 269 | "│ ┆ Bed ┆ Weiner ┆ 7 ┆ ┆ ┆ Weiner's ┆ ┆ │\n", 270 | "│ ┆ ┆ ┆ ┆ ┆ ┆ Good ┆ ┆ │\n", 271 | "│ ┆ ┆ ┆ ┆ ┆ ┆ in Bed… ┆ ┆ │\n", 272 | "│ 1900780 ┆ Letters ┆ Christoph ┆ 140010539 ┆ … ┆ null ┆ Senator ┆ null ┆ 0 │\n", 273 | "│ ┆ from ┆ er J. ┆ 0 ┆ ┆ ┆ Christoph ┆ ┆ │\n", 274 | "│ ┆ Nuremberg: ┆ Dodd ┆ ┆ ┆ ┆ er J. ┆ ┆ │\n", 275 | "│ ┆ My Fathe… ┆ ┆ ┆ ┆ ┆ Dodd ┆ ┆ │\n", 276 | "│ ┆ ┆ ┆ ┆ ┆ ┆ (Con… ┆ ┆ │\n", 277 | "│ 1900805 ┆ Tuesdays ┆ Mitch ┆ 073931111 ┆ … ┆ null ┆ This true ┆ null ┆ 0 │\n", 278 | "│ ┆ With ┆ Albom ┆ 5 ┆ ┆ ┆ story ┆ ┆ │\n", 279 | "│ ┆ Morrie: An ┆ ┆ ┆ ┆ ┆ about the ┆ ┆ │\n", 280 | "│ ┆ Old Man… ┆ ┆ ┆ ┆ ┆ love b… ┆ ┆ │\n", 281 | "│ 1902872 ┆ The ┆ Geoff ┆ 051708743 ┆ … ┆ null ┆ null ┆ null ┆ 0 │\n", 282 | "│ ┆ Bicycle ┆ Apps ┆ X ┆ ┆ ┆ ┆ ┆ │\n", 283 | "│ ┆ Book: ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 284 | "│ ┆ Complete ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 285 | "│ ┆ Maint… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 286 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", 287 | "│ 1195839 ┆ Catalogue ┆ Lane ┆ 185124076 ┆ … ┆ null ┆ Fully ill ┆ 0 ┆ 350 │\n", 288 | "│ ┆ of ┆ Poole ┆ 4 ┆ ┆ ┆ ustrated, ┆ ┆ │\n", 289 | "│ ┆ Portraits ┆ ┆ ┆ ┆ ┆ with ┆ ┆ │\n", 290 | "│ ┆ in the Bo… ┆ ┆ ┆ ┆ ┆ full-pag… ┆ ┆ │\n", 291 | "│ 1198445 ┆ The Lancas ┆ Simon ┆ 019820174 ┆ … ┆ null ┆ John of ┆ 1 ┆ 350 │\n", 292 | "│ ┆ trian ┆ Walker ┆ 5 ┆ ┆ ┆ Gaunt was ┆ ┆ │\n", 293 | "│ ┆ Affinity ┆ ┆ ┆ ┆ ┆ arguably ┆ ┆ │\n", 294 | "│ ┆ 1361 13… ┆ ┆ ┆ ┆ ┆ the m… ┆ ┆ │\n", 295 | "│ 1199215 ┆ Call to ┆ Livia ┆ 158182479 ┆ … ┆ null ┆ On ┆ 0 ┆ 350 │\n", 296 | "│ ┆ Arms ┆ Hallam ┆ 3 ┆ ┆ ┆ December ┆ ┆ │\n", 297 | "│ ┆ (Palmetto ┆ ┆ ┆ ┆ ┆ 20, 1860, ┆ ┆ │\n", 298 | "│ ┆ Trilogy, … ┆ ┆ ┆ ┆ ┆ two ┆ ┆ │\n", 299 | "│ ┆ ┆ ┆ ┆ ┆ ┆ friend… ┆ ┆ │\n", 300 | "│ 1199788 ┆ Hong Kong ┆ Stephen ┆ 031225339 ┆ … ┆ eng ┆ null ┆ 49 ┆ 350 │\n", 301 | "│ ┆ (Jake ┆ Coonts ┆ 7 ┆ ┆ ┆ ┆ ┆ │\n", 302 | "│ ┆ Grafton ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 303 | "│ ┆ #8) ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 304 | "└─────────┴────────────┴───────────┴───────────┴───┴──────────┴───────────┴────────────┴───────────┘" 305 | ] 306 | }, 307 | "execution_count": 13, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "books_df" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 14, 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "(1245577, 20)" 325 | ] 326 | }, 327 | "execution_count": 14, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "books_df = books_df.filter((books_df['PublishYear'] >= 1900) & (books_df['PublishYear'] <= 2021))\n", 334 | "books_df.shape" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 15, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "data": { 344 | "text/plain": [ 345 | "0" 346 | ] 347 | }, 348 | "execution_count": 15, 349 | "metadata": {}, 350 | 
"output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "books_df['PublishYear'].null_count()" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 23, 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/html": [ 365 | "
\n", 371 | "shape: (31, 2)
PublishMonthcounts
i64u32
852984
247843
166727
749209
3117359
1523649
237297
656532
147193
227444
3013765
217584
197474
1150881
277896
360740
267817
249068
1062994
186916
259656
965666
1415458
1710259
" 372 | ], 373 | "text/plain": [ 374 | "shape: (31, 2)\n", 375 | "┌──────────────┬────────┐\n", 376 | "│ PublishMonth ┆ counts │\n", 377 | "│ --- ┆ --- │\n", 378 | "│ i64 ┆ u32 │\n", 379 | "╞══════════════╪════════╡\n", 380 | "│ 8 ┆ 52984 │\n", 381 | "│ 24 ┆ 7843 │\n", 382 | "│ 16 ┆ 6727 │\n", 383 | "│ 7 ┆ 49209 │\n", 384 | "│ … ┆ … │\n", 385 | "│ 25 ┆ 9656 │\n", 386 | "│ 9 ┆ 65666 │\n", 387 | "│ 1 ┆ 415458 │\n", 388 | "│ 17 ┆ 10259 │\n", 389 | "└──────────────┴────────┘" 390 | ] 391 | }, 392 | "execution_count": 23, 393 | "metadata": {}, 394 | "output_type": "execute_result" 395 | } 396 | ], 397 | "source": [ 398 | "books_df['PublishMonth'].value_counts()" 399 | ] 400 | } 401 | ], 402 | "metadata": { 403 | "kernelspec": { 404 | "display_name": "project-zhl6RxJh", 405 | "language": "python", 406 | "name": "python3" 407 | }, 408 | "language_info": { 409 | "codemirror_mode": { 410 | "name": "ipython", 411 | "version": 3 412 | }, 413 | "file_extension": ".py", 414 | "mimetype": "text/x-python", 415 | "name": "python", 416 | "nbconvert_exporter": "python", 417 | "pygments_lexer": "ipython3", 418 | "version": "3.10.6" 419 | }, 420 | "orig_nbformat": 4 421 | }, 422 | "nbformat": 4, 423 | "nbformat_minor": 2 424 | } 425 | --------------------------------------------------------------------------------