├── elt_pipeline ├── dbt_transform │ ├── macros │ │ └── .gitkeep │ ├── seeds │ │ └── .gitkeep │ ├── tests │ │ └── .gitkeep │ ├── analyses │ │ └── .gitkeep │ ├── snapshots │ │ └── .gitkeep │ ├── .gitignore │ ├── config │ │ ├── .user.yml │ │ └── profiles.yml │ ├── models │ │ ├── recommendations │ │ │ ├── search.sql │ │ │ ├── search_prior.sql │ │ │ └── criteria.sql │ │ ├── sources.yml │ │ └── schema.yml │ ├── README.md │ └── dbt_project.yml ├── elt_pipeline_tests │ ├── __init__.py │ └── test_assets.py ├── setup.cfg ├── pyproject.toml ├── setup.py ├── requirements.txt ├── elt_pipeline │ ├── assets │ │ ├── __init__.py │ │ ├── bronze.py │ │ ├── warehouse.py │ │ ├── gold.py │ │ └── silver.py │ ├── resources │ │ ├── mysql_io_manager.py │ │ ├── minio_io_manager.py │ │ ├── spark_io_manager.py │ │ ├── psql_io_manager.py │ │ └── gdrive_io_manager.py │ └── __init__.py ├── README.md ├── Dockerfile └── logs │ └── dbt.log ├── pg_hba.conf ├── images ├── docker_1.png ├── docker_2.png ├── gdrive_1.png ├── gdrive_2.png ├── gdrive_3.png ├── gdrive_4.png ├── gdrive_5.png ├── gdrive_6.png ├── gdrive_7.png ├── gdrive_8.png ├── gdrive_9.png ├── assets_dbt.png ├── assets_gold.png ├── gdrive_10.png ├── gdrive_11.png ├── gdrive_12.png ├── assets_bronze.png ├── assets_silver.png ├── design_schema.png ├── introduction.jpg ├── assets_general.png ├── assets_warehouse.png ├── design_pipeline.png ├── directory_tree.png ├── goodreads_logo.png └── datalake_structure.png ├── dockerimages ├── dagster │ ├── requirements.txt │ └── Dockerfile ├── streamlit │ ├── requirements.txt │ └── Dockerfile └── spark │ ├── spark-defaults.conf │ └── Dockerfile ├── app ├── .streamlit │ └── secrets.toml └── streamlit_app.py ├── dagster_home ├── workspace.yaml └── dagster.yaml ├── spark_master.env.template ├── spark_workder.env.template ├── .gitignore ├── tree_shorten.txt ├── requirements.txt ├── datalake_structure.txt ├── load_dataset ├── mysql_load.sql ├── psql_datasource.sql └── mysql_datasource.sql ├── Pipfile ├── LICENSE ├── env.template ├── tree.txt ├── Makefile ├── docker-compose.yml ├── README.md └── preprocess.ipynb /elt_pipeline/dbt_transform/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline_tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline_tests/test_assets.py: -------------------------------------------------------------------------------- 1 | 2 | 
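Note: the scaffolded test module elt_pipeline_tests/test_assets.py above is empty. A minimal smoke test, sketched here under the assumption that only the pure helper connect_mysql() from elt_pipeline/elt_pipeline/resources/mysql_io_manager.py is exercised (the test name and credential values are hypothetical), could look like:

from elt_pipeline.resources.mysql_io_manager import connect_mysql


def test_connect_mysql_builds_connection_uri():
    # connect_mysql() only formats a connection string, so it can be
    # verified without a running MySQL instance.
    config = {
        "user": "admin",            # hypothetical example values
        "password": "admin123",
        "host": "de_mysql",
        "port": 3306,
        "database": "goodreads",
    }
    assert connect_mysql(config) == "mysql://admin:admin123@de_mysql:3306/goodreads"

Such a test would be run with `pytest elt_pipeline_tests`, as described in elt_pipeline/README.md.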
-------------------------------------------------------------------------------- /elt_pipeline/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = elt_pipeline 3 | -------------------------------------------------------------------------------- /pg_hba.conf: -------------------------------------------------------------------------------- 1 | local all all trust 2 | host all all 0.0.0.0/0 trust 3 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/config/.user.yml: -------------------------------------------------------------------------------- 1 | id: 37d26752-a903-4570-88b2-7fc1505deb31 2 | -------------------------------------------------------------------------------- /images/docker_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/docker_1.png -------------------------------------------------------------------------------- /images/docker_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/docker_2.png -------------------------------------------------------------------------------- /images/gdrive_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_1.png -------------------------------------------------------------------------------- /images/gdrive_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_2.png -------------------------------------------------------------------------------- /images/gdrive_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_3.png -------------------------------------------------------------------------------- /images/gdrive_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_4.png -------------------------------------------------------------------------------- /images/gdrive_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_5.png -------------------------------------------------------------------------------- /images/gdrive_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_6.png -------------------------------------------------------------------------------- /images/gdrive_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_7.png -------------------------------------------------------------------------------- 
/images/gdrive_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_8.png -------------------------------------------------------------------------------- /images/gdrive_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_9.png -------------------------------------------------------------------------------- /images/assets_dbt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/assets_dbt.png -------------------------------------------------------------------------------- /images/assets_gold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/assets_gold.png -------------------------------------------------------------------------------- /images/gdrive_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_10.png -------------------------------------------------------------------------------- /images/gdrive_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_11.png -------------------------------------------------------------------------------- /images/gdrive_12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/gdrive_12.png -------------------------------------------------------------------------------- /images/assets_bronze.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/assets_bronze.png -------------------------------------------------------------------------------- /images/assets_silver.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/assets_silver.png -------------------------------------------------------------------------------- /images/design_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/design_schema.png -------------------------------------------------------------------------------- /images/introduction.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/introduction.jpg -------------------------------------------------------------------------------- /images/assets_general.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/assets_general.png -------------------------------------------------------------------------------- /images/assets_warehouse.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/assets_warehouse.png -------------------------------------------------------------------------------- /images/design_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/design_pipeline.png -------------------------------------------------------------------------------- /images/directory_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/directory_tree.png -------------------------------------------------------------------------------- /images/goodreads_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/goodreads_logo.png -------------------------------------------------------------------------------- /dockerimages/dagster/requirements.txt: -------------------------------------------------------------------------------- 1 | dagster==1.2.7 2 | dagit==1.2.7 3 | dagster-postgres==0.18.7 4 | dagster-dbt==0.18.7 5 | -------------------------------------------------------------------------------- /images/datalake_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lelouvincx/goodreads-elt-pipeline/HEAD/images/datalake_structure.png -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/models/recommendations/search.sql: -------------------------------------------------------------------------------- 1 | select 2 | isbn, 3 | name 4 | from {{ source('gold', 'book_with_info') }} 5 | -------------------------------------------------------------------------------- /app/.streamlit/secrets.toml: -------------------------------------------------------------------------------- 1 | [postgres] 2 | host = "de_psql" 3 | port = 5432 4 | dbname = "goodreads" 5 | user = "admin" 6 | password = "admin123" 7 | -------------------------------------------------------------------------------- /dagster_home/workspace.yaml: -------------------------------------------------------------------------------- 1 | load_from: 2 | - grpc_server: 3 | host: elt_pipeline 4 | port: 4000 5 | location_name: "elt_pipeline" 6 | -------------------------------------------------------------------------------- /dockerimages/streamlit/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.21.0 2 | psycopg2-binary==2.9.6 3 | altair==4.2.2 4 | pandas==1.5.3 5 | polars==0.16.16 6 | minio==7.1.13 7 | -------------------------------------------------------------------------------- /elt_pipeline/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.dagster] 6 | module_name = "elt_pipeline" 7 | -------------------------------------------------------------------------------- /spark_master.env.template: -------------------------------------------------------------------------------- 1 | SPARK_MODE=master 2 | SPARK_LOCAL_IP=spark-master 3 | SPARK_RPC_AUTHENTICATION_ENABLED=no 4 | SPARK_RPC_ENCRYPTION_ENABLED=no 5 | SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no 6 | 
SPARK_SSL_ENABLED=no 7 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/models/recommendations/search_prior.sql: -------------------------------------------------------------------------------- 1 | select 2 | isbn, 3 | name 4 | from {{ source('gold', 'book_with_info') }} 5 | right join {{ source('recommendations', 'book_download_link') }} using (isbn) 6 | where link is not null 7 | -------------------------------------------------------------------------------- /spark_workder.env.template: -------------------------------------------------------------------------------- 1 | SPARK_MODE=worker 2 | SPARK_MASTER_URL=spark://spark-master:7077 3 | SPARK_WORKER_MEMORY=4G 4 | SPARK_WORKER_CORES=4 5 | SPARK_RPC_AUTHENTICATION_ENABLED=no 6 | SPARK_RPC_ENCRYPTION_ENABLED=no 7 | SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no 8 | SPARK_SSL_ENABLED=no 9 | -------------------------------------------------------------------------------- /elt_pipeline/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="elt_pipeline", 5 | packages=find_packages(exclude=["elt_pipeline_tests"]), 6 | install_requires=[ 7 | "dagster", 8 | "dagster-cloud" 9 | ], 10 | extras_require={"dev": ["dagit", "pytest"]}, 11 | ) 12 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/models/sources.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | sources: 3 | - name: gold 4 | tables: 5 | - name: genre 6 | - name: book_genre 7 | - name: book_with_info 8 | - name: book_with_publish 9 | - name: book_with_rating 10 | - name: recommendations 11 | tables: 12 | - name: book_download_link 13 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/config/profiles.yml: -------------------------------------------------------------------------------- 1 | dbt_transform: 2 | outputs: 3 | dev: 4 | type: postgres 5 | threads: 1 6 | host: "{{ env_var('DBT_HOST') }}" 7 | port: 5432 8 | user: "{{ env_var('DBT_USER') }}" 9 | pass: "{{ env_var('DBT_PASSWORD') }}" 10 | dbname: "{{ env_var('DBT_DATABASE') }}" 11 | schema: "{{ env_var('DBT_SCHEMA') }}" 12 | target: dev 13 | -------------------------------------------------------------------------------- /elt_pipeline/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.5.3 2 | polars==0.16.16 3 | dagit==1.2.7 4 | dagster==1.2.7 5 | dagster-postgres==0.18.7 6 | dagster-dbt==0.18.7 7 | SQLAlchemy==1.4.46 8 | pymysql==1.0.2 9 | cryptography==38.0.3 10 | pyarrow==11.0.0 11 | fsspec==2023.3.0 12 | minio==7.1.13 13 | connectorx==0.3.1 14 | google-api-python-client==2.84.0 15 | google-auth-httplib2==0.1.0 16 | google-auth-oauthlib==1.0.0 17 | pyspark==3.3.2 18 | dbt-core==1.4.5 19 | dbt-postgres==1.4.5 20 | pytz==2022.7.1 21 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/models/recommendations/criteria.sql: -------------------------------------------------------------------------------- 1 | with tmp_avg_rating as ( 2 | select 3 | isbn, 4 | rating 5 | from {{ source('gold', 'book_with_rating') }} 6 | ), 7 | tmp_download_link as ( 8 | select 9 | isbn, 10 | case 11 | when link is null then 0 12 | else 1 13 | end as hasdownloadlink, 14 | rating 15 | from {{ 
source('recommendations', 'book_download_link') }} 16 | right join tmp_avg_rating using (isbn) 17 | ) 18 | 19 | select * 20 | from tmp_download_link 21 | -------------------------------------------------------------------------------- /dockerimages/streamlit/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.16-slim 2 | 3 | WORKDIR /app 4 | 5 | RUN apt-get update && apt-get install -y \ 6 | build-essential \ 7 | curl \ 8 | software-properties-common \ 9 | git \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | COPY . . 13 | 14 | RUN pip install --upgrade pip && pip install -r requirements.txt 15 | 16 | EXPOSE 8501 17 | 18 | HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health 19 | 20 | ENTRYPOINT ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"] 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python env 2 | venv/ 3 | Pipfile.lock 4 | 5 | # Other env 6 | .env 7 | .spark_master.env 8 | .spark_worker.env 9 | .streamlit/* 10 | 11 | # Cache 12 | .cache* 13 | __pycache__/ 14 | storage/ 15 | dataset/ 16 | minio/ 17 | .coverage 18 | .spark_session.txt 19 | 20 | # Google secrets 21 | gdrive_client_secret.json 22 | client_secret.json 23 | token_drive_v3.pickle 24 | 25 | # dagster 26 | dagster_home/logs/ 27 | dagster_home/.logs_queue/ 28 | dagster_home/.telemetry/ 29 | dagster_home/history/ 30 | dagster_home/schedules/* 31 | 32 | # EDA 33 | ./EDA.ipynb 34 | -------------------------------------------------------------------------------- /dockerimages/dagster/Dockerfile: -------------------------------------------------------------------------------- 1 | # Dagster libraries to run both dagit and the dagster-daemon. Does not 2 | # need to have access to any pipeline code. 3 | FROM python:3.9.16-slim 4 | 5 | # Set $DAGSTER_HOME and copy dagster instance and workspace YAML there 6 | ENV DAGSTER_HOME=/opt/dagster/dagster_home 7 | RUN mkdir -p $DAGSTER_HOME && \ 8 | mkdir -p $DAGSTER_HOME/storage && \ 9 | mkdir -p $DAGSTER_HOME/compute_logs && \ 10 | mkdir -p $DAGSTER_HOME/local_artifact_storage 11 | 12 | WORKDIR $DAGSTER_HOME 13 | COPY requirements.txt $DAGSTER_HOME 14 | RUN pip install --upgrade pip && pip install -r requirements.txt 15 | -------------------------------------------------------------------------------- /tree_shorten.txt: -------------------------------------------------------------------------------- 1 | ├── app 2 | ├── dagster_home 3 | ├── dataset 4 | ├── docker-compose.yml 5 | ├── dockerimages 6 | ├── EDA.ipynb 7 | ├── elt_pipeline 8 | │   ├── dbt_transform 9 | │   ├── Dockerfile 10 | │   ├── elt_pipeline 11 | │   ├── requirements.txt 12 | ├── .env 13 | ├── env.template 14 | ├── .git 15 | ├── .gitignore 16 | ├── load_dataset 17 | ├── Makefile 18 | ├── pg_hba.conf 19 | ├── Pipfile 20 | ├── Pipfile.lock 21 | ├── README.md 22 | ├── requirements.txt 23 | ├── .spark_master.env 24 | ├── spark_master.env.template 25 | ├── .spark_session.txt 26 | ├── spark_workder.env.template 27 | ├── .spark_worker.env 28 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 
2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dagit==1.2.3 2 | dagster==1.2.3 3 | dagster-postgres==0.18.3 4 | dagster-dbt==0.18.3 5 | dagster-spark==0.18.3 6 | pandas==1.5.3 7 | polars==0.16.16 8 | pyspark==3.3.2 9 | SQLAlchemy==1.4.46 10 | pymysql==1.0.2 11 | cryptography==38.0.3 12 | pyarrow==11.0.0 13 | fsspec==2023.3.0 14 | minio==7.1.13 15 | dbt-core==1.4.5 16 | dbt-postgres==1.4.5 17 | pytest==7.2.2 18 | pylint==2.17.1 19 | pytest-cov==4.0.0 20 | autopep8==2.0.2 21 | ydata-profiling==4.1.2 22 | connectorx==0.3.1 23 | google-api-python-client==2.84.0 24 | google-auth-httplib2==0.1.0 25 | google-auth-oauthlib==1.0.0 26 | streamlit 27 | streamlit-pandas-profiling 28 | streamlit-elements 29 | -------------------------------------------------------------------------------- /datalake_structure.txt: -------------------------------------------------------------------------------- 1 | lakehouse 2 | ├── bronze 3 | │   └── goodreads 4 | │   │   ├── book 5 | │   │   ├── genre 6 | │   │   ├── book_genre 7 | │   │   ├── book_download_link 8 | ├── silver 9 | │   └── goodreads 10 | │   │   ├── cleaned_book 11 | │   │   ├── collected_book 12 | │   │   ├── cleaned_genre 13 | │   │   ├── collected_genre 14 | │   │   ├── collcted_book_genre 15 | │   │   ├── isbn 16 | ├── gold 17 | │   └── goodreads 18 | │   │   ├── genre 19 | │   │   ├── book_genre 20 | │   │   ├── book_with_info 21 | │   │   ├── book_with_publish 22 | │   │   ├── book_with_rating 23 | ├── files 24 | │   └── loremipsum.epub 25 | │   └── ... 26 | ├── images 27 | │   └── loremipsum.jpeg 28 | │   └── ... 
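The lake layout above is what the MinIO IO manager produces: each Dagster asset key (layer, schema, table) is mapped to an object key inside the DATALAKE_BUCKET. A rough sketch of that mapping, mirroring the comments in elt_pipeline/elt_pipeline/resources/minio_io_manager.py (the asset key and partition below are illustrative only):

# context.asset_key.path, e.g. for the bronze book asset
layer, schema, table = ["bronze", "goodreads", "book"]
key = "/".join([layer, schema, table.replace(f"{layer}_", "")])   # -> "bronze/goodreads/book"
# Unpartitioned asset:          stored as  bronze/goodreads/book.parquet
# Partitioned by year (2021):   stored as  bronze/goodreads/book/book_2021.parquet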
29 | -------------------------------------------------------------------------------- /dockerimages/spark/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.jars jars/delta-core_2.12-2.2.0.jar,jars/hadoop-aws-3.3.2.jar,jars/delta-storage-2.2.0.jar,jars/aws-java-sdk-1.12.367.jar,jars/s3-2.18.41.jar,jars/aws-java-sdk-bundle-1.11.1026.jar 2 | spark.sql.extensions io.delta.sql.DeltaSparkSessionExtension 3 | spark.sql.catalog.spark_catalog org.apache.spark.sql.delta.catalog.DeltaCatalog 4 | spark.hadoop.fs.s3a.endpoint http://minio:9000 5 | spark.hadoop.fs.s3a.access.key minio 6 | spark.hadoop.fs.s3a.secret.key minio123 7 | spark.hadoop.fs.s3a.path.style.access true 8 | spark.hadoop.fs.s3a.connection.ssl.enabled false 9 | spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem 10 | spark.driver.memory 4g 11 | spark.executor.memory 4g 12 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/assets/__init__.py: -------------------------------------------------------------------------------- 1 | from dagster import load_assets_from_modules, file_relative_path 2 | from dagster_dbt import load_assets_from_dbt_project 3 | 4 | from . import bronze, silver, gold, warehouse 5 | 6 | 7 | bronze_layer_assets = load_assets_from_modules([bronze]) 8 | silver_layer_assets = load_assets_from_modules([silver]) 9 | gold_layer_assets = load_assets_from_modules([gold]) 10 | warehouse_assets = load_assets_from_modules([warehouse]) 11 | 12 | DBT_PROJECT_PATH = file_relative_path(__file__, "../../dbt_transform") 13 | DBT_PROFILES = file_relative_path(__file__, "../../dbt_transform/config") 14 | 15 | dbt_assets = load_assets_from_dbt_project( 16 | project_dir=DBT_PROJECT_PATH, 17 | profiles_dir=DBT_PROFILES, 18 | key_prefix=["dbt"], 19 | ) 20 | -------------------------------------------------------------------------------- /dockerimages/spark/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/bitnami/spark:3.3.2 2 | 3 | USER root 4 | 5 | # Install prerequisites 6 | RUN apt-get update && apt-get install -y curl 7 | 8 | RUN curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \ 9 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \ 10 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \ 11 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \ 12 | && mv s3-2.18.41.jar /opt/bitnami/spark/jars \ 13 | && mv aws-java-sdk-1.12.367.jar /opt/bitnami/spark/jars \ 14 | && mv delta-core_2.12-2.2.0.jar /opt/bitnami/spark/jars \ 15 | && mv delta-storage-2.2.0.jar /opt/bitnami/spark/jars 16 | -------------------------------------------------------------------------------- /load_dataset/mysql_load.sql: -------------------------------------------------------------------------------- 1 | LOAD DATA LOCAL INFILE '/tmp/dataset/book_full.csv' 2 | INTO TABLE goodreads.book 3 | FIELDS TERMINATED BY ',' 4 | ENCLOSED BY '"' 5 | LINES TERMINATED BY '\n' 6 | IGNORE 1 ROWS; 7 | 8 | -- LOAD DATA LOCAL INFILE '/tmp/dataset/my_book.csv' 9 | -- INTO TABLE goodreads.book 10 | -- FIELDS TERMINATED BY ',' 11 | -- ENCLOSED BY '"' 12 | -- LINES TERMINATED BY '\n' 13 | -- IGNORE 1 ROWS; 14 | -- 15 | -- LOAD DATA LOCAL INFILE '/tmp/dataset/genre.csv' 16 | -- INTO TABLE goodreads.genre 17 | -- 
FIELDS TERMINATED BY ',' 18 | -- ENCLOSED BY '"' 19 | -- LINES TERMINATED BY '\n' 20 | -- IGNORE 1 ROWS; 21 | -- 22 | -- LOAD DATA LOCAL INFILE '/tmp/dataset/book_genre.csv' 23 | -- INTO TABLE goodreads.book_genre 24 | -- FIELDS TERMINATED BY ',' 25 | -- ENCLOSED BY '"' 26 | -- LINES TERMINATED BY '\n' 27 | -- IGNORE 1 ROWS; 28 | -- 29 | -- LOAD DATA LOCAL INFILE '/tmp/dataset/book_download_link.csv' 30 | -- INTO TABLE goodreads.book_download_link 31 | -- FIELDS TERMINATED BY ',' 32 | -- ENCLOSED BY '"' 33 | -- LINES TERMINATED BY '\n' 34 | -- IGNORE 1 ROWS; 35 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | pandas = "==1.5.3" 8 | polars = "==0.16.16" 9 | pyspark = "==3.3.2" 10 | dagster = "==1.2.3" 11 | dagit = "==1.2.3" 12 | dagster-postgres = "==0.18.3" 13 | dagster-dbt = "==0.18.3" 14 | dagster-spark = "==0.18.3" 15 | sqlalchemy = "==1.4.46" 16 | pymysql = "==1.0.2" 17 | cryptography = "==38.0.3" 18 | pyarrow = "==11.0.0" 19 | fsspec = "==2023.3.0" 20 | s3fs = "==0.4.2" 21 | minio = "==7.1.13" 22 | dbt-core = "==1.4.5" 23 | dbt-postgres = "==1.4.5" 24 | pytest = "==7.2.2" 25 | pylint = "==2.17.1" 26 | pytest-cov = "==4.0.0" 27 | autopep8 = "==2.0.2" 28 | streamlit = "*" 29 | streamlit-pandas-profiling = "*" 30 | streamlit-elements = "*" 31 | ipywidgets = "*" 32 | connectorx = "==0.3.1" 33 | google-api-python-client = "==2.84.0" 34 | google-auth-httplib2 = "==0.1.0" 35 | google-auth-oauthlib = "==1.0.0" 36 | ydata-profiling = "==4.1.2" 37 | 38 | [dev-packages] 39 | ipykernel = "*" 40 | 41 | [requires] 42 | python_version = "3.10" 43 | python_full_version = "3.10.6" 44 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/resources/mysql_io_manager.py: -------------------------------------------------------------------------------- 1 | from dagster import IOManager, InputContext, OutputContext 2 | from contextlib import contextmanager 3 | from sqlalchemy import create_engine 4 | import polars as pl 5 | 6 | 7 | def connect_mysql(config) -> str: 8 | conn_info = ( 9 | f"mysql://{config['user']}:{config['password']}" 10 | + f"@{config['host']}:{config['port']}" 11 | + f"/{config['database']}" 12 | ) 13 | return conn_info 14 | 15 | 16 | class MySQLIOManager(IOManager): 17 | def __init__(self, config): 18 | self._config = config 19 | 20 | def handle_output(self, context: "OutputContext", obj: pl.DataFrame): 21 | pass 22 | 23 | def load_input(self, context: "InputContext"): 24 | pass 25 | 26 | def extract_data(self, sql: str) -> pl.DataFrame: 27 | """ 28 | Extract data from MySQL database as polars DataFrame 29 | """ 30 | conn_info = connect_mysql(self._config) 31 | df_data = pl.read_database(query=sql, connection_uri=conn_info) 32 | return df_data 33 | -------------------------------------------------------------------------------- /dagster_home/dagster.yaml: -------------------------------------------------------------------------------- 1 | storage: 2 | postgres: 3 | postgres_db: 4 | username: 5 | env: DAGSTER_PG_USERNAME 6 | password: 7 | env: DAGSTER_PG_PASSWORD 8 | hostname: 9 | env: DAGSTER_PG_HOSTNAME 10 | db_name: 11 | env: DAGSTER_PG_DB 12 | port: 5432 13 | run_launcher: 14 | module: dagster.core.launcher 15 | class: DefaultRunLauncher 16 | run_coordinator: 17 | module: 
dagster.core.run_coordinator 18 | class: QueuedRunCoordinator 19 | config: 20 | max_concurrent_runs: 21 | env: DAGSTER_OVERALL_CONCURRENCY_LIMIT 22 | compute_logs: 23 | module: dagster.core.storage.local_compute_log_manager 24 | class: LocalComputeLogManager 25 | config: 26 | base_dir: /opt/dagster/dagster_home/compute_logs 27 | local_artifact_storage: 28 | module: dagster.core.storage.root 29 | class: LocalArtifactStorage 30 | config: 31 | base_dir: /opt/dagster/dagster_home/local_artifact_storage 32 | telemetry: 33 | enabled: true 34 | sensors: 35 | use_threads: true 36 | num_workers: 3 37 | schedules: 38 | use_threads: true 39 | num_workers: 3 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 lelouvincx 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/models/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | models: 3 | - name: search_prior 4 | description: Search index table for books with higher priority 5 | columns: 6 | - name: isbn 7 | description: Primary key 8 | tests: 9 | - unique 10 | - not_null 11 | - name: name 12 | description: Long varchar, can be null, duplicated, ... 13 | - name: search 14 | description: Search index table for books 15 | columns: 16 | - name: isbn 17 | description: Primary key 18 | tests: 19 | - unique 20 | - not_null 21 | - name: name 22 | description: Long varchar, can be null, duplicated, ... 23 | - name: criteria 24 | description: More criteria to filter books 25 | columns: 26 | - name: isbn 27 | description: Primary key 28 | tests: 29 | - unique 30 | - not_null 31 | - name: downloadlink 32 | description: Long varchar, can be null, must be unique, ... 
33 | tests: 34 | - unique 35 | - name: avgrating 36 | description: integer, from 0 -> 5 37 | tests: 38 | - not_null 39 | -------------------------------------------------------------------------------- /load_dataset/psql_datasource.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE IF NOT EXISTS goodreads; 2 | \c goodreads 3 | 4 | CREATE SCHEMA IF NOT EXISTS gold; 5 | CREATE SCHEMA IF NOT EXISTS analytics; 6 | CREATE SCHEMA IF NOT EXISTS recommendations; 7 | 8 | CREATE TABLE IF NOT EXISTS recommendations.book_download_link( 9 | ISBN VARCHAR(31) PRIMARY KEY, 10 | Link VARCHAR(255) 11 | ); 12 | 13 | CREATE TABLE IF NOT EXISTS gold.genre ( 14 | Id SERIAL PRIMARY KEY, 15 | Name VARCHAR(63) UNIQUE 16 | ); 17 | 18 | CREATE TABLE IF NOT EXISTS gold.book_genre ( 19 | BookISBN VARCHAR(31) NOT NULL, 20 | GenreId INT NOT NULL, 21 | PRIMARY KEY (BookISBN, GenreId) 22 | ); 23 | 24 | CREATE TABLE IF NOT EXISTS gold.book_with_info ( 25 | ISBN VARCHAR(31) PRIMARY KEY, 26 | Name VARCHAR(31), 27 | Authors VARCHAR(31), 28 | Language VARCHAR(7), 29 | Description TEXT, 30 | PagesNumber INT 31 | ); 32 | 33 | CREATE TABLE IF NOT EXISTS gold.book_with_publish ( 34 | ISBN VARCHAR(31) PRIMARY KEY, 35 | Publisher VARCHAR(31), 36 | PublishYear VARCHAR(31), 37 | PublishMonth INT, 38 | PublishDay INT 39 | ); 40 | 41 | CREATE TABLE IF NOT EXISTS gold.book_with_rating ( 42 | ISBN VARCHAR(31) PRIMARY KEY, 43 | Rating FLOAT, 44 | RatingDist5 INT, 45 | RatingDist4 INT, 46 | RatingDist3 INT, 47 | RatingDist2 INT, 48 | RatingDist1 INT, 49 | RatingDistTotal INT, 50 | CountOfTextReviews INT 51 | ); 52 | -------------------------------------------------------------------------------- /elt_pipeline/dbt_transform/dbt_project.yml: -------------------------------------------------------------------------------- 1 | # Name your project! Project names should contain only lowercase characters 2 | # and underscores. A good package name should reflect your organization's 3 | # name or the intended use of these models 4 | name: "dbt_transform" 5 | version: "1.0.0" 6 | config-version: 2 7 | # This setting configures which "profile" dbt uses for this project. 8 | profile: "dbt_transform" 9 | # These configurations specify where dbt should look for different types of files. 10 | # The `model-paths` config, for example, states that models in this project can be 11 | # found in the "models/" directory. You probably won't need to change these! 12 | model-paths: ["models"] 13 | analysis-paths: ["analyses"] 14 | test-paths: ["tests"] 15 | seed-paths: ["seeds"] 16 | macro-paths: ["macros"] 17 | snapshot-paths: ["snapshots"] 18 | target-path: "target" # directory which will store compiled SQL files 19 | clean-targets: # directories to be removed by `dbt clean` 20 | - "target" 21 | - "dbt_packages" 22 | # Configuring models 23 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 24 | 25 | # In this example config, we tell dbt to build all models in the example/ 26 | # directory as views. These settings can be overridden in the individual model 27 | # files using the `{{ config(...) }}` macro. 
28 | models: 29 | dbt_transform: 30 | # Config indicated by + and applies to all files under models/example/ 31 | recommendations: 32 | +materialized: table 33 | -------------------------------------------------------------------------------- /load_dataset/mysql_datasource.sql: -------------------------------------------------------------------------------- 1 | -- Full load reference database 2 | DROP DATABASE IF EXISTS goodreads; 3 | CREATE DATABASE goodreads; 4 | USE goodreads; 5 | 6 | -- Load books 7 | DROP TABLE IF EXISTS goodreads.book; 8 | CREATE TABLE goodreads.book ( 9 | Id INT NOT NULL, 10 | Name VARCHAR(31), 11 | Authors VARCHAR(31), 12 | ISBN VARCHAR(31), 13 | Rating FLOAT, 14 | PublishYear VARCHAR(31), 15 | PublishMonth INT, 16 | PublishDay INT, 17 | Publisher VARCHAR(31), 18 | RatingDist5 VARCHAR(31), 19 | RatingDist4 VARCHAR(31), 20 | RatingDist3 VARCHAR(31), 21 | RatingDist2 VARCHAR(31), 22 | RatingDist1 VARCHAR(31), 23 | RatingDistTotal VARCHAR(31), 24 | CountsOfReview INT, 25 | Language VARCHAR(7), 26 | Description TEXT, 27 | `Count of text reviews` INT, 28 | PagesNumber INT, 29 | PRIMARY KEY (Id) 30 | ); 31 | 32 | -- Load genre 33 | DROP TABLE IF EXISTS goodreads.genre; 34 | CREATE TABLE goodreads.genre ( 35 | Id INT NOT NULL AUTO_INCREMENT, 36 | Name VARCHAR(255) UNIQUE, 37 | PRIMARY KEY (Id) 38 | ); 39 | 40 | -- Load book_genre 41 | DROP TABLE IF EXISTS goodreads.book_genre; 42 | CREATE TABLE goodreads.book_genre ( 43 | BookISBN VARCHAR(31) NOT NULL, 44 | GenreId INT NOT NULL, 45 | PRIMARY KEY (BookISBN, GenreId) 46 | ); 47 | 48 | -- Load book_download_link 49 | DROP TABLE IF EXISTS goodreads.book_download_link; 50 | CREATE TABLE goodreads.book_download_link ( 51 | BookISBN VARCHAR(31) NOT NULL UNIQUE, 52 | Link VARCHAR(255) NOT NULL, 53 | PRIMARY KEY (BookISBN, Link) 54 | ); 55 | -------------------------------------------------------------------------------- /env.template: -------------------------------------------------------------------------------- 1 | # MySQL 2 | MYSQL_HOST=de_mysql 3 | MYSQL_PORT=3306 4 | MYSQL_DATABASE=goodreads 5 | MYSQL_USER= 6 | MYSQL_PASSWORD= 7 | MYSQL_ROOT_PASSWORD= 8 | 9 | # PostgreSQL 10 | POSTGRES_HOST=de_psql 11 | POSTGRES_PORT=5432 12 | POSTGRES_USER= 13 | POSTGRES_PASSWORD= 14 | POSTGRES_DB=goodreads 15 | POSTGRES_HOST_AUTH_METHOD=trust 16 | 17 | # Google Drive 18 | GDRIVE_CLIENT_SECRET_FILE=client_secret.json 19 | GDRIVE_PICKLE_FILE=token_drive_v3.pickle 20 | GDRIVE_API_NAME=drive 21 | GDRIVE_API_VERSION=v3 22 | GDRIVE_SCOPES=https://www.googleapis.com/auth/drive.readonly 23 | 24 | # Dagster 25 | DAGSTER_PG_HOSTNAME=de_psql 26 | DAGSTER_PG_USERNAME= 27 | DAGSTER_PG_PASSWORD= 28 | DAGSTER_PG_DB= 29 | DAGSTER_OVERALL_CONCURRENCY_LIMIT=1 30 | DAGSTER_HOME=/opt/dagster/dagster_home 31 | 32 | # dbt 33 | DBT_HOST=de_psql 34 | DBT_USER= 35 | DBT_PASSWORD= 36 | DBT_DATABASE=goodreads 37 | DBT_SCHEMA=recommendations 38 | 39 | # MinIO 40 | MINIO_ENDPOINT=minio:9000 41 | MINIO_ROOT_USER= 42 | MINIO_ROOT_PASSWORD= 43 | MINIO_ACCESS_KEY= 44 | MINIO_SECRET_KEY= 45 | DATALAKE_BUCKET=lakehouse 46 | AWS_ACCESS_KEY_ID= 47 | AWS_SECRET_ACCESS_KEY= 48 | AWS_REGION= 49 | 50 | # MinIO client (mc) 51 | AWS_ACCESS_KEY_ID= 52 | AWS_SECRET_ACCESS_KEY= 53 | AWS_REGION= 54 | 55 | # Spark 56 | SPARK_MASTER_URL=spark://spark-master:7077 57 | SPARK_VERSION=3.3.2 58 | HADOOP_VERSION=3 59 | 60 | # Metabase 61 | MB_DB_TYPE=postgres 62 | MB_DB_DBNAME=goodreads 63 | MB_DB_PORT=5432 64 | MB_DB_USER= 65 | MB_DB_PASS= 66 | MB_DB_HOST=de_psql 67 | 
MB_DB_FILE=/metabase_data/metabase.db 68 | -------------------------------------------------------------------------------- /elt_pipeline/README.md: -------------------------------------------------------------------------------- 1 | # elt_pipeline 2 | 3 | This is a [Dagster](https://dagster.io/) project scaffolded with [`dagster project scaffold`](https://docs.dagster.io/getting-started/create-new-project). 4 | 5 | ## Getting started 6 | 7 | First, install your Dagster code location as a Python package. By using the --editable flag, pip will install your Python package in ["editable mode"](https://pip.pypa.io/en/latest/topics/local-project-installs/#editable-installs) so that as you develop, local code changes will automatically apply. 8 | 9 | ```bash 10 | pip install -e ".[dev]" 11 | ``` 12 | 13 | Then, start the Dagster UI web server: 14 | 15 | ```bash 16 | dagster dev 17 | ``` 18 | 19 | Open http://localhost:3000 with your browser to see the project. 20 | 21 | You can start writing assets in the `elt_pipeline/assets/` package (`bronze.py`, `silver.py`, `gold.py`, `warehouse.py`). The assets are automatically loaded into the Dagster code location as you define them. 22 | 23 | ## Development 24 | 25 | 26 | ### Adding new Python dependencies 27 | 28 | You can specify new Python dependencies in `setup.py`. 29 | 30 | ### Unit testing 31 | 32 | Tests are in the `elt_pipeline_tests` directory and you can run tests using `pytest`: 33 | 34 | ```bash 35 | pytest elt_pipeline_tests 36 | ``` 37 | 38 | ### Schedules and sensors 39 | 40 | If you want to enable Dagster [Schedules](https://docs.dagster.io/concepts/partitions-schedules-sensors/schedules) or [Sensors](https://docs.dagster.io/concepts/partitions-schedules-sensors/sensors) for your jobs, the [Dagster Daemon](https://docs.dagster.io/deployment/dagster-daemon) process must be running. This is done automatically when you run `dagster dev`. 41 | 42 | Once your Dagster Daemon is running, you can start turning on schedules and sensors for your jobs. 43 | 44 | ## Deploy on Dagster Cloud 45 | 46 | The easiest way to deploy your Dagster project is to use Dagster Cloud. 47 | 48 | Check out the [Dagster Cloud Documentation](https://docs.dagster.cloud) to learn more.
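The "Schedules and sensors" section above only explains when the daemon is needed. As a hedged sketch (the job name, asset selection, and cron string below are hypothetical and not defined anywhere in this repository), a daily schedule over all assets could be registered roughly like this and handed to the Definitions object in elt_pipeline/__init__.py:

from dagster import AssetSelection, ScheduleDefinition, define_asset_job

# Hypothetical job covering every asset in the code location
daily_refresh_job = define_asset_job("daily_refresh_job", selection=AssetSelection.all())

# Trigger the job every day at 02:00; requires the dagster-daemon container to be running
daily_refresh_schedule = ScheduleDefinition(job=daily_refresh_job, cron_schedule="0 2 * * *")

# Then pass both to the existing Definitions(...) call:
# Definitions(assets=..., resources=..., jobs=[daily_refresh_job], schedules=[daily_refresh_schedule])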
49 | -------------------------------------------------------------------------------- /tree.txt: -------------------------------------------------------------------------------- 1 | ├── app 2 | │   ├── .streamlit 3 | │   │   └── secrets.toml 4 | │   └── streamlit_app.py 5 | ├── dagster_home 6 | │   ├── dagster.yaml 7 | │   └── workspace.yaml 8 | ├── dataset 9 | │   ├── book_download_link.csv 10 | │   ├── book_full.csv 11 | │   ├── book_genre.csv 12 | │   ├── genre.csv 13 | │   └── my_book.csv 14 | ├── docker-compose.yml 15 | ├── dockerimages 16 | │   ├── dagster 17 | │   │   ├── Dockerfile 18 | │   │   └── requirements.txt 19 | │   ├── spark 20 | │   │   ├── Dockerfile 21 | │   │   └── spark-defaults.conf 22 | │   └── streamlit 23 | │   ├── Dockerfile 24 | │   └── requirements.txt 25 | ├── EDA.ipynb 26 | ├── elt_pipeline 27 | │   ├── dbt_transform 28 | │   │   ├── config 29 | │   │   │   ├── profiles.yml 30 | │   │   ├── dbt_packages 31 | │   │   ├── dbt_project.yml 32 | │   │   ├── models 33 | │   │   │   ├── recommendations 34 | │   │   │   │   ├── criteria.sql 35 | │   │   │   │   ├── search_prior.sql 36 | │   │   │   │   └── search.sql 37 | │   │   │   ├── schema.yml 38 | │   │   │   └── sources.yml 39 | │   │   │   ├── manifest.json 40 | │   ├── Dockerfile 41 | │   ├── elt_pipeline 42 | │   │   ├── assets 43 | │   │   │   ├── bronze.py 44 | │   │   │   ├── gold.py 45 | │   │   │   ├── silver.py 46 | │   │   │   └── warehouse.py 47 | │   │   ├── client_secret.json 48 | │   │   ├── resources 49 | │   │   │   ├── gdrive_io_manager.py 50 | │   │   │   ├── minio_io_manager.py 51 | │   │   │   ├── mysql_io_manager.py 52 | │   │   │   ├── psql_io_manager.py 53 | │   │   │   └── spark_io_manager.py 54 | │   │   └── token_drive_v3.pickle 55 | │   ├── pyproject.toml 56 | │   ├── requirements.txt 57 | │   ├── setup.cfg 58 | │   └── setup.py 59 | ├── .env 60 | ├── env.template 61 | ├── .git 62 | ├── .gitignore 63 | ├── load_dataset 64 | │   ├── mysql_datasource.sql 65 | │   ├── mysql_load.sql 66 | │   └── psql_datasource.sql 67 | ├── Makefile 68 | ├── pg_hba.conf 69 | ├── Pipfile 70 | ├── Pipfile.lock 71 | ├── README.md 72 | ├── requirements.txt 73 | ├── .spark_master.env 74 | ├── spark_master.env.template 75 | ├── .spark_session.txt 76 | ├── spark_workder.env.template 77 | ├── .spark_worker.env 78 | -------------------------------------------------------------------------------- /elt_pipeline/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.16-slim 2 | 3 | # Install spark and java 4 | ARG openjdk_version="17" 5 | 6 | RUN apt-get update --yes && \ 7 | apt-get install --yes curl "openjdk-${openjdk_version}-jre-headless" ca-certificates-java procps && \ 8 | apt-get clean && rm -rf /var/lib/apt/lists/* 9 | 10 | # Download spark neccessary jars 11 | RUN curl -O https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz \ 12 | && tar zxvf spark-3.3.2-bin-hadoop3.tgz \ 13 | && rm -rf spark-3.3.2-bin-hadoop3.tgz \ 14 | && mv spark-3.3.2-bin-hadoop3/ /usr/local/ \ 15 | && rm -rf /usr/local/spark \ 16 | && rm -rf /usr/local/spark-3.3.0-bin-hadoop3 \ 17 | && ln -s /usr/local/spark-3.3.2-bin-hadoop3 /usr/local/spark 18 | 19 | RUN curl -O https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \ 20 | && curl -O https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \ 21 | && curl -O 
https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar \ 22 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \ 23 | && curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \ 24 | && curl -O https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.2/hadoop-aws-3.3.2.jar \ 25 | && mv s3-2.18.41.jar /usr/local/spark/jars \ 26 | && mv aws-java-sdk-1.12.367.jar /usr/local/spark/jars \ 27 | && mv aws-java-sdk-bundle-1.11.1026.jar /usr/local/spark/jars \ 28 | && mv delta-core_2.12-2.2.0.jar /usr/local/spark/jars \ 29 | && mv delta-storage-2.2.0.jar /usr/local/spark/jars \ 30 | && mv hadoop-aws-3.3.2.jar /usr/local/spark/jars 31 | 32 | # Add repository code 33 | WORKDIR /opt/dagster/app/elt_pipeline 34 | COPY requirements.txt /opt/dagster/app/elt_pipeline 35 | RUN pip install --upgrade pip && pip install -r requirements.txt 36 | COPY . /opt/dagster/app/elt_pipeline 37 | 38 | # CMD allows this to be overridden from run launchers or executors that want to run other commands against your repository 39 | CMD ["dagster", "api", "grpc", "-h", "0.0.0.0", "-p", "4000", "-m", "elt_pipeline"] 40 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | include .env 2 | 3 | install: 4 | python3 -V \ 5 | && python3 -m venv venv \ 6 | && . venv/bin/activate \ 7 | && pip install --upgrade pip && pip install -r requirements.txt 8 | 9 | check: 10 | black ./elt_pipeline --check 11 | 12 | lint: 13 | flake8 ./elt_pipeline 14 | 15 | test: 16 | docker exec elt_pipeline python -m pytest -vv --cov=utils tests/utils \ 17 | && docker exec elt_pipeline python -m pytest -vv --cov=ops tests/ops 18 | 19 | pull: 20 | docker compose pull 21 | 22 | build: 23 | docker compose build 24 | 25 | build-dagster: 26 | docker build -t de_dagster:latest ./dockerimages/dagster 27 | 28 | build-spark: 29 | docker build -t spark_master:latest ./dockerimages/spark 30 | 31 | build-pipeline: 32 | docker build -t elt_pipeline:latest ./elt_pipeline 33 | 34 | build-streamlit: 35 | docker build -t de_streamlit:latest ./dockerimages/streamlit 36 | 37 | up-bg: 38 | docker compose --env-file .env up -d 39 | 40 | up: 41 | docker compose --env-file .env up 42 | 43 | down: 44 | docker compose --env-file .env down 45 | 46 | restart-bg: 47 | docker compose --env-file .env down && docker compose --env-file .env up -d 48 | 49 | restart: 50 | docker compose --env-file .env down && docker compose --env-file .env up 51 | 52 | to_mysql: 53 | docker exec -it de_mysql mysql -u"${MYSQL_USER}" -p"${MYSQL_PASSWORD}" ${MYSQL_DATABASE} 54 | 55 | to_mysql_root: 56 | docker exec -it de_mysql mysql -u"root" -p"${MYSQL_ROOT_PASSWORD}" ${MYSQL_DATABASE} 57 | 58 | mysql_create: 59 | docker exec -it de_mysql mysql --local_infile -u"${MYSQL_USER}" -p"${MYSQL_PASSWORD}" ${MYSQL_DATABASE} -e"source /tmp/load_dataset/mysql_datasource.sql" 60 | 61 | mysql_load: 62 | docker exec -it de_mysql mysql --local_infile -u"${MYSQL_USER}" -p"${MYSQL_PASSWORD}" ${MYSQL_DATABASE} -e"source /tmp/load_dataset/mysql_load.sql" 63 | 64 | to_psql: 65 | docker exec -it de_psql psql postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB} 66 | 67 | to_psql_no_db: 68 | docker exec -it de_psql psql postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/postgres 69 | 
70 | psql_create: 71 | docker exec -it de_psql psql postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB} -f /tmp/load_dataset/psql_datasource.sql -a 72 | -------------------------------------------------------------------------------- /elt_pipeline/logs/dbt.log: -------------------------------------------------------------------------------- 1 | 2 | 3 | ============================== 2023-04-15 04:45:59.961967 | 6fcecef0-8da6-4cbb-8428-1f02f0660fa9 ============================== 4 | 04:45:59.961967 [info ] [MainThread]: Running with dbt=1.4.5 5 | 04:45:59.963549 [debug] [MainThread]: running dbt with arguments {'write_json': True, 'use_colors': True, 'printer_width': 80, 'version_check': True, 'partial_parse': True, 'static_parser': True, 'profiles_dir': '/home/lelouvincx/Documents/FDE02/project/elt_pipeline', 'send_anonymous_usage_stats': True, 'quiet': False, 'no_print': False, 'cache_selected_only': False, 'skip_profile_setup': False, 'which': 'init', 'indirect_selection': 'eager'} 6 | 04:45:59.963829 [debug] [MainThread]: Tracking: tracking 7 | 04:45:59.965225 [debug] [MainThread]: Sending event: {'category': 'dbt', 'action': 'invocation', 'label': 'start', 'context': [, , ]} 8 | 04:46:07.900335 [debug] [MainThread]: Starter project path: /home/lelouvincx/.local/share/virtualenvs/project-zhl6RxJh/lib/python3.10/site-packages/dbt/include/starter_project 9 | 04:46:13.514941 [info ] [MainThread]: Profile dbt_transform written to /home/lelouvincx/Documents/FDE02/project/elt_pipeline/profiles.yml using target's sample configuration. Once updated, you'll be able to start developing with dbt. 10 | 04:46:13.515348 [info ] [MainThread]: 11 | Your new dbt project "dbt_transform" was created! 12 | 13 | For more information on how to configure the profiles.yml file, 14 | please consult the dbt documentation here: 15 | 16 | https://docs.getdbt.com/docs/configure-your-profile 17 | 18 | One more thing: 19 | 20 | Need help? Don't hesitate to reach out to us via GitHub issues or on Slack: 21 | 22 | https://community.getdbt.com/ 23 | 24 | Happy modeling! 25 | 26 | 04:46:13.515755 [debug] [MainThread]: Sending event: {'category': 'dbt', 'action': 'invocation', 'label': 'end', 'context': [, , ]} 27 | 04:46:13.516131 [debug] [MainThread]: Flushing usage events 28 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from dagster import Definitions, load_assets_from_modules, file_relative_path 2 | from dagster_dbt import dbt_cli_resource 3 | import os 4 | 5 | from . 
import assets 6 | from .resources.mysql_io_manager import MySQLIOManager 7 | from .resources.minio_io_manager import MinIOIOManager 8 | from .resources.gdrive_io_manager import GDriveIOManager 9 | from .resources.spark_io_manager import SparkIOManager 10 | from .resources.psql_io_manager import PostgreSQLIOManager 11 | 12 | 13 | MYSQL_CONFIG = { 14 | "host": os.getenv("MYSQL_HOST"), 15 | "port": os.getenv("MYSQL_PORT"), 16 | "database": os.getenv("MYSQL_DATABASE"), 17 | "user": os.getenv("MYSQL_USER"), 18 | "password": os.getenv("MYSQL_PASSWORD"), 19 | } 20 | 21 | MINIO_CONFIG = { 22 | "bucket": os.getenv("DATALAKE_BUCKET"), 23 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 24 | "minio_access_key": os.getenv("MINIO_ACCESS_KEY"), 25 | "minio_secret_key": os.getenv("MINIO_SECRET_KEY"), 26 | } 27 | 28 | GDRIVE_CONFIG = { 29 | "client_secret_file": os.path.join( 30 | os.getcwd(), str(os.getenv("GDRIVE_CLIENT_SECRET_FILE")) 31 | ), 32 | "pickle_file": os.path.join( 33 | os.getcwd(), "elt_pipeline", str(os.getenv("GDRIVE_PICKLE_FILE")) 34 | ), 35 | "api_name": os.getenv("GDRIVE_API_NAME"), 36 | "api_version": os.getenv("GDRIVE_API_VERSION"), 37 | "scopes": os.getenv("GDRIVE_SCOPES"), 38 | "bucket": os.getenv("DATALAKE_BUCKET"), 39 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 40 | "minio_access_key": os.getenv("MINIO_ACCESS_KEY"), 41 | "minio_secret_key": os.getenv("MINIO_SECRET_KEY"), 42 | } 43 | 44 | SPARK_CONFIG = { 45 | "spark_master": os.getenv("SPARK_MASTER_URL"), 46 | "spark_version": os.getenv("SPARK_VERSION"), 47 | "hadoop_version": os.getenv("HADOOP_VERSION"), 48 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 49 | "minio_access_key": os.getenv("MINIO_ACCESS_KEY"), 50 | "minio_secret_key": os.getenv("MINIO_SECRET_KEY"), 51 | } 52 | 53 | PSQL_CONFIG = { 54 | "host": os.getenv("POSTGRES_HOST"), 55 | "port": os.getenv("POSTGRES_PORT"), 56 | "database": os.getenv("POSTGRES_DB"), 57 | "user": os.getenv("POSTGRES_USER"), 58 | "password": os.getenv("POSTGRES_PASSWORD"), 59 | } 60 | 61 | DBT_PROJECT_PATH = file_relative_path(__file__, "../dbt_transform") 62 | DBT_PROFILES = file_relative_path(__file__, "../dbt_transform/config") 63 | 64 | 65 | resources = { 66 | "mysql_io_manager": MySQLIOManager(MYSQL_CONFIG), 67 | "minio_io_manager": MinIOIOManager(MINIO_CONFIG), 68 | "gdrive_io_manager": GDriveIOManager(GDRIVE_CONFIG), 69 | "spark_io_manager": SparkIOManager(SPARK_CONFIG), 70 | "psql_io_manager": PostgreSQLIOManager(PSQL_CONFIG), 71 | "dbt": dbt_cli_resource.configured( 72 | { 73 | "project_dir": DBT_PROJECT_PATH, 74 | "profiles_dir": DBT_PROFILES, 75 | } 76 | ), 77 | } 78 | 79 | defs = Definitions( 80 | assets=load_assets_from_modules([assets]), 81 | resources=resources, 82 | ) 83 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | services: 3 | de_mysql: 4 | image: mysql:8.0 5 | container_name: de_mysql 6 | volumes: 7 | - ./storage/mysql_data:/var/lib/mysql 8 | - ./dataset:/tmp/dataset 9 | - ./load_dataset:/tmp/load_dataset 10 | ports: 11 | - "3306:3306" 12 | env_file: .env 13 | networks: 14 | - de_network 15 | # minio: 16 | # hostname: minio 17 | # image: quay.io/minio/minio:latest 18 | # container_name: minio 19 | # command: ["server", "/data", "--console-address", ":9001"] 20 | # volumes: 21 | # - minio:/data 22 | # ports: 23 | # - "9001:9001" 24 | # - "9000:9000" 25 | # env_file: .env 26 | # networks: 27 | # - de_network 28 | # mc: 
29 | # image: minio/mc 30 | # container_name: mc 31 | # hostname: mc 32 | # env_file: .env 33 | # entrypoint: /bin/sh -c " until (/usr/bin/mc config host add minio http://minio:9000 34 | # minio minio123) do echo '...waiting...' && sleep 1; done; /usr/bin/mc mb minio/lakehouse; 35 | # /usr/bin/mc policy set public minio/lakehouse; exit 0; " 36 | # depends_on: 37 | # - minio 38 | # networks: 39 | # - de_network 40 | # elt_pipeline: 41 | # build: 42 | # context: ./elt_pipeline 43 | # dockerfile: Dockerfile 44 | # image: elt_pipeline:latest 45 | # container_name: elt_pipeline 46 | # volumes: 47 | # - ./elt_pipeline:/opt/dagster/app/elt_pipeline 48 | # env_file: .env 49 | # ports: 50 | # - "4040:4040" 51 | # networks: 52 | # - de_network 53 | # de_dagster_dagit: 54 | # image: de_dagster:latest 55 | # entrypoint: 56 | # - dagit 57 | # - -h 58 | # - "0.0.0.0" 59 | # - -p 60 | # - "3001" 61 | # - -w 62 | # - workspace.yaml 63 | # container_name: de_dagster_dagit 64 | # volumes: 65 | # - /var/run/docker.sock:/var/run/docker.sock 66 | # - ./dagster_home:/opt/dagster/dagster_home 67 | # ports: 68 | # - "3001:3001" 69 | # env_file: .env 70 | # networks: 71 | # - de_network 72 | # de_dagster_daemon: 73 | # image: de_dagster:latest 74 | # entrypoint: 75 | # - dagster-daemon 76 | # - run 77 | # container_name: de_dagster_daemon 78 | # volumes: 79 | # - /var/run/docker.sock:/var/run/docker.sock 80 | # - ./dagster_home:/opt/dagster/dagster_home 81 | # env_file: .env 82 | # networks: 83 | # - de_network 84 | # spark-master: 85 | # build: 86 | # context: ./dockerimages/spark 87 | # dockerfile: Dockerfile 88 | # image: spark_master:latest 89 | # container_name: spark-master 90 | # volumes: 91 | # - ./dockerimages/spark/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf 92 | # expose: 93 | # - "7077" 94 | # ports: 95 | # - "7077:7077" 96 | # - "8080:8080" 97 | # env_file: .spark_master.env 98 | # networks: 99 | # - de_network 100 | # spark-worker: 101 | # image: docker.io/bitnami/spark:3.3.2 102 | # env_file: .spark_worker.env 103 | # deploy: 104 | # replicas: 2 105 | # networks: 106 | # - de_network 107 | # de_psql: 108 | # image: postgres:14-alpine 109 | # container_name: de_psql 110 | # volumes: 111 | # - ./storage/postgres_data:/var/lib/postgresql/data 112 | # - ./pg_hba.conf:/tmp/pg_hba.conf 113 | # - ./load_dataset:/tmp/load_dataset 114 | # command: ["postgres", "-c", "hba_file=/tmp/pg_hba.conf"] 115 | # ports: 116 | # - "5432:5432" 117 | # env_file: .env 118 | # networks: 119 | # - de_network 120 | # de_streamlit: 121 | # build: 122 | # context: ./dockerimages/streamlit 123 | # dockerfile: Dockerfile 124 | # image: de_streamlit:latest 125 | # container_name: de_streamlit 126 | # volumes: 127 | # - ./app:/app 128 | # env_file: .env 129 | # ports: 130 | # - "8501:8501" 131 | # networks: 132 | # - de_network 133 | # de_metabase: 134 | # image: metabase/metabase:latest 135 | # container_name: de_metabase 136 | # volumes: 137 | # - ./storage/metabase_data:/metabase_data 138 | # ports: 139 | # - "3030:3000" 140 | # env_file: .env 141 | # networks: 142 | # - de_network 143 | networks: 144 | de_network: 145 | driver: bridge 146 | name: de_network 147 | volumes: 148 | minio: {} 149 | storage: {} 150 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/resources/minio_io_manager.py: -------------------------------------------------------------------------------- 1 | from dagster import IOManager, OutputContext, InputContext 2 | from minio 
import Minio 3 | import polars as pl 4 | 5 | from contextlib import contextmanager 6 | from datetime import datetime 7 | from typing import Union 8 | import os 9 | 10 | 11 | @contextmanager 12 | def connect_minio(config): 13 | client = Minio( 14 | endpoint=config.get("endpoint_url"), 15 | access_key=config.get("minio_access_key"), 16 | secret_key=config.get("minio_secret_key"), 17 | secure=False, 18 | ) 19 | 20 | try: 21 | yield client 22 | except Exception as e: 23 | raise e 24 | 25 | 26 | # Make bucket if not exists 27 | def make_bucket(client: Minio, bucket_name): 28 | found = client.bucket_exists(bucket_name) 29 | if not found: 30 | client.make_bucket(bucket_name) 31 | else: 32 | print(f"Bucket {bucket_name} already exists.") 33 | 34 | 35 | class MinIOIOManager(IOManager): 36 | def __init__(self, config): 37 | self._config = config 38 | 39 | def _get_path(self, context: Union[InputContext, OutputContext]): 40 | """ 41 | Returns (key_name, tmp_file_path) where key_name is the path to the file in minIO 42 | and tmp_file_path is the path to the temp file in local disk, which will 43 | be uploaded to minIO and then deleted after the upload is done. 44 | """ 45 | # E.g context.asset_key.path: ['bronze', 'goodreads', 'book'] 46 | layer, schema, table = context.asset_key.path 47 | # NOTE: E.g: bronze/goodreads/book 48 | key = "/".join([layer, schema, table.replace(f"{layer}_", "")]) 49 | # E.g /tmp/file_bronze_goodreads_book_20210101000000.parquet 50 | tmp_file_path = "/tmp/file_{}_{}.parquet".format( 51 | "_".join(context.asset_key.path), datetime.today().strftime("%Y%m%d%H%M%S") 52 | ) # Partition by year 53 | 54 | if context.has_partition_key: 55 | # E.g partition_str: book_2021 56 | partition_str = str(table) + "_" + context.asset_partition_key 57 | # E.g key_name: bronze/goodreads/book/book_2021.parquet 58 | # tmp_file_path: /tmp/file_bronze_goodreads_book_20210101000000.parquet 59 | return os.path.join(key, f"{partition_str}.parquet"), tmp_file_path 60 | else: 61 | # E.g key_name: bronze/goodreads/book.parquet 62 | return f"{key}.parquet", tmp_file_path 63 | 64 | def handle_output(self, context: "OutputContext", obj: pl.DataFrame): 65 | """ 66 | Receives output from upstream asset, 67 | and converts to parquet format and upload to minIO. 
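Keys follow the layout <layer>/<schema>/<table>.parquet, or <layer>/<schema>/<table>/<table>_<partition>.parquet when the asset is partitioned (see _get_path above).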
68 | """ 69 | 70 | key_name, tmp_file_path = self._get_path(context) 71 | 72 | # Convert from polars DataFrame to parquet format 73 | obj.write_parquet(tmp_file_path) 74 | 75 | # Upload file to minIO 76 | try: 77 | bucket_name = self._config.get("bucket") 78 | with connect_minio(self._config) as client: 79 | # Make bucket if not exist 80 | make_bucket(client, bucket_name) 81 | 82 | # Upload file to minIO 83 | # E.g bucket_name: lakehouse, 84 | # key_name: bronze/goodreads/book/book_2021.parquet, 85 | # tmp_file_path: /tmp/file_bronze_goodreads_book_20210101000000.parquet 86 | client.fput_object(bucket_name, key_name, tmp_file_path) 87 | context.log.info( 88 | f"(MinIO handle_output) Number of rows and columns: {obj.shape}" 89 | ) 90 | context.add_output_metadata({"path": key_name, "tmp": tmp_file_path}) 91 | 92 | # Clean up tmp file 93 | os.remove(tmp_file_path) 94 | except Exception as e: 95 | raise e 96 | 97 | def load_input(self, context: "InputContext") -> pl.DataFrame: 98 | """ 99 | Prepares input for downstream asset, 100 | and downloads parquet file from minIO and converts to polars DataFrame 101 | """ 102 | 103 | bucket_name = self._config.get("bucket") 104 | key_name, tmp_file_path = self._get_path(context) 105 | 106 | try: 107 | with connect_minio(self._config) as client: 108 | # Make bucket if not exist 109 | make_bucket(client=client, bucket_name=bucket_name) 110 | 111 | # E.g bucket_name: lakehouse, 112 | # key_name: bronze/goodreads/book/book_2021.parquet, 113 | # tmp_file_path: /tmp/file_bronze_goodreads_book_20210101000000.parquet 114 | context.log.info(f"(MinIO load_input) from key_name: {key_name}") 115 | client.fget_object(bucket_name, key_name, tmp_file_path) 116 | df_data = pl.read_parquet(tmp_file_path) 117 | context.log.info( 118 | f"(MinIO load_input) Got polars dataframe with shape: {df_data.shape}" 119 | ) 120 | 121 | return df_data 122 | except Exception as e: 123 | raise e 124 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/resources/spark_io_manager.py: -------------------------------------------------------------------------------- 1 | from dagster import IOManager, InputContext, OutputContext 2 | from pyspark.sql import SparkSession, DataFrame 3 | 4 | from contextlib import contextmanager 5 | from datetime import datetime 6 | 7 | 8 | @contextmanager 9 | def get_spark_session(config, run_id="Spark IO Manager"): 10 | executor_memory = "1g" if run_id != "Spark IO Manager" else "1500m" 11 | try: 12 | spark = ( 13 | SparkSession.builder.master("spark://spark-master:7077") 14 | .appName(run_id) 15 | .config("spark.driver.memory", "4g") 16 | .config("spark.executor.memory", executor_memory) 17 | .config("spark.cores.max", "4") 18 | .config("spark.executor.cores", "2") 19 | .config( 20 | "spark.jars", 21 | "/usr/local/spark/jars/delta-core_2.12-2.2.0.jar,/usr/local/spark/jars/hadoop-aws-3.3.2.jar,/usr/local/spark/jars/delta-storage-2.2.0.jar,/usr/local/spark/jars/aws-java-sdk-1.12.367.jar,/usr/local/spark/jars/s3-2.18.41.jar,/usr/local/spark/jars/aws-java-sdk-bundle-1.11.1026.jar", 22 | ) 23 | .config( 24 | "spark.sql.catalog.spark_catalog", 25 | "org.apache.spark.sql.delta.catalog.DeltaCatalog", 26 | ) 27 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 28 | .config("spark.hadoop.fs.s3a.endpoint", f"http://{config['endpoint_url']}") 29 | .config("spark.hadoop.fs.s3a.access.key", str(config["minio_access_key"])) 30 | .config("spark.hadoop.fs.s3a.secret.key", 
str(config["minio_secret_key"])) 31 | .config("spark.hadoop.fs.s3a.path.style.access", "true") 32 | .config("spark.hadoop.fs.connection.ssl.enabled", "false") 33 | .config( 34 | "spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem" 35 | ) 36 | .config("spark.sql.execution.arrow.pyspark.enabled", "true") 37 | .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true") 38 | .getOrCreate() 39 | ) 40 | yield spark 41 | except Exception as e: 42 | raise Exception(f"Error while creating spark session: {e}") 43 | 44 | 45 | class SparkIOManager(IOManager): 46 | def __init__(self, config): 47 | self._config = config 48 | 49 | def handle_output(self, context: "OutputContext", obj: DataFrame): 50 | """ 51 | Write output to s3a (aka minIO) as parquet file 52 | """ 53 | 54 | context.log.debug("(Spark handle_output) Writing output to MinIO ...") 55 | 56 | # E.g file_path: s3a://lakehouse/silver/goodreads/book/book_2021.parquet 57 | # Or file_path: s3a://lakehouse/silver/goodreads/book.parquet if full load 58 | file_path = "s3a://lakehouse/" + "/".join(context.asset_key.path) 59 | if context.has_partition_key: 60 | file_path += f"/book_{context.partition_key}" 61 | file_path += ".parquet" 62 | context.log.debug(f"(Spark handle_output) File path: {file_path}") 63 | file_name = str(context.asset_key.path[-1]) 64 | context.log.debug(f"(Spark handle_output) File name: {file_name}") 65 | 66 | try: 67 | obj.write.mode("overwrite").parquet(file_path) 68 | context.log.debug(f"Saved {file_name} to {file_path}") 69 | except Exception as e: 70 | raise Exception(f"(Spark handle_output) Error while writing output: {e}") 71 | 72 | def load_input(self, context: "InputContext") -> DataFrame: 73 | """ 74 | Load input from s3a (aka minIO) from parquet file to spark.sql.DataFrame 75 | """ 76 | 77 | # E.g context.asset_key.path: ['silver', 'goodreads', 'book'] 78 | context.log.debug(f"Loading input from {context.asset_key.path}...") 79 | file_path = "s3a://lakehouse/" + "/".join(context.asset_key.path) 80 | if context.has_partition_key: 81 | file_path += f"/book_{context.partition_key}" 82 | full_load = (context.metadata or {}).get("full_load", False) 83 | if not full_load: 84 | file_path += ".parquet" 85 | # E.g file_path: s3a://lakehouse/silver/goodreads/book/book_2021.parquet 86 | # Or file_path: s3a://lakehouse/silver/goodreads/book if has partitions 87 | context.log.debug("File path: " + file_path) 88 | 89 | try: 90 | with get_spark_session(self._config) as spark: 91 | df = None 92 | if full_load: 93 | tmp_df = spark.read.parquet(file_path + "/book_2022.parquet") 94 | book_schema = tmp_df.schema 95 | df = ( 96 | spark.read.format("parquet") 97 | .options(header=True, inferSchema=False) 98 | .schema(book_schema) 99 | .load(file_path + "/*.parquet") 100 | ) 101 | else: 102 | df = spark.read.parquet(file_path) 103 | context.log.debug(f"Loaded {df.count()} rows from {file_path}") 104 | return df 105 | except Exception as e: 106 | raise Exception(f"Error while loading input: {e}") 107 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/resources/psql_io_manager.py: -------------------------------------------------------------------------------- 1 | from dagster import IOManager, InputContext, OutputContext 2 | from contextlib import contextmanager 3 | import polars as pl 4 | from datetime import datetime 5 | import psycopg2 6 | from psycopg2 import sql 7 | import psycopg2.extras 8 | 9 | 10 | @contextmanager 11 | def connect_psql(config): 12 | 
try: 13 | yield psycopg2.connect( 14 | host=config["host"], 15 | port=config["port"], 16 | database=config["database"], 17 | user=config["user"], 18 | password=config["password"], 19 | ) 20 | except (Exception) as e: 21 | print(f"Error while connecting to PostgreSQL: {e}") 22 | 23 | 24 | class PostgreSQLIOManager(IOManager): 25 | def __init__(self, config): 26 | self._config = config 27 | 28 | def handle_output(self, context: "OutputContext", obj: pl.DataFrame): 29 | # E.g context.asset_key.path = ['warehouse', 'gold', 'book_genre'] 30 | schema = context.asset_key.path[-2] 31 | # NOTE: Replace pattern is 'warehouse', not general 32 | table = str(context.asset_key.path[-1]).replace("warehouse_", "") 33 | context.log.debug(f"Schema: {schema}, Table: {table}") 34 | tmp_tbl = f"{table}_tmp_{datetime.now().strftime('%Y_%m_%d')}" 35 | try: 36 | with connect_psql(self._config) as conn: 37 | context.log.debug(f"Connected to PostgreSQL: {conn}") 38 | primary_keys = (context.metadata or {}).get("primary_keys", []) 39 | context.log.debug(f"Primary keys: {primary_keys}") 40 | 41 | with conn.cursor() as cursor: 42 | context.log.debug(f"Cursor info: {cursor}") 43 | cursor.execute("SELECT version()") 44 | context.log.info(f"PostgreSQL version: {cursor.fetchone()}") 45 | # Create temp file 46 | cursor.execute( 47 | f"CREATE TEMP TABLE IF NOT EXISTS {tmp_tbl} (LIKE {schema}.{table})" 48 | ) 49 | cursor.execute(f"SELECT COUNT(*) FROM {tmp_tbl}") 50 | context.log.debug( 51 | f"Log for creating temp table: {cursor.fetchone()}" 52 | ) 53 | # Create sql identifiers for the column names 54 | # Do this to safely insert into a sql query 55 | columns = sql.SQL(",").join( 56 | sql.Identifier(name.lower()) for name in obj.columns 57 | ) 58 | # Create a placeholder for the values. 
These will be filled later 59 | values = sql.SQL(",").join(sql.Placeholder() for _ in obj.columns) 60 | # Create the insert query 61 | context.log.debug("Inserting data into temp table") 62 | insert_query = sql.SQL("INSERT INTO {} ({}) VALUES({});").format( 63 | sql.Identifier(tmp_tbl), columns, values 64 | ) 65 | # Execute the insert query 66 | psycopg2.extras.execute_batch(cursor, insert_query, obj.rows()) 67 | conn.commit() 68 | 69 | # Check data inserted 70 | context.log.debug("Checking data inserted") 71 | cursor.execute(f"SELECT COUNT(*) FROM {tmp_tbl};") 72 | context.log.info(f"Number of rows inserted: {cursor.fetchone()}") 73 | # Upsert data 74 | if len(primary_keys) > 0: 75 | context.log.debug("Table has primary keys, upserting data") 76 | conditions = " AND ".join( 77 | [ 78 | f""" {schema}.{table}."{k}" = {tmp_tbl}."{k}" """ 79 | for k in primary_keys 80 | ] 81 | ) 82 | command = f""" 83 | BEGIN TRANSACTION; 84 | DELETE FROM {schema}.{table} 85 | USING {tmp_tbl} 86 | WHERE {conditions}; 87 | 88 | INSERT INTO {schema}.{table} 89 | SELECT * FROM {tmp_tbl}; 90 | 91 | END TRANSACTION; 92 | """ 93 | else: 94 | context.log.debug("Table has no primary keys, replacing data") 95 | command = f""" 96 | BEGIN TRANSACTION; 97 | DELETE FROM {schema}.{table}; 98 | 99 | INSERT INTO {schema}.{table} 100 | SELECT * FROM {tmp_tbl}; 101 | 102 | END TRANSACTION; 103 | """ 104 | 105 | # context.log.debug(f"Command: {command}") 106 | context.log.debug(f"Upserting data into {schema}.{table}") 107 | cursor.execute(command) 108 | context.log.debug(f"{cursor.statusmessage}") 109 | conn.commit() 110 | except (Exception) as e: 111 | print(f"Error while handling output to PostgreSQL: {e}") 112 | 113 | try: 114 | with connect_psql(self._config) as conn: 115 | with conn.cursor() as cursor: 116 | context.log.debug(f"{cursor.fetchone()}") 117 | cursor.execute(f"SELECT COUNT(*) FROM {schema}.{table};") 118 | context.log.info( 119 | f"Number of rows upserted in {schema}.{table}: {cursor.fetchone()}" 120 | ) 121 | 122 | # Drop temp table 123 | cursor.execute(f"DROP TABLE {tmp_tbl}") 124 | conn.commit() 125 | except (Exception) as e: 126 | print(f"Error while testing handle_output to PostgreSQL: {e}") 127 | 128 | def load_input(self, context: "InputContext"): 129 | pass 130 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/assets/bronze.py: -------------------------------------------------------------------------------- 1 | from dagster import asset, AssetIn, Output, StaticPartitionsDefinition 2 | from datetime import datetime 3 | import polars as pl 4 | 5 | 6 | COMPUTE_KIND = "SQL" 7 | LAYER = "bronze" 8 | YEARLY = StaticPartitionsDefinition( 9 | [str(year) for year in range(1975, datetime.today().year)] 10 | ) 11 | 12 | 13 | # genre from my_sql 14 | @asset( 15 | description="Load table 'genre' from MySQL database as polars DataFrame, and save to minIO", 16 | io_manager_key="minio_io_manager", 17 | required_resource_keys={"mysql_io_manager"}, 18 | key_prefix=["bronze", "goodreads"], 19 | compute_kind=COMPUTE_KIND, 20 | group_name=LAYER, 21 | ) 22 | def bronze_genre(context) -> Output[pl.DataFrame]: 23 | query = "SELECT * FROM genre;" 24 | df_data = context.resources.mysql_io_manager.extract_data(query) 25 | context.log.info(f"Table extracted with shape: {df_data.shape}") 26 | 27 | return Output( 28 | value=df_data, 29 | metadata={ 30 | "table": "genre", 31 | "row_count": df_data.shape[0], 32 | "column_count": df_data.shape[1], 33 | "columns": 
df_data.columns, 34 | }, 35 | ) 36 | 37 | 38 | # book from my_sql 39 | @asset( 40 | description="Load table 'book' from MySQL database as polars DataFrame, and save to minIO", 41 | partitions_def=YEARLY, 42 | io_manager_key="minio_io_manager", 43 | required_resource_keys={"mysql_io_manager"}, 44 | key_prefix=["bronze", "goodreads"], 45 | compute_kind=COMPUTE_KIND, 46 | group_name=LAYER, 47 | ) 48 | def bronze_book(context) -> Output[pl.DataFrame]: 49 | query = "SELECT * FROM book" 50 | try: 51 | partion_year_str = context.asset_partition_key_for_output() 52 | partition_by = "PublishYear" 53 | query += f" WHERE {partition_by} = {partion_year_str}" 54 | context.log.info(f"Partition by {partition_by} = {partion_year_str}") 55 | except Exception: 56 | context.log.info("No partition key found, full load data") 57 | 58 | df_data = context.resources.mysql_io_manager.extract_data(query) 59 | context.log.info(f"Table extracted with shape: {df_data.shape}") 60 | 61 | return Output( 62 | value=df_data, 63 | metadata={ 64 | "table": "book", 65 | "row_count": df_data.shape[0], 66 | "column_count": df_data.shape[1], 67 | "columns": df_data.columns, 68 | }, 69 | ) 70 | 71 | 72 | # book_genre from my_sql 73 | @asset( 74 | description="Load table 'book_genre' from MySQL database as polars DataFrame, and save to minIO", 75 | io_manager_key="minio_io_manager", 76 | required_resource_keys={"mysql_io_manager"}, 77 | non_argument_deps={"bronze_book", "bronze_genre"}, 78 | key_prefix=["bronze", "goodreads"], 79 | compute_kind=COMPUTE_KIND, 80 | group_name=LAYER, 81 | ) 82 | def bronze_book_genre(context) -> Output[pl.DataFrame]: 83 | query = "SELECT * FROM book_genre;" 84 | df_data = context.resources.mysql_io_manager.extract_data(query) 85 | context.log.info(f"Table extracted with shape: {df_data.shape}") 86 | 87 | return Output( 88 | value=df_data, 89 | metadata={ 90 | "table": "book_genre", 91 | "row_count": df_data.shape[0], 92 | "column_count": df_data.shape[1], 93 | "columns": df_data.columns, 94 | }, 95 | ) 96 | 97 | 98 | # book_download_link from my_sql 99 | @asset( 100 | description="Load table 'book_download_link' from MySQL database as polars DataFrame, and save to minIO", 101 | io_manager_key="minio_io_manager", 102 | required_resource_keys={"mysql_io_manager"}, 103 | non_argument_deps={"bronze_book"}, 104 | key_prefix=["bronze", "goodreads"], 105 | compute_kind=COMPUTE_KIND, 106 | group_name=LAYER, 107 | ) 108 | def bronze_book_download_link(context) -> Output[pl.DataFrame]: 109 | query = "SELECT * FROM book_download_link;" 110 | df_data = context.resources.mysql_io_manager.extract_data(query) 111 | context.log.info(f"Table extracted with shape: {df_data.shape}") 112 | 113 | return Output( 114 | value=df_data, 115 | metadata={ 116 | "table": "book_download_link", 117 | "row_count": df_data.shape[0], 118 | "column_count": df_data.shape[1], 119 | "columns": df_data.columns, 120 | }, 121 | ) 122 | 123 | 124 | # download files from gdrive, given a download link 125 | @asset( 126 | description="Download image and epub file for books from gdrive, given a download link", 127 | io_manager_key="gdrive_io_manager", 128 | ins={ 129 | "bronze_book_download_link": AssetIn( 130 | key_prefix=["bronze", "goodreads"], 131 | ) 132 | }, 133 | compute_kind="google drive", 134 | group_name=LAYER, 135 | ) 136 | def bronze_images_and_files_download( 137 | context, bronze_book_download_link: pl.DataFrame 138 | ) -> Output[dict]: 139 | """ 140 | From upstream table 'book_download_link', download files from google drive 141 
| with given download link, extract the images and files from it, 142 | then return the path to the folder containing the downloaded files. 143 | """ 144 | 145 | # Create temp folder path, e.g '/tmp/bronze/download/2021-08-01T00:00:00+00:00' -> images|files 146 | # WARN: If change the key_prefix above, also change the path here 147 | tmp_folder_path = f"/tmp/bronze/download/{datetime.now().isoformat()}" 148 | context.log.info(f"Path: {tmp_folder_path}") 149 | 150 | # Download folders by call download_folders function from gdrive_io_manager 151 | context.resources.gdrive_io_manager.download_folders( 152 | context=context, 153 | dataframe=bronze_book_download_link, 154 | tmp_folder_path=tmp_folder_path, 155 | ) 156 | 157 | return Output( 158 | value={ 159 | "tmp_folder_path": tmp_folder_path, 160 | "isbn": bronze_book_download_link["BookISBN"].to_list(), 161 | }, 162 | metadata={ 163 | "isbn": bronze_book_download_link["BookISBN"].to_list(), 164 | "download_link": bronze_book_download_link["Link"].to_list(), 165 | }, 166 | ) 167 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/assets/warehouse.py: -------------------------------------------------------------------------------- 1 | from dagster import ( 2 | asset, 3 | AssetIn, 4 | Output, 5 | StaticPartitionsDefinition, 6 | ) 7 | from pyspark.sql import DataFrame 8 | from datetime import datetime 9 | import pyarrow as pa 10 | import polars as pl 11 | 12 | 13 | COMPUTE_KIND = "Postgres" 14 | LAYER = "warehouse" 15 | YEARLY = StaticPartitionsDefinition( 16 | [str(year) for year in range(1975, datetime.today().year)] 17 | ) 18 | 19 | 20 | # Asset warehouse_book_with_info 21 | @asset( 22 | description="Load book_with_info data from spark to postgres", 23 | ins={ 24 | "gold_book_with_info": AssetIn( 25 | key_prefix=["gold", "goodreads"], 26 | ), 27 | }, 28 | metadata={ 29 | "primary_keys": ["isbn"], 30 | "columns": ["isbn", "name", "authors", "language", "pagesnumber"], 31 | }, 32 | io_manager_key="psql_io_manager", 33 | key_prefix=["gold"], # Database: goodreads, Schema: gold 34 | compute_kind=COMPUTE_KIND, 35 | group_name=LAYER, 36 | ) 37 | def book_with_info(context, gold_book_with_info: DataFrame): 38 | """ 39 | Load book_with_info data from spark to postgres 40 | """ 41 | 42 | context.log.info("Got spark DataFrame, loading to postgres") 43 | # Convert from spark DataFrame to polars DataFrame 44 | df = pl.from_arrow(pa.Table.from_batches(gold_book_with_info._collect_as_arrow())) 45 | context.log.debug(f"Got polars DataFrame with shape: {df.shape}") 46 | 47 | return Output( 48 | value=df, 49 | metadata={ 50 | "database": "goodreads", 51 | "schema": "gold", 52 | "table": "book_with_info", 53 | "primary_keys": ["isbn"], 54 | "columns": ["isbn", "name", "authors", "language", "pagesnumber"], 55 | }, 56 | ) 57 | 58 | 59 | # Asset warehouse_book_with_publish 60 | @asset( 61 | description="Load book_with_publish data from spark to postgres", 62 | ins={ 63 | "gold_book_with_publish": AssetIn( 64 | key_prefix=["gold", "goodreads"], 65 | ), 66 | }, 67 | metadata={ 68 | "primary_keys": ["isbn"], 69 | "columns": ["isbn", "publisher", "publishyear", "publishmonth", "publishday"], 70 | }, 71 | io_manager_key="psql_io_manager", 72 | key_prefix=["gold"], # Database: goodreads, Schema: gold 73 | compute_kind=COMPUTE_KIND, 74 | group_name=LAYER, 75 | ) 76 | def book_with_publish(context, gold_book_with_publish: DataFrame): 77 | """ 78 | Load book_with_publish data from spark to postgres 79 | """ 80 | 
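    # NOTE: _collect_as_arrow() gathers the whole Spark DataFrame on the driver as Arrow record batches, so the table must fit in driver memory before the polars conversion below.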
81 | context.log.info("Got spark DataFrame, loading to postgres") 82 | # Convert from spark DataFrame to polars DataFrame 83 | df = pl.from_arrow( 84 | pa.Table.from_batches(gold_book_with_publish._collect_as_arrow()) 85 | ) 86 | context.log.debug(f"Got polars DataFrame with shape: {df.shape}") 87 | 88 | return Output( 89 | value=df, 90 | metadata={ 91 | "database": "goodreads", 92 | "schema": "gold", 93 | "table": "book_with_publish", 94 | "primary_keys": ["isbn"], 95 | "columns": [ 96 | "isbn", 97 | "publisher", 98 | "publishyear", 99 | "publishmonth", 100 | "publishday", 101 | ], 102 | }, 103 | ) 104 | 105 | 106 | # Asset warehouse_book_with_rating 107 | @asset( 108 | description="Load book_with_rating data from spark to postgres", 109 | ins={ 110 | "gold_book_with_rating": AssetIn( 111 | key_prefix=["gold", "goodreads"], 112 | ), 113 | }, 114 | metadata={ 115 | "primary_keys": ["isbn"], 116 | "columns": [ 117 | "isbn", 118 | "rating", 119 | "ratingdist5", 120 | "ratingdist4", 121 | "ratingdist3", 122 | "ratingdist2", 123 | "ratingdist1", 124 | "ratingdisttotal", 125 | "countoftextreviews", 126 | ], 127 | }, 128 | io_manager_key="psql_io_manager", 129 | key_prefix=["gold"], # Database: goodreads, Schema: gold 130 | compute_kind=COMPUTE_KIND, 131 | group_name=LAYER, 132 | ) 133 | def book_with_rating(context, gold_book_with_rating: DataFrame): 134 | """ 135 | Load book_with_rating data from spark to postgres 136 | """ 137 | 138 | context.log.info("Got spark DataFrame, loading to postgres") 139 | # Convert from spark DataFrame to polars DataFrame 140 | df = pl.from_arrow(pa.Table.from_batches(gold_book_with_rating._collect_as_arrow())) 141 | context.log.debug(f"Got polars DataFrame with shape: {df.shape}") 142 | 143 | return Output( 144 | value=df, 145 | metadata={ 146 | "database": "goodreads", 147 | "schema": "gold", 148 | "table": "book_with_rating", 149 | "primary_keys": ["isbn"], 150 | "columns": [ 151 | "isbn", 152 | "rating", 153 | "ratingdist5", 154 | "ratingdist4", 155 | "ratingdist3", 156 | "ratingdist2", 157 | "ratingdist1", 158 | "ratingdisttotal", 159 | "countoftextreviews", 160 | ], 161 | }, 162 | ) 163 | 164 | 165 | # Asset warehouse_book_download_link 166 | @asset( 167 | description="Load book_download_link data from minio to postgres", 168 | ins={ 169 | "bronze_book_download_link": AssetIn( 170 | key_prefix=["bronze", "goodreads"], 171 | ), 172 | }, 173 | metadata={ 174 | "primary_keys": ["isbn"], 175 | "columns": ["isbn", "link"], 176 | }, 177 | io_manager_key="psql_io_manager", 178 | key_prefix=["recommendations"], 179 | compute_kind=COMPUTE_KIND, 180 | group_name=LAYER, 181 | ) 182 | def book_download_link(context, bronze_book_download_link: pl.DataFrame): 183 | """ 184 | Load book_download_link data from minio to postgres 185 | """ 186 | 187 | df = bronze_book_download_link 188 | # Rename column BookISBN to isbn 189 | df = df.rename({"BookISBN": "isbn"}) 190 | context.log.info(f"Columns: {df.columns}") 191 | 192 | return Output( 193 | value=df, 194 | metadata={ 195 | "database": "goodreads", 196 | "schema": "recommendations", 197 | "table": "book_download_link", 198 | "primary_keys": ["isbn"], 199 | "columns": ["isbn", "link"], 200 | }, 201 | ) 202 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/assets/gold.py: -------------------------------------------------------------------------------- 1 | from dagster import ( 2 | asset, 3 | multi_asset, 4 | AssetIn, 5 | AssetOut, 6 | Output, 7 | 
StaticPartitionsDefinition, 8 | ) 9 | from pyspark.sql import DataFrame 10 | from datetime import datetime 11 | import pyarrow as pa 12 | import polars as pl 13 | 14 | 15 | COMPUTE_KIND = "Python" 16 | YEARLY = StaticPartitionsDefinition( 17 | [str(year) for year in range(1975, datetime.today().year)] 18 | ) 19 | 20 | 21 | # genre to gold (minIO) and warehouse (postgres) 22 | @multi_asset( 23 | ins={ 24 | "silver_collected_genre": AssetIn( 25 | key_prefix=["silver", "goodreads"], 26 | ) 27 | }, 28 | outs={ 29 | "gold_genre": AssetOut( 30 | description="Load genre data from spark to minIO", 31 | io_manager_key="spark_io_manager", 32 | key_prefix=["gold", "goodreads"], 33 | group_name="gold", 34 | ), 35 | "genre": AssetOut( 36 | description="Load genre data from spark to postgres", 37 | io_manager_key="psql_io_manager", 38 | key_prefix=["gold"], # Database: goodreads, Schema: gold 39 | metadata={ 40 | "primary_keys": ["id", "name"], 41 | "columns": ["id", "name"], 42 | }, 43 | group_name="warehouse", 44 | ), 45 | }, 46 | compute_kind=COMPUTE_KIND, 47 | ) 48 | def genre(context, silver_collected_genre: DataFrame): 49 | """ 50 | Load genre data from spark to minIO and postgres 51 | """ 52 | 53 | spark_df = silver_collected_genre 54 | 55 | context.log.info("Got spark DataFrame, converting to polars DataFrame") 56 | # Convert from spark DataFrame to polars DataFrame 57 | df = pl.from_arrow( 58 | pa.Table.from_batches(silver_collected_genre._collect_as_arrow()) 59 | ) 60 | context.log.debug(f"Got polars DataFrame with shape: {df.shape}") 61 | 62 | return Output( 63 | value=spark_df, 64 | metadata={ 65 | "table": "gold_genre", 66 | "row_count": spark_df.count(), 67 | "column_count": len(spark_df.columns), 68 | "columns": spark_df.columns, 69 | }, 70 | ), Output( 71 | value=df, 72 | metadata={ 73 | "database": "goodreads", 74 | "schema": "gold", 75 | "table": "genre", 76 | "primary_keys": ["id", "name"], 77 | "columns": ["id", "name"], 78 | }, 79 | ) 80 | 81 | 82 | # book_genre to gold (minIO) and warehouse (postgres) 83 | @multi_asset( 84 | ins={ 85 | "silver_collected_book_genre": AssetIn( 86 | key_prefix=["silver", "goodreads"], 87 | ) 88 | }, 89 | outs={ 90 | "gold_book_genre": AssetOut( 91 | description="Load book_genre data from spark to minIO", 92 | io_manager_key="spark_io_manager", 93 | key_prefix=["gold", "goodreads"], 94 | group_name="gold", 95 | ), 96 | "book_genre": AssetOut( 97 | description="Load book_genre data from spark to postgres", 98 | io_manager_key="psql_io_manager", 99 | key_prefix=["gold"], # Database: goodreads, Schema: gold 100 | metadata={ 101 | "primary_keys": ["bookisbn", "genreid"], 102 | "columns": ["bookisbn", "genreid"], 103 | }, 104 | group_name="warehouse", 105 | ), 106 | }, 107 | compute_kind=COMPUTE_KIND, 108 | ) 109 | def book_genre(context, silver_collected_book_genre: DataFrame): 110 | """ 111 | Load book_genre data from spark to minIO and postgres 112 | """ 113 | 114 | spark_df = silver_collected_book_genre 115 | 116 | context.log.info("Got spark DataFrame, converting to polars DataFrame") 117 | # Convert from spark DataFrame to polars DataFrame 118 | df = pl.from_arrow( 119 | pa.Table.from_batches(silver_collected_book_genre._collect_as_arrow()) 120 | ) 121 | context.log.debug(f"Got polars DataFrame with shape: {df.shape}") 122 | 123 | return Output( 124 | value=spark_df, 125 | metadata={ 126 | "table": "gold_book_genre", 127 | "row_count": spark_df.count(), 128 | "column_count": len(spark_df.columns), 129 | "columns": spark_df.columns, 130 | }, 131 | ), 
Output( 132 | value=df, 133 | metadata={ 134 | "database": "goodreads", 135 | "schema": "gold", 136 | "table": "book_genre", 137 | "record_count": df.shape[0], 138 | }, 139 | ) 140 | 141 | 142 | # Asset book_with_info 143 | @asset( 144 | description="Split book table to get basic info", 145 | # partitions_def=YEARLY, 146 | ins={ 147 | "silver_collected_book": AssetIn( 148 | key_prefix=["silver", "goodreads"], 149 | metadata={"full_load": True}, 150 | ), 151 | }, 152 | io_manager_key="spark_io_manager", 153 | key_prefix=["gold", "goodreads"], 154 | compute_kind="PySpark", 155 | group_name="gold", 156 | ) 157 | def gold_book_with_info(context, silver_collected_book: DataFrame): 158 | """ 159 | Split book table to get basic info 160 | """ 161 | 162 | spark_df = silver_collected_book 163 | context.log.info("Got spark DataFrame, getting neccessary columns") 164 | 165 | # Drop rows with null value in Language column 166 | spark_df = spark_df.dropna(subset=["Language"]) 167 | 168 | # Select columns ISBN, Name, Authors, Language, Description, PagesNumber 169 | spark_df = spark_df.select( 170 | "ISBN", 171 | "Name", 172 | "Authors", 173 | "Language", 174 | "PagesNumber", 175 | ) 176 | spark_df.collect() 177 | 178 | return Output( 179 | value=spark_df, 180 | metadata={ 181 | "table": "gold_book_with_info", 182 | "row_count": spark_df.count(), 183 | "column_count": len(spark_df.columns), 184 | "columns": spark_df.columns, 185 | }, 186 | ) 187 | 188 | 189 | # Asset book_with_publish 190 | @asset( 191 | description="Split book table to get publishing info", 192 | ins={ 193 | "silver_collected_book": AssetIn( 194 | key_prefix=["silver", "goodreads"], 195 | metadata={"full_load": True}, 196 | ), 197 | }, 198 | io_manager_key="spark_io_manager", 199 | key_prefix=["gold", "goodreads"], 200 | compute_kind="PySpark", 201 | group_name="gold", 202 | ) 203 | def gold_book_with_publish(context, silver_collected_book: DataFrame): 204 | """ 205 | Split book table to get publishing info 206 | """ 207 | 208 | spark_df = silver_collected_book 209 | context.log.info("Got spark DataFrame, getting neccessary columns") 210 | 211 | # Drop rows with null value in Language column 212 | spark_df = spark_df.dropna(subset=["Language"]) 213 | 214 | # Select columns ISBN, Publisher, PublishYear, PublishMonth, PublishDay 215 | spark_df = spark_df.select( 216 | "ISBN", 217 | "Publisher", 218 | "PublishYear", 219 | "PublishMonth", 220 | "PublishDay", 221 | ) 222 | spark_df.collect() 223 | 224 | return Output( 225 | value=spark_df, 226 | metadata={ 227 | "table": "gold_book_with_publish", 228 | "row_count": spark_df.count(), 229 | "column_count": len(spark_df.columns), 230 | "columns": spark_df.columns, 231 | }, 232 | ) 233 | 234 | 235 | # Asset book_with_rating 236 | @asset( 237 | description="Split book table to get rating info", 238 | ins={ 239 | "silver_collected_book": AssetIn( 240 | key_prefix=["silver", "goodreads"], 241 | metadata={"full_load": True}, 242 | ), 243 | }, 244 | io_manager_key="spark_io_manager", 245 | key_prefix=["gold", "goodreads"], 246 | compute_kind="PySpark", 247 | group_name="gold", 248 | ) 249 | def gold_book_with_rating(context, silver_collected_book: DataFrame): 250 | """ 251 | Split book table to get rating info 252 | """ 253 | 254 | spark_df = silver_collected_book 255 | context.log.info("Got spark DataFrame, getting neccessary columns") 256 | 257 | # Drop rows with null value in Language column 258 | spark_df = spark_df.dropna(subset=["Language"]) 259 | 260 | # Select columns ISBN, Rating, 
RatingDist1, RatingDist2, RatingDist3, RatingDist4, RatingDist5, CountOfTextReviews 261 | spark_df = spark_df.select( 262 | "ISBN", 263 | "Rating", 264 | "RatingDist5", 265 | "RatingDist4", 266 | "RatingDist3", 267 | "RatingDist2", 268 | "RatingDist1", 269 | "RatingDistTotal", 270 | "CountOfTextReviews", 271 | ) 272 | spark_df.collect() 273 | 274 | return Output( 275 | value=spark_df, 276 | metadata={ 277 | "table": "gold_book_with_rating", 278 | "row_count": spark_df.count(), 279 | "column_count": len(spark_df.columns), 280 | "columns": spark_df.columns, 281 | }, 282 | ) 283 | -------------------------------------------------------------------------------- /app/streamlit_app.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from datetime import datetime 3 | import streamlit as st 4 | import polars as pl 5 | import psycopg2 6 | from minio import Minio 7 | import os 8 | from PIL import Image 9 | import requests 10 | 11 | 12 | @contextmanager 13 | def connect_minio(): 14 | client = Minio( 15 | endpoint=os.getenv("MINIO_ENDPOINT"), 16 | access_key=os.getenv("MINIO_ACCESS_KEY"), 17 | secret_key=os.getenv("MINIO_SECRET_KEY"), 18 | secure=False, 19 | ) 20 | 21 | try: 22 | yield client 23 | except Exception as e: 24 | raise e 25 | 26 | 27 | # Make bucket if not exists 28 | def make_bucket(client: Minio, bucket_name): 29 | found = client.bucket_exists(bucket_name) 30 | if not found: 31 | client.make_bucket(bucket_name) 32 | else: 33 | print(f"Bucket {bucket_name} already exists.") 34 | 35 | 36 | # Download a file from minio 37 | def download_image(book_isbn): 38 | tmp_file_path = f"/tmp/{book_isbn}_{datetime.today()}.jpeg" 39 | key_name = f"images/{book_isbn}.jpeg" 40 | bucket_name = os.getenv("DATALAKE_BUCKET") 41 | try: 42 | with connect_minio() as client: 43 | # Make bucket if not exist 44 | make_bucket(client=client, bucket_name=bucket_name) 45 | client.fget_object(bucket_name, key_name, tmp_file_path) 46 | return tmp_file_path 47 | except Exception as e: 48 | raise e 49 | 50 | 51 | st.set_page_config( 52 | page_title="Ultimate Goodreads Recommender", 53 | page_icon="📔", 54 | layout="centered", 55 | initial_sidebar_state="expanded", 56 | ) 57 | 58 | 59 | # Initialize connection. 60 | # Uses st.cache_resource to only run once. 61 | @st.cache_resource 62 | def init_connection(): 63 | return psycopg2.connect(**st.secrets["postgres"]) 64 | 65 | 66 | conn = init_connection() 67 | 68 | 69 | # Perform query. 70 | # Uses st.cache_data to only rerun when the query changes or after 10 min. 71 | @st.cache_data(ttl=600) 72 | def run_query(query): 73 | with conn.cursor() as cur: 74 | cur.execute(query) 75 | return cur.fetchall() 76 | 77 | 78 | st.title("Ultimate Goodreads recommender!") 79 | 80 | book_name = st.text_input("Enter a book name", "Hannibal") 81 | st.write(f"You entered: {book_name}") 82 | 83 | # Take isbn from search_prior if it exists, otherwise from search. 
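# search_prior holds prioritized books (for example those with a download link and higher ratings), so it is checked before the general search index.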
84 | isbn = "" 85 | try: 86 | isbn = run_query( 87 | f"SELECT isbn FROM recommendations.search_prior WHERE name LIKE '{book_name}' LIMIT 1" 88 | )[0][0] 89 | # st.write(f"ISBN: {isbn}") 90 | except IndexError: 91 | try: 92 | isbn = run_query( 93 | f"SELECT isbn FROM recommendations.search WHERE name LIKE '{book_name}' LIMIT 1" 94 | )[0][0] 95 | # st.write(f"ISBN: {isbn}") 96 | except IndexError: 97 | st.write(f"Book {book_name} not found") 98 | else: 99 | print("Error while querying book") 100 | 101 | # From isbn take list of genreid 102 | genreid = None 103 | if isbn != "": 104 | try: 105 | result = run_query( 106 | f"SELECT genreid FROM gold.book_genre WHERE bookisbn = '{isbn}'" 107 | ) 108 | genreid = [x[0] for x in result] 109 | # st.write(f"Genreid: {genreid}") 110 | except Exception as e: 111 | conn.commit() 112 | cursor = conn.cursor() 113 | 114 | # From genreid take books with most common genreid 115 | book_with_most_common_genreid = None 116 | if genreid: 117 | try: 118 | result = run_query( 119 | f""" 120 | WITH common_books AS ( 121 | SELECT bookisbn, COUNT(*) as count 122 | FROM gold.book_genre 123 | WHERE genreid IN {tuple(genreid)} 124 | GROUP BY bookisbn 125 | HAVING COUNT(*) > 3 126 | ORDER BY COUNT(*) DESC 127 | ) 128 | SELECT common_books.bookisbn, common_books.count, criteria.hasdownloadlink, criteria.rating 129 | FROM recommendations.criteria 130 | RIGHT JOIN common_books ON common_books.bookisbn = criteria.isbn 131 | ORDER BY common_books.count DESC, criteria.hasdownloadlink DESC, criteria.rating DESC 132 | """ 133 | ) 134 | # st.write(result) 135 | book_with_most_common_genreid = pl.DataFrame( 136 | { 137 | "bookisbn": [x[0] for x in result], 138 | "count": [x[1] for x in result], 139 | "hasdownloadlink": [x[2] for x in result], 140 | "rating": [x[3] for x in result], 141 | } 142 | ) 143 | book_with_most_common_genreid = book_with_most_common_genreid[:4] 144 | # st.write(book_with_most_common_genreid) 145 | except Exception as e: 146 | print(f"Error while querying: {e}") 147 | conn.commit() 148 | cursor = conn.cursor() 149 | 150 | # From book_with_most_common_genreid show books 151 | if book_with_most_common_genreid is not None: 152 | st.subheader(f"You seached for {book_name}, here's information about the book:") 153 | c1, c2 = st.columns([5, 5]) 154 | rating = book_with_most_common_genreid[0]["rating"][0] 155 | with c1: 156 | image = None 157 | try: 158 | image = Image.open(download_image(isbn)) 159 | st.image(image, caption=f"{book_name}", width=300) 160 | except Exception as e: 161 | req = requests.get( 162 | f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=data&format=json" 163 | ) 164 | json = req.json() 165 | image_url = json.get(f"ISBN:{isbn}", {}).get("cover", {}).get("large") 166 | if image_url: 167 | image = Image.open(requests.get(image_url, stream=True).raw) 168 | st.image(image, caption=f"{book_name}", width=300) 169 | with c2: 170 | author, language, pagesnumber = "", "", "" 171 | try: 172 | result = run_query( 173 | f"SELECT * FROM gold.book_with_info WHERE isbn = '{isbn}'" 174 | ) 175 | author = result[0][2] 176 | language = result[0][3] 177 | pagesnumber = result[0][5] 178 | except Exception as e: 179 | print(f"Error while querying: {e}") 180 | conn.commit() 181 | cursor = conn.cursor() 182 | st.write(f"**Book ISBN**: {isbn}") 183 | st.write(f"**Book name**: {book_name}") 184 | if author != "": 185 | st.write(f"**Author(s)**: {author}") 186 | if language != "": 187 | st.write(f"**Language**: {language}") 188 | if pagesnumber != 0: 189 | 
st.write(f"**Pages number**: {pagesnumber}") 190 | st.write("**Rating**: {:.2f}".format(rating)) 191 | hasdownloadlink = book_with_most_common_genreid[0]["hasdownloadlink"][0] 192 | if hasdownloadlink: 193 | if st.button(f"Send {book_name} to kindle", type="primary"): 194 | st.balloons() 195 | 196 | st.subheader(f"There's {len(book_with_most_common_genreid)-1} related book:") 197 | for i in range(1, len(book_with_most_common_genreid)): 198 | c1, c2 = st.columns([5, 5]) 199 | isbn = book_with_most_common_genreid[i]["bookisbn"][0] 200 | rating = book_with_most_common_genreid[i]["rating"][0] 201 | with c1: 202 | book_name, author, language, pagesnumber = "", "", "", "" 203 | try: 204 | result = run_query( 205 | f"SELECT * FROM gold.book_with_info WHERE isbn = '{isbn}'" 206 | ) 207 | book_name = result[0][1] 208 | author = result[0][2] 209 | language = result[0][3] 210 | pagesnumber = result[0][5] 211 | except Exception as e: 212 | print(f"Error while querying: {e}") 213 | conn.commit() 214 | cursor = conn.cursor() 215 | st.write(f"**Book ISBN**: {isbn}") 216 | st.write(f"**Book name**: {book_name}") 217 | if author != "": 218 | st.write(f"**Author(s)**: {author}") 219 | if language != "": 220 | st.write(f"**Language**: {language}") 221 | if pagesnumber != 0: 222 | st.write(f"**Pages number**: {pagesnumber}") 223 | st.write("**Rating**: {:.2f}".format(rating)) 224 | hasdownloadlink = book_with_most_common_genreid[0]["hasdownloadlink"][0] 225 | if hasdownloadlink: 226 | if st.button(f"Send {book_name} to kindle", type="primary"): 227 | st.balloons() 228 | with c2: 229 | image = None 230 | try: 231 | image = Image.open(download_image(isbn)) 232 | st.image(image, caption=f"{book_name}", width=300) 233 | except Exception as e: 234 | req = requests.get( 235 | f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=data&format=json" 236 | ) 237 | json = req.json() 238 | image_url = json.get(f"ISBN:{isbn}", {}).get("cover", {}).get("large") 239 | if image_url: 240 | image = Image.open(requests.get(image_url, stream=True).raw) 241 | st.image(image, caption=f"{book_name}", width=300) 242 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/resources/gdrive_io_manager.py: -------------------------------------------------------------------------------- 1 | from dagster import IOManager, InputContext, OutputContext 2 | from google_auth_oauthlib.flow import InstalledAppFlow 3 | from googleapiclient.discovery import build 4 | from googleapiclient.http import MediaIoBaseDownload 5 | from google.auth.transport.requests import Request 6 | 7 | import io 8 | import os 9 | import pickle 10 | import polars as pl 11 | from typing import Union 12 | from contextlib import contextmanager 13 | from .minio_io_manager import connect_minio, make_bucket 14 | 15 | 16 | @contextmanager 17 | def gdrive_client(config): 18 | client_secret_file = config["client_secret_file"] 19 | pickle_file = config["pickle_file"] 20 | api_name = config["api_name"] 21 | api_version = config["api_version"] 22 | scopes = config["scopes"] 23 | 24 | cred = None 25 | 26 | if os.path.exists(pickle_file): 27 | with open(pickle_file, "rb") as token: 28 | cred = pickle.load(token) 29 | else: 30 | raise Exception( 31 | f"Pickle not exists from this, pickle_file: {pickle_file} and cred file: {cred}" 32 | ) 33 | 34 | if not cred or not cred.valid: 35 | if cred and cred.expired and cred.refresh_token: 36 | cred.refresh(Request()) 37 | else: 38 | flow = 
InstalledAppFlow.from_client_secrets_file(client_secret_file, scopes) 39 | cred = flow.run_local_server() 40 | 41 | with open(pickle_file, "wb") as token: 42 | pickle.dump(cred, token) 43 | 44 | try: 45 | service = build(api_name, api_version, credentials=cred) 46 | yield service 47 | except Exception as e: 48 | raise Exception("Error while creating gdrive client: {}".format(e)) 49 | 50 | 51 | class GDriveIOManager(IOManager): 52 | def __init__(self, config): 53 | self._config = config 54 | 55 | def download_files( 56 | self, 57 | context, 58 | dowid, 59 | dfilespath, 60 | folder=None, 61 | ): 62 | """ 63 | Download files from gdrive with given id 64 | """ 65 | 66 | with gdrive_client(self._config) as service: 67 | request = service.files().get_media(fileId=dowid) 68 | fh = io.BytesIO() 69 | downloader = MediaIoBaseDownload(fh, request) 70 | done = False 71 | while done is False: 72 | status, done = downloader.next_chunk() 73 | context.log.debug("Download %d%%." % int(status.progress() * 100)) 74 | if folder: 75 | with io.open(folder + "/" + dfilespath, "wb") as f: 76 | fh.seek(0) 77 | f.write(fh.read()) 78 | else: 79 | with io.open(dfilespath, "wb") as f: 80 | fh.seek(0) 81 | f.write(fh.read()) 82 | 83 | def list_folders(self, context, filid, des): 84 | """ 85 | List all items (file/subfolder) in the folder with given id 86 | until all files are found 87 | """ 88 | 89 | with gdrive_client(self._config) as service: 90 | page_token = None 91 | while True: 92 | results = ( 93 | service.files() 94 | .list( 95 | pageSize=1000, 96 | q=f"'{filid}' in parents", 97 | fields="nextPageToken, files(id, name, mimeType)", 98 | ) 99 | .execute() 100 | ) 101 | page_token = results.get("nextPageToken", None) 102 | if page_token is None: 103 | folder = results.get("files", []) 104 | for item in folder: 105 | if ( 106 | str(item["mimeType"]) 107 | == "application/vnd.google-apps.folder" 108 | ): 109 | if not os.path.isdir(des + "/" + item["name"]): 110 | os.mkdir(des + "/" + item["name"]) 111 | self.list_folders( 112 | context, item["id"], des + "/" + item["name"] 113 | ) 114 | else: 115 | self.download_files(context, item["id"], item["name"], des) 116 | context.log.debug(f"Downloaded file {item['name']}") 117 | break 118 | return folder 119 | 120 | def download_folders(self, context, dataframe: pl.DataFrame, tmp_folder_path: str): 121 | """ 122 | Download all files in the folder with given id from gdrive 123 | """ 124 | 125 | with gdrive_client(self._config) as service: 126 | for row in dataframe.rows(named=True): 127 | isbn, folder_id = row["BookISBN"], row["Link"].split("/")[-1] 128 | 129 | folder = service.files().get(fileId=folder_id).execute() 130 | folder_name = folder["name"] 131 | page_token = None 132 | 133 | while True: 134 | results = ( 135 | service.files() 136 | .list( 137 | q=f"'{folder_id}' in parents", 138 | spaces="drive", 139 | fields="nextPageToken, files(id, name, mimeType)", 140 | ) 141 | .execute() 142 | ) 143 | page_token = results.get("nextPageToken", None) 144 | if page_token is None: 145 | items = results.get("files", []) 146 | # If no items in the folder, that means the id is a file -> download the file 147 | if not items: 148 | self.download_files(context, folder_id, folder_name) 149 | context.log.debug(f"Folder name: {folder_name}") 150 | # If there are items in the folder -> download files in folder 151 | else: 152 | context.log.info( 153 | f"Start downloading folder {folder_name} ..." 
154 | ) 155 | 156 | for item in items: 157 | tmp_file_path = os.path.join(tmp_folder_path, str(isbn)) 158 | if not os.path.isdir(tmp_file_path): 159 | os.makedirs(tmp_file_path) 160 | context.log.debug(f"Tmp file path: {tmp_file_path}") 161 | 162 | file_path = "" 163 | file_type = item["mimeType"] 164 | context.log.debug(f"File type: {file_type}") 165 | accept_files = ["jpeg", "epub"] 166 | for accept_file in accept_files: 167 | if accept_file in file_type: 168 | file_path = os.path.join( 169 | tmp_file_path, item["name"] 170 | ) 171 | 172 | context.log.debug(f"File path: {file_path}") 173 | self.download_files( 174 | context, item["id"], file_path 175 | ) 176 | context.log.info( 177 | f"Downloaded file {item['name']}" 178 | ) 179 | 180 | os.rename( 181 | file_path, 182 | os.path.join( 183 | tmp_file_path, f"{isbn}.{accept_file}" 184 | ), 185 | ) 186 | break 187 | 188 | def handle_output(self, context: "OutputContext", obj: dict): 189 | """ 190 | Handle returned temporary folder path, load all files in the folder to minIO 191 | /tmp/bronze/download/2021-08-01T00:00:00+00:00 192 | /images 193 | ...jpeg 194 | /files 195 | ...epub 196 | to minIO: 197 | /lakehouse/images 198 | /lakehouse/files 199 | """ 200 | 201 | tmp_folder_path = str(obj.get("tmp_folder_path")) 202 | isbn_list = obj.get("isbn") 203 | context.add_output_metadata({"tmp": tmp_folder_path}) 204 | 205 | try: 206 | bucket_name = self._config.get("bucket") 207 | with connect_minio(self._config) as client: 208 | # Make bucket if not exist 209 | make_bucket(client, bucket_name) 210 | for isbn in isbn_list: 211 | for filetype in ["epub", "jpeg"]: 212 | # Upload epub file to minIO 213 | # E.g /tmp/bronze/download/2021-08-01T00:00:00+00:00/123456/123456.epub/jpeg 214 | tmp_file_path = os.path.join( 215 | tmp_folder_path, str(isbn), f"{isbn}.{filetype}" 216 | ) 217 | key_name = "files" if filetype == "epub" else "images" 218 | key_name += f"/{isbn}.{filetype}" 219 | 220 | # E.g bucket_name: lakehouse, key_name: files or images, tmp_file_path: /tmp/bronze/download/2021-08-01T00:00:00+00:00/123456/123456.epub/jpeg 221 | client.fput_object(bucket_name, key_name, tmp_file_path) 222 | context.log.debug( 223 | f"(MinIO handle_output) Got book.{filetype} with isbn: {isbn}" 224 | ) 225 | 226 | # Clean up tmp file 227 | os.remove(tmp_file_path) 228 | except Exception as e: 229 | raise e 230 | 231 | def load_input(self, context: "InputContext"): 232 | """ 233 | Skip this function 234 | """ 235 | pass 236 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # goodreads-elt-pipeline 2 | 3 | For Vietnamese edition, please visit 4 | 5 | In this project, I will guide you on building a basic data pipeline using the ELT model (extract - load - transform), using the dataset from Goodreads to ingest and transform data to serve a book recommendation system for yourself. 6 | 7 | This project is completed based on the knowledge learned from the Fundamental Data Engineering 02 course of AIDE. Special thanks to Mr. Nguyen Thanh Binh, Mr. Ong Xuan Hong, and Mr. Hung Le. 8 | 9 | ## 1. Introduction 10 | 11 | I love reading books, and I have a Kindle e-reader for my daily reading. 12 | 13 | ![](./images/introduction.jpg) 14 | 15 | One thing I like about the Kindle is that it has a separate email address provided by Amazon. 
If I use my own email to send e-book files (in .epub/.mobi format), the Amazon system will automatically send the files to my Kindle as long as there is an internet connection. 16 | 17 | So why not build an app that can extract data from Goodreads (a social network for book lovers), process it, and provide recommendations for my next reads? And that's where the project begins :D 18 | 19 | ## 2. Objective 20 | 21 | Dataset is collected from [Kaggle](https://www.kaggle.com/datasets/bahramjannesarr/goodreads-book-datasets-10m), [OpenLibrary API](https://openlibrary.org), [Google Drive API](https://developers.google.com/drive) and [Notion API](https://www.notion.so/lelouvincx/9dca269701b44d6b944c51a7f63b5b5a?v=4852b4d5cf8440e4bc232b0e25327f93) 22 | 23 | The objective of the project is to provide book recommendations to users based on the processed data collected. When a user inputs information about a book they have read, the app will suggest potential next reads. If the book has an .epub file, the app will also have a feature to send the book to the user's Kindle. 24 | 25 | ## 3. Design 26 | 27 | ### 3.1 Directory tree 28 | 29 | ![](./images/directory_tree.png) 30 | 31 | - `app`: The UI's application written with streamlit 32 | - `dagster_home`: Dagit and dagster daemon's configurations 33 | - `dataset`: Dataset under .csv format, in order to load into MySQL 34 | - `docker-compose`: To compose docker containers 35 | - `dockerimages`: Include self-built docker images, such as dagster (for dagit + daemon), spark master, streamlit app, ... 36 | - `EDA.ipynb`: Exploratory Data Analysis, view directly [here](https://gist.github.com/lelouvincx/a88fa6caf59d7ff76086ab485ecc69bd) 37 | - `elt_pipeline`: The pipeline 38 | - `dbt_transform`: dbt's code location, used for the last transform step 39 | - `Dockerfile + requirements.txt`: Docker image 40 | - `elt_pipeline`: EL (Extract -> Transform) pipeline 41 | - `.env + .spark_master.env + .spark_worker.env`: Env variables (e.g POSTGRES_USER, MYSQL_USER, SPARK, ...) 42 | - `env.template`: Env variables template 43 | - `.git + .gitignore`: Code versioning 44 | - `Makefile`: Shortcut for terminal's commands 45 | - `load_dataset`: .sql scripts to create schema and load `dataset` into MySQL, Postgres 46 | - `requirements.txt + Pipfile + Pipfile.lock`: Python's dependencies 47 | 48 | In addition, the containers also have their own separate directories, which include: 49 | 50 | - `minio` 51 | - `storage` 52 | - `mysql_data` 53 | - `postgres_data` 54 | - `metabase_data` 55 | 56 | Visit file [tree.txt](https://github.com/lelouvincx/goodreads-elt-pipeline/blob/main/tree.txt) for more details. 57 | 58 | ### 3.2 Pipeline design 59 | 60 | ![](./images/design_pipeline.png "Pipeline Design") 61 | 62 | 0. We use docker to containerize the application and dagster to orchestrate assets (as defined in dagster's [documentation](https://docs.dagster.io/concepts/assets/software-defined-assets)). 63 | 1. Goodreads data is downloaded from Kaggle in `.csv` format, then imported into `MySQL` to simulate development data 64 | 2. After obtaining the book's ISBN (international standard book number), collect additional data from relevant APIs: 65 | - Genre, author, pages number, image, description from `OpenLibrary API` 66 | - Download link from `Notion API` 67 | - Epub file from `Google Drive API` 68 | - Image from `OpenLibrary API` or `Google Drive API` 69 | 3. Extract the table-formatted data above using `polars`, and load it into the datalake - `MinIO`. 70 | 4. 
From `MinIO`, load data into `spark` to transform the raw (bronze) data into the silver & gold layers 71 | 5. Convert `Spark DataFrame` to `.parquet`, and load back to `MinIO` 72 | 6. Load the gold layer into the data warehouse - PostgreSQL - creating the warehouse layer. 73 | 7. Transform as needed using `dbt` on `postgres` 74 | 8. Visualize the data using `metabase` 75 | 9. Create a book recommendation app using `streamlit` 76 | 77 | ### 3.3 Database schema 78 | 79 | ![](./images/design_schema.png) 80 | 81 | 1. `book`: OLTP table containing books' information (e.g. ISBN, Authors, Rating, Description...) 82 | 2. `genre`: table containing genres 83 | 3. `book_genre`: n-n relationship of `book` and `genre` 84 | 4. `book_download_link`: table containing Google Drive download links 85 | 5. `files`: object storage containing books' download files (.epub/.pdf/.mobi) 86 | 6. `images`: object storage containing books' images 87 | 88 | ### 3.4 Datalake structure 89 | 90 | ![](./images/datalake_structure.png "Datalake Structure") 91 | 92 | 1. The datalake is divided into three layers: bronze, silver, and gold. 93 | 2. All files are under the .parquet format for better reading performance than .csv. 94 | 3. There is also a `files` location that stores .epub files named abc.epub, where abc is the ISBN of the book. 95 | 4. Similarly, abc.jpeg stores the image of the book. 96 | 97 | ### 3.5 Data lineage 98 | 99 | 1. General 100 | 101 | ![](./images/assets_general.png) 102 | 103 | With a dense data lineage, Dagster is a big help when it comes to visualizing it in a clear way: 104 | 105 | - Data originates from MySQL and various APIs, and is loaded into the bronze layer. 106 | - From the bronze layer, data is deduped, cleaned, and missing values are filled in the silver layer. 107 | - Advanced computations and splitting are then performed in the gold layer. 108 | - The data is loaded into the data warehouse - Postgres - in the warehouse layer. 109 | - Finally, transformations are made according to needs in the recommendations layer using dbt. 110 | 111 | 2. Bronze layer 112 | 113 | ![](./images/assets_bronze.png) 114 | 115 | Includes these assets: 116 | 117 | - bronze_book: Table `book` from MySQL; because it is too large (over 1.2 million rows), it is partitioned by year from 1975 to 2022. 118 | - bronze_genre: Table `genre` from MySQL. 119 | - bronze_book_genre: Table `book_genre` from MySQL. 120 | - bronze_book_download_link: Table `book_download_link` from MySQL. 121 | - bronze_images_and_files_download: Responsible for connecting to the Google Drive API, pulling the .epub files and images, and storing them in the datalake. 122 | 123 | 124 | 3. Silver layer 125 | 126 | ![](./images/assets_silver.png) 127 | 128 | Includes these assets: 129 | 130 | - silver_cleaned_book: Cleans data from the upstream `bronze_book`, partitioned to ensure `spark standalone mode` can run efficiently. 131 | - silver_collected_book: Collects missing data from upstream, such as authors, pages number, and description, from the `OpenLibrary API`. 132 | - silver_isbn: Extracts the isbn column from book to serve as a dependency for assets related to genre. 133 | - silver_cleaned_genre: Similar to `silver_cleaned_book`, but doesn't need partitioning as its size is not very large. 134 | - silver_collected_genre: Based on `silver_isbn`, collects missing genres for each book. If a book has no genre, it cannot be used for recommendations in subsequent tasks.
- silver_collected_book_genre: Establishes the n-n relationship between book and genre. 136 | 137 | 4. Gold layer 138 | 139 | ![](./images/assets_gold.png) 140 | 141 | Includes these assets: 142 | 143 | - gold_genre: Computes and sorts genres from upstream `silver_collected_genre`, while also saving them to minIO. 144 | - gold_book_genre: Similarly, from upstream `silver_collected_book_genre`. 145 | - gold_book_with_info: A split of the book table containing only basic information about the book such as ISBN, Name, Authors, Language, PagesNumber. 146 | - gold_book_with_publish: A split containing information about the publisher and publication time. 147 | - gold_book_with_rating: A split computing the different types of ratings. 148 | 149 | 5. Warehouse layer 150 | 151 | ![](./images/assets_warehouse.png) 152 | 153 | Loads the assets from the gold layer into Postgres, plus one asset from the bronze layer, book_download_link. 154 | 155 | In the future, the assets will be updated to add download links automatically from the Notion API, and a schedule will be set up. 156 | 157 | 6. Transform layer 158 | 159 | ![](./images/assets_dbt.png) 160 | 161 | Includes these assets: 162 | 163 | - search: Transforms information to create an index table, which will be queried when users search for books. 164 | - search_prior: Also an index table, but contains books that are given priority based on factors such as availability of download links, functionality of the OpenLibrary API, high ratings, etc. 165 | - criteria: Criteria used to query related books when searching for a specific book. 166 | 167 | ## 4. Setup 168 | 169 | ### 4.1 Prerequisites 170 | 171 | To develop this pipeline, download and install the following software: 172 | 173 | 1. [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) 174 | 2. [Docker](https://docs.docker.com/engine/install/) with at least 4GB RAM, 6 core CPU, 2GB swap, 16GB disk 175 | 3. [CMake](https://cmake.org/install/) if using a UNIX-based system (Linux/MacOS); check that `make --version` is already installed 176 | 4. Python 3.x (3.9.16 recommended as the Spark image runs on this version, installing via asdf is recommended) and a virtual environment (pipenv recommended) 177 | 5. A local machine that has freed the following ports: 3306, 5432, 9000, 9001, 3001, 8501, 4040, 7077, 8080, 3030 178 | 6. DBeaver or any other DB client (if not available, you can use the command line) 179 | 180 | If using Windows, set up WSL2 and a local Ubuntu virtual machine, then install the above software for Ubuntu. 181 | 182 | Clone the repository: 183 | 184 | ```bash 185 | git clone https://github.com/lelouvincx/goodreads-elt-pipeline.git project 186 | cd project 187 | ``` 188 | 189 | Download the csv dataset [here](https://www.kaggle.com/datasets/lelouvincx/goodreads-elt-pipeline?select=book.csv), then place it in `project/dataset`. 190 | 191 | ### 4.2 Setup Google Drive API 192 | 193 | First, we need to create OAuth 2.0 credentials in the [Google API Console](https://console.developers.google.com/). 194 | 195 | Select `create new project`: 196 | 197 | ![](./images/gdrive_1.png) 198 | 199 | Fill in the project's name (goodreads-elt_pipeline) and choose a location (default `No organization`). 200 | 201 | ![](./images/gdrive_2.png) 202 | 203 | After creating the project, select the `Library` tab: 204 | 205 | ![](./images/gdrive_3.png) 206 | 207 | Search for `Google Drive API` and enable it.
208 | 209 | ![](./images/gdrive_4.png) 210 | 211 | ![](./images/gdrive_5.png) 212 | 213 | Next, select the `OAuth consent screen` tab: 214 | 215 | ![](./images/gdrive_6.png) 216 | 217 | Fill in the information below: 218 | 219 | ![](./images/gdrive_7.png) 220 | 221 | In `scopes`, select `add or remove scopes`, look for the `Google Drive API, readonly` scope and tick it, then `save and continue` until the end. 222 | 223 | ![](./images/gdrive_8.png) 224 | 225 | Select the `credentials` tab -> `create credentials` -> `OAuth client ID`. 226 | 227 | ![](./images/gdrive_9.png) 228 | 229 | Select `Desktop app` and name it as you like (default: goodreads-elt-pipeline). 230 | 231 | ![](./images/gdrive_10.png) 232 | 233 | Download the JSON credentials file and place it in `project/elt_pipeline/elt_pipeline`. 234 | 235 | ![](./images/gdrive_11.png) 236 | 237 | ### 4.3 Setup local infrastructure 238 | 239 | Create the environment files: 240 | 241 | ```bash 242 | # Create env files 243 | touch .env 244 | cp env.template .env 245 | touch .spark_master.env 246 | cp spark_master.env.template .spark_master.env 247 | touch .spark_worker.env 248 | cp spark_worker.env.template .spark_worker.env 249 | ``` 250 | 251 | Then fill in the information in the env files above, for example: 252 | 253 | ```env 254 | # MySQL 255 | MYSQL_HOST=de_mysql 256 | MYSQL_PORT=3306 257 | MYSQL_DATABASE=goodreads 258 | MYSQL_USER=admin 259 | MYSQL_PASSWORD=admin123 260 | MYSQL_ROOT_PASSWORD=root123 261 | 262 | # PostgreSQL 263 | POSTGRES_HOST=de_psql 264 | POSTGRES_PORT=5432 265 | POSTGRES_USER=admin 266 | POSTGRES_PASSWORD=admin123 267 | POSTGRES_DB=goodreads 268 | POSTGRES_HOST_AUTH_METHOD=trust 269 | 270 | # Google Drive 271 | GDRIVE_CLIENT_SECRET_FILE=client_secret.json 272 | GDRIVE_PICKLE_FILE=token_drive_v3.pickle 273 | GDRIVE_API_NAME=drive 274 | GDRIVE_API_VERSION=v3 275 | GDRIVE_SCOPES=https://www.googleapis.com/auth/drive.readonly 276 | 277 | # Dagster 278 | DAGSTER_PG_HOSTNAME=de_psql 279 | DAGSTER_PG_USERNAME=admin 280 | DAGSTER_PG_PASSWORD=admin123 281 | DAGSTER_PG_DB=postgres 282 | DAGSTER_OVERALL_CONCURRENCY_LIMIT=1 283 | DAGSTER_HOME=/opt/dagster/dagster_home 284 | 285 | # dbt 286 | DBT_HOST=de_psql 287 | DBT_USER=admin 288 | DBT_PASSWORD=admin123 289 | DBT_DATABASE=goodreads 290 | DBT_SCHEMA=recommendations 291 | # MinIO 292 | MINIO_ENDPOINT=minio:9000 293 | MINIO_ROOT_USER=minio 294 | MINIO_ROOT_PASSWORD=minio123 295 | MINIO_ACCESS_KEY=minio 296 | MINIO_SECRET_KEY=minio123 297 | DATALAKE_BUCKET=lakehouse 298 | AWS_ACCESS_KEY_ID=minio 299 | AWS_SECRET_ACCESS_KEY=minio123 300 | AWS_REGION=us-east-1 301 | 302 | # MinIO client (mc) 303 | AWS_ACCESS_KEY_ID=minio 304 | AWS_SECRET_ACCESS_KEY=minio123 305 | AWS_REGION=us-east-1 306 | 307 | # Spark 308 | SPARK_MASTER_URL=spark://spark-master:7077 309 | SPARK_VERSION=3.3.2 310 | HADOOP_VERSION=3 311 | 312 | # Metabase 313 | MB_DB_TYPE=postgres 314 | MB_DB_DBNAME=goodreads 315 | MB_DB_PORT=5432 316 | MB_DB_USER=admin 317 | MB_DB_PASS=admin123 318 | MB_DB_HOST=de_psql 319 | MB_DB_FILE=/metabase_data/metabase.db 320 | ``` 321 | 322 | You can replace the information about user, password, etc. with your own values.
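For reference, here is a minimal sketch of how the `GDRIVE_*` variables above can be turned into an authenticated Google Drive client. This is only an illustration of the approach; the actual `gdrive_io_manager.py` in this repo may be implemented differently, and the helper name `create_drive_service` is hypothetical:

```python
import os
import pickle

from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build


def create_drive_service():
    """Illustrative sketch: build a Drive v3 client from the GDRIVE_* variables in .env."""
    scopes = os.getenv("GDRIVE_SCOPES", "").split(",")
    pickle_file = os.getenv("GDRIVE_PICKLE_FILE", "token_drive_v3.pickle")
    creds = None

    # Reuse a previously cached OAuth token if it exists
    if os.path.exists(pickle_file):
        with open(pickle_file, "rb") as token:
            creds = pickle.load(token)

    # Otherwise run the OAuth flow with the client_secret.json downloaded in section 4.2
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                os.getenv("GDRIVE_CLIENT_SECRET_FILE", "client_secret.json"), scopes
            )
            creds = flow.run_local_server(port=0)
        # Cache the token so later runs don't have to re-authenticate
        with open(pickle_file, "wb") as token:
            pickle.dump(creds, token)

    return build(
        os.getenv("GDRIVE_API_NAME", "drive"),
        os.getenv("GDRIVE_API_VERSION", "v3"),
        credentials=creds,
    )
```

The MinIO variables are consumed the same way (via `os.getenv`) inside the assets, as seen in `elt_pipeline/assets/silver.py`.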
323 | 324 | **For development only, do not use for production.** 325 | 326 | ```bash 327 | # DO NOT RUN BOTH OF THE COMMANDS BELOW, CHOOSE ONLY ONE 328 | # Setup python environment 329 | pipenv install 330 | # Or create a virtualenv and install manually from requirements.txt 331 | make install 332 | 333 | # Build docker images 334 | make build-dagster 335 | make build-spark 336 | make build-pipeline 337 | make build-streamlit 338 | 339 | # Run containers detached 340 | make up-bg 341 | 342 | # Check running containers 343 | docker compose ps -a 344 | 345 | # Check code quality 346 | make check 347 | make lint 348 | 349 | # Format pipelines 350 | black ./elt_pipeline 351 | 352 | # Test coverage 353 | make test 354 | ``` 355 | 356 | Check that there are 11 running services: 357 | 358 | ![](./images/docker_1.png) 359 | 360 | ![](./images/docker_2.png) 361 | 362 | **Ports**: 363 | 364 | - MySQL: 3306 365 | - PostgreSQL: 5432 366 | - Dagit: 3001 367 | - MinIO 368 | - UI: 9001 369 | - API: 9000 370 | - Spark master: 371 | - UI: 8080 372 | - API: 7077 373 | - Pipeline: 374 | - Spark jobs running: 4040 375 | - Metabase: 3030 376 | - Streamlit: 8501 377 | 378 | ### 4.4 Import data into MySQL 379 | 380 | Now we import the Goodreads dataset (in CSV format) into MySQL: 381 | 382 | ```bash 383 | make to_mysql_root 384 | ``` 385 | 386 | ```sql 387 | SET GLOBAL local_infile=TRUE; 388 | -- Check if local_infile was turned on 389 | SHOW VARIABLES LIKE "local_infile"; 390 | exit 391 | ``` 392 | 393 | ```bash 394 | # Create tables with schema 395 | make mysql_create 396 | 397 | # Load csv into created tables 398 | make mysql_load 399 | ``` 400 | 401 | ### 4.5 Create schema in Postgres 402 | 403 | ```bash 404 | make psql_create 405 | ``` 406 | 407 | ### 4.6 User interfaces 408 | 409 | 1. localhost:3001 - Dagit 410 | 2. localhost:4040 - Spark jobs 411 | 3. localhost:8080 - Spark master 412 | 4. localhost:9001 - MinIO 413 | 5. localhost:3030 - Metabase 414 | 6. localhost:8501 - Streamlit 415 | 416 | ## 5. Considerations 417 | 418 | Evaluation of the project: 419 | 420 | 1. Speed: `spark` is installed in standalone mode, so it does not achieve high performance and sometimes crashes in the middle of shuffle/read/write tasks. 421 | 2. Development environment: currently there is only a development environment; in the future, testing, staging, and production environments will be considered. 422 | 3. `dbt` is currently a small project; in the future, if more transformations are needed, it should be split into separate services with different permissions. 423 | 4. Deployment: use one of the cloud computing services such as AWS, Azure, or GCP. 424 | 425 | ## 6. Further actions 426 | 427 | 1. Complete the recommender system 428 | 2. Integrate Jupyter Notebook for DS tasks - [dagstermill](https://docs.dagster.io/integrations/dagstermill) 429 | 3. Testing environment 430 | 4.
Continuous Integration with GitHub Actions 431 | -------------------------------------------------------------------------------- /elt_pipeline/elt_pipeline/assets/silver.py: -------------------------------------------------------------------------------- 1 | from dagster import asset, AssetIn, Output, StaticPartitionsDefinition 2 | from datetime import datetime 3 | import polars as pl 4 | import requests 5 | import os 6 | 7 | from pyspark.sql import DataFrame 8 | 9 | from ..resources.spark_io_manager import get_spark_session 10 | from pyspark.sql.functions import udf, col, regexp_replace, lower, when 11 | 12 | 13 | COMPUTE_KIND = "PySpark" 14 | LAYER = "silver" 15 | YEARLY = StaticPartitionsDefinition( 16 | [str(year) for year in range(1975, datetime.today().year)] 17 | ) 18 | 19 | 20 | @udf 21 | def split_take_second(value): 22 | return value.split(":")[1] 23 | 24 | 25 | # Silver cleaned book 26 | @asset( 27 | description="Load book table from bronze layer in minIO, into a Spark dataframe, then clean data", 28 | partitions_def=YEARLY, 29 | ins={ 30 | "bronze_book": AssetIn( 31 | key_prefix=["bronze", "goodreads"], 32 | ), 33 | }, 34 | io_manager_key="spark_io_manager", 35 | key_prefix=["silver", "goodreads"], 36 | compute_kind=COMPUTE_KIND, 37 | group_name=LAYER, 38 | ) 39 | def silver_cleaned_book(context, bronze_book: pl.DataFrame): 40 | """ 41 | Load book table from bronze layer in minIO, into a Spark dataframe, then clean data 42 | """ 43 | 44 | config = { 45 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 46 | "minio_access_key": os.getenv("MINIO_ACCESS_KEY"), 47 | "minio_secret_key": os.getenv("MINIO_SECRET_KEY"), 48 | } 49 | 50 | context.log.debug("Start creating spark session") 51 | 52 | with get_spark_session(config, str(context.run.run_id).split("-")[0]) as spark: 53 | # Convert bronze_book from polars DataFrame to Spark DataFrame 54 | pandas_df = bronze_book.to_pandas() 55 | context.log.debug( 56 | f"Converted to pandas DataFrame with shape: {pandas_df.shape}" 57 | ) 58 | 59 | spark_df = spark.createDataFrame(pandas_df) 60 | spark_df.cache() 61 | context.log.info("Got Spark DataFrame") 62 | 63 | # Dedupe books 64 | spark_df = spark_df.dropDuplicates() 65 | # Drop rows with null value in column 'Name' 66 | spark_df = spark_df.na.drop(subset=["Name"]) 67 | # Drop rows with null values in column 'ISBN' 68 | spark_df = spark_df.na.drop(subset=["isbn"]) 69 | # Drop rows with null values in column 'Language' 70 | spark_df = spark_df.na.drop(subset=["Language"]) 71 | # Drop rows with value '--' in column 'Language' 72 | spark_df = spark_df.filter(spark_df.Language != "--") 73 | # Drop rows with value > 350 in column 'PagesNumber' 74 | spark_df = spark_df.filter(spark_df.PagesNumber <= 350) 75 | # Drop column 'CountsOfReview' (overlap with 'RatingDistTotal') 76 | spark_df = spark_df.drop("CountsOfReview") 77 | # Keep rows with 'PublishYear' from 1975 to datetime.today().year 78 | spark_df = spark_df.filter( 79 | (spark_df.PublishYear >= 1975) 80 | & (spark_df.PublishYear <= datetime.today().year) 81 | ) 82 | # Update value of column 'RatingDist...', splitting by ':' and taking the second value 83 | spark_df = spark_df.withColumn( 84 | "RatingDist5", split_take_second(col("RatingDist5")) 85 | ) 86 | spark_df = spark_df.withColumn( 87 | "RatingDist4", split_take_second(col("RatingDist4")) 88 | ) 89 | spark_df = spark_df.withColumn( 90 | "RatingDist3", split_take_second(col("RatingDist3")) 91 | ) 92 | spark_df = spark_df.withColumn( 93 | "RatingDist2", split_take_second(col("RatingDist2"))
94 | ) 95 | spark_df = spark_df.withColumn( 96 | "RatingDist1", split_take_second(col("RatingDist1")) 97 | ) 98 | spark_df = spark_df.withColumn( 99 | "RatingDistTotal", split_take_second(col("RatingDistTotal")) 100 | ) 101 | # Cast column 'RatingDist...' to Integer 102 | spark_df = spark_df.withColumn( 103 | "RatingDist5", spark_df.RatingDist5.cast("Integer") 104 | ) 105 | spark_df = spark_df.withColumn( 106 | "RatingDist4", spark_df.RatingDist4.cast("Integer") 107 | ) 108 | spark_df = spark_df.withColumn( 109 | "RatingDist3", spark_df.RatingDist3.cast("Integer") 110 | ) 111 | spark_df = spark_df.withColumn( 112 | "RatingDist2", spark_df.RatingDist2.cast("Integer") 113 | ) 114 | spark_df = spark_df.withColumn( 115 | "RatingDist1", spark_df.RatingDist1.cast("Integer") 116 | ) 117 | spark_df = spark_df.withColumn( 118 | "RatingDistTotal", spark_df.RatingDistTotal.cast("Integer") 119 | ) 120 | # Change column name 'Count of text reviews' to 'CountOfTextReviews' 121 | spark_df = spark_df.withColumnRenamed( 122 | "Count of text reviews", "CountOfTextReviews" 123 | ) 124 | # Change value of column 'Language' from ['en-US', 'en-GB', 'en-CA'], to 'eng', from 'nl' to 'nld' 125 | spark_df = spark_df.withColumn( 126 | "Language", regexp_replace("Language", "en-US", "eng") 127 | ) 128 | spark_df = spark_df.withColumn( 129 | "Language", regexp_replace("Language", "en-GB", "eng") 130 | ) 131 | spark_df = spark_df.withColumn( 132 | "Language", regexp_replace("Language", "en-CA", "eng") 133 | ) 134 | spark_df = spark_df.withColumn( 135 | "Language", regexp_replace("Language", "nl", "nld") 136 | ) 137 | 138 | spark_df.unpersist() 139 | 140 | return Output( 141 | value=spark_df, 142 | metadata={ 143 | "table": "silver_cleaned_book", 144 | "row_count": spark_df.count(), 145 | "column_count": len(spark_df.columns), 146 | "columns": spark_df.columns, 147 | }, 148 | ) 149 | 150 | 151 | # Silver cleaned genre 152 | @asset( 153 | description="Load genre table from bronze layer in minIO, into a Spark dataframe, then clean data", 154 | ins={ 155 | "bronze_genre": AssetIn( 156 | key_prefix=["bronze", "goodreads"], 157 | ), 158 | }, 159 | io_manager_key="spark_io_manager", 160 | key_prefix=["silver", "goodreads"], 161 | compute_kind=COMPUTE_KIND, 162 | group_name=LAYER, 163 | ) 164 | def silver_cleaned_genre(context, bronze_genre: pl.DataFrame): 165 | """ 166 | Load genre table from bronze layer in minIO, into a Spark dataframe, then clean data 167 | """ 168 | 169 | config = { 170 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 171 | "minio_access_key": os.getenv("MINIO_ACCESS_KEY"), 172 | "minio_secret_key": os.getenv("MINIO_SECRET_KEY"), 173 | } 174 | 175 | context.log.debug("Start creating spark session") 176 | 177 | with get_spark_session(config) as spark: 178 | pandas_df = bronze_genre.to_pandas() 179 | context.log.debug(f"Got pandas DataFrame with shape: {pandas_df.shape}") 180 | 181 | spark_df = spark.createDataFrame(pandas_df) 182 | spark_df.cache() 183 | context.log.info("Got Spark DataFrame") 184 | 185 | # Downcase the column 'Name' 186 | spark_df = spark_df.withColumn("Name", lower(col("Name"))) 187 | 188 | return Output( 189 | value=spark_df, 190 | metadata={ 191 | "table": "silver_cleaned_genre", 192 | "row_count": spark_df.count(), 193 | "column_count": len(spark_df.columns), 194 | "columns": spark_df.columns, 195 | }, 196 | ) 197 | 198 | 199 | # Silver collected book 200 | @asset( 201 | description="Collect more information about cleaned books, such as authors, number of pages", 202 |
partitions_def=YEARLY, 203 | ins={ 204 | "silver_cleaned_book": AssetIn( 205 | key_prefix=["silver", "goodreads"], 206 | metadata={"full_load": False}, 207 | ), 208 | }, 209 | io_manager_key="spark_io_manager", 210 | key_prefix=["silver", "goodreads"], 211 | compute_kind="OpenLibrary API", 212 | group_name=LAYER, 213 | ) 214 | def silver_collected_book(context, silver_cleaned_book: DataFrame) -> Output[DataFrame]: 215 | """ 216 | Collect more information about cleaned books 217 | - Authors: if missing 218 | - Number of pages: if missing 219 | """ 220 | 221 | spark_df = silver_cleaned_book 222 | context.log.debug("Caching spark_df ...") 223 | spark_df.cache() 224 | 225 | context.log.info("Starting filling missing data ...") 226 | null_authors_df = spark_df.filter( 227 | (spark_df.Authors.isNull()) | (spark_df.Authors == "") 228 | ) 229 | null_pages_number_df = spark_df.filter((spark_df.PagesNumber.isNull())) 230 | 231 | count = 0 232 | for row in null_authors_df.select("ISBN").collect(): 233 | isbn = row[0] 234 | context.log.debug(f"Got isbn: {isbn}") 235 | if isbn is not None: 236 | # Get request from OpenLibrary API 237 | req = requests.get( 238 | f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&format=json&jscmd=data" 239 | ) 240 | json = req.json() 241 | if len(json.keys()) > 0: 242 | context.log.debug("Got json with data") 243 | # Check if spark_df with column 'ISBN' = isbn has missing value in column 'Authors' 244 | row_from_df = spark_df.filter(spark_df.ISBN == isbn).collect()[0] 245 | if row_from_df.Authors is None or row_from_df.Authors == "": 246 | context.log.debug("Authors is missing, start filling ...") 247 | # Take the first author 248 | author = json.get(f"ISBN:{isbn}", {}).get("authors", []) 249 | author = author[0].get("name") if len(author) > 0 else None 250 | if author: 251 | count += 1 252 | # Update spark_df with column 'ISBN' = isbn and column 'Authors' = author 253 | spark_df = spark_df.withColumn( 254 | "Authors", 255 | when( 256 | (spark_df.ISBN == isbn) 257 | & ( 258 | (spark_df.Authors.isNull()) 259 | | (spark_df.Authors == "") 260 | ), 261 | author, 262 | ).otherwise(spark_df.Authors), 263 | ) 264 | context.log.info(f"Filled in {count} authors") 265 | 266 | count = 0 267 | for row in null_pages_number_df.select("ISBN").collect(): 268 | isbn = row[0] 269 | context.log.debug(f"Got isbn: {isbn}") 270 | if isbn is not None: 271 | # Get request from OpenLibrary API 272 | req = requests.get( 273 | f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&format=json&jscmd=data" 274 | ) 275 | json = req.json() 276 | if len(json.keys()) > 0: 277 | context.log.debug("Got json with real data") 278 | # Get the row of spark_df with column 'ISBN' = isbn 279 | row_from_df = spark_df.filter(spark_df.ISBN == isbn).collect()[0] 280 | # Check if spark_df with column 'ISBN' = isbn has missing value in column 'PagesNumber' 281 | if row_from_df.PagesNumber is None or row_from_df.PagesNumber == 0: 282 | context.log.debug("PagesNumber is missing, start filling ...") 283 | # Take the number of pages 284 | pages_number = json.get(f"ISBN:{isbn}", {}).get("number_of_pages") 285 | if pages_number: 286 | count += 1 287 | # Update spark_df with column 'ISBN' = isbn and column 'PagesNumber' = pages_number 288 | spark_df = spark_df.withColumn( 289 | "PagesNumber", 290 | when( 291 | (spark_df.ISBN == isbn) 292 | & (spark_df.PagesNumber.isNull()), 293 | pages_number, 294 | ).otherwise(spark_df.PagesNumber), 295 | ) 296 | context.log.info(f"Filled in
{count} page numbers") 297 | 298 | spark_df.unpersist() 299 | 300 | return Output( 301 | value=spark_df, 302 | metadata={ 303 | "table": "silver_collected_book", 304 | "row_count": spark_df.count(), 305 | "column_count": len(spark_df.columns), 306 | "columns": spark_df.columns, 307 | }, 308 | ) 309 | 310 | 311 | # ISBN 312 | @asset( 313 | description="Extract column 'ISBN' from silver_cleaned_book", 314 | ins={ 315 | "silver_cleaned_book": AssetIn( 316 | key_prefix=["silver", "goodreads"], 317 | metadata={"full_load": True}, 318 | ), 319 | }, 320 | io_manager_key="spark_io_manager", 321 | key_prefix=["silver", "goodreads"], 322 | compute_kind=COMPUTE_KIND, 323 | group_name=LAYER, 324 | ) 325 | def silver_isbn(context, silver_cleaned_book: DataFrame) -> Output[DataFrame]: 326 | """ 327 | Extract column 'ISBN' from cleaned book 328 | """ 329 | 330 | context.log.debug("Extracting ISBN ...") 331 | spark_df = silver_cleaned_book.select("ISBN") 332 | 333 | return Output( 334 | value=spark_df, 335 | metadata={ 336 | "table": "silver_isbn", 337 | "row_count": spark_df.count(), 338 | "column_count": len(spark_df.columns), 339 | "columns": spark_df.columns, 340 | }, 341 | ) 342 | 343 | 344 | # Silver collected genre 345 | @asset( 346 | description="Collect more information about cleaned genres", 347 | ins={ 348 | "silver_isbn": AssetIn( 349 | key_prefix=["silver", "goodreads"], 350 | ), 351 | "silver_cleaned_genre": AssetIn( 352 | key_prefix=["silver", "goodreads"], 353 | ), 354 | }, 355 | io_manager_key="spark_io_manager", 356 | key_prefix=["silver", "goodreads"], 357 | compute_kind="OpenLibrary API", 358 | group_name=LAYER, 359 | ) 360 | def silver_collected_genre( 361 | context, silver_isbn: DataFrame, silver_cleaned_genre: DataFrame 362 | ) -> Output[DataFrame]: 363 | """ 364 | Collect more information about cleaned genres, with upstream isbn 365 | Connect to OpenLibrary API to get more information about genre, 366 | and union to silver_cleaned_genre, unique by 'Name' 367 | """ 368 | # NOTE: the OpenLibrary enrichment described above is not implemented yet; the cleaned genres are passed through unchanged 369 | return Output( 370 | value=silver_cleaned_genre, 371 | metadata={ 372 | "table": "silver_collected_genre", 373 | "row_count": silver_cleaned_genre.count(), 374 | "column_count": len(silver_cleaned_genre.columns), 375 | "columns": silver_cleaned_genre.columns, 376 | }, 377 | ) 378 | 379 | 380 | # Silver collected book_genre 381 | @asset( 382 | description="Collect more relationships about books and genres", 383 | ins={ 384 | "silver_isbn": AssetIn( 385 | key_prefix=["silver", "goodreads"], 386 | ), 387 | "silver_collected_genre": AssetIn( 388 | key_prefix=["silver", "goodreads"], 389 | ), 390 | "bronze_book_genre": AssetIn( 391 | key_prefix=["bronze", "goodreads"], 392 | ), 393 | }, 394 | io_manager_key="spark_io_manager", 395 | key_prefix=["silver", "goodreads"], 396 | compute_kind="OpenLibrary API", 397 | group_name=LAYER, 398 | ) 399 | def silver_collected_book_genre( 400 | context, 401 | silver_isbn: DataFrame, 402 | silver_collected_genre: DataFrame, 403 | bronze_book_genre: pl.DataFrame, 404 | ) -> Output[DataFrame]: 405 | """ 406 | Collect more relationships about books and genres 407 | """ 408 | 409 | config = { 410 | "endpoint_url": os.getenv("MINIO_ENDPOINT"), 411 | "minio_access_key": os.getenv("MINIO_ACCESS_KEY"), 412 | "minio_secret_key": os.getenv("MINIO_SECRET_KEY"), 413 | } 414 | 415 | context.log.debug("Start creating spark session") 416 | 417 | # Convert bronze_book_genre from polars DataFrame to Spark DataFrame 418 | pandas_df = bronze_book_genre.to_pandas() 419 | context.log.debug(f"Converted to
pandas DataFrame with shape: {pandas_df.shape}") 420 | 421 | with get_spark_session(config) as spark: 422 | spark_df = spark.createDataFrame(pandas_df) 423 | spark_df.cache() 424 | context.log.info("Got Spark DataFrame") 425 | 426 | return Output( 427 | value=spark_df, 428 | metadata={ 429 | "table": "silver_collected_book_genre", 430 | "row_count": spark_df.count(), 431 | "column_count": len(spark_df.columns), 432 | "columns": spark_df.columns, 433 | }, 434 | ) 435 | -------------------------------------------------------------------------------- /preprocess.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import polars as pl" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "(1850310, 20)" 21 | ] 22 | }, 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "books_df = pl.read_csv(\"dataset/full_dataset.csv\")\n", 30 | "books_df.shape" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "(1850198, 20)" 42 | ] 43 | }, 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "output_type": "execute_result" 47 | } 48 | ], 49 | "source": [ 50 | "books_df = books_df.unique()\n", 51 | "books_df.shape # Reduce 224/2 = 112 rows" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "low = books_df.filter(books_df['PublishYear'] < 1700).shape[0]\n", 61 | "high = books_df.filter(books_df['PublishYear'] > 2021).shape[0]" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 5, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "(1850149, 20)" 73 | ] 74 | }, 75 | "execution_count": 5, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "books_df = books_df[low:len(books_df)-high]\n", 82 | "books_df.shape" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 6, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "books_df = books_df.with_columns(books_df['Language'].str.replace('en-US', 'eng')\n", 92 | " .str.replace('en-GB', 'eng')\n", 93 | " .str.replace('en-CA', 'eng')\n", 94 | " .str.replace('--', 'eng')\n", 95 | " .str.replace('nl', 'nld')\n", 96 | " )" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 7, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/html": [ 107 | "
\n", 113 | "shape: (3, 2)
Languagecounts
stru32
"vie"2
null1598356
"eng"209656
" 114 | ], 115 | "text/plain": [ 116 | "shape: (3, 2)\n", 117 | "┌──────────┬─────────┐\n", 118 | "│ Language ┆ counts │\n", 119 | "│ --- ┆ --- │\n", 120 | "│ str ┆ u32 │\n", 121 | "╞══════════╪═════════╡\n", 122 | "│ vie ┆ 2 │\n", 123 | "│ null ┆ 1598356 │\n", 124 | "│ eng ┆ 209656 │\n", 125 | "└──────────┴─────────┘" 126 | ] 127 | }, 128 | "execution_count": 7, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "books_df = books_df.filter((books_df['Language'] == 'eng') | (books_df['Language'] == 'vie') | books_df['Language'].is_null())\n", 135 | "books_df['Language'].value_counts()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 8, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "(1808014, 20)" 147 | ] 148 | }, 149 | "execution_count": 8, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "books_df.shape" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 9, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "(1803163, 20)" 167 | ] 168 | }, 169 | "execution_count": 9, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "books_df = books_df.drop_nulls(\"ISBN\")\n", 176 | "books_df.shape" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 10, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "419030" 188 | ] 189 | }, 190 | "execution_count": 10, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "books_df = books_df.sort('PagesNumber')\n", 197 | "n = books_df.filter(books_df['PagesNumber'] > 350).shape[0]\n", 198 | "n" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 11, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "(1384133, 20)" 210 | ] 211 | }, 212 | "execution_count": 11, 213 | "metadata": {}, 214 | "output_type": "execute_result" 215 | } 216 | ], 217 | "source": [ 218 | "books_df = books_df[:len(books_df)-n]\n", 219 | "books_df.shape" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 12, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "data": { 229 | "text/plain": [ 230 | "(1245688, 20)" 231 | ] 232 | }, 233 | "execution_count": 12, 234 | "metadata": {}, 235 | "output_type": "execute_result" 236 | } 237 | ], 238 | "source": [ 239 | "books_df = books_df.unique(subset=['Name'])\n", 240 | "books_df.shape" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 13, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/html": [ 251 | "
\n", 257 | "shape: (1245688, 20)
IdNameAuthorsISBNRatingPublishYearPublishMonthPublishDayPublisherRatingDist5RatingDist4RatingDist3RatingDist2RatingDist1RatingDistTotalCountsOfReviewLanguageDescriptionCount of text reviewsPagesNumber
i64strstrstrf64i64i64i64strstrstrstrstrstrstri64strstrstri64
1900656"Good In Bed""Jennifer Weine…"0743508467"3.73200151"Simon & Schust…"5:78557""4:92923""3:71496""2:21959""1:13187""total:278122"0null"Jennifer Weine…null0
1900780"Letters from N…"Christopher J.…"1400105390"3.622007925"Tantor Media""5:28""4:70""3:50""2:15""1:4""total:167"1null"Senator Christ…null0
1900805"Tuesdays With …"Mitch Albom""0739311115"4.11200461"Random House A…"5:333959""4:249424""3:131086""2:36787""1:14351""total:765607"1null"This true stor…null0
1902872"The Bicycle Bo…"Geoff Apps""051708743X"2.01993214"Crescent""5:0""4:0""3:0""2:1""1:0""total:1"1nullnullnull0
1903429"Fair-Weather L…"Carla Bracale""0804102406"3.611989130"Ivy Books""5:21""4:15""3:24""2:11""1:1""total:72"0null"Sink or swim..…null0
1903542"The Adversity …"Paul G. Stoltz…"1400103584"3.98200711"Tantor Media""5:55""4:66""3:36""2:7""1:3""total:167"0null"A <i>Wall Stre…null0
1904282"Tell The World…"Liu Binyan""0517088355"3.671992816"Random House V…"5:1""4:4""3:4""2:0""1:0""total:9"1nullnullnull0
1904406"Hh-Big Red Bar…"Happy House""0394823915"0.01989419"Random House B…"5:0""4:0""3:0""2:0""1:0""total:0"0nullnullnull0
1904941"The Fourth K""Mario Puzo""0679423427"3.58199136"Random House A…"5:623""4:1085""3:1045""2:341""1:77""total:3171"0null"<b>A PRESIDENT…null0
1905496"1988 Baseball …"Donruss, Fleer…"0517660431"0.01988530"Beekman House""5:0""4:0""3:0""2:0""1:0""total:0"0nullnullnull0
1905683"The Princess a…"George MacDona…"140015085X"4.07200361"Tantor Media""5:3196""4:2851""3:1669""2:349""1:97""total:8162"0null"In this sequel…null0
1905755"Our Only May A…"Jennifer L. Ho…"0807282340"3.82200052"Listening Libr…"5:2178""4:2928""3:2062""2:506""1:169""total:7843"3null"Twelve-year-ol…null0
1185341"The Illinois …"Michael P. Con…"0875801285"3.5198818"Northern Illin…"5:0""4:1""3:1""2:0""1:0""total:2"0nullnull"0"350
1185950"At the End of …"Carole Minard""141373801X"0.02004126"America Star B…"5:0""4:0""3:0""2:0""1:0""total:0"0null"After being hu…"0"350
1185963"Immigration In…"Donald J. Puch…"1855674513"0.0199714"Cassell""5:0""4:0""3:0""2:0""1:0""total:0"0null"The essays in …"0"350
1187409"Performance-Ba…"Joan Boykoff B…"0226038033"0.01996124"The National S…"5:0""4:0""3:0""2:0""1:0""total:0"0null"Reforming our …"0"350
1187807"Community Jour…"Jock Lauterer""0813802865"0.02000174"Wiley-Blackwel…"5:0""4:0""3:0""2:0""1:0""total:0"0nullnull"0"350
1187954"I Am Dracula""C. Dean Anders…"0821760254"3.891998110"Zebra""5:49""4:33""3:18""2:10""1:8""total:118"12null"From the Carpa…"12"350
1194027"The Male Parad…"John Munder""0671705172"3.331992110"Simon & Schust…"5:0""4:1""3:2""2:0""1:0""total:3"1null"In chapters dr…"1"350
1195221"The Lamentable…"Edgardo Vega Y…"1585676306"3.812004111"Overlook Press…"5:27""4:33""3:22""2:5""1:4""total:91"1null"<br />Writing …"1"350
1195839"Catalogue of P…"Lane Poole""1851240764"0.02008157"Bodleian Libra…"5:0""4:0""3:0""2:0""1:0""total:0"0null"Fully illustra…"0"350
1198445"The Lancastria…"Simon Walker""0198201745"4.7519903112"Oxford Univers…"5:3""4:1""3:0""2:0""1:0""total:4"1null"John of Gaunt …"1"350
1199215"Call to Arms (…"Livia Hallam""1581824793"0.02005110"Cumberland Hou…"5:0""4:0""3:0""2:0""1:0""total:0"0null"On December 20…"0"350
1199788"Hong Kong (Jak…"Stephen Coonts…"0312253397"3.872000129"St. Martin's P…"5:508""4:721""3:472""2:93""1:28""total:1822"49"eng"null"49"350
" 258 | ], 259 | "text/plain": [ 260 | "shape: (1245688, 20)\n", 261 | "┌─────────┬────────────┬───────────┬───────────┬───┬──────────┬───────────┬────────────┬───────────┐\n", 262 | "│ Id ┆ Name ┆ Authors ┆ ISBN ┆ … ┆ Language ┆ Descripti ┆ Count of ┆ PagesNumb │\n", 263 | "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ on ┆ text ┆ er │\n", 264 | "│ i64 ┆ str ┆ str ┆ str ┆ ┆ str ┆ --- ┆ reviews ┆ --- │\n", 265 | "│ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ --- ┆ i64 │\n", 266 | "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ │\n", 267 | "╞═════════╪════════════╪═══════════╪═══════════╪═══╪══════════╪═══════════╪════════════╪═══════════╡\n", 268 | "│ 1900656 ┆ Good In ┆ Jennifer ┆ 074350846 ┆ … ┆ null ┆ Jennifer ┆ null ┆ 0 │\n", 269 | "│ ┆ Bed ┆ Weiner ┆ 7 ┆ ┆ ┆ Weiner's ┆ ┆ │\n", 270 | "│ ┆ ┆ ┆ ┆ ┆ ┆ Good ┆ ┆ │\n", 271 | "│ ┆ ┆ ┆ ┆ ┆ ┆ in Bed… ┆ ┆ │\n", 272 | "│ 1900780 ┆ Letters ┆ Christoph ┆ 140010539 ┆ … ┆ null ┆ Senator ┆ null ┆ 0 │\n", 273 | "│ ┆ from ┆ er J. ┆ 0 ┆ ┆ ┆ Christoph ┆ ┆ │\n", 274 | "│ ┆ Nuremberg: ┆ Dodd ┆ ┆ ┆ ┆ er J. ┆ ┆ │\n", 275 | "│ ┆ My Fathe… ┆ ┆ ┆ ┆ ┆ Dodd ┆ ┆ │\n", 276 | "│ ┆ ┆ ┆ ┆ ┆ ┆ (Con… ┆ ┆ │\n", 277 | "│ 1900805 ┆ Tuesdays ┆ Mitch ┆ 073931111 ┆ … ┆ null ┆ This true ┆ null ┆ 0 │\n", 278 | "│ ┆ With ┆ Albom ┆ 5 ┆ ┆ ┆ story ┆ ┆ │\n", 279 | "│ ┆ Morrie: An ┆ ┆ ┆ ┆ ┆ about the ┆ ┆ │\n", 280 | "│ ┆ Old Man… ┆ ┆ ┆ ┆ ┆ love b… ┆ ┆ │\n", 281 | "│ 1902872 ┆ The ┆ Geoff ┆ 051708743 ┆ … ┆ null ┆ null ┆ null ┆ 0 │\n", 282 | "│ ┆ Bicycle ┆ Apps ┆ X ┆ ┆ ┆ ┆ ┆ │\n", 283 | "│ ┆ Book: ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 284 | "│ ┆ Complete ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 285 | "│ ┆ Maint… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 286 | "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", 287 | "│ 1195839 ┆ Catalogue ┆ Lane ┆ 185124076 ┆ … ┆ null ┆ Fully ill ┆ 0 ┆ 350 │\n", 288 | "│ ┆ of ┆ Poole ┆ 4 ┆ ┆ ┆ ustrated, ┆ ┆ │\n", 289 | "│ ┆ Portraits ┆ ┆ ┆ ┆ ┆ with ┆ ┆ │\n", 290 | "│ ┆ in the Bo… ┆ ┆ ┆ ┆ ┆ full-pag… ┆ ┆ │\n", 291 | "│ 1198445 ┆ The Lancas ┆ Simon ┆ 019820174 ┆ … ┆ null ┆ John of ┆ 1 ┆ 350 │\n", 292 | "│ ┆ trian ┆ Walker ┆ 5 ┆ ┆ ┆ Gaunt was ┆ ┆ │\n", 293 | "│ ┆ Affinity ┆ ┆ ┆ ┆ ┆ arguably ┆ ┆ │\n", 294 | "│ ┆ 1361 13… ┆ ┆ ┆ ┆ ┆ the m… ┆ ┆ │\n", 295 | "│ 1199215 ┆ Call to ┆ Livia ┆ 158182479 ┆ … ┆ null ┆ On ┆ 0 ┆ 350 │\n", 296 | "│ ┆ Arms ┆ Hallam ┆ 3 ┆ ┆ ┆ December ┆ ┆ │\n", 297 | "│ ┆ (Palmetto ┆ ┆ ┆ ┆ ┆ 20, 1860, ┆ ┆ │\n", 298 | "│ ┆ Trilogy, … ┆ ┆ ┆ ┆ ┆ two ┆ ┆ │\n", 299 | "│ ┆ ┆ ┆ ┆ ┆ ┆ friend… ┆ ┆ │\n", 300 | "│ 1199788 ┆ Hong Kong ┆ Stephen ┆ 031225339 ┆ … ┆ eng ┆ null ┆ 49 ┆ 350 │\n", 301 | "│ ┆ (Jake ┆ Coonts ┆ 7 ┆ ┆ ┆ ┆ ┆ │\n", 302 | "│ ┆ Grafton ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 303 | "│ ┆ #8) ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", 304 | "└─────────┴────────────┴───────────┴───────────┴───┴──────────┴───────────┴────────────┴───────────┘" 305 | ] 306 | }, 307 | "execution_count": 13, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "books_df" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 14, 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "(1245577, 20)" 325 | ] 326 | }, 327 | "execution_count": 14, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "books_df = books_df.filter((books_df['PublishYear'] >= 1900) & (books_df['PublishYear'] <= 2021))\n", 334 | "books_df.shape" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 15, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "data": { 344 | "text/plain": [ 345 | "0" 346 | ] 347 | }, 348 | "execution_count": 15, 349 | "metadata": {}, 350 | 
"output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "books_df['PublishYear'].null_count()" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 23, 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/html": [ 365 | "
\n", 371 | "shape: (31, 2)
PublishMonthcounts
i64u32
852984
247843
166727
749209
3117359
1523649
237297
656532
147193
227444
3013765
217584
197474
1150881
277896
360740
267817
249068
1062994
186916
259656
965666
1415458
1710259
" 372 | ], 373 | "text/plain": [ 374 | "shape: (31, 2)\n", 375 | "┌──────────────┬────────┐\n", 376 | "│ PublishMonth ┆ counts │\n", 377 | "│ --- ┆ --- │\n", 378 | "│ i64 ┆ u32 │\n", 379 | "╞══════════════╪════════╡\n", 380 | "│ 8 ┆ 52984 │\n", 381 | "│ 24 ┆ 7843 │\n", 382 | "│ 16 ┆ 6727 │\n", 383 | "│ 7 ┆ 49209 │\n", 384 | "│ … ┆ … │\n", 385 | "│ 25 ┆ 9656 │\n", 386 | "│ 9 ┆ 65666 │\n", 387 | "│ 1 ┆ 415458 │\n", 388 | "│ 17 ┆ 10259 │\n", 389 | "└──────────────┴────────┘" 390 | ] 391 | }, 392 | "execution_count": 23, 393 | "metadata": {}, 394 | "output_type": "execute_result" 395 | } 396 | ], 397 | "source": [ 398 | "books_df['PublishMonth'].value_counts()" 399 | ] 400 | } 401 | ], 402 | "metadata": { 403 | "kernelspec": { 404 | "display_name": "project-zhl6RxJh", 405 | "language": "python", 406 | "name": "python3" 407 | }, 408 | "language_info": { 409 | "codemirror_mode": { 410 | "name": "ipython", 411 | "version": 3 412 | }, 413 | "file_extension": ".py", 414 | "mimetype": "text/x-python", 415 | "name": "python", 416 | "nbconvert_exporter": "python", 417 | "pygments_lexer": "ipython3", 418 | "version": "3.10.6" 419 | }, 420 | "orig_nbformat": 4 421 | }, 422 | "nbformat": 4, 423 | "nbformat_minor": 2 424 | } 425 | --------------------------------------------------------------------------------