├── .gitattributes ├── sql └── dwh │ ├── job_market_analytics │ ├── seeds │ │ └── .gitkeep │ ├── tests │ │ └── .gitkeep │ ├── analyses │ │ └── .gitkeep │ ├── macros │ │ └── .gitkeep │ ├── snapshots │ │ └── .gitkeep │ ├── .gitignore │ ├── models │ │ ├── mart │ │ │ ├── normalized_online_job_months_max.sql │ │ │ ├── normalized_online_job_months_1.sql │ │ │ ├── normalized_online_job_months_3.sql │ │ │ ├── normalized_online_job_months_12.sql │ │ │ ├── latest_dim_job.sql │ │ │ ├── dim_time.sql │ │ │ ├── dim_job.sql │ │ │ ├── fact_online_job.sql │ │ │ ├── dim_job_location.sql │ │ │ ├── dim_job_technology.sql │ │ │ └── normalized_online_job.sql │ │ └── sources.yml │ ├── README.md │ └── dbt_project.yml │ ├── requirements.in │ ├── update_requirements.sh │ └── requirements.txt ├── docker ├── airflow │ ├── logs │ │ └── scheduler │ │ │ └── latest │ ├── docker-compose-down.sh │ ├── restart_worker_and_scheduler.sh │ ├── .env.example │ └── docker-compose.yml └── postgres │ ├── postgres-parquet-fdw │ ├── s3-download-parquet-fdw.sh │ ├── Dockerfile │ ├── s4-install-parquet-fdw.sh │ ├── s1-download-arrow.sh │ └── s2-install-arrow.sh │ ├── .env.example │ ├── README.md │ └── docker-compose.yml ├── doc ├── dbt-dag.png ├── airflow_dag_daily.png ├── scrape_data_source_dag.png ├── raw-in-azure-blob-storage.png ├── TODO-search-document-structure.json ├── TODO-search.md ├── TODO-search-pre-search-data-model.md ├── metaData-bag.log └── TODO.md ├── python ├── dashy │ ├── .env.example │ ├── requirements.in │ ├── start_dashy.sh │ ├── update_requirements.sh │ └── requirements.txt ├── utils │ ├── generate_fernet_key.py │ ├── migrate_to_raw_v3.py │ └── migrate_raw_v1_to_raw_v2.py ├── tests │ ├── test_get_run_timestamp.py │ ├── test_get_chunk_size.py │ ├── test_parse_job_description.py │ └── data │ │ └── normalize_job_description │ │ └── output │ │ ├── test_case_7610222.json │ │ ├── test_case_7610188.json │ │ └── test_case_7609275.json ├── airflow │ ├── start_airflow_scheduler.sh │ ├── start_airflow_webserver.sh │ ├── create_user.sh │ ├── airflow_home │ │ └── dags │ │ │ ├── common_airflow_dag.py │ │ │ ├── test_dag.py │ │ │ ├── job_market_analytics_curate_sitemaps_catch_up_dag.py │ │ │ ├── job_market_analytics_cleanse_sitemaps_catch_up_dag.py │ │ │ ├── job_market_analytics_curate_job_descriptions_catch_up_dag.py │ │ │ ├── job_market_analytics_cleanse_job_descriptions_catch_up_dag.py │ │ │ ├── job_market_analytics_cleanse_catch_up_dag.py │ │ │ ├── job_market_analytics_curate_catch_up_dag_v2.py │ │ │ ├── job_market_analytics_hourly_dag.py │ │ │ ├── job_market_analytics_daily_dag.py │ │ │ └── job_market_analytics_daily_dag_catch_up.py │ ├── .env.example │ ├── configure_posgresql.sh │ └── install_airflow.sh ├── simplescraper │ ├── do_dbt_run.sh │ ├── start_flasky.sh │ ├── start_dashy_static.sh │ ├── requirements.in │ ├── common │ │ ├── logging.py │ │ ├── chunking.py │ │ ├── explore.py │ │ ├── entity.py │ │ ├── webclient.py │ │ ├── env_variables.py │ │ └── storage.py │ ├── cron_job.sh │ ├── update_requirements.sh │ ├── tasks │ │ ├── list_downloaded_sitemaps.py │ │ ├── curate_sitemaps.py │ │ ├── list_job_descriptions_to_download.py │ │ ├── prune_old_raw.py │ │ ├── list_downloaded_job_descriptions.py │ │ ├── cleanse_sitemaps.py │ │ ├── cleanse_job_descriptions.py │ │ ├── parse_job_description.py │ │ ├── download_sitemap.py │ │ ├── curate_job_descriptions.py │ │ └── download_job_descriptions.py │ ├── do_day_backup.sh │ ├── dashy_static.py │ ├── .env.example │ ├── create_curated_views_in_dwh.py │ ├── restore_day_backup.sh │ ├── 
scrape_data_source.py │ ├── verify_day_backup.sh │ ├── verify_all_backups.py │ ├── restore_all_backups.py │ ├── do_all_backups.py │ ├── explore │ │ ├── explore_dwh_mart.ipynb │ │ ├── explore_dwh_mart_dim_time.ipynb │ │ └── explore_dwh_location.ipynb │ ├── flasky.py │ └── requirements.txt └── .flake8 ├── Brewfile ├── azure ├── .env.example └── sync-remote-to-local.sh ├── .gitignore └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-vendored -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sql/dwh/requirements.in: -------------------------------------------------------------------------------- 1 | dbt-duckdb==1.5.1 2 | duckdb==0.7.0 3 | -------------------------------------------------------------------------------- /docker/airflow/logs/scheduler/latest: -------------------------------------------------------------------------------- 1 | /opt/airflow/logs/scheduler/2022-07-30 -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /doc/dbt-dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petracarrion/job-market-analytics/HEAD/doc/dbt-dag.png -------------------------------------------------------------------------------- /doc/airflow_dag_daily.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petracarrion/job-market-analytics/HEAD/doc/airflow_dag_daily.png -------------------------------------------------------------------------------- /python/dashy/.env.example: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export DUCKDB_DWH_FILE= 4 | export VENV_ACTIVATE= 5 | export LOG_FOLDER= 6 | -------------------------------------------------------------------------------- /doc/scrape_data_source_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petracarrion/job-market-analytics/HEAD/doc/scrape_data_source_dag.png -------------------------------------------------------------------------------- 
/doc/raw-in-azure-blob-storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petracarrion/job-market-analytics/HEAD/doc/raw-in-azure-blob-storage.png -------------------------------------------------------------------------------- /python/utils/generate_fernet_key.py: -------------------------------------------------------------------------------- 1 | from cryptography.fernet import Fernet 2 | 3 | fernet_key = Fernet.generate_key() 4 | print(fernet_key.decode()) 5 | -------------------------------------------------------------------------------- /python/dashy/requirements.in: -------------------------------------------------------------------------------- 1 | dash 2 | dash-bootstrap-components 3 | duckdb==0.7.0 4 | gunicorn 5 | jupyter-dash 6 | loguru 7 | pandas 8 | python-dotenv 9 | -------------------------------------------------------------------------------- /Brewfile: -------------------------------------------------------------------------------- 1 | tap "homebrew/bundle" 2 | tap "homebrew/core" 3 | brew "openblas" 4 | brew "parquet-tools" 5 | brew "postgresql" 6 | brew "rdfind" 7 | brew "rust" 8 | brew "wget" 9 | -------------------------------------------------------------------------------- /docker/airflow/docker-compose-down.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | docker compose down 7 | -------------------------------------------------------------------------------- /docker/postgres/postgres-parquet-fdw/s3-download-parquet-fdw.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt-get update 4 | apt-get install -y git 5 | 6 | git clone https://github.com/adjust/parquet_fdw.git 7 | -------------------------------------------------------------------------------- /azure/.env.example: -------------------------------------------------------------------------------- 1 | RAW_DIR= 2 | 3 | AZURE_STORAGE_CONTAINER_RAW_DIR_URL= 4 | 5 | export AZCOPY_AUTO_LOGIN_TYPE=SPN 6 | export AZCOPY_SPA_APPLICATION_ID= 7 | export AZCOPY_SPA_CLIENT_SECRET= 8 | export AZCOPY_TENANT_ID= -------------------------------------------------------------------------------- /python/tests/test_get_run_timestamp.py: -------------------------------------------------------------------------------- 1 | from common.storage import get_load_timestamp 2 | 3 | 4 | def test_get_load_timestamp(): 5 | assert get_load_timestamp('2022-01-22T12:49:39.448434+00:00') == '2022/01/22/12-49-39' 6 | -------------------------------------------------------------------------------- /docker/postgres/.env.example: -------------------------------------------------------------------------------- 1 | POSTGRES_USER= 2 | POSTGRES_PASSWORD= 3 | POSTGRES_DB= 4 | POSTGRES_VOLUME= 5 | POSTGRES_PARQUET_FDW_VOLUME= 6 | 7 | PGADMIN_DEFAULT_EMAIL= 8 | PGADMIN_DEFAULT_PASSWORD= 9 | PGADMIN_VOLUME= 10 | -------------------------------------------------------------------------------- /docker/postgres/README.md: -------------------------------------------------------------------------------- 1 | # Infrastructure 2 | 3 | ## How to run it 4 | 5 | Go to the folder postgres-parquet-fdw and run: 6 | 7 | `docker build -t postgres-parquet-fdw:v1 .` 8 | 9 | Then run: 10 | 11 | `docker-compose up` 
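## How to use it

Once the containers are up, the foreign data wrapper still has to be enabled inside the database. A minimal sketch from `psql` (the column list and the parquet file name are placeholders; the path assumes the `POSTGRES_PARQUET_FDW_VOLUME` mount defined in `docker-compose.yml`):

`CREATE EXTENSION parquet_fdw;`

`CREATE SERVER parquet_srv FOREIGN DATA WRAPPER parquet_fdw;`

`CREATE FOREIGN TABLE online_job (job_id BIGINT, online_at DATE, url TEXT) SERVER parquet_srv OPTIONS (filename '/var/lib/parquet-fdw/data/online_job.parquet');`

Depending on the connecting role, a `CREATE USER MAPPING ... SERVER parquet_srv` statement may also be needed.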
-------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/normalized_online_job_months_max.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='view' 4 | ) 5 | }} 6 | 7 | SELECT * 8 | FROM {{ ref('normalized_online_job') }} 9 | ORDER BY online_at 10 | -------------------------------------------------------------------------------- /python/airflow/start_airflow_scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | airflow scheduler 11 | -------------------------------------------------------------------------------- /python/airflow/start_airflow_webserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | airflow webserver 11 | -------------------------------------------------------------------------------- /docker/postgres/postgres-parquet-fdw/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM postgres:14.1 as postgres-parquet-fdw 2 | 3 | COPY *.sh /usr/local/bin/ 4 | 5 | RUN s1-download-arrow.sh 6 | RUN s2-install-arrow.sh 7 | RUN s3-download-parquet-fdw.sh 8 | RUN s4-install-parquet-fdw.sh 9 | -------------------------------------------------------------------------------- /docker/postgres/postgres-parquet-fdw/s4-install-parquet-fdw.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt-get update 4 | apt-get -y install \ 5 | build-essential \ 6 | cmake \ 7 | postgresql-server-dev-14 8 | 9 | cd parquet_fdw || exit 10 | make install 11 | -------------------------------------------------------------------------------- /python/simplescraper/do_dbt_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | source "${DBT_VENV_ACTIVATE}" 8 | 9 | cd "$DBT_DIR" || exit 10 | 11 | dbt run 12 | -------------------------------------------------------------------------------- /sql/dwh/update_requirements.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | pip install -r requirements.in 11 | pip freeze > requirements.txt 12 | -------------------------------------------------------------------------------- /azure/sync-remote-to-local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source .env 4 | 5 | azcopy login --service-principal --application-id "$AZCOPY_SPA_APPLICATION_ID" --tenant-id="$AZCOPY_TENANT_ID" 6 | 7 | azcopy sync "${RAW_DIR}" "${AZURE_STORAGE_CONTAINER_RAW_DIR_URL}" --recursive --exclude-pattern=".*" 8 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/normalized_online_job_months_1.sql: 
-------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='table' 4 | ) 5 | }} 6 | 7 | SELECT * 8 | FROM {{ ref('normalized_online_job') }} 9 | WHERE online_at >= current_date - INTERVAL 1 MONTH 10 | ORDER BY online_at 11 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/normalized_online_job_months_3.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='table' 4 | ) 5 | }} 6 | 7 | SELECT * 8 | FROM {{ ref('normalized_online_job') }} 9 | WHERE online_at >= current_date - INTERVAL 3 MONTH 10 | ORDER BY online_at 11 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/normalized_online_job_months_12.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='table' 4 | ) 5 | }} 6 | 7 | SELECT * 8 | FROM {{ ref('normalized_online_job') }} 9 | WHERE online_at >= current_date - INTERVAL 12 MONTH 10 | ORDER BY online_at 11 | -------------------------------------------------------------------------------- /python/dashy/start_dashy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | gunicorn --workers 1 --timeout 600 --bind 0.0.0.0:8051 dashy:server --access-logfile '-' 11 | -------------------------------------------------------------------------------- /python/tests/test_get_chunk_size.py: -------------------------------------------------------------------------------- 1 | from common.chunking import get_chunk_size 2 | 3 | 4 | def test_get_chunk_size(): 5 | assert get_chunk_size(1000, 10, 500) == 100 6 | assert get_chunk_size(1000, 10, 50) == 50 7 | assert get_chunk_size(60, 4, 10) == 8 8 | assert get_chunk_size(100, 4, 10) == 9 9 | -------------------------------------------------------------------------------- /docker/postgres/postgres-parquet-fdw/s1-download-arrow.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt update 4 | apt install -y -V ca-certificates lsb-release wget 5 | wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb 6 | -------------------------------------------------------------------------------- /python/simplescraper/start_flasky.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | ulimit -n 4096 11 | gunicorn --workers 4 --timeout 3600 --bind 0.0.0.0:3001 'flasky:app' 12 | -------------------------------------------------------------------------------- /python/simplescraper/start_dashy_static.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | ulimit -n 4096 11 | gunicorn --workers 4 --timeout 3600 --bind 0.0.0.0:8054 
'dashy_static:app' 12 | -------------------------------------------------------------------------------- /docker/airflow/restart_worker_and_scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | for container in airflow-worker airflow-scheduler; do 7 | docker compose stop $container 8 | docker compose rm -f $container 9 | docker compose up $container -d 10 | done 11 | -------------------------------------------------------------------------------- /python/simplescraper/requirements.in: -------------------------------------------------------------------------------- 1 | azure-storage-blob==2.1.0 2 | beautifulsoup4 3 | duckdb==0.7.0 4 | Flask 5 | gunicorn 6 | jupyter 7 | kaleido 8 | lxml 9 | loguru 10 | pandas 11 | pip-tools 12 | playwright==1.30.0 13 | plotly-calplot 14 | plotly-express 15 | pyarrow 16 | pytest 17 | python-dotenv 18 | requests 19 | wemake-python-styleguide 20 | xmltodict 21 | -------------------------------------------------------------------------------- /python/simplescraper/common/logging.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | 4 | from loguru import logger 5 | 6 | from common.env_variables import TEMP_DIR 7 | 8 | 9 | def configure_logger(load_timestamp): 10 | logger.remove() 11 | logger.add(sys.stdout, colorize=True) 12 | logger.add(os.path.join(TEMP_DIR, load_timestamp, f'00_logs.log')) 13 | 14 | 15 | logger = logger 16 | -------------------------------------------------------------------------------- /python/airflow/create_user.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source .env 4 | 5 | airflow users create \ 6 | --role Admin \ 7 | --username "${AIRFLOW_USERNAME}" \ 8 | --password "${AIRFLOW_PASSWORD}" \ 9 | --email "${AIRFLOW_EMAIL}" \ 10 | --firstname "${AIRFLOW_FIRSTNAME}" \ 11 | --lastname "${AIRFLOW_LASTNAME}" 12 | 13 | airflow users delete -e admin 14 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/common_airflow_dag.py: -------------------------------------------------------------------------------- 1 | from airflow.operators.python import get_current_context 2 | from airflow.providers.http.hooks.http import HttpHook 3 | 4 | 5 | def run_flasky_task(endpoint): 6 | context = get_current_context() 7 | data = { 8 | 'data_interval_end': context['data_interval_end'], 9 | 'ds': context['ds'], 10 | } 11 | HttpHook().run(endpoint, data) 12 | -------------------------------------------------------------------------------- /doc/TODO-search-document-structure.json: -------------------------------------------------------------------------------- 1 | { 2 | "job_id": "4324234", 3 | "short_description": "Snail Collector at Alternative Food in Berlin or Hamburg", 4 | "url": "https://data.source/snail-collector-berlin-hamburg.html", 5 | "locations": [ 6 | "Berlin", 7 | "Hamburg" 8 | ], 9 | "online_week": [ 10 | "2022W11", 11 | "2022W10", 12 | "2022W09", 13 | "2022W02", 14 | "2022W01" 15 | ] 16 | } -------------------------------------------------------------------------------- /doc/TODO-search.md: -------------------------------------------------------------------------------- 1 | # Search 2 | 3 | ## Facets 4 | 5 | - Company 6 | - Position 7 | - Technology 8 | - Location 9 | - Date? 
10 | 11 | ## Document Fields 12 | 13 | - Job ID? 14 | - Job Short Description 15 | - Job Name 16 | - Job Company 17 | - Job Locations 18 | - Job URL 19 | - Job Online Dates 20 | 21 | ## Document Structure 22 | 23 | See [TODO-search-document-structure.json](TODO-search-document-structure.json) 24 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/sources.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: curated 5 | schema: curated 6 | freshness: # default freshness 7 | warn_after: { count: 24, period: hour } 8 | error_after: { count: 36, period: hour } 9 | loaded_at_field: load_timestamp 10 | tables: 11 | - name: online_job 12 | - name: job 13 | - name: job_location 14 | - name: job_technology 15 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/latest_dim_job.sql: -------------------------------------------------------------------------------- 1 | SELECT job_key, 2 | job_id, 3 | job_ldts, 4 | title, 5 | company_name 6 | FROM ( 7 | SELECT job_key, 8 | job_id, 9 | job_ldts, 10 | title, 11 | company_name, 12 | ROW_NUMBER() OVER (PARTITION BY job_id ORDER BY job_ldts DESC) rn 13 | FROM {{ ref('dim_job') }} 14 | ) 15 | WHERE rn = 1 16 | -------------------------------------------------------------------------------- /python/simplescraper/common/chunking.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | def get_chunk_size(total, slots, max_chunk_size): 5 | max_run_size = slots * max_chunk_size 6 | 7 | number_of_runs = total / max_run_size 8 | number_of_runs = int(math.ceil(number_of_runs)) 9 | 10 | number_of_chunks = number_of_runs * slots 11 | 12 | chunk_size = total / number_of_chunks 13 | chunk_size = int(math.ceil(chunk_size)) 14 | 15 | return chunk_size 16 | -------------------------------------------------------------------------------- /python/simplescraper/cron_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Add the following to the cron jobs: 42 * * * * REPLACE_ME/cron_job.sh 4 | 5 | /usr/sbin/scutil --nc list | grep Connected | grep vpn || { 6 | echo "Please connect to the VPN" 7 | exit 1 8 | } 9 | 10 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 11 | cd "$SCRIPTPATH" || exit 12 | 13 | source .env 14 | 15 | source "${VENV_ACTIVATE}" 16 | 17 | "${VENV_PYTHON}" "${SOURCE_DIR}"/simplescraper/scrape_data_source.py 18 | -------------------------------------------------------------------------------- /python/airflow/.env.example: -------------------------------------------------------------------------------- 1 | export VENV_ACTIVATE= 2 | 3 | export AIRFLOW_HOME= 4 | 5 | export AIRFLOW_DATABASE_NAME= 6 | export AIRFLOW_DATABASE_USERNAME= 7 | export AIRFLOW_DATABASE_PASSWORD= 8 | 9 | export AIRFLOW__DATABASE__SQL_ALCHEMY_CONN= 10 | export AIRFLOW__CORE__EXECUTOR= 11 | 12 | export AIRFLOW_USERNAME= 13 | export AIRFLOW_PASSWORD= 14 | export AIRFLOW_EMAIL= 15 | export AIRFLOW_FIRSTNAME= 16 | export AIRFLOW_LASTNAME= 17 | 18 | export AIRFLOW__CORE__LOAD_EXAMPLES= 19 | 20 | export AIRFLOW_CONN_HTTP_DEFAULT= 21 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/test_dag.py: -------------------------------------------------------------------------------- 1 | from 
datetime import datetime 2 | 3 | from airflow import DAG 4 | from airflow.decorators import task 5 | 6 | from common_airflow_dag import run_flasky_task 7 | 8 | with DAG('test_dag2', 9 | description='Test DAG', 10 | schedule_interval='@daily', 11 | start_date=datetime(2022, 7, 29), 12 | catchup=False) as dag: 13 | @task(task_id="test_task") 14 | def run_test(): 15 | run_flasky_task('do/test') 16 | 17 | 18 | run_test() 19 | -------------------------------------------------------------------------------- /python/simplescraper/update_requirements.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | 11 | if ! pip show pip-tools; then 12 | pip install pip-tools 13 | fi 14 | 15 | pip-compile requirements.in --allow-unsafe 16 | pip-sync 17 | # pip install "apache-airflow[celery]==2.2.3" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.2.3/constraints-3.8.txt" 18 | # pip install dbt-postgres 19 | -------------------------------------------------------------------------------- /python/airflow/configure_posgresql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | sudo -u postgres psql -c "CREATE DATABASE ${AIRFLOW_DATABASE_NAME};" 9 | 10 | sudo -u postgres psql -c "CREATE USER ${AIRFLOW_DATABASE_USERNAME} WITH ENCRYPTED PASSWORD '${AIRFLOW_DATABASE_PASSWORD};'" 11 | 12 | sudo -u postgres psql -c "GRANT ALL PRIVILEGES ON DATABASE ${AIRFLOW_DATABASE_NAME} TO ${AIRFLOW_DATABASE_USERNAME};" 13 | sudo -u postgres psql -c "GRANT ALL ON SCHEMA public TO ${AIRFLOW_DATABASE_USERNAME};" 14 | -------------------------------------------------------------------------------- /python/dashy/update_requirements.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | which pip | grep dashy || (echo "Wrong venv!!!" && exit) 11 | 12 | if ! 
pip show pip-tools; then 13 | pip install pip-tools 14 | fi 15 | 16 | pip-compile requirements.in --allow-unsafe 17 | pip-sync 18 | # pip install "apache-airflow[celery]==2.2.3" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.2.3/constraints-3.8.txt" 19 | # pip install dbt-postgres 20 | -------------------------------------------------------------------------------- /python/simplescraper/common/explore.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | import pandas as pd 3 | from IPython.display import display 4 | 5 | from common.env_variables import DUCKDB_DWH_FILE 6 | 7 | 8 | def display_df(_df): 9 | with pd.option_context('display.max_rows', None, 'display.max_columns', None, "expand_frame_repr", False, 10 | "display.float_format", '${:,.2f}'.format): 11 | display(_df.fillna('.')) 12 | 13 | 14 | def display_sql(sql_statement, read_only=True): 15 | conn = duckdb.connect(DUCKDB_DWH_FILE, read_only=read_only) 16 | _df = conn.execute(sql_statement).df() 17 | conn.close() 18 | return _df 19 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /docker/airflow/.env.example: -------------------------------------------------------------------------------- 1 | AIRFLOW_UID= 2 | AIRFLOW_GID= 3 | 4 | AIRFLOW_FERNET_KEY= 5 | AIRFLOW_SECRET_KEY= 6 | 7 | AIRFLOW_DATABASE_HOST= 8 | AIRFLOW_DATABASE_PORT_NUMBER= 9 | AIRFLOW_DATABASE_NAME= 10 | AIRFLOW_DATABASE_USERNAME= 11 | AIRFLOW_DATABASE_PASSWORD= 12 | AIRFLOW_DATABASE_USE_SSL= 13 | 14 | AIRFLOW_USERNAME= 15 | AIRFLOW_PASSWORD= 16 | AIRFLOW_EMAIL= 17 | AIRFLOW_FIRSTNAME= 18 | AIRFLOW_LASTNAME= 19 | 20 | AIRFLOW_WEBSERVER_VOLUME= 21 | AIRFLOW_DAGS_VOLUME= 22 | AIRFLOW_LOGS_VOLUME= 23 | AIRFLOW_PLUGINS_VOLUME= 24 | REDIS_VOLUME= 25 | 26 | AIRFLOW__CORE__LOAD_EXAMPLES= 27 | 28 | AIRFLOW_CONN_HTTP_DEFAULT= 29 | AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG= 30 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/dim_time.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'table', 4 | ) 5 | }} 6 | 7 | WITH unique_online_at AS ( 8 | SELECT DISTINCT online_at 9 | FROM {{ source('curated', 'online_job') }} 10 | ORDER BY 1 11 | ) 12 | SELECT online_at as date_key, 13 | date_part('year', online_at) as year, 14 | date_part('month', online_at) as month, 15 | date_part('day', online_at) as day, 16 | monthname(online_at) as month_name, 17 | date_part('yearweek', online_at) as year_week, 18 | date_part('isodow', online_at) as day_of_week, 19 | dayname(online_at) as day_of_week_name 20 | FROM 
unique_online_at 21 | -------------------------------------------------------------------------------- /python/simplescraper/common/entity.py: -------------------------------------------------------------------------------- 1 | class Entity: 2 | def __init__(self, name): 3 | self.name = name 4 | 5 | def __str__(self): 6 | return self.name 7 | 8 | 9 | SITEMAP = Entity('sitemap') 10 | ONLINE_JOB = Entity('online_job') 11 | JOB_DESCRIPTION = Entity('job_description') 12 | JOB = Entity('job') 13 | JOB_LOCATION = Entity('job_location') 14 | JOB_TECHNOLOGY = Entity('job_technology') 15 | 16 | RAW_ENTITIES = [ 17 | SITEMAP, 18 | JOB_DESCRIPTION, 19 | ] 20 | CURATED_ENTITIES = [ 21 | ONLINE_JOB, 22 | JOB, 23 | JOB_LOCATION, 24 | JOB_TECHNOLOGY, 25 | ] 26 | 27 | if __name__ == "__main__": 28 | for entity in CURATED_ENTITIES: 29 | print(entity) 30 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/list_downloaded_sitemaps.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from common.entity import SITEMAP 4 | from common.env_variables import LATEST_LOAD_TIMESTAMP 5 | from common.storage import DATA_SOURCE_NAME, save_temp_df, list_raw_files, DOWNLOADED_SITEMAPS_CSV, get_load_date 6 | 7 | 8 | def list_downloaded_sitemaps(load_timestamp, load_date=None) -> pd.DataFrame: 9 | files = list_raw_files(DATA_SOURCE_NAME, SITEMAP, load_date) 10 | df = pd.DataFrame(files) 11 | df = df[df['file_name'] != 'sitemapindex.xml'] 12 | if load_date is None: 13 | save_temp_df(df, load_timestamp, DOWNLOADED_SITEMAPS_CSV) 14 | return df 15 | 16 | 17 | if __name__ == "__main__": 18 | list_downloaded_sitemaps(LATEST_LOAD_TIMESTAMP, get_load_date()) 19 | -------------------------------------------------------------------------------- /python/simplescraper/do_day_backup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | if [[ $# -ne 3 ]] ; then 9 | echo "Please provide a date as script parameters in the following format: year month day" 10 | echo "Example: $0 2022 12 01" 11 | exit 1 12 | fi 13 | 14 | for entity in job_description sitemap 15 | do 16 | 17 | source=${RAW_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2/$3 18 | 19 | if [ -d "$source" ] 20 | then 21 | 22 | target_dir=${BACKUP_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2 23 | target_filename=${target_dir}/${entity}.$1$2$3.tar.gz 24 | mkdir -p "${target_dir}" 25 | tar -zcvf "${target_filename}" -C "${source}" . 
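# The day's raw files end up in a single archive per entity:
#   ${BACKUP_DIR}/${DATA_SOURCE_NAME}/<entity>/<year>/<month>/<entity>.<year><month><day>.tar.gz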
26 | 27 | fi 28 | 29 | done 30 | -------------------------------------------------------------------------------- /doc/TODO-search-pre-search-data-model.md: -------------------------------------------------------------------------------- 1 | # Pre Search Data Model 2 | 3 | ## Overview 4 | 5 | - job_online 6 | - job_id 7 | - online_at 8 | - url 9 | - job 10 | - job_id 11 | - job_description 12 | - job_id 13 | - title 14 | - online_status 15 | - is_anonymous 16 | - should_display_early_applicant 17 | - contract_type 18 | - work_type 19 | - online_date 20 | - description_introduction 21 | - description_responsabilities 22 | - description_requirements' 23 | - description_perks 24 | - company 25 | - company_name 26 | - job_company 27 | - job_id 28 | - company_name 29 | - location 30 | - location_name 31 | - job_location 32 | - job_id 33 | - location_name 34 | - technology 35 | - technology_name 36 | - job_technology 37 | - job_id 38 | - technology_name 39 | -------------------------------------------------------------------------------- /python/airflow/install_airflow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | which pip | grep /airflow/venv/ || (echo "Wrong venv!!!" && exit) 4 | 5 | # Install Airflow using the constraints file 6 | AIRFLOW_VERSION=2.7.2 7 | PYTHON_VERSION="$(python --version | cut -d " " -f 2 | cut -d "." -f 1-2)" 8 | # For example: 3.7 9 | CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt" 10 | # For example: https://raw.githubusercontent.com/apache/airflow/constraints-2.4.1/constraints-3.7.txt 11 | pip install "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}" 12 | pip install psycopg2 13 | 14 | airflow db upgrade 15 | 16 | # The Standalone command will initialise the database, make a user, 17 | # and start all components for you. 18 | airflow standalone 19 | -------------------------------------------------------------------------------- /python/tests/test_parse_job_description.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from tasks.parse_job_description import parse_job_description 6 | 7 | 8 | def load_file(file_path): 9 | with open(f'data/normalize_job_description/{file_path}', 'r') as f: 10 | content = f.read() 11 | return content 12 | 13 | 14 | @pytest.mark.parametrize('test_case', ['test_case_7610188', 'test_case_7610222', 'test_case_7609275']) 15 | def test_parse_job_description(test_case): 16 | input_content = load_file('input/' + test_case + '.txt') 17 | 18 | result_content = parse_job_description(input_content) 19 | # temp = json.dumps(result_content, indent=2, ensure_ascii=False) 20 | 21 | output_content = json.loads(load_file('output/' + test_case + '.json')) 22 | assert result_content == output_content 23 | -------------------------------------------------------------------------------- /python/simplescraper/dashy_static.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | 3 | app = Flask(__name__) 4 | 5 | HTML = ''' 6 | 15 | 16 |

<h1>Static Dashboard</h1> 17 | 18 | <h2>Overview</h2> 19 | Overview 20 | 21 | <h2>Top Five Cities</h2> 22 | Top Five Cities 23 | 24 | <h2>Top Five Technologies</h2>
25 | Top Five Technologies 26 | ''' 27 | 28 | 29 | @app.route('/') 30 | def index(): 31 | return HTML 32 | 33 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_curate_sitemaps_catch_up_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | with DAG('job_market_analytics_curate_sitemaps_catch_up_dag', 12 | description='Job Market Analytics Curate Sitemaps Catch Up DAG', 13 | schedule_interval='@daily', 14 | start_date=datetime(2022, 1, 1), 15 | dagrun_timeout=timedelta(minutes=60), 16 | max_active_runs=4, 17 | max_active_tasks=4, 18 | catchup=True) as dag: 19 | @task(task_id="curate_sitemaps") 20 | def curate_sitemaps(): 21 | run_flasky_task('do/curate_sitemaps') 22 | 23 | 24 | curate_sitemaps() 25 | -------------------------------------------------------------------------------- /python/simplescraper/.env.example: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export VENV_ACTIVATE= 4 | export VENV_PYTHON= 5 | export SOURCE_DIR= 6 | 7 | export DATA_DIR= 8 | export DATA_SOURCE_NAME= 9 | export DATA_SOURCE_URL= 10 | 11 | export RAW_DIR= 12 | export CLEANSED_DIR= 13 | export CURATED_DIR= 14 | export DUCKDB_DWH_FILE= 15 | export TEMP_DIR= 16 | 17 | export BACKUP_DIR= 18 | 19 | export SEMAPHORE_COUNT= 20 | export MAX_CHUNK_SIZE= 21 | export MIN_TO_DOWNLOAD= 22 | export MAX_TO_DOWNLOAD= 23 | export ONLINE_EXPIRATION_IN_DAYS= 24 | 25 | export LATEST_LOAD_TIMESTAMP= 26 | 27 | export RUN_HEADLESS= 28 | 29 | export FLASK_APP= 30 | export FLASK_ENV= 31 | export FLASK_DEBUG= 32 | 33 | export UPLOAD_TO_AZURE= 34 | 35 | export AZURE_STORAGE_CONNECTION_STRING= 36 | export AZURE_STORAGE_CONTAINER_NAME= 37 | 38 | export LANG= 39 | export LC_ALL= 40 | 41 | export DBT_VENV_ACTIVATE= 42 | export DBT_DIR= 43 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/dim_job.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental' 4 | ) 5 | }} 6 | 7 | 8 | SELECT MD5(CONCAT_WS('||', 9 | COALESCE( 10 | UPPER(TRIM(CAST( 11 | job.job_id 12 | AS VARCHAR))), 13 | '^^'), 14 | COALESCE( 15 | UPPER(TRIM(CAST( 16 | job.load_timestamp 17 | AS VARCHAR))), 18 | '^^') 19 | )) AS job_key, 20 | job.job_id, 21 | job.load_timestamp as job_ldts, 22 | job.title, 23 | job.company_name 24 | FROM {{ source('curated', 'job') }} 25 | 26 | {% if is_incremental() %} 27 | LEFT OUTER JOIN dim_job 28 | ON (job.job_id = dim_job.job_id AND 29 | job.load_timestamp = dim_job.job_ldts) 30 | WHERE dim_job.job_id IS NULL 31 | {% endif %} 32 | -------------------------------------------------------------------------------- /python/simplescraper/common/webclient.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | REQUEST_HEADERS = { 4 | "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8," 5 | "application/signed-exchange;v=b3;q=0.9", 6 | "accept-language": "en-US,en;q=0.9,es;q=0.8,it-IT;q=0.7,it;q=0.6,de-DE;q=0.5,de;q=0.4", 7 | "cache-control": "max-age=0", 8 | "sec-ch-ua": 
"\"Chromium\";v=\"94\", \"Google Chrome\";v=\"94\", \";Not A Brand\";v=\"99\"", 9 | "sec-ch-ua-mobile": "?0", 10 | "sec-ch-ua-platform": "\"macOS\"", 11 | "sec-fetch-dest": "document", 12 | "sec-fetch-mode": "navigate", 13 | "sec-fetch-site": "none", 14 | "sec-fetch-user": "?1", 15 | "upgrade-insecure-requests": "1" 16 | } 17 | 18 | 19 | def get_url_content(url): 20 | response = requests.get(url) 21 | content = response.content 22 | return content 23 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_cleanse_sitemaps_catch_up_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | with DAG('job_market_analytics_cleanse_sitemaps_catch_up_dag', 12 | description='Job Market Analytics Cleanse Sitemaps Catch Up DAG', 13 | schedule_interval='@daily', 14 | start_date=datetime(2022, 1, 1), 15 | # end_date=datetime(2021, 12, 1), 16 | dagrun_timeout=timedelta(minutes=10), 17 | max_active_runs=4, 18 | max_active_tasks=4, 19 | catchup=True) as dag: 20 | @task(task_id="cleanse_sitemaps") 21 | def cleanse_sitemaps(): 22 | run_flasky_task('do/cleanse_sitemaps') 23 | 24 | 25 | cleanse_sitemaps() 26 | -------------------------------------------------------------------------------- /python/simplescraper/create_curated_views_in_dwh.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import duckdb 4 | 5 | from common.entity import CURATED_ENTITIES 6 | from common.env_variables import CURATED_DIR, DATA_SOURCE_NAME, DUCKDB_DWH_FILE 7 | 8 | 9 | def create_curated_views_in_dwh(): 10 | conn = duckdb.connect(DUCKDB_DWH_FILE) 11 | 12 | conn.execute(f''' 13 | CREATE SCHEMA IF NOT EXISTS curated; 14 | ''') 15 | 16 | for entity in CURATED_ENTITIES: 17 | curated_path = os.path.join(CURATED_DIR, DATA_SOURCE_NAME, entity.name, '*/*/*/*.parquet') 18 | 19 | conn.execute(f''' 20 | CREATE OR REPLACE view curated.{entity.name} AS 21 | SELECT * FROM parquet_scan('{curated_path}', HIVE_PARTITIONING=1) 22 | -- WHERE load_timestamp < '2022-07-01' 23 | ; 24 | ''') 25 | 26 | conn.close() 27 | 28 | 29 | if __name__ == "__main__": 30 | create_curated_views_in_dwh() 31 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_curate_job_descriptions_catch_up_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | with DAG('job_market_analytics_curate_job_descriptions_catch_up_dag', 12 | description='Job Market Analytics Curate Job Descriptions Catch Up DAG', 13 | schedule_interval='@daily', 14 | start_date=datetime(2022, 11, 1), 15 | end_date=datetime(2022, 11, 30), 16 | dagrun_timeout=timedelta(minutes=60), 17 | max_active_runs=4, 18 | max_active_tasks=4, 19 | catchup=True) as dag: 20 | @task(task_id="curate_job_descriptions") 21 | def curate_job_descriptions(): 22 | run_flasky_task('do/curate_job_descriptions') 23 | 24 | 25 | curate_job_descriptions() 26 | 
-------------------------------------------------------------------------------- /python/simplescraper/restore_day_backup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | if [[ $# -ne 3 ]] ; then 9 | echo "Please provide a date as script parameters in the following format: year month day" 10 | echo "Example: $0 2022 12 01" 11 | exit 1 12 | fi 13 | 14 | for entity in job_description sitemap 15 | do 16 | 17 | raw_day_dir=${RAW_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2/$3 18 | 19 | if [ -d "$raw_day_dir" ] 20 | then 21 | 22 | echo "The raw day dir is not empty: $raw_day_dir" 23 | 24 | else 25 | 26 | backup_day_dir=${BACKUP_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2 27 | backup_day_filename=${backup_day_dir}/${entity}.$1$2$3.tar.gz 28 | 29 | mkdir -p "$raw_day_dir" 30 | tar -xvzf "$backup_day_filename" -C "$raw_day_dir" 31 | 32 | echo "$1-$2-$3: Restored ${entity}" 33 | 34 | fi 35 | 36 | done 37 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_cleanse_job_descriptions_catch_up_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | with DAG('job_market_analytics_cleanse_job_descriptions_catch_up_dag', 12 | description='Job Market Analytics Cleanse Job Descriptions Catch Up DAG', 13 | schedule_interval='@daily', 14 | start_date=datetime(2022, 11, 1), 15 | end_date=datetime(2022, 12, 1), 16 | dagrun_timeout=timedelta(minutes=10), 17 | max_active_runs=1, 18 | max_active_tasks=1, 19 | catchup=True) as dag: 20 | 21 | @task(task_id="cleanse_job_descriptions") 22 | def cleanse_job_descriptions(): 23 | run_flasky_task('do/cleanse_job_descriptions') 24 | 25 | 26 | cleanse_job_descriptions() 27 | -------------------------------------------------------------------------------- /python/simplescraper/scrape_data_source.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from common.logging import configure_logger 4 | from common.storage import get_load_timestamp 5 | from tasks.download_job_descriptions import download_job_descriptions 6 | from tasks.download_sitemap import download_sitemap 7 | from tasks.list_downloaded_job_descriptions import list_downloaded_job_descriptions 8 | from tasks.list_job_descriptions_to_download import list_job_descriptions_to_download 9 | 10 | 11 | def scrape_data_source(load_timestamp): 12 | configure_logger(load_timestamp) 13 | df_downloaded = list_downloaded_job_descriptions(load_timestamp) 14 | df_sitemap = download_sitemap(load_timestamp) 15 | df_to_download = list_job_descriptions_to_download(load_timestamp, df_sitemap, df_downloaded) 16 | download_job_descriptions(load_timestamp, df_to_download) 17 | 18 | os.system('say -v Fiona b') 19 | 20 | 21 | if __name__ == "__main__": 22 | scrape_data_source(get_load_timestamp()) 23 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/fact_online_job.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | 
materialized='incremental' 4 | ) 5 | }} 6 | 7 | 8 | WITH new_fact_online_job AS ( 9 | SELECT online_job.online_at as date_key, 10 | online_job.online_at, 11 | online_job.job_id 12 | FROM {{ source('curated', 'online_job') }} 13 | 14 | {% if is_incremental() %} 15 | LEFT OUTER JOIN {{ this }} fact_online_job 16 | ON (online_job.online_at = fact_online_job.online_at AND 17 | online_job.job_id = fact_online_job.job_id) 18 | WHERE fact_online_job.job_id IS NULL 19 | {% endif %} 20 | ) 21 | SELECT new_fact_online_job.date_key as date_key, 22 | latest_dim_job.job_key as job_key, 23 | new_fact_online_job.online_at as online_at, 24 | latest_dim_job.job_id as job_id, 25 | latest_dim_job.job_ldts 26 | FROM new_fact_online_job 27 | INNER JOIN {{ ref('latest_dim_job') }} 28 | ON (new_fact_online_job.job_id = latest_dim_job.job_id) 29 | -------------------------------------------------------------------------------- /docker/postgres/postgres-parquet-fdw/s2-install-arrow.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb 4 | apt update 5 | apt install -y -V libarrow-dev # For C++ 6 | apt install -y -V libarrow-glib-dev # For GLib (C) 7 | apt install -y -V libarrow-dataset-dev # For Apache Arrow Dataset C++ 8 | apt install -y -V libarrow-flight-dev # For Apache Arrow Flight C++ 9 | # Notes for Plasma related packages: 10 | # * You need to enable "non-free" component on Debian GNU/Linux 11 | # * You need to enable "multiverse" component on Ubuntu 12 | # * You can use Plasma related packages only on amd64 13 | apt install -y -V libplasma-dev # For Plasma C++ 14 | apt install -y -V libplasma-glib-dev # For Plasma GLib (C) 15 | apt install -y -V libgandiva-dev # For Gandiva C++ 16 | apt install -y -V libgandiva-glib-dev # For Gandiva GLib (C) 17 | apt install -y -V libparquet-dev # For Apache Parquet C++ 18 | apt install -y -V libparquet-glib-dev # For Apache Parquet GLib (C) 19 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/dim_job_location.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental' 4 | ) 5 | }} 6 | 7 | 8 | SELECT MD5(CONCAT_WS('||', 9 | COALESCE( 10 | UPPER(TRIM(CAST( 11 | job_location.job_id 12 | AS VARCHAR))), 13 | '^^'), 14 | COALESCE( 15 | UPPER(TRIM(CAST( 16 | job_location.load_timestamp 17 | AS VARCHAR))), 18 | '^^') 19 | )) AS job_key, 20 | job_location.job_id, 21 | job_location.load_timestamp as job_ldts, 22 | job_location.location AS location_name 23 | FROM {{ source('curated', 'job_location') }} 24 | 25 | {% if is_incremental() %} 26 | LEFT OUTER JOIN dim_job_location 27 | ON (job_location.job_id = dim_job_location.job_id AND 28 | job_location.load_timestamp = dim_job_location.job_ldts AND 29 | job_location.location = dim_job_location.location_name) 30 | WHERE dim_job_location.job_id IS NULL 31 | {% endif %} 32 | -------------------------------------------------------------------------------- /python/simplescraper/verify_day_backup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | if [[ $# -ne 3 ]] ; then 9 | echo "Please provide a date as script parameters in the following format: year month day" 
10 | echo "Example: $0 2022 12 01" 11 | exit 1 12 | fi 13 | 14 | for entity in job_description sitemap 15 | do 16 | 17 | source=${RAW_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2/$3 18 | 19 | if [ -d "$source" ] 20 | then 21 | 22 | target_dir=${BACKUP_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2 23 | target_filename=${target_dir}/${entity}.$1$2$3.tar.gz 24 | diff <(cd "$source" && find . | grep -E '.xml$|.html$' | sort) <(tar -tf "$target_filename" | grep -E '.xml$|.html$' | sort) 25 | error_code=$? 26 | if [ $error_code -ne 0 ]; 27 | then 28 | echo "$1-$2-$3: NOT OK" >&2 29 | exit 1 30 | fi 31 | 32 | else 33 | 34 | echo "$1-$2-$3: NOT FOUND ${entity}" 35 | 36 | fi 37 | 38 | done 39 | 40 | echo "$1-$2-$3: OK" 41 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_cleanse_catch_up_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | with DAG('job_market_analytics_cleanse_catch_up_dag', 12 | description='Job Market Analytics Cleanse Catch Up DAG', 13 | schedule_interval='@daily', 14 | start_date=datetime(2021, 12, 1), 15 | # end_date=datetime(2021, 12, 1), 16 | dagrun_timeout=timedelta(minutes=10), 17 | max_active_runs=1, 18 | max_active_tasks=1, 19 | catchup=True) as dag: 20 | @task(task_id="cleanse_sitemaps") 21 | def cleanse_sitemaps(): 22 | run_flasky_task('do/cleanse_sitemaps') 23 | 24 | 25 | @task(task_id="cleanse_job_descriptions") 26 | def cleanse_job_descriptions(): 27 | run_flasky_task('do/cleanse_job_descriptions') 28 | 29 | 30 | cleanse_sitemaps() 31 | cleanse_job_descriptions() 32 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/dim_job_technology.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental' 4 | ) 5 | }} 6 | 7 | 8 | SELECT MD5(CONCAT_WS('||', 9 | COALESCE( 10 | UPPER(TRIM(CAST( 11 | job_technology.job_id 12 | AS VARCHAR))), 13 | '^^'), 14 | COALESCE( 15 | UPPER(TRIM(CAST( 16 | job_technology.load_timestamp 17 | AS VARCHAR))), 18 | '^^') 19 | )) AS job_key, 20 | job_technology.job_id, 21 | job_technology.load_timestamp as job_ldts, 22 | job_technology.technology AS technology_name 23 | FROM {{ source('curated', 'job_technology') }} 24 | 25 | {% if is_incremental() %} 26 | LEFT OUTER JOIN dim_job_technology 27 | ON (job_technology.job_id = dim_job_technology.job_id AND 28 | job_technology.load_timestamp = dim_job_technology.job_ldts AND 29 | job_technology.technology = dim_job_technology.technology_name) 30 | WHERE dim_job_technology.job_id IS NULL 31 | {% endif %} 32 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_curate_catch_up_dag_v2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | YEAR = 2021 12 | MONTH = 10 13 | DAY = 1 14 | 15 | with DAG('job_market_analytics_curate_catch_up_dag', 16 | description='Job Market 
Analytics Curate Catch Up DAG', 17 | schedule_interval='@daily', 18 | start_date=datetime(YEAR, MONTH, DAY), 19 | end_date=datetime(YEAR, MONTH, DAY) + timedelta(days=15), 20 | dagrun_timeout=timedelta(minutes=60), 21 | max_active_runs=2, 22 | max_active_tasks=2, 23 | catchup=True) as dag: 24 | @task(task_id="curate_sitemaps") 25 | def curate_sitemaps(): 26 | run_flasky_task('do/curate_sitemaps') 27 | 28 | 29 | @task(task_id="curate_job_descriptions") 30 | def curate_job_descriptions(): 31 | run_flasky_task('do/curate_job_descriptions') 32 | 33 | 34 | curate_sitemaps() 35 | curate_job_descriptions() 36 | -------------------------------------------------------------------------------- /python/simplescraper/common/env_variables.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | 5 | load_dotenv() 6 | 7 | DATA_DIR = os.getenv('DATA_DIR') 8 | RAW_DIR = os.getenv('RAW_DIR') 9 | CLEANSED_DIR = os.getenv('CLEANSED_DIR') 10 | CURATED_DIR = os.getenv('CURATED_DIR') 11 | DUCKDB_DWH_FILE = os.getenv('DUCKDB_DWH_FILE') 12 | TEMP_DIR = os.getenv('TEMP_DIR') 13 | BACKUP_DIR = os.getenv('BACKUP_DIR') 14 | SOURCE_DIR = os.getenv('SOURCE_DIR') 15 | 16 | DATA_SOURCE_NAME = os.getenv('DATA_SOURCE_NAME') 17 | DATA_SOURCE_URL = os.getenv('DATA_SOURCE_URL') 18 | 19 | SEMAPHORE_COUNT: int = int(os.getenv('SEMAPHORE_COUNT')) 20 | MAX_CHUNK_SIZE: int = int(os.getenv('MAX_CHUNK_SIZE')) 21 | MIN_TO_DOWNLOAD: int = int(os.getenv('MIN_TO_DOWNLOAD')) 22 | MAX_TO_DOWNLOAD: int = int(os.getenv('MAX_TO_DOWNLOAD')) 23 | ONLINE_EXPIRATION_IN_DAYS: int = int(os.getenv('ONLINE_EXPIRATION_IN_DAYS')) 24 | 25 | LATEST_LOAD_TIMESTAMP = os.getenv('LATEST_LOAD_TIMESTAMP') 26 | 27 | RUN_HEADLESS = os.getenv('RUN_HEADLESS') == 'True' 28 | 29 | UPLOAD_TO_AZURE = os.getenv('UPLOAD_TO_AZURE') == 'True' 30 | 31 | AZURE_STORAGE_CONNECTION_STRING = os.getenv('AZURE_STORAGE_CONNECTION_STRING') 32 | AZURE_STORAGE_CONTAINER_NAME = os.getenv('AZURE_STORAGE_CONTAINER_NAME') 33 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/curate_sitemaps.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pandas as pd 4 | 5 | from common.entity import SITEMAP, ONLINE_JOB 6 | from common.logging import configure_logger, logger 7 | from common.storage import get_load_timestamp, get_load_date, load_cleansed_df, save_curated_df 8 | from tasks.curate_job_descriptions import BASE_COLUMNS 9 | 10 | ONLINE_JOB_SAT_COLUMNS = ['online_at', 'url'] 11 | 12 | 13 | def curate_sitemaps(load_timestamp, load_date): 14 | configure_logger(load_timestamp) 15 | logger.info(f'Start curate_sitemaps: {load_timestamp} {load_date}') 16 | 17 | df = load_cleansed_df(SITEMAP, load_date=load_date) 18 | 19 | df = df.dropna(subset=['job_id']) 20 | df['job_id'] = df['job_id'].astype('int') 21 | df['online_at'] = pd.to_datetime(df['load_timestamp']).dt.date 22 | df = df[BASE_COLUMNS + ONLINE_JOB_SAT_COLUMNS] 23 | df = df.sort_values(by=['job_id']) 24 | 25 | save_curated_df(df, ONLINE_JOB) 26 | logger.info(f'End curate_sitemaps: {load_timestamp} {load_date}') 27 | 28 | 29 | if __name__ == "__main__": 30 | _load_timestamp = sys.argv[1] if len(sys.argv) > 1 else get_load_timestamp() 31 | _load_date = sys.argv[2] if len(sys.argv) > 2 else get_load_date() 32 | curate_sitemaps(_load_timestamp, _load_date) 33 | 
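# When run directly, the optional CLI arguments override the defaults:
#   argv[1] = load_timestamp, e.g. '2022/01/22/12-49-39' (the format asserted in test_get_run_timestamp.py)
#   argv[2] = load_date of the cleansed partition to curate, as returned by get_load_date()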
-------------------------------------------------------------------------------- /docker/postgres/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | # Inspired by https://github.com/khezen/compose-postgres/blob/master/docker-compose.yml 4 | services: 5 | postgres: 6 | build: 7 | context: postgres-parquet-fdw 8 | target: postgres-parquet-fdw 9 | environment: 10 | POSTGRES_USER: ${POSTGRES_USER} 11 | POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} 12 | POSTGRES_DB: ${POSTGRES_DB} 13 | networks: 14 | - postgres 15 | restart: always 16 | env_file: .env 17 | logging: 18 | options: 19 | max-size: 10m 20 | max-file: "3" 21 | ports: 22 | - '5432:5432' 23 | volumes: 24 | - ${POSTGRES_VOLUME}:/var/lib/postgresql/data 25 | - ${POSTGRES_PARQUET_FDW_VOLUME}:/var/lib/parquet-fdw/data 26 | # pgadmin: 27 | # image: dpage/pgadmin4 28 | # environment: 29 | # PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL} 30 | # PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD} 31 | # PGADMIN_CONFIG_SERVER_MODE: 'False' 32 | # ports: 33 | # - '2345:80' 34 | # volumes: 35 | # - ${PGADMIN_VOLUME}:/var/lib/pgadmin 36 | # networks: 37 | # - postgres 38 | # restart: always 39 | # depends_on: 40 | # - "postgres" 41 | 42 | networks: 43 | postgres: 44 | driver: bridge 45 | -------------------------------------------------------------------------------- /python/simplescraper/verify_all_backups.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pandas as pd 4 | 5 | from common.entity import RAW_ENTITIES 6 | from common.env_variables import DATA_SOURCE_NAME, SOURCE_DIR 7 | from common.storage import list_raw_days 8 | 9 | 10 | def get_current_date(): 11 | return datetime.datetime.today().strftime('%Y%m%d') 12 | 13 | 14 | def list_missing_previous_dates(entity): 15 | df = pd.DataFrame(list_raw_days(DATA_SOURCE_NAME, entity)) 16 | df_current_date = pd.DataFrame([{ 17 | 'date': get_current_date() 18 | }]) 19 | df = df.drop_duplicates() 20 | df = pd.concat([ 21 | df, 22 | df_current_date, df_current_date 23 | ]).drop_duplicates(keep=False) 24 | return df 25 | 26 | 27 | def verify_backups(): 28 | dfs = [] 29 | for entity in RAW_ENTITIES: 30 | df = list_missing_previous_dates(entity) 31 | dfs.append(df) 32 | df = pd.concat(dfs, ignore_index=True) 33 | df = df.drop_duplicates() 34 | df = df.sort_values(by=['date']) 35 | dates_to_download = df['date'].to_list() 36 | for date_to_download in dates_to_download: 37 | year = date_to_download[:4] 38 | month = date_to_download[4:6] 39 | day = date_to_download[6:8] 40 | print( 41 | f'/bin/zsh {SOURCE_DIR}/simplescraper/verify_day_backup.sh {year} {month} {day} || exit 1') 42 | 43 | 44 | if __name__ == "__main__": 45 | verify_backups() 46 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/list_job_descriptions_to_download.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from common.env_variables import LATEST_LOAD_TIMESTAMP 4 | from common.logging import logger, configure_logger 5 | from common.storage import load_temp_df, DOWNLOADED_JOB_DESCRIPTIONS_CSV, SITEMAP_URLS_CSV, save_temp_df, \ 6 | JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV 7 | 8 | 9 | def list_job_descriptions_to_download(load_timestamp, df_sitemap_urls=None, df_downloaded=None): 10 | configure_logger(load_timestamp) 11 | logger.info('list_job_descriptions_to_download: start') 12 | 13 
| df_sitemap_urls = df_sitemap_urls or load_temp_df(load_timestamp, SITEMAP_URLS_CSV) 14 | df_downloaded = df_downloaded or load_temp_df(load_timestamp, DOWNLOADED_JOB_DESCRIPTIONS_CSV) 15 | 16 | df_downloaded = df_downloaded[['id']] 17 | df_downloaded = df_downloaded.drop_duplicates() 18 | df = df_sitemap_urls[['id']] 19 | df = df.drop_duplicates() 20 | df = pd.concat([df, df_downloaded, df_downloaded]).drop_duplicates(keep=False) 21 | df = df.merge(df_sitemap_urls) 22 | df = df[['url']] 23 | total_count = df.shape[0] 24 | 25 | save_temp_df(df, load_timestamp, JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV) 26 | logger.success(f'URLs to download: {total_count}') 27 | logger.info('list_job_descriptions_to_download: end') 28 | return df 29 | 30 | 31 | if __name__ == "__main__": 32 | list_job_descriptions_to_download(LATEST_LOAD_TIMESTAMP) 33 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'job_market_analytics' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'job_market_analytics' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `model-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 15 | model-paths: [ "models" ] 16 | analysis-paths: [ "analyses" ] 17 | test-paths: [ "tests" ] 18 | seed-paths: [ "seeds" ] 19 | macro-paths: [ "macros" ] 20 | snapshot-paths: [ "snapshots" ] 21 | 22 | target-path: "target" # directory which will store compiled SQL files 23 | clean-targets: # directories to be removed by `dbt clean` 24 | - "target" 25 | - "dbt_packages" 26 | 27 | 28 | # Configuring models 29 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 30 | 31 | # In this example config, we tell dbt to build all models in the example/ directory 32 | # as tables. These settings can be overridden in the individual model files 33 | # using the `{{ config(...) }}` macro. 
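# Individual models can override the project-level default set below; for example,
# the mart/dim_job_technology.sql model above sets materialized='incremental'
# through its {{ config(...) }} block.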
34 | models: 35 | job_market_analytics: 36 | # Config indicated by + and applies to all files under models/example/ 37 | mart: 38 | +materialized: view 39 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/prune_old_raw.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import shutil 4 | import sys 5 | 6 | from common.entity import RAW_ENTITIES 7 | from common.env_variables import RAW_DIR, DATA_SOURCE_NAME 8 | from common.logging import configure_logger, logger 9 | from common.storage import get_load_timestamp, get_load_date, LOAD_DATE_FORMAT 10 | 11 | SEVEN_MONTHS_IN_DAYS = 7 * 30 12 | 13 | 14 | def prune_old_raw(load_timestamp, load_date): 15 | configure_logger(load_timestamp) 16 | logger.info(f'Start prune_old_raw: {load_date}') 17 | date_to_remove = datetime.datetime.strptime(load_date, LOAD_DATE_FORMAT).date() 18 | date_to_remove = date_to_remove - datetime.timedelta(days=SEVEN_MONTHS_IN_DAYS) 19 | date_to_remove = date_to_remove.strftime(LOAD_DATE_FORMAT) 20 | year, month, day = date_to_remove.split('/', 2) 21 | for entity in RAW_ENTITIES: 22 | folder_to_remove = f'{RAW_DIR}/{DATA_SOURCE_NAME}/{entity}/{year}/{month}/{day}' 23 | if os.path.exists(folder_to_remove) and os.path.isdir(folder_to_remove): 24 | logger.success(f'Removing {folder_to_remove}') 25 | shutil.rmtree(folder_to_remove) 26 | else: 27 | logger.warning(f'No folder to remove on {folder_to_remove}') 28 | 29 | logger.info(f'End prune_old_raw: {load_date}') 30 | 31 | 32 | if __name__ == "__main__": 33 | _load_timestamp = sys.argv[1] if len(sys.argv) > 1 else get_load_timestamp() 34 | _load_date = sys.argv[2] if len(sys.argv) > 2 else get_load_date() 35 | prune_old_raw(_load_timestamp, _load_date) 36 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/list_downloaded_job_descriptions.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import functools 3 | 4 | import pandas as pd 5 | 6 | from common.entity import JOB_DESCRIPTION 7 | from common.env_variables import LATEST_LOAD_TIMESTAMP, ONLINE_EXPIRATION_IN_DAYS 8 | from common.logging import logger, configure_logger 9 | from common.storage import DOWNLOADED_JOB_DESCRIPTIONS_CSV, DATA_SOURCE_NAME, save_temp_df, list_raw_files 10 | 11 | 12 | @functools.lru_cache(maxsize=1024) 13 | def calculate_days_online(load_timestamp): 14 | ingestion_datetime = datetime.datetime.strptime(load_timestamp, '%Y/%m/%d/%H-%M-%S') 15 | now = datetime.datetime.now() 16 | delta = now - ingestion_datetime 17 | return delta.days 18 | 19 | 20 | def list_downloaded_job_descriptions(load_timestamp, load_date=None) -> pd.DataFrame: 21 | configure_logger(load_timestamp) 22 | logger.info('list_downloaded_job_descriptions start') 23 | files = list_raw_files(DATA_SOURCE_NAME, JOB_DESCRIPTION, load_date) 24 | df = pd.DataFrame(files) 25 | if not df.empty: 26 | df['id'] = df['file_name'].str.split('.', expand=True)[0] 27 | if ONLINE_EXPIRATION_IN_DAYS: 28 | df['days_online'] = df['load_timestamp'].map(calculate_days_online) 29 | df = df[df['days_online'] < ONLINE_EXPIRATION_IN_DAYS] 30 | df = df.drop(columns=['days_online']) 31 | if load_date is None: 32 | save_temp_df(df, load_timestamp, DOWNLOADED_JOB_DESCRIPTIONS_CSV) 33 | logger.info('list_downloaded_job_descriptions end') 34 | return df 35 | 36 | 37 | if __name__ == "__main__": 38 | 
list_downloaded_job_descriptions(LATEST_LOAD_TIMESTAMP) 39 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_hourly_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | from itertools import chain 4 | 5 | from airflow import DAG 6 | from airflow.decorators import task 7 | from airflow.providers.http.hooks.http import HttpHook 8 | 9 | from common_airflow_dag import run_flasky_task 10 | 11 | os.environ["no_proxy"] = "*" 12 | 13 | with DAG('job_market_analytics_hourly_dag', 14 | description='Job Market Analytics Hourly DAG', 15 | schedule_interval='@hourly', 16 | start_date=datetime(2022, 1, 1), 17 | dagrun_timeout=timedelta(minutes=60), 18 | max_active_runs=1, 19 | catchup=False) as dag: 20 | @task(task_id="check_vpn_status") 21 | def check_vpn_status(): 22 | HttpHook(method='GET').run('do/check_vpn_status') 23 | 24 | 25 | @task(task_id="list_downloaded_job_descriptions") 26 | def list_downloaded_job_descriptions(): 27 | run_flasky_task('do/list_downloaded_job_descriptions') 28 | 29 | 30 | @task(task_id="download_sitemap", retries=1) 31 | def download_sitemap(): 32 | run_flasky_task('do/download_sitemap') 33 | 34 | 35 | @task(task_id="list_job_descriptions_to_download") 36 | def list_job_descriptions_to_download(): 37 | run_flasky_task('do/list_job_descriptions_to_download') 38 | 39 | 40 | @task(task_id="download_job_descriptions") 41 | def download_job_descriptions(): 42 | run_flasky_task('do/download_job_descriptions') 43 | 44 | 45 | chain(check_vpn_status() >> [list_downloaded_job_descriptions(), 46 | download_sitemap()] >> \ 47 | list_job_descriptions_to_download() >> download_job_descriptions()) 48 | -------------------------------------------------------------------------------- /python/utils/migrate_to_raw_v3.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | import pandas as pd 6 | 7 | from common.env_variables import LATEST_LOAD_TIMESTAMP, RAW_DIR 8 | from common.storage import DATA_SOURCE_NAME, save_temp_df, load_temp_df 9 | 10 | 11 | def list_raw_files(data_source, entity): 12 | dir_path = os.path.join(RAW_DIR, data_source, entity) 13 | file_list = [{ 14 | 'old_file_path': f, 15 | } for f in glob.iglob(dir_path + '/*/*/*/*/*', recursive=True) if os.path.isfile(f)] 16 | return file_list 17 | 18 | 19 | def list_downloaded_files(load_timestamp) -> pd.DataFrame: 20 | files = list_raw_files(DATA_SOURCE_NAME, 'job_description') 21 | df = pd.DataFrame(files) 22 | save_temp_df(df, load_timestamp, '00_downloaded_raw_job_descriptions.csv') 23 | return df 24 | 25 | 26 | def get_new_file_path(row): 27 | old_file_path = row['old_file_path'] 28 | dirname = os.path.dirname(old_file_path) 29 | basename = os.path.basename(old_file_path) 30 | job_id = basename.rsplit('--', 1) 31 | job_id = job_id[1] 32 | job_id = job_id.split('-') 33 | job_id = job_id[0] 34 | new_file_path = os.path.join(dirname.replace('/raw/', '/raw_v3/'), f'{job_id}.html') 35 | return new_file_path 36 | 37 | 38 | def copy_file(row): 39 | src = row['old_file_path'] 40 | dst = row['new_file_path'] 41 | os.makedirs(os.path.dirname(dst), exist_ok=True) 42 | shutil.copy2(src, dst) 43 | 44 | 45 | def copy_files_to_raw_v2(load_timestamp): 46 | df = load_temp_df(load_timestamp, '00_downloaded_raw_job_descriptions.csv') 47 | df['new_file_path'] = 
df.apply(get_new_file_path, axis=1) 48 | df.apply(copy_file, axis=1) 49 | 50 | 51 | if __name__ == "__main__": 52 | copy_files_to_raw_v2(LATEST_LOAD_TIMESTAMP) 53 | -------------------------------------------------------------------------------- /python/simplescraper/restore_all_backups.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pandas as pd 4 | 5 | from common.entity import RAW_ENTITIES 6 | from common.env_variables import DATA_SOURCE_NAME, SOURCE_DIR 7 | from common.storage import list_raw_days, list_backup_days 8 | 9 | 10 | def get_current_date(): 11 | return datetime.datetime.today().strftime('%Y%m%d') 12 | 13 | 14 | def list_backups_to_restore(entity): 15 | df = pd.DataFrame(list_backup_days(DATA_SOURCE_NAME, entity)) 16 | df_in_raw = pd.DataFrame(list_raw_days(DATA_SOURCE_NAME, entity)) 17 | df_current_date = pd.DataFrame([{ 18 | 'date': get_current_date() 19 | }]) 20 | df = df.drop_duplicates() 21 | df = pd.concat([ 22 | df, 23 | df_in_raw, df_in_raw, 24 | df_current_date, df_current_date 25 | ]).drop_duplicates(keep=False) 26 | return df 27 | 28 | 29 | def print_script_statements(script_name, days_to_restore): 30 | for day_to_restore in days_to_restore: 31 | year = day_to_restore[:4] 32 | month = day_to_restore[4:6] 33 | day = day_to_restore[6:8] 34 | print( 35 | f'/bin/zsh {SOURCE_DIR}/simplescraper/{script_name} {year} {month} {day} || exit 1') 36 | 37 | 38 | def restore_all_backups(): 39 | dfs = [] 40 | for entity in RAW_ENTITIES: 41 | df = list_backups_to_restore(entity) 42 | dfs.append(df) 43 | df = pd.concat(dfs, ignore_index=True) 44 | df = df.drop_duplicates() 45 | df = df.sort_values(by=['date']) 46 | days_to_restore = df['date'].to_list() 47 | print_script_statements('restore_day_backup.sh', days_to_restore) 48 | print() 49 | print_script_statements('verify_day_backup.sh', days_to_restore) 50 | 51 | 52 | if __name__ == "__main__": 53 | restore_all_backups() 54 | -------------------------------------------------------------------------------- /python/simplescraper/do_all_backups.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pandas as pd 4 | 5 | from common.entity import RAW_ENTITIES 6 | from common.env_variables import DATA_SOURCE_NAME, SOURCE_DIR 7 | from common.storage import list_raw_days, list_backup_days 8 | 9 | 10 | def get_current_date(): 11 | return datetime.datetime.today().strftime('%Y%m%d') 12 | 13 | 14 | def list_days_to_backup(entity): 15 | df = pd.DataFrame(list_raw_days(DATA_SOURCE_NAME, entity)) 16 | df_backup_days = pd.DataFrame(list_backup_days(DATA_SOURCE_NAME, entity)) 17 | df_current_date = pd.DataFrame([{ 18 | 'date': get_current_date() 19 | }]) 20 | df = df.drop_duplicates() 21 | df = pd.concat([ 22 | df, 23 | df_backup_days, df_backup_days, 24 | df_current_date, df_current_date 25 | ]).drop_duplicates(keep=False) 26 | return df 27 | 28 | 29 | def print_script_statements(script_name, dates_to_download): 30 | for date_to_download in dates_to_download: 31 | year = date_to_download[:4] 32 | month = date_to_download[4:6] 33 | day = date_to_download[6:8] 34 | print( 35 | f'/bin/zsh {SOURCE_DIR}/simplescraper/{script_name} {year} {month} {day} || exit 1') 36 | 37 | 38 | def do_all_backups(): 39 | dfs = [] 40 | for entity in RAW_ENTITIES: 41 | df = list_days_to_backup(entity) 42 | dfs.append(df) 43 | df = pd.concat(dfs, ignore_index=True) 44 | df = df.drop_duplicates() 45 | df = df.sort_values(by=['date']) 46 
| dates_to_download = df['date'].to_list() 47 | print_script_statements('do_day_backup.sh', dates_to_download) 48 | print() 49 | print_script_statements('verify_day_backup.sh', dates_to_download) 50 | 51 | 52 | if __name__ == "__main__": 53 | do_all_backups() 54 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_daily_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | with DAG('job_market_analytics_daily_dag', 12 | description='Job Market Analytics Daily DAG', 13 | schedule_interval='@daily', 14 | start_date=datetime(2022, 1, 1), 15 | dagrun_timeout=timedelta(minutes=60), 16 | max_active_runs=1, 17 | catchup=True) as dag: 18 | @task(task_id="cleanse_sitemaps") 19 | def cleanse_sitemaps(): 20 | run_flasky_task('do/cleanse_sitemaps') 21 | 22 | 23 | @task(task_id="cleanse_job_descriptions") 24 | def cleanse_job_descriptions(): 25 | run_flasky_task('do/cleanse_job_descriptions') 26 | 27 | 28 | @task(task_id="curate_sitemaps") 29 | def curate_sitemaps(): 30 | run_flasky_task('do/curate_sitemaps') 31 | 32 | 33 | @task(task_id="curate_job_descriptions") 34 | def curate_job_descriptions(): 35 | run_flasky_task('do/curate_job_descriptions') 36 | 37 | 38 | @task(task_id="do_dbt_run") 39 | def dbt_run(): 40 | run_flasky_task('do/do_dbt_run') 41 | 42 | 43 | @task(task_id="do_day_backup") 44 | def backup_day(): 45 | run_flasky_task('do/do_day_backup') 46 | 47 | 48 | @task(task_id="verify_day_backup") 49 | def verify_day_backup(): 50 | run_flasky_task('do/verify_day_backup') 51 | 52 | 53 | @task(task_id="prune_old_raw") 54 | def prune_old_raw(): 55 | run_flasky_task('do/prune_old_raw') 56 | 57 | 58 | t_curate_sitemaps = curate_sitemaps() 59 | t_curate_job_descriptions = curate_job_descriptions() 60 | 61 | cleanse_sitemaps() >> t_curate_sitemaps 62 | cleanse_job_descriptions() >> t_curate_job_descriptions 63 | 64 | [t_curate_sitemaps, t_curate_job_descriptions] >> dbt_run() 65 | 66 | backup_day() >> verify_day_backup() >> prune_old_raw() 67 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_daily_dag_catch_up.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | with DAG('job_market_analytics_daily_catch_up_dag', 12 | description='Job Market Analytics Daily Catch UP DAG', 13 | schedule_interval='@daily', 14 | start_date=datetime(2023, 5, 24), 15 | dagrun_timeout=timedelta(minutes=60), 16 | max_active_runs=1, 17 | max_active_tasks=1, 18 | catchup=True) as dag: 19 | @task(task_id="cleanse_sitemaps") 20 | def cleanse_sitemaps(): 21 | run_flasky_task('do/cleanse_sitemaps') 22 | 23 | 24 | @task(task_id="cleanse_job_descriptions") 25 | def cleanse_job_descriptions(): 26 | run_flasky_task('do/cleanse_job_descriptions') 27 | 28 | 29 | @task(task_id="curate_sitemaps") 30 | def curate_sitemaps(): 31 | run_flasky_task('do/curate_sitemaps') 32 | 33 | 34 | @task(task_id="curate_job_descriptions") 35 | def 
curate_job_descriptions(): 36 | run_flasky_task('do/curate_job_descriptions') 37 | 38 | 39 | @task(task_id="do_dbt_run") 40 | def dbt_run(): 41 | run_flasky_task('do/do_dbt_run') 42 | 43 | 44 | @task(task_id="do_day_backup") 45 | def backup_day(): 46 | run_flasky_task('do/do_day_backup') 47 | 48 | 49 | @task(task_id="verify_day_backup") 50 | def verify_day_backup(): 51 | run_flasky_task('do/verify_day_backup') 52 | 53 | 54 | @task(task_id="prune_old_raw") 55 | def prune_old_raw(): 56 | run_flasky_task('do/prune_old_raw') 57 | 58 | 59 | t_curate_sitemaps = curate_sitemaps() 60 | t_curate_job_descriptions = curate_job_descriptions() 61 | 62 | cleanse_sitemaps() >> t_curate_sitemaps 63 | cleanse_job_descriptions() >> t_curate_job_descriptions 64 | 65 | [t_curate_sitemaps, t_curate_job_descriptions] >> dbt_run() 66 | 67 | backup_day() >> verify_day_backup() >> prune_old_raw() 68 | -------------------------------------------------------------------------------- /doc/metaData-bag.log: -------------------------------------------------------------------------------- 1 | 2022-05-08 21:29:11.685 | DEBUG | __main__:load_and_parse:27 - Parsing (96/213) 2343/3437: 2022/04/20/09-00-00/8205291.html 2 | Traceback (most recent call last): 3 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/parse_job_descriptions.py", line 70, in 4 | parse_job_descriptions() 5 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/parse_job_descriptions.py", line 53, in parse_job_descriptions 6 | df['parsed_content'] = df.apply(load_and_parse, axis=1) 7 | File "/Users/carrion/PycharmProjects/job-market-analytics/venv/lib/python3.8/site-packages/pandas/core/frame.py", line 8740, in apply 8 | return op.apply() 9 | File "/Users/carrion/PycharmProjects/job-market-analytics/venv/lib/python3.8/site-packages/pandas/core/apply.py", line 688, in apply 10 | return self.apply_standard() 11 | File "/Users/carrion/PycharmProjects/job-market-analytics/venv/lib/python3.8/site-packages/pandas/core/apply.py", line 812, in apply_standard 12 | results, res_index = self.apply_series_generator() 13 | File "/Users/carrion/PycharmProjects/job-market-analytics/venv/lib/python3.8/site-packages/pandas/core/apply.py", line 828, in apply_series_generator 14 | results[i] = self.f(v) 15 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/parse_job_descriptions.py", line 28, in load_and_parse 16 | parsed_content = parse_job_description(html_content) 17 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/tasks/parse_job_description.py", line 55, in parse_job_description 18 | job_description = extract_metadata(soup) 19 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/tasks/parse_job_description.py", line 46, in extract_metadata 20 | metadata = flatten_metadata(metadata) 21 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/tasks/parse_job_description.py", line 24, in flatten_metadata 22 | temp_metadata = flatten.pop('metaData') 23 | KeyError: 'metaData' 24 | 25 | Process finished with exit code 1 26 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/cleanse_sitemaps.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import bs4 4 | import pandas as pd 5 | from loguru import logger 6 | 7 | from common.entity import SITEMAP 8 | from common.logging import configure_logger 
9 | from common.storage import get_load_timestamp, load_raw_file, save_cleansed_df, get_load_date, LOAD_TIMESTAMP_FORMAT 10 | from tasks.list_downloaded_sitemaps import list_downloaded_sitemaps 11 | 12 | 13 | def load_and_parse(row): 14 | load_timestamp = row['load_timestamp'] 15 | file_name = row['file_name'] 16 | sitemap_content = load_raw_file(SITEMAP, load_timestamp, file_name) 17 | logger.debug(f'Parsing: {load_timestamp}/{file_name}') 18 | soup = bs4.BeautifulSoup(sitemap_content, 'xml') 19 | urls = [loc.text for loc in soup.findAll('loc')] 20 | return urls 21 | 22 | 23 | def extract_job_id(url_column): 24 | url_split = url_column.str.split('--', expand=True) 25 | return url_split[2].str.split('-', expand=True)[0] 26 | 27 | 28 | def get_date_from_load_timestamp(load_timestamp): 29 | year, month, day, time = load_timestamp.split('/') 30 | return f'{year}-{month}-{day}' 31 | 32 | 33 | def cleanse_sitemaps(load_timestamp, load_date): 34 | configure_logger(load_timestamp) 35 | df = list_downloaded_sitemaps(load_timestamp, load_date) 36 | df[['year', 'month', 'day', 'time']] = df['load_timestamp'].str.split('/', 3, expand=True) 37 | if df.empty: 38 | logger.info('Nothing to parse') 39 | return 40 | df = df.sort_values(by=['load_timestamp', 'file_name']) 41 | df['url'] = df.apply(load_and_parse, axis=1) 42 | df = df.explode('url') 43 | df['job_id'] = extract_job_id(df['url']) 44 | df = df.drop_duplicates(['job_id'], keep='first') 45 | df['load_timestamp'] = pd.to_datetime(df['load_timestamp'], format=LOAD_TIMESTAMP_FORMAT, utc=True) 46 | logger.info(f'Saving cleansed: {df["load_timestamp"].iloc[0]}') 47 | save_cleansed_df(df, SITEMAP) 48 | 49 | 50 | if __name__ == "__main__": 51 | _load_timestamp = sys.argv[1] if len(sys.argv) > 1 else get_load_timestamp() 52 | _load_date = sys.argv[2] if len(sys.argv) > 2 else get_load_date() 53 | cleanse_sitemaps(_load_timestamp, _load_date) 54 | -------------------------------------------------------------------------------- /python/utils/migrate_raw_v1_to_raw_v2.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import glob 3 | import os 4 | import shutil 5 | 6 | import pandas as pd 7 | 8 | from common.env_variables import LATEST_LOAD_TIMESTAMP, RAW_DIR, DATA_DIR 9 | from common.storage import DATA_SOURCE_NAME, save_temp_df, load_temp_df 10 | 11 | 12 | def list_raw_files(data_source): 13 | dir_path = os.path.join(RAW_DIR, data_source) 14 | file_list = [{ 15 | 'old_file_path': f, 16 | 'entity': f.split('/')[-3], 17 | 'timestamp': datetime.datetime.fromtimestamp(os.stat(f).st_birthtime), 18 | 'file_name': f.split('/')[-1], 19 | } for f in glob.iglob(dir_path + '/*/*/*', recursive=True) if os.path.isfile(f)] 20 | return file_list 21 | 22 | 23 | def list_downloaded_files(load_timestamp) -> pd.DataFrame: 24 | files = list_raw_files(DATA_SOURCE_NAME) 25 | df = pd.DataFrame(files) 26 | # df = df[df['file_name'] != 'sitemapindex.xml'] 27 | save_temp_df(df, load_timestamp, '00_downloaded_raw_files.csv') 28 | return df 29 | 30 | 31 | def timestamp_to_datatime_partition(timestamp): 32 | timestamp = str(timestamp) 33 | split1, split2 = timestamp.split() 34 | year, month, day = split1.split('-') 35 | hour = split2[:2] 36 | datatime_partition = f'{year}/{month}/{day}/{hour}-00-00' 37 | return datatime_partition 38 | 39 | 40 | def get_new_file_path(row): 41 | new_file_path = os.path.join(DATA_DIR, 'raw_v2', DATA_SOURCE_NAME, row['entity'], row['datatime_partition'], 42 | row['file_name']) 43 | return 
new_file_path 44 | 45 | 46 | def copy_file(row): 47 | src = row['old_file_path'] 48 | dst = row['new_file_path'] 49 | os.makedirs(os.path.dirname(dst), exist_ok=True) 50 | shutil.copy2(src, dst) 51 | 52 | 53 | def copy_files_to_raw_v2(load_timestamp): 54 | df = load_temp_df(load_timestamp, '00_downloaded_raw_files.csv') 55 | df['datatime_partition'] = df['timestamp'].apply(timestamp_to_datatime_partition) 56 | df['new_file_path'] = df.apply(get_new_file_path, axis=1) 57 | df.apply(copy_file, axis=1) 58 | 59 | 60 | if __name__ == "__main__": 61 | list_downloaded_files(LATEST_LOAD_TIMESTAMP) 62 | copy_files_to_raw_v2(LATEST_LOAD_TIMESTAMP) 63 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/cleanse_job_descriptions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pandas as pd 4 | 5 | from common.entity import JOB_DESCRIPTION 6 | from common.logging import logger, configure_logger 7 | from common.storage import get_load_timestamp, load_raw_file, save_cleansed_df, get_load_date, LOAD_TIMESTAMP_FORMAT 8 | from tasks.list_downloaded_job_descriptions import list_downloaded_job_descriptions 9 | from tasks.parse_job_description import parse_job_description 10 | 11 | 12 | def load_and_parse(row) -> str: 13 | load_timestamp = row['load_timestamp'] 14 | file_name = row['file_name'] 15 | html_content = load_raw_file(JOB_DESCRIPTION, load_timestamp, file_name) 16 | try: 17 | logger.debug(f'Parsing {load_timestamp}/{file_name}') 18 | parsed_content = parse_job_description(html_content) 19 | return parsed_content 20 | except AttributeError: 21 | logger.warning(f'The following file could not be parsed: {load_timestamp}/{file_name}') 22 | return '' 23 | 24 | 25 | def cleanse_job_descriptions(load_timestamp, load_date): 26 | configure_logger(load_timestamp) 27 | df = list_downloaded_job_descriptions(load_timestamp, load_date) 28 | if df.empty: 29 | logger.warning(f'Nothing to cleanse for the load date: {load_date}') 30 | return 31 | df = df.sort_values(by=['load_timestamp', 'file_name']) 32 | df = df.reset_index(drop=True) 33 | logger.info(f'Start to parse job descriptions for the load date: {load_date}') 34 | df['parsed_content'] = df.apply(load_and_parse, axis=1) 35 | df = df.join(pd.json_normalize(df['parsed_content'])) 36 | df = df.drop(columns=['parsed_content']) 37 | df[['year', 'month', 'day', 'hour']] = df['load_timestamp'].str.split('/', 3, expand=True) 38 | df['load_timestamp'] = pd.to_datetime(df['load_timestamp'], format=LOAD_TIMESTAMP_FORMAT, utc=True) 39 | logger.info(f'Finish to parse job descriptions for the load date: {load_date}') 40 | save_cleansed_df(df, JOB_DESCRIPTION) 41 | 42 | 43 | if __name__ == "__main__": 44 | _load_timestamp = sys.argv[1] if len(sys.argv) > 1 else get_load_timestamp() 45 | _load_date = sys.argv[2] if len(sys.argv) > 2 else get_load_date() 46 | cleanse_job_descriptions(_load_timestamp, _load_date) 47 | -------------------------------------------------------------------------------- /sql/dwh/requirements.txt: -------------------------------------------------------------------------------- 1 | agate==1.6.3 2 | appnope==0.1.3 3 | argon2-cffi==21.3.0 4 | argon2-cffi-bindings==21.2.0 5 | asttokens==2.0.8 6 | attrs==22.1.0 7 | Babel==2.10.3 8 | backcall==0.2.0 9 | beautifulsoup4==4.11.1 10 | bleach==5.0.1 11 | certifi==2022.9.14 12 | cffi==1.15.1 13 | charset-normalizer==2.1.1 14 | click==8.1.3 15 | colorama==0.4.4 16 | dbt-core==1.5.0 17 | 
dbt-duckdb==1.5.1 18 | dbt-extractor==0.4.1 19 | debugpy==1.6.3 20 | decorator==5.1.1 21 | defusedxml==0.7.1 22 | duckdb==0.7.0 23 | entrypoints==0.4 24 | executing==1.0.0 25 | fastjsonschema==2.16.1 26 | future==0.18.2 27 | hologram==0.0.15 28 | idna==3.4 29 | ipykernel==6.15.3 30 | ipython==8.5.0 31 | ipython-genutils==0.2.0 32 | ipywidgets==8.0.2 33 | isodate==0.6.1 34 | jedi==0.18.1 35 | Jinja2==3.1.2 36 | jsonschema==3.2.0 37 | jupyter==1.0.0 38 | jupyter-console==6.4.4 39 | jupyter-core==4.11.1 40 | jupyter_client==7.3.5 41 | jupyterlab-pygments==0.2.2 42 | jupyterlab-widgets==3.0.3 43 | leather==0.3.4 44 | Logbook==1.5.3 45 | MarkupSafe==2.0.1 46 | mashumaro==3.6 47 | matplotlib-inline==0.1.6 48 | minimal-snowplow-tracker==0.0.2 49 | mistune==0.8.4 50 | msgpack==1.0.4 51 | nbclient==0.5.13 52 | nbconvert==6.4.5 53 | nbformat==5.5.0 54 | nest-asyncio==1.5.5 55 | networkx==2.8.3 56 | notebook==6.4.12 57 | numpy==1.23.3 58 | packaging==21.3 59 | pandas==1.4.4 60 | pandocfilters==1.5.0 61 | parsedatetime==2.4 62 | parso==0.8.3 63 | pathspec==0.9.0 64 | patsy==0.5.2 65 | pexpect==4.8.0 66 | pickleshare==0.7.5 67 | plotly==5.10.0 68 | plotly-calplot==0.1.12 69 | plotly-express==0.4.1 70 | prometheus-client==0.14.1 71 | prompt-toolkit==3.0.31 72 | protobuf==4.23.1 73 | psutil==5.9.2 74 | ptyprocess==0.7.0 75 | pure-eval==0.2.2 76 | pycparser==2.21 77 | Pygments==2.13.0 78 | pyparsing==3.0.9 79 | pyrsistent==0.18.1 80 | python-dateutil==2.8.2 81 | python-dotenv==0.21.0 82 | python-slugify==6.1.2 83 | pytimeparse==1.1.8 84 | pytz==2022.2.1 85 | PyYAML==6.0 86 | pyzmq==24.0.0 87 | qtconsole==5.3.2 88 | QtPy==2.2.0 89 | requests==2.28.1 90 | scipy==1.9.1 91 | Send2Trash==1.8.0 92 | six==1.16.0 93 | soupsieve==2.3.2.post1 94 | sqlparse==0.4.2 95 | stack-data==0.5.0 96 | statsmodels==0.13.2 97 | tenacity==8.0.1 98 | terminado==0.15.0 99 | testpath==0.6.0 100 | text-unidecode==1.3 101 | tornado==6.2 102 | traitlets==5.4.0 103 | typing_extensions==4.3.0 104 | urllib3==1.26.12 105 | wcwidth==0.2.5 106 | webencodings==0.5.1 107 | Werkzeug==2.1.2 108 | widgetsnbextension==4.0.3 109 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/parse_job_description.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | from bs4 import BeautifulSoup 5 | 6 | SPACE_CHAR = ' ' 7 | NBSP_CHAR = u'\xa0' 8 | 9 | METADATA_JSON_PREFIX = 'window.__PRELOADED_STATE__.HeaderStepStoneBlock = ' 10 | METADATA_JSON_SUFFIX = ';' 11 | 12 | FIELD_SELECTORS = { 13 | #'company_name': '.at-header-company-name', 14 | # 'description': 'div[itemprop="description"]', 15 | 'description_introduction': '.at-section-text-introduction', 16 | 'description_responsabilities': '.at-section-text-description-content', 17 | 'description_requirements': '.at-section-text-profile-content', 18 | 'description_perks': '.at-section-text-weoffer-content', 19 | } 20 | 21 | 22 | def flatten_metadata(metadata): 23 | flatten = metadata.copy() 24 | temp_metadata = flatten.pop('metaData') 25 | flatten.update(temp_metadata) 26 | return flatten 27 | 28 | 29 | def keys_to_snake_case(metadata): 30 | snake_case_object = {} 31 | for old_key in metadata.keys(): 32 | # https://stackoverflow.com/questions/60148175/convert-camelcase-to-snakecase 33 | new_key = re.sub(r'(? 
ONE_HOUR: 23 | raise Exception('The load_timestamp is older than one hour') 24 | 25 | 26 | def historize_url_content(url, content, load_timestamp): 27 | file_name = url.split('/')[-1] 28 | save_raw_file(content, SITEMAP, load_timestamp, file_name) 29 | 30 | 31 | def get_and_historize_url_content(url, load_timestamp): 32 | content = get_url_content(url) 33 | historize_url_content(url, content, load_timestamp) 34 | return content 35 | 36 | 37 | def get_listing_urls(load_timestamp): 38 | web_content = get_and_historize_url_content(SITEMAP_INDEX_XML, load_timestamp) 39 | web_content = xmltodict.parse(web_content) 40 | web_content = web_content['sitemapindex'] 41 | web_content = web_content['sitemap'] 42 | listing_urls = [] 43 | for entry in web_content: 44 | url = entry['loc'] 45 | if 'listings' in url: 46 | listing_urls.append(url) 47 | return listing_urls 48 | 49 | 50 | def get_job_description_urls(web_content): 51 | web_content = xmltodict.parse(web_content) 52 | web_content = web_content['urlset'] 53 | url_entries = web_content['url'] 54 | urls = [] 55 | for entry in url_entries: 56 | url = entry['loc'] 57 | urls.append(url) 58 | 59 | return urls 60 | 61 | 62 | def get_all_job_description_urls(load_timestamp): 63 | listing_urls = get_listing_urls(load_timestamp) 64 | job_description_urls = [] 65 | for listing_url in listing_urls: 66 | web_content = get_and_historize_url_content(listing_url, load_timestamp) 67 | job_description_urls.extend(get_job_description_urls(web_content)) 68 | return job_description_urls 69 | 70 | 71 | def convert_urls_to_df(all_job_description_urls) -> pd.DataFrame: 72 | df = pd.DataFrame(all_job_description_urls, columns=['url']) 73 | 74 | df = df.drop_duplicates() 75 | url_split = df['url'].str.split('--', expand=True) 76 | df['name_slug'] = url_split[1] 77 | df['id'] = url_split[2].str.split('-', expand=True)[0] 78 | df = df.sort_values(by=['id'], ascending=False) 79 | 80 | return df 81 | 82 | 83 | def download_sitemap(load_timestamp) -> pd.DataFrame: 84 | configure_logger(load_timestamp) 85 | check_load_timestamp(load_timestamp) 86 | logger.info('download_sitemap: start') 87 | all_job_description_urls = get_all_job_description_urls(load_timestamp) 88 | df = convert_urls_to_df(all_job_description_urls) 89 | save_temp_df(df, load_timestamp, SITEMAP_URLS_CSV) 90 | logger.info('download_sitemap: end') 91 | return df 92 | 93 | 94 | if __name__ == '__main__': 95 | download_sitemap(LATEST_LOAD_TIMESTAMP) 96 | -------------------------------------------------------------------------------- /python/dashy/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.10 3 | # To update, run: 4 | # 5 | # pip-compile --allow-unsafe requirements.in 6 | # 7 | ansi2html==1.8.0 8 | # via jupyter-dash 9 | appnope==0.1.3 10 | # via 11 | # ipykernel 12 | # ipython 13 | asttokens==2.0.8 14 | # via stack-data 15 | backcall==0.2.0 16 | # via ipython 17 | brotli==1.0.9 18 | # via flask-compress 19 | certifi==2022.9.24 20 | # via requests 21 | charset-normalizer==2.1.1 22 | # via requests 23 | click==8.1.3 24 | # via flask 25 | dash==2.6.2 26 | # via 27 | # -r requirements.in 28 | # dash-bootstrap-components 29 | # jupyter-dash 30 | dash-bootstrap-components==1.2.1 31 | # via -r requirements.in 32 | dash-core-components==2.0.0 33 | # via dash 34 | dash-html-components==2.0.0 35 | # via dash 36 | dash-table==5.0.0 37 | # via dash 38 | debugpy==1.6.3 39 | # via ipykernel 40 | 
decorator==5.1.1 41 | # via ipython 42 | duckdb==0.7.0 43 | # via -r requirements.in 44 | entrypoints==0.4 45 | # via jupyter-client 46 | executing==1.1.0 47 | # via stack-data 48 | flask==2.2.2 49 | # via 50 | # dash 51 | # flask-compress 52 | # jupyter-dash 53 | flask-compress==1.13 54 | # via dash 55 | gunicorn==20.1.0 56 | # via -r requirements.in 57 | idna==3.4 58 | # via requests 59 | ipykernel==6.16.0 60 | # via jupyter-dash 61 | ipython==8.5.0 62 | # via 63 | # ipykernel 64 | # jupyter-dash 65 | itsdangerous==2.1.2 66 | # via flask 67 | jedi==0.18.1 68 | # via ipython 69 | jinja2==3.1.2 70 | # via flask 71 | jupyter-client==7.3.5 72 | # via ipykernel 73 | jupyter-core==4.11.1 74 | # via jupyter-client 75 | jupyter-dash==0.4.2 76 | # via -r requirements.in 77 | loguru==0.6.0 78 | # via -r requirements.in 79 | markupsafe==2.1.1 80 | # via 81 | # jinja2 82 | # werkzeug 83 | matplotlib-inline==0.1.6 84 | # via 85 | # ipykernel 86 | # ipython 87 | nest-asyncio==1.5.6 88 | # via 89 | # ipykernel 90 | # jupyter-client 91 | # jupyter-dash 92 | numpy==1.23.3 93 | # via pandas 94 | packaging==21.3 95 | # via ipykernel 96 | pandas==1.5.0 97 | # via -r requirements.in 98 | parso==0.8.3 99 | # via jedi 100 | pexpect==4.8.0 101 | # via ipython 102 | pickleshare==0.7.5 103 | # via ipython 104 | plotly==5.10.0 105 | # via dash 106 | prompt-toolkit==3.0.31 107 | # via ipython 108 | psutil==5.9.2 109 | # via ipykernel 110 | ptyprocess==0.7.0 111 | # via pexpect 112 | pure-eval==0.2.2 113 | # via stack-data 114 | pygments==2.13.0 115 | # via ipython 116 | pyparsing==3.0.9 117 | # via packaging 118 | python-dateutil==2.8.2 119 | # via 120 | # jupyter-client 121 | # pandas 122 | python-dotenv==0.21.0 123 | # via -r requirements.in 124 | pytz==2022.4 125 | # via pandas 126 | pyzmq==24.0.1 127 | # via 128 | # ipykernel 129 | # jupyter-client 130 | requests==2.28.1 131 | # via jupyter-dash 132 | retrying==1.3.3 133 | # via jupyter-dash 134 | six==1.16.0 135 | # via 136 | # python-dateutil 137 | # retrying 138 | stack-data==0.5.1 139 | # via ipython 140 | tenacity==8.1.0 141 | # via plotly 142 | tornado==6.2 143 | # via 144 | # ipykernel 145 | # jupyter-client 146 | traitlets==5.4.0 147 | # via 148 | # ipykernel 149 | # ipython 150 | # jupyter-client 151 | # matplotlib-inline 152 | urllib3==1.26.12 153 | # via requests 154 | wcwidth==0.2.5 155 | # via prompt-toolkit 156 | werkzeug==2.2.2 157 | # via flask 158 | 159 | # The following packages are considered to be unsafe in a requirements file: 160 | setuptools==65.4.1 161 | # via gunicorn 162 | -------------------------------------------------------------------------------- /doc/TODO.md: -------------------------------------------------------------------------------- 1 | # TO DO 2 | 3 | ## Open 4 | 5 | - [ ] Implement use case: Location/Company/Technology changelog 6 | - [ ] Add the next data source 7 | - [ ] Slugify the value of the filter selectors 8 | - [ ] Upload only backup files to the Azure Blob Storage 9 | - [ ] Implement use case: Number of jobs relative to city population 10 | - [ ] Add the flag to the do and verify backup commands: --exclude='.DS_Store' 11 | - [ ] Add a file in the raw layer with the scrape run information for each execution 12 | - This file could be in JSON format and have the following fields: 13 | - run_id 14 | - timestamp 15 | - number of urls to download 16 | - number of urls downloaded 17 | - number of failed urls 18 | - failed urls (a list of string) 19 | 20 | ## In Progress 21 | 22 | 23 | ## Done 24 | 25 | - [x] Display 
more than 12 months 26 | - [x] Let users use interactive graphs instead of static plots 27 | - [x] Let users start the y-axis at zero 28 | - [x] Make Dashy public with the domain https://jobmarketanalytics.com/ 29 | - [x] Cache SQL query executions on Dashy 30 | - [x] Implement use case: Compare technologies 31 | - [x] Have 3 materialized tables for Dashy with different time durations to improve the performance 32 | - [x] Use stateful URLs according to the state of the input components on Dashy 33 | - [x] Use LocalExecutor in Airflow 34 | - [x] Run Airflow locally to reduce the Docker overhead 35 | - [x] Implement use case: Technology trends 36 | - [x] Add a size indicator in the filter options in Dashy 37 | - [x] Implement some kind of search/dashboard for external users 38 | - [x] Check out https://github.com/rilldata/rill-developer 39 | - [x] Decide on a BI tool 40 | - [x] Check out https://superset.apache.org/ 41 | - [x] Create a separate virtual environment for dbt 42 | - [x] Check out https://www.linkedin.com/in/christian-kaul/recent-activity/posts/ 43 | - [x] Check out https://dbtvault.readthedocs.io/ 44 | - [x] Check out https://github.com/jwills/dbt-duckdb 45 | - [x] Use Gunicorn to run flasky with 4 workers 46 | - [x] On the cleansed layer, add the first sitemap occurrence per URL instead of only the latest load_timestamp 47 | - [x] Add load_timestamp and load_date to the curated layer 48 | - [x] Rename target_date to load_date 49 | - [x] Rename run_timestamp to load_timestamp 50 | - [x] Fail the download sitemap task in the hourly dag if the load_timestamp is older than one hour 51 | - [x] Create a separate virtual environment for Airflow 52 | - [x] Fix the issue "metaData-bag.log" 53 | - [x] Find a better way to keep Airflow from hanging when there are many jobs to download 54 | - [x] Move the raw storage to the cloud 55 | - [x] Improve logging 56 | - Log how many URLs there are to download 57 | - Make the VPN check more visible 58 | - [x] Download the job description again after a configurable number of days online 59 | - [x] Create a report that shows how many days a job offer is online 60 | - [x] Create a report that shows how many job offers are online at a given time 61 | - [x] Find a better timestamp to use than the logical timestamp for the scrape data source dag 62 | - [x] Fix a bug with file names longer than 255 characters 63 | - [x] Fix logs in Flasky 64 | - [x] Add more granularity to the ingestion time in the raw data 65 | - [x] Add orchestration with Airflow 66 | - [x] Create the Data Vault 67 | - [x] Optimize the function to create the chunks 68 | - [x] Add a check for the network connection before we start crawling 69 | - [x] Save the whole HTML document from the source instead of just a fragment of it, so that no information is lost if 70 | the HTML format changes 71 | - [x] Add logging to the sitemap scraper 72 | - [x] Find a way to pass the list of parquet files to PostgreSQL.
73 | - Result: Use Python to create the staging fdw staging tables referencing the parquet files 74 | - [x] Add the _job_id_ to the _sitemap_ and _job_description_ on the cleansed layer 75 | - [x] Create a _ingestion_id_ with the hash of the _job_id_ and _timestap_ on the cleansed layer 76 | 77 | --- 78 | 79 | ## Discarded 80 | 81 | - [x] Try https://xapian.org/ for the search 82 | - [x] Replace the PostgreSQL ingestion with CSV instead of Parquet 83 | - [x] Do not let Flasky start a process behind an endpoint, if a process is still running 84 | - [x] Try Prefect 85 | - [x] Log the date and time more visible 86 | - [x] Allow one retry after the browser crashes 87 | 88 | ## Technical Debt 89 | 90 | - [ ] Rename job_description to job_offer 91 | - [ ] Rename cleansed to curated 92 | -------------------------------------------------------------------------------- /python/tests/data/normalize_job_description/output/test_case_7610222.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Anlagenmechaniker für Sanitär-, Heizungs- und Klimatechnik (m/w/d)", 3 | "online_status": "online", 4 | "is_anonymous": false, 5 | "job_id": 7610222, 6 | "should_display_early_applicant": false, 7 | "location": "Hamburg (Hammerbrook)", 8 | "contract_type": "Feste Anstellung", 9 | "work_type": "Vollzeit", 10 | "online_date": "2021-10-13T15:54:04Z", 11 | "company_name": "ENGIE Deutschland GmbH", 12 | "description": "ÜBER UNS:Als Komplett-Dienstleister im Bereich Facility Solutions sichern wir den optimalen Betrieb von Gebäuden und Anlagen. Wir bieten modulare Leistungspakete von Service und Wartung über Instandhaltung bis hin zur Bewirtschaftung komplexer Liegenschaften. Für unsere Multi-Site-Kunden arbeiten wir als überregionaler oder auch internationaler Facility-Management-Partner.IHRE AUFGABEN:Wir suchen Servicetechniker bzw. Anlagenmechaniker für die Gewerke Heizung, Klima, Lüftung, Sanitär oder Kälte für die Wartung, Instandsetzung und Bedienung der haustechnischen Anlagen bei unserem Kunden vor Ort.\nSie arbeiten an einem festen Objekt, sodass keine Reisetätigkeit anfällt.\n\nBetreiben der gebäudetechnischen Anlagen an einem anspruchsvollen Industriestandort\nOrganisation, Steuerung, Kontrolle und selbstständige Durchführung von Wartungs- und Instandsetzungsarbeiten an gebäudetechnischen Anlagen\nOptimierung der bestehenden Anlagentechnik und der Betriebsabläufe\nErstellung und Dokumentation der täglichen Arbeitsleistung über mobile Endgeräte\nKoordination und Begleitung von Nachunternehmern\nErster Ansprechpartner vor Ort für unsere Kunden im operative Tagesgeschäft\nIHR PROFIL:\nAbgeschlossene Berufsausbildung als Anlagenmechaniker für Sanitär-, Heizungs- und Klimatechnik oder als Zentralheizungs- und Lüftungsbauer, Gas-Wasserinstallateur oder Kältetechniker\nMehrjährige Berufserfahrungen im Bereich der Technischen Gebäudeausrüstung\nKunden- und Dienstleistungsorientierung gepaart mit Spaß an der Arbeit im Team\nGeregelten Arbeitszeiten mit gelegentlichen Bereitschaftsdiensten\nFührerschein der Klasse B\nIHRE BENEFITS:\nAkademie\nAltersvorsorge\nCorporate Benefits\nPerspektiven\nFirmenfeiern\nFlexible Arbeitszeiten\nGestaltungsfreiheit\nHohe Sicherheitsstandards\nInternationalität\nSpannende Projekte\nTeamgeist\nAttraktive Vergütung\nIHR JOB?Werden auch Sie ein ENGIEneer und gestalten Sie zusammen mit uns die Zukunft der Energiewende. 
Wir sind gespannt auf Ihre Online-Bewerbung!\n IHR KONTAKT:\nMonika Brzenska\nTalent Acquisition Specialist\nTelefon: 0221 46 90 54 29 \n \nKENNZIFFER: 2021-0476", 13 | "description_introduction": "ÜBER UNS:Als Komplett-Dienstleister im Bereich Facility Solutions sichern wir den optimalen Betrieb von Gebäuden und Anlagen. Wir bieten modulare Leistungspakete von Service und Wartung über Instandhaltung bis hin zur Bewirtschaftung komplexer Liegenschaften. Für unsere Multi-Site-Kunden arbeiten wir als überregionaler oder auch internationaler Facility-Management-Partner.", 14 | "description_responsabilities": "Wir suchen Servicetechniker bzw. Anlagenmechaniker für die Gewerke Heizung, Klima, Lüftung, Sanitär oder Kälte für die Wartung, Instandsetzung und Bedienung der haustechnischen Anlagen bei unserem Kunden vor Ort.\nSie arbeiten an einem festen Objekt, sodass keine Reisetätigkeit anfällt.\n\nBetreiben der gebäudetechnischen Anlagen an einem anspruchsvollen Industriestandort\nOrganisation, Steuerung, Kontrolle und selbstständige Durchführung von Wartungs- und Instandsetzungsarbeiten an gebäudetechnischen Anlagen\nOptimierung der bestehenden Anlagentechnik und der Betriebsabläufe\nErstellung und Dokumentation der täglichen Arbeitsleistung über mobile Endgeräte\nKoordination und Begleitung von Nachunternehmern\nErster Ansprechpartner vor Ort für unsere Kunden im operative Tagesgeschäft", 15 | "description_requirements": "Abgeschlossene Berufsausbildung als Anlagenmechaniker für Sanitär-, Heizungs- und Klimatechnik oder als Zentralheizungs- und Lüftungsbauer, Gas-Wasserinstallateur oder Kältetechniker\nMehrjährige Berufserfahrungen im Bereich der Technischen Gebäudeausrüstung\nKunden- und Dienstleistungsorientierung gepaart mit Spaß an der Arbeit im Team\nGeregelten Arbeitszeiten mit gelegentlichen Bereitschaftsdiensten\nFührerschein der Klasse B", 16 | "description_perks": "Akademie\nAltersvorsorge\nCorporate Benefits\nPerspektiven\nFirmenfeiern\nFlexible Arbeitszeiten\nGestaltungsfreiheit\nHohe Sicherheitsstandards\nInternationalität\nSpannende Projekte\nTeamgeist\nAttraktive Vergütung" 17 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Job Market Analytics 2 | 3 | The aim of this project is to develop an end-to-end Data Platform to explore and learn new technologies. 4 | 5 | ## Architecture 6 | 7 | ![Architecture Overview](doc/architecture-overview.drawio.svg) 8 | 9 | ## Storage 10 | 11 | ### Data Lake 12 | 13 | The Data Lake is basically a file system on my local computer, but could be easily transfered to a Cloud Blob Storage ( 14 | like AWS S3 or Azure Blob Storage) if needed. The current Data Lake we have two layers: 15 | 16 | - The **Raw Layer**, where the information from the data source are stored in the same file format as ingested (e.g. 17 | HTML or XML). 18 | - The **Cleansed Layer**, where we store the information in Parquet, which means that the information is stored in a 19 | tabular format with well-defined columns. 20 | 21 | ### Data Warehouse 22 | 23 | The Data Warehouse is based on PostgreSQL plus an extension in order to read Parquet files as foreign tables. PostgreSQL 24 | might not be the best choice for a datawarehouse since it is row-column-oriented but in this case we have reduced number 25 | of columns and a relative small data size. 
Another advantage of PostgreSQL is that I can run it easily on my computer 26 | via Docker so that I can avoid cloud service costs. We will divide the data warehouse into 3 schemas: 27 | 28 | - **Staging**, which basically consists of foreign tables referencing the Parquet files on the Data Lake Cleansed Layer. 29 | - **Data Vault**, where the data is modelled and historized using 30 | the [Data Vault Specification](https://danlinstedt.com/wp-content/uploads/2018/06/DVModelingSpecs2-0-1.pdf). 31 | - **Data Mart**, which will be the consuming layer for our BI Tool. 32 | 33 | ### Data Vault Model 34 | 35 | ![Data Vault Model](doc/data-vault-model.drawio.svg) 36 | 37 | ### Mart Model 38 | 39 | ![Mart Model](doc/mart-model.drawio.svg) 40 | 41 | ### Data Lineage (dbt Dag) 42 | 43 | ![Data Lineage (dbt Dag)](doc/dbt-dag.png) 44 | 45 | ## Computing 46 | 47 | In order to compute the data, we use two different approaches. 48 | 49 | - **Python** for the data ingestion, when we crawl and scrape data directly from the data source. And also for the data 50 | transformation from the Raw to the Cleansed layer. All Python code is divided into atomic tasks, and these are 51 | orchestrated by [Airflow](https://airflow.apache.org/). 52 | - **SQL** for the transformations of the data inside the Data Warehouse. The SQL tasks are automated and orchestrated 53 | by [dbt](https://www.getdbt.com/). 54 | 55 | ### Data Source Scraping 56 | 57 | In order to download the data from the data source, we run the following Airflow DAG: 58 | 59 | ![Scrape Data Source Dag](doc/scrape_data_source_dag.png) 60 | 61 | First, we make sure that we are connected to the VPN, then we download and archive the list of the jobs that are online at 62 | the moment from the sitemap, and we list out which job descriptions we have not downloaded yet, and then we download 63 | them via browser automation with [Playwright](https://playwright.dev/). 64 | 65 | ### Data Transformation Orchestration 66 | 67 | The data transformation is orchestrated as an Airflow DAG, which runs on a daily basis and combines Python transformation 68 | jobs and the dbt run to build up the incremental models. 69 | 70 | ![Airflow DAG Daily](doc/airflow_dag_daily.png) 71 | 72 | ## Frequently Asked Questions 73 | 74 | ### What questions can be answered with this project? 75 | 76 | Here are some examples of what we can answer: 77 | 78 | - How long is a job offer online until it is taken offline? 79 | - Which technologies are the most demanded at the moment? 80 | - How does the demand for a particular technology evolve over time? 81 | - How many job offers are remote, and how is this evolving over time? 82 | - When was a particular job offer first published? 83 | 84 | ### Could we answer those questions with a simpler technology stack? 85 | 86 | Yes, we could. But the point of the project is to explore and learn new technologies and concepts, therefore it has been 87 | over-engineered on purpose. 88 | 89 | ### Are you planning to create a public Web or Mobile Application with this? 90 | 91 | No, at least not at the moment. This is just for educational purposes. 92 | 93 | ### Why did you choose Parquet as the file format for the Cleansed Layer in the Data Lake? 94 | 95 | I chose Parquet because it is a column-oriented compressed file type, which has been battle-tested. Good Python 96 | libraries, like [Pyarrow](https://arrow.apache.org/docs/python/parquet.html), are available to write and read it. 97 | 98 | ### Why did you choose PostgreSQL for the Data Warehouse?
99 | 100 | PostgreSQL is a very robust database with standard SQL that can run locally, and its performance is good enough for the 101 | current data size and number of columns. 102 | 103 | ### How big is your data? 104 | 105 | It is around 530 GB in raw format after scraping the data sources since October 2021, and it grows by around 2 GB 106 | every day. After cleansing and compressing to Parquet, the data is around 30 times smaller, since we can get rid of a 107 | great deal of HTML, CSS and JS because they do not provide any extra information for my use cases. 108 | 109 | ![Raw Storage Size in Azure Blob Container](doc/raw-in-azure-blob-storage.png) 110 | -------------------------------------------------------------------------------- /python/.flake8: -------------------------------------------------------------------------------- 1 | # All configuration for plugins and other utils is defined here. 2 | # Read more about `setup.cfg`: 3 | # https://docs.python.org/3/distutils/configfile.html 4 | 5 | 6 | # === Linter configuration === 7 | # You can reuse this configuration in your own projects. 8 | # See: https://wemake-python-stylegui.de/en/latest/pages/usage/integrations/nitpick.html 9 | 10 | [flake8] 11 | # Base flake8 configuration: 12 | # https://flake8.pycqa.org/en/latest/user/configuration.html 13 | format = wemake 14 | show-source = True 15 | statistics = False 16 | doctests = True 17 | 18 | # Plugins: 19 | max-complexity = 6 20 | max-line-length = 120 21 | 22 | # darglint configuration: 23 | # https://github.com/terrencepreilly/darglint 24 | strictness = long 25 | docstring-style = numpy 26 | 27 | # Self settings: 28 | max-imports = 17 29 | 30 | # Excluding some directories: 31 | exclude = 32 | .git 33 | __pycache__ 34 | .venv 35 | .eggs 36 | *.egg 37 | dist 38 | # These folders contain code badly written for reasons: 39 | # Project specific, do not copy.
40 | tests/fixtures/** 41 | tests/**/snapshots/** 42 | 43 | # Exclude some pydoctest checks globally: 44 | ignore = D100, D104, D401, W504, RST303, RST304, DAR103, DAR203, E800, D103, WPS421, WPS305 45 | 46 | per-file-ignores = 47 | # These function names are part of 3d party API: 48 | wemake_python_styleguide/visitors/ast/*.py: N802 49 | # These modules should contain a lot of classes: 50 | wemake_python_styleguide/violations/*.py: WPS202 51 | # Eval is a complex task: 52 | wemake_python_styleguide/logic/safe_eval.py: WPS232 53 | # This module should contain magic numbers: 54 | wemake_python_styleguide/options/defaults.py: WPS432 55 | # Checker has a lot of imports: 56 | wemake_python_styleguide/checker.py: WPS201 57 | # Allows mypy type hinting, `Ellipsis`` usage, multiple methods: 58 | wemake_python_styleguide/types.py: D102, WPS214, WPS220, WPS428 59 | # There are multiple fixtures, `assert`s, and subprocesses in tests: 60 | tests/test_visitors/test_ast/test_naming/conftest.py: WPS202 61 | tests/*.py: S101, S105, S404, S603, S607, WPS211, WPS226, WPS323 62 | # Docs can have the configuration they need: 63 | docs/conf.py: WPS407 64 | # Pytest fixtures 65 | tests/plugins/*.py: WPS442 66 | 67 | 68 | [isort] 69 | # isort configuration: 70 | # https://github.com/timothycrosley/isort/wiki/isort-Settings 71 | include_trailing_comma = true 72 | use_parentheses = true 73 | # See https://github.com/timothycrosley/isort#multi-line-output-modes 74 | multi_line_output = 3 75 | # Is the same as 80 in flake8: 76 | line_length = 120 77 | 78 | # We need these lines for Github Action to work correctly, 79 | # **please** do not copy it to your own configs: 80 | default_section = THIRDPARTY 81 | known_first_party = wemake_python_styleguide* 82 | skip_glob = 83 | # These folders contain code badly written for reasons: 84 | tests/fixtures/** 85 | tests/**/snapshots/** 86 | 87 | 88 | # === Internal tools === 89 | # You are not interested in anything beyond this line. 90 | 91 | [tool:pytest] 92 | # py.test configuration: http://doc.pytest.org/en/latest/customize.html 93 | norecursedirs = tests/fixtures *.egg .eggs dist build docs .tox .git __pycache__ 94 | 95 | filterwarnings = 96 | ignore::DeprecationWarning 97 | 98 | addopts = 99 | --strict 100 | --doctest-modules 101 | --cov=wemake_python_styleguide 102 | --cov-branch 103 | --cov-report=term-missing:skip-covered 104 | --cov-report=html 105 | --cov-report=xml 106 | --cov-fail-under=100 107 | 108 | 109 | [coverage:run] 110 | # Coverage configuration: https://coverage.readthedocs.io/ 111 | 112 | # We don't need to cover some files. They are fully checked with mypy. 113 | # And don't contain any logic. 
114 | omit = 115 | wemake_python_styleguide/types.py 116 | 117 | # Here we specify plugins for coverage to be used: 118 | plugins = 119 | coverage_conditional_plugin 120 | 121 | [coverage:coverage_conditional_plugin] 122 | # Here we specify our pragma rules: 123 | rules = 124 | "sys_version_info < (3, 8)": py-lt-38 125 | "sys_version_info >= (3, 8)": py-gte-38 126 | 127 | "sys_version_info < (3, 9)": py-lt-39 128 | "sys_version_info >= (3, 9)": py-gte-39 129 | 130 | 131 | [mypy] 132 | # The mypy configurations: http://bit.ly/2zEl9WI 133 | allow_redefinition = False 134 | check_untyped_defs = True 135 | disallow_untyped_decorators = True 136 | disallow_any_explicit = True 137 | disallow_any_generics = True 138 | disallow_untyped_calls = True 139 | ignore_errors = False 140 | ignore_missing_imports = True 141 | implicit_reexport = False 142 | local_partial_types = True 143 | strict_optional = True 144 | strict_equality = True 145 | no_implicit_optional = True 146 | warn_unused_ignores = True 147 | warn_redundant_casts = True 148 | warn_unused_configs = True 149 | warn_unreachable = True 150 | warn_no_return = True 151 | 152 | [mypy-wemake_python_styleguide.compat.nodes] 153 | # We allow explicit `Any` only in this file, because of the compatibility: 154 | disallow_any_explicit = False 155 | 156 | [mypy-wemake_python_styleguide.compat.packaging] 157 | # We allow unused `ignore` comments, because we cannot sync it between versions: 158 | warn_unused_ignores = False 159 | 160 | [mypy-wemake_python_styleguide.logic.safe_eval] 161 | # We allow explicit `Any` only in this file, because that's what it does: 162 | disallow_any_explicit = False 163 | 164 | 165 | [doc8] 166 | # doc8 configuration: https://pypi.org/project/doc8/ 167 | ignore-path = docs/_build 168 | max-line-length = 120 169 | sphinx = True -------------------------------------------------------------------------------- /python/tests/data/normalize_job_description/output/test_case_7610188.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Ansible/ServiceNow Experte (m/w/d)", 3 | "online_status": "online", 4 | "is_anonymous": false, 5 | "job_id": 7610188, 6 | "should_display_early_applicant": false, 7 | "location": "Hannover oder Münster", 8 | "contract_type": "Feste Anstellung", 9 | "work_type": "Vollzeit, Home Office möglich", 10 | "online_date": "2021-10-13T15:40:32Z", 11 | "company_name": "Finanz Informatik GmbH & Co. KG", 12 | "description": "Als einer der größten Banken-IT-Dienstleister Europas sind wir der Treiber der Digitalisierung innerhalb der Sparkassen-Finanzgruppe. Mit über 4.000 Mitarbeitern an 3 Standorten machen wir digitales Banking von heute leistungsfähig und entwickeln smarte Finanz-Services von morgen. Dabei bieten wir Ihnen ein breites Aufgabenspektrum, in dem Sie Ihre individuelle Stärke hervorragend einbringen können. Ob App-Entwicklung, Netzwerktechnologien und Serverbetrieb oder Beratung, Schulung und Support – bei uns finden Sie Ihre Berufung! Als Spezialist oder auch als Generalist. 
Alles mit besten Karrierechancen, viel Raum für persönliche Entfaltung und zahlreichen Benefits.\nFür unsere Abteilung Bereitstellung Kommunikationsdienste suchen wir zum nächstmöglichen Zeitpunkt für den Standort Hannover oder Münster Verstärkung als\nAnsible/ServiceNow Experte (m/w/d)\nIhre Aufgaben:\nSie sind unser Experte für die Einführung und kontinuierliche Weiterentwicklung unserer Automationsstrategie\nEntwurf/Programmierung (Python) von Automationsobjekten zur Optimierung des Produktionsablaufes und der Überwachung der Systemplattform \nAufbau von automatisierten Schnittstellen zur umliegenden Serverinfrastruktur\nDurchführung von Programm- und Systemtests und Unterstützung bei der Fehlerbehebung \nDokumentation sowie Pflege und Qualitätssicherung der automatisierten Plattform\nEntwicklung der Automatisierung bei der Bereitstellung neuer Services\n\nIhr Profil:\nAbgeschlossenes technisches Studium vorzugsweise im IT/TK-Umfeld oder eine vergleichbare Ausbildung/Qualifikation\nMehrjährige Erfahrung in der Programmierung und im Umgang mit Skriptsprachen \nErfahrung mit Telefonie-Plattformen und -Systemen, ACD, VoIP-Netzwerkstrukturen \nKenntnisse im Plattformbetrieb von Windows, Unix, Datenbanken sowie VMware\nErfahrungen im Prozess-, Test- und Qualitätsmanagement wünschenswert\nKundenorientierung und gute kommunikative Fähigkeiten \nSie sind ein Teamplayer und ergänzen unser dynamisches Team mit Initiative und Zielstrebigkeit\nBereitschaft zu gelegentlichen Dienstreisen sowie Sondereinsätzen\n\nIhre Benefits:\nAltersvorsorge\nBarrierefrei\nBetriebssport\nFamilienservice\nFirmenevents\nFlexible Arbeitszeiten\nMobiles Arbeiten\nJobticket\nKantine\nTarifvertrag\nWeiterbildung\nFitnessförderung\n\nBei uns erwartet Sie eine attraktive Vergütung basierend auf Ihrer Qualifikation sowie Ihrer relevanten, praktischen Erfahrung.\nKlingt interessant?Dann bewerben Sie sich ganz einfach über unser FI-Karriere-Online-Portal. Wir freuen uns auf Ihre Bewerbung unter Angabe der Kennziffer 341/2021! Sollten Sie vorab weitere Auskünfte zu dieser Stelle wünschen, steht Ihnen gerne Herr Malte Kurz zur Verfügung. Sie erreichen Malte Kurz unter Tel. 0511 5102-24958 oder per E-Mail unter karriere@f-i.de.", 13 | "description_introduction": "Als einer der größten Banken-IT-Dienstleister Europas sind wir der Treiber der Digitalisierung innerhalb der Sparkassen-Finanzgruppe. Mit über 4.000 Mitarbeitern an 3 Standorten machen wir digitales Banking von heute leistungsfähig und entwickeln smarte Finanz-Services von morgen. Dabei bieten wir Ihnen ein breites Aufgabenspektrum, in dem Sie Ihre individuelle Stärke hervorragend einbringen können. Ob App-Entwicklung, Netzwerktechnologien und Serverbetrieb oder Beratung, Schulung und Support – bei uns finden Sie Ihre Berufung! Als Spezialist oder auch als Generalist. 
Alles mit besten Karrierechancen, viel Raum für persönliche Entfaltung und zahlreichen Benefits.\nFür unsere Abteilung Bereitstellung Kommunikationsdienste suchen wir zum nächstmöglichen Zeitpunkt für den Standort Hannover oder Münster Verstärkung als\nAnsible/ServiceNow Experte (m/w/d)", 14 | "description_responsabilities": "Sie sind unser Experte für die Einführung und kontinuierliche Weiterentwicklung unserer Automationsstrategie\nEntwurf/Programmierung (Python) von Automationsobjekten zur Optimierung des Produktionsablaufes und der Überwachung der Systemplattform \nAufbau von automatisierten Schnittstellen zur umliegenden Serverinfrastruktur\nDurchführung von Programm- und Systemtests und Unterstützung bei der Fehlerbehebung \nDokumentation sowie Pflege und Qualitätssicherung der automatisierten Plattform\nEntwicklung der Automatisierung bei der Bereitstellung neuer Services", 15 | "description_requirements": "Abgeschlossenes technisches Studium vorzugsweise im IT/TK-Umfeld oder eine vergleichbare Ausbildung/Qualifikation\nMehrjährige Erfahrung in der Programmierung und im Umgang mit Skriptsprachen \nErfahrung mit Telefonie-Plattformen und -Systemen, ACD, VoIP-Netzwerkstrukturen \nKenntnisse im Plattformbetrieb von Windows, Unix, Datenbanken sowie VMware\nErfahrungen im Prozess-, Test- und Qualitätsmanagement wünschenswert\nKundenorientierung und gute kommunikative Fähigkeiten \nSie sind ein Teamplayer und ergänzen unser dynamisches Team mit Initiative und Zielstrebigkeit\nBereitschaft zu gelegentlichen Dienstreisen sowie Sondereinsätzen", 16 | "description_perks": "Altersvorsorge\nBarrierefrei\nBetriebssport\nFamilienservice\nFirmenevents\nFlexible Arbeitszeiten\nMobiles Arbeiten\nJobticket\nKantine\nTarifvertrag\nWeiterbildung\nFitnessförderung\n\nBei uns erwartet Sie eine attraktive Vergütung basierend auf Ihrer Qualifikation sowie Ihrer relevanten, praktischen Erfahrung." 
17 | } -------------------------------------------------------------------------------- /python/simplescraper/tasks/curate_job_descriptions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | 5 | from common.entity import JOB, JOB_LOCATION, JOB_DESCRIPTION, JOB_TECHNOLOGY 6 | from common.logging import configure_logger, logger 7 | from common.storage import get_load_timestamp, get_load_date, load_cleansed_df, save_curated_df 8 | 9 | JOB_DESCRIPTION_SAT_COLUMNS = ['title', 'online_status', 'is_anonymous', 'should_display_early_applicant', 10 | 'contract_type', 'work_type', 'online_date', 'company_name', 'description_introduction', 11 | 'description_responsabilities', 'description_requirements', 'description_perks'] 12 | 13 | BASE_COLUMNS = ['year', 'month', 'day', 'job_id', 'load_timestamp'] 14 | 15 | TECHNOLOGIES = [ 16 | 'AI', 17 | 'Airflow', 18 | 'Android', 19 | 'Angular', 20 | 'AWS', 21 | 'Azure', 22 | 'CSS', 23 | 'Couchbase', 24 | 'CouchDB', 25 | 'Cypress', 26 | 'Dagster', 27 | 'Dask', 28 | 'Databricks', 29 | 'dbt', 30 | 'Docker', 31 | 'Duckdb', 32 | 'ELT', 33 | 'ETL', 34 | 'Flink', 35 | 'Flutter', 36 | 'GCP', 37 | 'Go', 38 | 'Golang', 39 | 'Gradle', 40 | 'gRPC', 41 | 'HANA', 42 | 'Java', 43 | 'JavaScript', 44 | 'Keras', 45 | 'Kotlin', 46 | 'Kubernetes', 47 | 'LESS', 48 | 'Maven', 49 | 'ML', 50 | 'MongoDB', 51 | 'MySQL', 52 | 'NLP', 53 | 'Oracle', 54 | 'Pandas', 55 | 'Playwright', 56 | 'PostgreSQL', 57 | 'Prefect', 58 | 'Puppeteer', 59 | 'Purview', 60 | 'Python', 61 | 'PyTorch', 62 | 'React', 63 | 'REST', 64 | 'Rust', 65 | 'Tensorflow', 66 | 'TestCafe', 67 | 'TypeScript', 68 | 'WebAssembly', 69 | 'scikit', 70 | 'Selenium', 71 | 'Snowflake', 72 | 'Snowplow', 73 | 'Spark', 74 | 'Spring', 75 | 'Storm', 76 | 'SAP', 77 | 'SCSS', 78 | 'SQL', 79 | 'SSIS', 80 | 'Synapse', 81 | 'Vue', 82 | ] 83 | 84 | 85 | def process_job_description(df): 86 | df = df.copy() 87 | df = df[df['company_name'].notna()] 88 | df = df[BASE_COLUMNS + JOB_DESCRIPTION_SAT_COLUMNS] 89 | save_curated_df(df, JOB) 90 | 91 | 92 | def process_location(df): 93 | df = df[BASE_COLUMNS + ['location']].copy() 94 | 95 | df['location'] = df['location'].str.replace('Frankfurt (Main)', 'Frankfurt am Main', regex=False) 96 | df['location'] = df['location'].str.replace('Frankfurt a. 
M.', 'Frankfurt am Main', regex=False) 97 | df['location'] = df['location'].str.replace('Frankfurt a.M.', 'Frankfurt am Main', regex=False) 98 | df['location'] = df['location'].str.replace('Frankfurt am Main (60488)', 'Frankfurt am Main', regex=False) 99 | df['location'] = df['location'].str.replace('Frankfurt Am Main', 'Frankfurt am Main', regex=False) 100 | df['location'] = df['location'].str.replace('Frankfurt/M.', 'Frankfurt am Main', regex=False) 101 | df['location'] = df['location'].str.replace('Frankfurt aM', 'Frankfurt am Main', regex=False) 102 | df['location'] = df['location'].str.replace('Frankfurt (am Main)', 'Frankfurt am Main', regex=False) 103 | df['location'] = df['location'].str.replace('Frankfurt Main', 'Frankfurt am Main', regex=False) 104 | df['location'] = df['location'].str.replace('Frankfurt aam Main', 'Frankfurt am Main', regex=False) 105 | 106 | df['location'] = df['location'].str.replace('|'.join([' und ', ' oder ', '/', ';', ' - ', ':']), ',', regex=True) 107 | df['location'] = df['location'].str.replace(' | ', ',', regex=False) 108 | df['location'] = df['location'].str.replace(' .', ',', regex=False) 109 | df['location'] = df['location'].str.replace(' u.a. ', ',', regex=False) 110 | df['location'] = df['location'].str.split(',') 111 | df = df.explode('location').reset_index(drop=True) 112 | 113 | df['location'] = df['location'].str.strip() 114 | 115 | df['location'] = df['location'].replace('Frankfurt', 'Frankfurt am Main') 116 | 117 | df['location'] = df['location'].replace('', np.nan) 118 | df['location'] = df['location'].replace('keine Angabe', np.nan) 119 | df = df.dropna() 120 | 121 | save_curated_df(df, JOB_LOCATION) 122 | 123 | 124 | def process_technology(df): 125 | df = df.copy() 126 | df['description'] = df['title'] + ' ' + \ 127 | df['description_introduction'] + ' ' + \ 128 | df['description_responsabilities'] + ' ' + \ 129 | df['description_requirements'] + ' ' + \ 130 | df['description_perks'] 131 | for technology in TECHNOLOGIES: 132 | df[technology] = df['description'].str.contains(fr'(?i)\b{technology}\b', regex=True) 133 | df['Other'] = ~df[TECHNOLOGIES].any(axis='columns') 134 | df = df.melt(id_vars=BASE_COLUMNS, value_vars=TECHNOLOGIES + ['Other'], var_name='technology') 135 | df = df[df['value'].notna()] 136 | df = df[df['value']] 137 | df = df[BASE_COLUMNS + ['technology']] 138 | 139 | save_curated_df(df, JOB_TECHNOLOGY) 140 | 141 | 142 | def curate_job_descriptions(load_timestamp, load_date): 143 | configure_logger(load_timestamp) 144 | logger.info(f'Start curate_job_descriptions: {load_timestamp} {load_date}') 145 | 146 | df = load_cleansed_df(JOB_DESCRIPTION, load_date=load_date) 147 | 148 | df = df.dropna(subset=['job_id']) 149 | df['job_id'] = df['job_id'].astype('int') 150 | df = df.sort_values(by=['job_id']) 151 | 152 | process_job_description(df) 153 | process_location(df) 154 | process_technology(df) 155 | 156 | logger.info(f'End curate_job_descriptions: {load_timestamp} {load_date}') 157 | 158 | 159 | if __name__ == "__main__": 160 | _load_timestamp = sys.argv[1] if len(sys.argv) > 1 else get_load_timestamp() 161 | _load_date = sys.argv[2] if len(sys.argv) > 2 else get_load_date() 162 | curate_job_descriptions(_load_timestamp, _load_date) 163 | -------------------------------------------------------------------------------- /python/tests/data/normalize_job_description/output/test_case_7609275.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Account Manager (m/w/d)", 3 | 
"online_status": "online", 4 | "is_anonymous": false, 5 | "job_id": 7609275, 6 | "should_display_early_applicant": false, 7 | "location": "bundesweit", 8 | "contract_type": "Feste Anstellung", 9 | "work_type": "Vollzeit, Home Office möglich", 10 | "online_date": "2021-10-13T13:22:15Z", 11 | "company_name": "Quentic GmbH", 12 | "description": "Passionate people for sustainable softwareQuentic ist einer der führenden Lösungsanbieter für Software as a Service (SaaS) im europäischen HSE- und CSR-Markt. Das Unternehmen hat seinen Hauptsitz in Berlin und beschäftigt über 250 Mitarbeitende. Niederlassungen befinden sich in Deutschland, Österreich und der Schweiz sowie in Finnland, Belgien, Dänemark, Schweden, den Niederlanden, Spanien und Italien.\nÜber 800 Kunden stärken ihr HSE- und CSR-Management mit den Quentic Software-Lösungen. Mit neun individuell kombinierbaren Modulen umfasst die Online-Plattform Arbeitssicherheit, Risks & Audits, Control of Work, Gefahrstoffe, Legal Compliance, Online-Unterweisungen, Prozesse sowie Umweltmanagement und Nachhaltigkeit. Quentic vernetzt Daten, verbindet alle HSE- und CSR-Akteure und begeistert für das gesamte Aufgabenfeld - via Browser oder per App. Da Aufgaben über Abteilungen, Standorte und Länder hinweg ineinandergreifen, lassen sich Unternehmensprozesse effizient nach gesetzlichen Vorgaben steuern.\nDeine Aufgaben\nDu betreust überwiegend Bestandkunden im Medium und Large Enterprise Business der Industrie im DACH-Raum\nDu erkennst Up- & Cross-Selling-Potentiale und schöpfst sie aus\nDu führst Verhandlungen über Preise und Vertragsverlängerungen\nDu präsentierst unser Leistungsversprechen unseren Bestandskunden und analysierst ihren Bedarf\nDu repräsentierst Quentic auf Roadshows und Messen\nDu pflegst unser CRM-System und reportest regelmäßig an unsere Head of Account Management\nDu arbeitest mit externen Dienstleistern zusammen\nDu sicherst und erhöhst die Kundenzufriedenheit\n\nDeine Qualifikationen\nDu hast bereits umfangreiche Berufserfahrung in der Bestandkundenbetreuung im B2B Software-Bereich\nBegriffe wie Buying Center, Tender und Complex Sales sind Dir geläufig\nDu bist technikaffin und hast Interesse an den Themen Arbeitssicherheit, Nachhaltigkeit und Umweltschutz\nMit Empathie und Geschick gelingt es Dir, komplexe Sachverhalte verständlich zu präsentieren\nDu bist argumentationssicher und verhandlungsstark und kannst so unsere Business Software online und vor Ort sicher präsentieren \nDu sprichst fließend Deutsch und Englisch, weitere europäische Sprachen sind ein Plus\nDu bist bereit, innerhalb der DACH-Region zu reisen (i.d.R. 1-2 Tage pro Woche innerhalb Deines lokalen Vertriebgebiets)\n\nDeine Aussichten\nNicht gesättigtes Marktumfeld mit steigender Nachfrage\nUnterstützung durch ein starkes Marketing sowie unsere Consultants bei der Kundenbetreuung\nAttraktive Vergütung aus einem Fixgehalt und einer transparenten Variablen je nach Zielvereinbarung\nFirmen-Kreditkarte und ein mobiles Büro\nStrukturierte Einarbeitung und Betreuung durch Mentoren\nFlache Hierarchien mit offenen Türen in einer lockeren, professionellen Atmosphäre\nRegelmäßige Teamevents und ein besonderes Augenmerk auf die Work-Life-Balance (flexible Arbeitszeiten, Bezuschussung Fitness-Studio u. v. m.)\n\nWeitere InformationenWenn du die Welt ein bisschen sicherer machen und mehr über die Themen Umweltschutz, Arbeitssicherheit und Nachhaltigkeit erfahren möchtest, bist du bei uns genau richtig! 
Wer wir sind und wie wir arbeiten, siehst du hier", 13 | "description_introduction": "Passionate people for sustainable softwareQuentic ist einer der führenden Lösungsanbieter für Software as a Service (SaaS) im europäischen HSE- und CSR-Markt. Das Unternehmen hat seinen Hauptsitz in Berlin und beschäftigt über 250 Mitarbeitende. Niederlassungen befinden sich in Deutschland, Österreich und der Schweiz sowie in Finnland, Belgien, Dänemark, Schweden, den Niederlanden, Spanien und Italien.\nÜber 800 Kunden stärken ihr HSE- und CSR-Management mit den Quentic Software-Lösungen. Mit neun individuell kombinierbaren Modulen umfasst die Online-Plattform Arbeitssicherheit, Risks & Audits, Control of Work, Gefahrstoffe, Legal Compliance, Online-Unterweisungen, Prozesse sowie Umweltmanagement und Nachhaltigkeit. Quentic vernetzt Daten, verbindet alle HSE- und CSR-Akteure und begeistert für das gesamte Aufgabenfeld - via Browser oder per App. Da Aufgaben über Abteilungen, Standorte und Länder hinweg ineinandergreifen, lassen sich Unternehmensprozesse effizient nach gesetzlichen Vorgaben steuern.", 14 | "description_responsabilities": "Du betreust überwiegend Bestandkunden im Medium und Large Enterprise Business der Industrie im DACH-Raum\nDu erkennst Up- & Cross-Selling-Potentiale und schöpfst sie aus\nDu führst Verhandlungen über Preise und Vertragsverlängerungen\nDu präsentierst unser Leistungsversprechen unseren Bestandskunden und analysierst ihren Bedarf\nDu repräsentierst Quentic auf Roadshows und Messen\nDu pflegst unser CRM-System und reportest regelmäßig an unsere Head of Account Management\nDu arbeitest mit externen Dienstleistern zusammen\nDu sicherst und erhöhst die Kundenzufriedenheit", 15 | "description_requirements": "Du hast bereits umfangreiche Berufserfahrung in der Bestandkundenbetreuung im B2B Software-Bereich\nBegriffe wie Buying Center, Tender und Complex Sales sind Dir geläufig\nDu bist technikaffin und hast Interesse an den Themen Arbeitssicherheit, Nachhaltigkeit und Umweltschutz\nMit Empathie und Geschick gelingt es Dir, komplexe Sachverhalte verständlich zu präsentieren\nDu bist argumentationssicher und verhandlungsstark und kannst so unsere Business Software online und vor Ort sicher präsentieren \nDu sprichst fließend Deutsch und Englisch, weitere europäische Sprachen sind ein Plus\nDu bist bereit, innerhalb der DACH-Region zu reisen (i.d.R. 1-2 Tage pro Woche innerhalb Deines lokalen Vertriebgebiets)", 16 | "description_perks": "Nicht gesättigtes Marktumfeld mit steigender Nachfrage\nUnterstützung durch ein starkes Marketing sowie unsere Consultants bei der Kundenbetreuung\nAttraktive Vergütung aus einem Fixgehalt und einer transparenten Variablen je nach Zielvereinbarung\nFirmen-Kreditkarte und ein mobiles Büro\nStrukturierte Einarbeitung und Betreuung durch Mentoren\nFlache Hierarchien mit offenen Türen in einer lockeren, professionellen Atmosphäre\nRegelmäßige Teamevents und ein besonderes Augenmerk auf die Work-Life-Balance (flexible Arbeitszeiten, Bezuschussung Fitness-Studio u. v. 
m.)" 17 | } -------------------------------------------------------------------------------- /python/simplescraper/tasks/download_job_descriptions.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | 4 | from playwright.async_api import async_playwright, Error, TimeoutError 5 | 6 | from common.chunking import get_chunk_size 7 | from common.entity import JOB_DESCRIPTION 8 | from common.env_variables import DATA_SOURCE_URL, SEMAPHORE_COUNT, MAX_CHUNK_SIZE, LATEST_LOAD_TIMESTAMP, RUN_HEADLESS, \ 9 | MIN_TO_DOWNLOAD, MAX_TO_DOWNLOAD 10 | from common.logging import logger, configure_logger 11 | from common.storage import save_raw_file, load_temp_df, JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV 12 | 13 | TAB_HITS = 10 14 | 15 | 16 | class PageNotFound(Exception): 17 | pass 18 | 19 | 20 | async def open_first_page(browser): 21 | page = await browser.new_page() 22 | await page.goto(DATA_SOURCE_URL, wait_until='domcontentloaded') 23 | await page.click('#ccmgt_explicit_accept') 24 | for i in range(TAB_HITS * 2): 25 | await page.keyboard.press('Tab') 26 | await page.goto(DATA_SOURCE_URL + 'de/sitemap/', wait_until='domcontentloaded') 27 | for i in range(TAB_HITS * 2): 28 | await page.keyboard.press('Tab') 29 | return page 30 | 31 | 32 | async def download_urls(df, load_timestamp): 33 | if df.empty: 34 | return 35 | async with async_playwright() as p: 36 | chunk_pos = df['chunk_pos'].values[0] 37 | chunk_pos = str(chunk_pos).rjust(2) 38 | num_chunks = df['num_chunks'].values[0] 39 | chunk_size = df['chunk_size'].values[0] 40 | chunk_id = f'{chunk_pos}/{num_chunks}' 41 | browser = await p.chromium.launch(headless=RUN_HEADLESS, slow_mo=250) 42 | try: 43 | logger.info(f'Starting chunk {chunk_id} with size of {chunk_size}') 44 | start_time = time.time() 45 | page = await open_first_page(browser) 46 | url_dicts = df.to_dict('records') 47 | for url_dict in url_dicts: 48 | pos_in_chunk = url_dict['pos_in_chunk'] 49 | url = url_dict['url'] 50 | job_id = url.rsplit('--', 1) 51 | job_id = job_id[1] 52 | job_id = job_id.split('-') 53 | job_id = job_id[0] 54 | file_name = f'{job_id}.html' 55 | try: 56 | logger.debug(f'Chunk {chunk_id}: Downloading ({pos_in_chunk}/{chunk_size}): {url}') 57 | try: 58 | response = await page.goto(url, wait_until='domcontentloaded') 59 | for i in range(TAB_HITS): 60 | await page.keyboard.press('Tab') 61 | if response.status >= 400 and response.status >= 400 < 500: 62 | raise PageNotFound('Page not found') 63 | await page.wait_for_selector('.js-app-ld-ContentBlock', timeout=10000, state='attached') 64 | except TimeoutError as err: 65 | logger.warning( 66 | f'Chunk {chunk_id}: TimeoutError: second try for {url} because of the following error: {err}') 67 | await page.goto(DATA_SOURCE_URL + 'de/sitemap/', wait_until='domcontentloaded') 68 | for i in range(TAB_HITS): 69 | await page.keyboard.press('Tab') 70 | await page.goto(url, wait_until='domcontentloaded') 71 | for i in range(TAB_HITS): 72 | await page.keyboard.press('Tab') 73 | await page.wait_for_selector('.js-app-ld-ContentBlock', timeout=20000, state='attached') 74 | page_content = await page.content() 75 | save_raw_file(page_content, JOB_DESCRIPTION, load_timestamp, file_name) 76 | logger.success(f'Chunk {chunk_id}: Downloaded ({pos_in_chunk}/{chunk_size}): {url}') 77 | except TimeoutError: 78 | logger.warning(f'Chunk {chunk_id}: TimeoutError: Timeout error while requesting the page {url}') 79 | except AttributeError: 80 | logger.warning(f'Chunk {chunk_id}: 
AttributeError: it seems the following URL is gone {url}') 81 | except PageNotFound: 82 | logger.warning(f'Chunk {chunk_id}: PageNotFound: the following URL is no longer available {url}') 83 | except Error as err: 84 | logger.error(f'Chunk {chunk_id}: It seems that the browser crashed because of the following error: {err}') 85 | finally: 86 | await browser.close() 87 | 88 | elapsed_time = time.time() - start_time 89 | logger.info(f'Finished chunk {chunk_id}') 90 | logger.info(f'Elapsed time {chunk_id}: {elapsed_time:.2f} seconds') 91 | logger.info(f'Downloads per second {chunk_id}: {chunk_size / elapsed_time:.2f}') 92 | 93 | 94 | def split_dataframe(df, chunk_size): 95 | chunks = [] 96 | num_chunks = len(df) // chunk_size + 1 97 | for i in range(num_chunks): 98 | chunk = df[i * chunk_size:(i + 1) * chunk_size] 99 | chunk = chunk.reset_index(drop=True) 100 | chunk['chunk_pos'] = i + 1 101 | chunk['num_chunks'] = num_chunks 102 | chunk['pos_in_chunk'] = chunk.index + 1 103 | chunk['chunk_size'] = chunk.shape[0] 104 | chunks.append(chunk) 105 | return chunks 106 | 107 | 108 | async def safe_download_urls(urls, load_timestamp, sem): 109 | async with sem: # semaphore limits num of simultaneous downloads 110 | return await download_urls(urls, load_timestamp) 111 | 112 | 113 | async def run_async_tasks(chunks, load_timestamp): 114 | sem = asyncio.Semaphore(SEMAPHORE_COUNT) 115 | tasks = [ 116 | asyncio.ensure_future(safe_download_urls(chunk, load_timestamp, sem)) # creating task starts coroutine 117 | for chunk 118 | in chunks 119 | ] 120 | await asyncio.gather(*tasks) 121 | 122 | 123 | def download_job_descriptions(load_timestamp, df_to_download=None): 124 | configure_logger(load_timestamp) 125 | df = df_to_download if df_to_download is not None else load_temp_df(load_timestamp, JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV) 126 | 127 | if MAX_TO_DOWNLOAD: 128 | pending_donwnload = df.shape[0] - MAX_TO_DOWNLOAD if df.shape[0] > MAX_TO_DOWNLOAD else 0 129 | df = df.head(MAX_TO_DOWNLOAD) 130 | else: 131 | pending_donwnload = 0 132 | 133 | total_count = df.shape[0] 134 | 135 | if total_count < MIN_TO_DOWNLOAD: 136 | logger.success(f'Not enough to download: {total_count} for the load timestamp {load_timestamp}') 137 | return 138 | 139 | chunk_size = get_chunk_size(total_count, SEMAPHORE_COUNT, MAX_CHUNK_SIZE) 140 | chunks = split_dataframe(df, chunk_size) 141 | 142 | start_time = time.time() 143 | logger.info(f'Starting downloading job descriptions for job: {load_timestamp}') 144 | logger.info(f'Concurrent tasks: {SEMAPHORE_COUNT}') 145 | logger.info(f'Urls to download: {total_count}') 146 | logger.info(f'Pending download: {pending_donwnload}') 147 | 148 | loop = asyncio.SelectorEventLoop() 149 | asyncio.set_event_loop(loop) 150 | try: 151 | loop.run_until_complete(run_async_tasks(chunks, load_timestamp)) 152 | finally: 153 | loop.run_until_complete(loop.shutdown_asyncgens()) 154 | loop.close() 155 | 156 | elapsed_time = time.time() - start_time 157 | logger.info(f'Elapsed time: {elapsed_time:.2f} seconds') 158 | logger.info(f'Downloads per second: {total_count / elapsed_time:.2f}') 159 | logger.success(f'Finished: {total_count} urls for the timestamp {load_timestamp}') 160 | logger.success(f'Pending download: {pending_donwnload} urls for the timestamp {load_timestamp}') 161 | 162 | 163 | if __name__ == '__main__': 164 | download_job_descriptions( 165 | LATEST_LOAD_TIMESTAMP, 166 | load_temp_df(LATEST_LOAD_TIMESTAMP, JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV), 167 | ) 168 | 
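A minimal, self-contained sketch of the chunk-and-semaphore throttling pattern used in download_job_descriptions.py above: the URL list is split into chunks and an asyncio.Semaphore caps how many chunks are downloaded at the same time. Every name and number below (download_chunk, the dummy URLs, chunk size 4, the concurrency limit of 2) is an illustrative assumption standing in for the real get_chunk_size, split_dataframe and SEMAPHORE_COUNT settings, not the project's actual code or values.

import asyncio


async def download_chunk(chunk, chunk_id, sem):
    # The semaphore caps how many chunks are in flight at once,
    # mirroring the safe_download_urls/run_async_tasks pattern above.
    async with sem:
        for url in chunk:
            await asyncio.sleep(0)  # placeholder for the real page download
        return chunk_id


async def main():
    urls = [f'https://example.com/job--{i}-title' for i in range(10)]  # dummy URLs
    chunk_size = 4  # stands in for get_chunk_size(total_count, SEMAPHORE_COUNT, MAX_CHUNK_SIZE)
    chunks = [urls[i:i + chunk_size] for i in range(0, len(urls), chunk_size)]
    sem = asyncio.Semaphore(2)  # stands in for SEMAPHORE_COUNT
    finished = await asyncio.gather(*(download_chunk(c, i + 1, sem) for i, c in enumerate(chunks)))
    print(f'finished chunks: {finished}')


if __name__ == '__main__':
    asyncio.run(main())

With 10 URLs and a chunk size of 4 this yields 3 chunks, of which at most 2 are processed in parallel; the real task applies the same idea with one Playwright browser per chunk.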
-------------------------------------------------------------------------------- /python/simplescraper/common/storage.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module will store the files in the following structure 3 | - root 4 | - 5 | - 6 | - 7 | - 8 | - 9 | """ 10 | import datetime 11 | import glob 12 | import os 13 | import pathlib 14 | 15 | import pandas as pd 16 | import pyarrow as pa 17 | import pyarrow.parquet as pq 18 | from dateutil import parser 19 | from pyarrow import ArrowInvalid 20 | 21 | from common.entity import Entity 22 | from common.env_variables import DATA_SOURCE_NAME, RAW_DIR, CLEANSED_DIR, TEMP_DIR, AZURE_STORAGE_CONNECTION_STRING, \ 23 | AZURE_STORAGE_CONTAINER_NAME, DATA_DIR, UPLOAD_TO_AZURE, BACKUP_DIR, CURATED_DIR 24 | from common.logging import logger 25 | 26 | LOAD_TIMESTAMP_FORMAT = '%Y/%m/%d/%H-%M-%S' 27 | LOAD_DATE_FORMAT = '%Y/%m/%d' 28 | 29 | RAW_LAYER = 'raw' 30 | CLEANSED_LAYER = 'cleansed' 31 | CURATED_LAYER = 'curated' 32 | TEMP_LAYER = 'temp' 33 | 34 | LAYERS = [RAW_LAYER, CLEANSED_LAYER, CURATED_LAYER, TEMP_LAYER] 35 | 36 | LAYER_DIR = { 37 | RAW_LAYER: RAW_DIR, 38 | CLEANSED_LAYER: CLEANSED_DIR, 39 | CURATED_LAYER: CURATED_DIR, 40 | TEMP_LAYER: TEMP_DIR, 41 | } 42 | 43 | DOWNLOADED_JOB_DESCRIPTIONS_CSV = '11_downloaded_job_descriptions.csv' 44 | SITEMAP_URLS_CSV = '12_sitemap_urls.csv' 45 | JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV = '13_job_descriptions_to_download.csv' 46 | PARSED_JOB_DESCRIPTIONS_CSV = '21_parsed_job_descriptions.csv' 47 | JOB_DESCRIPTIONS_TO_PARSE_CSV = '22_job_descriptions_to_parse.csv' 48 | DOWNLOADED_SITEMAPS_CSV = '31_downloaded_sitemaps.csv' 49 | PARSED_SITEMAP_DATES_CSV = '32_parsed_sitemap_dates.csv' 50 | SITEMAPS_TO_PARSE_CSV = '33_sitemaps_to_parse.csv' 51 | 52 | 53 | def list_raw_files(data_source, entity: Entity, load_date=None): 54 | dir_path = os.path.join(RAW_DIR, data_source, entity.name) 55 | if load_date: 56 | dir_path = os.path.join(dir_path, load_date) 57 | file_list = [{ 58 | 'load_timestamp': '/'.join(f.split('/')[-5:-1]), 59 | 'file_name': f.split('/')[-1], 60 | } for f in glob.iglob(dir_path + '/**/*', recursive=True) if os.path.isfile(f) and 'latest' not in f] 61 | return file_list 62 | 63 | 64 | def list_raw_days(data_source, entity: Entity): 65 | dir_path = os.path.join(RAW_DIR, data_source, entity.name) 66 | file_list = [{ 67 | 'date': ''.join(f.split('/')[-3:]), 68 | } for f in glob.iglob(dir_path + '/*/*/*', recursive=True) if os.path.isdir(f) and 'latest' not in f] 69 | return file_list 70 | 71 | 72 | def list_backup_days(data_source, entity: Entity): 73 | dir_path = os.path.join(BACKUP_DIR, data_source, entity.name) 74 | file_list = [{ 75 | 'date': f.split('.')[-3], 76 | } for f in glob.iglob(dir_path + '/**/*', recursive=True) if os.path.isfile(f)] 77 | return file_list 78 | 79 | 80 | def get_load_timestamp(ts=None): 81 | if ts is None: 82 | load_timestamp = datetime.datetime.today().strftime(LOAD_TIMESTAMP_FORMAT) 83 | else: 84 | load_timestamp = parser.parse(ts).strftime(LOAD_TIMESTAMP_FORMAT) 85 | return load_timestamp 86 | 87 | 88 | def get_load_date(ds=None): 89 | if ds is None: 90 | load_date = (datetime.datetime.today() - datetime.timedelta(days=1)).strftime(LOAD_DATE_FORMAT) 91 | else: 92 | load_date = parser.parse(ds).strftime(LOAD_DATE_FORMAT) 93 | return load_date 94 | 95 | 96 | def get_filters_from_load_date(load_date: str): 97 | year, month, day = load_date.split('/', 2) 98 | filters = [ 99 | ('year', '=', int(year)), 100 | ('month', 
'=', int(month)), 101 | ('day', '=', int(day)), 102 | ] 103 | return filters 104 | 105 | 106 | def create_dir(file_path): 107 | dir_path = os.path.dirname(file_path) 108 | pathlib.Path(dir_path).mkdir(parents=True, exist_ok=True) 109 | 110 | 111 | def save_local_file(content, file_path): 112 | create_dir(file_path) 113 | file_type = "w" if isinstance(content, str) else "wb" 114 | with open(file_path, file_type) as f: 115 | f.write(content) 116 | 117 | 118 | def save_remote_file(content, blob_name): 119 | from azure.storage.blob import BlockBlobService 120 | logger.debug(f'save_remote_file start: {blob_name}') 121 | blob_service_client = BlockBlobService(connection_string=AZURE_STORAGE_CONNECTION_STRING) 122 | if isinstance(content, str): 123 | blob_service_client.create_blob_from_text(AZURE_STORAGE_CONTAINER_NAME, blob_name, content) 124 | else: 125 | blob_service_client.create_blob_from_bytes(AZURE_STORAGE_CONTAINER_NAME, blob_name, content) 126 | logger.success(f'save_remote_file end: {blob_name}') 127 | 128 | 129 | def save_raw_file(content, entity: Entity, load_timestamp: str, file_name): 130 | blob_name = os.path.join(RAW_LAYER, DATA_SOURCE_NAME, entity.name, load_timestamp, file_name) 131 | file_path = os.path.join(DATA_DIR, blob_name) 132 | save_local_file(content, file_path) 133 | if UPLOAD_TO_AZURE: 134 | save_remote_file(content, blob_name) 135 | 136 | 137 | def load_raw_file(entity: Entity, load_timestamp, file_name): 138 | file_path = os.path.join(LAYER_DIR[RAW_LAYER], DATA_SOURCE_NAME, entity.name, load_timestamp, file_name) 139 | with open(file_path, 'r') as f: 140 | content = f.read() 141 | return content 142 | 143 | 144 | def save_temp_df(df: pd.DataFrame, load_timestamp: str, file_name: str): 145 | temp_dir = os.path.join(TEMP_DIR, load_timestamp) 146 | if not os.path.exists(temp_dir): 147 | os.makedirs(temp_dir) 148 | # noinspection PyTypeChecker 149 | df.to_csv(os.path.join(temp_dir, file_name), index=False) 150 | 151 | 152 | def load_temp_df(load_timestamp: str, file_name: str) -> pd.DataFrame: 153 | return pd.read_csv(os.path.join(TEMP_DIR, load_timestamp, file_name)) 154 | 155 | 156 | def list_parquet_files(layer, entity: Entity, relative_paths): 157 | dir_path = os.path.join(LAYER_DIR[layer], DATA_SOURCE_NAME, entity.name) 158 | file_list = [f for f in glob.iglob(dir_path + '/**/*.parquet', recursive=True) if os.path.isfile(f)] 159 | if relative_paths: 160 | file_list = [file_path.replace(dir_path + '/', '') for file_path in file_list] 161 | return file_list 162 | 163 | 164 | def list_cleansed_files(entity: Entity, relative_paths=True): 165 | return list_parquet_files(CLEANSED_LAYER, entity, relative_paths) 166 | 167 | 168 | def save_parquet_df(df: pd.DataFrame, layer, entity: Entity): 169 | # noinspection PyArgumentList 170 | table: pa.Table = pa.Table.from_pandas(df, preserve_index=False) 171 | root_path = os.path.join(LAYER_DIR[layer], DATA_SOURCE_NAME, entity.name) 172 | pq.write_to_dataset(table, 173 | root_path, 174 | partition_cols=['year', 'month', 'day'], 175 | basename_template='part-{i}.parquet', 176 | existing_data_behavior='delete_matching', 177 | use_legacy_dataset=False) 178 | 179 | 180 | def save_cleansed_df(df: pd.DataFrame, entity: Entity): 181 | save_parquet_df(df, CLEANSED_LAYER, entity) 182 | 183 | 184 | def save_curated_df(df: pd.DataFrame, entity: Entity): 185 | save_parquet_df(df, CURATED_LAYER, entity) 186 | 187 | 188 | def load_parquet_df(layer, entity: Entity, columns, filters) -> pd.DataFrame: 189 | # noinspection PyArgumentList 190 | 
root_path = os.path.join(LAYER_DIR[layer], DATA_SOURCE_NAME, entity.name) 191 | try: 192 | table = pq.read_table(root_path, columns=columns, filters=filters, use_legacy_dataset=False) 193 | return table.to_pandas() 194 | except (FileNotFoundError, ArrowInvalid): 195 | return pd.DataFrame(columns=columns) 196 | 197 | 198 | def load_cleansed_df(entity: Entity, columns=None, filters=None, load_date=None) -> pd.DataFrame: 199 | if filters is None and load_date is not None: 200 | filters = get_filters_from_load_date(load_date) 201 | return load_parquet_df(CLEANSED_LAYER, entity, columns, filters) 202 | -------------------------------------------------------------------------------- /python/simplescraper/explore/explore_dwh_mart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "pycharm": { 8 | "name": "#%%\n" 9 | } 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "from common.explore import display_sql" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/plain": " job_id total\n0 7543521 12\n1 7369771 10\n2 7723680 9\n3 7599993 8\n4 7571802 8", 23 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
job_idtotal
0754352112
1736977110
277236809
375999938
475718028
\n
" 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "display_sql(f'''\n", 32 | "SELECT job_id,\n", 33 | " COUNT(1) AS total\n", 34 | " FROM curated.job\n", 35 | " GROUP BY 1\n", 36 | " ORDER BY 2 DESC\n", 37 | " LIMIT 5\n", 38 | "''')" 39 | ], 40 | "metadata": { 41 | "collapsed": false, 42 | "pycharm": { 43 | "name": "#%%\n" 44 | } 45 | } 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "pycharm": { 52 | "name": "#%%\n" 53 | } 54 | }, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": " load_timestamp title \\\n0 2022-01-26 16:26:20 Vertriebsmitarbeiter/in Innendienst (m/w/d) \n1 2022-01-20 10:00:00 Innendienst Vertrieb Ausstellung (m/w/d) \n2 2022-01-10 19:00:00 Mitarbeiter/in Vertrieb Ausstellung (m/w/d) \n3 2022-01-08 13:00:00 Berater Ausstellung (m/w/d) \n4 2021-12-18 14:00:00 Verkaufsberater Ausstellung (m/w/d) \n5 2021-11-20 11:00:00 Berater Ausstellung (m/w/d) \n6 2021-11-12 16:00:00 Fachberater Ausstellung (m/w/d) \n7 2021-11-10 17:00:00 Fachberater - Glaser / Schreiner (m/w/d) \n8 2021-10-14 21:00:00 Kaufmännische/r Angestellte/r (m/w/d) \n9 2021-10-07 08:00:00 Kaufmännische/r Angestellte/r (m/w/d) \n10 2021-10-06 11:00:00 Kaufmännischer Angestellter (m/w/d) \n11 2021-10-05 08:00:00 Kaufmännischer Angestellter (m/w/d) \n\n online_date \n0 2022-01-02T13:03:06Z \n1 2022-01-02T13:03:06Z \n2 2022-01-02T13:03:06Z \n3 2022-01-02T13:03:06Z \n4 2021-12-18T13:03:05Z \n5 2021-11-13T17:03:10Z \n6 2021-10-29T15:30:01Z \n7 2021-10-29T15:30:01Z \n8 2021-10-06T15:03:04Z \n9 2021-10-06T15:03:04Z \n10 2021-09-21T14:32:36Z \n11 2021-09-21T14:32:36Z ", 59 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
load_timestamptitleonline_date
02022-01-26 16:26:20Vertriebsmitarbeiter/in Innendienst (m/w/d)2022-01-02T13:03:06Z
12022-01-20 10:00:00Innendienst Vertrieb Ausstellung (m/w/d)2022-01-02T13:03:06Z
22022-01-10 19:00:00Mitarbeiter/in Vertrieb Ausstellung (m/w/d)2022-01-02T13:03:06Z
32022-01-08 13:00:00Berater Ausstellung (m/w/d)2022-01-02T13:03:06Z
42021-12-18 14:00:00Verkaufsberater Ausstellung (m/w/d)2021-12-18T13:03:05Z
52021-11-20 11:00:00Berater Ausstellung (m/w/d)2021-11-13T17:03:10Z
62021-11-12 16:00:00Fachberater Ausstellung (m/w/d)2021-10-29T15:30:01Z
72021-11-10 17:00:00Fachberater - Glaser / Schreiner (m/w/d)2021-10-29T15:30:01Z
82021-10-14 21:00:00Kaufmännische/r Angestellte/r (m/w/d)2021-10-06T15:03:04Z
92021-10-07 08:00:00Kaufmännische/r Angestellte/r (m/w/d)2021-10-06T15:03:04Z
102021-10-06 11:00:00Kaufmännischer Angestellter (m/w/d)2021-09-21T14:32:36Z
112021-10-05 08:00:00Kaufmännischer Angestellter (m/w/d)2021-09-21T14:32:36Z
\n
" 60 | }, 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "display_sql(f'''\n", 68 | "SELECT load_timestamp,\n", 69 | " title,\n", 70 | " online_date\n", 71 | " FROM curated.job\n", 72 | " WHERE job_id = 7543521\n", 73 | " ORDER BY load_timestamp DESC\n", 74 | " LIMIT 20\n", 75 | "''')" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": " job_id load_timestamp\n0 7543521 2022-01-26 16:26:20", 85 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
job_idload_timestamp
075435212022-01-26 16:26:20
\n
" 86 | }, 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "display_sql(f'''\n", 94 | "SELECT job_id,\n", 95 | " load_timestamp\n", 96 | "FROM (\n", 97 | " SELECT j.*,\n", 98 | " row_number()\n", 99 | " OVER (\n", 100 | " PARTITION BY job_id ORDER BY load_timestamp DESC\n", 101 | " ) AS seqnum\n", 102 | " FROM curated.job j\n", 103 | " WHERE job_id = 7543521\n", 104 | ") j\n", 105 | "WHERE seqnum = 1;\n", 106 | "''')\n" 107 | ], 108 | "metadata": { 109 | "collapsed": false, 110 | "pycharm": { 111 | "name": "#%%\n" 112 | } 113 | } 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3 (ipykernel)", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.10.6" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 1 137 | } -------------------------------------------------------------------------------- /python/simplescraper/flasky.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | from flask import Flask, request, Request 5 | 6 | from common.env_variables import SOURCE_DIR 7 | from common.logging import logger 8 | from common.storage import get_load_timestamp, get_load_date 9 | from tasks.cleanse_job_descriptions import cleanse_job_descriptions 10 | from tasks.cleanse_sitemaps import cleanse_sitemaps 11 | from tasks.curate_job_descriptions import curate_job_descriptions 12 | from tasks.curate_sitemaps import curate_sitemaps 13 | from tasks.download_job_descriptions import download_job_descriptions 14 | from tasks.download_sitemap import download_sitemap 15 | from tasks.list_downloaded_job_descriptions import list_downloaded_job_descriptions 16 | from tasks.list_job_descriptions_to_download import list_job_descriptions_to_download 17 | from tasks.prune_old_raw import prune_old_raw 18 | 19 | SUCCESS_RETURN_CODE = 0 20 | 21 | DEFAULT_DATA_INTERVAL_END = '2022-09-08T00:00:00+00:00' 22 | DEFAULT_DS = '2022-09-07' 23 | 24 | SUCCESS = {'result_status': 'success', }, 200 25 | 26 | HTML_FORM = f''' 27 |
28 |
29 |
30 | 31 |
32 | ''' 33 | 34 | 35 | def is_connected_to_vpn(): 36 | return os.system('/usr/sbin/scutil --nc list | grep Connected | grep vpn') == 0 37 | 38 | 39 | class RequestParams: 40 | def __init__(self, _request: Request): 41 | form = _request.form 42 | self.load_timestamp = get_load_timestamp(form.get('data_interval_end')) 43 | self.load_date = get_load_date(form.get('ds')) 44 | logger.info(self.__dict__) 45 | 46 | 47 | app = Flask(__name__) 48 | 49 | 50 | @app.route('/') 51 | def index(): 52 | return 'Check VPN Status
' \ 53 | 'List Downloaded Descriptions
' \ 54 | 'Download Sitemap
' \ 55 | 'List Job Descriptions to Download
' \ 56 | 'Download Job Descriptions
' \ 57 | 'Cleanse Sitemap
' \ 58 | 'Cleanse Job Descriptions
' \ 59 | 'Do dbt run
' \ 60 | 'Do Day Backup
' \ 61 | 'Validate Day Backup
' \ 62 | 'Test
' 63 | 64 | 65 | @app.route('/do/check_vpn_status') 66 | def do_check_vpn_status(): 67 | logger.info('is_connected_to_vpn: start') 68 | is_connected = is_connected_to_vpn() 69 | logger.info('is_connected_to_vpn: end') 70 | if is_connected: 71 | return SUCCESS 72 | else: 73 | return {'result_status': 'failed'}, 400 74 | 75 | 76 | @app.route('/do/list_downloaded_job_descriptions', methods=['GET', 'POST']) 77 | def do_list_downloaded_urls(): 78 | if request.method == 'POST': 79 | params = RequestParams(request) 80 | list_downloaded_job_descriptions(params.load_timestamp) 81 | return SUCCESS 82 | elif request.method == 'GET': 83 | return HTML_FORM 84 | 85 | 86 | @app.route('/do/download_sitemap', methods=['GET', 'POST']) 87 | def do_download_sitemap(): 88 | if request.method == 'POST': 89 | if is_connected_to_vpn(): 90 | params = RequestParams(request) 91 | download_sitemap(params.load_timestamp) 92 | return {'result_status': 'success'}, 200 93 | else: 94 | return {'result_status': 'failed'}, 400 95 | elif request.method == 'GET': 96 | return HTML_FORM 97 | 98 | 99 | @app.route('/do/list_job_descriptions_to_download', methods=['GET', 'POST']) 100 | def do_list_job_descriptions_to_download(): 101 | if request.method == 'POST': 102 | if is_connected_to_vpn(): 103 | params = RequestParams(request) 104 | list_job_descriptions_to_download(params.load_timestamp) 105 | return SUCCESS 106 | else: 107 | return {'result_status': 'failed'}, 400 108 | elif request.method == 'GET': 109 | return HTML_FORM 110 | 111 | 112 | @app.route('/do/download_job_descriptions', methods=['GET', 'POST']) 113 | def do_download_job_descriptions(): 114 | if request.method == 'POST': 115 | if is_connected_to_vpn(): 116 | params = RequestParams(request) 117 | download_job_descriptions(params.load_timestamp) 118 | return SUCCESS 119 | else: 120 | return {'result_status': 'failed'}, 400 121 | elif request.method == 'GET': 122 | return HTML_FORM 123 | 124 | 125 | @app.route('/do/cleanse_sitemaps', methods=['GET', 'POST']) 126 | def do_cleanse_sitemaps(): 127 | if request.method == 'POST': 128 | params = RequestParams(request) 129 | cleanse_sitemaps(params.load_timestamp, params.load_date) 130 | return SUCCESS 131 | elif request.method == 'GET': 132 | return HTML_FORM 133 | 134 | 135 | @app.route('/do/cleanse_job_descriptions', methods=['GET', 'POST']) 136 | def do_cleanse_job_descriptions(): 137 | if request.method == 'POST': 138 | params = RequestParams(request) 139 | cleanse_job_descriptions(params.load_timestamp, params.load_date) 140 | return SUCCESS 141 | elif request.method == 'GET': 142 | return HTML_FORM 143 | 144 | 145 | @app.route('/do/curate_sitemaps', methods=['GET', 'POST']) 146 | def do_curate_sitemaps(): 147 | if request.method == 'POST': 148 | params = RequestParams(request) 149 | curate_sitemaps(params.load_timestamp, params.load_date) 150 | return SUCCESS 151 | elif request.method == 'GET': 152 | return HTML_FORM 153 | 154 | 155 | @app.route('/do/curate_job_descriptions', methods=['GET', 'POST']) 156 | def do_curate_job_descriptions(): 157 | if request.method == 'POST': 158 | params = RequestParams(request) 159 | curate_job_descriptions(params.load_timestamp, params.load_date) 160 | return SUCCESS 161 | elif request.method == 'GET': 162 | return HTML_FORM 163 | 164 | 165 | @app.route('/do/do_day_backup', methods=['GET', 'POST']) 166 | def do_do_day_backup(): 167 | if request.method == 'POST': 168 | params = RequestParams(request) 169 | year, month, day = params.load_date.split('/') 170 | result = 
subprocess.run([f'{SOURCE_DIR}/simplescraper/do_day_backup.sh', year, month, day]) 171 | if result.returncode == SUCCESS_RETURN_CODE: 172 | return SUCCESS 173 | else: 174 | return { 175 | 'result_status': 'error', 176 | }, 400 177 | elif request.method == 'GET': 178 | return HTML_FORM 179 | 180 | 181 | @app.route('/do/do_dbt_run', methods=['GET', 'POST']) 182 | def do_dbt_run(): 183 | if request.method == 'POST': 184 | _ = RequestParams(request) 185 | result = subprocess.run([f'{SOURCE_DIR}/simplescraper/do_dbt_run.sh']) 186 | if result.returncode == SUCCESS_RETURN_CODE: 187 | return SUCCESS 188 | else: 189 | return { 190 | 'result_status': 'error', 191 | }, 400 192 | elif request.method == 'GET': 193 | return HTML_FORM 194 | 195 | 196 | @app.route('/do/verify_day_backup', methods=['GET', 'POST']) 197 | def do_verify_day_backup(): 198 | if request.method == 'POST': 199 | params = RequestParams(request) 200 | year, month, day = params.load_date.split('/') 201 | result = subprocess.run([f'{SOURCE_DIR}/simplescraper/verify_day_backup.sh', year, month, day]) 202 | if result.returncode == SUCCESS_RETURN_CODE: 203 | return SUCCESS 204 | else: 205 | return { 206 | 'result_status': 'error', 207 | }, 400 208 | elif request.method == 'GET': 209 | return HTML_FORM 210 | 211 | 212 | @app.route('/do/prune_old_raw', methods=['GET', 'POST']) 213 | def do_prune_old_raw(): 214 | if request.method == 'POST': 215 | params = RequestParams(request) 216 | prune_old_raw(params.load_timestamp, params.load_date) 217 | return SUCCESS 218 | elif request.method == 'GET': 219 | return HTML_FORM 220 | 221 | 222 | @app.route('/do/test', methods=['GET', 'POST']) 223 | def do_test(): 224 | if request.method == 'POST': 225 | params = RequestParams(request) 226 | return { 227 | 'result_status': 'success', 228 | 'load_timestamp': params.load_timestamp, 229 | 'load_date': params.load_date, 230 | }, 200 231 | elif request.method == 'GET': 232 | return HTML_FORM 233 | -------------------------------------------------------------------------------- /python/simplescraper/explore/explore_dwh_mart_dim_time.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "pycharm": { 8 | "name": "#%%\n" 9 | } 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "from common.explore import display_sql" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/plain": " date_key year month day month_name year_week day_of_week \\\n0 2021-10-09 2021 10 9 October 202140 6 \n1 2021-10-10 2021 10 10 October 202140 7 \n2 2021-10-11 2021 10 11 October 202141 1 \n3 2021-10-12 2021 10 12 October 202141 2 \n4 2021-10-13 2021 10 13 October 202141 3 \n.. ... ... ... ... ... ... ... \n353 2022-09-27 2022 9 27 September 202239 2 \n354 2022-09-28 2022 9 28 September 202239 3 \n355 2022-09-29 2022 9 29 September 202239 4 \n356 2022-09-30 2022 9 30 September 202239 5 \n357 2022-10-01 2022 10 1 October 202239 6 \n\n day_of_week_name \n0 Saturday \n1 Sunday \n2 Monday \n3 Tuesday \n4 Wednesday \n.. ... \n353 Tuesday \n354 Wednesday \n355 Thursday \n356 Friday \n357 Saturday \n\n[358 rows x 8 columns]", 23 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
date_keyyearmonthdaymonth_nameyear_weekday_of_weekday_of_week_name
02021-10-092021109October2021406Saturday
12021-10-1020211010October2021407Sunday
22021-10-1120211011October2021411Monday
32021-10-1220211012October2021412Tuesday
42021-10-1320211013October2021413Wednesday
...........................
3532022-09-272022927September2022392Tuesday
3542022-09-282022928September2022393Wednesday
3552022-09-292022929September2022394Thursday
3562022-09-302022930September2022395Friday
3572022-10-012022101October2022396Saturday
\n

358 rows × 8 columns

\n
" 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "display_sql(f'''\n", 32 | "WITH unique_online_at AS (\n", 33 | " SELECT DISTINCT online_at\n", 34 | " FROM curated.online_job\n", 35 | " ORDER BY 1\n", 36 | ")\n", 37 | "SELECT online_at as date_key,\n", 38 | " date_part('year', online_at) as year,\n", 39 | " date_part('month', online_at) as month,\n", 40 | " date_part('day', online_at) as day,\n", 41 | " monthname(online_at) as month_name,\n", 42 | " date_part('yearweek', online_at) as year_week,\n", 43 | " date_part('isodow', online_at) as day_of_week,\n", 44 | " dayname(online_at) as day_of_week_name\n", 45 | " FROM unique_online_at\n", 46 | "''')\n" 47 | ], 48 | "metadata": { 49 | "collapsed": false, 50 | "pycharm": { 51 | "name": "#%%\n" 52 | } 53 | } 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": " date_key year month day month_name year_week day_of_week \\\n0 2021-10-09 2021 10 9 October 202140 6 \n1 2021-10-10 2021 10 10 October 202140 7 \n2 2021-10-11 2021 10 11 October 202141 1 \n3 2021-10-12 2021 10 12 October 202141 2 \n4 2021-10-13 2021 10 13 October 202141 3 \n.. ... ... ... ... ... ... ... \n353 2022-09-27 2022 9 27 September 202239 2 \n354 2022-09-28 2022 9 28 September 202239 3 \n355 2022-09-29 2022 9 29 September 202239 4 \n356 2022-09-30 2022 9 30 September 202239 5 \n357 2022-10-01 2022 10 1 October 202239 6 \n\n day_of_week_name \n0 Saturday \n1 Sunday \n2 Monday \n3 Tuesday \n4 Wednesday \n.. ... \n353 Tuesday \n354 Wednesday \n355 Thursday \n356 Friday \n357 Saturday \n\n[358 rows x 8 columns]", 62 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
date_keyyearmonthdaymonth_nameyear_weekday_of_weekday_of_week_name
02021-10-092021109October2021406Saturday
12021-10-1020211010October2021407Sunday
22021-10-1120211011October2021411Monday
32021-10-1220211012October2021412Tuesday
42021-10-1320211013October2021413Wednesday
...........................
3532022-09-272022927September2022392Tuesday
3542022-09-282022928September2022393Wednesday
3552022-09-292022929September2022394Thursday
3562022-09-302022930September2022395Friday
3572022-10-012022101October2022396Saturday
\n

358 rows × 8 columns

\n
" 63 | }, 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "display_sql(f'''\n", 71 | "SELECT *\n", 72 | " FROM dim_time\n", 73 | "''')" 74 | ], 75 | "metadata": { 76 | "collapsed": false, 77 | "pycharm": { 78 | "name": "#%%\n" 79 | } 80 | } 81 | } 82 | ], 83 | "metadata": { 84 | "kernelspec": { 85 | "display_name": "Python 3 (ipykernel)", 86 | "language": "python", 87 | "name": "python3" 88 | }, 89 | "language_info": { 90 | "codemirror_mode": { 91 | "name": "ipython", 92 | "version": 3 93 | }, 94 | "file_extension": ".py", 95 | "mimetype": "text/x-python", 96 | "name": "python", 97 | "nbconvert_exporter": "python", 98 | "pygments_lexer": "ipython3", 99 | "version": "3.10.6" 100 | } 101 | }, 102 | "nbformat": 4, 103 | "nbformat_minor": 1 104 | } -------------------------------------------------------------------------------- /python/simplescraper/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.10 3 | # To update, run: 4 | # 5 | # pip-compile --allow-unsafe requirements.in 6 | # 7 | appnope==0.1.3 8 | # via 9 | # ipykernel 10 | # ipython 11 | argon2-cffi==21.3.0 12 | # via notebook 13 | argon2-cffi-bindings==21.2.0 14 | # via argon2-cffi 15 | astor==0.8.1 16 | # via wemake-python-styleguide 17 | asttokens==2.0.8 18 | # via stack-data 19 | attrs==22.1.0 20 | # via 21 | # flake8-bugbear 22 | # flake8-eradicate 23 | # jsonschema 24 | # pytest 25 | # wemake-python-styleguide 26 | azure-common==1.1.28 27 | # via 28 | # azure-storage-blob 29 | # azure-storage-common 30 | azure-storage-blob==2.1.0 31 | # via -r requirements.in 32 | azure-storage-common==2.1.0 33 | # via azure-storage-blob 34 | backcall==0.2.0 35 | # via ipython 36 | bandit==1.7.4 37 | # via flake8-bandit 38 | beautifulsoup4==4.11.1 39 | # via 40 | # -r requirements.in 41 | # nbconvert 42 | bleach==5.0.1 43 | # via nbconvert 44 | build==0.8.0 45 | # via pip-tools 46 | certifi==2022.6.15.1 47 | # via requests 48 | cffi==1.15.1 49 | # via 50 | # argon2-cffi-bindings 51 | # cryptography 52 | charset-normalizer==2.1.1 53 | # via requests 54 | click==8.1.3 55 | # via 56 | # flask 57 | # pip-tools 58 | cryptography==38.0.1 59 | # via azure-storage-common 60 | darglint==1.8.1 61 | # via wemake-python-styleguide 62 | debugpy==1.6.3 63 | # via ipykernel 64 | decorator==5.1.1 65 | # via ipython 66 | defusedxml==0.7.1 67 | # via nbconvert 68 | docutils==0.19 69 | # via restructuredtext-lint 70 | duckdb==0.7.0 71 | # via -r requirements.in 72 | entrypoints==0.4 73 | # via jupyter-client 74 | eradicate==2.1.0 75 | # via flake8-eradicate 76 | executing==1.0.0 77 | # via stack-data 78 | fastjsonschema==2.16.1 79 | # via nbformat 80 | flake8==4.0.1 81 | # via 82 | # flake8-bandit 83 | # flake8-broken-line 84 | # flake8-bugbear 85 | # flake8-commas 86 | # flake8-comprehensions 87 | # flake8-debugger 88 | # flake8-docstrings 89 | # flake8-eradicate 90 | # flake8-isort 91 | # flake8-polyfill 92 | # flake8-quotes 93 | # flake8-rst-docstrings 94 | # flake8-string-format 95 | # pep8-naming 96 | # wemake-python-styleguide 97 | flake8-bandit==3.0.0 98 | # via wemake-python-styleguide 99 | flake8-broken-line==0.4.0 100 | # via wemake-python-styleguide 101 | flake8-bugbear==22.9.11 102 | # via wemake-python-styleguide 103 | flake8-commas==2.1.0 104 | # via wemake-python-styleguide 105 | flake8-comprehensions==3.10.0 106 | # via wemake-python-styleguide 107 | 
flake8-debugger==4.1.2 108 | # via wemake-python-styleguide 109 | flake8-docstrings==1.6.0 110 | # via wemake-python-styleguide 111 | flake8-eradicate==1.3.0 112 | # via wemake-python-styleguide 113 | flake8-isort==4.2.0 114 | # via wemake-python-styleguide 115 | flake8-polyfill==1.0.2 116 | # via 117 | # flake8-bandit 118 | # pep8-naming 119 | flake8-quotes==3.3.1 120 | # via wemake-python-styleguide 121 | flake8-rst-docstrings==0.2.7 122 | # via wemake-python-styleguide 123 | flake8-string-format==0.3.0 124 | # via wemake-python-styleguide 125 | flask==2.2.2 126 | # via -r requirements.in 127 | gitdb==4.0.9 128 | # via gitpython 129 | gitpython==3.1.27 130 | # via bandit 131 | greenlet==2.0.1 132 | # via playwright 133 | gunicorn==20.1.0 134 | # via -r requirements.in 135 | idna==3.3 136 | # via requests 137 | iniconfig==1.1.1 138 | # via pytest 139 | ipykernel==6.15.2 140 | # via 141 | # ipywidgets 142 | # jupyter 143 | # jupyter-console 144 | # notebook 145 | # qtconsole 146 | ipython==8.5.0 147 | # via 148 | # ipykernel 149 | # ipywidgets 150 | # jupyter-console 151 | ipython-genutils==0.2.0 152 | # via 153 | # notebook 154 | # qtconsole 155 | ipywidgets==8.0.2 156 | # via jupyter 157 | isort==5.10.1 158 | # via flake8-isort 159 | itsdangerous==2.1.2 160 | # via flask 161 | jedi==0.18.1 162 | # via ipython 163 | jinja2==3.1.2 164 | # via 165 | # flask 166 | # nbconvert 167 | # notebook 168 | jsonschema==4.16.0 169 | # via nbformat 170 | jupyter==1.0.0 171 | # via -r requirements.in 172 | jupyter-client==7.3.5 173 | # via 174 | # ipykernel 175 | # jupyter-console 176 | # nbclient 177 | # notebook 178 | # qtconsole 179 | jupyter-console==6.4.4 180 | # via jupyter 181 | jupyter-core==4.11.1 182 | # via 183 | # jupyter-client 184 | # nbconvert 185 | # nbformat 186 | # notebook 187 | # qtconsole 188 | jupyterlab-pygments==0.2.2 189 | # via nbconvert 190 | jupyterlab-widgets==3.0.3 191 | # via ipywidgets 192 | kaleido==0.2.1 193 | # via -r requirements.in 194 | loguru==0.6.0 195 | # via -r requirements.in 196 | lxml==4.9.1 197 | # via 198 | # -r requirements.in 199 | # nbconvert 200 | markupsafe==2.1.1 201 | # via 202 | # jinja2 203 | # nbconvert 204 | # werkzeug 205 | matplotlib-inline==0.1.6 206 | # via 207 | # ipykernel 208 | # ipython 209 | mccabe==0.6.1 210 | # via flake8 211 | mistune==2.0.4 212 | # via nbconvert 213 | nbclient==0.6.8 214 | # via nbconvert 215 | nbconvert==7.0.0 216 | # via 217 | # jupyter 218 | # notebook 219 | nbformat==5.4.0 220 | # via 221 | # nbclient 222 | # nbconvert 223 | # notebook 224 | nest-asyncio==1.5.5 225 | # via 226 | # ipykernel 227 | # jupyter-client 228 | # nbclient 229 | # notebook 230 | notebook==6.4.12 231 | # via jupyter 232 | numpy==1.23.3 233 | # via 234 | # pandas 235 | # patsy 236 | # plotly-calplot 237 | # plotly-express 238 | # pyarrow 239 | # scipy 240 | # statsmodels 241 | packaging==21.3 242 | # via 243 | # build 244 | # ipykernel 245 | # nbconvert 246 | # pytest 247 | # qtpy 248 | # statsmodels 249 | pandas==1.4.4 250 | # via 251 | # -r requirements.in 252 | # plotly-calplot 253 | # plotly-express 254 | # statsmodels 255 | pandocfilters==1.5.0 256 | # via nbconvert 257 | parso==0.8.3 258 | # via jedi 259 | patsy==0.5.2 260 | # via 261 | # plotly-express 262 | # statsmodels 263 | pbr==5.10.0 264 | # via stevedore 265 | pep517==0.13.0 266 | # via build 267 | pep8-naming==0.12.1 268 | # via wemake-python-styleguide 269 | pexpect==4.8.0 270 | # via ipython 271 | pickleshare==0.7.5 272 | # via ipython 273 | pip-tools==6.8.0 274 | # via -r 
requirements.in 275 | playwright==1.30.0 276 | # via -r requirements.in 277 | plotly==5.10.0 278 | # via 279 | # plotly-calplot 280 | # plotly-express 281 | plotly-calplot==0.1.12 282 | # via -r requirements.in 283 | plotly-express==0.4.1 284 | # via -r requirements.in 285 | pluggy==1.0.0 286 | # via pytest 287 | prometheus-client==0.14.1 288 | # via notebook 289 | prompt-toolkit==3.0.31 290 | # via 291 | # ipython 292 | # jupyter-console 293 | psutil==5.9.2 294 | # via ipykernel 295 | ptyprocess==0.7.0 296 | # via 297 | # pexpect 298 | # terminado 299 | pure-eval==0.2.2 300 | # via stack-data 301 | py==1.11.0 302 | # via pytest 303 | pyarrow==9.0.0 304 | # via -r requirements.in 305 | pycodestyle==2.8.0 306 | # via 307 | # flake8 308 | # flake8-bandit 309 | # flake8-debugger 310 | pycparser==2.21 311 | # via cffi 312 | pydocstyle==6.1.1 313 | # via flake8-docstrings 314 | pyee==9.0.4 315 | # via playwright 316 | pyflakes==2.4.0 317 | # via flake8 318 | pygments==2.13.0 319 | # via 320 | # flake8-rst-docstrings 321 | # ipython 322 | # jupyter-console 323 | # nbconvert 324 | # qtconsole 325 | # wemake-python-styleguide 326 | pyparsing==3.0.9 327 | # via packaging 328 | pyrsistent==0.18.1 329 | # via jsonschema 330 | pytest==7.1.3 331 | # via -r requirements.in 332 | python-dateutil==2.8.2 333 | # via 334 | # azure-storage-common 335 | # jupyter-client 336 | # pandas 337 | python-dotenv==0.21.0 338 | # via -r requirements.in 339 | pytz==2022.2.1 340 | # via pandas 341 | pyyaml==6.0 342 | # via bandit 343 | pyzmq==23.2.1 344 | # via 345 | # ipykernel 346 | # jupyter-client 347 | # notebook 348 | # qtconsole 349 | qtconsole==5.3.2 350 | # via jupyter 351 | qtpy==2.2.0 352 | # via qtconsole 353 | requests==2.28.1 354 | # via 355 | # -r requirements.in 356 | # azure-storage-common 357 | restructuredtext-lint==1.4.0 358 | # via flake8-rst-docstrings 359 | scipy==1.9.1 360 | # via 361 | # plotly-express 362 | # statsmodels 363 | send2trash==1.8.0 364 | # via notebook 365 | six==1.16.0 366 | # via 367 | # asttokens 368 | # bleach 369 | # patsy 370 | # python-dateutil 371 | smmap==5.0.0 372 | # via gitdb 373 | snowballstemmer==2.2.0 374 | # via pydocstyle 375 | soupsieve==2.3.2.post1 376 | # via beautifulsoup4 377 | stack-data==0.5.0 378 | # via ipython 379 | statsmodels==0.13.2 380 | # via plotly-express 381 | stevedore==4.0.0 382 | # via bandit 383 | tenacity==8.0.1 384 | # via plotly 385 | terminado==0.15.0 386 | # via notebook 387 | tinycss2==1.1.1 388 | # via nbconvert 389 | tomli==2.0.1 390 | # via 391 | # build 392 | # pep517 393 | # pytest 394 | tornado==6.2 395 | # via 396 | # ipykernel 397 | # jupyter-client 398 | # notebook 399 | # terminado 400 | traitlets==5.4.0 401 | # via 402 | # ipykernel 403 | # ipython 404 | # ipywidgets 405 | # jupyter-client 406 | # jupyter-core 407 | # matplotlib-inline 408 | # nbclient 409 | # nbconvert 410 | # nbformat 411 | # notebook 412 | # qtconsole 413 | typing-extensions==4.3.0 414 | # via 415 | # pyee 416 | # wemake-python-styleguide 417 | urllib3==1.26.12 418 | # via requests 419 | wcwidth==0.2.5 420 | # via prompt-toolkit 421 | webencodings==0.5.1 422 | # via 423 | # bleach 424 | # tinycss2 425 | wemake-python-styleguide==0.16.1 426 | # via -r requirements.in 427 | werkzeug==2.2.2 428 | # via flask 429 | wheel==0.37.1 430 | # via pip-tools 431 | widgetsnbextension==4.0.3 432 | # via ipywidgets 433 | xmltodict==0.13.0 434 | # via -r requirements.in 435 | 436 | # The following packages are considered to be unsafe in a requirements file: 437 | 
pip==22.2.2 438 | # via pip-tools 439 | setuptools==65.3.0 440 | # via 441 | # flake8-eradicate 442 | # gunicorn 443 | # pip-tools 444 | -------------------------------------------------------------------------------- /docker/airflow/docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.3.4 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 31 | # 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 33 | # Default: airflow 34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 35 | # Default: airflow 36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 37 | # Default: '' 38 | # 39 | # Feel free to modify this file to suit your needs. 40 | --- 41 | version: '3' 42 | x-airflow-common: 43 | &airflow-common 44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 46 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 47 | image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.3.4} 48 | # build: . 
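  # A minimal sketch of such an extended-image Dockerfile (hypothetical example,
  # not part of this repository; adjust the requirements path to your layout before use):
  #   FROM apache/airflow:2.3.4
  #   COPY requirements.txt /requirements.txt
  #   RUN pip install --no-cache-dir -r /requirements.txt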
49 | environment: 50 | &airflow-common-env 51 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 52 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 53 | # For backward compatibility, with Airflow <2.3 54 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 55 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 56 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 57 | AIRFLOW__CORE__FERNET_KEY: ${AIRFLOW_FERNET_KEY} 58 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 59 | AIRFLOW__CORE__LOAD_EXAMPLES: ${AIRFLOW__CORE__LOAD_EXAMPLES} 60 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth' 61 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 62 | AIRFLOW_CONN_HTTP_DEFAULT: ${AIRFLOW_CONN_HTTP_DEFAULT} 63 | volumes: 64 | - ${AIRFLOW_DAGS_VOLUME}:/opt/airflow/dags 65 | - ${AIRFLOW_LOGS_VOLUME}:/opt/airflow/logs 66 | - ${AIRFLOW_PLUGINS_VOLUME}:/opt/airflow/plugins 67 | user: "${AIRFLOW_UID:-50000}:0" 68 | depends_on: 69 | &airflow-common-depends-on 70 | redis: 71 | condition: service_healthy 72 | postgres: 73 | condition: service_healthy 74 | 75 | services: 76 | postgres: 77 | image: postgres:13 78 | environment: 79 | POSTGRES_USER: airflow 80 | POSTGRES_PASSWORD: airflow 81 | POSTGRES_DB: airflow 82 | volumes: 83 | - postgres-db-volume:/var/lib/postgresql/data 84 | healthcheck: 85 | test: [ "CMD", "pg_isready", "-U", "airflow" ] 86 | interval: 5s 87 | retries: 5 88 | restart: always 89 | 90 | redis: 91 | image: redis:latest 92 | expose: 93 | - 6379 94 | healthcheck: 95 | test: [ "CMD", "redis-cli", "ping" ] 96 | interval: 5s 97 | timeout: 30s 98 | retries: 50 99 | restart: always 100 | 101 | airflow-webserver: 102 | <<: *airflow-common 103 | command: webserver 104 | ports: 105 | - 8080:8080 106 | healthcheck: 107 | test: [ "CMD", "curl", "--fail", "http://localhost:8080/health" ] 108 | interval: 10s 109 | timeout: 10s 110 | retries: 5 111 | restart: always 112 | depends_on: 113 | <<: *airflow-common-depends-on 114 | airflow-init: 115 | condition: service_completed_successfully 116 | volumes: 117 | - ${AIRFLOW_WEBSERVER_VOLUME}:/opt/airflow 118 | 119 | airflow-scheduler: 120 | <<: *airflow-common 121 | command: scheduler 122 | healthcheck: 123 | test: [ "CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"' ] 124 | interval: 10s 125 | timeout: 10s 126 | retries: 5 127 | restart: always 128 | depends_on: 129 | <<: *airflow-common-depends-on 130 | airflow-init: 131 | condition: service_completed_successfully 132 | 133 | airflow-worker: 134 | <<: *airflow-common 135 | command: celery worker 136 | healthcheck: 137 | test: 138 | - "CMD-SHELL" 139 | - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 140 | interval: 10s 141 | timeout: 10s 142 | retries: 5 143 | environment: 144 | <<: *airflow-common-env 145 | # Required to handle warm shutdown of the celery workers properly 146 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 147 | DUMB_INIT_SETSID: "0" 148 | restart: always 149 | depends_on: 150 | <<: *airflow-common-depends-on 151 | airflow-init: 152 | condition: service_completed_successfully 153 | 154 | airflow-triggerer: 155 | <<: *airflow-common 156 | command: triggerer 157 | healthcheck: 158 | test: [ "CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"' ] 159 | interval: 10s 160 | timeout: 10s 161 | retries: 5 
162 | restart: always 163 | depends_on: 164 | <<: *airflow-common-depends-on 165 | airflow-init: 166 | condition: service_completed_successfully 167 | 168 | airflow-init: 169 | <<: *airflow-common 170 | entrypoint: /bin/bash 171 | # yamllint disable rule:line-length 172 | command: 173 | - -c 174 | - | 175 | function ver() { 176 | printf "%04d%04d%04d%04d" $${1//./ } 177 | } 178 | airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version) 179 | airflow_version_comparable=$$(ver $${airflow_version}) 180 | min_airflow_version=2.2.0 181 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 182 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 183 | echo 184 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 185 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 186 | echo 187 | exit 1 188 | fi 189 | if [[ -z "${AIRFLOW_UID}" ]]; then 190 | echo 191 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 192 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 193 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 194 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 195 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user" 196 | echo 197 | fi 198 | one_meg=1048576 199 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 200 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 201 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 202 | warning_resources="false" 203 | if (( mem_available < 4000 )) ; then 204 | echo 205 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 206 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 207 | echo 208 | warning_resources="true" 209 | fi 210 | if (( cpus_available < 2 )); then 211 | echo 212 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 213 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 214 | echo 215 | warning_resources="true" 216 | fi 217 | if (( disk_available < one_meg * 10 )); then 218 | echo 219 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 220 | echo "At least 10 GBs recommended. 
You have $$(numfmt --to iec $$((disk_available * 1024 )))" 221 | echo 222 | warning_resources="true" 223 | fi 224 | if [[ $${warning_resources} == "true" ]]; then 225 | echo 226 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 227 | echo "Please follow the instructions to increase amount of resources available:" 228 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin" 229 | echo 230 | fi 231 | mkdir -p /sources/logs /sources/dags /sources/plugins 232 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 233 | exec /entrypoint airflow version 234 | # yamllint enable rule:line-length 235 | environment: 236 | <<: *airflow-common-env 237 | _AIRFLOW_DB_UPGRADE: 'true' 238 | _AIRFLOW_WWW_USER_CREATE: 'true' 239 | _AIRFLOW_WWW_USER_USERNAME: ${AIRFLOW_USERNAME:-airflow} 240 | _AIRFLOW_WWW_USER_PASSWORD: ${AIRFLOW_PASSWORD:-airflow} 241 | _PIP_ADDITIONAL_REQUIREMENTS: '' 242 | user: "0:0" 243 | volumes: 244 | - .:/sources 245 | 246 | airflow-cli: 247 | <<: *airflow-common 248 | profiles: 249 | - debug 250 | environment: 251 | <<: *airflow-common-env 252 | CONNECTION_CHECK_MAX_COUNT: "0" 253 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 254 | command: 255 | - bash 256 | - -c 257 | - airflow 258 | 259 | # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up 260 | # or by explicitly targeted on the command line e.g. docker-compose up flower. 261 | # See: https://docs.docker.com/compose/profiles/ 262 | flower: 263 | <<: *airflow-common 264 | command: celery flower 265 | profiles: 266 | - flower 267 | ports: 268 | - 5555:5555 269 | healthcheck: 270 | test: [ "CMD", "curl", "--fail", "http://localhost:5555/" ] 271 | interval: 10s 272 | timeout: 10s 273 | retries: 5 274 | restart: always 275 | depends_on: 276 | <<: *airflow-common-depends-on 277 | airflow-init: 278 | condition: service_completed_successfully 279 | 280 | volumes: 281 | postgres-db-volume: -------------------------------------------------------------------------------- /python/simplescraper/explore/explore_dwh_location.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "outputs": [], 7 | "source": [ 8 | "import duckdb\n", 9 | "import pandas as pd\n", 10 | "import plotly.express as px\n", 11 | "from plotly_calplot import calplot\n", 12 | "\n", 13 | "from common.env_variables import DUCKDB_DWH_FILE" 14 | ], 15 | "metadata": { 16 | "collapsed": false, 17 | "pycharm": { 18 | "name": "#%%\n" 19 | } 20 | } 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "outputs": [], 26 | "source": [ 27 | "def display_df(df):\n", 28 | " with pd.option_context('display.max_rows', None, 'display.max_columns', None, \"expand_frame_repr\", False, \"display.float_format\", '${:,.2f}'.format):\n", 29 | " display(df.fillna('.'))" 30 | ], 31 | "metadata": { 32 | "collapsed": false, 33 | "pycharm": { 34 | "name": "#%%\n" 35 | } 36 | } 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "outputs": [], 42 | "source": [ 43 | "conn = duckdb.connect(DUCKDB_DWH_FILE, read_only=True)" 44 | ], 45 | "metadata": { 46 | "collapsed": false, 47 | "pycharm": { 48 | "name": "#%%\n" 49 | } 50 | } 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": " location 
job_count\n0 Berlin 98461\n1 Hamburg 88763\n2 München 85657\n3 Frankfurt am Main 55276\n4 Stuttgart 44858\n5 Köln 44203\n6 Düsseldorf 42574\n7 Hannover 21827\n8 Nürnberg 18802\n9 Leipzig 16443\n10 Essen 15877\n11 Bremen 14867\n12 Karlsruhe 14226\n13 Mannheim 12509\n14 Dortmund 12068\n15 Bonn 11815\n16 Dresden 11510\n17 Münster 9021\n18 Wiesbaden 7999\n19 Ulm 7942\n20 Bielefeld 6671\n21 Mainz 6622\n22 Augsburg 6620\n23 Heidelberg 6426\n24 Kiel 6137\n25 Duisburg 6050\n26 bundesweit 5759\n27 Regensburg 5731\n28 Darmstadt 5592\n29 Braunschweig 5492\n30 Aachen 5183\n31 Neckarsulm 5086\n32 Bochum 4981\n33 Erfurt 4973\n34 Ingolstadt 4836\n35 Kassel 4659\n36 Wolfsburg 4471\n37 Würzburg 4439\n38 Freiburg 4310\n39 Lübeck 4276\n40 Kreisfreie Stadt 4098\n41 Gütersloh 4083\n42 Home-Office 3971\n43 Osnabrück 3851\n44 Magdeburg 3825\n45 Rostock 3763\n46 Heilbronn 3728\n47 Potsdam 3670\n48 Koblenz 3550\n49 Wuppertal 3415\n50 Freiburg im Breisgau 3394\n51 Reutlingen 3381\n52 Krefeld 3370\n53 Jena 3363\n54 Sindelfingen 3260\n55 Chemnitz 3234\n56 Mönchengladbach 3130\n57 Saarbrücken 3034\n58 Ludwigsburg 2982\n59 Oldenburg 2739\n60 Neuss 2739\n61 Erlangen 2553\n62 Pforzheim 2552\n63 Göttingen 2536\n64 Ratingen 2489\n65 Paderborn 2460\n66 deutschlandweit 2376\n67 Tübingen 2363\n68 Norderstedt 2317\n69 Leverkusen 2244\n70 Eschborn 2189\n71 Main 2172\n72 Homeoffice 2159\n73 Oberkochen 2140\n74 Ludwigshafen 2097\n75 Oberhausen 2082\n76 Böblingen 2075\n77 Leinfelden-Echterdingen 2037\n78 Bayreuth 1997\n79 Offenburg 1967\n80 Halle (Saale) 1949\n81 Hanau 1851\n82 Minden 1782\n83 Kaiserslautern 1759\n84 Fulda 1680\n85 Fürth 1678\n86 Gelsenkirchen 1669\n87 Baden-Baden 1655\n88 Bamberg 1654\n89 Hildesheim 1627\n90 Munich 1618\n91 Gießen 1611\n92 Landshut 1604\n93 Konstanz 1602\n94 Friedrichshafen 1588\n95 Hagen 1588\n96 Baden-Württemberg 1557\n97 Neu-Isenburg 1553\n98 Flensburg 1493\n99 Trier 1483", 59 | "text/html": "
[HTML rendering of the same dataframe as the text/plain output above: the top 100 locations by job_count, Berlin (98461) through Trier (1483)]
" 60 | }, 61 | "metadata": {}, 62 | "output_type": "display_data" 63 | } 64 | ], 65 | "source": [ 66 | "df = conn.execute(f'''\n", 67 | "SELECT * FROM location\n", 68 | "LIMIT 100\n", 69 | "''').df()\n", 70 | "display_df(df)" 71 | ], 72 | "metadata": { 73 | "collapsed": false, 74 | "pycharm": { 75 | "name": "#%%\n" 76 | } 77 | } 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "outputs": [], 83 | "source": [ 84 | "conn.close()" 85 | ], 86 | "metadata": { 87 | "collapsed": false, 88 | "pycharm": { 89 | "name": "#%%\n" 90 | } 91 | } 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 2 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython2", 110 | "version": "2.7.6" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 0 115 | } --------------------------------------------------------------------------------