├── .gitattributes
├── sql
│   └── dwh
│       ├── job_market_analytics
│       │   ├── seeds
│       │   │   └── .gitkeep
│       │   ├── tests
│       │   │   └── .gitkeep
│       │   ├── analyses
│       │   │   └── .gitkeep
│       │   ├── macros
│       │   │   └── .gitkeep
│       │   ├── snapshots
│       │   │   └── .gitkeep
│       │   ├── .gitignore
│       │   ├── models
│       │   │   ├── mart
│       │   │   │   ├── normalized_online_job_months_max.sql
│       │   │   │   ├── normalized_online_job_months_1.sql
│       │   │   │   ├── normalized_online_job_months_3.sql
│       │   │   │   ├── normalized_online_job_months_12.sql
│       │   │   │   ├── latest_dim_job.sql
│       │   │   │   ├── dim_time.sql
│       │   │   │   ├── dim_job.sql
│       │   │   │   ├── fact_online_job.sql
│       │   │   │   ├── dim_job_location.sql
│       │   │   │   ├── dim_job_technology.sql
│       │   │   │   └── normalized_online_job.sql
│       │   │   └── sources.yml
│       │   ├── README.md
│       │   └── dbt_project.yml
│       ├── requirements.in
│       ├── update_requirements.sh
│       └── requirements.txt
├── docker
│   ├── airflow
│   │   ├── logs
│   │   │   └── scheduler
│   │   │       └── latest
│   │   ├── docker-compose-down.sh
│   │   ├── restart_worker_and_scheduler.sh
│   │   ├── .env.example
│   │   └── docker-compose.yml
│   └── postgres
│       ├── postgres-parquet-fdw
│       │   ├── s3-download-parquet-fdw.sh
│       │   ├── Dockerfile
│       │   ├── s4-install-parquet-fdw.sh
│       │   ├── s1-download-arrow.sh
│       │   └── s2-install-arrow.sh
│       ├── .env.example
│       ├── README.md
│       └── docker-compose.yml
├── doc
│   ├── dbt-dag.png
│   ├── airflow_dag_daily.png
│   ├── scrape_data_source_dag.png
│   ├── raw-in-azure-blob-storage.png
│   ├── TODO-search-document-structure.json
│   ├── TODO-search.md
│   ├── TODO-search-pre-search-data-model.md
│   ├── metaData-bag.log
│   └── TODO.md
├── python
│   ├── dashy
│   │   ├── .env.example
│   │   ├── requirements.in
│   │   ├── start_dashy.sh
│   │   ├── update_requirements.sh
│   │   └── requirements.txt
│   ├── utils
│   │   ├── generate_fernet_key.py
│   │   ├── migrate_to_raw_v3.py
│   │   └── migrate_raw_v1_to_raw_v2.py
│   ├── tests
│   │   ├── test_get_run_timestamp.py
│   │   ├── test_get_chunk_size.py
│   │   ├── test_parse_job_description.py
│   │   └── data
│   │       └── normalize_job_description
│   │           └── output
│   │               ├── test_case_7610222.json
│   │               ├── test_case_7610188.json
│   │               └── test_case_7609275.json
│   ├── airflow
│   │   ├── start_airflow_scheduler.sh
│   │   ├── start_airflow_webserver.sh
│   │   ├── create_user.sh
│   │   ├── airflow_home
│   │   │   └── dags
│   │   │       ├── common_airflow_dag.py
│   │   │       ├── test_dag.py
│   │   │       ├── job_market_analytics_curate_sitemaps_catch_up_dag.py
│   │   │       ├── job_market_analytics_cleanse_sitemaps_catch_up_dag.py
│   │   │       ├── job_market_analytics_curate_job_descriptions_catch_up_dag.py
│   │   │       ├── job_market_analytics_cleanse_job_descriptions_catch_up_dag.py
│   │   │       ├── job_market_analytics_cleanse_catch_up_dag.py
│   │   │       ├── job_market_analytics_curate_catch_up_dag_v2.py
│   │   │       ├── job_market_analytics_hourly_dag.py
│   │   │       ├── job_market_analytics_daily_dag.py
│   │   │       └── job_market_analytics_daily_dag_catch_up.py
│   │   ├── .env.example
│   │   ├── configure_posgresql.sh
│   │   └── install_airflow.sh
│   ├── simplescraper
│   │   ├── do_dbt_run.sh
│   │   ├── start_flasky.sh
│   │   ├── start_dashy_static.sh
│   │   ├── requirements.in
│   │   ├── common
│   │   │   ├── logging.py
│   │   │   ├── chunking.py
│   │   │   ├── explore.py
│   │   │   ├── entity.py
│   │   │   ├── webclient.py
│   │   │   ├── env_variables.py
│   │   │   └── storage.py
│   │   ├── cron_job.sh
│   │   ├── update_requirements.sh
│   │   ├── tasks
│   │   │   ├── list_downloaded_sitemaps.py
│   │   │   ├── curate_sitemaps.py
│   │   │   ├── list_job_descriptions_to_download.py
│   │   │   ├── prune_old_raw.py
│   │   │   ├── list_downloaded_job_descriptions.py
│   │   │   ├── cleanse_sitemaps.py
│   │   │   ├── cleanse_job_descriptions.py
│   │   │   ├── parse_job_description.py
│   │   │   ├── download_sitemap.py
│   │   │   ├── curate_job_descriptions.py
│   │   │   └── download_job_descriptions.py
│   │   ├── do_day_backup.sh
│   │   ├── dashy_static.py
│   │   ├── .env.example
│   │   ├── create_curated_views_in_dwh.py
│   │   ├── restore_day_backup.sh
│   │   ├── scrape_data_source.py
│   │   ├── verify_day_backup.sh
│   │   ├── verify_all_backups.py
│   │   ├── restore_all_backups.py
│   │   ├── do_all_backups.py
│   │   ├── explore
│   │   │   ├── explore_dwh_mart.ipynb
│   │   │   ├── explore_dwh_mart_dim_time.ipynb
│   │   │   └── explore_dwh_location.ipynb
│   │   ├── flasky.py
│   │   └── requirements.txt
│   └── .flake8
├── Brewfile
├── azure
│   ├── .env.example
│   └── sync-remote-to-local.sh
├── .gitignore
└── README.md
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb linguist-vendored
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/seeds/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/tests/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/analyses/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/macros/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/snapshots/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/sql/dwh/requirements.in:
--------------------------------------------------------------------------------
1 | dbt-duckdb==1.5.1
2 | duckdb==0.7.0
3 |
--------------------------------------------------------------------------------
/docker/airflow/logs/scheduler/latest:
--------------------------------------------------------------------------------
1 | /opt/airflow/logs/scheduler/2022-07-30
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | target/
3 | dbt_packages/
4 | logs/
5 |
--------------------------------------------------------------------------------
/doc/dbt-dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petracarrion/job-market-analytics/HEAD/doc/dbt-dag.png
--------------------------------------------------------------------------------
/doc/airflow_dag_daily.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petracarrion/job-market-analytics/HEAD/doc/airflow_dag_daily.png
--------------------------------------------------------------------------------
/python/dashy/.env.example:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export DUCKDB_DWH_FILE=
4 | export VENV_ACTIVATE=
5 | export LOG_FOLDER=
6 |
--------------------------------------------------------------------------------
/doc/scrape_data_source_dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petracarrion/job-market-analytics/HEAD/doc/scrape_data_source_dag.png
--------------------------------------------------------------------------------
/doc/raw-in-azure-blob-storage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petracarrion/job-market-analytics/HEAD/doc/raw-in-azure-blob-storage.png
--------------------------------------------------------------------------------
/python/utils/generate_fernet_key.py:
--------------------------------------------------------------------------------
1 | from cryptography.fernet import Fernet
2 |
3 | fernet_key = Fernet.generate_key()
4 | print(fernet_key.decode())
5 |
--------------------------------------------------------------------------------
/python/dashy/requirements.in:
--------------------------------------------------------------------------------
1 | dash
2 | dash-bootstrap-components
3 | duckdb==0.7.0
4 | gunicorn
5 | jupyter-dash
6 | loguru
7 | pandas
8 | python-dotenv
9 |
--------------------------------------------------------------------------------
/Brewfile:
--------------------------------------------------------------------------------
1 | tap "homebrew/bundle"
2 | tap "homebrew/core"
3 | brew "openblas"
4 | brew "parquet-tools"
5 | brew "postgresql"
6 | brew "rdfind"
7 | brew "rust"
8 | brew "wget"
9 |
--------------------------------------------------------------------------------
/docker/airflow/docker-compose-down.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
4 | cd "$SCRIPTPATH" || exit
5 |
6 | docker compose down
7 |
--------------------------------------------------------------------------------
/docker/postgres/postgres-parquet-fdw/s3-download-parquet-fdw.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | apt-get update
4 | apt-get install -y git
5 |
6 | git clone https://github.com/adjust/parquet_fdw.git
7 |
--------------------------------------------------------------------------------
/azure/.env.example:
--------------------------------------------------------------------------------
1 | RAW_DIR=
2 |
3 | AZURE_STORAGE_CONTAINER_RAW_DIR_URL=
4 |
5 | export AZCOPY_AUTO_LOGIN_TYPE=SPN
6 | export AZCOPY_SPA_APPLICATION_ID=
7 | export AZCOPY_SPA_CLIENT_SECRET=
8 | export AZCOPY_TENANT_ID=
--------------------------------------------------------------------------------
/python/tests/test_get_run_timestamp.py:
--------------------------------------------------------------------------------
1 | from common.storage import get_load_timestamp
2 |
3 |
4 | def test_get_load_timestamp():
5 | assert get_load_timestamp('2022-01-22T12:49:39.448434+00:00') == '2022/01/22/12-49-39'
6 |
--------------------------------------------------------------------------------
/docker/postgres/.env.example:
--------------------------------------------------------------------------------
1 | POSTGRES_USER=
2 | POSTGRES_PASSWORD=
3 | POSTGRES_DB=
4 | POSTGRES_VOLUME=
5 | POSTGRES_PARQUET_FDW_VOLUME=
6 |
7 | PGADMIN_DEFAULT_EMAIL=
8 | PGADMIN_DEFAULT_PASSWORD=
9 | PGADMIN_VOLUME=
10 |
--------------------------------------------------------------------------------
/docker/postgres/README.md:
--------------------------------------------------------------------------------
1 | # Infrastructure
2 |
3 | ## How to run it
4 |
5 | Go to the folder postgres-parquet-fdw and run:
6 |
7 | `docker build -t postgres-parquet-fdw:v1 .`
8 |
9 | Then run:
10 |
11 | `docker-compose up`
--------------------------------------------------------------------------------
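Note: once the image is built and the compose stack is up, a quick way to confirm that parquet_fdw works is to create a foreign table over a Parquet file placed in the mounted volume. The snippet below is only an illustrative sketch and not a file from this repository: the credentials, the example file name under /var/lib/parquet-fdw/data and its columns are placeholder assumptions, while the CREATE EXTENSION / CREATE SERVER / CREATE FOREIGN TABLE ... OPTIONS (filename ...) statements follow the parquet_fdw documentation.

    # Sketch: verify parquet_fdw from the host once the container is running.
    # Assumes psycopg2 is installed and that online_job.parquet (illustrative
    # name and columns) was copied into the volume mapped to
    # /var/lib/parquet-fdw/data in docker-compose.yml.
    import psycopg2

    conn = psycopg2.connect(host='localhost', port=5432,
                            user='my_postgres_user',        # placeholder, see .env
                            password='my_postgres_password',  # placeholder, see .env
                            dbname='my_postgres_db')          # placeholder, see .env
    conn.autocommit = True
    with conn.cursor() as cur:
        cur.execute("CREATE EXTENSION IF NOT EXISTS parquet_fdw;")
        cur.execute("CREATE SERVER IF NOT EXISTS parquet_srv "
                    "FOREIGN DATA WRAPPER parquet_fdw;")
        cur.execute("""
            CREATE FOREIGN TABLE IF NOT EXISTS online_job (
                job_id    bigint,
                online_at date
            )
            SERVER parquet_srv
            OPTIONS (filename '/var/lib/parquet-fdw/data/online_job.parquet');
        """)
        cur.execute("SELECT count(*) FROM online_job;")
        print(cur.fetchone()[0])  # row count read straight from the Parquet file
    conn.close()
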
/sql/dwh/job_market_analytics/models/mart/normalized_online_job_months_max.sql:
--------------------------------------------------------------------------------
1 | {{
2 | config(
3 | materialized='view'
4 | )
5 | }}
6 |
7 | SELECT *
8 | FROM {{ ref('normalized_online_job') }}
9 | ORDER BY online_at
10 |
--------------------------------------------------------------------------------
/python/airflow/start_airflow_scheduler.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
4 | cd "$SCRIPTPATH" || exit
5 |
6 | source .env
7 |
8 | source "${VENV_ACTIVATE}"
9 |
10 | airflow scheduler
11 |
--------------------------------------------------------------------------------
/python/airflow/start_airflow_webserver.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
4 | cd "$SCRIPTPATH" || exit
5 |
6 | source .env
7 |
8 | source "${VENV_ACTIVATE}"
9 |
10 | airflow webserver
11 |
--------------------------------------------------------------------------------
/docker/postgres/postgres-parquet-fdw/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM postgres:14.1 as postgres-parquet-fdw
2 |
3 | COPY *.sh /usr/local/bin/
4 |
5 | RUN s1-download-arrow.sh
6 | RUN s2-install-arrow.sh
7 | RUN s3-download-parquet-fdw.sh
8 | RUN s4-install-parquet-fdw.sh
9 |
--------------------------------------------------------------------------------
/docker/postgres/postgres-parquet-fdw/s4-install-parquet-fdw.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | apt-get update
4 | apt-get -y install \
5 | build-essential \
6 | cmake \
7 | postgresql-server-dev-14
8 |
9 | cd parquet_fdw || exit
10 | make install
11 |
--------------------------------------------------------------------------------
/python/simplescraper/do_dbt_run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
4 | cd "$SCRIPTPATH" || exit
5 |
6 | source .env
7 | source "${DBT_VENV_ACTIVATE}"
8 |
9 | cd "$DBT_DIR" || exit
10 |
11 | dbt run
12 |
--------------------------------------------------------------------------------
/sql/dwh/update_requirements.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
4 | cd "$SCRIPTPATH" || exit
5 |
6 | source .env
7 |
8 | source "${VENV_ACTIVATE}"
9 |
10 | pip install -r requirements.in
11 | pip freeze > requirements.txt
12 |
--------------------------------------------------------------------------------
/azure/sync-remote-to-local.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | source .env
4 |
5 | azcopy login --service-principal --application-id "$AZCOPY_SPA_APPLICATION_ID" --tenant-id="$AZCOPY_TENANT_ID"
6 |
7 | azcopy sync "${RAW_DIR}" "${AZURE_STORAGE_CONTAINER_RAW_DIR_URL}" --recursive --exclude-pattern=".*"
8 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/models/mart/normalized_online_job_months_1.sql:
--------------------------------------------------------------------------------
1 | {{
2 | config(
3 | materialized='table'
4 | )
5 | }}
6 |
7 | SELECT *
8 | FROM {{ ref('normalized_online_job') }}
9 | WHERE online_at >= current_date - INTERVAL 1 MONTH
10 | ORDER BY online_at
11 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/models/mart/normalized_online_job_months_3.sql:
--------------------------------------------------------------------------------
1 | {{
2 | config(
3 | materialized='table'
4 | )
5 | }}
6 |
7 | SELECT *
8 | FROM {{ ref('normalized_online_job') }}
9 | WHERE online_at >= current_date - INTERVAL 3 MONTH
10 | ORDER BY online_at
11 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/models/mart/normalized_online_job_months_12.sql:
--------------------------------------------------------------------------------
1 | {{
2 | config(
3 | materialized='table'
4 | )
5 | }}
6 |
7 | SELECT *
8 | FROM {{ ref('normalized_online_job') }}
9 | WHERE online_at >= current_date - INTERVAL 12 MONTH
10 | ORDER BY online_at
11 |
--------------------------------------------------------------------------------
/python/dashy/start_dashy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
4 | cd "$SCRIPTPATH" || exit
5 |
6 | source .env
7 |
8 | source "${VENV_ACTIVATE}"
9 |
10 | gunicorn --workers 1 --timeout 600 --bind 0.0.0.0:8051 dashy:server --access-logfile '-'
11 |
--------------------------------------------------------------------------------
/python/tests/test_get_chunk_size.py:
--------------------------------------------------------------------------------
1 | from common.chunking import get_chunk_size
2 |
3 |
4 | def test_get_chunk_size():
5 | assert get_chunk_size(1000, 10, 500) == 100
6 | assert get_chunk_size(1000, 10, 50) == 50
7 | assert get_chunk_size(60, 4, 10) == 8
8 | assert get_chunk_size(100, 4, 10) == 9
9 |
--------------------------------------------------------------------------------
/docker/postgres/postgres-parquet-fdw/s1-download-arrow.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | apt update
4 | apt install -y -V ca-certificates lsb-release wget
5 | wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
6 |
--------------------------------------------------------------------------------
/python/simplescraper/start_flasky.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
4 | cd "$SCRIPTPATH" || exit
5 |
6 | source .env
7 |
8 | source "${VENV_ACTIVATE}"
9 |
10 | ulimit -n 4096
11 | gunicorn --workers 4 --timeout 3600 --bind 0.0.0.0:3001 'flasky:app'
12 |
--------------------------------------------------------------------------------
/python/simplescraper/start_dashy_static.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
4 | cd "$SCRIPTPATH" || exit
5 |
6 | source .env
7 |
8 | source "${VENV_ACTIVATE}"
9 |
10 | ulimit -n 4096
11 | gunicorn --workers 4 --timeout 3600 --bind 0.0.0.0:8054 'dashy_static:app'
12 |
--------------------------------------------------------------------------------
/docker/airflow/restart_worker_and_scheduler.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
4 | cd "$SCRIPTPATH" || exit
5 |
6 | for container in airflow-worker airflow-scheduler; do
7 | docker compose stop $container
8 | docker compose rm -f $container
9 | docker compose up $container -d
10 | done
11 |
--------------------------------------------------------------------------------
/python/simplescraper/requirements.in:
--------------------------------------------------------------------------------
1 | azure-storage-blob==2.1.0
2 | beautifulsoup4
3 | duckdb==0.7.0
4 | Flask
5 | gunicorn
6 | jupyter
7 | kaleido
8 | lxml
9 | loguru
10 | pandas
11 | pip-tools
12 | playwright==1.30.0
13 | plotly-calplot
14 | plotly-express
15 | pyarrow
16 | pytest
17 | python-dotenv
18 | requests
19 | wemake-python-styleguide
20 | xmltodict
21 |
--------------------------------------------------------------------------------
/python/simplescraper/common/logging.py:
--------------------------------------------------------------------------------
1 | import os.path
2 | import sys
3 |
4 | from loguru import logger
5 |
6 | from common.env_variables import TEMP_DIR
7 |
8 |
9 | def configure_logger(load_timestamp):
10 | logger.remove()
11 | logger.add(sys.stdout, colorize=True)
12 | logger.add(os.path.join(TEMP_DIR, load_timestamp, f'00_logs.log'))
13 |
14 |
15 | logger = logger
16 |
--------------------------------------------------------------------------------
/python/airflow/create_user.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | source .env
4 |
5 | airflow users create \
6 | --role Admin \
7 | --username "${AIRFLOW_USERNAME}" \
8 | --password "${AIRFLOW_PASSWORD}" \
9 | --email "${AIRFLOW_EMAIL}" \
10 | --firstname "${AIRFLOW_FIRSTNAME}" \
11 | --lastname "${AIRFLOW_LASTNAME}"
12 |
13 | airflow users delete -e admin
14 |
--------------------------------------------------------------------------------
/python/airflow/airflow_home/dags/common_airflow_dag.py:
--------------------------------------------------------------------------------
1 | from airflow.operators.python import get_current_context
2 | from airflow.providers.http.hooks.http import HttpHook
3 |
4 |
5 | def run_flasky_task(endpoint):
6 | context = get_current_context()
7 | data = {
8 | 'data_interval_end': context['data_interval_end'],
9 | 'ds': context['ds'],
10 | }
11 | HttpHook().run(endpoint, data)
12 |
--------------------------------------------------------------------------------
/doc/TODO-search-document-structure.json:
--------------------------------------------------------------------------------
1 | {
2 | "job_id": "4324234",
3 | "short_description": "Snail Collector at Alternative Food in Berlin or Hamburg",
4 | "url": "https://data.source/snail-collector-berlin-hamburg.html",
5 | "locations": [
6 | "Berlin",
7 | "Hamburg"
8 | ],
9 | "online_week": [
10 | "2022W11",
11 | "2022W10",
12 | "2022W09",
13 | "2022W02",
14 | "2022W01"
15 | ]
16 | }
--------------------------------------------------------------------------------
/doc/TODO-search.md:
--------------------------------------------------------------------------------
1 | # Search
2 |
3 | ## Facets
4 |
5 | - Company
6 | - Position
7 | - Technology
8 | - Location
9 | - Date?
10 |
11 | ## Document Fields
12 |
13 | - Job ID?
14 | - Job Short Description
15 | - Job Name
16 | - Job Company
17 | - Job Locations
18 | - Job URL
19 | - Job Online Dates
20 |
21 | ## Document Structure
22 |
23 | See [TODO-search-document-structure.json](TODO-search-document-structure.json)
24 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/models/sources.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | sources:
4 | - name: curated
5 | schema: curated
6 | freshness: # default freshness
7 | warn_after: { count: 24, period: hour }
8 | error_after: { count: 36, period: hour }
9 | loaded_at_field: load_timestamp
10 | tables:
11 | - name: online_job
12 | - name: job
13 | - name: job_location
14 | - name: job_technology
15 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/models/mart/latest_dim_job.sql:
--------------------------------------------------------------------------------
1 | SELECT job_key,
2 | job_id,
3 | job_ldts,
4 | title,
5 | company_name
6 | FROM (
7 | SELECT job_key,
8 | job_id,
9 | job_ldts,
10 | title,
11 | company_name,
12 | ROW_NUMBER() OVER (PARTITION BY job_id ORDER BY job_ldts DESC) rn
13 | FROM {{ ref('dim_job') }}
14 | )
15 | WHERE rn = 1
16 |
--------------------------------------------------------------------------------
/python/simplescraper/common/chunking.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 |
4 | def get_chunk_size(total, slots, max_chunk_size):
5 | max_run_size = slots * max_chunk_size
6 |
7 | number_of_runs = total / max_run_size
8 | number_of_runs = int(math.ceil(number_of_runs))
9 |
10 | number_of_chunks = number_of_runs * slots
11 |
12 | chunk_size = total / number_of_chunks
13 | chunk_size = int(math.ceil(chunk_size))
14 |
15 | return chunk_size
16 |
--------------------------------------------------------------------------------
/python/simplescraper/cron_job.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Add the following to the cron jobs: 42 * * * * REPLACE_ME/cron_job.sh
4 |
5 | /usr/sbin/scutil --nc list | grep Connected | grep vpn || {
6 | echo "Please connect to the VPN"
7 | exit 1
8 | }
9 |
10 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
11 | cd "$SCRIPTPATH" || exit
12 |
13 | source .env
14 |
15 | source "${VENV_ACTIVATE}"
16 |
17 | "${VENV_PYTHON}" "${SOURCE_DIR}"/simplescraper/scrape_data_source.py
18 |
--------------------------------------------------------------------------------
/python/airflow/.env.example:
--------------------------------------------------------------------------------
1 | export VENV_ACTIVATE=
2 |
3 | export AIRFLOW_HOME=
4 |
5 | export AIRFLOW_DATABASE_NAME=
6 | export AIRFLOW_DATABASE_USERNAME=
7 | export AIRFLOW_DATABASE_PASSWORD=
8 |
9 | export AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=
10 | export AIRFLOW__CORE__EXECUTOR=
11 |
12 | export AIRFLOW_USERNAME=
13 | export AIRFLOW_PASSWORD=
14 | export AIRFLOW_EMAIL=
15 | export AIRFLOW_FIRSTNAME=
16 | export AIRFLOW_LASTNAME=
17 |
18 | export AIRFLOW__CORE__LOAD_EXAMPLES=
19 |
20 | export AIRFLOW_CONN_HTTP_DEFAULT=
21 |
--------------------------------------------------------------------------------
/python/airflow/airflow_home/dags/test_dag.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from airflow import DAG
4 | from airflow.decorators import task
5 |
6 | from common_airflow_dag import run_flasky_task
7 |
8 | with DAG('test_dag2',
9 | description='Test DAG',
10 | schedule_interval='@daily',
11 | start_date=datetime(2022, 7, 29),
12 | catchup=False) as dag:
13 | @task(task_id="test_task")
14 | def run_test():
15 | run_flasky_task('do/test')
16 |
17 |
18 | run_test()
19 |
--------------------------------------------------------------------------------
/python/simplescraper/update_requirements.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
4 | cd "$SCRIPTPATH" || exit
5 |
6 | source .env
7 |
8 | source "${VENV_ACTIVATE}"
9 |
10 |
11 | if ! pip show pip-tools; then
12 | pip install pip-tools
13 | fi
14 |
15 | pip-compile requirements.in --allow-unsafe
16 | pip-sync
17 | # pip install "apache-airflow[celery]==2.2.3" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.2.3/constraints-3.8.txt"
18 | # pip install dbt-postgres
19 |
--------------------------------------------------------------------------------
/python/airflow/configure_posgresql.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
4 | cd "$SCRIPTPATH" || exit
5 |
6 | source .env
7 |
8 | sudo -u postgres psql -c "CREATE DATABASE ${AIRFLOW_DATABASE_NAME};"
9 |
10 | sudo -u postgres psql -c "CREATE USER ${AIRFLOW_DATABASE_USERNAME} WITH ENCRYPTED PASSWORD '${AIRFLOW_DATABASE_PASSWORD};'"
11 |
12 | sudo -u postgres psql -c "GRANT ALL PRIVILEGES ON DATABASE ${AIRFLOW_DATABASE_NAME} TO ${AIRFLOW_DATABASE_USERNAME};"
13 | sudo -u postgres psql -c "GRANT ALL ON SCHEMA public TO ${AIRFLOW_DATABASE_USERNAME};"
14 |
--------------------------------------------------------------------------------
/python/dashy/update_requirements.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
4 | cd "$SCRIPTPATH" || exit
5 |
6 | source .env
7 |
8 | source "${VENV_ACTIVATE}"
9 |
10 | which pip | grep dashy || { echo "Wrong venv!!!"; exit 1; }
11 |
12 | if ! pip show pip-tools; then
13 | pip install pip-tools
14 | fi
15 |
16 | pip-compile requirements.in --allow-unsafe
17 | pip-sync
18 | # pip install "apache-airflow[celery]==2.2.3" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.2.3/constraints-3.8.txt"
19 | # pip install dbt-postgres
20 |
--------------------------------------------------------------------------------
/python/simplescraper/common/explore.py:
--------------------------------------------------------------------------------
1 | import duckdb
2 | import pandas as pd
3 | from IPython.display import display
4 |
5 | from common.env_variables import DUCKDB_DWH_FILE
6 |
7 |
8 | def display_df(_df):
9 | with pd.option_context('display.max_rows', None, 'display.max_columns', None, "expand_frame_repr", False,
10 | "display.float_format", '${:,.2f}'.format):
11 | display(_df.fillna('.'))
12 |
13 |
14 | def display_sql(sql_statement, read_only=True):
15 | conn = duckdb.connect(DUCKDB_DWH_FILE, read_only=read_only)
16 | _df = conn.execute(sql_statement).df()
17 | conn.close()
18 | return _df
19 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/README.md:
--------------------------------------------------------------------------------
1 | Welcome to your new dbt project!
2 |
3 | ### Using the starter project
4 |
5 | Try running the following commands:
6 | - dbt run
7 | - dbt test
8 |
9 |
10 | ### Resources:
11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction)
12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers
13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support
14 | - Find [dbt events](https://events.getdbt.com) near you
15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices
16 |
--------------------------------------------------------------------------------
/docker/airflow/.env.example:
--------------------------------------------------------------------------------
1 | AIRFLOW_UID=
2 | AIRFLOW_GID=
3 |
4 | AIRFLOW_FERNET_KEY=
5 | AIRFLOW_SECRET_KEY=
6 |
7 | AIRFLOW_DATABASE_HOST=
8 | AIRFLOW_DATABASE_PORT_NUMBER=
9 | AIRFLOW_DATABASE_NAME=
10 | AIRFLOW_DATABASE_USERNAME=
11 | AIRFLOW_DATABASE_PASSWORD=
12 | AIRFLOW_DATABASE_USE_SSL=
13 |
14 | AIRFLOW_USERNAME=
15 | AIRFLOW_PASSWORD=
16 | AIRFLOW_EMAIL=
17 | AIRFLOW_FIRSTNAME=
18 | AIRFLOW_LASTNAME=
19 |
20 | AIRFLOW_WEBSERVER_VOLUME=
21 | AIRFLOW_DAGS_VOLUME=
22 | AIRFLOW_LOGS_VOLUME=
23 | AIRFLOW_PLUGINS_VOLUME=
24 | REDIS_VOLUME=
25 |
26 | AIRFLOW__CORE__LOAD_EXAMPLES=
27 |
28 | AIRFLOW_CONN_HTTP_DEFAULT=
29 | AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG=
30 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/models/mart/dim_time.sql:
--------------------------------------------------------------------------------
1 | {{
2 | config(
3 | materialized = 'table',
4 | )
5 | }}
6 |
7 | WITH unique_online_at AS (
8 | SELECT DISTINCT online_at
9 | FROM {{ source('curated', 'online_job') }}
10 | ORDER BY 1
11 | )
12 | SELECT online_at as date_key,
13 | date_part('year', online_at) as year,
14 | date_part('month', online_at) as month,
15 | date_part('day', online_at) as day,
16 | monthname(online_at) as month_name,
17 | date_part('yearweek', online_at) as year_week,
18 | date_part('isodow', online_at) as day_of_week,
19 | dayname(online_at) as day_of_week_name
20 | FROM unique_online_at
21 |
--------------------------------------------------------------------------------
/python/simplescraper/common/entity.py:
--------------------------------------------------------------------------------
1 | class Entity:
2 | def __init__(self, name):
3 | self.name = name
4 |
5 | def __str__(self):
6 | return self.name
7 |
8 |
9 | SITEMAP = Entity('sitemap')
10 | ONLINE_JOB = Entity('online_job')
11 | JOB_DESCRIPTION = Entity('job_description')
12 | JOB = Entity('job')
13 | JOB_LOCATION = Entity('job_location')
14 | JOB_TECHNOLOGY = Entity('job_technology')
15 |
16 | RAW_ENTITIES = [
17 | SITEMAP,
18 | JOB_DESCRIPTION,
19 | ]
20 | CURATED_ENTITIES = [
21 | ONLINE_JOB,
22 | JOB,
23 | JOB_LOCATION,
24 | JOB_TECHNOLOGY,
25 | ]
26 |
27 | if __name__ == "__main__":
28 | for entity in CURATED_ENTITIES:
29 | print(entity)
30 |
--------------------------------------------------------------------------------
/python/simplescraper/tasks/list_downloaded_sitemaps.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from common.entity import SITEMAP
4 | from common.env_variables import LATEST_LOAD_TIMESTAMP
5 | from common.storage import DATA_SOURCE_NAME, save_temp_df, list_raw_files, DOWNLOADED_SITEMAPS_CSV, get_load_date
6 |
7 |
8 | def list_downloaded_sitemaps(load_timestamp, load_date=None) -> pd.DataFrame:
9 | files = list_raw_files(DATA_SOURCE_NAME, SITEMAP, load_date)
10 | df = pd.DataFrame(files)
11 | df = df[df['file_name'] != 'sitemapindex.xml']
12 | if load_date is None:
13 | save_temp_df(df, load_timestamp, DOWNLOADED_SITEMAPS_CSV)
14 | return df
15 |
16 |
17 | if __name__ == "__main__":
18 | list_downloaded_sitemaps(LATEST_LOAD_TIMESTAMP, get_load_date())
19 |
--------------------------------------------------------------------------------
/python/simplescraper/do_day_backup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
4 | cd "$SCRIPTPATH" || exit
5 |
6 | source .env
7 |
8 | if [[ $# -ne 3 ]] ; then
9 | echo "Please provide a date as script parameters in the following format: year month day"
10 | echo "Example: $0 2022 12 01"
11 | exit 1
12 | fi
13 |
14 | for entity in job_description sitemap
15 | do
16 |
17 | source=${RAW_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2/$3
18 |
19 | if [ -d "$source" ]
20 | then
21 |
22 | target_dir=${BACKUP_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2
23 | target_filename=${target_dir}/${entity}.$1$2$3.tar.gz
24 | mkdir -p "${target_dir}"
25 | tar -zcvf "${target_filename}" -C "${source}" .
26 |
27 | fi
28 |
29 | done
30 |
--------------------------------------------------------------------------------
/doc/TODO-search-pre-search-data-model.md:
--------------------------------------------------------------------------------
1 | # Pre Search Data Model
2 |
3 | ## Overview
4 |
5 | - job_online
6 | - job_id
7 | - online_at
8 | - url
9 | - job
10 | - job_id
11 | - job_description
12 | - job_id
13 | - title
14 | - online_status
15 | - is_anonymous
16 | - should_display_early_applicant
17 | - contract_type
18 | - work_type
19 | - online_date
20 | - description_introduction
21 | - description_responsabilities
22 |   - description_requirements
23 | - description_perks
24 | - company
25 | - company_name
26 | - job_company
27 | - job_id
28 | - company_name
29 | - location
30 | - location_name
31 | - job_location
32 | - job_id
33 | - location_name
34 | - technology
35 | - technology_name
36 | - job_technology
37 | - job_id
38 | - technology_name
39 |
--------------------------------------------------------------------------------
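To make the keys in this pre-search model explicit: every detail entity (job_description, job_company, job_location, job_technology) hangs off job_id, and job_online records one row per day a job was seen online. The sketch below is illustrative only and not a file in this repository; the entity and field names follow the list above, the types and sample values are assumptions, and the job_id/url values are borrowed from TODO-search-document-structure.json.

    # Illustrative only: a few of the entities above as dataclasses, linked by job_id.
    from dataclasses import dataclass
    from datetime import date


    @dataclass
    class JobOnline:          # one row per day a job was observed online
        job_id: int
        online_at: date
        url: str


    @dataclass
    class JobLocation:        # one row per (job, location) pair
        job_id: int
        location_name: str


    @dataclass
    class JobTechnology:      # one row per (job, technology) pair
        job_id: int
        technology_name: str


    # Example: one job seen online, posted for two locations and one technology.
    observed = JobOnline(job_id=4324234, online_at=date(2022, 3, 14),
                         url='https://data.source/snail-collector-berlin-hamburg.html')
    locations = [JobLocation(4324234, 'Berlin'), JobLocation(4324234, 'Hamburg')]
    technologies = [JobTechnology(4324234, 'Python')]
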
/python/airflow/install_airflow.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | which pip | grep /airflow/venv/ || { echo "Wrong venv!!!"; exit 1; }
4 |
5 | # Install Airflow using the constraints file
6 | AIRFLOW_VERSION=2.7.2
7 | PYTHON_VERSION="$(python --version | cut -d " " -f 2 | cut -d "." -f 1-2)"
8 | # For example: 3.7
9 | CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt"
10 | # For example: https://raw.githubusercontent.com/apache/airflow/constraints-2.4.1/constraints-3.7.txt
11 | pip install "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}"
12 | pip install psycopg2
13 |
14 | airflow db upgrade
15 |
16 | # The Standalone command will initialise the database, make a user,
17 | # and start all components for you.
18 | airflow standalone
19 |
--------------------------------------------------------------------------------
/python/tests/test_parse_job_description.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import pytest
4 |
5 | from tasks.parse_job_description import parse_job_description
6 |
7 |
8 | def load_file(file_path):
9 | with open(f'data/normalize_job_description/{file_path}', 'r') as f:
10 | content = f.read()
11 | return content
12 |
13 |
14 | @pytest.mark.parametrize('test_case', ['test_case_7610188', 'test_case_7610222', 'test_case_7609275'])
15 | def test_parse_job_description(test_case):
16 | input_content = load_file('input/' + test_case + '.txt')
17 |
18 | result_content = parse_job_description(input_content)
19 | # temp = json.dumps(result_content, indent=2, ensure_ascii=False)
20 |
21 | output_content = json.loads(load_file('output/' + test_case + '.json'))
22 | assert result_content == output_content
23 |
--------------------------------------------------------------------------------
/python/simplescraper/dashy_static.py:
--------------------------------------------------------------------------------
1 | from flask import Flask
2 |
3 | app = Flask(__name__)
4 |
5 | HTML = '''
6 |
15 |
16 | Static Dashboard
17 |
18 | Overview
19 |
20 |
21 | Top Five Cities
22 |
23 |
24 | Top Five Technologies
25 |
26 | '''
27 |
28 |
29 | @app.route('/')
30 | def index():
31 | return HTML
32 |
33 |
--------------------------------------------------------------------------------
/python/airflow/airflow_home/dags/job_market_analytics_curate_sitemaps_catch_up_dag.py:
--------------------------------------------------------------------------------
1 | import os
2 | from datetime import datetime, timedelta
3 |
4 | from airflow import DAG
5 | from airflow.decorators import task
6 |
7 | from common_airflow_dag import run_flasky_task
8 |
9 | os.environ["no_proxy"] = "*"
10 |
11 | with DAG('job_market_analytics_curate_sitemaps_catch_up_dag',
12 | description='Job Market Analytics Curate Sitemaps Catch Up DAG',
13 | schedule_interval='@daily',
14 | start_date=datetime(2022, 1, 1),
15 | dagrun_timeout=timedelta(minutes=60),
16 | max_active_runs=4,
17 | max_active_tasks=4,
18 | catchup=True) as dag:
19 | @task(task_id="curate_sitemaps")
20 | def curate_sitemaps():
21 | run_flasky_task('do/curate_sitemaps')
22 |
23 |
24 | curate_sitemaps()
25 |
--------------------------------------------------------------------------------
/python/simplescraper/.env.example:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export VENV_ACTIVATE=
4 | export VENV_PYTHON=
5 | export SOURCE_DIR=
6 |
7 | export DATA_DIR=
8 | export DATA_SOURCE_NAME=
9 | export DATA_SOURCE_URL=
10 |
11 | export RAW_DIR=
12 | export CLEANSED_DIR=
13 | export CURATED_DIR=
14 | export DUCKDB_DWH_FILE=
15 | export TEMP_DIR=
16 |
17 | export BACKUP_DIR=
18 |
19 | export SEMAPHORE_COUNT=
20 | export MAX_CHUNK_SIZE=
21 | export MIN_TO_DOWNLOAD=
22 | export MAX_TO_DOWNLOAD=
23 | export ONLINE_EXPIRATION_IN_DAYS=
24 |
25 | export LATEST_LOAD_TIMESTAMP=
26 |
27 | export RUN_HEADLESS=
28 |
29 | export FLASK_APP=
30 | export FLASK_ENV=
31 | export FLASK_DEBUG=
32 |
33 | export UPLOAD_TO_AZURE=
34 |
35 | export AZURE_STORAGE_CONNECTION_STRING=
36 | export AZURE_STORAGE_CONTAINER_NAME=
37 |
38 | export LANG=
39 | export LC_ALL=
40 |
41 | export DBT_VENV_ACTIVATE=
42 | export DBT_DIR=
43 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/models/mart/dim_job.sql:
--------------------------------------------------------------------------------
1 | {{
2 | config(
3 | materialized='incremental'
4 | )
5 | }}
6 |
7 |
8 | SELECT MD5(CONCAT_WS('||',
9 | COALESCE(
10 | UPPER(TRIM(CAST(
11 | job.job_id
12 | AS VARCHAR))),
13 | '^^'),
14 | COALESCE(
15 | UPPER(TRIM(CAST(
16 | job.load_timestamp
17 | AS VARCHAR))),
18 | '^^')
19 | )) AS job_key,
20 | job.job_id,
21 | job.load_timestamp as job_ldts,
22 | job.title,
23 | job.company_name
24 | FROM {{ source('curated', 'job') }}
25 |
26 | {% if is_incremental() %}
27 | LEFT OUTER JOIN dim_job
28 | ON (job.job_id = dim_job.job_id AND
29 | job.load_timestamp = dim_job.job_ldts)
30 | WHERE dim_job.job_id IS NULL
31 | {% endif %}
32 |
--------------------------------------------------------------------------------
/python/simplescraper/common/webclient.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | REQUEST_HEADERS = {
4 | "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,"
5 | "application/signed-exchange;v=b3;q=0.9",
6 | "accept-language": "en-US,en;q=0.9,es;q=0.8,it-IT;q=0.7,it;q=0.6,de-DE;q=0.5,de;q=0.4",
7 | "cache-control": "max-age=0",
8 | "sec-ch-ua": "\"Chromium\";v=\"94\", \"Google Chrome\";v=\"94\", \";Not A Brand\";v=\"99\"",
9 | "sec-ch-ua-mobile": "?0",
10 | "sec-ch-ua-platform": "\"macOS\"",
11 | "sec-fetch-dest": "document",
12 | "sec-fetch-mode": "navigate",
13 | "sec-fetch-site": "none",
14 | "sec-fetch-user": "?1",
15 | "upgrade-insecure-requests": "1"
16 | }
17 |
18 |
19 | def get_url_content(url):
20 | response = requests.get(url)
21 | content = response.content
22 | return content
23 |
--------------------------------------------------------------------------------
/python/airflow/airflow_home/dags/job_market_analytics_cleanse_sitemaps_catch_up_dag.py:
--------------------------------------------------------------------------------
1 | import os
2 | from datetime import datetime, timedelta
3 |
4 | from airflow import DAG
5 | from airflow.decorators import task
6 |
7 | from common_airflow_dag import run_flasky_task
8 |
9 | os.environ["no_proxy"] = "*"
10 |
11 | with DAG('job_market_analytics_cleanse_sitemaps_catch_up_dag',
12 | description='Job Market Analytics Cleanse Sitemaps Catch Up DAG',
13 | schedule_interval='@daily',
14 | start_date=datetime(2022, 1, 1),
15 | # end_date=datetime(2021, 12, 1),
16 | dagrun_timeout=timedelta(minutes=10),
17 | max_active_runs=4,
18 | max_active_tasks=4,
19 | catchup=True) as dag:
20 | @task(task_id="cleanse_sitemaps")
21 | def cleanse_sitemaps():
22 | run_flasky_task('do/cleanse_sitemaps')
23 |
24 |
25 | cleanse_sitemaps()
26 |
--------------------------------------------------------------------------------
/python/simplescraper/create_curated_views_in_dwh.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import duckdb
4 |
5 | from common.entity import CURATED_ENTITIES
6 | from common.env_variables import CURATED_DIR, DATA_SOURCE_NAME, DUCKDB_DWH_FILE
7 |
8 |
9 | def create_curated_views_in_dwh():
10 | conn = duckdb.connect(DUCKDB_DWH_FILE)
11 |
12 | conn.execute(f'''
13 | CREATE SCHEMA IF NOT EXISTS curated;
14 | ''')
15 |
16 | for entity in CURATED_ENTITIES:
17 | curated_path = os.path.join(CURATED_DIR, DATA_SOURCE_NAME, entity.name, '*/*/*/*.parquet')
18 |
19 | conn.execute(f'''
20 | CREATE OR REPLACE view curated.{entity.name} AS
21 | SELECT * FROM parquet_scan('{curated_path}', HIVE_PARTITIONING=1)
22 | -- WHERE load_timestamp < '2022-07-01'
23 | ;
24 | ''')
25 |
26 | conn.close()
27 |
28 |
29 | if __name__ == "__main__":
30 | create_curated_views_in_dwh()
31 |
--------------------------------------------------------------------------------
/python/airflow/airflow_home/dags/job_market_analytics_curate_job_descriptions_catch_up_dag.py:
--------------------------------------------------------------------------------
1 | import os
2 | from datetime import datetime, timedelta
3 |
4 | from airflow import DAG
5 | from airflow.decorators import task
6 |
7 | from common_airflow_dag import run_flasky_task
8 |
9 | os.environ["no_proxy"] = "*"
10 |
11 | with DAG('job_market_analytics_curate_job_descriptions_catch_up_dag',
12 | description='Job Market Analytics Curate Job Descriptions Catch Up DAG',
13 | schedule_interval='@daily',
14 | start_date=datetime(2022, 11, 1),
15 | end_date=datetime(2022, 11, 30),
16 | dagrun_timeout=timedelta(minutes=60),
17 | max_active_runs=4,
18 | max_active_tasks=4,
19 | catchup=True) as dag:
20 | @task(task_id="curate_job_descriptions")
21 | def curate_job_descriptions():
22 | run_flasky_task('do/curate_job_descriptions')
23 |
24 |
25 | curate_job_descriptions()
26 |
--------------------------------------------------------------------------------
/python/simplescraper/restore_day_backup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
4 | cd "$SCRIPTPATH" || exit
5 |
6 | source .env
7 |
8 | if [[ $# -ne 3 ]] ; then
9 | echo "Please provide a date as script parameters in the following format: year month day"
10 | echo "Example: $0 2022 12 01"
11 | exit 1
12 | fi
13 |
14 | for entity in job_description sitemap
15 | do
16 |
17 | raw_day_dir=${RAW_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2/$3
18 |
19 | if [ -d "$raw_day_dir" ]
20 | then
21 |
22 | echo "The raw day dir is not empty: $raw_day_dir"
23 |
24 | else
25 |
26 | backup_day_dir=${BACKUP_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2
27 | backup_day_filename=${backup_day_dir}/${entity}.$1$2$3.tar.gz
28 |
29 | mkdir -p "$raw_day_dir"
30 | tar -xvzf "$backup_day_filename" -C "$raw_day_dir"
31 |
32 | echo "$1-$2-$3: Restored ${entity}"
33 |
34 | fi
35 |
36 | done
37 |
--------------------------------------------------------------------------------
/python/airflow/airflow_home/dags/job_market_analytics_cleanse_job_descriptions_catch_up_dag.py:
--------------------------------------------------------------------------------
1 | import os
2 | from datetime import datetime, timedelta
3 |
4 | from airflow import DAG
5 | from airflow.decorators import task
6 |
7 | from common_airflow_dag import run_flasky_task
8 |
9 | os.environ["no_proxy"] = "*"
10 |
11 | with DAG('job_market_analytics_cleanse_job_descriptions_catch_up_dag',
12 | description='Job Market Analytics Cleanse Job Descriptions Catch Up DAG',
13 | schedule_interval='@daily',
14 | start_date=datetime(2022, 11, 1),
15 | end_date=datetime(2022, 12, 1),
16 | dagrun_timeout=timedelta(minutes=10),
17 | max_active_runs=1,
18 | max_active_tasks=1,
19 | catchup=True) as dag:
20 |
21 | @task(task_id="cleanse_job_descriptions")
22 | def cleanse_job_descriptions():
23 | run_flasky_task('do/cleanse_job_descriptions')
24 |
25 |
26 | cleanse_job_descriptions()
27 |
--------------------------------------------------------------------------------
/python/simplescraper/scrape_data_source.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from common.logging import configure_logger
4 | from common.storage import get_load_timestamp
5 | from tasks.download_job_descriptions import download_job_descriptions
6 | from tasks.download_sitemap import download_sitemap
7 | from tasks.list_downloaded_job_descriptions import list_downloaded_job_descriptions
8 | from tasks.list_job_descriptions_to_download import list_job_descriptions_to_download
9 |
10 |
11 | def scrape_data_source(load_timestamp):
12 | configure_logger(load_timestamp)
13 | df_downloaded = list_downloaded_job_descriptions(load_timestamp)
14 | df_sitemap = download_sitemap(load_timestamp)
15 | df_to_download = list_job_descriptions_to_download(load_timestamp, df_sitemap, df_downloaded)
16 | download_job_descriptions(load_timestamp, df_to_download)
17 |
18 | os.system('say -v Fiona b')
19 |
20 |
21 | if __name__ == "__main__":
22 | scrape_data_source(get_load_timestamp())
23 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/models/mart/fact_online_job.sql:
--------------------------------------------------------------------------------
1 | {{
2 | config(
3 | materialized='incremental'
4 | )
5 | }}
6 |
7 |
8 | WITH new_fact_online_job AS (
9 | SELECT online_job.online_at as date_key,
10 | online_job.online_at,
11 | online_job.job_id
12 | FROM {{ source('curated', 'online_job') }}
13 |
14 | {% if is_incremental() %}
15 | LEFT OUTER JOIN {{ this }} fact_online_job
16 | ON (online_job.online_at = fact_online_job.online_at AND
17 | online_job.job_id = fact_online_job.job_id)
18 | WHERE fact_online_job.job_id IS NULL
19 | {% endif %}
20 | )
21 | SELECT new_fact_online_job.date_key as date_key,
22 | latest_dim_job.job_key as job_key,
23 | new_fact_online_job.online_at as online_at,
24 | latest_dim_job.job_id as job_id,
25 | latest_dim_job.job_ldts
26 | FROM new_fact_online_job
27 | INNER JOIN {{ ref('latest_dim_job') }}
28 | ON (new_fact_online_job.job_id = latest_dim_job.job_id)
29 |
--------------------------------------------------------------------------------
/docker/postgres/postgres-parquet-fdw/s2-install-arrow.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
4 | apt update
5 | apt install -y -V libarrow-dev # For C++
6 | apt install -y -V libarrow-glib-dev # For GLib (C)
7 | apt install -y -V libarrow-dataset-dev # For Apache Arrow Dataset C++
8 | apt install -y -V libarrow-flight-dev # For Apache Arrow Flight C++
9 | # Notes for Plasma related packages:
10 | # * You need to enable "non-free" component on Debian GNU/Linux
11 | # * You need to enable "multiverse" component on Ubuntu
12 | # * You can use Plasma related packages only on amd64
13 | apt install -y -V libplasma-dev # For Plasma C++
14 | apt install -y -V libplasma-glib-dev # For Plasma GLib (C)
15 | apt install -y -V libgandiva-dev # For Gandiva C++
16 | apt install -y -V libgandiva-glib-dev # For Gandiva GLib (C)
17 | apt install -y -V libparquet-dev # For Apache Parquet C++
18 | apt install -y -V libparquet-glib-dev # For Apache Parquet GLib (C)
19 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/models/mart/dim_job_location.sql:
--------------------------------------------------------------------------------
1 | {{
2 | config(
3 | materialized='incremental'
4 | )
5 | }}
6 |
7 |
8 | SELECT MD5(CONCAT_WS('||',
9 | COALESCE(
10 | UPPER(TRIM(CAST(
11 | job_location.job_id
12 | AS VARCHAR))),
13 | '^^'),
14 | COALESCE(
15 | UPPER(TRIM(CAST(
16 | job_location.load_timestamp
17 | AS VARCHAR))),
18 | '^^')
19 | )) AS job_key,
20 | job_location.job_id,
21 | job_location.load_timestamp as job_ldts,
22 | job_location.location AS location_name
23 | FROM {{ source('curated', 'job_location') }}
24 |
25 | {% if is_incremental() %}
26 | LEFT OUTER JOIN dim_job_location
27 | ON (job_location.job_id = dim_job_location.job_id AND
28 | job_location.load_timestamp = dim_job_location.job_ldts AND
29 | job_location.location = dim_job_location.location_name)
30 | WHERE dim_job_location.job_id IS NULL
31 | {% endif %}
32 |
--------------------------------------------------------------------------------
/python/simplescraper/verify_day_backup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
4 | cd "$SCRIPTPATH" || exit
5 |
6 | source .env
7 |
8 | if [[ $# -ne 3 ]] ; then
9 | echo "Please provide a date as script parameters in the following format: year month day"
10 | echo "Example: $0 2022 12 01"
11 | exit 1
12 | fi
13 |
14 | for entity in job_description sitemap
15 | do
16 |
17 | source=${RAW_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2/$3
18 |
19 | if [ -d "$source" ]
20 | then
21 |
22 | target_dir=${BACKUP_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2
23 | target_filename=${target_dir}/${entity}.$1$2$3.tar.gz
24 | diff <(cd "$source" && find . | grep -E '.xml$|.html$' | sort) <(tar -tf "$target_filename" | grep -E '.xml$|.html$' | sort)
25 | error_code=$?
26 | if [ $error_code -ne 0 ];
27 | then
28 | echo "$1-$2-$3: NOT OK" >&2
29 | exit 1
30 | fi
31 |
32 | else
33 |
34 | echo "$1-$2-$3: NOT FOUND ${entity}"
35 |
36 | fi
37 |
38 | done
39 |
40 | echo "$1-$2-$3: OK"
41 |
--------------------------------------------------------------------------------
/python/airflow/airflow_home/dags/job_market_analytics_cleanse_catch_up_dag.py:
--------------------------------------------------------------------------------
1 | import os
2 | from datetime import datetime, timedelta
3 |
4 | from airflow import DAG
5 | from airflow.decorators import task
6 |
7 | from common_airflow_dag import run_flasky_task
8 |
9 | os.environ["no_proxy"] = "*"
10 |
11 | with DAG('job_market_analytics_cleanse_catch_up_dag',
12 | description='Job Market Analytics Cleanse Catch Up DAG',
13 | schedule_interval='@daily',
14 | start_date=datetime(2021, 12, 1),
15 | # end_date=datetime(2021, 12, 1),
16 | dagrun_timeout=timedelta(minutes=10),
17 | max_active_runs=1,
18 | max_active_tasks=1,
19 | catchup=True) as dag:
20 | @task(task_id="cleanse_sitemaps")
21 | def cleanse_sitemaps():
22 | run_flasky_task('do/cleanse_sitemaps')
23 |
24 |
25 | @task(task_id="cleanse_job_descriptions")
26 | def cleanse_job_descriptions():
27 | run_flasky_task('do/cleanse_job_descriptions')
28 |
29 |
30 | cleanse_sitemaps()
31 | cleanse_job_descriptions()
32 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/models/mart/dim_job_technology.sql:
--------------------------------------------------------------------------------
1 | {{
2 | config(
3 | materialized='incremental'
4 | )
5 | }}
6 |
7 |
8 | SELECT MD5(CONCAT_WS('||',
9 | COALESCE(
10 | UPPER(TRIM(CAST(
11 | job_technology.job_id
12 | AS VARCHAR))),
13 | '^^'),
14 | COALESCE(
15 | UPPER(TRIM(CAST(
16 | job_technology.load_timestamp
17 | AS VARCHAR))),
18 | '^^')
19 | )) AS job_key,
20 | job_technology.job_id,
21 | job_technology.load_timestamp as job_ldts,
22 | job_technology.technology AS technology_name
23 | FROM {{ source('curated', 'job_technology') }}
24 |
25 | {% if is_incremental() %}
26 | LEFT OUTER JOIN dim_job_technology
27 | ON (job_technology.job_id = dim_job_technology.job_id AND
28 | job_technology.load_timestamp = dim_job_technology.job_ldts AND
29 | job_technology.technology = dim_job_technology.technology_name)
30 | WHERE dim_job_technology.job_id IS NULL
31 | {% endif %}
32 |
--------------------------------------------------------------------------------
/python/airflow/airflow_home/dags/job_market_analytics_curate_catch_up_dag_v2.py:
--------------------------------------------------------------------------------
1 | import os
2 | from datetime import datetime, timedelta
3 |
4 | from airflow import DAG
5 | from airflow.decorators import task
6 |
7 | from common_airflow_dag import run_flasky_task
8 |
9 | os.environ["no_proxy"] = "*"
10 |
11 | YEAR = 2021
12 | MONTH = 10
13 | DAY = 1
14 |
15 | with DAG('job_market_analytics_curate_catch_up_dag',
16 | description='Job Market Analytics Curate Catch Up DAG',
17 | schedule_interval='@daily',
18 | start_date=datetime(YEAR, MONTH, DAY),
19 | end_date=datetime(YEAR, MONTH, DAY) + timedelta(days=15),
20 | dagrun_timeout=timedelta(minutes=60),
21 | max_active_runs=2,
22 | max_active_tasks=2,
23 | catchup=True) as dag:
24 | @task(task_id="curate_sitemaps")
25 | def curate_sitemaps():
26 | run_flasky_task('do/curate_sitemaps')
27 |
28 |
29 | @task(task_id="curate_job_descriptions")
30 | def curate_job_descriptions():
31 | run_flasky_task('do/curate_job_descriptions')
32 |
33 |
34 | curate_sitemaps()
35 | curate_job_descriptions()
36 |
--------------------------------------------------------------------------------
/python/simplescraper/common/env_variables.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from dotenv import load_dotenv
4 |
5 | load_dotenv()
6 |
7 | DATA_DIR = os.getenv('DATA_DIR')
8 | RAW_DIR = os.getenv('RAW_DIR')
9 | CLEANSED_DIR = os.getenv('CLEANSED_DIR')
10 | CURATED_DIR = os.getenv('CURATED_DIR')
11 | DUCKDB_DWH_FILE = os.getenv('DUCKDB_DWH_FILE')
12 | TEMP_DIR = os.getenv('TEMP_DIR')
13 | BACKUP_DIR = os.getenv('BACKUP_DIR')
14 | SOURCE_DIR = os.getenv('SOURCE_DIR')
15 |
16 | DATA_SOURCE_NAME = os.getenv('DATA_SOURCE_NAME')
17 | DATA_SOURCE_URL = os.getenv('DATA_SOURCE_URL')
18 |
19 | SEMAPHORE_COUNT: int = int(os.getenv('SEMAPHORE_COUNT'))
20 | MAX_CHUNK_SIZE: int = int(os.getenv('MAX_CHUNK_SIZE'))
21 | MIN_TO_DOWNLOAD: int = int(os.getenv('MIN_TO_DOWNLOAD'))
22 | MAX_TO_DOWNLOAD: int = int(os.getenv('MAX_TO_DOWNLOAD'))
23 | ONLINE_EXPIRATION_IN_DAYS: int = int(os.getenv('ONLINE_EXPIRATION_IN_DAYS'))
24 |
25 | LATEST_LOAD_TIMESTAMP = os.getenv('LATEST_LOAD_TIMESTAMP')
26 |
27 | RUN_HEADLESS = os.getenv('RUN_HEADLESS') == 'True'
28 |
29 | UPLOAD_TO_AZURE = os.getenv('UPLOAD_TO_AZURE') == 'True'
30 |
31 | AZURE_STORAGE_CONNECTION_STRING = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
32 | AZURE_STORAGE_CONTAINER_NAME = os.getenv('AZURE_STORAGE_CONTAINER_NAME')
33 |
--------------------------------------------------------------------------------
/python/simplescraper/tasks/curate_sitemaps.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import pandas as pd
4 |
5 | from common.entity import SITEMAP, ONLINE_JOB
6 | from common.logging import configure_logger, logger
7 | from common.storage import get_load_timestamp, get_load_date, load_cleansed_df, save_curated_df
8 | from tasks.curate_job_descriptions import BASE_COLUMNS
9 |
10 | ONLINE_JOB_SAT_COLUMNS = ['online_at', 'url']
11 |
12 |
13 | def curate_sitemaps(load_timestamp, load_date):
14 | configure_logger(load_timestamp)
15 | logger.info(f'Start curate_sitemaps: {load_timestamp} {load_date}')
16 |
17 | df = load_cleansed_df(SITEMAP, load_date=load_date)
18 |
19 | df = df.dropna(subset=['job_id'])
20 | df['job_id'] = df['job_id'].astype('int')
21 | df['online_at'] = pd.to_datetime(df['load_timestamp']).dt.date
22 | df = df[BASE_COLUMNS + ONLINE_JOB_SAT_COLUMNS]
23 | df = df.sort_values(by=['job_id'])
24 |
25 | save_curated_df(df, ONLINE_JOB)
26 | logger.info(f'End curate_sitemaps: {load_timestamp} {load_date}')
27 |
28 |
29 | if __name__ == "__main__":
30 | _load_timestamp = sys.argv[1] if len(sys.argv) > 1 else get_load_timestamp()
31 | _load_date = sys.argv[2] if len(sys.argv) > 2 else get_load_date()
32 | curate_sitemaps(_load_timestamp, _load_date)
33 |
--------------------------------------------------------------------------------
/docker/postgres/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.8'
2 |
3 | # Inspired by https://github.com/khezen/compose-postgres/blob/master/docker-compose.yml
4 | services:
5 | postgres:
6 | build:
7 | context: postgres-parquet-fdw
8 | target: postgres-parquet-fdw
9 | environment:
10 | POSTGRES_USER: ${POSTGRES_USER}
11 | POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
12 | POSTGRES_DB: ${POSTGRES_DB}
13 | networks:
14 | - postgres
15 | restart: always
16 | env_file: .env
17 | logging:
18 | options:
19 | max-size: 10m
20 | max-file: "3"
21 | ports:
22 | - '5432:5432'
23 | volumes:
24 | - ${POSTGRES_VOLUME}:/var/lib/postgresql/data
25 | - ${POSTGRES_PARQUET_FDW_VOLUME}:/var/lib/parquet-fdw/data
26 | # pgadmin:
27 | # image: dpage/pgadmin4
28 | # environment:
29 | # PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL}
30 | # PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD}
31 | # PGADMIN_CONFIG_SERVER_MODE: 'False'
32 | # ports:
33 | # - '2345:80'
34 | # volumes:
35 | # - ${PGADMIN_VOLUME}:/var/lib/pgadmin
36 | # networks:
37 | # - postgres
38 | # restart: always
39 | # depends_on:
40 | # - "postgres"
41 |
42 | networks:
43 | postgres:
44 | driver: bridge
45 |
--------------------------------------------------------------------------------
/python/simplescraper/verify_all_backups.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | import pandas as pd
4 |
5 | from common.entity import RAW_ENTITIES
6 | from common.env_variables import DATA_SOURCE_NAME, SOURCE_DIR
7 | from common.storage import list_raw_days
8 |
9 |
10 | def get_current_date():
11 | return datetime.datetime.today().strftime('%Y%m%d')
12 |
13 |
14 | def list_missing_previous_dates(entity):
15 | df = pd.DataFrame(list_raw_days(DATA_SOURCE_NAME, entity))
16 | df_current_date = pd.DataFrame([{
17 | 'date': get_current_date()
18 | }])
19 | df = df.drop_duplicates()
20 | df = pd.concat([
21 | df,
22 | df_current_date, df_current_date
23 | ]).drop_duplicates(keep=False)
24 | return df
25 |
26 |
27 | def verify_backups():
28 | dfs = []
29 | for entity in RAW_ENTITIES:
30 | df = list_missing_previous_dates(entity)
31 | dfs.append(df)
32 | df = pd.concat(dfs, ignore_index=True)
33 | df = df.drop_duplicates()
34 | df = df.sort_values(by=['date'])
35 | dates_to_download = df['date'].to_list()
36 | for date_to_download in dates_to_download:
37 | year = date_to_download[:4]
38 | month = date_to_download[4:6]
39 | day = date_to_download[6:8]
40 | print(
41 | f'/bin/zsh {SOURCE_DIR}/simplescraper/verify_day_backup.sh {year} {month} {day} || exit 1')
42 |
43 |
44 | if __name__ == "__main__":
45 | verify_backups()
46 |
--------------------------------------------------------------------------------
/python/simplescraper/tasks/list_job_descriptions_to_download.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from common.env_variables import LATEST_LOAD_TIMESTAMP
4 | from common.logging import logger, configure_logger
5 | from common.storage import load_temp_df, DOWNLOADED_JOB_DESCRIPTIONS_CSV, SITEMAP_URLS_CSV, save_temp_df, \
6 | JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV
7 |
8 |
9 | def list_job_descriptions_to_download(load_timestamp, df_sitemap_urls=None, df_downloaded=None):
10 | configure_logger(load_timestamp)
11 | logger.info('list_job_descriptions_to_download: start')
12 |
13 | df_sitemap_urls = load_temp_df(load_timestamp, SITEMAP_URLS_CSV) if df_sitemap_urls is None else df_sitemap_urls
14 | df_downloaded = load_temp_df(load_timestamp, DOWNLOADED_JOB_DESCRIPTIONS_CSV) if df_downloaded is None else df_downloaded
15 |
16 | df_downloaded = df_downloaded[['id']]
17 | df_downloaded = df_downloaded.drop_duplicates()
18 | df = df_sitemap_urls[['id']]
19 | df = df.drop_duplicates()
20 | df = pd.concat([df, df_downloaded, df_downloaded]).drop_duplicates(keep=False)  # anti-join: drop ids already downloaded
21 | df = df.merge(df_sitemap_urls)
22 | df = df[['url']]
23 | total_count = df.shape[0]
24 |
25 | save_temp_df(df, load_timestamp, JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV)
26 | logger.success(f'URLs to download: {total_count}')
27 | logger.info('list_job_descriptions_to_download: end')
28 | return df
29 |
30 |
31 | if __name__ == "__main__":
32 | list_job_descriptions_to_download(LATEST_LOAD_TIMESTAMP)
33 |
--------------------------------------------------------------------------------
/sql/dwh/job_market_analytics/dbt_project.yml:
--------------------------------------------------------------------------------
1 |
2 | # Name your project! Project names should contain only lowercase characters
3 | # and underscores. A good package name should reflect your organization's
4 | # name or the intended use of these models
5 | name: 'job_market_analytics'
6 | version: '1.0.0'
7 | config-version: 2
8 |
9 | # This setting configures which "profile" dbt uses for this project.
10 | profile: 'job_market_analytics'
11 |
12 | # These configurations specify where dbt should look for different types of files.
13 | # The `model-paths` config, for example, states that models in this project can be
14 | # found in the "models/" directory. You probably won't need to change these!
15 | model-paths: [ "models" ]
16 | analysis-paths: [ "analyses" ]
17 | test-paths: [ "tests" ]
18 | seed-paths: [ "seeds" ]
19 | macro-paths: [ "macros" ]
20 | snapshot-paths: [ "snapshots" ]
21 |
22 | target-path: "target" # directory which will store compiled SQL files
23 | clean-targets: # directories to be removed by `dbt clean`
24 | - "target"
25 | - "dbt_packages"
26 |
27 |
28 | # Configuring models
29 | # Full documentation: https://docs.getdbt.com/docs/configuring-models
30 |
31 | # In this config, we tell dbt to build all models in the mart/ directory
32 | # as views. These settings can be overridden in the individual model files
33 | # using the `{{ config(...) }}` macro.
34 | models:
35 | job_market_analytics:
36 | # Config indicated by + and applies to all files under models/mart/
37 | mart:
38 | +materialized: view
39 |
--------------------------------------------------------------------------------
/python/simplescraper/tasks/prune_old_raw.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import os
3 | import shutil
4 | import sys
5 |
6 | from common.entity import RAW_ENTITIES
7 | from common.env_variables import RAW_DIR, DATA_SOURCE_NAME
8 | from common.logging import configure_logger, logger
9 | from common.storage import get_load_timestamp, get_load_date, LOAD_DATE_FORMAT
10 |
11 | SEVEN_MONTHS_IN_DAYS = 7 * 30
12 |
13 |
14 | def prune_old_raw(load_timestamp, load_date):
15 | configure_logger(load_timestamp)
16 | logger.info(f'Start prune_old_raw: {load_date}')
17 | date_to_remove = datetime.datetime.strptime(load_date, LOAD_DATE_FORMAT).date()
18 | date_to_remove = date_to_remove - datetime.timedelta(days=SEVEN_MONTHS_IN_DAYS)
19 | date_to_remove = date_to_remove.strftime(LOAD_DATE_FORMAT)
20 | year, month, day = date_to_remove.split('/', 2)
21 | for entity in RAW_ENTITIES:
22 | folder_to_remove = f'{RAW_DIR}/{DATA_SOURCE_NAME}/{entity}/{year}/{month}/{day}'
23 | if os.path.exists(folder_to_remove) and os.path.isdir(folder_to_remove):
24 | logger.success(f'Removing {folder_to_remove}')
25 | shutil.rmtree(folder_to_remove)
26 | else:
27 | logger.warning(f'No folder to remove at {folder_to_remove}')
28 |
29 | logger.info(f'End prune_old_raw: {load_date}')
30 |
31 |
32 | if __name__ == "__main__":
33 | _load_timestamp = sys.argv[1] if len(sys.argv) > 1 else get_load_timestamp()
34 | _load_date = sys.argv[2] if len(sys.argv) > 2 else get_load_date()
35 | prune_old_raw(_load_timestamp, _load_date)
36 |
--------------------------------------------------------------------------------
/python/simplescraper/tasks/list_downloaded_job_descriptions.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import functools
3 |
4 | import pandas as pd
5 |
6 | from common.entity import JOB_DESCRIPTION
7 | from common.env_variables import LATEST_LOAD_TIMESTAMP, ONLINE_EXPIRATION_IN_DAYS
8 | from common.logging import logger, configure_logger
9 | from common.storage import DOWNLOADED_JOB_DESCRIPTIONS_CSV, DATA_SOURCE_NAME, save_temp_df, list_raw_files
10 |
11 |
12 | @functools.lru_cache(maxsize=1024)
13 | def calculate_days_online(load_timestamp):
14 | ingestion_datetime = datetime.datetime.strptime(load_timestamp, '%Y/%m/%d/%H-%M-%S')
15 | now = datetime.datetime.now()
16 | delta = now - ingestion_datetime
17 | return delta.days
18 |
19 |
20 | def list_downloaded_job_descriptions(load_timestamp, load_date=None) -> pd.DataFrame:
21 | configure_logger(load_timestamp)
22 | logger.info('list_downloaded_job_descriptions start')
23 | files = list_raw_files(DATA_SOURCE_NAME, JOB_DESCRIPTION, load_date)
24 | df = pd.DataFrame(files)
25 | if not df.empty:
26 | df['id'] = df['file_name'].str.split('.', expand=True)[0]
27 | if ONLINE_EXPIRATION_IN_DAYS:
28 | df['days_online'] = df['load_timestamp'].map(calculate_days_online)
29 | df = df[df['days_online'] < ONLINE_EXPIRATION_IN_DAYS]
30 | df = df.drop(columns=['days_online'])
31 | if load_date is None:
32 | save_temp_df(df, load_timestamp, DOWNLOADED_JOB_DESCRIPTIONS_CSV)
33 | logger.info('list_downloaded_job_descriptions end')
34 | return df
35 |
36 |
37 | if __name__ == "__main__":
38 | list_downloaded_job_descriptions(LATEST_LOAD_TIMESTAMP)
39 |
--------------------------------------------------------------------------------
/python/airflow/airflow_home/dags/job_market_analytics_hourly_dag.py:
--------------------------------------------------------------------------------
1 | import os
2 | from datetime import datetime, timedelta
3 | from airflow.models.baseoperator import chain
4 |
5 | from airflow import DAG
6 | from airflow.decorators import task
7 | from airflow.providers.http.hooks.http import HttpHook
8 |
9 | from common_airflow_dag import run_flasky_task
10 |
11 | os.environ["no_proxy"] = "*"
12 |
13 | with DAG('job_market_analytics_hourly_dag',
14 | description='Job Market Analytics Hourly DAG',
15 | schedule_interval='@hourly',
16 | start_date=datetime(2022, 1, 1),
17 | dagrun_timeout=timedelta(minutes=60),
18 | max_active_runs=1,
19 | catchup=False) as dag:
20 | @task(task_id="check_vpn_status")
21 | def check_vpn_status():
22 | HttpHook(method='GET').run('do/check_vpn_status')
23 |
24 |
25 | @task(task_id="list_downloaded_job_descriptions")
26 | def list_downloaded_job_descriptions():
27 | run_flasky_task('do/list_downloaded_job_descriptions')
28 |
29 |
30 | @task(task_id="download_sitemap", retries=1)
31 | def download_sitemap():
32 | run_flasky_task('do/download_sitemap')
33 |
34 |
35 | @task(task_id="list_job_descriptions_to_download")
36 | def list_job_descriptions_to_download():
37 | run_flasky_task('do/list_job_descriptions_to_download')
38 |
39 |
40 | @task(task_id="download_job_descriptions")
41 | def download_job_descriptions():
42 | run_flasky_task('do/download_job_descriptions')
43 |
44 |
45 | chain(check_vpn_status(),
46 | [list_downloaded_job_descriptions(), download_sitemap()],
47 | list_job_descriptions_to_download(), download_job_descriptions())
48 |
--------------------------------------------------------------------------------
/python/utils/migrate_to_raw_v3.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import shutil
4 |
5 | import pandas as pd
6 |
7 | from common.env_variables import LATEST_LOAD_TIMESTAMP, RAW_DIR
8 | from common.storage import DATA_SOURCE_NAME, save_temp_df, load_temp_df
9 |
10 |
11 | def list_raw_files(data_source, entity):
12 | dir_path = os.path.join(RAW_DIR, data_source, entity)
13 | file_list = [{
14 | 'old_file_path': f,
15 | } for f in glob.iglob(dir_path + '/*/*/*/*/*', recursive=True) if os.path.isfile(f)]
16 | return file_list
17 |
18 |
19 | def list_downloaded_files(load_timestamp) -> pd.DataFrame:
20 | files = list_raw_files(DATA_SOURCE_NAME, 'job_description')
21 | df = pd.DataFrame(files)
22 | save_temp_df(df, load_timestamp, '00_downloaded_raw_job_descriptions.csv')
23 | return df
24 |
25 |
26 | def get_new_file_path(row):
27 | old_file_path = row['old_file_path']
28 | dirname = os.path.dirname(old_file_path)
29 | basename = os.path.basename(old_file_path)
30 | job_id = basename.rsplit('--', 1)
31 | job_id = job_id[1]
32 | job_id = job_id.split('-')
33 | job_id = job_id[0]
34 | new_file_path = os.path.join(dirname.replace('/raw/', '/raw_v3/'), f'{job_id}.html')
35 | return new_file_path
36 |
37 |
38 | def copy_file(row):
39 | src = row['old_file_path']
40 | dst = row['new_file_path']
41 | os.makedirs(os.path.dirname(dst), exist_ok=True)
42 | shutil.copy2(src, dst)
43 |
44 |
45 | def copy_files_to_raw_v3(load_timestamp):
46 | df = load_temp_df(load_timestamp, '00_downloaded_raw_job_descriptions.csv')
47 | df['new_file_path'] = df.apply(get_new_file_path, axis=1)
48 | df.apply(copy_file, axis=1)
49 |
50 |
51 | if __name__ == "__main__":
52 | copy_files_to_raw_v3(LATEST_LOAD_TIMESTAMP)
53 |
--------------------------------------------------------------------------------
/python/simplescraper/restore_all_backups.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | import pandas as pd
4 |
5 | from common.entity import RAW_ENTITIES
6 | from common.env_variables import DATA_SOURCE_NAME, SOURCE_DIR
7 | from common.storage import list_raw_days, list_backup_days
8 |
9 |
10 | def get_current_date():
11 | return datetime.datetime.today().strftime('%Y%m%d')
12 |
13 |
14 | def list_backups_to_restore(entity):
15 | df = pd.DataFrame(list_backup_days(DATA_SOURCE_NAME, entity))
16 | df_in_raw = pd.DataFrame(list_raw_days(DATA_SOURCE_NAME, entity))
17 | df_current_date = pd.DataFrame([{
18 | 'date': get_current_date()
19 | }])
20 | df = df.drop_duplicates()
21 | df = pd.concat([
22 | df,
23 | df_in_raw, df_in_raw,
24 | df_current_date, df_current_date
25 | ]).drop_duplicates(keep=False)
26 | return df
27 |
28 |
29 | def print_script_statements(script_name, days_to_restore):
30 | for day_to_restore in days_to_restore:
31 | year = day_to_restore[:4]
32 | month = day_to_restore[4:6]
33 | day = day_to_restore[6:8]
34 | print(
35 | f'/bin/zsh {SOURCE_DIR}/simplescraper/{script_name} {year} {month} {day} || exit 1')
36 |
37 |
38 | def restore_all_backups():
39 | dfs = []
40 | for entity in RAW_ENTITIES:
41 | df = list_backups_to_restore(entity)
42 | dfs.append(df)
43 | df = pd.concat(dfs, ignore_index=True)
44 | df = df.drop_duplicates()
45 | df = df.sort_values(by=['date'])
46 | days_to_restore = df['date'].to_list()
47 | print_script_statements('restore_day_backup.sh', days_to_restore)
48 | print()
49 | print_script_statements('verify_day_backup.sh', days_to_restore)
50 |
51 |
52 | if __name__ == "__main__":
53 | restore_all_backups()
54 |
--------------------------------------------------------------------------------
/python/simplescraper/do_all_backups.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | import pandas as pd
4 |
5 | from common.entity import RAW_ENTITIES
6 | from common.env_variables import DATA_SOURCE_NAME, SOURCE_DIR
7 | from common.storage import list_raw_days, list_backup_days
8 |
9 |
10 | def get_current_date():
11 | return datetime.datetime.today().strftime('%Y%m%d')
12 |
13 |
14 | def list_days_to_backup(entity):
15 | df = pd.DataFrame(list_raw_days(DATA_SOURCE_NAME, entity))
16 | df_backup_days = pd.DataFrame(list_backup_days(DATA_SOURCE_NAME, entity))
17 | df_current_date = pd.DataFrame([{
18 | 'date': get_current_date()
19 | }])
20 | df = df.drop_duplicates()
21 | df = pd.concat([
22 | df,
23 | df_backup_days, df_backup_days,
24 | df_current_date, df_current_date
25 | ]).drop_duplicates(keep=False)
26 | return df
27 |
28 |
29 | def print_script_statements(script_name, dates_to_download):
30 | for date_to_download in dates_to_download:
31 | year = date_to_download[:4]
32 | month = date_to_download[4:6]
33 | day = date_to_download[6:8]
34 | print(
35 | f'/bin/zsh {SOURCE_DIR}/simplescraper/{script_name} {year} {month} {day} || exit 1')
36 |
37 |
38 | def do_all_backups():
39 | dfs = []
40 | for entity in RAW_ENTITIES:
41 | df = list_days_to_backup(entity)
42 | dfs.append(df)
43 | df = pd.concat(dfs, ignore_index=True)
44 | df = df.drop_duplicates()
45 | df = df.sort_values(by=['date'])
46 | dates_to_download = df['date'].to_list()
47 | print_script_statements('do_day_backup.sh', dates_to_download)
48 | print()
49 | print_script_statements('verify_day_backup.sh', dates_to_download)
50 |
51 |
52 | if __name__ == "__main__":
53 | do_all_backups()
54 |
--------------------------------------------------------------------------------
/python/airflow/airflow_home/dags/job_market_analytics_daily_dag.py:
--------------------------------------------------------------------------------
1 | import os
2 | from datetime import datetime, timedelta
3 |
4 | from airflow import DAG
5 | from airflow.decorators import task
6 |
7 | from common_airflow_dag import run_flasky_task
8 |
9 | os.environ["no_proxy"] = "*"
10 |
11 | with DAG('job_market_analytics_daily_dag',
12 | description='Job Market Analytics Daily DAG',
13 | schedule_interval='@daily',
14 | start_date=datetime(2022, 1, 1),
15 | dagrun_timeout=timedelta(minutes=60),
16 | max_active_runs=1,
17 | catchup=True) as dag:
18 | @task(task_id="cleanse_sitemaps")
19 | def cleanse_sitemaps():
20 | run_flasky_task('do/cleanse_sitemaps')
21 |
22 |
23 | @task(task_id="cleanse_job_descriptions")
24 | def cleanse_job_descriptions():
25 | run_flasky_task('do/cleanse_job_descriptions')
26 |
27 |
28 | @task(task_id="curate_sitemaps")
29 | def curate_sitemaps():
30 | run_flasky_task('do/curate_sitemaps')
31 |
32 |
33 | @task(task_id="curate_job_descriptions")
34 | def curate_job_descriptions():
35 | run_flasky_task('do/curate_job_descriptions')
36 |
37 |
38 | @task(task_id="do_dbt_run")
39 | def dbt_run():
40 | run_flasky_task('do/do_dbt_run')
41 |
42 |
43 | @task(task_id="do_day_backup")
44 | def backup_day():
45 | run_flasky_task('do/do_day_backup')
46 |
47 |
48 | @task(task_id="verify_day_backup")
49 | def verify_day_backup():
50 | run_flasky_task('do/verify_day_backup')
51 |
52 |
53 | @task(task_id="prune_old_raw")
54 | def prune_old_raw():
55 | run_flasky_task('do/prune_old_raw')
56 |
57 |
58 | t_curate_sitemaps = curate_sitemaps()
59 | t_curate_job_descriptions = curate_job_descriptions()
60 |
61 | cleanse_sitemaps() >> t_curate_sitemaps
62 | cleanse_job_descriptions() >> t_curate_job_descriptions
63 |
64 | [t_curate_sitemaps, t_curate_job_descriptions] >> dbt_run()
65 |
66 | backup_day() >> verify_day_backup() >> prune_old_raw()
67 |
--------------------------------------------------------------------------------
/python/airflow/airflow_home/dags/job_market_analytics_daily_dag_catch_up.py:
--------------------------------------------------------------------------------
1 | import os
2 | from datetime import datetime, timedelta
3 |
4 | from airflow import DAG
5 | from airflow.decorators import task
6 |
7 | from common_airflow_dag import run_flasky_task
8 |
9 | os.environ["no_proxy"] = "*"
10 |
11 | with DAG('job_market_analytics_daily_catch_up_dag',
12 | description='Job Market Analytics Daily Catch UP DAG',
13 | schedule_interval='@daily',
14 | start_date=datetime(2023, 5, 24),
15 | dagrun_timeout=timedelta(minutes=60),
16 | max_active_runs=1,
17 | max_active_tasks=1,
18 | catchup=True) as dag:
19 | @task(task_id="cleanse_sitemaps")
20 | def cleanse_sitemaps():
21 | run_flasky_task('do/cleanse_sitemaps')
22 |
23 |
24 | @task(task_id="cleanse_job_descriptions")
25 | def cleanse_job_descriptions():
26 | run_flasky_task('do/cleanse_job_descriptions')
27 |
28 |
29 | @task(task_id="curate_sitemaps")
30 | def curate_sitemaps():
31 | run_flasky_task('do/curate_sitemaps')
32 |
33 |
34 | @task(task_id="curate_job_descriptions")
35 | def curate_job_descriptions():
36 | run_flasky_task('do/curate_job_descriptions')
37 |
38 |
39 | @task(task_id="do_dbt_run")
40 | def dbt_run():
41 | run_flasky_task('do/do_dbt_run')
42 |
43 |
44 | @task(task_id="do_day_backup")
45 | def backup_day():
46 | run_flasky_task('do/do_day_backup')
47 |
48 |
49 | @task(task_id="verify_day_backup")
50 | def verify_day_backup():
51 | run_flasky_task('do/verify_day_backup')
52 |
53 |
54 | @task(task_id="prune_old_raw")
55 | def prune_old_raw():
56 | run_flasky_task('do/prune_old_raw')
57 |
58 |
59 | t_curate_sitemaps = curate_sitemaps()
60 | t_curate_job_descriptions = curate_job_descriptions()
61 |
62 | cleanse_sitemaps() >> t_curate_sitemaps
63 | cleanse_job_descriptions() >> t_curate_job_descriptions
64 |
65 | [t_curate_sitemaps, t_curate_job_descriptions] >> dbt_run()
66 |
67 | backup_day() >> verify_day_backup() >> prune_old_raw()
68 |
--------------------------------------------------------------------------------
/doc/metaData-bag.log:
--------------------------------------------------------------------------------
1 | 2022-05-08 21:29:11.685 | DEBUG | __main__:load_and_parse:27 - Parsing (96/213) 2343/3437: 2022/04/20/09-00-00/8205291.html
2 | Traceback (most recent call last):
3 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/parse_job_descriptions.py", line 70, in <module>
4 | parse_job_descriptions()
5 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/parse_job_descriptions.py", line 53, in parse_job_descriptions
6 | df['parsed_content'] = df.apply(load_and_parse, axis=1)
7 | File "/Users/carrion/PycharmProjects/job-market-analytics/venv/lib/python3.8/site-packages/pandas/core/frame.py", line 8740, in apply
8 | return op.apply()
9 | File "/Users/carrion/PycharmProjects/job-market-analytics/venv/lib/python3.8/site-packages/pandas/core/apply.py", line 688, in apply
10 | return self.apply_standard()
11 | File "/Users/carrion/PycharmProjects/job-market-analytics/venv/lib/python3.8/site-packages/pandas/core/apply.py", line 812, in apply_standard
12 | results, res_index = self.apply_series_generator()
13 | File "/Users/carrion/PycharmProjects/job-market-analytics/venv/lib/python3.8/site-packages/pandas/core/apply.py", line 828, in apply_series_generator
14 | results[i] = self.f(v)
15 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/parse_job_descriptions.py", line 28, in load_and_parse
16 | parsed_content = parse_job_description(html_content)
17 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/tasks/parse_job_description.py", line 55, in parse_job_description
18 | job_description = extract_metadata(soup)
19 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/tasks/parse_job_description.py", line 46, in extract_metadata
20 | metadata = flatten_metadata(metadata)
21 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/tasks/parse_job_description.py", line 24, in flatten_metadata
22 | temp_metadata = flatten.pop('metaData')
23 | KeyError: 'metaData'
24 |
25 | Process finished with exit code 1
26 |
--------------------------------------------------------------------------------
/python/simplescraper/tasks/cleanse_sitemaps.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import bs4
4 | import pandas as pd
5 | from loguru import logger
6 |
7 | from common.entity import SITEMAP
8 | from common.logging import configure_logger
9 | from common.storage import get_load_timestamp, load_raw_file, save_cleansed_df, get_load_date, LOAD_TIMESTAMP_FORMAT
10 | from tasks.list_downloaded_sitemaps import list_downloaded_sitemaps
11 |
12 |
13 | def load_and_parse(row):
14 | load_timestamp = row['load_timestamp']
15 | file_name = row['file_name']
16 | sitemap_content = load_raw_file(SITEMAP, load_timestamp, file_name)
17 | logger.debug(f'Parsing: {load_timestamp}/{file_name}')
18 | soup = bs4.BeautifulSoup(sitemap_content, 'xml')
19 | urls = [loc.text for loc in soup.findAll('loc')]
20 | return urls
21 |
22 |
23 | def extract_job_id(url_column):
24 | url_split = url_column.str.split('--', expand=True)
25 | return url_split[2].str.split('-', expand=True)[0]
26 |
27 |
28 | def get_date_from_load_timestamp(load_timestamp):
29 | year, month, day, time = load_timestamp.split('/')
30 | return f'{year}-{month}-{day}'
31 |
32 |
33 | def cleanse_sitemaps(load_timestamp, load_date):
34 | configure_logger(load_timestamp)
35 | df = list_downloaded_sitemaps(load_timestamp, load_date)
36 | if df.empty:
37 | logger.info('Nothing to parse')
38 | return
39 | df[['year', 'month', 'day', 'time']] = df['load_timestamp'].str.split('/', 3, expand=True)
40 | df = df.sort_values(by=['load_timestamp', 'file_name'])
41 | df['url'] = df.apply(load_and_parse, axis=1)
42 | df = df.explode('url')
43 | df['job_id'] = extract_job_id(df['url'])
44 | df = df.drop_duplicates(['job_id'], keep='first')
45 | df['load_timestamp'] = pd.to_datetime(df['load_timestamp'], format=LOAD_TIMESTAMP_FORMAT, utc=True)
46 | logger.info(f'Saving cleansed: {df["load_timestamp"].iloc[0]}')
47 | save_cleansed_df(df, SITEMAP)
48 |
49 |
50 | if __name__ == "__main__":
51 | _load_timestamp = sys.argv[1] if len(sys.argv) > 1 else get_load_timestamp()
52 | _load_date = sys.argv[2] if len(sys.argv) > 2 else get_load_date()
53 | cleanse_sitemaps(_load_timestamp, _load_date)
54 |
--------------------------------------------------------------------------------
/python/utils/migrate_raw_v1_to_raw_v2.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import glob
3 | import os
4 | import shutil
5 |
6 | import pandas as pd
7 |
8 | from common.env_variables import LATEST_LOAD_TIMESTAMP, RAW_DIR, DATA_DIR
9 | from common.storage import DATA_SOURCE_NAME, save_temp_df, load_temp_df
10 |
11 |
12 | def list_raw_files(data_source):
13 | dir_path = os.path.join(RAW_DIR, data_source)
14 | file_list = [{
15 | 'old_file_path': f,
16 | 'entity': f.split('/')[-3],
17 | 'timestamp': datetime.datetime.fromtimestamp(os.stat(f).st_birthtime),
18 | 'file_name': f.split('/')[-1],
19 | } for f in glob.iglob(dir_path + '/*/*/*', recursive=True) if os.path.isfile(f)]
20 | return file_list
21 |
22 |
23 | def list_downloaded_files(load_timestamp) -> pd.DataFrame:
24 | files = list_raw_files(DATA_SOURCE_NAME)
25 | df = pd.DataFrame(files)
26 | # df = df[df['file_name'] != 'sitemapindex.xml']
27 | save_temp_df(df, load_timestamp, '00_downloaded_raw_files.csv')
28 | return df
29 |
30 |
31 | def timestamp_to_datatime_partition(timestamp):
32 | timestamp = str(timestamp)
33 | split1, split2 = timestamp.split()
34 | year, month, day = split1.split('-')
35 | hour = split2[:2]
36 | datatime_partition = f'{year}/{month}/{day}/{hour}-00-00'
37 | return datatime_partition
38 |
39 |
40 | def get_new_file_path(row):
41 | new_file_path = os.path.join(DATA_DIR, 'raw_v2', DATA_SOURCE_NAME, row['entity'], row['datatime_partition'],
42 | row['file_name'])
43 | return new_file_path
44 |
45 |
46 | def copy_file(row):
47 | src = row['old_file_path']
48 | dst = row['new_file_path']
49 | os.makedirs(os.path.dirname(dst), exist_ok=True)
50 | shutil.copy2(src, dst)
51 |
52 |
53 | def copy_files_to_raw_v2(load_timestamp):
54 | df = load_temp_df(load_timestamp, '00_downloaded_raw_files.csv')
55 | df['datatime_partition'] = df['timestamp'].apply(timestamp_to_datatime_partition)
56 | df['new_file_path'] = df.apply(get_new_file_path, axis=1)
57 | df.apply(copy_file, axis=1)
58 |
59 |
60 | if __name__ == "__main__":
61 | list_downloaded_files(LATEST_LOAD_TIMESTAMP)
62 | copy_files_to_raw_v2(LATEST_LOAD_TIMESTAMP)
63 |
--------------------------------------------------------------------------------
/python/simplescraper/tasks/cleanse_job_descriptions.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import pandas as pd
4 |
5 | from common.entity import JOB_DESCRIPTION
6 | from common.logging import logger, configure_logger
7 | from common.storage import get_load_timestamp, load_raw_file, save_cleansed_df, get_load_date, LOAD_TIMESTAMP_FORMAT
8 | from tasks.list_downloaded_job_descriptions import list_downloaded_job_descriptions
9 | from tasks.parse_job_description import parse_job_description
10 |
11 |
12 | def load_and_parse(row) -> dict:
13 | load_timestamp = row['load_timestamp']
14 | file_name = row['file_name']
15 | html_content = load_raw_file(JOB_DESCRIPTION, load_timestamp, file_name)
16 | try:
17 | logger.debug(f'Parsing {load_timestamp}/{file_name}')
18 | parsed_content = parse_job_description(html_content)
19 | return parsed_content
20 | except AttributeError:
21 | logger.warning(f'The following file could not be parsed: {load_timestamp}/{file_name}')
22 | return {}
23 |
24 |
25 | def cleanse_job_descriptions(load_timestamp, load_date):
26 | configure_logger(load_timestamp)
27 | df = list_downloaded_job_descriptions(load_timestamp, load_date)
28 | if df.empty:
29 | logger.warning(f'Nothing to cleanse for the load date: {load_date}')
30 | return
31 | df = df.sort_values(by=['load_timestamp', 'file_name'])
32 | df = df.reset_index(drop=True)
33 | logger.info(f'Start to parse job descriptions for the load date: {load_date}')
34 | df['parsed_content'] = df.apply(load_and_parse, axis=1)
35 | df = df.join(pd.json_normalize(df['parsed_content']))
36 | df = df.drop(columns=['parsed_content'])
37 | df[['year', 'month', 'day', 'hour']] = df['load_timestamp'].str.split('/', 3, expand=True)
38 | df['load_timestamp'] = pd.to_datetime(df['load_timestamp'], format=LOAD_TIMESTAMP_FORMAT, utc=True)
39 | logger.info(f'Finish to parse job descriptions for the load date: {load_date}')
40 | save_cleansed_df(df, JOB_DESCRIPTION)
41 |
42 |
43 | if __name__ == "__main__":
44 | _load_timestamp = sys.argv[1] if len(sys.argv) > 1 else get_load_timestamp()
45 | _load_date = sys.argv[2] if len(sys.argv) > 2 else get_load_date()
46 | cleanse_job_descriptions(_load_timestamp, _load_date)
47 |
--------------------------------------------------------------------------------
/sql/dwh/requirements.txt:
--------------------------------------------------------------------------------
1 | agate==1.6.3
2 | appnope==0.1.3
3 | argon2-cffi==21.3.0
4 | argon2-cffi-bindings==21.2.0
5 | asttokens==2.0.8
6 | attrs==22.1.0
7 | Babel==2.10.3
8 | backcall==0.2.0
9 | beautifulsoup4==4.11.1
10 | bleach==5.0.1
11 | certifi==2022.9.14
12 | cffi==1.15.1
13 | charset-normalizer==2.1.1
14 | click==8.1.3
15 | colorama==0.4.4
16 | dbt-core==1.5.0
17 | dbt-duckdb==1.5.1
18 | dbt-extractor==0.4.1
19 | debugpy==1.6.3
20 | decorator==5.1.1
21 | defusedxml==0.7.1
22 | duckdb==0.7.0
23 | entrypoints==0.4
24 | executing==1.0.0
25 | fastjsonschema==2.16.1
26 | future==0.18.2
27 | hologram==0.0.15
28 | idna==3.4
29 | ipykernel==6.15.3
30 | ipython==8.5.0
31 | ipython-genutils==0.2.0
32 | ipywidgets==8.0.2
33 | isodate==0.6.1
34 | jedi==0.18.1
35 | Jinja2==3.1.2
36 | jsonschema==3.2.0
37 | jupyter==1.0.0
38 | jupyter-console==6.4.4
39 | jupyter-core==4.11.1
40 | jupyter_client==7.3.5
41 | jupyterlab-pygments==0.2.2
42 | jupyterlab-widgets==3.0.3
43 | leather==0.3.4
44 | Logbook==1.5.3
45 | MarkupSafe==2.0.1
46 | mashumaro==3.6
47 | matplotlib-inline==0.1.6
48 | minimal-snowplow-tracker==0.0.2
49 | mistune==0.8.4
50 | msgpack==1.0.4
51 | nbclient==0.5.13
52 | nbconvert==6.4.5
53 | nbformat==5.5.0
54 | nest-asyncio==1.5.5
55 | networkx==2.8.3
56 | notebook==6.4.12
57 | numpy==1.23.3
58 | packaging==21.3
59 | pandas==1.4.4
60 | pandocfilters==1.5.0
61 | parsedatetime==2.4
62 | parso==0.8.3
63 | pathspec==0.9.0
64 | patsy==0.5.2
65 | pexpect==4.8.0
66 | pickleshare==0.7.5
67 | plotly==5.10.0
68 | plotly-calplot==0.1.12
69 | plotly-express==0.4.1
70 | prometheus-client==0.14.1
71 | prompt-toolkit==3.0.31
72 | protobuf==4.23.1
73 | psutil==5.9.2
74 | ptyprocess==0.7.0
75 | pure-eval==0.2.2
76 | pycparser==2.21
77 | Pygments==2.13.0
78 | pyparsing==3.0.9
79 | pyrsistent==0.18.1
80 | python-dateutil==2.8.2
81 | python-dotenv==0.21.0
82 | python-slugify==6.1.2
83 | pytimeparse==1.1.8
84 | pytz==2022.2.1
85 | PyYAML==6.0
86 | pyzmq==24.0.0
87 | qtconsole==5.3.2
88 | QtPy==2.2.0
89 | requests==2.28.1
90 | scipy==1.9.1
91 | Send2Trash==1.8.0
92 | six==1.16.0
93 | soupsieve==2.3.2.post1
94 | sqlparse==0.4.2
95 | stack-data==0.5.0
96 | statsmodels==0.13.2
97 | tenacity==8.0.1
98 | terminado==0.15.0
99 | testpath==0.6.0
100 | text-unidecode==1.3
101 | tornado==6.2
102 | traitlets==5.4.0
103 | typing_extensions==4.3.0
104 | urllib3==1.26.12
105 | wcwidth==0.2.5
106 | webencodings==0.5.1
107 | Werkzeug==2.1.2
108 | widgetsnbextension==4.0.3
109 |
--------------------------------------------------------------------------------
/python/simplescraper/tasks/parse_job_description.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 |
4 | from bs4 import BeautifulSoup
5 |
6 | SPACE_CHAR = ' '
7 | NBSP_CHAR = u'\xa0'
8 |
9 | METADATA_JSON_PREFIX = 'window.__PRELOADED_STATE__.HeaderStepStoneBlock = '
10 | METADATA_JSON_SUFFIX = ';'
11 |
12 | FIELD_SELECTORS = {
13 | #'company_name': '.at-header-company-name',
14 | # 'description': 'div[itemprop="description"]',
15 | 'description_introduction': '.at-section-text-introduction',
16 | 'description_responsabilities': '.at-section-text-description-content',
17 | 'description_requirements': '.at-section-text-profile-content',
18 | 'description_perks': '.at-section-text-weoffer-content',
19 | }
20 |
21 |
22 | def flatten_metadata(metadata):
23 | flatten = metadata.copy()
24 | temp_metadata = flatten.pop('metaData')
25 | flatten.update(temp_metadata)
26 | return flatten
27 |
28 |
29 | def keys_to_snake_case(metadata):
30 | snake_case_object = {}
31 | for old_key in metadata.keys():
32 | # https://stackoverflow.com/questions/60148175/convert-camelcase-to-snakecase
33 | new_key = re.sub(r'(?<!^)(?=[A-Z])', '_', old_key).lower()
34 | snake_case_object[new_key] = metadata[old_key]
35 | return snake_case_object
--------------------------------------------------------------------------------
/python/simplescraper/tasks/download_sitemap.py:
--------------------------------------------------------------------------------
23 | raise Exception('The load_timestamp is older than one hour')
24 |
25 |
26 | def historize_url_content(url, content, load_timestamp):
27 | file_name = url.split('/')[-1]
28 | save_raw_file(content, SITEMAP, load_timestamp, file_name)
29 |
30 |
31 | def get_and_historize_url_content(url, load_timestamp):
32 | content = get_url_content(url)
33 | historize_url_content(url, content, load_timestamp)
34 | return content
35 |
36 |
37 | def get_listing_urls(load_timestamp):
38 | web_content = get_and_historize_url_content(SITEMAP_INDEX_XML, load_timestamp)
39 | web_content = xmltodict.parse(web_content)
40 | web_content = web_content['sitemapindex']
41 | web_content = web_content['sitemap']
42 | listing_urls = []
43 | for entry in web_content:
44 | url = entry['loc']
45 | if 'listings' in url:
46 | listing_urls.append(url)
47 | return listing_urls
48 |
49 |
50 | def get_job_description_urls(web_content):
51 | web_content = xmltodict.parse(web_content)
52 | web_content = web_content['urlset']
53 | url_entries = web_content['url']
54 | urls = []
55 | for entry in url_entries:
56 | url = entry['loc']
57 | urls.append(url)
58 |
59 | return urls
60 |
61 |
62 | def get_all_job_description_urls(load_timestamp):
63 | listing_urls = get_listing_urls(load_timestamp)
64 | job_description_urls = []
65 | for listing_url in listing_urls:
66 | web_content = get_and_historize_url_content(listing_url, load_timestamp)
67 | job_description_urls.extend(get_job_description_urls(web_content))
68 | return job_description_urls
69 |
70 |
71 | def convert_urls_to_df(all_job_description_urls) -> pd.DataFrame:
72 | df = pd.DataFrame(all_job_description_urls, columns=['url'])
73 |
74 | df = df.drop_duplicates()
75 | url_split = df['url'].str.split('--', expand=True)
76 | df['name_slug'] = url_split[1]
77 | df['id'] = url_split[2].str.split('-', expand=True)[0]
78 | df = df.sort_values(by=['id'], ascending=False)
79 |
80 | return df
81 |
82 |
83 | def download_sitemap(load_timestamp) -> pd.DataFrame:
84 | configure_logger(load_timestamp)
85 | check_load_timestamp(load_timestamp)
86 | logger.info('download_sitemap: start')
87 | all_job_description_urls = get_all_job_description_urls(load_timestamp)
88 | df = convert_urls_to_df(all_job_description_urls)
89 | save_temp_df(df, load_timestamp, SITEMAP_URLS_CSV)
90 | logger.info('download_sitemap: end')
91 | return df
92 |
93 |
94 | if __name__ == '__main__':
95 | download_sitemap(LATEST_LOAD_TIMESTAMP)
96 |
--------------------------------------------------------------------------------
/python/dashy/requirements.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile with python 3.10
3 | # To update, run:
4 | #
5 | # pip-compile --allow-unsafe requirements.in
6 | #
7 | ansi2html==1.8.0
8 | # via jupyter-dash
9 | appnope==0.1.3
10 | # via
11 | # ipykernel
12 | # ipython
13 | asttokens==2.0.8
14 | # via stack-data
15 | backcall==0.2.0
16 | # via ipython
17 | brotli==1.0.9
18 | # via flask-compress
19 | certifi==2022.9.24
20 | # via requests
21 | charset-normalizer==2.1.1
22 | # via requests
23 | click==8.1.3
24 | # via flask
25 | dash==2.6.2
26 | # via
27 | # -r requirements.in
28 | # dash-bootstrap-components
29 | # jupyter-dash
30 | dash-bootstrap-components==1.2.1
31 | # via -r requirements.in
32 | dash-core-components==2.0.0
33 | # via dash
34 | dash-html-components==2.0.0
35 | # via dash
36 | dash-table==5.0.0
37 | # via dash
38 | debugpy==1.6.3
39 | # via ipykernel
40 | decorator==5.1.1
41 | # via ipython
42 | duckdb==0.7.0
43 | # via -r requirements.in
44 | entrypoints==0.4
45 | # via jupyter-client
46 | executing==1.1.0
47 | # via stack-data
48 | flask==2.2.2
49 | # via
50 | # dash
51 | # flask-compress
52 | # jupyter-dash
53 | flask-compress==1.13
54 | # via dash
55 | gunicorn==20.1.0
56 | # via -r requirements.in
57 | idna==3.4
58 | # via requests
59 | ipykernel==6.16.0
60 | # via jupyter-dash
61 | ipython==8.5.0
62 | # via
63 | # ipykernel
64 | # jupyter-dash
65 | itsdangerous==2.1.2
66 | # via flask
67 | jedi==0.18.1
68 | # via ipython
69 | jinja2==3.1.2
70 | # via flask
71 | jupyter-client==7.3.5
72 | # via ipykernel
73 | jupyter-core==4.11.1
74 | # via jupyter-client
75 | jupyter-dash==0.4.2
76 | # via -r requirements.in
77 | loguru==0.6.0
78 | # via -r requirements.in
79 | markupsafe==2.1.1
80 | # via
81 | # jinja2
82 | # werkzeug
83 | matplotlib-inline==0.1.6
84 | # via
85 | # ipykernel
86 | # ipython
87 | nest-asyncio==1.5.6
88 | # via
89 | # ipykernel
90 | # jupyter-client
91 | # jupyter-dash
92 | numpy==1.23.3
93 | # via pandas
94 | packaging==21.3
95 | # via ipykernel
96 | pandas==1.5.0
97 | # via -r requirements.in
98 | parso==0.8.3
99 | # via jedi
100 | pexpect==4.8.0
101 | # via ipython
102 | pickleshare==0.7.5
103 | # via ipython
104 | plotly==5.10.0
105 | # via dash
106 | prompt-toolkit==3.0.31
107 | # via ipython
108 | psutil==5.9.2
109 | # via ipykernel
110 | ptyprocess==0.7.0
111 | # via pexpect
112 | pure-eval==0.2.2
113 | # via stack-data
114 | pygments==2.13.0
115 | # via ipython
116 | pyparsing==3.0.9
117 | # via packaging
118 | python-dateutil==2.8.2
119 | # via
120 | # jupyter-client
121 | # pandas
122 | python-dotenv==0.21.0
123 | # via -r requirements.in
124 | pytz==2022.4
125 | # via pandas
126 | pyzmq==24.0.1
127 | # via
128 | # ipykernel
129 | # jupyter-client
130 | requests==2.28.1
131 | # via jupyter-dash
132 | retrying==1.3.3
133 | # via jupyter-dash
134 | six==1.16.0
135 | # via
136 | # python-dateutil
137 | # retrying
138 | stack-data==0.5.1
139 | # via ipython
140 | tenacity==8.1.0
141 | # via plotly
142 | tornado==6.2
143 | # via
144 | # ipykernel
145 | # jupyter-client
146 | traitlets==5.4.0
147 | # via
148 | # ipykernel
149 | # ipython
150 | # jupyter-client
151 | # matplotlib-inline
152 | urllib3==1.26.12
153 | # via requests
154 | wcwidth==0.2.5
155 | # via prompt-toolkit
156 | werkzeug==2.2.2
157 | # via flask
158 |
159 | # The following packages are considered to be unsafe in a requirements file:
160 | setuptools==65.4.1
161 | # via gunicorn
162 |
--------------------------------------------------------------------------------
/doc/TODO.md:
--------------------------------------------------------------------------------
1 | # TO DO
2 |
3 | ## Open
4 |
5 | - [ ] Implement use case: Location/Company/Technology changelog
6 | - [ ] Add the next data source
7 | - [ ] Slugify the value of the filter selectors
8 | - [ ] Upload only backup files to the Azure Blob Storage
9 | - [ ] Implement use case: Number of jobs relative to city population
10 | - [ ] Add the flag to the do and verify backup commands: --exclude='.DS_Store'
11 | - [ ] Add a file in the raw layer with the scrape run information for each execution
12 | - This file could be in JSON format and have the following fields:
13 | - run_id
14 | - timestamp
15 | - number of urls to download
16 | - number of urls downloaded
17 | - number of failed urls
18 | - failed urls (a list of strings)
19 |
20 | ## In Progress
21 |
22 |
23 | ## Done
24 |
25 | - [x] Display more than 12 months
26 | - [x] Let users use interactive graphs instead of static plots
27 | - [x] Let users start the y-axis with zero
28 | - [x] Make Dashy public with the domain https://jobmarketanalytics.com/
29 | - [x] Cache sql query executions on Dashy
30 | - [x] Implement use case: Compare technologies
31 | - [x] Have 3 materialized tables for Dashy with different time durations to improve the performance
32 | - [x] Use stateful URLs according to the state of the input components on Dashy
33 | - [x] Use LocalExecutor in Airflow
34 | - [x] Run Airflow locally to reduce the Docker overhead
35 | - [x] Implement use case: Technology trends
36 | - [x] Add a size indicator in the filter options in Dashy
37 | - [x] Implement some kind of search/dashboard for external users
38 | - [x] Check out https://github.com/rilldata/rill-developer
39 | - [x] Decide for a BI tool
40 | - [x] Check out https://superset.apache.org/
41 | - [x] Create a separate virtual environment for dbt
42 | - [x] Check out https://www.linkedin.com/in/christian-kaul/recent-activity/posts/
43 | - [x] Check out https://dbtvault.readthedocs.io/
44 | - [x] Check out https://github.com/jwills/dbt-duckdb
45 | - [x] Use Gunicorn to run flasky with 4 workers
46 | - [x] On the cleansed layer, add the first sitemap occurrence per URL instead of only the latest load_timestamp
47 | - [x] Add load_timestamp and load_date to the curated layer
48 | - [x] Rename target_date to load_date
49 | - [x] Rename run_timestamp to load_timestamp
50 | - [x] Fail the download sitemap task in the hourly dag if the load_timestamp is older than one hour
51 | - [x] Create a separate virtual environment for Airflow
52 | - [x] Fix the issue "metaData-bag.log"
53 | - [x] Find a better way to keep Airflow from hanging when there are many jobs to download
54 | - [x] Move the raw storage to the cloud
55 | - [x] Improve logging
56 | - Log how many URLs there are to download
57 | - Make the VPN check more visible
58 | - [x] Download the job description again after a configurable number of days online
59 | - [x] Create a report that shows how many days a job offer is online
60 | - [x] Create a report that shows how many job offers are online at a given time
61 | - [x] Find a better timestamp to use than the logical timestamp for the scrape data source dag
62 | - [x] Fix bug with file names longer than 255 characters
63 | - [x] Fix logs in Flasky
64 | - [x] Add more granularity to the ingestion time in the raw data
65 | - [x] Add orchestration with Airflow
66 | - [x] Create the Data Vault
67 | - [x] Optimize the function to create the chunks
68 | - [x] Add a check for the network connection before we start crawling
69 | - [x] Save the whole html document from the source instead of just a fragment of it, so that no information is lost if
70 | the HTML format changes
71 | - [x] Add logging to the sitemap scraper
72 | - [x] Find a way to pass the list of parquet files to PostgreSQL.
73 | - Result: Use Python to create the staging fdw tables referencing the parquet files
74 | - [x] Add the _job_id_ to the _sitemap_ and _job_description_ on the cleansed layer
75 | - [x] Create an _ingestion_id_ with the hash of the _job_id_ and _timestamp_ on the cleansed layer
76 |
77 | ---
78 |
79 | ## Discarded
80 |
81 | - [x] Try https://xapian.org/ for the search
82 | - [x] Replace the PostgreSQL ingestion with CSV instead of Parquet
83 | - [x] Do not let Flasky start a process behind an endpoint, if a process is still running
84 | - [x] Try Prefect
85 | - [x] Log the date and time more visibly
86 | - [x] Allow one retry after the browser crashes
87 |
88 | ## Technical Debt
89 |
90 | - [ ] Rename job_description to job_offer
91 | - [ ] Rename cleansed to curated
92 |
--------------------------------------------------------------------------------
/python/tests/data/normalize_job_description/output/test_case_7610222.json:
--------------------------------------------------------------------------------
1 | {
2 | "title": "Anlagenmechaniker für Sanitär-, Heizungs- und Klimatechnik (m/w/d)",
3 | "online_status": "online",
4 | "is_anonymous": false,
5 | "job_id": 7610222,
6 | "should_display_early_applicant": false,
7 | "location": "Hamburg (Hammerbrook)",
8 | "contract_type": "Feste Anstellung",
9 | "work_type": "Vollzeit",
10 | "online_date": "2021-10-13T15:54:04Z",
11 | "company_name": "ENGIE Deutschland GmbH",
12 | "description": "ÜBER UNS:Als Komplett-Dienstleister im Bereich Facility Solutions sichern wir den optimalen Betrieb von Gebäuden und Anlagen. Wir bieten modulare Leistungspakete von Service und Wartung über Instandhaltung bis hin zur Bewirtschaftung komplexer Liegenschaften. Für unsere Multi-Site-Kunden arbeiten wir als überregionaler oder auch internationaler Facility-Management-Partner.IHRE AUFGABEN:Wir suchen Servicetechniker bzw. Anlagenmechaniker für die Gewerke Heizung, Klima, Lüftung, Sanitär oder Kälte für die Wartung, Instandsetzung und Bedienung der haustechnischen Anlagen bei unserem Kunden vor Ort.\nSie arbeiten an einem festen Objekt, sodass keine Reisetätigkeit anfällt.\n\nBetreiben der gebäudetechnischen Anlagen an einem anspruchsvollen Industriestandort\nOrganisation, Steuerung, Kontrolle und selbstständige Durchführung von Wartungs- und Instandsetzungsarbeiten an gebäudetechnischen Anlagen\nOptimierung der bestehenden Anlagentechnik und der Betriebsabläufe\nErstellung und Dokumentation der täglichen Arbeitsleistung über mobile Endgeräte\nKoordination und Begleitung von Nachunternehmern\nErster Ansprechpartner vor Ort für unsere Kunden im operative Tagesgeschäft\nIHR PROFIL:\nAbgeschlossene Berufsausbildung als Anlagenmechaniker für Sanitär-, Heizungs- und Klimatechnik oder als Zentralheizungs- und Lüftungsbauer, Gas-Wasserinstallateur oder Kältetechniker\nMehrjährige Berufserfahrungen im Bereich der Technischen Gebäudeausrüstung\nKunden- und Dienstleistungsorientierung gepaart mit Spaß an der Arbeit im Team\nGeregelten Arbeitszeiten mit gelegentlichen Bereitschaftsdiensten\nFührerschein der Klasse B\nIHRE BENEFITS:\nAkademie\nAltersvorsorge\nCorporate Benefits\nPerspektiven\nFirmenfeiern\nFlexible Arbeitszeiten\nGestaltungsfreiheit\nHohe Sicherheitsstandards\nInternationalität\nSpannende Projekte\nTeamgeist\nAttraktive Vergütung\nIHR JOB?Werden auch Sie ein ENGIEneer und gestalten Sie zusammen mit uns die Zukunft der Energiewende. Wir sind gespannt auf Ihre Online-Bewerbung!\n IHR KONTAKT:\nMonika Brzenska\nTalent Acquisition Specialist\nTelefon: 0221 46 90 54 29 \n \nKENNZIFFER: 2021-0476",
13 | "description_introduction": "ÜBER UNS:Als Komplett-Dienstleister im Bereich Facility Solutions sichern wir den optimalen Betrieb von Gebäuden und Anlagen. Wir bieten modulare Leistungspakete von Service und Wartung über Instandhaltung bis hin zur Bewirtschaftung komplexer Liegenschaften. Für unsere Multi-Site-Kunden arbeiten wir als überregionaler oder auch internationaler Facility-Management-Partner.",
14 | "description_responsabilities": "Wir suchen Servicetechniker bzw. Anlagenmechaniker für die Gewerke Heizung, Klima, Lüftung, Sanitär oder Kälte für die Wartung, Instandsetzung und Bedienung der haustechnischen Anlagen bei unserem Kunden vor Ort.\nSie arbeiten an einem festen Objekt, sodass keine Reisetätigkeit anfällt.\n\nBetreiben der gebäudetechnischen Anlagen an einem anspruchsvollen Industriestandort\nOrganisation, Steuerung, Kontrolle und selbstständige Durchführung von Wartungs- und Instandsetzungsarbeiten an gebäudetechnischen Anlagen\nOptimierung der bestehenden Anlagentechnik und der Betriebsabläufe\nErstellung und Dokumentation der täglichen Arbeitsleistung über mobile Endgeräte\nKoordination und Begleitung von Nachunternehmern\nErster Ansprechpartner vor Ort für unsere Kunden im operative Tagesgeschäft",
15 | "description_requirements": "Abgeschlossene Berufsausbildung als Anlagenmechaniker für Sanitär-, Heizungs- und Klimatechnik oder als Zentralheizungs- und Lüftungsbauer, Gas-Wasserinstallateur oder Kältetechniker\nMehrjährige Berufserfahrungen im Bereich der Technischen Gebäudeausrüstung\nKunden- und Dienstleistungsorientierung gepaart mit Spaß an der Arbeit im Team\nGeregelten Arbeitszeiten mit gelegentlichen Bereitschaftsdiensten\nFührerschein der Klasse B",
16 | "description_perks": "Akademie\nAltersvorsorge\nCorporate Benefits\nPerspektiven\nFirmenfeiern\nFlexible Arbeitszeiten\nGestaltungsfreiheit\nHohe Sicherheitsstandards\nInternationalität\nSpannende Projekte\nTeamgeist\nAttraktive Vergütung"
17 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Job Market Analytics
2 |
3 | The aim of this project is to develop an end-to-end Data Platform to explore and learn new technologies.
4 |
5 | ## Architecture
6 |
7 | 
8 |
9 | ## Storage
10 |
11 | ### Data Lake
12 |
13 | The Data Lake is basically a file system on my local computer, but it could easily be transferred to a Cloud Blob
14 | Storage (like AWS S3 or Azure Blob Storage) if needed. The current Data Lake has two layers:
15 |
16 | - The **Raw Layer**, where the information from the data source is stored in the same file format as it was ingested
17 | (e.g. HTML or XML).
18 | - The **Cleansed Layer**, where we store the information in Parquet, which means that the information is stored in a
19 | tabular format with well-defined columns (see the sketch after this list).
20 |
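As an illustration, here is a minimal sketch of the raw-to-cleansed step with pandas/PyArrow. The paths and columns below are placeholders for illustration, not the project's actual layout:

```python
import pandas as pd

# Placeholder locations; the real layers are partitioned by data source and load timestamp.
raw_file = 'data/raw/my_source/job_description/2021/10/01/09-00-00/1234567.html'
cleansed_file = 'data/cleansed/my_source/job_description.parquet'

# Raw layer: keep the payload exactly as it was ingested (HTML in this case).
with open(raw_file, encoding='utf-8') as f:
    html = f.read()

# Cleansed layer: parse the payload into well-defined columns and store it as Parquet.
df = pd.DataFrame([{'job_id': 1234567, 'html_length': len(html)}])
df.to_parquet(cleansed_file, index=False)
```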
21 | ### Data Warehouse
22 |
23 | The Data Warehouse is based on PostgreSQL plus an extension that reads Parquet files as foreign tables. PostgreSQL
24 | might not be the best choice for a data warehouse since it is row-oriented, but in this case we have a reduced number
25 | of columns and a relatively small data size. Another advantage of PostgreSQL is that I can run it easily on my computer
26 | via Docker, so that I can avoid cloud service costs. We divide the data warehouse into 3 schemas:
27 |
28 | - **Staging**, which is basically a set of foreign tables referencing the Parquet files on the Data Lake Cleansed Layer (see the sketch after this list).
29 | - **Data Vault**, where the data is modelled and historized using
30 | the [Data Vault Specification](https://danlinstedt.com/wp-content/uploads/2018/06/DVModelingSpecs2-0-1.pdf).
31 | - **Data Mart**, which will be the consuming layer for our BI Tool.
32 |
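To give an idea of how the Staging schema is wired up, here is a minimal sketch that creates one foreign table from Python. The table name, columns, file path and the parquet_fdw options are assumptions for illustration, not the project's actual DDL:

```python
import psycopg2

# Placeholder connection settings; in this project PostgreSQL runs in Docker.
conn = psycopg2.connect(host='localhost', dbname='dwh', user='postgres', password='postgres')
with conn, conn.cursor() as cur:
    cur.execute("CREATE EXTENSION IF NOT EXISTS parquet_fdw;")
    cur.execute("CREATE SERVER IF NOT EXISTS parquet_srv FOREIGN DATA WRAPPER parquet_fdw;")
    cur.execute("CREATE SCHEMA IF NOT EXISTS staging;")
    # Assumed parquet_fdw option: 'filename' pointing at a Parquet file from the Cleansed Layer.
    cur.execute("""
        CREATE FOREIGN TABLE IF NOT EXISTS staging.online_job (
            job_id    bigint,
            online_at date,
            url       text
        )
        SERVER parquet_srv
        OPTIONS (filename '/var/lib/parquet-fdw/data/online_job.parquet');
    """)
```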
33 | ### Data Vault Model
34 |
35 | 
36 |
37 | ### Mart Model
38 |
39 | 
40 |
41 | ### Data Lineage (dbt Dag)
42 |
43 | 
44 |
45 | ## Computing
46 |
47 | In order to compute the data, we use two different approaches.
48 |
49 | - **Python** for the data ingestion, when we crawl and scrape data directly from the data source, and also for the data
50 | transformation from the Raw to the Cleansed layer. All Python code is divided into atomic tasks, and these are
51 | orchestrated by [Airflow](https://airflow.apache.org/).
52 | - **SQL** for the transformations of the data inside the Data Warehouse. The SQL tasks are automated and orchestrated
53 | by [dbt](https://www.getdbt.com/).
54 |
55 | ### Data Source Scraping
56 |
57 | In order to download the data from the data source, we run the following Airflow dag:
58 |
59 | 
60 |
61 | First, we make sure that we are connected to the VPN. Then we download and archive the list of the jobs that are
62 | online at the moment from the sitemap, we list which job descriptions we have not downloaded yet, and finally we
63 | download them via browser automation with [Playwright](https://playwright.dev/), as sketched below.
64 |
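For illustration, here is a minimal sketch of that browser-automation step; the function name and the workflow details are placeholders, not the project's actual downloader:

```python
from playwright.sync_api import sync_playwright


def fetch_job_description_html(url: str, headless: bool = True) -> str:
    """Fetch one page and return the full HTML document."""
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        page = browser.new_page()
        page.goto(url)
        # Keep the whole document so nothing is lost if the page layout changes.
        html = page.content()
        browser.close()
        return html
```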
65 | ### Data Transformation Orchestration
66 |
67 | The data transformation is orchestrated as an Airflow DAG, which runs on a daily basis and combines the Python
68 | transformation jobs with the dbt run that builds up the incremental models.
69 |
70 | 
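The dbt step of that DAG ends up invoking `dbt run` on the project above; a minimal sketch of such a call from Python is shown below (the exact arguments are an assumption, the real invocation lives in `do_dbt_run.sh`):

```python
import subprocess

# Assumed invocation for illustration; the actual script may pass different options.
subprocess.run(
    ['dbt', 'run', '--project-dir', 'sql/dwh/job_market_analytics'],
    check=True,  # fail the Airflow task if any model fails to build
)
```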
71 |
72 | ## Frequently Asked Questions
73 |
74 | ### What questions can be answered with this project?
75 |
76 | Here are some examples of what we can answer:
77 |
78 | - How long is a job offer online until it is taken offline?
79 | - Which technologies are most in demand at the moment?
80 | - How does the demand for a particular technology evolve over time?
81 | - How many job offers are remote, and how is this evolving over time?
82 | - When was a particular job offer first published?
83 |
84 | ### Could we answer those questions with a simpler technology stack?
85 |
86 | Yes, we could. But the point of the project is to explore and learn new technologies and concepts, so it has been
87 | over-engineered on purpose.
88 |
89 | ### Are you planning to create a public Web or Mobile Application with this?
90 |
91 | No, at least not at the moment. This is just for educative purposes.
92 |
93 | ### Why did you choose Parquet as file format for the Cleansed Layer in the Data Lake?
94 |
95 | I chose Parquet because it is a column-oriented, compressed file format that is well battle-tested. Good Python
96 | libraries like [PyArrow](https://arrow.apache.org/docs/python/parquet.html) are available to write and read it.
97 |
98 | ### Why did you choose PostgreSQL for the Data Warehouse?
99 |
100 | PostgreSQL is a very robust database with standard SQL that can run locally and its performance is good enough for the
101 | current data size and number of columns.
102 |
103 | ### How big is your data?
104 |
105 | It is around 530 GB in raw format after scraping the data sources since October 2021, and it grows by around 2 GB
106 | every day. After cleansing and compressing the data to Parquet, it is around 30 times smaller (roughly 18 GB), since we
107 | can get rid of a great deal of HTML, CSS and JS that does not provide any extra information for my use cases.
108 |
109 | 
110 |
--------------------------------------------------------------------------------
/python/.flake8:
--------------------------------------------------------------------------------
1 | # All configuration for plugins and other utils is defined here.
2 | # Read more about `setup.cfg`:
3 | # https://docs.python.org/3/distutils/configfile.html
4 |
5 |
6 | # === Linter configuration ===
7 | # You can reuse this configuration in your own projects.
8 | # See: https://wemake-python-stylegui.de/en/latest/pages/usage/integrations/nitpick.html
9 |
10 | [flake8]
11 | # Base flake8 configuration:
12 | # https://flake8.pycqa.org/en/latest/user/configuration.html
13 | format = wemake
14 | show-source = True
15 | statistics = False
16 | doctests = True
17 |
18 | # Plugins:
19 | max-complexity = 6
20 | max-line-length = 120
21 |
22 | # darglint configuration:
23 | # https://github.com/terrencepreilly/darglint
24 | strictness = long
25 | docstring-style = numpy
26 |
27 | # Self settings:
28 | max-imports = 17
29 |
30 | # Excluding some directories:
31 | exclude =
32 | .git
33 | __pycache__
34 | .venv
35 | .eggs
36 | *.egg
37 | dist
38 | # These folders contain code badly written for reasons:
39 | # Project specific, do not copy.
40 | tests/fixtures/**
41 | tests/**/snapshots/**
42 |
43 | # Exclude some pydoctest checks globally:
44 | ignore = D100, D104, D401, W504, RST303, RST304, DAR103, DAR203, E800, D103, WPS421, WPS305
45 |
46 | per-file-ignores =
47 | # These function names are part of 3d party API:
48 | wemake_python_styleguide/visitors/ast/*.py: N802
49 | # These modules should contain a lot of classes:
50 | wemake_python_styleguide/violations/*.py: WPS202
51 | # Eval is a complex task:
52 | wemake_python_styleguide/logic/safe_eval.py: WPS232
53 | # This module should contain magic numbers:
54 | wemake_python_styleguide/options/defaults.py: WPS432
55 | # Checker has a lot of imports:
56 | wemake_python_styleguide/checker.py: WPS201
57 | # Allows mypy type hinting, `Ellipsis`` usage, multiple methods:
58 | wemake_python_styleguide/types.py: D102, WPS214, WPS220, WPS428
59 | # There are multiple fixtures, `assert`s, and subprocesses in tests:
60 | tests/test_visitors/test_ast/test_naming/conftest.py: WPS202
61 | tests/*.py: S101, S105, S404, S603, S607, WPS211, WPS226, WPS323
62 | # Docs can have the configuration they need:
63 | docs/conf.py: WPS407
64 | # Pytest fixtures
65 | tests/plugins/*.py: WPS442
66 |
67 |
68 | [isort]
69 | # isort configuration:
70 | # https://github.com/timothycrosley/isort/wiki/isort-Settings
71 | include_trailing_comma = true
72 | use_parentheses = true
73 | # See https://github.com/timothycrosley/isort#multi-line-output-modes
74 | multi_line_output = 3
75 | # Matches the flake8 max-line-length of 120:
76 | line_length = 120
77 |
78 | # We need these lines for Github Action to work correctly,
79 | # **please** do not copy it to your own configs:
80 | default_section = THIRDPARTY
81 | known_first_party = wemake_python_styleguide*
82 | skip_glob =
83 | # These folders contain code badly written for reasons:
84 | tests/fixtures/**
85 | tests/**/snapshots/**
86 |
87 |
88 | # === Internal tools ===
89 | # You are not interested in anything beyond this line.
90 |
91 | [tool:pytest]
92 | # py.test configuration: http://doc.pytest.org/en/latest/customize.html
93 | norecursedirs = tests/fixtures *.egg .eggs dist build docs .tox .git __pycache__
94 |
95 | filterwarnings =
96 | ignore::DeprecationWarning
97 |
98 | addopts =
99 | --strict
100 | --doctest-modules
101 | --cov=wemake_python_styleguide
102 | --cov-branch
103 | --cov-report=term-missing:skip-covered
104 | --cov-report=html
105 | --cov-report=xml
106 | --cov-fail-under=100
107 |
108 |
109 | [coverage:run]
110 | # Coverage configuration: https://coverage.readthedocs.io/
111 |
112 | # We don't need to cover some files. They are fully checked with mypy.
113 | # And don't contain any logic.
114 | omit =
115 | wemake_python_styleguide/types.py
116 |
117 | # Here we specify plugins for coverage to be used:
118 | plugins =
119 | coverage_conditional_plugin
120 |
121 | [coverage:coverage_conditional_plugin]
122 | # Here we specify our pragma rules:
123 | rules =
124 | "sys_version_info < (3, 8)": py-lt-38
125 | "sys_version_info >= (3, 8)": py-gte-38
126 |
127 | "sys_version_info < (3, 9)": py-lt-39
128 | "sys_version_info >= (3, 9)": py-gte-39
129 |
130 |
131 | [mypy]
132 | # The mypy configurations: http://bit.ly/2zEl9WI
133 | allow_redefinition = False
134 | check_untyped_defs = True
135 | disallow_untyped_decorators = True
136 | disallow_any_explicit = True
137 | disallow_any_generics = True
138 | disallow_untyped_calls = True
139 | ignore_errors = False
140 | ignore_missing_imports = True
141 | implicit_reexport = False
142 | local_partial_types = True
143 | strict_optional = True
144 | strict_equality = True
145 | no_implicit_optional = True
146 | warn_unused_ignores = True
147 | warn_redundant_casts = True
148 | warn_unused_configs = True
149 | warn_unreachable = True
150 | warn_no_return = True
151 |
152 | [mypy-wemake_python_styleguide.compat.nodes]
153 | # We allow explicit `Any` only in this file, because of the compatibility:
154 | disallow_any_explicit = False
155 |
156 | [mypy-wemake_python_styleguide.compat.packaging]
157 | # We allow unused `ignore` comments, because we cannot sync it between versions:
158 | warn_unused_ignores = False
159 |
160 | [mypy-wemake_python_styleguide.logic.safe_eval]
161 | # We allow explicit `Any` only in this file, because that's what it does:
162 | disallow_any_explicit = False
163 |
164 |
165 | [doc8]
166 | # doc8 configuration: https://pypi.org/project/doc8/
167 | ignore-path = docs/_build
168 | max-line-length = 120
169 | sphinx = True
--------------------------------------------------------------------------------
/python/tests/data/normalize_job_description/output/test_case_7610188.json:
--------------------------------------------------------------------------------
1 | {
2 | "title": "Ansible/ServiceNow Experte (m/w/d)",
3 | "online_status": "online",
4 | "is_anonymous": false,
5 | "job_id": 7610188,
6 | "should_display_early_applicant": false,
7 | "location": "Hannover oder Münster",
8 | "contract_type": "Feste Anstellung",
9 | "work_type": "Vollzeit, Home Office möglich",
10 | "online_date": "2021-10-13T15:40:32Z",
11 | "company_name": "Finanz Informatik GmbH & Co. KG",
12 | "description": "Als einer der größten Banken-IT-Dienstleister Europas sind wir der Treiber der Digitalisierung innerhalb der Sparkassen-Finanzgruppe. Mit über 4.000 Mitarbeitern an 3 Standorten machen wir digitales Banking von heute leistungsfähig und entwickeln smarte Finanz-Services von morgen. Dabei bieten wir Ihnen ein breites Aufgabenspektrum, in dem Sie Ihre individuelle Stärke hervorragend einbringen können. Ob App-Entwicklung, Netzwerktechnologien und Serverbetrieb oder Beratung, Schulung und Support – bei uns finden Sie Ihre Berufung! Als Spezialist oder auch als Generalist. Alles mit besten Karrierechancen, viel Raum für persönliche Entfaltung und zahlreichen Benefits.\nFür unsere Abteilung Bereitstellung Kommunikationsdienste suchen wir zum nächstmöglichen Zeitpunkt für den Standort Hannover oder Münster Verstärkung als\nAnsible/ServiceNow Experte (m/w/d)\nIhre Aufgaben:\nSie sind unser Experte für die Einführung und kontinuierliche Weiterentwicklung unserer Automationsstrategie\nEntwurf/Programmierung (Python) von Automationsobjekten zur Optimierung des Produktionsablaufes und der Überwachung der Systemplattform \nAufbau von automatisierten Schnittstellen zur umliegenden Serverinfrastruktur\nDurchführung von Programm- und Systemtests und Unterstützung bei der Fehlerbehebung \nDokumentation sowie Pflege und Qualitätssicherung der automatisierten Plattform\nEntwicklung der Automatisierung bei der Bereitstellung neuer Services\n\nIhr Profil:\nAbgeschlossenes technisches Studium vorzugsweise im IT/TK-Umfeld oder eine vergleichbare Ausbildung/Qualifikation\nMehrjährige Erfahrung in der Programmierung und im Umgang mit Skriptsprachen \nErfahrung mit Telefonie-Plattformen und -Systemen, ACD, VoIP-Netzwerkstrukturen \nKenntnisse im Plattformbetrieb von Windows, Unix, Datenbanken sowie VMware\nErfahrungen im Prozess-, Test- und Qualitätsmanagement wünschenswert\nKundenorientierung und gute kommunikative Fähigkeiten \nSie sind ein Teamplayer und ergänzen unser dynamisches Team mit Initiative und Zielstrebigkeit\nBereitschaft zu gelegentlichen Dienstreisen sowie Sondereinsätzen\n\nIhre Benefits:\nAltersvorsorge\nBarrierefrei\nBetriebssport\nFamilienservice\nFirmenevents\nFlexible Arbeitszeiten\nMobiles Arbeiten\nJobticket\nKantine\nTarifvertrag\nWeiterbildung\nFitnessförderung\n\nBei uns erwartet Sie eine attraktive Vergütung basierend auf Ihrer Qualifikation sowie Ihrer relevanten, praktischen Erfahrung.\nKlingt interessant?Dann bewerben Sie sich ganz einfach über unser FI-Karriere-Online-Portal. Wir freuen uns auf Ihre Bewerbung unter Angabe der Kennziffer 341/2021! Sollten Sie vorab weitere Auskünfte zu dieser Stelle wünschen, steht Ihnen gerne Herr Malte Kurz zur Verfügung. Sie erreichen Malte Kurz unter Tel. 0511 5102-24958 oder per E-Mail unter karriere@f-i.de.",
13 | "description_introduction": "Als einer der größten Banken-IT-Dienstleister Europas sind wir der Treiber der Digitalisierung innerhalb der Sparkassen-Finanzgruppe. Mit über 4.000 Mitarbeitern an 3 Standorten machen wir digitales Banking von heute leistungsfähig und entwickeln smarte Finanz-Services von morgen. Dabei bieten wir Ihnen ein breites Aufgabenspektrum, in dem Sie Ihre individuelle Stärke hervorragend einbringen können. Ob App-Entwicklung, Netzwerktechnologien und Serverbetrieb oder Beratung, Schulung und Support – bei uns finden Sie Ihre Berufung! Als Spezialist oder auch als Generalist. Alles mit besten Karrierechancen, viel Raum für persönliche Entfaltung und zahlreichen Benefits.\nFür unsere Abteilung Bereitstellung Kommunikationsdienste suchen wir zum nächstmöglichen Zeitpunkt für den Standort Hannover oder Münster Verstärkung als\nAnsible/ServiceNow Experte (m/w/d)",
14 | "description_responsabilities": "Sie sind unser Experte für die Einführung und kontinuierliche Weiterentwicklung unserer Automationsstrategie\nEntwurf/Programmierung (Python) von Automationsobjekten zur Optimierung des Produktionsablaufes und der Überwachung der Systemplattform \nAufbau von automatisierten Schnittstellen zur umliegenden Serverinfrastruktur\nDurchführung von Programm- und Systemtests und Unterstützung bei der Fehlerbehebung \nDokumentation sowie Pflege und Qualitätssicherung der automatisierten Plattform\nEntwicklung der Automatisierung bei der Bereitstellung neuer Services",
15 | "description_requirements": "Abgeschlossenes technisches Studium vorzugsweise im IT/TK-Umfeld oder eine vergleichbare Ausbildung/Qualifikation\nMehrjährige Erfahrung in der Programmierung und im Umgang mit Skriptsprachen \nErfahrung mit Telefonie-Plattformen und -Systemen, ACD, VoIP-Netzwerkstrukturen \nKenntnisse im Plattformbetrieb von Windows, Unix, Datenbanken sowie VMware\nErfahrungen im Prozess-, Test- und Qualitätsmanagement wünschenswert\nKundenorientierung und gute kommunikative Fähigkeiten \nSie sind ein Teamplayer und ergänzen unser dynamisches Team mit Initiative und Zielstrebigkeit\nBereitschaft zu gelegentlichen Dienstreisen sowie Sondereinsätzen",
16 | "description_perks": "Altersvorsorge\nBarrierefrei\nBetriebssport\nFamilienservice\nFirmenevents\nFlexible Arbeitszeiten\nMobiles Arbeiten\nJobticket\nKantine\nTarifvertrag\nWeiterbildung\nFitnessförderung\n\nBei uns erwartet Sie eine attraktive Vergütung basierend auf Ihrer Qualifikation sowie Ihrer relevanten, praktischen Erfahrung."
17 | }
--------------------------------------------------------------------------------
/python/simplescraper/tasks/curate_job_descriptions.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import numpy as np
4 |
5 | from common.entity import JOB, JOB_LOCATION, JOB_DESCRIPTION, JOB_TECHNOLOGY
6 | from common.logging import configure_logger, logger
7 | from common.storage import get_load_timestamp, get_load_date, load_cleansed_df, save_curated_df
8 |
9 | JOB_DESCRIPTION_SAT_COLUMNS = ['title', 'online_status', 'is_anonymous', 'should_display_early_applicant',
10 | 'contract_type', 'work_type', 'online_date', 'company_name', 'description_introduction',
11 | 'description_responsabilities', 'description_requirements', 'description_perks']
12 |
13 | BASE_COLUMNS = ['year', 'month', 'day', 'job_id', 'load_timestamp']
14 |
15 | TECHNOLOGIES = [
16 | 'AI',
17 | 'Airflow',
18 | 'Android',
19 | 'Angular',
20 | 'AWS',
21 | 'Azure',
22 | 'CSS',
23 | 'Couchbase',
24 | 'CouchDB',
25 | 'Cypress',
26 | 'Dagster',
27 | 'Dask',
28 | 'Databricks',
29 | 'dbt',
30 | 'Docker',
31 | 'Duckdb',
32 | 'ELT',
33 | 'ETL',
34 | 'Flink',
35 | 'Flutter',
36 | 'GCP',
37 | 'Go',
38 | 'Golang',
39 | 'Gradle',
40 | 'gRPC',
41 | 'HANA',
42 | 'Java',
43 | 'JavaScript',
44 | 'Keras',
45 | 'Kotlin',
46 | 'Kubernetes',
47 | 'LESS',
48 | 'Maven',
49 | 'ML',
50 | 'MongoDB',
51 | 'MySQL',
52 | 'NLP',
53 | 'Oracle',
54 | 'Pandas',
55 | 'Playwright',
56 | 'PostgreSQL',
57 | 'Prefect',
58 | 'Puppeteer',
59 | 'Purview',
60 | 'Python',
61 | 'PyTorch',
62 | 'React',
63 | 'REST',
64 | 'Rust',
65 | 'Tensorflow',
66 | 'TestCafe',
67 | 'TypeScript',
68 | 'WebAssembly',
69 | 'scikit',
70 | 'Selenium',
71 | 'Snowflake',
72 | 'Snowplow',
73 | 'Spark',
74 | 'Spring',
75 | 'Storm',
76 | 'SAP',
77 | 'SCSS',
78 | 'SQL',
79 | 'SSIS',
80 | 'Synapse',
81 | 'Vue',
82 | ]
83 |
84 |
85 | def process_job_description(df):
86 | df = df.copy()
87 | df = df[df['company_name'].notna()]
88 | df = df[BASE_COLUMNS + JOB_DESCRIPTION_SAT_COLUMNS]
89 | save_curated_df(df, JOB)
90 |
91 |
92 | def process_location(df):
93 | df = df[BASE_COLUMNS + ['location']].copy()
94 |
95 | df['location'] = df['location'].str.replace('Frankfurt (Main)', 'Frankfurt am Main', regex=False)
96 | df['location'] = df['location'].str.replace('Frankfurt a. M.', 'Frankfurt am Main', regex=False)
97 | df['location'] = df['location'].str.replace('Frankfurt a.M.', 'Frankfurt am Main', regex=False)
98 | df['location'] = df['location'].str.replace('Frankfurt am Main (60488)', 'Frankfurt am Main', regex=False)
99 | df['location'] = df['location'].str.replace('Frankfurt Am Main', 'Frankfurt am Main', regex=False)
100 | df['location'] = df['location'].str.replace('Frankfurt/M.', 'Frankfurt am Main', regex=False)
101 | df['location'] = df['location'].str.replace('Frankfurt aM', 'Frankfurt am Main', regex=False)
102 | df['location'] = df['location'].str.replace('Frankfurt (am Main)', 'Frankfurt am Main', regex=False)
103 | df['location'] = df['location'].str.replace('Frankfurt Main', 'Frankfurt am Main', regex=False)
104 | df['location'] = df['location'].str.replace('Frankfurt aam Main', 'Frankfurt am Main', regex=False)
105 |
106 | df['location'] = df['location'].str.replace('|'.join([' und ', ' oder ', '/', ';', ' - ', ':']), ',', regex=True)
107 | df['location'] = df['location'].str.replace(' | ', ',', regex=False)
108 | df['location'] = df['location'].str.replace(' .', ',', regex=False)
109 | df['location'] = df['location'].str.replace(' u.a. ', ',', regex=False)
110 | df['location'] = df['location'].str.split(',')
111 | df = df.explode('location').reset_index(drop=True)
112 |
113 | df['location'] = df['location'].str.strip()
114 |
115 | df['location'] = df['location'].replace('Frankfurt', 'Frankfurt am Main')
116 |
117 | df['location'] = df['location'].replace('', np.nan)
118 | df['location'] = df['location'].replace('keine Angabe', np.nan)
119 | df = df.dropna()
120 |
121 | save_curated_df(df, JOB_LOCATION)
122 |
123 |
124 | def process_technology(df):
125 | df = df.copy()
126 | df['description'] = df['title'] + ' ' + \
127 | df['description_introduction'] + ' ' + \
128 | df['description_responsabilities'] + ' ' + \
129 | df['description_requirements'] + ' ' + \
130 | df['description_perks']
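# Flag each technology with a case-insensitive whole-word regex match on the concatenated text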
131 | for technology in TECHNOLOGIES:
132 | df[technology] = df['description'].str.contains(fr'(?i)\b{technology}\b', regex=True)
133 | df['Other'] = ~df[TECHNOLOGIES].any(axis='columns')
134 | df = df.melt(id_vars=BASE_COLUMNS, value_vars=TECHNOLOGIES + ['Other'], var_name='technology')
135 | df = df[df['value'].notna()]
136 | df = df[df['value']]
137 | df = df[BASE_COLUMNS + ['technology']]
138 |
139 | save_curated_df(df, JOB_TECHNOLOGY)
140 |
141 |
142 | def curate_job_descriptions(load_timestamp, load_date):
143 | configure_logger(load_timestamp)
144 | logger.info(f'Start curate_job_descriptions: {load_timestamp} {load_date}')
145 |
146 | df = load_cleansed_df(JOB_DESCRIPTION, load_date=load_date)
147 |
148 | df = df.dropna(subset=['job_id'])
149 | df['job_id'] = df['job_id'].astype('int')
150 | df = df.sort_values(by=['job_id'])
151 |
152 | process_job_description(df)
153 | process_location(df)
154 | process_technology(df)
155 |
156 | logger.info(f'End curate_job_descriptions: {load_timestamp} {load_date}')
157 |
158 |
159 | if __name__ == "__main__":
160 | _load_timestamp = sys.argv[1] if len(sys.argv) > 1 else get_load_timestamp()
161 | _load_date = sys.argv[2] if len(sys.argv) > 2 else get_load_date()
162 | curate_job_descriptions(_load_timestamp, _load_date)
163 |
--------------------------------------------------------------------------------
/python/tests/data/normalize_job_description/output/test_case_7609275.json:
--------------------------------------------------------------------------------
1 | {
2 | "title": "Account Manager (m/w/d)",
3 | "online_status": "online",
4 | "is_anonymous": false,
5 | "job_id": 7609275,
6 | "should_display_early_applicant": false,
7 | "location": "bundesweit",
8 | "contract_type": "Feste Anstellung",
9 | "work_type": "Vollzeit, Home Office möglich",
10 | "online_date": "2021-10-13T13:22:15Z",
11 | "company_name": "Quentic GmbH",
12 | "description": "Passionate people for sustainable softwareQuentic ist einer der führenden Lösungsanbieter für Software as a Service (SaaS) im europäischen HSE- und CSR-Markt. Das Unternehmen hat seinen Hauptsitz in Berlin und beschäftigt über 250 Mitarbeitende. Niederlassungen befinden sich in Deutschland, Österreich und der Schweiz sowie in Finnland, Belgien, Dänemark, Schweden, den Niederlanden, Spanien und Italien.\nÜber 800 Kunden stärken ihr HSE- und CSR-Management mit den Quentic Software-Lösungen. Mit neun individuell kombinierbaren Modulen umfasst die Online-Plattform Arbeitssicherheit, Risks & Audits, Control of Work, Gefahrstoffe, Legal Compliance, Online-Unterweisungen, Prozesse sowie Umweltmanagement und Nachhaltigkeit. Quentic vernetzt Daten, verbindet alle HSE- und CSR-Akteure und begeistert für das gesamte Aufgabenfeld - via Browser oder per App. Da Aufgaben über Abteilungen, Standorte und Länder hinweg ineinandergreifen, lassen sich Unternehmensprozesse effizient nach gesetzlichen Vorgaben steuern.\nDeine Aufgaben\nDu betreust überwiegend Bestandkunden im Medium und Large Enterprise Business der Industrie im DACH-Raum\nDu erkennst Up- & Cross-Selling-Potentiale und schöpfst sie aus\nDu führst Verhandlungen über Preise und Vertragsverlängerungen\nDu präsentierst unser Leistungsversprechen unseren Bestandskunden und analysierst ihren Bedarf\nDu repräsentierst Quentic auf Roadshows und Messen\nDu pflegst unser CRM-System und reportest regelmäßig an unsere Head of Account Management\nDu arbeitest mit externen Dienstleistern zusammen\nDu sicherst und erhöhst die Kundenzufriedenheit\n\nDeine Qualifikationen\nDu hast bereits umfangreiche Berufserfahrung in der Bestandkundenbetreuung im B2B Software-Bereich\nBegriffe wie Buying Center, Tender und Complex Sales sind Dir geläufig\nDu bist technikaffin und hast Interesse an den Themen Arbeitssicherheit, Nachhaltigkeit und Umweltschutz\nMit Empathie und Geschick gelingt es Dir, komplexe Sachverhalte verständlich zu präsentieren\nDu bist argumentationssicher und verhandlungsstark und kannst so unsere Business Software online und vor Ort sicher präsentieren \nDu sprichst fließend Deutsch und Englisch, weitere europäische Sprachen sind ein Plus\nDu bist bereit, innerhalb der DACH-Region zu reisen (i.d.R. 1-2 Tage pro Woche innerhalb Deines lokalen Vertriebgebiets)\n\nDeine Aussichten\nNicht gesättigtes Marktumfeld mit steigender Nachfrage\nUnterstützung durch ein starkes Marketing sowie unsere Consultants bei der Kundenbetreuung\nAttraktive Vergütung aus einem Fixgehalt und einer transparenten Variablen je nach Zielvereinbarung\nFirmen-Kreditkarte und ein mobiles Büro\nStrukturierte Einarbeitung und Betreuung durch Mentoren\nFlache Hierarchien mit offenen Türen in einer lockeren, professionellen Atmosphäre\nRegelmäßige Teamevents und ein besonderes Augenmerk auf die Work-Life-Balance (flexible Arbeitszeiten, Bezuschussung Fitness-Studio u. v. m.)\n\nWeitere InformationenWenn du die Welt ein bisschen sicherer machen und mehr über die Themen Umweltschutz, Arbeitssicherheit und Nachhaltigkeit erfahren möchtest, bist du bei uns genau richtig! Wer wir sind und wie wir arbeiten, siehst du hier",
13 | "description_introduction": "Passionate people for sustainable softwareQuentic ist einer der führenden Lösungsanbieter für Software as a Service (SaaS) im europäischen HSE- und CSR-Markt. Das Unternehmen hat seinen Hauptsitz in Berlin und beschäftigt über 250 Mitarbeitende. Niederlassungen befinden sich in Deutschland, Österreich und der Schweiz sowie in Finnland, Belgien, Dänemark, Schweden, den Niederlanden, Spanien und Italien.\nÜber 800 Kunden stärken ihr HSE- und CSR-Management mit den Quentic Software-Lösungen. Mit neun individuell kombinierbaren Modulen umfasst die Online-Plattform Arbeitssicherheit, Risks & Audits, Control of Work, Gefahrstoffe, Legal Compliance, Online-Unterweisungen, Prozesse sowie Umweltmanagement und Nachhaltigkeit. Quentic vernetzt Daten, verbindet alle HSE- und CSR-Akteure und begeistert für das gesamte Aufgabenfeld - via Browser oder per App. Da Aufgaben über Abteilungen, Standorte und Länder hinweg ineinandergreifen, lassen sich Unternehmensprozesse effizient nach gesetzlichen Vorgaben steuern.",
14 | "description_responsabilities": "Du betreust überwiegend Bestandkunden im Medium und Large Enterprise Business der Industrie im DACH-Raum\nDu erkennst Up- & Cross-Selling-Potentiale und schöpfst sie aus\nDu führst Verhandlungen über Preise und Vertragsverlängerungen\nDu präsentierst unser Leistungsversprechen unseren Bestandskunden und analysierst ihren Bedarf\nDu repräsentierst Quentic auf Roadshows und Messen\nDu pflegst unser CRM-System und reportest regelmäßig an unsere Head of Account Management\nDu arbeitest mit externen Dienstleistern zusammen\nDu sicherst und erhöhst die Kundenzufriedenheit",
15 | "description_requirements": "Du hast bereits umfangreiche Berufserfahrung in der Bestandkundenbetreuung im B2B Software-Bereich\nBegriffe wie Buying Center, Tender und Complex Sales sind Dir geläufig\nDu bist technikaffin und hast Interesse an den Themen Arbeitssicherheit, Nachhaltigkeit und Umweltschutz\nMit Empathie und Geschick gelingt es Dir, komplexe Sachverhalte verständlich zu präsentieren\nDu bist argumentationssicher und verhandlungsstark und kannst so unsere Business Software online und vor Ort sicher präsentieren \nDu sprichst fließend Deutsch und Englisch, weitere europäische Sprachen sind ein Plus\nDu bist bereit, innerhalb der DACH-Region zu reisen (i.d.R. 1-2 Tage pro Woche innerhalb Deines lokalen Vertriebgebiets)",
16 | "description_perks": "Nicht gesättigtes Marktumfeld mit steigender Nachfrage\nUnterstützung durch ein starkes Marketing sowie unsere Consultants bei der Kundenbetreuung\nAttraktive Vergütung aus einem Fixgehalt und einer transparenten Variablen je nach Zielvereinbarung\nFirmen-Kreditkarte und ein mobiles Büro\nStrukturierte Einarbeitung und Betreuung durch Mentoren\nFlache Hierarchien mit offenen Türen in einer lockeren, professionellen Atmosphäre\nRegelmäßige Teamevents und ein besonderes Augenmerk auf die Work-Life-Balance (flexible Arbeitszeiten, Bezuschussung Fitness-Studio u. v. m.)"
17 | }
--------------------------------------------------------------------------------
/python/simplescraper/tasks/download_job_descriptions.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import time
3 |
4 | from playwright.async_api import async_playwright, Error, TimeoutError
5 |
6 | from common.chunking import get_chunk_size
7 | from common.entity import JOB_DESCRIPTION
8 | from common.env_variables import DATA_SOURCE_URL, SEMAPHORE_COUNT, MAX_CHUNK_SIZE, LATEST_LOAD_TIMESTAMP, RUN_HEADLESS, \
9 | MIN_TO_DOWNLOAD, MAX_TO_DOWNLOAD
10 | from common.logging import logger, configure_logger
11 | from common.storage import save_raw_file, load_temp_df, JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV
12 |
13 | TAB_HITS = 10
14 |
15 |
16 | class PageNotFound(Exception):
17 | pass
18 |
19 |
20 | async def open_first_page(browser):
21 | page = await browser.new_page()
22 | await page.goto(DATA_SOURCE_URL, wait_until='domcontentloaded')
23 | await page.click('#ccmgt_explicit_accept')
24 | for i in range(TAB_HITS * 2):
25 | await page.keyboard.press('Tab')
26 | await page.goto(DATA_SOURCE_URL + 'de/sitemap/', wait_until='domcontentloaded')
27 | for i in range(TAB_HITS * 2):
28 | await page.keyboard.press('Tab')
29 | return page
30 |
31 |
32 | async def download_urls(df, load_timestamp):
33 | if df.empty:
34 | return
35 | async with async_playwright() as p:
36 | chunk_pos = df['chunk_pos'].values[0]
37 | chunk_pos = str(chunk_pos).rjust(2)
38 | num_chunks = df['num_chunks'].values[0]
39 | chunk_size = df['chunk_size'].values[0]
40 | chunk_id = f'{chunk_pos}/{num_chunks}'
41 | browser = await p.chromium.launch(headless=RUN_HEADLESS, slow_mo=250)
42 | try:
43 | logger.info(f'Starting chunk {chunk_id} with size of {chunk_size}')
44 | start_time = time.time()
45 | page = await open_first_page(browser)
46 | url_dicts = df.to_dict('records')
47 | for url_dict in url_dicts:
48 | pos_in_chunk = url_dict['pos_in_chunk']
49 | url = url_dict['url']
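# Extract the numeric job id from the URL slug: the segment after the last '--', up to the next '-'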
50 | job_id = url.rsplit('--', 1)
51 | job_id = job_id[1]
52 | job_id = job_id.split('-')
53 | job_id = job_id[0]
54 | file_name = f'{job_id}.html'
55 | try:
56 | logger.debug(f'Chunk {chunk_id}: Downloading ({pos_in_chunk}/{chunk_size}): {url}')
57 | try:
58 | response = await page.goto(url, wait_until='domcontentloaded')
59 | for i in range(TAB_HITS):
60 | await page.keyboard.press('Tab')
61 | if 400 <= response.status < 500:
62 | raise PageNotFound('Page not found')
63 | await page.wait_for_selector('.js-app-ld-ContentBlock', timeout=10000, state='attached')
64 | except TimeoutError as err:
65 | logger.warning(
66 | f'Chunk {chunk_id}: TimeoutError: second try for {url} because of the following error: {err}')
67 | await page.goto(DATA_SOURCE_URL + 'de/sitemap/', wait_until='domcontentloaded')
68 | for i in range(TAB_HITS):
69 | await page.keyboard.press('Tab')
70 | await page.goto(url, wait_until='domcontentloaded')
71 | for i in range(TAB_HITS):
72 | await page.keyboard.press('Tab')
73 | await page.wait_for_selector('.js-app-ld-ContentBlock', timeout=20000, state='attached')
74 | page_content = await page.content()
75 | save_raw_file(page_content, JOB_DESCRIPTION, load_timestamp, file_name)
76 | logger.success(f'Chunk {chunk_id}: Downloaded ({pos_in_chunk}/{chunk_size}): {url}')
77 | except TimeoutError:
78 | logger.warning(f'Chunk {chunk_id}: TimeoutError: Timeout error while requesting the page {url}')
79 | except AttributeError:
80 | logger.warning(f'Chunk {chunk_id}: AttributeError: it seems the following URL is gone {url}')
81 | except PageNotFound:
82 | logger.warning(f'Chunk {chunk_id}: PageNotFound: the following URL is no longer available {url}')
83 | except Error as err:
84 | logger.error(f'Chunk {chunk_id}: It seems that the browser crashed because of the following error: {err}')
85 | finally:
86 | await browser.close()
87 |
88 | elapsed_time = time.time() - start_time
89 | logger.info(f'Finished chunk {chunk_id}')
90 | logger.info(f'Elapsed time {chunk_id}: {elapsed_time:.2f} seconds')
91 | logger.info(f'Downloads per second {chunk_id}: {chunk_size / elapsed_time:.2f}')
92 |
93 |
94 | def split_dataframe(df, chunk_size):
95 | chunks = []
96 | num_chunks = len(df) // chunk_size + 1
97 | for i in range(num_chunks):
98 | chunk = df[i * chunk_size:(i + 1) * chunk_size]
99 | chunk = chunk.reset_index(drop=True)
100 | chunk['chunk_pos'] = i + 1
101 | chunk['num_chunks'] = num_chunks
102 | chunk['pos_in_chunk'] = chunk.index + 1
103 | chunk['chunk_size'] = chunk.shape[0]
104 | chunks.append(chunk)
105 | return chunks
106 |
107 |
108 | async def safe_download_urls(urls, load_timestamp, sem):
109 | async with sem: # semaphore limits num of simultaneous downloads
110 | return await download_urls(urls, load_timestamp)
111 |
112 |
113 | async def run_async_tasks(chunks, load_timestamp):
114 | sem = asyncio.Semaphore(SEMAPHORE_COUNT)
115 | tasks = [
116 | asyncio.ensure_future(safe_download_urls(chunk, load_timestamp, sem)) # creating task starts coroutine
117 | for chunk
118 | in chunks
119 | ]
120 | await asyncio.gather(*tasks)
121 |
122 |
123 | def download_job_descriptions(load_timestamp, df_to_download=None):
124 | configure_logger(load_timestamp)
125 | df = df_to_download if df_to_download is not None else load_temp_df(load_timestamp, JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV)
126 |
127 | if MAX_TO_DOWNLOAD:
128 | pending_download = df.shape[0] - MAX_TO_DOWNLOAD if df.shape[0] > MAX_TO_DOWNLOAD else 0
129 | df = df.head(MAX_TO_DOWNLOAD)
130 | else:
131 | pending_download = 0
132 |
133 | total_count = df.shape[0]
134 |
135 | if total_count < MIN_TO_DOWNLOAD:
136 | logger.success(f'Not enough to download: {total_count} for the load timestamp {load_timestamp}')
137 | return
138 |
139 | chunk_size = get_chunk_size(total_count, SEMAPHORE_COUNT, MAX_CHUNK_SIZE)
140 | chunks = split_dataframe(df, chunk_size)
141 |
142 | start_time = time.time()
143 | logger.info(f'Starting downloading job descriptions for job: {load_timestamp}')
144 | logger.info(f'Concurrent tasks: {SEMAPHORE_COUNT}')
145 | logger.info(f'Urls to download: {total_count}')
146 | logger.info(f'Pending download: {pending_download}')
147 |
148 | loop = asyncio.SelectorEventLoop()
149 | asyncio.set_event_loop(loop)
150 | try:
151 | loop.run_until_complete(run_async_tasks(chunks, load_timestamp))
152 | finally:
153 | loop.run_until_complete(loop.shutdown_asyncgens())
154 | loop.close()
155 |
156 | elapsed_time = time.time() - start_time
157 | logger.info(f'Elapsed time: {elapsed_time:.2f} seconds')
158 | logger.info(f'Downloads per second: {total_count / elapsed_time:.2f}')
159 | logger.success(f'Finished: {total_count} urls for the timestamp {load_timestamp}')
160 | logger.success(f'Pending download: {pending_download} urls for the timestamp {load_timestamp}')
161 |
162 |
163 | if __name__ == '__main__':
164 | download_job_descriptions(
165 | LATEST_LOAD_TIMESTAMP,
166 | load_temp_df(LATEST_LOAD_TIMESTAMP, JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV),
167 | )
168 |
--------------------------------------------------------------------------------
/python/simplescraper/common/storage.py:
--------------------------------------------------------------------------------
1 | """
2 | This module will store the files in the following structure
3 | - root
4 | -
5 | -
6 | -
7 | -
8 | -
9 | """
10 | import datetime
11 | import glob
12 | import os
13 | import pathlib
14 |
15 | import pandas as pd
16 | import pyarrow as pa
17 | import pyarrow.parquet as pq
18 | from dateutil import parser
19 | from pyarrow import ArrowInvalid
20 |
21 | from common.entity import Entity
22 | from common.env_variables import DATA_SOURCE_NAME, RAW_DIR, CLEANSED_DIR, TEMP_DIR, AZURE_STORAGE_CONNECTION_STRING, \
23 | AZURE_STORAGE_CONTAINER_NAME, DATA_DIR, UPLOAD_TO_AZURE, BACKUP_DIR, CURATED_DIR
24 | from common.logging import logger
25 |
26 | LOAD_TIMESTAMP_FORMAT = '%Y/%m/%d/%H-%M-%S'
27 | LOAD_DATE_FORMAT = '%Y/%m/%d'
28 |
29 | RAW_LAYER = 'raw'
30 | CLEANSED_LAYER = 'cleansed'
31 | CURATED_LAYER = 'curated'
32 | TEMP_LAYER = 'temp'
33 |
34 | LAYERS = [RAW_LAYER, CLEANSED_LAYER, CURATED_LAYER, TEMP_LAYER]
35 |
36 | LAYER_DIR = {
37 | RAW_LAYER: RAW_DIR,
38 | CLEANSED_LAYER: CLEANSED_DIR,
39 | CURATED_LAYER: CURATED_DIR,
40 | TEMP_LAYER: TEMP_DIR,
41 | }
42 |
43 | DOWNLOADED_JOB_DESCRIPTIONS_CSV = '11_downloaded_job_descriptions.csv'
44 | SITEMAP_URLS_CSV = '12_sitemap_urls.csv'
45 | JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV = '13_job_descriptions_to_download.csv'
46 | PARSED_JOB_DESCRIPTIONS_CSV = '21_parsed_job_descriptions.csv'
47 | JOB_DESCRIPTIONS_TO_PARSE_CSV = '22_job_descriptions_to_parse.csv'
48 | DOWNLOADED_SITEMAPS_CSV = '31_downloaded_sitemaps.csv'
49 | PARSED_SITEMAP_DATES_CSV = '32_parsed_sitemap_dates.csv'
50 | SITEMAPS_TO_PARSE_CSV = '33_sitemaps_to_parse.csv'
51 |
52 |
53 | def list_raw_files(data_source, entity: Entity, load_date=None):
54 | dir_path = os.path.join(RAW_DIR, data_source, entity.name)
55 | if load_date:
56 | dir_path = os.path.join(dir_path, load_date)
57 | file_list = [{
58 | 'load_timestamp': '/'.join(f.split('/')[-5:-1]),
59 | 'file_name': f.split('/')[-1],
60 | } for f in glob.iglob(dir_path + '/**/*', recursive=True) if os.path.isfile(f) and 'latest' not in f]
61 | return file_list
62 |
63 |
64 | def list_raw_days(data_source, entity: Entity):
65 | dir_path = os.path.join(RAW_DIR, data_source, entity.name)
66 | file_list = [{
67 | 'date': ''.join(f.split('/')[-3:]),
68 | } for f in glob.iglob(dir_path + '/*/*/*', recursive=True) if os.path.isdir(f) and 'latest' not in f]
69 | return file_list
70 |
71 |
72 | def list_backup_days(data_source, entity: Entity):
73 | dir_path = os.path.join(BACKUP_DIR, data_source, entity.name)
74 | file_list = [{
75 | 'date': f.split('.')[-3],
76 | } for f in glob.iglob(dir_path + '/**/*', recursive=True) if os.path.isfile(f)]
77 | return file_list
78 |
79 |
80 | def get_load_timestamp(ts=None):
81 | if ts is None:
82 | load_timestamp = datetime.datetime.today().strftime(LOAD_TIMESTAMP_FORMAT)
83 | else:
84 | load_timestamp = parser.parse(ts).strftime(LOAD_TIMESTAMP_FORMAT)
85 | return load_timestamp
86 |
87 |
88 | def get_load_date(ds=None):
89 | if ds is None:
90 | load_date = (datetime.datetime.today() - datetime.timedelta(days=1)).strftime(LOAD_DATE_FORMAT)
91 | else:
92 | load_date = parser.parse(ds).strftime(LOAD_DATE_FORMAT)
93 | return load_date
94 |
95 |
96 | def get_filters_from_load_date(load_date: str):
97 | year, month, day = load_date.split('/', 2)
98 | filters = [
99 | ('year', '=', int(year)),
100 | ('month', '=', int(month)),
101 | ('day', '=', int(day)),
102 | ]
103 | return filters
104 |
105 |
106 | def create_dir(file_path):
107 | dir_path = os.path.dirname(file_path)
108 | pathlib.Path(dir_path).mkdir(parents=True, exist_ok=True)
109 |
110 |
111 | def save_local_file(content, file_path):
112 | create_dir(file_path)
113 | file_type = "w" if isinstance(content, str) else "wb"
114 | with open(file_path, file_type) as f:
115 | f.write(content)
116 |
117 |
118 | def save_remote_file(content, blob_name):
119 | from azure.storage.blob import BlockBlobService
120 | logger.debug(f'save_remote_file start: {blob_name}')
121 | blob_service_client = BlockBlobService(connection_string=AZURE_STORAGE_CONNECTION_STRING)
122 | if isinstance(content, str):
123 | blob_service_client.create_blob_from_text(AZURE_STORAGE_CONTAINER_NAME, blob_name, content)
124 | else:
125 | blob_service_client.create_blob_from_bytes(AZURE_STORAGE_CONTAINER_NAME, blob_name, content)
126 | logger.success(f'save_remote_file end: {blob_name}')
127 |
128 |
129 | def save_raw_file(content, entity: Entity, load_timestamp: str, file_name):
130 | blob_name = os.path.join(RAW_LAYER, DATA_SOURCE_NAME, entity.name, load_timestamp, file_name)
131 | file_path = os.path.join(DATA_DIR, blob_name)
132 | save_local_file(content, file_path)
133 | if UPLOAD_TO_AZURE:
134 | save_remote_file(content, blob_name)
135 |
136 |
137 | def load_raw_file(entity: Entity, load_timestamp, file_name):
138 | file_path = os.path.join(LAYER_DIR[RAW_LAYER], DATA_SOURCE_NAME, entity.name, load_timestamp, file_name)
139 | with open(file_path, 'r') as f:
140 | content = f.read()
141 | return content
142 |
143 |
144 | def save_temp_df(df: pd.DataFrame, load_timestamp: str, file_name: str):
145 | temp_dir = os.path.join(TEMP_DIR, load_timestamp)
146 | if not os.path.exists(temp_dir):
147 | os.makedirs(temp_dir)
148 | # noinspection PyTypeChecker
149 | df.to_csv(os.path.join(temp_dir, file_name), index=False)
150 |
151 |
152 | def load_temp_df(load_timestamp: str, file_name: str) -> pd.DataFrame:
153 | return pd.read_csv(os.path.join(TEMP_DIR, load_timestamp, file_name))
154 |
155 |
156 | def list_parquet_files(layer, entity: Entity, relative_paths):
157 | dir_path = os.path.join(LAYER_DIR[layer], DATA_SOURCE_NAME, entity.name)
158 | file_list = [f for f in glob.iglob(dir_path + '/**/*.parquet', recursive=True) if os.path.isfile(f)]
159 | if relative_paths:
160 | file_list = [file_path.replace(dir_path + '/', '') for file_path in file_list]
161 | return file_list
162 |
163 |
164 | def list_cleansed_files(entity: Entity, relative_paths=True):
165 | return list_parquet_files(CLEANSED_LAYER, entity, relative_paths)
166 |
167 |
168 | def save_parquet_df(df: pd.DataFrame, layer, entity: Entity):
169 | # noinspection PyArgumentList
170 | table: pa.Table = pa.Table.from_pandas(df, preserve_index=False)
171 | root_path = os.path.join(LAYER_DIR[layer], DATA_SOURCE_NAME, entity.name)
172 | pq.write_to_dataset(table,
173 | root_path,
174 | partition_cols=['year', 'month', 'day'],
175 | basename_template='part-{i}.parquet',
176 | existing_data_behavior='delete_matching',
177 | use_legacy_dataset=False)
178 |
179 |
180 | def save_cleansed_df(df: pd.DataFrame, entity: Entity):
181 | save_parquet_df(df, CLEANSED_LAYER, entity)
182 |
183 |
184 | def save_curated_df(df: pd.DataFrame, entity: Entity):
185 | save_parquet_df(df, CURATED_LAYER, entity)
186 |
187 |
188 | def load_parquet_df(layer, entity: Entity, columns, filters) -> pd.DataFrame:
189 | # noinspection PyArgumentList
190 | root_path = os.path.join(LAYER_DIR[layer], DATA_SOURCE_NAME, entity.name)
191 | try:
192 | table = pq.read_table(root_path, columns=columns, filters=filters, use_legacy_dataset=False)
193 | return table.to_pandas()
194 | except (FileNotFoundError, ArrowInvalid):
195 | return pd.DataFrame(columns=columns)
196 |
197 |
198 | def load_cleansed_df(entity: Entity, columns=None, filters=None, load_date=None) -> pd.DataFrame:
199 | if filters is None and load_date is not None:
200 | filters = get_filters_from_load_date(load_date)
201 | return load_parquet_df(CLEANSED_LAYER, entity, columns, filters)
202 |
--------------------------------------------------------------------------------
/python/simplescraper/explore/explore_dwh_mart.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "pycharm": {
8 | "name": "#%%\n"
9 | }
10 | },
11 | "outputs": [],
12 | "source": [
13 | "from common.explore import display_sql"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 2,
19 | "outputs": [
20 | {
21 | "data": {
22 | "text/plain": " job_id total\n0 7543521 12\n1 7369771 10\n2 7723680 9\n3 7599993 8\n4 7571802 8",
23 | "text/html": "\n\n
\n \n \n | \n job_id | \n total | \n
\n \n \n \n | 0 | \n 7543521 | \n 12 | \n
\n \n | 1 | \n 7369771 | \n 10 | \n
\n \n | 2 | \n 7723680 | \n 9 | \n
\n \n | 3 | \n 7599993 | \n 8 | \n
\n \n | 4 | \n 7571802 | \n 8 | \n
\n \n
\n
"
24 | },
25 | "execution_count": 2,
26 | "metadata": {},
27 | "output_type": "execute_result"
28 | }
29 | ],
30 | "source": [
31 | "display_sql(f'''\n",
32 | "SELECT job_id,\n",
33 | " COUNT(1) AS total\n",
34 | " FROM curated.job\n",
35 | " GROUP BY 1\n",
36 | " ORDER BY 2 DESC\n",
37 | " LIMIT 5\n",
38 | "''')"
39 | ],
40 | "metadata": {
41 | "collapsed": false,
42 | "pycharm": {
43 | "name": "#%%\n"
44 | }
45 | }
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {
51 | "pycharm": {
52 | "name": "#%%\n"
53 | }
54 | },
55 | "outputs": [
56 | {
57 | "data": {
58 | "text/plain": " load_timestamp title \\\n0 2022-01-26 16:26:20 Vertriebsmitarbeiter/in Innendienst (m/w/d) \n1 2022-01-20 10:00:00 Innendienst Vertrieb Ausstellung (m/w/d) \n2 2022-01-10 19:00:00 Mitarbeiter/in Vertrieb Ausstellung (m/w/d) \n3 2022-01-08 13:00:00 Berater Ausstellung (m/w/d) \n4 2021-12-18 14:00:00 Verkaufsberater Ausstellung (m/w/d) \n5 2021-11-20 11:00:00 Berater Ausstellung (m/w/d) \n6 2021-11-12 16:00:00 Fachberater Ausstellung (m/w/d) \n7 2021-11-10 17:00:00 Fachberater - Glaser / Schreiner (m/w/d) \n8 2021-10-14 21:00:00 Kaufmännische/r Angestellte/r (m/w/d) \n9 2021-10-07 08:00:00 Kaufmännische/r Angestellte/r (m/w/d) \n10 2021-10-06 11:00:00 Kaufmännischer Angestellter (m/w/d) \n11 2021-10-05 08:00:00 Kaufmännischer Angestellter (m/w/d) \n\n online_date \n0 2022-01-02T13:03:06Z \n1 2022-01-02T13:03:06Z \n2 2022-01-02T13:03:06Z \n3 2022-01-02T13:03:06Z \n4 2021-12-18T13:03:05Z \n5 2021-11-13T17:03:10Z \n6 2021-10-29T15:30:01Z \n7 2021-10-29T15:30:01Z \n8 2021-10-06T15:03:04Z \n9 2021-10-06T15:03:04Z \n10 2021-09-21T14:32:36Z \n11 2021-09-21T14:32:36Z ",
59 | "text/html": "\n\n
\n \n \n | \n load_timestamp | \n title | \n online_date | \n
\n \n \n \n | 0 | \n 2022-01-26 16:26:20 | \n Vertriebsmitarbeiter/in Innendienst (m/w/d) | \n 2022-01-02T13:03:06Z | \n
\n \n | 1 | \n 2022-01-20 10:00:00 | \n Innendienst Vertrieb Ausstellung (m/w/d) | \n 2022-01-02T13:03:06Z | \n
\n \n | 2 | \n 2022-01-10 19:00:00 | \n Mitarbeiter/in Vertrieb Ausstellung (m/w/d) | \n 2022-01-02T13:03:06Z | \n
\n \n | 3 | \n 2022-01-08 13:00:00 | \n Berater Ausstellung (m/w/d) | \n 2022-01-02T13:03:06Z | \n
\n \n | 4 | \n 2021-12-18 14:00:00 | \n Verkaufsberater Ausstellung (m/w/d) | \n 2021-12-18T13:03:05Z | \n
\n \n | 5 | \n 2021-11-20 11:00:00 | \n Berater Ausstellung (m/w/d) | \n 2021-11-13T17:03:10Z | \n
\n \n | 6 | \n 2021-11-12 16:00:00 | \n Fachberater Ausstellung (m/w/d) | \n 2021-10-29T15:30:01Z | \n
\n \n | 7 | \n 2021-11-10 17:00:00 | \n Fachberater - Glaser / Schreiner (m/w/d) | \n 2021-10-29T15:30:01Z | \n
\n \n | 8 | \n 2021-10-14 21:00:00 | \n Kaufmännische/r Angestellte/r (m/w/d) | \n 2021-10-06T15:03:04Z | \n
\n \n | 9 | \n 2021-10-07 08:00:00 | \n Kaufmännische/r Angestellte/r (m/w/d) | \n 2021-10-06T15:03:04Z | \n
\n \n | 10 | \n 2021-10-06 11:00:00 | \n Kaufmännischer Angestellter (m/w/d) | \n 2021-09-21T14:32:36Z | \n
\n \n | 11 | \n 2021-10-05 08:00:00 | \n Kaufmännischer Angestellter (m/w/d) | \n 2021-09-21T14:32:36Z | \n
\n \n
\n
"
60 | },
61 | "execution_count": 3,
62 | "metadata": {},
63 | "output_type": "execute_result"
64 | }
65 | ],
66 | "source": [
67 | "display_sql(f'''\n",
68 | "SELECT load_timestamp,\n",
69 | " title,\n",
70 | " online_date\n",
71 | " FROM curated.job\n",
72 | " WHERE job_id = 7543521\n",
73 | " ORDER BY load_timestamp DESC\n",
74 | " LIMIT 20\n",
75 | "''')"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 4,
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/plain": " job_id load_timestamp\n0 7543521 2022-01-26 16:26:20",
85 | "text/html": "\n\n
\n \n \n | \n job_id | \n load_timestamp | \n
\n \n \n \n | 0 | \n 7543521 | \n 2022-01-26 16:26:20 | \n
\n \n
\n
"
86 | },
87 | "execution_count": 4,
88 | "metadata": {},
89 | "output_type": "execute_result"
90 | }
91 | ],
92 | "source": [
93 | "display_sql(f'''\n",
94 | "SELECT job_id,\n",
95 | " load_timestamp\n",
96 | "FROM (\n",
97 | " SELECT j.*,\n",
98 | " row_number()\n",
99 | " OVER (\n",
100 | " PARTITION BY job_id ORDER BY load_timestamp DESC\n",
101 | " ) AS seqnum\n",
102 | " FROM curated.job j\n",
103 | " WHERE job_id = 7543521\n",
104 | ") j\n",
105 | "WHERE seqnum = 1;\n",
106 | "''')\n"
107 | ],
108 | "metadata": {
109 | "collapsed": false,
110 | "pycharm": {
111 | "name": "#%%\n"
112 | }
113 | }
114 | }
115 | ],
116 | "metadata": {
117 | "kernelspec": {
118 | "display_name": "Python 3 (ipykernel)",
119 | "language": "python",
120 | "name": "python3"
121 | },
122 | "language_info": {
123 | "codemirror_mode": {
124 | "name": "ipython",
125 | "version": 3
126 | },
127 | "file_extension": ".py",
128 | "mimetype": "text/x-python",
129 | "name": "python",
130 | "nbconvert_exporter": "python",
131 | "pygments_lexer": "ipython3",
132 | "version": "3.10.6"
133 | }
134 | },
135 | "nbformat": 4,
136 | "nbformat_minor": 1
137 | }
--------------------------------------------------------------------------------
/python/simplescraper/flasky.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 |
4 | from flask import Flask, request, Request
5 |
6 | from common.env_variables import SOURCE_DIR
7 | from common.logging import logger
8 | from common.storage import get_load_timestamp, get_load_date
9 | from tasks.cleanse_job_descriptions import cleanse_job_descriptions
10 | from tasks.cleanse_sitemaps import cleanse_sitemaps
11 | from tasks.curate_job_descriptions import curate_job_descriptions
12 | from tasks.curate_sitemaps import curate_sitemaps
13 | from tasks.download_job_descriptions import download_job_descriptions
14 | from tasks.download_sitemap import download_sitemap
15 | from tasks.list_downloaded_job_descriptions import list_downloaded_job_descriptions
16 | from tasks.list_job_descriptions_to_download import list_job_descriptions_to_download
17 | from tasks.prune_old_raw import prune_old_raw
18 |
19 | SUCCESS_RETURN_CODE = 0
20 |
21 | DEFAULT_DATA_INTERVAL_END = '2022-09-08T00:00:00+00:00'
22 | DEFAULT_DS = '2022-09-07'
23 |
24 | SUCCESS = {'result_status': 'success', }, 200
25 |
26 | HTML_FORM = f'''
27 | <form method="post">
28 | data_interval_end: <input type="text" name="data_interval_end" value="{DEFAULT_DATA_INTERVAL_END}"><br>
29 | ds: <input type="text" name="ds" value="{DEFAULT_DS}"><br>
30 | <input type="submit" value="Run">
31 | </form>
32 | '''
33 |
34 |
35 | def is_connected_to_vpn():
36 | return os.system('/usr/sbin/scutil --nc list | grep Connected | grep vpn') == 0
37 |
38 |
39 | class RequestParams:
40 | def __init__(self, _request: Request):
41 | form = _request.form
42 | self.load_timestamp = get_load_timestamp(form.get('data_interval_end'))
43 | self.load_date = get_load_date(form.get('ds'))
44 | logger.info(self.__dict__)
45 |
46 |
47 | app = Flask(__name__)
48 |
49 |
50 | @app.route('/')
51 | def index():
52 | return '<a href="/do/check_vpn_status">Check VPN Status</a><br>' \
53 | '<a href="/do/list_downloaded_job_descriptions">List Downloaded Descriptions</a><br>' \
54 | '<a href="/do/download_sitemap">Download Sitemap</a><br>' \
55 | '<a href="/do/list_job_descriptions_to_download">List Job Descriptions to Download</a><br>' \
56 | '<a href="/do/download_job_descriptions">Download Job Descriptions</a><br>' \
57 | '<a href="/do/cleanse_sitemaps">Cleanse Sitemap</a><br>' \
58 | '<a href="/do/cleanse_job_descriptions">Cleanse Job Descriptions</a><br>' \
59 | '<a href="/do/do_dbt_run">Do dbt run</a><br>' \
60 | '<a href="/do/do_day_backup">Do Day Backup</a><br>' \
61 | '<a href="/do/verify_day_backup">Validate Day Backup</a><br>' \
62 | '<a href="/do/test">Test</a>'
63 |
64 |
65 | @app.route('/do/check_vpn_status')
66 | def do_check_vpn_status():
67 | logger.info('is_connected_to_vpn: start')
68 | is_connected = is_connected_to_vpn()
69 | logger.info('is_connected_to_vpn: end')
70 | if is_connected:
71 | return SUCCESS
72 | else:
73 | return {'result_status': 'failed'}, 400
74 |
75 |
76 | @app.route('/do/list_downloaded_job_descriptions', methods=['GET', 'POST'])
77 | def do_list_downloaded_urls():
78 | if request.method == 'POST':
79 | params = RequestParams(request)
80 | list_downloaded_job_descriptions(params.load_timestamp)
81 | return SUCCESS
82 | elif request.method == 'GET':
83 | return HTML_FORM
84 |
85 |
86 | @app.route('/do/download_sitemap', methods=['GET', 'POST'])
87 | def do_download_sitemap():
88 | if request.method == 'POST':
89 | if is_connected_to_vpn():
90 | params = RequestParams(request)
91 | download_sitemap(params.load_timestamp)
92 | return {'result_status': 'success'}, 200
93 | else:
94 | return {'result_status': 'failed'}, 400
95 | elif request.method == 'GET':
96 | return HTML_FORM
97 |
98 |
99 | @app.route('/do/list_job_descriptions_to_download', methods=['GET', 'POST'])
100 | def do_list_job_descriptions_to_download():
101 | if request.method == 'POST':
102 | if is_connected_to_vpn():
103 | params = RequestParams(request)
104 | list_job_descriptions_to_download(params.load_timestamp)
105 | return SUCCESS
106 | else:
107 | return {'result_status': 'failed'}, 400
108 | elif request.method == 'GET':
109 | return HTML_FORM
110 |
111 |
112 | @app.route('/do/download_job_descriptions', methods=['GET', 'POST'])
113 | def do_download_job_descriptions():
114 | if request.method == 'POST':
115 | if is_connected_to_vpn():
116 | params = RequestParams(request)
117 | download_job_descriptions(params.load_timestamp)
118 | return SUCCESS
119 | else:
120 | return {'result_status': 'failed'}, 400
121 | elif request.method == 'GET':
122 | return HTML_FORM
123 |
124 |
125 | @app.route('/do/cleanse_sitemaps', methods=['GET', 'POST'])
126 | def do_cleanse_sitemaps():
127 | if request.method == 'POST':
128 | params = RequestParams(request)
129 | cleanse_sitemaps(params.load_timestamp, params.load_date)
130 | return SUCCESS
131 | elif request.method == 'GET':
132 | return HTML_FORM
133 |
134 |
135 | @app.route('/do/cleanse_job_descriptions', methods=['GET', 'POST'])
136 | def do_cleanse_job_descriptions():
137 | if request.method == 'POST':
138 | params = RequestParams(request)
139 | cleanse_job_descriptions(params.load_timestamp, params.load_date)
140 | return SUCCESS
141 | elif request.method == 'GET':
142 | return HTML_FORM
143 |
144 |
145 | @app.route('/do/curate_sitemaps', methods=['GET', 'POST'])
146 | def do_curate_sitemaps():
147 | if request.method == 'POST':
148 | params = RequestParams(request)
149 | curate_sitemaps(params.load_timestamp, params.load_date)
150 | return SUCCESS
151 | elif request.method == 'GET':
152 | return HTML_FORM
153 |
154 |
155 | @app.route('/do/curate_job_descriptions', methods=['GET', 'POST'])
156 | def do_curate_job_descriptions():
157 | if request.method == 'POST':
158 | params = RequestParams(request)
159 | curate_job_descriptions(params.load_timestamp, params.load_date)
160 | return SUCCESS
161 | elif request.method == 'GET':
162 | return HTML_FORM
163 |
164 |
165 | @app.route('/do/do_day_backup', methods=['GET', 'POST'])
166 | def do_do_day_backup():
167 | if request.method == 'POST':
168 | params = RequestParams(request)
169 | year, month, day = params.load_date.split('/')
170 | result = subprocess.run([f'{SOURCE_DIR}/simplescraper/do_day_backup.sh', year, month, day])
171 | if result.returncode == SUCCESS_RETURN_CODE:
172 | return SUCCESS
173 | else:
174 | return {
175 | 'result_status': 'error',
176 | }, 400
177 | elif request.method == 'GET':
178 | return HTML_FORM
179 |
180 |
181 | @app.route('/do/do_dbt_run', methods=['GET', 'POST'])
182 | def do_dbt_run():
183 | if request.method == 'POST':
184 | _ = RequestParams(request)
185 | result = subprocess.run([f'{SOURCE_DIR}/simplescraper/do_dbt_run.sh'])
186 | if result.returncode == SUCCESS_RETURN_CODE:
187 | return SUCCESS
188 | else:
189 | return {
190 | 'result_status': 'error',
191 | }, 400
192 | elif request.method == 'GET':
193 | return HTML_FORM
194 |
195 |
196 | @app.route('/do/verify_day_backup', methods=['GET', 'POST'])
197 | def do_verify_day_backup():
198 | if request.method == 'POST':
199 | params = RequestParams(request)
200 | year, month, day = params.load_date.split('/')
201 | result = subprocess.run([f'{SOURCE_DIR}/simplescraper/verify_day_backup.sh', year, month, day])
202 | if result.returncode == SUCCESS_RETURN_CODE:
203 | return SUCCESS
204 | else:
205 | return {
206 | 'result_status': 'error',
207 | }, 400
208 | elif request.method == 'GET':
209 | return HTML_FORM
210 |
211 |
212 | @app.route('/do/prune_old_raw', methods=['GET', 'POST'])
213 | def do_prune_old_raw():
214 | if request.method == 'POST':
215 | params = RequestParams(request)
216 | prune_old_raw(params.load_timestamp, params.load_date)
217 | return SUCCESS
218 | elif request.method == 'GET':
219 | return HTML_FORM
220 |
221 |
222 | @app.route('/do/test', methods=['GET', 'POST'])
223 | def do_test():
224 | if request.method == 'POST':
225 | params = RequestParams(request)
226 | return {
227 | 'result_status': 'success',
228 | 'load_timestamp': params.load_timestamp,
229 | 'load_date': params.load_date,
230 | }, 200
231 | elif request.method == 'GET':
232 | return HTML_FORM
233 |
--------------------------------------------------------------------------------
/python/simplescraper/explore/explore_dwh_mart_dim_time.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "pycharm": {
8 | "name": "#%%\n"
9 | }
10 | },
11 | "outputs": [],
12 | "source": [
13 | "from common.explore import display_sql"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 2,
19 | "outputs": [
20 | {
21 | "data": {
22 | "text/plain": " date_key year month day month_name year_week day_of_week \\\n0 2021-10-09 2021 10 9 October 202140 6 \n1 2021-10-10 2021 10 10 October 202140 7 \n2 2021-10-11 2021 10 11 October 202141 1 \n3 2021-10-12 2021 10 12 October 202141 2 \n4 2021-10-13 2021 10 13 October 202141 3 \n.. ... ... ... ... ... ... ... \n353 2022-09-27 2022 9 27 September 202239 2 \n354 2022-09-28 2022 9 28 September 202239 3 \n355 2022-09-29 2022 9 29 September 202239 4 \n356 2022-09-30 2022 9 30 September 202239 5 \n357 2022-10-01 2022 10 1 October 202239 6 \n\n day_of_week_name \n0 Saturday \n1 Sunday \n2 Monday \n3 Tuesday \n4 Wednesday \n.. ... \n353 Tuesday \n354 Wednesday \n355 Thursday \n356 Friday \n357 Saturday \n\n[358 rows x 8 columns]",
23 | "text/html": "\n\n
\n \n \n | \n date_key | \n year | \n month | \n day | \n month_name | \n year_week | \n day_of_week | \n day_of_week_name | \n
\n \n \n \n | 0 | \n 2021-10-09 | \n 2021 | \n 10 | \n 9 | \n October | \n 202140 | \n 6 | \n Saturday | \n
\n \n | 1 | \n 2021-10-10 | \n 2021 | \n 10 | \n 10 | \n October | \n 202140 | \n 7 | \n Sunday | \n
\n \n | 2 | \n 2021-10-11 | \n 2021 | \n 10 | \n 11 | \n October | \n 202141 | \n 1 | \n Monday | \n
\n \n | 3 | \n 2021-10-12 | \n 2021 | \n 10 | \n 12 | \n October | \n 202141 | \n 2 | \n Tuesday | \n
\n \n | 4 | \n 2021-10-13 | \n 2021 | \n 10 | \n 13 | \n October | \n 202141 | \n 3 | \n Wednesday | \n
\n \n | ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n
\n \n | 353 | \n 2022-09-27 | \n 2022 | \n 9 | \n 27 | \n September | \n 202239 | \n 2 | \n Tuesday | \n
\n \n | 354 | \n 2022-09-28 | \n 2022 | \n 9 | \n 28 | \n September | \n 202239 | \n 3 | \n Wednesday | \n
\n \n | 355 | \n 2022-09-29 | \n 2022 | \n 9 | \n 29 | \n September | \n 202239 | \n 4 | \n Thursday | \n
\n \n | 356 | \n 2022-09-30 | \n 2022 | \n 9 | \n 30 | \n September | \n 202239 | \n 5 | \n Friday | \n
\n \n | 357 | \n 2022-10-01 | \n 2022 | \n 10 | \n 1 | \n October | \n 202239 | \n 6 | \n Saturday | \n
\n \n
\n
358 rows × 8 columns
\n
"
24 | },
25 | "execution_count": 2,
26 | "metadata": {},
27 | "output_type": "execute_result"
28 | }
29 | ],
30 | "source": [
31 | "display_sql(f'''\n",
32 | "WITH unique_online_at AS (\n",
33 | " SELECT DISTINCT online_at\n",
34 | " FROM curated.online_job\n",
35 | " ORDER BY 1\n",
36 | ")\n",
37 | "SELECT online_at as date_key,\n",
38 | " date_part('year', online_at) as year,\n",
39 | " date_part('month', online_at) as month,\n",
40 | " date_part('day', online_at) as day,\n",
41 | " monthname(online_at) as month_name,\n",
42 | " date_part('yearweek', online_at) as year_week,\n",
43 | " date_part('isodow', online_at) as day_of_week,\n",
44 | " dayname(online_at) as day_of_week_name\n",
45 | " FROM unique_online_at\n",
46 | "''')\n"
47 | ],
48 | "metadata": {
49 | "collapsed": false,
50 | "pycharm": {
51 | "name": "#%%\n"
52 | }
53 | }
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 3,
58 | "outputs": [
59 | {
60 | "data": {
61 | "text/plain": " date_key year month day month_name year_week day_of_week \\\n0 2021-10-09 2021 10 9 October 202140 6 \n1 2021-10-10 2021 10 10 October 202140 7 \n2 2021-10-11 2021 10 11 October 202141 1 \n3 2021-10-12 2021 10 12 October 202141 2 \n4 2021-10-13 2021 10 13 October 202141 3 \n.. ... ... ... ... ... ... ... \n353 2022-09-27 2022 9 27 September 202239 2 \n354 2022-09-28 2022 9 28 September 202239 3 \n355 2022-09-29 2022 9 29 September 202239 4 \n356 2022-09-30 2022 9 30 September 202239 5 \n357 2022-10-01 2022 10 1 October 202239 6 \n\n day_of_week_name \n0 Saturday \n1 Sunday \n2 Monday \n3 Tuesday \n4 Wednesday \n.. ... \n353 Tuesday \n354 Wednesday \n355 Thursday \n356 Friday \n357 Saturday \n\n[358 rows x 8 columns]",
62 | "text/html": "\n\n
\n \n \n | \n date_key | \n year | \n month | \n day | \n month_name | \n year_week | \n day_of_week | \n day_of_week_name | \n
\n \n \n \n | 0 | \n 2021-10-09 | \n 2021 | \n 10 | \n 9 | \n October | \n 202140 | \n 6 | \n Saturday | \n
\n \n | 1 | \n 2021-10-10 | \n 2021 | \n 10 | \n 10 | \n October | \n 202140 | \n 7 | \n Sunday | \n
\n \n | 2 | \n 2021-10-11 | \n 2021 | \n 10 | \n 11 | \n October | \n 202141 | \n 1 | \n Monday | \n
\n \n | 3 | \n 2021-10-12 | \n 2021 | \n 10 | \n 12 | \n October | \n 202141 | \n 2 | \n Tuesday | \n
\n \n | 4 | \n 2021-10-13 | \n 2021 | \n 10 | \n 13 | \n October | \n 202141 | \n 3 | \n Wednesday | \n
\n \n | ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n
\n \n | 353 | \n 2022-09-27 | \n 2022 | \n 9 | \n 27 | \n September | \n 202239 | \n 2 | \n Tuesday | \n
\n \n | 354 | \n 2022-09-28 | \n 2022 | \n 9 | \n 28 | \n September | \n 202239 | \n 3 | \n Wednesday | \n
\n \n | 355 | \n 2022-09-29 | \n 2022 | \n 9 | \n 29 | \n September | \n 202239 | \n 4 | \n Thursday | \n
\n \n | 356 | \n 2022-09-30 | \n 2022 | \n 9 | \n 30 | \n September | \n 202239 | \n 5 | \n Friday | \n
\n \n | 357 | \n 2022-10-01 | \n 2022 | \n 10 | \n 1 | \n October | \n 202239 | \n 6 | \n Saturday | \n
\n \n
\n
358 rows × 8 columns
\n
"
63 | },
64 | "execution_count": 3,
65 | "metadata": {},
66 | "output_type": "execute_result"
67 | }
68 | ],
69 | "source": [
70 | "display_sql(f'''\n",
71 | "SELECT *\n",
72 | " FROM dim_time\n",
73 | "''')"
74 | ],
75 | "metadata": {
76 | "collapsed": false,
77 | "pycharm": {
78 | "name": "#%%\n"
79 | }
80 | }
81 | }
82 | ],
83 | "metadata": {
84 | "kernelspec": {
85 | "display_name": "Python 3 (ipykernel)",
86 | "language": "python",
87 | "name": "python3"
88 | },
89 | "language_info": {
90 | "codemirror_mode": {
91 | "name": "ipython",
92 | "version": 3
93 | },
94 | "file_extension": ".py",
95 | "mimetype": "text/x-python",
96 | "name": "python",
97 | "nbconvert_exporter": "python",
98 | "pygments_lexer": "ipython3",
99 | "version": "3.10.6"
100 | }
101 | },
102 | "nbformat": 4,
103 | "nbformat_minor": 1
104 | }
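
Note: the two cells above first derive the dim_time calendar attributes (year, month, day, month_name, year_week, ISO day of week and its name) from the distinct online_at dates in curated.online_job, and then read the materialized dim_time back. The snippet below is a minimal standalone sketch, not a cell or file from the repository, showing how the same derivation could be re-run and sanity-checked outside the notebook; the warehouse path is an assumption (the explore notebooks normally take it from common.env_variables.DUCKDB_DWH_FILE).

import duckdb

# Assumed local path to the DuckDB warehouse file; adjust to your DUCKDB_DWH_FILE.
DWH_FILE = 'dwh.duckdb'

# Same derivation as the display_sql() cell above.
DIM_TIME_SQL = '''
WITH unique_online_at AS (
    SELECT DISTINCT online_at
    FROM curated.online_job
    ORDER BY 1
)
SELECT online_at                        AS date_key,
       date_part('year', online_at)     AS year,
       date_part('month', online_at)    AS month,
       date_part('day', online_at)      AS day,
       monthname(online_at)             AS month_name,
       date_part('yearweek', online_at) AS year_week,
       date_part('isodow', online_at)   AS day_of_week,
       dayname(online_at)               AS day_of_week_name
  FROM unique_online_at
'''

conn = duckdb.connect(DWH_FILE, read_only=True)
dim_time = conn.execute(DIM_TIME_SQL).df()
conn.close()

# ISO day-of-week values must stay within 1 (Monday) .. 7 (Sunday).
assert dim_time['day_of_week'].between(1, 7).all()
print(dim_time.head())

Running this derivation side by side with the SELECT * FROM dim_time cell makes it easy to confirm that the materialized table matches the ad-hoc query.
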
--------------------------------------------------------------------------------
/python/simplescraper/requirements.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile with python 3.10
3 | # To update, run:
4 | #
5 | # pip-compile --allow-unsafe requirements.in
6 | #
7 | appnope==0.1.3
8 | # via
9 | # ipykernel
10 | # ipython
11 | argon2-cffi==21.3.0
12 | # via notebook
13 | argon2-cffi-bindings==21.2.0
14 | # via argon2-cffi
15 | astor==0.8.1
16 | # via wemake-python-styleguide
17 | asttokens==2.0.8
18 | # via stack-data
19 | attrs==22.1.0
20 | # via
21 | # flake8-bugbear
22 | # flake8-eradicate
23 | # jsonschema
24 | # pytest
25 | # wemake-python-styleguide
26 | azure-common==1.1.28
27 | # via
28 | # azure-storage-blob
29 | # azure-storage-common
30 | azure-storage-blob==2.1.0
31 | # via -r requirements.in
32 | azure-storage-common==2.1.0
33 | # via azure-storage-blob
34 | backcall==0.2.0
35 | # via ipython
36 | bandit==1.7.4
37 | # via flake8-bandit
38 | beautifulsoup4==4.11.1
39 | # via
40 | # -r requirements.in
41 | # nbconvert
42 | bleach==5.0.1
43 | # via nbconvert
44 | build==0.8.0
45 | # via pip-tools
46 | certifi==2022.6.15.1
47 | # via requests
48 | cffi==1.15.1
49 | # via
50 | # argon2-cffi-bindings
51 | # cryptography
52 | charset-normalizer==2.1.1
53 | # via requests
54 | click==8.1.3
55 | # via
56 | # flask
57 | # pip-tools
58 | cryptography==38.0.1
59 | # via azure-storage-common
60 | darglint==1.8.1
61 | # via wemake-python-styleguide
62 | debugpy==1.6.3
63 | # via ipykernel
64 | decorator==5.1.1
65 | # via ipython
66 | defusedxml==0.7.1
67 | # via nbconvert
68 | docutils==0.19
69 | # via restructuredtext-lint
70 | duckdb==0.7.0
71 | # via -r requirements.in
72 | entrypoints==0.4
73 | # via jupyter-client
74 | eradicate==2.1.0
75 | # via flake8-eradicate
76 | executing==1.0.0
77 | # via stack-data
78 | fastjsonschema==2.16.1
79 | # via nbformat
80 | flake8==4.0.1
81 | # via
82 | # flake8-bandit
83 | # flake8-broken-line
84 | # flake8-bugbear
85 | # flake8-commas
86 | # flake8-comprehensions
87 | # flake8-debugger
88 | # flake8-docstrings
89 | # flake8-eradicate
90 | # flake8-isort
91 | # flake8-polyfill
92 | # flake8-quotes
93 | # flake8-rst-docstrings
94 | # flake8-string-format
95 | # pep8-naming
96 | # wemake-python-styleguide
97 | flake8-bandit==3.0.0
98 | # via wemake-python-styleguide
99 | flake8-broken-line==0.4.0
100 | # via wemake-python-styleguide
101 | flake8-bugbear==22.9.11
102 | # via wemake-python-styleguide
103 | flake8-commas==2.1.0
104 | # via wemake-python-styleguide
105 | flake8-comprehensions==3.10.0
106 | # via wemake-python-styleguide
107 | flake8-debugger==4.1.2
108 | # via wemake-python-styleguide
109 | flake8-docstrings==1.6.0
110 | # via wemake-python-styleguide
111 | flake8-eradicate==1.3.0
112 | # via wemake-python-styleguide
113 | flake8-isort==4.2.0
114 | # via wemake-python-styleguide
115 | flake8-polyfill==1.0.2
116 | # via
117 | # flake8-bandit
118 | # pep8-naming
119 | flake8-quotes==3.3.1
120 | # via wemake-python-styleguide
121 | flake8-rst-docstrings==0.2.7
122 | # via wemake-python-styleguide
123 | flake8-string-format==0.3.0
124 | # via wemake-python-styleguide
125 | flask==2.2.2
126 | # via -r requirements.in
127 | gitdb==4.0.9
128 | # via gitpython
129 | gitpython==3.1.27
130 | # via bandit
131 | greenlet==2.0.1
132 | # via playwright
133 | gunicorn==20.1.0
134 | # via -r requirements.in
135 | idna==3.3
136 | # via requests
137 | iniconfig==1.1.1
138 | # via pytest
139 | ipykernel==6.15.2
140 | # via
141 | # ipywidgets
142 | # jupyter
143 | # jupyter-console
144 | # notebook
145 | # qtconsole
146 | ipython==8.5.0
147 | # via
148 | # ipykernel
149 | # ipywidgets
150 | # jupyter-console
151 | ipython-genutils==0.2.0
152 | # via
153 | # notebook
154 | # qtconsole
155 | ipywidgets==8.0.2
156 | # via jupyter
157 | isort==5.10.1
158 | # via flake8-isort
159 | itsdangerous==2.1.2
160 | # via flask
161 | jedi==0.18.1
162 | # via ipython
163 | jinja2==3.1.2
164 | # via
165 | # flask
166 | # nbconvert
167 | # notebook
168 | jsonschema==4.16.0
169 | # via nbformat
170 | jupyter==1.0.0
171 | # via -r requirements.in
172 | jupyter-client==7.3.5
173 | # via
174 | # ipykernel
175 | # jupyter-console
176 | # nbclient
177 | # notebook
178 | # qtconsole
179 | jupyter-console==6.4.4
180 | # via jupyter
181 | jupyter-core==4.11.1
182 | # via
183 | # jupyter-client
184 | # nbconvert
185 | # nbformat
186 | # notebook
187 | # qtconsole
188 | jupyterlab-pygments==0.2.2
189 | # via nbconvert
190 | jupyterlab-widgets==3.0.3
191 | # via ipywidgets
192 | kaleido==0.2.1
193 | # via -r requirements.in
194 | loguru==0.6.0
195 | # via -r requirements.in
196 | lxml==4.9.1
197 | # via
198 | # -r requirements.in
199 | # nbconvert
200 | markupsafe==2.1.1
201 | # via
202 | # jinja2
203 | # nbconvert
204 | # werkzeug
205 | matplotlib-inline==0.1.6
206 | # via
207 | # ipykernel
208 | # ipython
209 | mccabe==0.6.1
210 | # via flake8
211 | mistune==2.0.4
212 | # via nbconvert
213 | nbclient==0.6.8
214 | # via nbconvert
215 | nbconvert==7.0.0
216 | # via
217 | # jupyter
218 | # notebook
219 | nbformat==5.4.0
220 | # via
221 | # nbclient
222 | # nbconvert
223 | # notebook
224 | nest-asyncio==1.5.5
225 | # via
226 | # ipykernel
227 | # jupyter-client
228 | # nbclient
229 | # notebook
230 | notebook==6.4.12
231 | # via jupyter
232 | numpy==1.23.3
233 | # via
234 | # pandas
235 | # patsy
236 | # plotly-calplot
237 | # plotly-express
238 | # pyarrow
239 | # scipy
240 | # statsmodels
241 | packaging==21.3
242 | # via
243 | # build
244 | # ipykernel
245 | # nbconvert
246 | # pytest
247 | # qtpy
248 | # statsmodels
249 | pandas==1.4.4
250 | # via
251 | # -r requirements.in
252 | # plotly-calplot
253 | # plotly-express
254 | # statsmodels
255 | pandocfilters==1.5.0
256 | # via nbconvert
257 | parso==0.8.3
258 | # via jedi
259 | patsy==0.5.2
260 | # via
261 | # plotly-express
262 | # statsmodels
263 | pbr==5.10.0
264 | # via stevedore
265 | pep517==0.13.0
266 | # via build
267 | pep8-naming==0.12.1
268 | # via wemake-python-styleguide
269 | pexpect==4.8.0
270 | # via ipython
271 | pickleshare==0.7.5
272 | # via ipython
273 | pip-tools==6.8.0
274 | # via -r requirements.in
275 | playwright==1.30.0
276 | # via -r requirements.in
277 | plotly==5.10.0
278 | # via
279 | # plotly-calplot
280 | # plotly-express
281 | plotly-calplot==0.1.12
282 | # via -r requirements.in
283 | plotly-express==0.4.1
284 | # via -r requirements.in
285 | pluggy==1.0.0
286 | # via pytest
287 | prometheus-client==0.14.1
288 | # via notebook
289 | prompt-toolkit==3.0.31
290 | # via
291 | # ipython
292 | # jupyter-console
293 | psutil==5.9.2
294 | # via ipykernel
295 | ptyprocess==0.7.0
296 | # via
297 | # pexpect
298 | # terminado
299 | pure-eval==0.2.2
300 | # via stack-data
301 | py==1.11.0
302 | # via pytest
303 | pyarrow==9.0.0
304 | # via -r requirements.in
305 | pycodestyle==2.8.0
306 | # via
307 | # flake8
308 | # flake8-bandit
309 | # flake8-debugger
310 | pycparser==2.21
311 | # via cffi
312 | pydocstyle==6.1.1
313 | # via flake8-docstrings
314 | pyee==9.0.4
315 | # via playwright
316 | pyflakes==2.4.0
317 | # via flake8
318 | pygments==2.13.0
319 | # via
320 | # flake8-rst-docstrings
321 | # ipython
322 | # jupyter-console
323 | # nbconvert
324 | # qtconsole
325 | # wemake-python-styleguide
326 | pyparsing==3.0.9
327 | # via packaging
328 | pyrsistent==0.18.1
329 | # via jsonschema
330 | pytest==7.1.3
331 | # via -r requirements.in
332 | python-dateutil==2.8.2
333 | # via
334 | # azure-storage-common
335 | # jupyter-client
336 | # pandas
337 | python-dotenv==0.21.0
338 | # via -r requirements.in
339 | pytz==2022.2.1
340 | # via pandas
341 | pyyaml==6.0
342 | # via bandit
343 | pyzmq==23.2.1
344 | # via
345 | # ipykernel
346 | # jupyter-client
347 | # notebook
348 | # qtconsole
349 | qtconsole==5.3.2
350 | # via jupyter
351 | qtpy==2.2.0
352 | # via qtconsole
353 | requests==2.28.1
354 | # via
355 | # -r requirements.in
356 | # azure-storage-common
357 | restructuredtext-lint==1.4.0
358 | # via flake8-rst-docstrings
359 | scipy==1.9.1
360 | # via
361 | # plotly-express
362 | # statsmodels
363 | send2trash==1.8.0
364 | # via notebook
365 | six==1.16.0
366 | # via
367 | # asttokens
368 | # bleach
369 | # patsy
370 | # python-dateutil
371 | smmap==5.0.0
372 | # via gitdb
373 | snowballstemmer==2.2.0
374 | # via pydocstyle
375 | soupsieve==2.3.2.post1
376 | # via beautifulsoup4
377 | stack-data==0.5.0
378 | # via ipython
379 | statsmodels==0.13.2
380 | # via plotly-express
381 | stevedore==4.0.0
382 | # via bandit
383 | tenacity==8.0.1
384 | # via plotly
385 | terminado==0.15.0
386 | # via notebook
387 | tinycss2==1.1.1
388 | # via nbconvert
389 | tomli==2.0.1
390 | # via
391 | # build
392 | # pep517
393 | # pytest
394 | tornado==6.2
395 | # via
396 | # ipykernel
397 | # jupyter-client
398 | # notebook
399 | # terminado
400 | traitlets==5.4.0
401 | # via
402 | # ipykernel
403 | # ipython
404 | # ipywidgets
405 | # jupyter-client
406 | # jupyter-core
407 | # matplotlib-inline
408 | # nbclient
409 | # nbconvert
410 | # nbformat
411 | # notebook
412 | # qtconsole
413 | typing-extensions==4.3.0
414 | # via
415 | # pyee
416 | # wemake-python-styleguide
417 | urllib3==1.26.12
418 | # via requests
419 | wcwidth==0.2.5
420 | # via prompt-toolkit
421 | webencodings==0.5.1
422 | # via
423 | # bleach
424 | # tinycss2
425 | wemake-python-styleguide==0.16.1
426 | # via -r requirements.in
427 | werkzeug==2.2.2
428 | # via flask
429 | wheel==0.37.1
430 | # via pip-tools
431 | widgetsnbextension==4.0.3
432 | # via ipywidgets
433 | xmltodict==0.13.0
434 | # via -r requirements.in
435 |
436 | # The following packages are considered to be unsafe in a requirements file:
437 | pip==22.2.2
438 | # via pip-tools
439 | setuptools==65.3.0
440 | # via
441 | # flake8-eradicate
442 | # gunicorn
443 | # pip-tools
444 |
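
The header above documents the regeneration workflow: pip-compile --allow-unsafe requirements.in. As an illustrative aid, and not a script from the repository, the sketch below spot-checks that the active environment matches a few of the pins; the choice of packages is arbitrary.

from importlib.metadata import version

# A few pins copied from the lockfile above; extend the mapping as needed.
EXPECTED = {
    'duckdb': '0.7.0',
    'pandas': '1.4.4',
    'playwright': '1.30.0',
}

for package, pinned in EXPECTED.items():
    installed = version(package)
    status = 'OK' if installed == pinned else f'MISMATCH (pinned {pinned})'
    print(f'{package}=={installed}  {status}')
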
--------------------------------------------------------------------------------
/docker/airflow/docker-compose.yml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | #
18 |
19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
20 | #
21 | # WARNING: This configuration is for local development. Do not use it in a production deployment.
22 | #
23 | # This configuration supports basic configuration using environment variables or an .env file
24 | # The following variables are supported:
25 | #
26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow.
27 | # Default: apache/airflow:2.3.4
28 | # AIRFLOW_UID - User ID in Airflow containers
29 | # Default: 50000
30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
31 | #
32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested).
33 | # Default: airflow
34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested).
35 | # Default: airflow
36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
37 | # Default: ''
38 | #
39 | # Feel free to modify this file to suit your needs.
40 | ---
41 | version: '3'
42 | x-airflow-common:
43 | &airflow-common
44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image.
45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
46 |   # and uncomment the "build" line below, then run `docker-compose build` to build the images.
47 | image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.3.4}
48 | # build: .
49 | environment:
50 | &airflow-common-env
51 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor
52 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
53 |     # For backward compatibility with Airflow <2.3
54 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
55 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
56 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
57 | AIRFLOW__CORE__FERNET_KEY: ${AIRFLOW_FERNET_KEY}
58 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
59 | AIRFLOW__CORE__LOAD_EXAMPLES: ${AIRFLOW__CORE__LOAD_EXAMPLES}
60 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth'
61 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
62 | AIRFLOW_CONN_HTTP_DEFAULT: ${AIRFLOW_CONN_HTTP_DEFAULT}
63 | volumes:
64 | - ${AIRFLOW_DAGS_VOLUME}:/opt/airflow/dags
65 | - ${AIRFLOW_LOGS_VOLUME}:/opt/airflow/logs
66 | - ${AIRFLOW_PLUGINS_VOLUME}:/opt/airflow/plugins
67 | user: "${AIRFLOW_UID:-50000}:0"
68 | depends_on:
69 | &airflow-common-depends-on
70 | redis:
71 | condition: service_healthy
72 | postgres:
73 | condition: service_healthy
74 |
75 | services:
76 | postgres:
77 | image: postgres:13
78 | environment:
79 | POSTGRES_USER: airflow
80 | POSTGRES_PASSWORD: airflow
81 | POSTGRES_DB: airflow
82 | volumes:
83 | - postgres-db-volume:/var/lib/postgresql/data
84 | healthcheck:
85 | test: [ "CMD", "pg_isready", "-U", "airflow" ]
86 | interval: 5s
87 | retries: 5
88 | restart: always
89 |
90 | redis:
91 | image: redis:latest
92 | expose:
93 | - 6379
94 | healthcheck:
95 | test: [ "CMD", "redis-cli", "ping" ]
96 | interval: 5s
97 | timeout: 30s
98 | retries: 50
99 | restart: always
100 |
101 | airflow-webserver:
102 | <<: *airflow-common
103 | command: webserver
104 | ports:
105 | - 8080:8080
106 | healthcheck:
107 | test: [ "CMD", "curl", "--fail", "http://localhost:8080/health" ]
108 | interval: 10s
109 | timeout: 10s
110 | retries: 5
111 | restart: always
112 | depends_on:
113 | <<: *airflow-common-depends-on
114 | airflow-init:
115 | condition: service_completed_successfully
116 | volumes:
117 | - ${AIRFLOW_WEBSERVER_VOLUME}:/opt/airflow
118 |
119 | airflow-scheduler:
120 | <<: *airflow-common
121 | command: scheduler
122 | healthcheck:
123 | test: [ "CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"' ]
124 | interval: 10s
125 | timeout: 10s
126 | retries: 5
127 | restart: always
128 | depends_on:
129 | <<: *airflow-common-depends-on
130 | airflow-init:
131 | condition: service_completed_successfully
132 |
133 | airflow-worker:
134 | <<: *airflow-common
135 | command: celery worker
136 | healthcheck:
137 | test:
138 | - "CMD-SHELL"
139 | - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
140 | interval: 10s
141 | timeout: 10s
142 | retries: 5
143 | environment:
144 | <<: *airflow-common-env
145 | # Required to handle warm shutdown of the celery workers properly
146 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
147 | DUMB_INIT_SETSID: "0"
148 | restart: always
149 | depends_on:
150 | <<: *airflow-common-depends-on
151 | airflow-init:
152 | condition: service_completed_successfully
153 |
154 | airflow-triggerer:
155 | <<: *airflow-common
156 | command: triggerer
157 | healthcheck:
158 | test: [ "CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"' ]
159 | interval: 10s
160 | timeout: 10s
161 | retries: 5
162 | restart: always
163 | depends_on:
164 | <<: *airflow-common-depends-on
165 | airflow-init:
166 | condition: service_completed_successfully
167 |
168 | airflow-init:
169 | <<: *airflow-common
170 | entrypoint: /bin/bash
171 | # yamllint disable rule:line-length
172 | command:
173 | - -c
174 | - |
175 | function ver() {
176 | printf "%04d%04d%04d%04d" $${1//./ }
177 | }
178 | airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version)
179 | airflow_version_comparable=$$(ver $${airflow_version})
180 | min_airflow_version=2.2.0
181 | min_airflow_version_comparable=$$(ver $${min_airflow_version})
182 | if (( airflow_version_comparable < min_airflow_version_comparable )); then
183 | echo
184 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m"
185 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!"
186 | echo
187 | exit 1
188 | fi
189 | if [[ -z "${AIRFLOW_UID}" ]]; then
190 | echo
191 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
192 | echo "If you are on Linux, you SHOULD follow the instructions below to set "
193 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
194 | echo "For other operating systems you can get rid of the warning with manually created .env file:"
195 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user"
196 | echo
197 | fi
198 | one_meg=1048576
199 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
200 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
201 | disk_available=$$(df / | tail -1 | awk '{print $$4}')
202 | warning_resources="false"
203 | if (( mem_available < 4000 )) ; then
204 | echo
205 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
206 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
207 | echo
208 | warning_resources="true"
209 | fi
210 | if (( cpus_available < 2 )); then
211 | echo
212 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
213 | echo "At least 2 CPUs recommended. You have $${cpus_available}"
214 | echo
215 | warning_resources="true"
216 | fi
217 | if (( disk_available < one_meg * 10 )); then
218 | echo
219 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
220 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
221 | echo
222 | warning_resources="true"
223 | fi
224 | if [[ $${warning_resources} == "true" ]]; then
225 | echo
226 |           echo -e "\033[1;33mWARNING!!!: You do not have enough resources to run Airflow (see above)!\e[0m"
227 |           echo "Please follow the instructions to increase the amount of resources available:"
228 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin"
229 | echo
230 | fi
231 | mkdir -p /sources/logs /sources/dags /sources/plugins
232 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
233 | exec /entrypoint airflow version
234 | # yamllint enable rule:line-length
235 | environment:
236 | <<: *airflow-common-env
237 | _AIRFLOW_DB_UPGRADE: 'true'
238 | _AIRFLOW_WWW_USER_CREATE: 'true'
239 | _AIRFLOW_WWW_USER_USERNAME: ${AIRFLOW_USERNAME:-airflow}
240 | _AIRFLOW_WWW_USER_PASSWORD: ${AIRFLOW_PASSWORD:-airflow}
241 | _PIP_ADDITIONAL_REQUIREMENTS: ''
242 | user: "0:0"
243 | volumes:
244 | - .:/sources
245 |
246 | airflow-cli:
247 | <<: *airflow-common
248 | profiles:
249 | - debug
250 | environment:
251 | <<: *airflow-common-env
252 | CONNECTION_CHECK_MAX_COUNT: "0"
253 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
254 | command:
255 | - bash
256 | - -c
257 | - airflow
258 |
259 |   # You can enable flower by adding the "--profile flower" option, e.g. docker-compose --profile flower up,
260 |   # or by explicitly targeting it on the command line, e.g. docker-compose up flower.
261 | # See: https://docs.docker.com/compose/profiles/
262 | flower:
263 | <<: *airflow-common
264 | command: celery flower
265 | profiles:
266 | - flower
267 | ports:
268 | - 5555:5555
269 | healthcheck:
270 | test: [ "CMD", "curl", "--fail", "http://localhost:5555/" ]
271 | interval: 10s
272 | timeout: 10s
273 | retries: 5
274 | restart: always
275 | depends_on:
276 | <<: *airflow-common-depends-on
277 | airflow-init:
278 | condition: service_completed_successfully
279 |
280 | volumes:
281 | postgres-db-volume:
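
The x-airflow-common environment above takes AIRFLOW__CORE__FERNET_KEY from the AIRFLOW_FERNET_KEY variable, typically supplied through the .env file mentioned in the header comments. A minimal sketch of generating such a key is shown below; it is illustrative only, not a utility shipped with this compose file, and it assumes the cryptography package is available.

# Generate a value suitable for AIRFLOW_FERNET_KEY (illustrative sketch).
from cryptography.fernet import Fernet

# Fernet.generate_key() returns URL-safe base64-encoded bytes; decode for the .env file.
print(Fernet.generate_key().decode())

The printed value would then be set as AIRFLOW_FERNET_KEY=<generated key> before bringing the stack up with docker-compose up.
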
--------------------------------------------------------------------------------
/python/simplescraper/explore/explore_dwh_location.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "outputs": [],
7 | "source": [
8 | "import duckdb\n",
9 | "import pandas as pd\n",
10 | "import plotly.express as px\n",
11 | "from plotly_calplot import calplot\n",
12 | "\n",
13 | "from common.env_variables import DUCKDB_DWH_FILE"
14 | ],
15 | "metadata": {
16 | "collapsed": false,
17 | "pycharm": {
18 | "name": "#%%\n"
19 | }
20 | }
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "outputs": [],
26 | "source": [
27 | "def display_df(df):\n",
28 | " with pd.option_context('display.max_rows', None, 'display.max_columns', None, \"expand_frame_repr\", False, \"display.float_format\", '${:,.2f}'.format):\n",
29 | " display(df.fillna('.'))"
30 | ],
31 | "metadata": {
32 | "collapsed": false,
33 | "pycharm": {
34 | "name": "#%%\n"
35 | }
36 | }
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 3,
41 | "outputs": [],
42 | "source": [
43 | "conn = duckdb.connect(DUCKDB_DWH_FILE, read_only=True)"
44 | ],
45 | "metadata": {
46 | "collapsed": false,
47 | "pycharm": {
48 | "name": "#%%\n"
49 | }
50 | }
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 4,
55 | "outputs": [
56 | {
57 | "data": {
58 | "text/plain": " location job_count\n0 Berlin 98461\n1 Hamburg 88763\n2 München 85657\n3 Frankfurt am Main 55276\n4 Stuttgart 44858\n5 Köln 44203\n6 Düsseldorf 42574\n7 Hannover 21827\n8 Nürnberg 18802\n9 Leipzig 16443\n10 Essen 15877\n11 Bremen 14867\n12 Karlsruhe 14226\n13 Mannheim 12509\n14 Dortmund 12068\n15 Bonn 11815\n16 Dresden 11510\n17 Münster 9021\n18 Wiesbaden 7999\n19 Ulm 7942\n20 Bielefeld 6671\n21 Mainz 6622\n22 Augsburg 6620\n23 Heidelberg 6426\n24 Kiel 6137\n25 Duisburg 6050\n26 bundesweit 5759\n27 Regensburg 5731\n28 Darmstadt 5592\n29 Braunschweig 5492\n30 Aachen 5183\n31 Neckarsulm 5086\n32 Bochum 4981\n33 Erfurt 4973\n34 Ingolstadt 4836\n35 Kassel 4659\n36 Wolfsburg 4471\n37 Würzburg 4439\n38 Freiburg 4310\n39 Lübeck 4276\n40 Kreisfreie Stadt 4098\n41 Gütersloh 4083\n42 Home-Office 3971\n43 Osnabrück 3851\n44 Magdeburg 3825\n45 Rostock 3763\n46 Heilbronn 3728\n47 Potsdam 3670\n48 Koblenz 3550\n49 Wuppertal 3415\n50 Freiburg im Breisgau 3394\n51 Reutlingen 3381\n52 Krefeld 3370\n53 Jena 3363\n54 Sindelfingen 3260\n55 Chemnitz 3234\n56 Mönchengladbach 3130\n57 Saarbrücken 3034\n58 Ludwigsburg 2982\n59 Oldenburg 2739\n60 Neuss 2739\n61 Erlangen 2553\n62 Pforzheim 2552\n63 Göttingen 2536\n64 Ratingen 2489\n65 Paderborn 2460\n66 deutschlandweit 2376\n67 Tübingen 2363\n68 Norderstedt 2317\n69 Leverkusen 2244\n70 Eschborn 2189\n71 Main 2172\n72 Homeoffice 2159\n73 Oberkochen 2140\n74 Ludwigshafen 2097\n75 Oberhausen 2082\n76 Böblingen 2075\n77 Leinfelden-Echterdingen 2037\n78 Bayreuth 1997\n79 Offenburg 1967\n80 Halle (Saale) 1949\n81 Hanau 1851\n82 Minden 1782\n83 Kaiserslautern 1759\n84 Fulda 1680\n85 Fürth 1678\n86 Gelsenkirchen 1669\n87 Baden-Baden 1655\n88 Bamberg 1654\n89 Hildesheim 1627\n90 Munich 1618\n91 Gießen 1611\n92 Landshut 1604\n93 Konstanz 1602\n94 Friedrichshafen 1588\n95 Hagen 1588\n96 Baden-Württemberg 1557\n97 Neu-Isenburg 1553\n98 Flensburg 1493\n99 Trier 1483",
59 | "text/html": "\n\n
\n \n \n | \n location | \n job_count | \n
\n \n \n \n | 0 | \n Berlin | \n 98461 | \n
\n \n | 1 | \n Hamburg | \n 88763 | \n
\n \n | 2 | \n München | \n 85657 | \n
\n \n | 3 | \n Frankfurt am Main | \n 55276 | \n
\n \n | 4 | \n Stuttgart | \n 44858 | \n
\n \n | 5 | \n Köln | \n 44203 | \n
\n \n | 6 | \n Düsseldorf | \n 42574 | \n
\n \n | 7 | \n Hannover | \n 21827 | \n
\n \n | 8 | \n Nürnberg | \n 18802 | \n
\n \n | 9 | \n Leipzig | \n 16443 | \n
\n \n | 10 | \n Essen | \n 15877 | \n
\n \n | 11 | \n Bremen | \n 14867 | \n
\n \n | 12 | \n Karlsruhe | \n 14226 | \n
\n \n | 13 | \n Mannheim | \n 12509 | \n
\n \n | 14 | \n Dortmund | \n 12068 | \n
\n \n | 15 | \n Bonn | \n 11815 | \n
\n \n | 16 | \n Dresden | \n 11510 | \n
\n \n | 17 | \n Münster | \n 9021 | \n
\n \n | 18 | \n Wiesbaden | \n 7999 | \n
\n \n | 19 | \n Ulm | \n 7942 | \n
\n \n | 20 | \n Bielefeld | \n 6671 | \n
\n \n | 21 | \n Mainz | \n 6622 | \n
\n \n | 22 | \n Augsburg | \n 6620 | \n
\n \n | 23 | \n Heidelberg | \n 6426 | \n
\n \n | 24 | \n Kiel | \n 6137 | \n
\n \n | 25 | \n Duisburg | \n 6050 | \n
\n \n | 26 | \n bundesweit | \n 5759 | \n
\n \n | 27 | \n Regensburg | \n 5731 | \n
\n \n | 28 | \n Darmstadt | \n 5592 | \n
\n \n | 29 | \n Braunschweig | \n 5492 | \n
\n \n | 30 | \n Aachen | \n 5183 | \n
\n \n | 31 | \n Neckarsulm | \n 5086 | \n
\n \n | 32 | \n Bochum | \n 4981 | \n
\n \n | 33 | \n Erfurt | \n 4973 | \n
\n \n | 34 | \n Ingolstadt | \n 4836 | \n
\n \n | 35 | \n Kassel | \n 4659 | \n
\n \n | 36 | \n Wolfsburg | \n 4471 | \n
\n \n | 37 | \n Würzburg | \n 4439 | \n
\n \n | 38 | \n Freiburg | \n 4310 | \n
\n \n | 39 | \n Lübeck | \n 4276 | \n
\n \n | 40 | \n Kreisfreie Stadt | \n 4098 | \n
\n \n | 41 | \n Gütersloh | \n 4083 | \n
\n \n | 42 | \n Home-Office | \n 3971 | \n
\n \n | 43 | \n Osnabrück | \n 3851 | \n
\n \n | 44 | \n Magdeburg | \n 3825 | \n
\n \n | 45 | \n Rostock | \n 3763 | \n
\n \n | 46 | \n Heilbronn | \n 3728 | \n
\n \n | 47 | \n Potsdam | \n 3670 | \n
\n \n | 48 | \n Koblenz | \n 3550 | \n
\n \n | 49 | \n Wuppertal | \n 3415 | \n
\n \n | 50 | \n Freiburg im Breisgau | \n 3394 | \n
\n \n | 51 | \n Reutlingen | \n 3381 | \n
\n \n | 52 | \n Krefeld | \n 3370 | \n
\n \n | 53 | \n Jena | \n 3363 | \n
\n \n | 54 | \n Sindelfingen | \n 3260 | \n
\n \n | 55 | \n Chemnitz | \n 3234 | \n
\n \n | 56 | \n Mönchengladbach | \n 3130 | \n
\n \n | 57 | \n Saarbrücken | \n 3034 | \n
\n \n | 58 | \n Ludwigsburg | \n 2982 | \n
\n \n | 59 | \n Oldenburg | \n 2739 | \n
\n \n | 60 | \n Neuss | \n 2739 | \n
\n \n | 61 | \n Erlangen | \n 2553 | \n
\n \n | 62 | \n Pforzheim | \n 2552 | \n
\n \n | 63 | \n Göttingen | \n 2536 | \n
\n \n | 64 | \n Ratingen | \n 2489 | \n
\n \n | 65 | \n Paderborn | \n 2460 | \n
\n \n | 66 | \n deutschlandweit | \n 2376 | \n
\n \n | 67 | \n Tübingen | \n 2363 | \n
\n \n | 68 | \n Norderstedt | \n 2317 | \n
\n \n | 69 | \n Leverkusen | \n 2244 | \n
\n \n | 70 | \n Eschborn | \n 2189 | \n
\n \n | 71 | \n Main | \n 2172 | \n
\n \n | 72 | \n Homeoffice | \n 2159 | \n
\n \n | 73 | \n Oberkochen | \n 2140 | \n
\n \n | 74 | \n Ludwigshafen | \n 2097 | \n
\n \n | 75 | \n Oberhausen | \n 2082 | \n
\n \n | 76 | \n Böblingen | \n 2075 | \n
\n \n | 77 | \n Leinfelden-Echterdingen | \n 2037 | \n
\n \n | 78 | \n Bayreuth | \n 1997 | \n
\n \n | 79 | \n Offenburg | \n 1967 | \n
\n \n | 80 | \n Halle (Saale) | \n 1949 | \n
\n \n | 81 | \n Hanau | \n 1851 | \n
\n \n | 82 | \n Minden | \n 1782 | \n
\n \n | 83 | \n Kaiserslautern | \n 1759 | \n
\n \n | 84 | \n Fulda | \n 1680 | \n
\n \n | 85 | \n Fürth | \n 1678 | \n
\n \n | 86 | \n Gelsenkirchen | \n 1669 | \n
\n \n | 87 | \n Baden-Baden | \n 1655 | \n
\n \n | 88 | \n Bamberg | \n 1654 | \n
\n \n | 89 | \n Hildesheim | \n 1627 | \n
\n \n | 90 | \n Munich | \n 1618 | \n
\n \n | 91 | \n Gießen | \n 1611 | \n
\n \n | 92 | \n Landshut | \n 1604 | \n
\n \n | 93 | \n Konstanz | \n 1602 | \n
\n \n | 94 | \n Friedrichshafen | \n 1588 | \n
\n \n | 95 | \n Hagen | \n 1588 | \n
\n \n | 96 | \n Baden-Württemberg | \n 1557 | \n
\n \n | 97 | \n Neu-Isenburg | \n 1553 | \n
\n \n | 98 | \n Flensburg | \n 1493 | \n
\n \n | 99 | \n Trier | \n 1483 | \n
\n \n
\n
"
60 | },
61 | "metadata": {},
62 | "output_type": "display_data"
63 | }
64 | ],
65 | "source": [
66 | "df = conn.execute(f'''\n",
67 | "SELECT * FROM location\n",
68 | "LIMIT 100\n",
69 | "''').df()\n",
70 | "display_df(df)"
71 | ],
72 | "metadata": {
73 | "collapsed": false,
74 | "pycharm": {
75 | "name": "#%%\n"
76 | }
77 | }
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 5,
82 | "outputs": [],
83 | "source": [
84 | "conn.close()"
85 | ],
86 | "metadata": {
87 | "collapsed": false,
88 | "pycharm": {
89 | "name": "#%%\n"
90 | }
91 | }
92 | }
93 | ],
94 | "metadata": {
95 | "kernelspec": {
96 | "display_name": "Python 3",
97 | "language": "python",
98 | "name": "python3"
99 | },
100 | "language_info": {
101 | "codemirror_mode": {
102 | "name": "ipython",
103 | "version": 2
104 | },
105 | "file_extension": ".py",
106 | "mimetype": "text/x-python",
107 | "name": "python",
108 | "nbconvert_exporter": "python",
109 | "pygments_lexer": "ipython2",
110 | "version": "2.7.6"
111 | }
112 | },
113 | "nbformat": 4,
114 | "nbformat_minor": 0
115 | }
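
The notebook above imports plotly.express and plotly_calplot, but this excerpt only shows the tabular preview of the location view. One plausible next step, sketched below rather than taken from the notebook, is to chart the most frequent locations; the column names follow the query output above, while the bar-chart choice and the LIMIT are assumptions.

import duckdb
import plotly.express as px

from common.env_variables import DUCKDB_DWH_FILE

conn = duckdb.connect(DUCKDB_DWH_FILE, read_only=True)
# Same source as the cell above, limited to the 20 most frequent locations.
top_locations = conn.execute('''
    SELECT location, job_count
    FROM location
    ORDER BY job_count DESC
    LIMIT 20
''').df()
conn.close()

fig = px.bar(top_locations, x='location', y='job_count',
             title='Top 20 locations by job count')
fig.show()
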
--------------------------------------------------------------------------------