├── .gitattributes ├── sql └── dwh │ ├── job_market_analytics │ ├── seeds │ │ └── .gitkeep │ ├── tests │ │ └── .gitkeep │ ├── analyses │ │ └── .gitkeep │ ├── macros │ │ └── .gitkeep │ ├── snapshots │ │ └── .gitkeep │ ├── .gitignore │ ├── models │ │ ├── mart │ │ │ ├── normalized_online_job_months_max.sql │ │ │ ├── normalized_online_job_months_1.sql │ │ │ ├── normalized_online_job_months_3.sql │ │ │ ├── normalized_online_job_months_12.sql │ │ │ ├── latest_dim_job.sql │ │ │ ├── dim_time.sql │ │ │ ├── dim_job.sql │ │ │ ├── fact_online_job.sql │ │ │ ├── dim_job_location.sql │ │ │ ├── dim_job_technology.sql │ │ │ └── normalized_online_job.sql │ │ └── sources.yml │ ├── README.md │ └── dbt_project.yml │ ├── requirements.in │ ├── update_requirements.sh │ └── requirements.txt ├── docker ├── airflow │ ├── logs │ │ └── scheduler │ │ │ └── latest │ ├── docker-compose-down.sh │ ├── restart_worker_and_scheduler.sh │ ├── .env.example │ └── docker-compose.yml └── postgres │ ├── postgres-parquet-fdw │ ├── s3-download-parquet-fdw.sh │ ├── Dockerfile │ ├── s4-install-parquet-fdw.sh │ ├── s1-download-arrow.sh │ └── s2-install-arrow.sh │ ├── .env.example │ ├── README.md │ └── docker-compose.yml ├── doc ├── dbt-dag.png ├── airflow_dag_daily.png ├── scrape_data_source_dag.png ├── raw-in-azure-blob-storage.png ├── TODO-search-document-structure.json ├── TODO-search.md ├── TODO-search-pre-search-data-model.md ├── metaData-bag.log └── TODO.md ├── python ├── dashy │ ├── .env.example │ ├── requirements.in │ ├── start_dashy.sh │ ├── update_requirements.sh │ └── requirements.txt ├── utils │ ├── generate_fernet_key.py │ ├── migrate_to_raw_v3.py │ └── migrate_raw_v1_to_raw_v2.py ├── tests │ ├── test_get_run_timestamp.py │ ├── test_get_chunk_size.py │ ├── test_parse_job_description.py │ └── data │ │ └── normalize_job_description │ │ └── output │ │ ├── test_case_7610222.json │ │ ├── test_case_7610188.json │ │ └── test_case_7609275.json ├── airflow │ ├── start_airflow_scheduler.sh │ ├── start_airflow_webserver.sh │ ├── create_user.sh │ ├── airflow_home │ │ └── dags │ │ │ ├── common_airflow_dag.py │ │ │ ├── test_dag.py │ │ │ ├── job_market_analytics_curate_sitemaps_catch_up_dag.py │ │ │ ├── job_market_analytics_cleanse_sitemaps_catch_up_dag.py │ │ │ ├── job_market_analytics_curate_job_descriptions_catch_up_dag.py │ │ │ ├── job_market_analytics_cleanse_job_descriptions_catch_up_dag.py │ │ │ ├── job_market_analytics_cleanse_catch_up_dag.py │ │ │ ├── job_market_analytics_curate_catch_up_dag_v2.py │ │ │ ├── job_market_analytics_hourly_dag.py │ │ │ ├── job_market_analytics_daily_dag.py │ │ │ └── job_market_analytics_daily_dag_catch_up.py │ ├── .env.example │ ├── configure_posgresql.sh │ └── install_airflow.sh ├── simplescraper │ ├── do_dbt_run.sh │ ├── start_flasky.sh │ ├── start_dashy_static.sh │ ├── requirements.in │ ├── common │ │ ├── logging.py │ │ ├── chunking.py │ │ ├── explore.py │ │ ├── entity.py │ │ ├── webclient.py │ │ ├── env_variables.py │ │ └── storage.py │ ├── cron_job.sh │ ├── update_requirements.sh │ ├── tasks │ │ ├── list_downloaded_sitemaps.py │ │ ├── curate_sitemaps.py │ │ ├── list_job_descriptions_to_download.py │ │ ├── prune_old_raw.py │ │ ├── list_downloaded_job_descriptions.py │ │ ├── cleanse_sitemaps.py │ │ ├── cleanse_job_descriptions.py │ │ ├── parse_job_description.py │ │ ├── download_sitemap.py │ │ ├── curate_job_descriptions.py │ │ └── download_job_descriptions.py │ ├── do_day_backup.sh │ ├── dashy_static.py │ ├── .env.example │ ├── create_curated_views_in_dwh.py │ ├── restore_day_backup.sh │ ├── 
scrape_data_source.py │ ├── verify_day_backup.sh │ ├── verify_all_backups.py │ ├── restore_all_backups.py │ ├── do_all_backups.py │ ├── explore │ │ ├── explore_dwh_mart.ipynb │ │ ├── explore_dwh_mart_dim_time.ipynb │ │ └── explore_dwh_location.ipynb │ ├── flasky.py │ └── requirements.txt └── .flake8 ├── Brewfile ├── azure ├── .env.example └── sync-remote-to-local.sh ├── .gitignore └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-vendored -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sql/dwh/requirements.in: -------------------------------------------------------------------------------- 1 | dbt-duckdb==1.5.1 2 | duckdb==0.7.0 3 | -------------------------------------------------------------------------------- /docker/airflow/logs/scheduler/latest: -------------------------------------------------------------------------------- 1 | /opt/airflow/logs/scheduler/2022-07-30 -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /doc/dbt-dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petracarrion/job-market-analytics/HEAD/doc/dbt-dag.png -------------------------------------------------------------------------------- /doc/airflow_dag_daily.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petracarrion/job-market-analytics/HEAD/doc/airflow_dag_daily.png -------------------------------------------------------------------------------- /python/dashy/.env.example: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export DUCKDB_DWH_FILE= 4 | export VENV_ACTIVATE= 5 | export LOG_FOLDER= 6 | -------------------------------------------------------------------------------- /doc/scrape_data_source_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petracarrion/job-market-analytics/HEAD/doc/scrape_data_source_dag.png -------------------------------------------------------------------------------- 
/doc/raw-in-azure-blob-storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petracarrion/job-market-analytics/HEAD/doc/raw-in-azure-blob-storage.png -------------------------------------------------------------------------------- /python/utils/generate_fernet_key.py: -------------------------------------------------------------------------------- 1 | from cryptography.fernet import Fernet 2 | 3 | fernet_key = Fernet.generate_key() 4 | print(fernet_key.decode()) 5 | -------------------------------------------------------------------------------- /python/dashy/requirements.in: -------------------------------------------------------------------------------- 1 | dash 2 | dash-bootstrap-components 3 | duckdb==0.7.0 4 | gunicorn 5 | jupyter-dash 6 | loguru 7 | pandas 8 | python-dotenv 9 | -------------------------------------------------------------------------------- /Brewfile: -------------------------------------------------------------------------------- 1 | tap "homebrew/bundle" 2 | tap "homebrew/core" 3 | brew "openblas" 4 | brew "parquet-tools" 5 | brew "postgresql" 6 | brew "rdfind" 7 | brew "rust" 8 | brew "wget" 9 | -------------------------------------------------------------------------------- /docker/airflow/docker-compose-down.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | docker compose down 7 | -------------------------------------------------------------------------------- /docker/postgres/postgres-parquet-fdw/s3-download-parquet-fdw.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt-get update 4 | apt-get install -y git 5 | 6 | git clone https://github.com/adjust/parquet_fdw.git 7 | -------------------------------------------------------------------------------- /azure/.env.example: -------------------------------------------------------------------------------- 1 | RAW_DIR= 2 | 3 | AZURE_STORAGE_CONTAINER_RAW_DIR_URL= 4 | 5 | export AZCOPY_AUTO_LOGIN_TYPE=SPN 6 | export AZCOPY_SPA_APPLICATION_ID= 7 | export AZCOPY_SPA_CLIENT_SECRET= 8 | export AZCOPY_TENANT_ID= -------------------------------------------------------------------------------- /python/tests/test_get_run_timestamp.py: -------------------------------------------------------------------------------- 1 | from common.storage import get_load_timestamp 2 | 3 | 4 | def test_get_load_timestamp(): 5 | assert get_load_timestamp('2022-01-22T12:49:39.448434+00:00') == '2022/01/22/12-49-39' 6 | -------------------------------------------------------------------------------- /docker/postgres/.env.example: -------------------------------------------------------------------------------- 1 | POSTGRES_USER= 2 | POSTGRES_PASSWORD= 3 | POSTGRES_DB= 4 | POSTGRES_VOLUME= 5 | POSTGRES_PARQUET_FDW_VOLUME= 6 | 7 | PGADMIN_DEFAULT_EMAIL= 8 | PGADMIN_DEFAULT_PASSWORD= 9 | PGADMIN_VOLUME= 10 | -------------------------------------------------------------------------------- /docker/postgres/README.md: -------------------------------------------------------------------------------- 1 | # Infrastructure 2 | 3 | ## How to run it 4 | 5 | Go to the folder postgres-parquet-fdw and run: 6 | 7 | `docker build -t postgres-parquet-fdw:v1 .` 8 | 9 | Then run: 10 | 11 | `docker-compose up` 
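## How to use it

Once the containers are up, the foreign data wrapper still has to be enabled inside the database. A minimal sketch from `psql` (the column list and the parquet file name are placeholders; the path assumes the `POSTGRES_PARQUET_FDW_VOLUME` mount defined in `docker-compose.yml`):

`CREATE EXTENSION parquet_fdw;`

`CREATE SERVER parquet_srv FOREIGN DATA WRAPPER parquet_fdw;`

`CREATE FOREIGN TABLE online_job (job_id BIGINT, online_at DATE, url TEXT) SERVER parquet_srv OPTIONS (filename '/var/lib/parquet-fdw/data/online_job.parquet');`

Depending on the connecting role, a `CREATE USER MAPPING ... SERVER parquet_srv` statement may also be needed.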
-------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/normalized_online_job_months_max.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='view' 4 | ) 5 | }} 6 | 7 | SELECT * 8 | FROM {{ ref('normalized_online_job') }} 9 | ORDER BY online_at 10 | -------------------------------------------------------------------------------- /python/airflow/start_airflow_scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | airflow scheduler 11 | -------------------------------------------------------------------------------- /python/airflow/start_airflow_webserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | airflow webserver 11 | -------------------------------------------------------------------------------- /docker/postgres/postgres-parquet-fdw/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM postgres:14.1 as postgres-parquet-fdw 2 | 3 | COPY *.sh /usr/local/bin/ 4 | 5 | RUN s1-download-arrow.sh 6 | RUN s2-install-arrow.sh 7 | RUN s3-download-parquet-fdw.sh 8 | RUN s4-install-parquet-fdw.sh 9 | -------------------------------------------------------------------------------- /docker/postgres/postgres-parquet-fdw/s4-install-parquet-fdw.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt-get update 4 | apt-get -y install \ 5 | build-essential \ 6 | cmake \ 7 | postgresql-server-dev-14 8 | 9 | cd parquet_fdw || exit 10 | make install 11 | -------------------------------------------------------------------------------- /python/simplescraper/do_dbt_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | source "${DBT_VENV_ACTIVATE}" 8 | 9 | cd "$DBT_DIR" || exit 10 | 11 | dbt run 12 | -------------------------------------------------------------------------------- /sql/dwh/update_requirements.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | pip install -r requirements.in 11 | pip freeze > requirements.txt 12 | -------------------------------------------------------------------------------- /azure/sync-remote-to-local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source .env 4 | 5 | azcopy login --service-principal --application-id "$AZCOPY_SPA_APPLICATION_ID" --tenant-id="$AZCOPY_TENANT_ID" 6 | 7 | azcopy sync "${RAW_DIR}" "${AZURE_STORAGE_CONTAINER_RAW_DIR_URL}" --recursive --exclude-pattern=".*" 8 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/normalized_online_job_months_1.sql: 
-------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='table' 4 | ) 5 | }} 6 | 7 | SELECT * 8 | FROM {{ ref('normalized_online_job') }} 9 | WHERE online_at >= current_date - INTERVAL 1 MONTH 10 | ORDER BY online_at 11 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/normalized_online_job_months_3.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='table' 4 | ) 5 | }} 6 | 7 | SELECT * 8 | FROM {{ ref('normalized_online_job') }} 9 | WHERE online_at >= current_date - INTERVAL 3 MONTH 10 | ORDER BY online_at 11 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/normalized_online_job_months_12.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='table' 4 | ) 5 | }} 6 | 7 | SELECT * 8 | FROM {{ ref('normalized_online_job') }} 9 | WHERE online_at >= current_date - INTERVAL 12 MONTH 10 | ORDER BY online_at 11 | -------------------------------------------------------------------------------- /python/dashy/start_dashy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | gunicorn --workers 1 --timeout 600 --bind 0.0.0.0:8051 dashy:server --access-logfile '-' 11 | -------------------------------------------------------------------------------- /python/tests/test_get_chunk_size.py: -------------------------------------------------------------------------------- 1 | from common.chunking import get_chunk_size 2 | 3 | 4 | def test_get_chunk_size(): 5 | assert get_chunk_size(1000, 10, 500) == 100 6 | assert get_chunk_size(1000, 10, 50) == 50 7 | assert get_chunk_size(60, 4, 10) == 8 8 | assert get_chunk_size(100, 4, 10) == 9 9 | -------------------------------------------------------------------------------- /docker/postgres/postgres-parquet-fdw/s1-download-arrow.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt update 4 | apt install -y -V ca-certificates lsb-release wget 5 | wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb 6 | -------------------------------------------------------------------------------- /python/simplescraper/start_flasky.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | ulimit -n 4096 11 | gunicorn --workers 4 --timeout 3600 --bind 0.0.0.0:3001 'flasky:app' 12 | -------------------------------------------------------------------------------- /python/simplescraper/start_dashy_static.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | ulimit -n 4096 11 | gunicorn --workers 4 --timeout 3600 --bind 0.0.0.0:8054 
'dashy_static:app' 12 | -------------------------------------------------------------------------------- /docker/airflow/restart_worker_and_scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | for container in airflow-worker airflow-scheduler; do 7 | docker compose stop $container 8 | docker compose rm -f $container 9 | docker compose up $container -d 10 | done 11 | -------------------------------------------------------------------------------- /python/simplescraper/requirements.in: -------------------------------------------------------------------------------- 1 | azure-storage-blob==2.1.0 2 | beautifulsoup4 3 | duckdb==0.7.0 4 | Flask 5 | gunicorn 6 | jupyter 7 | kaleido 8 | lxml 9 | loguru 10 | pandas 11 | pip-tools 12 | playwright==1.30.0 13 | plotly-calplot 14 | plotly-express 15 | pyarrow 16 | pytest 17 | python-dotenv 18 | requests 19 | wemake-python-styleguide 20 | xmltodict 21 | -------------------------------------------------------------------------------- /python/simplescraper/common/logging.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sys 3 | 4 | from loguru import logger 5 | 6 | from common.env_variables import TEMP_DIR 7 | 8 | 9 | def configure_logger(load_timestamp): 10 | logger.remove() 11 | logger.add(sys.stdout, colorize=True) 12 | logger.add(os.path.join(TEMP_DIR, load_timestamp, f'00_logs.log')) 13 | 14 | 15 | logger = logger 16 | -------------------------------------------------------------------------------- /python/airflow/create_user.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source .env 4 | 5 | airflow users create \ 6 | --role Admin \ 7 | --username "${AIRFLOW_USERNAME}" \ 8 | --password "${AIRFLOW_PASSWORD}" \ 9 | --email "${AIRFLOW_EMAIL}" \ 10 | --firstname "${AIRFLOW_FIRSTNAME}" \ 11 | --lastname "${AIRFLOW_LASTNAME}" 12 | 13 | airflow users delete -e admin 14 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/common_airflow_dag.py: -------------------------------------------------------------------------------- 1 | from airflow.operators.python import get_current_context 2 | from airflow.providers.http.hooks.http import HttpHook 3 | 4 | 5 | def run_flasky_task(endpoint): 6 | context = get_current_context() 7 | data = { 8 | 'data_interval_end': context['data_interval_end'], 9 | 'ds': context['ds'], 10 | } 11 | HttpHook().run(endpoint, data) 12 | -------------------------------------------------------------------------------- /doc/TODO-search-document-structure.json: -------------------------------------------------------------------------------- 1 | { 2 | "job_id": "4324234", 3 | "short_description": "Snail Collector at Alternative Food in Berlin or Hamburg", 4 | "url": "https://data.source/snail-collector-berlin-hamburg.html", 5 | "locations": [ 6 | "Berlin", 7 | "Hamburg" 8 | ], 9 | "online_week": [ 10 | "2022W11", 11 | "2022W10", 12 | "2022W09", 13 | "2022W02", 14 | "2022W01" 15 | ] 16 | } -------------------------------------------------------------------------------- /doc/TODO-search.md: -------------------------------------------------------------------------------- 1 | # Search 2 | 3 | ## Facets 4 | 5 | - Company 6 | - Position 7 | - Technology 8 | - Location 9 | - Date? 
10 | 11 | ## Document Fields 12 | 13 | - Job ID? 14 | - Job Short Description 15 | - Job Name 16 | - Job Company 17 | - Job Locations 18 | - Job URL 19 | - Job Online Dates 20 | 21 | ## Document Structure 22 | 23 | See [TODO-search-document-structure.json](TODO-search-document-structure.json) 24 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/sources.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: curated 5 | schema: curated 6 | freshness: # default freshness 7 | warn_after: { count: 24, period: hour } 8 | error_after: { count: 36, period: hour } 9 | loaded_at_field: load_timestamp 10 | tables: 11 | - name: online_job 12 | - name: job 13 | - name: job_location 14 | - name: job_technology 15 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/latest_dim_job.sql: -------------------------------------------------------------------------------- 1 | SELECT job_key, 2 | job_id, 3 | job_ldts, 4 | title, 5 | company_name 6 | FROM ( 7 | SELECT job_key, 8 | job_id, 9 | job_ldts, 10 | title, 11 | company_name, 12 | ROW_NUMBER() OVER (PARTITION BY job_id ORDER BY job_ldts DESC) rn 13 | FROM {{ ref('dim_job') }} 14 | ) 15 | WHERE rn = 1 16 | -------------------------------------------------------------------------------- /python/simplescraper/common/chunking.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | def get_chunk_size(total, slots, max_chunk_size): 5 | max_run_size = slots * max_chunk_size 6 | 7 | number_of_runs = total / max_run_size 8 | number_of_runs = int(math.ceil(number_of_runs)) 9 | 10 | number_of_chunks = number_of_runs * slots 11 | 12 | chunk_size = total / number_of_chunks 13 | chunk_size = int(math.ceil(chunk_size)) 14 | 15 | return chunk_size 16 | -------------------------------------------------------------------------------- /python/simplescraper/cron_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Add the following to the cron jobs: 42 * * * * REPLACE_ME/cron_job.sh 4 | 5 | /usr/sbin/scutil --nc list | grep Connected | grep vpn || { 6 | echo "Please connect to the VPN" 7 | exit 1 8 | } 9 | 10 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 11 | cd "$SCRIPTPATH" || exit 12 | 13 | source .env 14 | 15 | source "${VENV_ACTIVATE}" 16 | 17 | "${VENV_PYTHON}" "${SOURCE_DIR}"/simplescraper/scrape_data_source.py 18 | -------------------------------------------------------------------------------- /python/airflow/.env.example: -------------------------------------------------------------------------------- 1 | export VENV_ACTIVATE= 2 | 3 | export AIRFLOW_HOME= 4 | 5 | export AIRFLOW_DATABASE_NAME= 6 | export AIRFLOW_DATABASE_USERNAME= 7 | export AIRFLOW_DATABASE_PASSWORD= 8 | 9 | export AIRFLOW__DATABASE__SQL_ALCHEMY_CONN= 10 | export AIRFLOW__CORE__EXECUTOR= 11 | 12 | export AIRFLOW_USERNAME= 13 | export AIRFLOW_PASSWORD= 14 | export AIRFLOW_EMAIL= 15 | export AIRFLOW_FIRSTNAME= 16 | export AIRFLOW_LASTNAME= 17 | 18 | export AIRFLOW__CORE__LOAD_EXAMPLES= 19 | 20 | export AIRFLOW_CONN_HTTP_DEFAULT= 21 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/test_dag.py: -------------------------------------------------------------------------------- 1 | from 
datetime import datetime 2 | 3 | from airflow import DAG 4 | from airflow.decorators import task 5 | 6 | from common_airflow_dag import run_flasky_task 7 | 8 | with DAG('test_dag2', 9 | description='Test DAG', 10 | schedule_interval='@daily', 11 | start_date=datetime(2022, 7, 29), 12 | catchup=False) as dag: 13 | @task(task_id="test_task") 14 | def run_test(): 15 | run_flasky_task('do/test') 16 | 17 | 18 | run_test() 19 | -------------------------------------------------------------------------------- /python/simplescraper/update_requirements.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | 11 | if ! pip show pip-tools; then 12 | pip install pip-tools 13 | fi 14 | 15 | pip-compile requirements.in --allow-unsafe 16 | pip-sync 17 | # pip install "apache-airflow[celery]==2.2.3" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.2.3/constraints-3.8.txt" 18 | # pip install dbt-postgres 19 | -------------------------------------------------------------------------------- /python/airflow/configure_posgresql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | sudo -u postgres psql -c "CREATE DATABASE ${AIRFLOW_DATABASE_NAME};" 9 | 10 | sudo -u postgres psql -c "CREATE USER ${AIRFLOW_DATABASE_USERNAME} WITH ENCRYPTED PASSWORD '${AIRFLOW_DATABASE_PASSWORD};'" 11 | 12 | sudo -u postgres psql -c "GRANT ALL PRIVILEGES ON DATABASE ${AIRFLOW_DATABASE_NAME} TO ${AIRFLOW_DATABASE_USERNAME};" 13 | sudo -u postgres psql -c "GRANT ALL ON SCHEMA public TO ${AIRFLOW_DATABASE_USERNAME};" 14 | -------------------------------------------------------------------------------- /python/dashy/update_requirements.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | source "${VENV_ACTIVATE}" 9 | 10 | which pip | grep dashy || (echo "Wrong venv!!!" && exit) 11 | 12 | if ! 
pip show pip-tools; then 13 | pip install pip-tools 14 | fi 15 | 16 | pip-compile requirements.in --allow-unsafe 17 | pip-sync 18 | # pip install "apache-airflow[celery]==2.2.3" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.2.3/constraints-3.8.txt" 19 | # pip install dbt-postgres 20 | -------------------------------------------------------------------------------- /python/simplescraper/common/explore.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | import pandas as pd 3 | from IPython.display import display 4 | 5 | from common.env_variables import DUCKDB_DWH_FILE 6 | 7 | 8 | def display_df(_df): 9 | with pd.option_context('display.max_rows', None, 'display.max_columns', None, "expand_frame_repr", False, 10 | "display.float_format", '${:,.2f}'.format): 11 | display(_df.fillna('.')) 12 | 13 | 14 | def display_sql(sql_statement, read_only=True): 15 | conn = duckdb.connect(DUCKDB_DWH_FILE, read_only=read_only) 16 | _df = conn.execute(sql_statement).df() 17 | conn.close() 18 | return _df 19 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /docker/airflow/.env.example: -------------------------------------------------------------------------------- 1 | AIRFLOW_UID= 2 | AIRFLOW_GID= 3 | 4 | AIRFLOW_FERNET_KEY= 5 | AIRFLOW_SECRET_KEY= 6 | 7 | AIRFLOW_DATABASE_HOST= 8 | AIRFLOW_DATABASE_PORT_NUMBER= 9 | AIRFLOW_DATABASE_NAME= 10 | AIRFLOW_DATABASE_USERNAME= 11 | AIRFLOW_DATABASE_PASSWORD= 12 | AIRFLOW_DATABASE_USE_SSL= 13 | 14 | AIRFLOW_USERNAME= 15 | AIRFLOW_PASSWORD= 16 | AIRFLOW_EMAIL= 17 | AIRFLOW_FIRSTNAME= 18 | AIRFLOW_LASTNAME= 19 | 20 | AIRFLOW_WEBSERVER_VOLUME= 21 | AIRFLOW_DAGS_VOLUME= 22 | AIRFLOW_LOGS_VOLUME= 23 | AIRFLOW_PLUGINS_VOLUME= 24 | REDIS_VOLUME= 25 | 26 | AIRFLOW__CORE__LOAD_EXAMPLES= 27 | 28 | AIRFLOW_CONN_HTTP_DEFAULT= 29 | AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG= 30 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/dim_time.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'table', 4 | ) 5 | }} 6 | 7 | WITH unique_online_at AS ( 8 | SELECT DISTINCT online_at 9 | FROM {{ source('curated', 'online_job') }} 10 | ORDER BY 1 11 | ) 12 | SELECT online_at as date_key, 13 | date_part('year', online_at) as year, 14 | date_part('month', online_at) as month, 15 | date_part('day', online_at) as day, 16 | monthname(online_at) as month_name, 17 | date_part('yearweek', online_at) as year_week, 18 | date_part('isodow', online_at) as day_of_week, 19 | dayname(online_at) as day_of_week_name 20 | FROM 
unique_online_at 21 | -------------------------------------------------------------------------------- /python/simplescraper/common/entity.py: -------------------------------------------------------------------------------- 1 | class Entity: 2 | def __init__(self, name): 3 | self.name = name 4 | 5 | def __str__(self): 6 | return self.name 7 | 8 | 9 | SITEMAP = Entity('sitemap') 10 | ONLINE_JOB = Entity('online_job') 11 | JOB_DESCRIPTION = Entity('job_description') 12 | JOB = Entity('job') 13 | JOB_LOCATION = Entity('job_location') 14 | JOB_TECHNOLOGY = Entity('job_technology') 15 | 16 | RAW_ENTITIES = [ 17 | SITEMAP, 18 | JOB_DESCRIPTION, 19 | ] 20 | CURATED_ENTITIES = [ 21 | ONLINE_JOB, 22 | JOB, 23 | JOB_LOCATION, 24 | JOB_TECHNOLOGY, 25 | ] 26 | 27 | if __name__ == "__main__": 28 | for entity in CURATED_ENTITIES: 29 | print(entity) 30 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/list_downloaded_sitemaps.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from common.entity import SITEMAP 4 | from common.env_variables import LATEST_LOAD_TIMESTAMP 5 | from common.storage import DATA_SOURCE_NAME, save_temp_df, list_raw_files, DOWNLOADED_SITEMAPS_CSV, get_load_date 6 | 7 | 8 | def list_downloaded_sitemaps(load_timestamp, load_date=None) -> pd.DataFrame: 9 | files = list_raw_files(DATA_SOURCE_NAME, SITEMAP, load_date) 10 | df = pd.DataFrame(files) 11 | df = df[df['file_name'] != 'sitemapindex.xml'] 12 | if load_date is None: 13 | save_temp_df(df, load_timestamp, DOWNLOADED_SITEMAPS_CSV) 14 | return df 15 | 16 | 17 | if __name__ == "__main__": 18 | list_downloaded_sitemaps(LATEST_LOAD_TIMESTAMP, get_load_date()) 19 | -------------------------------------------------------------------------------- /python/simplescraper/do_day_backup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | if [[ $# -ne 3 ]] ; then 9 | echo "Please provide a date as script parameters in the following format: year month day" 10 | echo "Example: $0 2022 12 01" 11 | exit 1 12 | fi 13 | 14 | for entity in job_description sitemap 15 | do 16 | 17 | source=${RAW_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2/$3 18 | 19 | if [ -d "$source" ] 20 | then 21 | 22 | target_dir=${BACKUP_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2 23 | target_filename=${target_dir}/${entity}.$1$2$3.tar.gz 24 | mkdir -p "${target_dir}" 25 | tar -zcvf "${target_filename}" -C "${source}" . 
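# The day's raw files end up in a single archive per entity:
#   ${BACKUP_DIR}/${DATA_SOURCE_NAME}/<entity>/<year>/<month>/<entity>.<year><month><day>.tar.gz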
26 | 27 | fi 28 | 29 | done 30 | -------------------------------------------------------------------------------- /doc/TODO-search-pre-search-data-model.md: -------------------------------------------------------------------------------- 1 | # Pre Search Data Model 2 | 3 | ## Overview 4 | 5 | - job_online 6 | - job_id 7 | - online_at 8 | - url 9 | - job 10 | - job_id 11 | - job_description 12 | - job_id 13 | - title 14 | - online_status 15 | - is_anonymous 16 | - should_display_early_applicant 17 | - contract_type 18 | - work_type 19 | - online_date 20 | - description_introduction 21 | - description_responsabilities 22 | - description_requirements' 23 | - description_perks 24 | - company 25 | - company_name 26 | - job_company 27 | - job_id 28 | - company_name 29 | - location 30 | - location_name 31 | - job_location 32 | - job_id 33 | - location_name 34 | - technology 35 | - technology_name 36 | - job_technology 37 | - job_id 38 | - technology_name 39 | -------------------------------------------------------------------------------- /python/airflow/install_airflow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | which pip | grep /airflow/venv/ || (echo "Wrong venv!!!" && exit) 4 | 5 | # Install Airflow using the constraints file 6 | AIRFLOW_VERSION=2.7.2 7 | PYTHON_VERSION="$(python --version | cut -d " " -f 2 | cut -d "." -f 1-2)" 8 | # For example: 3.7 9 | CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt" 10 | # For example: https://raw.githubusercontent.com/apache/airflow/constraints-2.4.1/constraints-3.7.txt 11 | pip install "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}" 12 | pip install psycopg2 13 | 14 | airflow db upgrade 15 | 16 | # The Standalone command will initialise the database, make a user, 17 | # and start all components for you. 18 | airflow standalone 19 | -------------------------------------------------------------------------------- /python/tests/test_parse_job_description.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from tasks.parse_job_description import parse_job_description 6 | 7 | 8 | def load_file(file_path): 9 | with open(f'data/normalize_job_description/{file_path}', 'r') as f: 10 | content = f.read() 11 | return content 12 | 13 | 14 | @pytest.mark.parametrize('test_case', ['test_case_7610188', 'test_case_7610222', 'test_case_7609275']) 15 | def test_parse_job_description(test_case): 16 | input_content = load_file('input/' + test_case + '.txt') 17 | 18 | result_content = parse_job_description(input_content) 19 | # temp = json.dumps(result_content, indent=2, ensure_ascii=False) 20 | 21 | output_content = json.loads(load_file('output/' + test_case + '.json')) 22 | assert result_content == output_content 23 | -------------------------------------------------------------------------------- /python/simplescraper/dashy_static.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | 3 | app = Flask(__name__) 4 | 5 | HTML = ''' 6 | 15 | 16 |

<h1>Static Dashboard</h1> 17 | 18 | <h2>Overview</h2> 19 | Overview 20 | 21 | <h2>Top Five Cities</h2> 22 | Top Five Cities 23 | 24 | <h2>Top Five Technologies</h2>
25 | Top Five Technologies 26 | ''' 27 | 28 | 29 | @app.route('/') 30 | def index(): 31 | return HTML 32 | 33 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_curate_sitemaps_catch_up_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | with DAG('job_market_analytics_curate_sitemaps_catch_up_dag', 12 | description='Job Market Analytics Curate Sitemaps Catch Up DAG', 13 | schedule_interval='@daily', 14 | start_date=datetime(2022, 1, 1), 15 | dagrun_timeout=timedelta(minutes=60), 16 | max_active_runs=4, 17 | max_active_tasks=4, 18 | catchup=True) as dag: 19 | @task(task_id="curate_sitemaps") 20 | def curate_sitemaps(): 21 | run_flasky_task('do/curate_sitemaps') 22 | 23 | 24 | curate_sitemaps() 25 | -------------------------------------------------------------------------------- /python/simplescraper/.env.example: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export VENV_ACTIVATE= 4 | export VENV_PYTHON= 5 | export SOURCE_DIR= 6 | 7 | export DATA_DIR= 8 | export DATA_SOURCE_NAME= 9 | export DATA_SOURCE_URL= 10 | 11 | export RAW_DIR= 12 | export CLEANSED_DIR= 13 | export CURATED_DIR= 14 | export DUCKDB_DWH_FILE= 15 | export TEMP_DIR= 16 | 17 | export BACKUP_DIR= 18 | 19 | export SEMAPHORE_COUNT= 20 | export MAX_CHUNK_SIZE= 21 | export MIN_TO_DOWNLOAD= 22 | export MAX_TO_DOWNLOAD= 23 | export ONLINE_EXPIRATION_IN_DAYS= 24 | 25 | export LATEST_LOAD_TIMESTAMP= 26 | 27 | export RUN_HEADLESS= 28 | 29 | export FLASK_APP= 30 | export FLASK_ENV= 31 | export FLASK_DEBUG= 32 | 33 | export UPLOAD_TO_AZURE= 34 | 35 | export AZURE_STORAGE_CONNECTION_STRING= 36 | export AZURE_STORAGE_CONTAINER_NAME= 37 | 38 | export LANG= 39 | export LC_ALL= 40 | 41 | export DBT_VENV_ACTIVATE= 42 | export DBT_DIR= 43 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/dim_job.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental' 4 | ) 5 | }} 6 | 7 | 8 | SELECT MD5(CONCAT_WS('||', 9 | COALESCE( 10 | UPPER(TRIM(CAST( 11 | job.job_id 12 | AS VARCHAR))), 13 | '^^'), 14 | COALESCE( 15 | UPPER(TRIM(CAST( 16 | job.load_timestamp 17 | AS VARCHAR))), 18 | '^^') 19 | )) AS job_key, 20 | job.job_id, 21 | job.load_timestamp as job_ldts, 22 | job.title, 23 | job.company_name 24 | FROM {{ source('curated', 'job') }} 25 | 26 | {% if is_incremental() %} 27 | LEFT OUTER JOIN dim_job 28 | ON (job.job_id = dim_job.job_id AND 29 | job.load_timestamp = dim_job.job_ldts) 30 | WHERE dim_job.job_id IS NULL 31 | {% endif %} 32 | -------------------------------------------------------------------------------- /python/simplescraper/common/webclient.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | REQUEST_HEADERS = { 4 | "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8," 5 | "application/signed-exchange;v=b3;q=0.9", 6 | "accept-language": "en-US,en;q=0.9,es;q=0.8,it-IT;q=0.7,it;q=0.6,de-DE;q=0.5,de;q=0.4", 7 | "cache-control": "max-age=0", 8 | "sec-ch-ua": 
"\"Chromium\";v=\"94\", \"Google Chrome\";v=\"94\", \";Not A Brand\";v=\"99\"", 9 | "sec-ch-ua-mobile": "?0", 10 | "sec-ch-ua-platform": "\"macOS\"", 11 | "sec-fetch-dest": "document", 12 | "sec-fetch-mode": "navigate", 13 | "sec-fetch-site": "none", 14 | "sec-fetch-user": "?1", 15 | "upgrade-insecure-requests": "1" 16 | } 17 | 18 | 19 | def get_url_content(url): 20 | response = requests.get(url) 21 | content = response.content 22 | return content 23 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_cleanse_sitemaps_catch_up_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | with DAG('job_market_analytics_cleanse_sitemaps_catch_up_dag', 12 | description='Job Market Analytics Cleanse Sitemaps Catch Up DAG', 13 | schedule_interval='@daily', 14 | start_date=datetime(2022, 1, 1), 15 | # end_date=datetime(2021, 12, 1), 16 | dagrun_timeout=timedelta(minutes=10), 17 | max_active_runs=4, 18 | max_active_tasks=4, 19 | catchup=True) as dag: 20 | @task(task_id="cleanse_sitemaps") 21 | def cleanse_sitemaps(): 22 | run_flasky_task('do/cleanse_sitemaps') 23 | 24 | 25 | cleanse_sitemaps() 26 | -------------------------------------------------------------------------------- /python/simplescraper/create_curated_views_in_dwh.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import duckdb 4 | 5 | from common.entity import CURATED_ENTITIES 6 | from common.env_variables import CURATED_DIR, DATA_SOURCE_NAME, DUCKDB_DWH_FILE 7 | 8 | 9 | def create_curated_views_in_dwh(): 10 | conn = duckdb.connect(DUCKDB_DWH_FILE) 11 | 12 | conn.execute(f''' 13 | CREATE SCHEMA IF NOT EXISTS curated; 14 | ''') 15 | 16 | for entity in CURATED_ENTITIES: 17 | curated_path = os.path.join(CURATED_DIR, DATA_SOURCE_NAME, entity.name, '*/*/*/*.parquet') 18 | 19 | conn.execute(f''' 20 | CREATE OR REPLACE view curated.{entity.name} AS 21 | SELECT * FROM parquet_scan('{curated_path}', HIVE_PARTITIONING=1) 22 | -- WHERE load_timestamp < '2022-07-01' 23 | ; 24 | ''') 25 | 26 | conn.close() 27 | 28 | 29 | if __name__ == "__main__": 30 | create_curated_views_in_dwh() 31 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_curate_job_descriptions_catch_up_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | with DAG('job_market_analytics_curate_job_descriptions_catch_up_dag', 12 | description='Job Market Analytics Curate Job Descriptions Catch Up DAG', 13 | schedule_interval='@daily', 14 | start_date=datetime(2022, 11, 1), 15 | end_date=datetime(2022, 11, 30), 16 | dagrun_timeout=timedelta(minutes=60), 17 | max_active_runs=4, 18 | max_active_tasks=4, 19 | catchup=True) as dag: 20 | @task(task_id="curate_job_descriptions") 21 | def curate_job_descriptions(): 22 | run_flasky_task('do/curate_job_descriptions') 23 | 24 | 25 | curate_job_descriptions() 26 | 
-------------------------------------------------------------------------------- /python/simplescraper/restore_day_backup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | if [[ $# -ne 3 ]] ; then 9 | echo "Please provide a date as script parameters in the following format: year month day" 10 | echo "Example: $0 2022 12 01" 11 | exit 1 12 | fi 13 | 14 | for entity in job_description sitemap 15 | do 16 | 17 | raw_day_dir=${RAW_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2/$3 18 | 19 | if [ -d "$raw_day_dir" ] 20 | then 21 | 22 | echo "The raw day dir is not empty: $raw_day_dir" 23 | 24 | else 25 | 26 | backup_day_dir=${BACKUP_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2 27 | backup_day_filename=${backup_day_dir}/${entity}.$1$2$3.tar.gz 28 | 29 | mkdir -p "$raw_day_dir" 30 | tar -xvzf "$backup_day_filename" -C "$raw_day_dir" 31 | 32 | echo "$1-$2-$3: Restored ${entity}" 33 | 34 | fi 35 | 36 | done 37 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_cleanse_job_descriptions_catch_up_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | with DAG('job_market_analytics_cleanse_job_descriptions_catch_up_dag', 12 | description='Job Market Analytics Cleanse Job Descriptions Catch Up DAG', 13 | schedule_interval='@daily', 14 | start_date=datetime(2022, 11, 1), 15 | end_date=datetime(2022, 12, 1), 16 | dagrun_timeout=timedelta(minutes=10), 17 | max_active_runs=1, 18 | max_active_tasks=1, 19 | catchup=True) as dag: 20 | 21 | @task(task_id="cleanse_job_descriptions") 22 | def cleanse_job_descriptions(): 23 | run_flasky_task('do/cleanse_job_descriptions') 24 | 25 | 26 | cleanse_job_descriptions() 27 | -------------------------------------------------------------------------------- /python/simplescraper/scrape_data_source.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from common.logging import configure_logger 4 | from common.storage import get_load_timestamp 5 | from tasks.download_job_descriptions import download_job_descriptions 6 | from tasks.download_sitemap import download_sitemap 7 | from tasks.list_downloaded_job_descriptions import list_downloaded_job_descriptions 8 | from tasks.list_job_descriptions_to_download import list_job_descriptions_to_download 9 | 10 | 11 | def scrape_data_source(load_timestamp): 12 | configure_logger(load_timestamp) 13 | df_downloaded = list_downloaded_job_descriptions(load_timestamp) 14 | df_sitemap = download_sitemap(load_timestamp) 15 | df_to_download = list_job_descriptions_to_download(load_timestamp, df_sitemap, df_downloaded) 16 | download_job_descriptions(load_timestamp, df_to_download) 17 | 18 | os.system('say -v Fiona b') 19 | 20 | 21 | if __name__ == "__main__": 22 | scrape_data_source(get_load_timestamp()) 23 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/fact_online_job.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | 
materialized='incremental' 4 | ) 5 | }} 6 | 7 | 8 | WITH new_fact_online_job AS ( 9 | SELECT online_job.online_at as date_key, 10 | online_job.online_at, 11 | online_job.job_id 12 | FROM {{ source('curated', 'online_job') }} 13 | 14 | {% if is_incremental() %} 15 | LEFT OUTER JOIN {{ this }} fact_online_job 16 | ON (online_job.online_at = fact_online_job.online_at AND 17 | online_job.job_id = fact_online_job.job_id) 18 | WHERE fact_online_job.job_id IS NULL 19 | {% endif %} 20 | ) 21 | SELECT new_fact_online_job.date_key as date_key, 22 | latest_dim_job.job_key as job_key, 23 | new_fact_online_job.online_at as online_at, 24 | latest_dim_job.job_id as job_id, 25 | latest_dim_job.job_ldts 26 | FROM new_fact_online_job 27 | INNER JOIN {{ ref('latest_dim_job') }} 28 | ON (new_fact_online_job.job_id = latest_dim_job.job_id) 29 | -------------------------------------------------------------------------------- /docker/postgres/postgres-parquet-fdw/s2-install-arrow.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb 4 | apt update 5 | apt install -y -V libarrow-dev # For C++ 6 | apt install -y -V libarrow-glib-dev # For GLib (C) 7 | apt install -y -V libarrow-dataset-dev # For Apache Arrow Dataset C++ 8 | apt install -y -V libarrow-flight-dev # For Apache Arrow Flight C++ 9 | # Notes for Plasma related packages: 10 | # * You need to enable "non-free" component on Debian GNU/Linux 11 | # * You need to enable "multiverse" component on Ubuntu 12 | # * You can use Plasma related packages only on amd64 13 | apt install -y -V libplasma-dev # For Plasma C++ 14 | apt install -y -V libplasma-glib-dev # For Plasma GLib (C) 15 | apt install -y -V libgandiva-dev # For Gandiva C++ 16 | apt install -y -V libgandiva-glib-dev # For Gandiva GLib (C) 17 | apt install -y -V libparquet-dev # For Apache Parquet C++ 18 | apt install -y -V libparquet-glib-dev # For Apache Parquet GLib (C) 19 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/dim_job_location.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental' 4 | ) 5 | }} 6 | 7 | 8 | SELECT MD5(CONCAT_WS('||', 9 | COALESCE( 10 | UPPER(TRIM(CAST( 11 | job_location.job_id 12 | AS VARCHAR))), 13 | '^^'), 14 | COALESCE( 15 | UPPER(TRIM(CAST( 16 | job_location.load_timestamp 17 | AS VARCHAR))), 18 | '^^') 19 | )) AS job_key, 20 | job_location.job_id, 21 | job_location.load_timestamp as job_ldts, 22 | job_location.location AS location_name 23 | FROM {{ source('curated', 'job_location') }} 24 | 25 | {% if is_incremental() %} 26 | LEFT OUTER JOIN dim_job_location 27 | ON (job_location.job_id = dim_job_location.job_id AND 28 | job_location.load_timestamp = dim_job_location.job_ldts AND 29 | job_location.location = dim_job_location.location_name) 30 | WHERE dim_job_location.job_id IS NULL 31 | {% endif %} 32 | -------------------------------------------------------------------------------- /python/simplescraper/verify_day_backup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | cd "$SCRIPTPATH" || exit 5 | 6 | source .env 7 | 8 | if [[ $# -ne 3 ]] ; then 9 | echo "Please provide a date as script parameters in the following format: year month day" 
10 | echo "Example: $0 2022 12 01" 11 | exit 1 12 | fi 13 | 14 | for entity in job_description sitemap 15 | do 16 | 17 | source=${RAW_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2/$3 18 | 19 | if [ -d "$source" ] 20 | then 21 | 22 | target_dir=${BACKUP_DIR}/${DATA_SOURCE_NAME}/${entity}/$1/$2 23 | target_filename=${target_dir}/${entity}.$1$2$3.tar.gz 24 | diff <(cd "$source" && find . | grep -E '.xml$|.html$' | sort) <(tar -tf "$target_filename" | grep -E '.xml$|.html$' | sort) 25 | error_code=$? 26 | if [ $error_code -ne 0 ]; 27 | then 28 | echo "$1-$2-$3: NOT OK" >&2 29 | exit 1 30 | fi 31 | 32 | else 33 | 34 | echo "$1-$2-$3: NOT FOUND ${entity}" 35 | 36 | fi 37 | 38 | done 39 | 40 | echo "$1-$2-$3: OK" 41 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_cleanse_catch_up_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | with DAG('job_market_analytics_cleanse_catch_up_dag', 12 | description='Job Market Analytics Cleanse Catch Up DAG', 13 | schedule_interval='@daily', 14 | start_date=datetime(2021, 12, 1), 15 | # end_date=datetime(2021, 12, 1), 16 | dagrun_timeout=timedelta(minutes=10), 17 | max_active_runs=1, 18 | max_active_tasks=1, 19 | catchup=True) as dag: 20 | @task(task_id="cleanse_sitemaps") 21 | def cleanse_sitemaps(): 22 | run_flasky_task('do/cleanse_sitemaps') 23 | 24 | 25 | @task(task_id="cleanse_job_descriptions") 26 | def cleanse_job_descriptions(): 27 | run_flasky_task('do/cleanse_job_descriptions') 28 | 29 | 30 | cleanse_sitemaps() 31 | cleanse_job_descriptions() 32 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/models/mart/dim_job_technology.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental' 4 | ) 5 | }} 6 | 7 | 8 | SELECT MD5(CONCAT_WS('||', 9 | COALESCE( 10 | UPPER(TRIM(CAST( 11 | job_technology.job_id 12 | AS VARCHAR))), 13 | '^^'), 14 | COALESCE( 15 | UPPER(TRIM(CAST( 16 | job_technology.load_timestamp 17 | AS VARCHAR))), 18 | '^^') 19 | )) AS job_key, 20 | job_technology.job_id, 21 | job_technology.load_timestamp as job_ldts, 22 | job_technology.technology AS technology_name 23 | FROM {{ source('curated', 'job_technology') }} 24 | 25 | {% if is_incremental() %} 26 | LEFT OUTER JOIN dim_job_technology 27 | ON (job_technology.job_id = dim_job_technology.job_id AND 28 | job_technology.load_timestamp = dim_job_technology.job_ldts AND 29 | job_technology.technology = dim_job_technology.technology_name) 30 | WHERE dim_job_technology.job_id IS NULL 31 | {% endif %} 32 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_curate_catch_up_dag_v2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | YEAR = 2021 12 | MONTH = 10 13 | DAY = 1 14 | 15 | with DAG('job_market_analytics_curate_catch_up_dag', 16 | description='Job Market 
Analytics Curate Catch Up DAG', 17 | schedule_interval='@daily', 18 | start_date=datetime(YEAR, MONTH, DAY), 19 | end_date=datetime(YEAR, MONTH, DAY) + timedelta(days=15), 20 | dagrun_timeout=timedelta(minutes=60), 21 | max_active_runs=2, 22 | max_active_tasks=2, 23 | catchup=True) as dag: 24 | @task(task_id="curate_sitemaps") 25 | def curate_sitemaps(): 26 | run_flasky_task('do/curate_sitemaps') 27 | 28 | 29 | @task(task_id="curate_job_descriptions") 30 | def curate_job_descriptions(): 31 | run_flasky_task('do/curate_job_descriptions') 32 | 33 | 34 | curate_sitemaps() 35 | curate_job_descriptions() 36 | -------------------------------------------------------------------------------- /python/simplescraper/common/env_variables.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | 5 | load_dotenv() 6 | 7 | DATA_DIR = os.getenv('DATA_DIR') 8 | RAW_DIR = os.getenv('RAW_DIR') 9 | CLEANSED_DIR = os.getenv('CLEANSED_DIR') 10 | CURATED_DIR = os.getenv('CURATED_DIR') 11 | DUCKDB_DWH_FILE = os.getenv('DUCKDB_DWH_FILE') 12 | TEMP_DIR = os.getenv('TEMP_DIR') 13 | BACKUP_DIR = os.getenv('BACKUP_DIR') 14 | SOURCE_DIR = os.getenv('SOURCE_DIR') 15 | 16 | DATA_SOURCE_NAME = os.getenv('DATA_SOURCE_NAME') 17 | DATA_SOURCE_URL = os.getenv('DATA_SOURCE_URL') 18 | 19 | SEMAPHORE_COUNT: int = int(os.getenv('SEMAPHORE_COUNT')) 20 | MAX_CHUNK_SIZE: int = int(os.getenv('MAX_CHUNK_SIZE')) 21 | MIN_TO_DOWNLOAD: int = int(os.getenv('MIN_TO_DOWNLOAD')) 22 | MAX_TO_DOWNLOAD: int = int(os.getenv('MAX_TO_DOWNLOAD')) 23 | ONLINE_EXPIRATION_IN_DAYS: int = int(os.getenv('ONLINE_EXPIRATION_IN_DAYS')) 24 | 25 | LATEST_LOAD_TIMESTAMP = os.getenv('LATEST_LOAD_TIMESTAMP') 26 | 27 | RUN_HEADLESS = os.getenv('RUN_HEADLESS') == 'True' 28 | 29 | UPLOAD_TO_AZURE = os.getenv('UPLOAD_TO_AZURE') == 'True' 30 | 31 | AZURE_STORAGE_CONNECTION_STRING = os.getenv('AZURE_STORAGE_CONNECTION_STRING') 32 | AZURE_STORAGE_CONTAINER_NAME = os.getenv('AZURE_STORAGE_CONTAINER_NAME') 33 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/curate_sitemaps.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pandas as pd 4 | 5 | from common.entity import SITEMAP, ONLINE_JOB 6 | from common.logging import configure_logger, logger 7 | from common.storage import get_load_timestamp, get_load_date, load_cleansed_df, save_curated_df 8 | from tasks.curate_job_descriptions import BASE_COLUMNS 9 | 10 | ONLINE_JOB_SAT_COLUMNS = ['online_at', 'url'] 11 | 12 | 13 | def curate_sitemaps(load_timestamp, load_date): 14 | configure_logger(load_timestamp) 15 | logger.info(f'Start curate_sitemaps: {load_timestamp} {load_date}') 16 | 17 | df = load_cleansed_df(SITEMAP, load_date=load_date) 18 | 19 | df = df.dropna(subset=['job_id']) 20 | df['job_id'] = df['job_id'].astype('int') 21 | df['online_at'] = pd.to_datetime(df['load_timestamp']).dt.date 22 | df = df[BASE_COLUMNS + ONLINE_JOB_SAT_COLUMNS] 23 | df = df.sort_values(by=['job_id']) 24 | 25 | save_curated_df(df, ONLINE_JOB) 26 | logger.info(f'End curate_sitemaps: {load_timestamp} {load_date}') 27 | 28 | 29 | if __name__ == "__main__": 30 | _load_timestamp = sys.argv[1] if len(sys.argv) > 1 else get_load_timestamp() 31 | _load_date = sys.argv[2] if len(sys.argv) > 2 else get_load_date() 32 | curate_sitemaps(_load_timestamp, _load_date) 33 | 
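# When run directly, the optional CLI arguments override the defaults:
#   argv[1] = load_timestamp, e.g. '2022/01/22/12-49-39' (the format asserted in test_get_run_timestamp.py)
#   argv[2] = load_date of the cleansed partition to curate, as returned by get_load_date()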
-------------------------------------------------------------------------------- /docker/postgres/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | # Inspired by https://github.com/khezen/compose-postgres/blob/master/docker-compose.yml 4 | services: 5 | postgres: 6 | build: 7 | context: postgres-parquet-fdw 8 | target: postgres-parquet-fdw 9 | environment: 10 | POSTGRES_USER: ${POSTGRES_USER} 11 | POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} 12 | POSTGRES_DB: ${POSTGRES_DB} 13 | networks: 14 | - postgres 15 | restart: always 16 | env_file: .env 17 | logging: 18 | options: 19 | max-size: 10m 20 | max-file: "3" 21 | ports: 22 | - '5432:5432' 23 | volumes: 24 | - ${POSTGRES_VOLUME}:/var/lib/postgresql/data 25 | - ${POSTGRES_PARQUET_FDW_VOLUME}:/var/lib/parquet-fdw/data 26 | # pgadmin: 27 | # image: dpage/pgadmin4 28 | # environment: 29 | # PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL} 30 | # PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD} 31 | # PGADMIN_CONFIG_SERVER_MODE: 'False' 32 | # ports: 33 | # - '2345:80' 34 | # volumes: 35 | # - ${PGADMIN_VOLUME}:/var/lib/pgadmin 36 | # networks: 37 | # - postgres 38 | # restart: always 39 | # depends_on: 40 | # - "postgres" 41 | 42 | networks: 43 | postgres: 44 | driver: bridge 45 | -------------------------------------------------------------------------------- /python/simplescraper/verify_all_backups.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pandas as pd 4 | 5 | from common.entity import RAW_ENTITIES 6 | from common.env_variables import DATA_SOURCE_NAME, SOURCE_DIR 7 | from common.storage import list_raw_days 8 | 9 | 10 | def get_current_date(): 11 | return datetime.datetime.today().strftime('%Y%m%d') 12 | 13 | 14 | def list_missing_previous_dates(entity): 15 | df = pd.DataFrame(list_raw_days(DATA_SOURCE_NAME, entity)) 16 | df_current_date = pd.DataFrame([{ 17 | 'date': get_current_date() 18 | }]) 19 | df = df.drop_duplicates() 20 | df = pd.concat([ 21 | df, 22 | df_current_date, df_current_date 23 | ]).drop_duplicates(keep=False) 24 | return df 25 | 26 | 27 | def verify_backups(): 28 | dfs = [] 29 | for entity in RAW_ENTITIES: 30 | df = list_missing_previous_dates(entity) 31 | dfs.append(df) 32 | df = pd.concat(dfs, ignore_index=True) 33 | df = df.drop_duplicates() 34 | df = df.sort_values(by=['date']) 35 | dates_to_download = df['date'].to_list() 36 | for date_to_download in dates_to_download: 37 | year = date_to_download[:4] 38 | month = date_to_download[4:6] 39 | day = date_to_download[6:8] 40 | print( 41 | f'/bin/zsh {SOURCE_DIR}/simplescraper/verify_day_backup.sh {year} {month} {day} || exit 1') 42 | 43 | 44 | if __name__ == "__main__": 45 | verify_backups() 46 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/list_job_descriptions_to_download.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from common.env_variables import LATEST_LOAD_TIMESTAMP 4 | from common.logging import logger, configure_logger 5 | from common.storage import load_temp_df, DOWNLOADED_JOB_DESCRIPTIONS_CSV, SITEMAP_URLS_CSV, save_temp_df, \ 6 | JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV 7 | 8 | 9 | def list_job_descriptions_to_download(load_timestamp, df_sitemap_urls=None, df_downloaded=None): 10 | configure_logger(load_timestamp) 11 | logger.info('list_job_descriptions_to_download: start') 12 | 13 
| df_sitemap_urls = df_sitemap_urls or load_temp_df(load_timestamp, SITEMAP_URLS_CSV) 14 | df_downloaded = df_downloaded or load_temp_df(load_timestamp, DOWNLOADED_JOB_DESCRIPTIONS_CSV) 15 | 16 | df_downloaded = df_downloaded[['id']] 17 | df_downloaded = df_downloaded.drop_duplicates() 18 | df = df_sitemap_urls[['id']] 19 | df = df.drop_duplicates() 20 | df = pd.concat([df, df_downloaded, df_downloaded]).drop_duplicates(keep=False) 21 | df = df.merge(df_sitemap_urls) 22 | df = df[['url']] 23 | total_count = df.shape[0] 24 | 25 | save_temp_df(df, load_timestamp, JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV) 26 | logger.success(f'URLs to download: {total_count}') 27 | logger.info('list_job_descriptions_to_download: end') 28 | return df 29 | 30 | 31 | if __name__ == "__main__": 32 | list_job_descriptions_to_download(LATEST_LOAD_TIMESTAMP) 33 | -------------------------------------------------------------------------------- /sql/dwh/job_market_analytics/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'job_market_analytics' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'job_market_analytics' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `model-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 15 | model-paths: [ "models" ] 16 | analysis-paths: [ "analyses" ] 17 | test-paths: [ "tests" ] 18 | seed-paths: [ "seeds" ] 19 | macro-paths: [ "macros" ] 20 | snapshot-paths: [ "snapshots" ] 21 | 22 | target-path: "target" # directory which will store compiled SQL files 23 | clean-targets: # directories to be removed by `dbt clean` 24 | - "target" 25 | - "dbt_packages" 26 | 27 | 28 | # Configuring models 29 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 30 | 31 | # In this example config, we tell dbt to build all models in the example/ directory 32 | # as tables. These settings can be overridden in the individual model files 33 | # using the `{{ config(...) }}` macro. 
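# Individual models can override the project-level default set below; for example,
# the mart/dim_job_technology.sql model above sets materialized='incremental'
# through its {{ config(...) }} block.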
34 | models: 35 | job_market_analytics: 36 | # Config indicated by + and applies to all files under models/example/ 37 | mart: 38 | +materialized: view 39 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/prune_old_raw.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import shutil 4 | import sys 5 | 6 | from common.entity import RAW_ENTITIES 7 | from common.env_variables import RAW_DIR, DATA_SOURCE_NAME 8 | from common.logging import configure_logger, logger 9 | from common.storage import get_load_timestamp, get_load_date, LOAD_DATE_FORMAT 10 | 11 | SEVEN_MONTHS_IN_DAYS = 7 * 30 12 | 13 | 14 | def prune_old_raw(load_timestamp, load_date): 15 | configure_logger(load_timestamp) 16 | logger.info(f'Start prune_old_raw: {load_date}') 17 | date_to_remove = datetime.datetime.strptime(load_date, LOAD_DATE_FORMAT).date() 18 | date_to_remove = date_to_remove - datetime.timedelta(days=SEVEN_MONTHS_IN_DAYS) 19 | date_to_remove = date_to_remove.strftime(LOAD_DATE_FORMAT) 20 | year, month, day = date_to_remove.split('/', 2) 21 | for entity in RAW_ENTITIES: 22 | folder_to_remove = f'{RAW_DIR}/{DATA_SOURCE_NAME}/{entity}/{year}/{month}/{day}' 23 | if os.path.exists(folder_to_remove) and os.path.isdir(folder_to_remove): 24 | logger.success(f'Removing {folder_to_remove}') 25 | shutil.rmtree(folder_to_remove) 26 | else: 27 | logger.warning(f'No folder to remove on {folder_to_remove}') 28 | 29 | logger.info(f'End prune_old_raw: {load_date}') 30 | 31 | 32 | if __name__ == "__main__": 33 | _load_timestamp = sys.argv[1] if len(sys.argv) > 1 else get_load_timestamp() 34 | _load_date = sys.argv[2] if len(sys.argv) > 2 else get_load_date() 35 | prune_old_raw(_load_timestamp, _load_date) 36 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/list_downloaded_job_descriptions.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import functools 3 | 4 | import pandas as pd 5 | 6 | from common.entity import JOB_DESCRIPTION 7 | from common.env_variables import LATEST_LOAD_TIMESTAMP, ONLINE_EXPIRATION_IN_DAYS 8 | from common.logging import logger, configure_logger 9 | from common.storage import DOWNLOADED_JOB_DESCRIPTIONS_CSV, DATA_SOURCE_NAME, save_temp_df, list_raw_files 10 | 11 | 12 | @functools.lru_cache(maxsize=1024) 13 | def calculate_days_online(load_timestamp): 14 | ingestion_datetime = datetime.datetime.strptime(load_timestamp, '%Y/%m/%d/%H-%M-%S') 15 | now = datetime.datetime.now() 16 | delta = now - ingestion_datetime 17 | return delta.days 18 | 19 | 20 | def list_downloaded_job_descriptions(load_timestamp, load_date=None) -> pd.DataFrame: 21 | configure_logger(load_timestamp) 22 | logger.info('list_downloaded_job_descriptions start') 23 | files = list_raw_files(DATA_SOURCE_NAME, JOB_DESCRIPTION, load_date) 24 | df = pd.DataFrame(files) 25 | if not df.empty: 26 | df['id'] = df['file_name'].str.split('.', expand=True)[0] 27 | if ONLINE_EXPIRATION_IN_DAYS: 28 | df['days_online'] = df['load_timestamp'].map(calculate_days_online) 29 | df = df[df['days_online'] < ONLINE_EXPIRATION_IN_DAYS] 30 | df = df.drop(columns=['days_online']) 31 | if load_date is None: 32 | save_temp_df(df, load_timestamp, DOWNLOADED_JOB_DESCRIPTIONS_CSV) 33 | logger.info('list_downloaded_job_descriptions end') 34 | return df 35 | 36 | 37 | if __name__ == "__main__": 38 | 
list_downloaded_job_descriptions(LATEST_LOAD_TIMESTAMP) 39 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_hourly_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | from itertools import chain 4 | 5 | from airflow import DAG 6 | from airflow.decorators import task 7 | from airflow.providers.http.hooks.http import HttpHook 8 | 9 | from common_airflow_dag import run_flasky_task 10 | 11 | os.environ["no_proxy"] = "*" 12 | 13 | with DAG('job_market_analytics_hourly_dag', 14 | description='Job Market Analytics Hourly DAG', 15 | schedule_interval='@hourly', 16 | start_date=datetime(2022, 1, 1), 17 | dagrun_timeout=timedelta(minutes=60), 18 | max_active_runs=1, 19 | catchup=False) as dag: 20 | @task(task_id="check_vpn_status") 21 | def check_vpn_status(): 22 | HttpHook(method='GET').run('do/check_vpn_status') 23 | 24 | 25 | @task(task_id="list_downloaded_job_descriptions") 26 | def list_downloaded_job_descriptions(): 27 | run_flasky_task('do/list_downloaded_job_descriptions') 28 | 29 | 30 | @task(task_id="download_sitemap", retries=1) 31 | def download_sitemap(): 32 | run_flasky_task('do/download_sitemap') 33 | 34 | 35 | @task(task_id="list_job_descriptions_to_download") 36 | def list_job_descriptions_to_download(): 37 | run_flasky_task('do/list_job_descriptions_to_download') 38 | 39 | 40 | @task(task_id="download_job_descriptions") 41 | def download_job_descriptions(): 42 | run_flasky_task('do/download_job_descriptions') 43 | 44 | 45 | chain(check_vpn_status() >> [list_downloaded_job_descriptions(), 46 | download_sitemap()] >> \ 47 | list_job_descriptions_to_download() >> download_job_descriptions()) 48 | -------------------------------------------------------------------------------- /python/utils/migrate_to_raw_v3.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | import pandas as pd 6 | 7 | from common.env_variables import LATEST_LOAD_TIMESTAMP, RAW_DIR 8 | from common.storage import DATA_SOURCE_NAME, save_temp_df, load_temp_df 9 | 10 | 11 | def list_raw_files(data_source, entity): 12 | dir_path = os.path.join(RAW_DIR, data_source, entity) 13 | file_list = [{ 14 | 'old_file_path': f, 15 | } for f in glob.iglob(dir_path + '/*/*/*/*/*', recursive=True) if os.path.isfile(f)] 16 | return file_list 17 | 18 | 19 | def list_downloaded_files(load_timestamp) -> pd.DataFrame: 20 | files = list_raw_files(DATA_SOURCE_NAME, 'job_description') 21 | df = pd.DataFrame(files) 22 | save_temp_df(df, load_timestamp, '00_downloaded_raw_job_descriptions.csv') 23 | return df 24 | 25 | 26 | def get_new_file_path(row): 27 | old_file_path = row['old_file_path'] 28 | dirname = os.path.dirname(old_file_path) 29 | basename = os.path.basename(old_file_path) 30 | job_id = basename.rsplit('--', 1) 31 | job_id = job_id[1] 32 | job_id = job_id.split('-') 33 | job_id = job_id[0] 34 | new_file_path = os.path.join(dirname.replace('/raw/', '/raw_v3/'), f'{job_id}.html') 35 | return new_file_path 36 | 37 | 38 | def copy_file(row): 39 | src = row['old_file_path'] 40 | dst = row['new_file_path'] 41 | os.makedirs(os.path.dirname(dst), exist_ok=True) 42 | shutil.copy2(src, dst) 43 | 44 | 45 | def copy_files_to_raw_v2(load_timestamp): 46 | df = load_temp_df(load_timestamp, '00_downloaded_raw_job_descriptions.csv') 47 | df['new_file_path'] = 
df.apply(get_new_file_path, axis=1) 48 | df.apply(copy_file, axis=1) 49 | 50 | 51 | if __name__ == "__main__": 52 | copy_files_to_raw_v2(LATEST_LOAD_TIMESTAMP) 53 | -------------------------------------------------------------------------------- /python/simplescraper/restore_all_backups.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pandas as pd 4 | 5 | from common.entity import RAW_ENTITIES 6 | from common.env_variables import DATA_SOURCE_NAME, SOURCE_DIR 7 | from common.storage import list_raw_days, list_backup_days 8 | 9 | 10 | def get_current_date(): 11 | return datetime.datetime.today().strftime('%Y%m%d') 12 | 13 | 14 | def list_backups_to_restore(entity): 15 | df = pd.DataFrame(list_backup_days(DATA_SOURCE_NAME, entity)) 16 | df_in_raw = pd.DataFrame(list_raw_days(DATA_SOURCE_NAME, entity)) 17 | df_current_date = pd.DataFrame([{ 18 | 'date': get_current_date() 19 | }]) 20 | df = df.drop_duplicates() 21 | df = pd.concat([ 22 | df, 23 | df_in_raw, df_in_raw, 24 | df_current_date, df_current_date 25 | ]).drop_duplicates(keep=False) 26 | return df 27 | 28 | 29 | def print_script_statements(script_name, days_to_restore): 30 | for day_to_restore in days_to_restore: 31 | year = day_to_restore[:4] 32 | month = day_to_restore[4:6] 33 | day = day_to_restore[6:8] 34 | print( 35 | f'/bin/zsh {SOURCE_DIR}/simplescraper/{script_name} {year} {month} {day} || exit 1') 36 | 37 | 38 | def restore_all_backups(): 39 | dfs = [] 40 | for entity in RAW_ENTITIES: 41 | df = list_backups_to_restore(entity) 42 | dfs.append(df) 43 | df = pd.concat(dfs, ignore_index=True) 44 | df = df.drop_duplicates() 45 | df = df.sort_values(by=['date']) 46 | days_to_restore = df['date'].to_list() 47 | print_script_statements('restore_day_backup.sh', days_to_restore) 48 | print() 49 | print_script_statements('verify_day_backup.sh', days_to_restore) 50 | 51 | 52 | if __name__ == "__main__": 53 | restore_all_backups() 54 | -------------------------------------------------------------------------------- /python/simplescraper/do_all_backups.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pandas as pd 4 | 5 | from common.entity import RAW_ENTITIES 6 | from common.env_variables import DATA_SOURCE_NAME, SOURCE_DIR 7 | from common.storage import list_raw_days, list_backup_days 8 | 9 | 10 | def get_current_date(): 11 | return datetime.datetime.today().strftime('%Y%m%d') 12 | 13 | 14 | def list_days_to_backup(entity): 15 | df = pd.DataFrame(list_raw_days(DATA_SOURCE_NAME, entity)) 16 | df_backup_days = pd.DataFrame(list_backup_days(DATA_SOURCE_NAME, entity)) 17 | df_current_date = pd.DataFrame([{ 18 | 'date': get_current_date() 19 | }]) 20 | df = df.drop_duplicates() 21 | df = pd.concat([ 22 | df, 23 | df_backup_days, df_backup_days, 24 | df_current_date, df_current_date 25 | ]).drop_duplicates(keep=False) 26 | return df 27 | 28 | 29 | def print_script_statements(script_name, dates_to_download): 30 | for date_to_download in dates_to_download: 31 | year = date_to_download[:4] 32 | month = date_to_download[4:6] 33 | day = date_to_download[6:8] 34 | print( 35 | f'/bin/zsh {SOURCE_DIR}/simplescraper/{script_name} {year} {month} {day} || exit 1') 36 | 37 | 38 | def do_all_backups(): 39 | dfs = [] 40 | for entity in RAW_ENTITIES: 41 | df = list_days_to_backup(entity) 42 | dfs.append(df) 43 | df = pd.concat(dfs, ignore_index=True) 44 | df = df.drop_duplicates() 45 | df = df.sort_values(by=['date']) 46 
| dates_to_download = df['date'].to_list() 47 | print_script_statements('do_day_backup.sh', dates_to_download) 48 | print() 49 | print_script_statements('verify_day_backup.sh', dates_to_download) 50 | 51 | 52 | if __name__ == "__main__": 53 | do_all_backups() 54 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_daily_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | with DAG('job_market_analytics_daily_dag', 12 | description='Job Market Analytics Daily DAG', 13 | schedule_interval='@daily', 14 | start_date=datetime(2022, 1, 1), 15 | dagrun_timeout=timedelta(minutes=60), 16 | max_active_runs=1, 17 | catchup=True) as dag: 18 | @task(task_id="cleanse_sitemaps") 19 | def cleanse_sitemaps(): 20 | run_flasky_task('do/cleanse_sitemaps') 21 | 22 | 23 | @task(task_id="cleanse_job_descriptions") 24 | def cleanse_job_descriptions(): 25 | run_flasky_task('do/cleanse_job_descriptions') 26 | 27 | 28 | @task(task_id="curate_sitemaps") 29 | def curate_sitemaps(): 30 | run_flasky_task('do/curate_sitemaps') 31 | 32 | 33 | @task(task_id="curate_job_descriptions") 34 | def curate_job_descriptions(): 35 | run_flasky_task('do/curate_job_descriptions') 36 | 37 | 38 | @task(task_id="do_dbt_run") 39 | def dbt_run(): 40 | run_flasky_task('do/do_dbt_run') 41 | 42 | 43 | @task(task_id="do_day_backup") 44 | def backup_day(): 45 | run_flasky_task('do/do_day_backup') 46 | 47 | 48 | @task(task_id="verify_day_backup") 49 | def verify_day_backup(): 50 | run_flasky_task('do/verify_day_backup') 51 | 52 | 53 | @task(task_id="prune_old_raw") 54 | def prune_old_raw(): 55 | run_flasky_task('do/prune_old_raw') 56 | 57 | 58 | t_curate_sitemaps = curate_sitemaps() 59 | t_curate_job_descriptions = curate_job_descriptions() 60 | 61 | cleanse_sitemaps() >> t_curate_sitemaps 62 | cleanse_job_descriptions() >> t_curate_job_descriptions 63 | 64 | [t_curate_sitemaps, t_curate_job_descriptions] >> dbt_run() 65 | 66 | backup_day() >> verify_day_backup() >> prune_old_raw() 67 | -------------------------------------------------------------------------------- /python/airflow/airflow_home/dags/job_market_analytics_daily_dag_catch_up.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from airflow import DAG 5 | from airflow.decorators import task 6 | 7 | from common_airflow_dag import run_flasky_task 8 | 9 | os.environ["no_proxy"] = "*" 10 | 11 | with DAG('job_market_analytics_daily_catch_up_dag', 12 | description='Job Market Analytics Daily Catch UP DAG', 13 | schedule_interval='@daily', 14 | start_date=datetime(2023, 5, 24), 15 | dagrun_timeout=timedelta(minutes=60), 16 | max_active_runs=1, 17 | max_active_tasks=1, 18 | catchup=True) as dag: 19 | @task(task_id="cleanse_sitemaps") 20 | def cleanse_sitemaps(): 21 | run_flasky_task('do/cleanse_sitemaps') 22 | 23 | 24 | @task(task_id="cleanse_job_descriptions") 25 | def cleanse_job_descriptions(): 26 | run_flasky_task('do/cleanse_job_descriptions') 27 | 28 | 29 | @task(task_id="curate_sitemaps") 30 | def curate_sitemaps(): 31 | run_flasky_task('do/curate_sitemaps') 32 | 33 | 34 | @task(task_id="curate_job_descriptions") 35 | def 
curate_job_descriptions(): 36 | run_flasky_task('do/curate_job_descriptions') 37 | 38 | 39 | @task(task_id="do_dbt_run") 40 | def dbt_run(): 41 | run_flasky_task('do/do_dbt_run') 42 | 43 | 44 | @task(task_id="do_day_backup") 45 | def backup_day(): 46 | run_flasky_task('do/do_day_backup') 47 | 48 | 49 | @task(task_id="verify_day_backup") 50 | def verify_day_backup(): 51 | run_flasky_task('do/verify_day_backup') 52 | 53 | 54 | @task(task_id="prune_old_raw") 55 | def prune_old_raw(): 56 | run_flasky_task('do/prune_old_raw') 57 | 58 | 59 | t_curate_sitemaps = curate_sitemaps() 60 | t_curate_job_descriptions = curate_job_descriptions() 61 | 62 | cleanse_sitemaps() >> t_curate_sitemaps 63 | cleanse_job_descriptions() >> t_curate_job_descriptions 64 | 65 | [t_curate_sitemaps, t_curate_job_descriptions] >> dbt_run() 66 | 67 | backup_day() >> verify_day_backup() >> prune_old_raw() 68 | -------------------------------------------------------------------------------- /doc/metaData-bag.log: -------------------------------------------------------------------------------- 1 | 2022-05-08 21:29:11.685 | DEBUG | __main__:load_and_parse:27 - Parsing (96/213) 2343/3437: 2022/04/20/09-00-00/8205291.html 2 | Traceback (most recent call last): 3 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/parse_job_descriptions.py", line 70, in 4 | parse_job_descriptions() 5 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/parse_job_descriptions.py", line 53, in parse_job_descriptions 6 | df['parsed_content'] = df.apply(load_and_parse, axis=1) 7 | File "/Users/carrion/PycharmProjects/job-market-analytics/venv/lib/python3.8/site-packages/pandas/core/frame.py", line 8740, in apply 8 | return op.apply() 9 | File "/Users/carrion/PycharmProjects/job-market-analytics/venv/lib/python3.8/site-packages/pandas/core/apply.py", line 688, in apply 10 | return self.apply_standard() 11 | File "/Users/carrion/PycharmProjects/job-market-analytics/venv/lib/python3.8/site-packages/pandas/core/apply.py", line 812, in apply_standard 12 | results, res_index = self.apply_series_generator() 13 | File "/Users/carrion/PycharmProjects/job-market-analytics/venv/lib/python3.8/site-packages/pandas/core/apply.py", line 828, in apply_series_generator 14 | results[i] = self.f(v) 15 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/parse_job_descriptions.py", line 28, in load_and_parse 16 | parsed_content = parse_job_description(html_content) 17 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/tasks/parse_job_description.py", line 55, in parse_job_description 18 | job_description = extract_metadata(soup) 19 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/tasks/parse_job_description.py", line 46, in extract_metadata 20 | metadata = flatten_metadata(metadata) 21 | File "/Users/carrion/PycharmProjects/job-market-analytics/python/simplescraper/tasks/parse_job_description.py", line 24, in flatten_metadata 22 | temp_metadata = flatten.pop('metaData') 23 | KeyError: 'metaData' 24 | 25 | Process finished with exit code 1 26 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/cleanse_sitemaps.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import bs4 4 | import pandas as pd 5 | from loguru import logger 6 | 7 | from common.entity import SITEMAP 8 | from common.logging import configure_logger 
9 | from common.storage import get_load_timestamp, load_raw_file, save_cleansed_df, get_load_date, LOAD_TIMESTAMP_FORMAT 10 | from tasks.list_downloaded_sitemaps import list_downloaded_sitemaps 11 | 12 | 13 | def load_and_parse(row): 14 | load_timestamp = row['load_timestamp'] 15 | file_name = row['file_name'] 16 | sitemap_content = load_raw_file(SITEMAP, load_timestamp, file_name) 17 | logger.debug(f'Parsing: {load_timestamp}/{file_name}') 18 | soup = bs4.BeautifulSoup(sitemap_content, 'xml') 19 | urls = [loc.text for loc in soup.findAll('loc')] 20 | return urls 21 | 22 | 23 | def extract_job_id(url_column): 24 | url_split = url_column.str.split('--', expand=True) 25 | return url_split[2].str.split('-', expand=True)[0] 26 | 27 | 28 | def get_date_from_load_timestamp(load_timestamp): 29 | year, month, day, time = load_timestamp.split('/') 30 | return f'{year}-{month}-{day}' 31 | 32 | 33 | def cleanse_sitemaps(load_timestamp, load_date): 34 | configure_logger(load_timestamp) 35 | df = list_downloaded_sitemaps(load_timestamp, load_date) 36 | df[['year', 'month', 'day', 'time']] = df['load_timestamp'].str.split('/', 3, expand=True) 37 | if df.empty: 38 | logger.info('Nothing to parse') 39 | return 40 | df = df.sort_values(by=['load_timestamp', 'file_name']) 41 | df['url'] = df.apply(load_and_parse, axis=1) 42 | df = df.explode('url') 43 | df['job_id'] = extract_job_id(df['url']) 44 | df = df.drop_duplicates(['job_id'], keep='first') 45 | df['load_timestamp'] = pd.to_datetime(df['load_timestamp'], format=LOAD_TIMESTAMP_FORMAT, utc=True) 46 | logger.info(f'Saving cleansed: {df["load_timestamp"].iloc[0]}') 47 | save_cleansed_df(df, SITEMAP) 48 | 49 | 50 | if __name__ == "__main__": 51 | _load_timestamp = sys.argv[1] if len(sys.argv) > 1 else get_load_timestamp() 52 | _load_date = sys.argv[2] if len(sys.argv) > 2 else get_load_date() 53 | cleanse_sitemaps(_load_timestamp, _load_date) 54 | -------------------------------------------------------------------------------- /python/utils/migrate_raw_v1_to_raw_v2.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import glob 3 | import os 4 | import shutil 5 | 6 | import pandas as pd 7 | 8 | from common.env_variables import LATEST_LOAD_TIMESTAMP, RAW_DIR, DATA_DIR 9 | from common.storage import DATA_SOURCE_NAME, save_temp_df, load_temp_df 10 | 11 | 12 | def list_raw_files(data_source): 13 | dir_path = os.path.join(RAW_DIR, data_source) 14 | file_list = [{ 15 | 'old_file_path': f, 16 | 'entity': f.split('/')[-3], 17 | 'timestamp': datetime.datetime.fromtimestamp(os.stat(f).st_birthtime), 18 | 'file_name': f.split('/')[-1], 19 | } for f in glob.iglob(dir_path + '/*/*/*', recursive=True) if os.path.isfile(f)] 20 | return file_list 21 | 22 | 23 | def list_downloaded_files(load_timestamp) -> pd.DataFrame: 24 | files = list_raw_files(DATA_SOURCE_NAME) 25 | df = pd.DataFrame(files) 26 | # df = df[df['file_name'] != 'sitemapindex.xml'] 27 | save_temp_df(df, load_timestamp, '00_downloaded_raw_files.csv') 28 | return df 29 | 30 | 31 | def timestamp_to_datatime_partition(timestamp): 32 | timestamp = str(timestamp) 33 | split1, split2 = timestamp.split() 34 | year, month, day = split1.split('-') 35 | hour = split2[:2] 36 | datatime_partition = f'{year}/{month}/{day}/{hour}-00-00' 37 | return datatime_partition 38 | 39 | 40 | def get_new_file_path(row): 41 | new_file_path = os.path.join(DATA_DIR, 'raw_v2', DATA_SOURCE_NAME, row['entity'], row['datatime_partition'], 42 | row['file_name']) 43 | return 
new_file_path 44 | 45 | 46 | def copy_file(row): 47 | src = row['old_file_path'] 48 | dst = row['new_file_path'] 49 | os.makedirs(os.path.dirname(dst), exist_ok=True) 50 | shutil.copy2(src, dst) 51 | 52 | 53 | def copy_files_to_raw_v2(load_timestamp): 54 | df = load_temp_df(load_timestamp, '00_downloaded_raw_files.csv') 55 | df['datatime_partition'] = df['timestamp'].apply(timestamp_to_datatime_partition) 56 | df['new_file_path'] = df.apply(get_new_file_path, axis=1) 57 | df.apply(copy_file, axis=1) 58 | 59 | 60 | if __name__ == "__main__": 61 | list_downloaded_files(LATEST_LOAD_TIMESTAMP) 62 | copy_files_to_raw_v2(LATEST_LOAD_TIMESTAMP) 63 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/cleanse_job_descriptions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pandas as pd 4 | 5 | from common.entity import JOB_DESCRIPTION 6 | from common.logging import logger, configure_logger 7 | from common.storage import get_load_timestamp, load_raw_file, save_cleansed_df, get_load_date, LOAD_TIMESTAMP_FORMAT 8 | from tasks.list_downloaded_job_descriptions import list_downloaded_job_descriptions 9 | from tasks.parse_job_description import parse_job_description 10 | 11 | 12 | def load_and_parse(row) -> str: 13 | load_timestamp = row['load_timestamp'] 14 | file_name = row['file_name'] 15 | html_content = load_raw_file(JOB_DESCRIPTION, load_timestamp, file_name) 16 | try: 17 | logger.debug(f'Parsing {load_timestamp}/{file_name}') 18 | parsed_content = parse_job_description(html_content) 19 | return parsed_content 20 | except AttributeError: 21 | logger.warning(f'The following file could not be parsed: {load_timestamp}/{file_name}') 22 | return '' 23 | 24 | 25 | def cleanse_job_descriptions(load_timestamp, load_date): 26 | configure_logger(load_timestamp) 27 | df = list_downloaded_job_descriptions(load_timestamp, load_date) 28 | if df.empty: 29 | logger.warning(f'Nothing to cleanse for the load date: {load_date}') 30 | return 31 | df = df.sort_values(by=['load_timestamp', 'file_name']) 32 | df = df.reset_index(drop=True) 33 | logger.info(f'Start to parse job descriptions for the load date: {load_date}') 34 | df['parsed_content'] = df.apply(load_and_parse, axis=1) 35 | df = df.join(pd.json_normalize(df['parsed_content'])) 36 | df = df.drop(columns=['parsed_content']) 37 | df[['year', 'month', 'day', 'hour']] = df['load_timestamp'].str.split('/', 3, expand=True) 38 | df['load_timestamp'] = pd.to_datetime(df['load_timestamp'], format=LOAD_TIMESTAMP_FORMAT, utc=True) 39 | logger.info(f'Finish to parse job descriptions for the load date: {load_date}') 40 | save_cleansed_df(df, JOB_DESCRIPTION) 41 | 42 | 43 | if __name__ == "__main__": 44 | _load_timestamp = sys.argv[1] if len(sys.argv) > 1 else get_load_timestamp() 45 | _load_date = sys.argv[2] if len(sys.argv) > 2 else get_load_date() 46 | cleanse_job_descriptions(_load_timestamp, _load_date) 47 | -------------------------------------------------------------------------------- /sql/dwh/requirements.txt: -------------------------------------------------------------------------------- 1 | agate==1.6.3 2 | appnope==0.1.3 3 | argon2-cffi==21.3.0 4 | argon2-cffi-bindings==21.2.0 5 | asttokens==2.0.8 6 | attrs==22.1.0 7 | Babel==2.10.3 8 | backcall==0.2.0 9 | beautifulsoup4==4.11.1 10 | bleach==5.0.1 11 | certifi==2022.9.14 12 | cffi==1.15.1 13 | charset-normalizer==2.1.1 14 | click==8.1.3 15 | colorama==0.4.4 16 | dbt-core==1.5.0 17 | 
dbt-duckdb==1.5.1 18 | dbt-extractor==0.4.1 19 | debugpy==1.6.3 20 | decorator==5.1.1 21 | defusedxml==0.7.1 22 | duckdb==0.7.0 23 | entrypoints==0.4 24 | executing==1.0.0 25 | fastjsonschema==2.16.1 26 | future==0.18.2 27 | hologram==0.0.15 28 | idna==3.4 29 | ipykernel==6.15.3 30 | ipython==8.5.0 31 | ipython-genutils==0.2.0 32 | ipywidgets==8.0.2 33 | isodate==0.6.1 34 | jedi==0.18.1 35 | Jinja2==3.1.2 36 | jsonschema==3.2.0 37 | jupyter==1.0.0 38 | jupyter-console==6.4.4 39 | jupyter-core==4.11.1 40 | jupyter_client==7.3.5 41 | jupyterlab-pygments==0.2.2 42 | jupyterlab-widgets==3.0.3 43 | leather==0.3.4 44 | Logbook==1.5.3 45 | MarkupSafe==2.0.1 46 | mashumaro==3.6 47 | matplotlib-inline==0.1.6 48 | minimal-snowplow-tracker==0.0.2 49 | mistune==0.8.4 50 | msgpack==1.0.4 51 | nbclient==0.5.13 52 | nbconvert==6.4.5 53 | nbformat==5.5.0 54 | nest-asyncio==1.5.5 55 | networkx==2.8.3 56 | notebook==6.4.12 57 | numpy==1.23.3 58 | packaging==21.3 59 | pandas==1.4.4 60 | pandocfilters==1.5.0 61 | parsedatetime==2.4 62 | parso==0.8.3 63 | pathspec==0.9.0 64 | patsy==0.5.2 65 | pexpect==4.8.0 66 | pickleshare==0.7.5 67 | plotly==5.10.0 68 | plotly-calplot==0.1.12 69 | plotly-express==0.4.1 70 | prometheus-client==0.14.1 71 | prompt-toolkit==3.0.31 72 | protobuf==4.23.1 73 | psutil==5.9.2 74 | ptyprocess==0.7.0 75 | pure-eval==0.2.2 76 | pycparser==2.21 77 | Pygments==2.13.0 78 | pyparsing==3.0.9 79 | pyrsistent==0.18.1 80 | python-dateutil==2.8.2 81 | python-dotenv==0.21.0 82 | python-slugify==6.1.2 83 | pytimeparse==1.1.8 84 | pytz==2022.2.1 85 | PyYAML==6.0 86 | pyzmq==24.0.0 87 | qtconsole==5.3.2 88 | QtPy==2.2.0 89 | requests==2.28.1 90 | scipy==1.9.1 91 | Send2Trash==1.8.0 92 | six==1.16.0 93 | soupsieve==2.3.2.post1 94 | sqlparse==0.4.2 95 | stack-data==0.5.0 96 | statsmodels==0.13.2 97 | tenacity==8.0.1 98 | terminado==0.15.0 99 | testpath==0.6.0 100 | text-unidecode==1.3 101 | tornado==6.2 102 | traitlets==5.4.0 103 | typing_extensions==4.3.0 104 | urllib3==1.26.12 105 | wcwidth==0.2.5 106 | webencodings==0.5.1 107 | Werkzeug==2.1.2 108 | widgetsnbextension==4.0.3 109 | -------------------------------------------------------------------------------- /python/simplescraper/tasks/parse_job_description.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | from bs4 import BeautifulSoup 5 | 6 | SPACE_CHAR = ' ' 7 | NBSP_CHAR = u'\xa0' 8 | 9 | METADATA_JSON_PREFIX = 'window.__PRELOADED_STATE__.HeaderStepStoneBlock = ' 10 | METADATA_JSON_SUFFIX = ';' 11 | 12 | FIELD_SELECTORS = { 13 | #'company_name': '.at-header-company-name', 14 | # 'description': 'div[itemprop="description"]', 15 | 'description_introduction': '.at-section-text-introduction', 16 | 'description_responsabilities': '.at-section-text-description-content', 17 | 'description_requirements': '.at-section-text-profile-content', 18 | 'description_perks': '.at-section-text-weoffer-content', 19 | } 20 | 21 | 22 | def flatten_metadata(metadata): 23 | flatten = metadata.copy() 24 | temp_metadata = flatten.pop('metaData') 25 | flatten.update(temp_metadata) 26 | return flatten 27 | 28 | 29 | def keys_to_snake_case(metadata): 30 | snake_case_object = {} 31 | for old_key in metadata.keys(): 32 | # https://stackoverflow.com/questions/60148175/convert-camelcase-to-snakecase 33 | new_key = re.sub(r'(? 
ONE_HOUR: 23 | raise Exception('The load_timestamp is older than one hour') 24 | 25 | 26 | def historize_url_content(url, content, load_timestamp): 27 | file_name = url.split('/')[-1] 28 | save_raw_file(content, SITEMAP, load_timestamp, file_name) 29 | 30 | 31 | def get_and_historize_url_content(url, load_timestamp): 32 | content = get_url_content(url) 33 | historize_url_content(url, content, load_timestamp) 34 | return content 35 | 36 | 37 | def get_listing_urls(load_timestamp): 38 | web_content = get_and_historize_url_content(SITEMAP_INDEX_XML, load_timestamp) 39 | web_content = xmltodict.parse(web_content) 40 | web_content = web_content['sitemapindex'] 41 | web_content = web_content['sitemap'] 42 | listing_urls = [] 43 | for entry in web_content: 44 | url = entry['loc'] 45 | if 'listings' in url: 46 | listing_urls.append(url) 47 | return listing_urls 48 | 49 | 50 | def get_job_description_urls(web_content): 51 | web_content = xmltodict.parse(web_content) 52 | web_content = web_content['urlset'] 53 | url_entries = web_content['url'] 54 | urls = [] 55 | for entry in url_entries: 56 | url = entry['loc'] 57 | urls.append(url) 58 | 59 | return urls 60 | 61 | 62 | def get_all_job_description_urls(load_timestamp): 63 | listing_urls = get_listing_urls(load_timestamp) 64 | job_description_urls = [] 65 | for listing_url in listing_urls: 66 | web_content = get_and_historize_url_content(listing_url, load_timestamp) 67 | job_description_urls.extend(get_job_description_urls(web_content)) 68 | return job_description_urls 69 | 70 | 71 | def convert_urls_to_df(all_job_description_urls) -> pd.DataFrame: 72 | df = pd.DataFrame(all_job_description_urls, columns=['url']) 73 | 74 | df = df.drop_duplicates() 75 | url_split = df['url'].str.split('--', expand=True) 76 | df['name_slug'] = url_split[1] 77 | df['id'] = url_split[2].str.split('-', expand=True)[0] 78 | df = df.sort_values(by=['id'], ascending=False) 79 | 80 | return df 81 | 82 | 83 | def download_sitemap(load_timestamp) -> pd.DataFrame: 84 | configure_logger(load_timestamp) 85 | check_load_timestamp(load_timestamp) 86 | logger.info('download_sitemap: start') 87 | all_job_description_urls = get_all_job_description_urls(load_timestamp) 88 | df = convert_urls_to_df(all_job_description_urls) 89 | save_temp_df(df, load_timestamp, SITEMAP_URLS_CSV) 90 | logger.info('download_sitemap: end') 91 | return df 92 | 93 | 94 | if __name__ == '__main__': 95 | download_sitemap(LATEST_LOAD_TIMESTAMP) 96 | -------------------------------------------------------------------------------- /python/dashy/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.10 3 | # To update, run: 4 | # 5 | # pip-compile --allow-unsafe requirements.in 6 | # 7 | ansi2html==1.8.0 8 | # via jupyter-dash 9 | appnope==0.1.3 10 | # via 11 | # ipykernel 12 | # ipython 13 | asttokens==2.0.8 14 | # via stack-data 15 | backcall==0.2.0 16 | # via ipython 17 | brotli==1.0.9 18 | # via flask-compress 19 | certifi==2022.9.24 20 | # via requests 21 | charset-normalizer==2.1.1 22 | # via requests 23 | click==8.1.3 24 | # via flask 25 | dash==2.6.2 26 | # via 27 | # -r requirements.in 28 | # dash-bootstrap-components 29 | # jupyter-dash 30 | dash-bootstrap-components==1.2.1 31 | # via -r requirements.in 32 | dash-core-components==2.0.0 33 | # via dash 34 | dash-html-components==2.0.0 35 | # via dash 36 | dash-table==5.0.0 37 | # via dash 38 | debugpy==1.6.3 39 | # via ipykernel 40 | 
decorator==5.1.1 41 | # via ipython 42 | duckdb==0.7.0 43 | # via -r requirements.in 44 | entrypoints==0.4 45 | # via jupyter-client 46 | executing==1.1.0 47 | # via stack-data 48 | flask==2.2.2 49 | # via 50 | # dash 51 | # flask-compress 52 | # jupyter-dash 53 | flask-compress==1.13 54 | # via dash 55 | gunicorn==20.1.0 56 | # via -r requirements.in 57 | idna==3.4 58 | # via requests 59 | ipykernel==6.16.0 60 | # via jupyter-dash 61 | ipython==8.5.0 62 | # via 63 | # ipykernel 64 | # jupyter-dash 65 | itsdangerous==2.1.2 66 | # via flask 67 | jedi==0.18.1 68 | # via ipython 69 | jinja2==3.1.2 70 | # via flask 71 | jupyter-client==7.3.5 72 | # via ipykernel 73 | jupyter-core==4.11.1 74 | # via jupyter-client 75 | jupyter-dash==0.4.2 76 | # via -r requirements.in 77 | loguru==0.6.0 78 | # via -r requirements.in 79 | markupsafe==2.1.1 80 | # via 81 | # jinja2 82 | # werkzeug 83 | matplotlib-inline==0.1.6 84 | # via 85 | # ipykernel 86 | # ipython 87 | nest-asyncio==1.5.6 88 | # via 89 | # ipykernel 90 | # jupyter-client 91 | # jupyter-dash 92 | numpy==1.23.3 93 | # via pandas 94 | packaging==21.3 95 | # via ipykernel 96 | pandas==1.5.0 97 | # via -r requirements.in 98 | parso==0.8.3 99 | # via jedi 100 | pexpect==4.8.0 101 | # via ipython 102 | pickleshare==0.7.5 103 | # via ipython 104 | plotly==5.10.0 105 | # via dash 106 | prompt-toolkit==3.0.31 107 | # via ipython 108 | psutil==5.9.2 109 | # via ipykernel 110 | ptyprocess==0.7.0 111 | # via pexpect 112 | pure-eval==0.2.2 113 | # via stack-data 114 | pygments==2.13.0 115 | # via ipython 116 | pyparsing==3.0.9 117 | # via packaging 118 | python-dateutil==2.8.2 119 | # via 120 | # jupyter-client 121 | # pandas 122 | python-dotenv==0.21.0 123 | # via -r requirements.in 124 | pytz==2022.4 125 | # via pandas 126 | pyzmq==24.0.1 127 | # via 128 | # ipykernel 129 | # jupyter-client 130 | requests==2.28.1 131 | # via jupyter-dash 132 | retrying==1.3.3 133 | # via jupyter-dash 134 | six==1.16.0 135 | # via 136 | # python-dateutil 137 | # retrying 138 | stack-data==0.5.1 139 | # via ipython 140 | tenacity==8.1.0 141 | # via plotly 142 | tornado==6.2 143 | # via 144 | # ipykernel 145 | # jupyter-client 146 | traitlets==5.4.0 147 | # via 148 | # ipykernel 149 | # ipython 150 | # jupyter-client 151 | # matplotlib-inline 152 | urllib3==1.26.12 153 | # via requests 154 | wcwidth==0.2.5 155 | # via prompt-toolkit 156 | werkzeug==2.2.2 157 | # via flask 158 | 159 | # The following packages are considered to be unsafe in a requirements file: 160 | setuptools==65.4.1 161 | # via gunicorn 162 | -------------------------------------------------------------------------------- /doc/TODO.md: -------------------------------------------------------------------------------- 1 | # TO DO 2 | 3 | ## Open 4 | 5 | - [ ] Implement use case: Location/Company/Technology changelog 6 | - [ ] Add the next data source 7 | - [ ] Slugify the value of the filter selectors 8 | - [ ] Upload only backup files to the Azure Blob Storage 9 | - [ ] Implement use case: Number of jobs relative to city population 10 | - [ ] Add the flag to the do and verify backup commands: --exclude='.DS_Store' 11 | - [ ] Add a file in the raw layer with the scrape run information for each execution 12 | - This file could be in JSON format and have the following fields: 13 | - run_id 14 | - timestamp 15 | - number of urls to download 16 | - number of urls downloaded 17 | - number of failed urls 18 | - failed urls (a list of string) 19 | 20 | ## In Progress 21 | 22 | 23 | ## Done 24 | 25 | - [x] Display 
more than 12 months 26 | - [x] Let users use interactive graphs instead of static plots 27 | - [x] Let users start the y-axis at zero 28 | - [x] Make Dashy public with the domain https://jobmarketanalytics.com/ 29 | - [x] Cache SQL query executions on Dashy 30 | - [x] Implement use case: Compare technologies 31 | - [x] Have 3 materialized tables for Dashy with different time durations to improve the performance 32 | - [x] Use stateful URLs according to the state of the input components on Dashy 33 | - [x] Use LocalExecutor in Airflow 34 | - [x] Run Airflow locally to reduce the Docker overhead 35 | - [x] Implement use case: Technology trends 36 | - [x] Add a size indicator in the filter options in Dashy 37 | - [x] Implement some kind of search/dashboard for external users 38 | - [x] Check out https://github.com/rilldata/rill-developer 39 | - [x] Decide on a BI tool 40 | - [x] Check out https://superset.apache.org/ 41 | - [x] Create a separate virtual environment for dbt 42 | - [x] Check out https://www.linkedin.com/in/christian-kaul/recent-activity/posts/ 43 | - [x] Check out https://dbtvault.readthedocs.io/ 44 | - [x] Check out https://github.com/jwills/dbt-duckdb 45 | - [x] Use Gunicorn to run flasky with 4 workers 46 | - [x] On the cleansed layer, add the first sitemap occurrence per URL instead of only the latest load_timestamp 47 | - [x] Add load_timestamp and load_date to the curated layer 48 | - [x] Rename target_date to load_date 49 | - [x] Rename run_timestamp to load_timestamp 50 | - [x] Fail the download sitemap task in the hourly dag if the load_timestamp is older than one hour 51 | - [x] Create a separate virtual environment for Airflow 52 | - [x] Fix the issue "metaData-bag.log" 53 | - [x] Find a better way to keep Airflow from hanging when there are many jobs to download 54 | - [x] Move the raw storage to the cloud 55 | - [x] Improve logging 56 | - Log how many URLs there are to download 57 | - Make the VPN check more visible 58 | - [x] Download the job description again after a configurable number of days online 59 | - [x] Create a report that shows how many days a job offer is online 60 | - [x] Create a report that shows how many job offers are online at a given time 61 | - [x] Find a better timestamp to use than the logical timestamp for the scrape data source dag 62 | - [x] Fix a bug with file names longer than 255 characters 63 | - [x] Fix logs in Flasky 64 | - [x] Add more granularity to the ingestion time in the raw data 65 | - [x] Add orchestration with Airflow 66 | - [x] Create the Data Vault 67 | - [x] Optimize the function to create the chunks 68 | - [x] Add a check for the network connection before we start crawling 69 | - [x] Save the whole HTML document from the source instead of just a fragment of it, so that no information is lost if 70 | the HTML format changes 71 | - [x] Add logging to the sitemap scraper 72 | - [x] Find a way to pass the list of parquet files to PostgreSQL.
73 | - Result: Use Python to create the staging fdw staging tables referencing the parquet files 74 | - [x] Add the _job_id_ to the _sitemap_ and _job_description_ on the cleansed layer 75 | - [x] Create a _ingestion_id_ with the hash of the _job_id_ and _timestap_ on the cleansed layer 76 | 77 | --- 78 | 79 | ## Discarded 80 | 81 | - [x] Try https://xapian.org/ for the search 82 | - [x] Replace the PostgreSQL ingestion with CSV instead of Parquet 83 | - [x] Do not let Flasky start a process behind an endpoint, if a process is still running 84 | - [x] Try Prefect 85 | - [x] Log the date and time more visible 86 | - [x] Allow one retry after the browser crashes 87 | 88 | ## Technical Debt 89 | 90 | - [ ] Rename job_description to job_offer 91 | - [ ] Rename cleansed to curated 92 | -------------------------------------------------------------------------------- /python/tests/data/normalize_job_description/output/test_case_7610222.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Anlagenmechaniker für Sanitär-, Heizungs- und Klimatechnik (m/w/d)", 3 | "online_status": "online", 4 | "is_anonymous": false, 5 | "job_id": 7610222, 6 | "should_display_early_applicant": false, 7 | "location": "Hamburg (Hammerbrook)", 8 | "contract_type": "Feste Anstellung", 9 | "work_type": "Vollzeit", 10 | "online_date": "2021-10-13T15:54:04Z", 11 | "company_name": "ENGIE Deutschland GmbH", 12 | "description": "ÜBER UNS:Als Komplett-Dienstleister im Bereich Facility Solutions sichern wir den optimalen Betrieb von Gebäuden und Anlagen. Wir bieten modulare Leistungspakete von Service und Wartung über Instandhaltung bis hin zur Bewirtschaftung komplexer Liegenschaften. Für unsere Multi-Site-Kunden arbeiten wir als überregionaler oder auch internationaler Facility-Management-Partner.IHRE AUFGABEN:Wir suchen Servicetechniker bzw. Anlagenmechaniker für die Gewerke Heizung, Klima, Lüftung, Sanitär oder Kälte für die Wartung, Instandsetzung und Bedienung der haustechnischen Anlagen bei unserem Kunden vor Ort.\nSie arbeiten an einem festen Objekt, sodass keine Reisetätigkeit anfällt.\n\nBetreiben der gebäudetechnischen Anlagen an einem anspruchsvollen Industriestandort\nOrganisation, Steuerung, Kontrolle und selbstständige Durchführung von Wartungs- und Instandsetzungsarbeiten an gebäudetechnischen Anlagen\nOptimierung der bestehenden Anlagentechnik und der Betriebsabläufe\nErstellung und Dokumentation der täglichen Arbeitsleistung über mobile Endgeräte\nKoordination und Begleitung von Nachunternehmern\nErster Ansprechpartner vor Ort für unsere Kunden im operative Tagesgeschäft\nIHR PROFIL:\nAbgeschlossene Berufsausbildung als Anlagenmechaniker für Sanitär-, Heizungs- und Klimatechnik oder als Zentralheizungs- und Lüftungsbauer, Gas-Wasserinstallateur oder Kältetechniker\nMehrjährige Berufserfahrungen im Bereich der Technischen Gebäudeausrüstung\nKunden- und Dienstleistungsorientierung gepaart mit Spaß an der Arbeit im Team\nGeregelten Arbeitszeiten mit gelegentlichen Bereitschaftsdiensten\nFührerschein der Klasse B\nIHRE BENEFITS:\nAkademie\nAltersvorsorge\nCorporate Benefits\nPerspektiven\nFirmenfeiern\nFlexible Arbeitszeiten\nGestaltungsfreiheit\nHohe Sicherheitsstandards\nInternationalität\nSpannende Projekte\nTeamgeist\nAttraktive Vergütung\nIHR JOB?Werden auch Sie ein ENGIEneer und gestalten Sie zusammen mit uns die Zukunft der Energiewende. 
Wir sind gespannt auf Ihre Online-Bewerbung!\n IHR KONTAKT:\nMonika Brzenska\nTalent Acquisition Specialist\nTelefon: 0221 46 90 54 29 \n \nKENNZIFFER: 2021-0476", 13 | "description_introduction": "ÜBER UNS:Als Komplett-Dienstleister im Bereich Facility Solutions sichern wir den optimalen Betrieb von Gebäuden und Anlagen. Wir bieten modulare Leistungspakete von Service und Wartung über Instandhaltung bis hin zur Bewirtschaftung komplexer Liegenschaften. Für unsere Multi-Site-Kunden arbeiten wir als überregionaler oder auch internationaler Facility-Management-Partner.", 14 | "description_responsabilities": "Wir suchen Servicetechniker bzw. Anlagenmechaniker für die Gewerke Heizung, Klima, Lüftung, Sanitär oder Kälte für die Wartung, Instandsetzung und Bedienung der haustechnischen Anlagen bei unserem Kunden vor Ort.\nSie arbeiten an einem festen Objekt, sodass keine Reisetätigkeit anfällt.\n\nBetreiben der gebäudetechnischen Anlagen an einem anspruchsvollen Industriestandort\nOrganisation, Steuerung, Kontrolle und selbstständige Durchführung von Wartungs- und Instandsetzungsarbeiten an gebäudetechnischen Anlagen\nOptimierung der bestehenden Anlagentechnik und der Betriebsabläufe\nErstellung und Dokumentation der täglichen Arbeitsleistung über mobile Endgeräte\nKoordination und Begleitung von Nachunternehmern\nErster Ansprechpartner vor Ort für unsere Kunden im operative Tagesgeschäft", 15 | "description_requirements": "Abgeschlossene Berufsausbildung als Anlagenmechaniker für Sanitär-, Heizungs- und Klimatechnik oder als Zentralheizungs- und Lüftungsbauer, Gas-Wasserinstallateur oder Kältetechniker\nMehrjährige Berufserfahrungen im Bereich der Technischen Gebäudeausrüstung\nKunden- und Dienstleistungsorientierung gepaart mit Spaß an der Arbeit im Team\nGeregelten Arbeitszeiten mit gelegentlichen Bereitschaftsdiensten\nFührerschein der Klasse B", 16 | "description_perks": "Akademie\nAltersvorsorge\nCorporate Benefits\nPerspektiven\nFirmenfeiern\nFlexible Arbeitszeiten\nGestaltungsfreiheit\nHohe Sicherheitsstandards\nInternationalität\nSpannende Projekte\nTeamgeist\nAttraktive Vergütung" 17 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Job Market Analytics 2 | 3 | The aim of this project is to develop an end-to-end Data Platform to explore and learn new technologies. 4 | 5 | ## Architecture 6 | 7 | ![Architecture Overview](doc/architecture-overview.drawio.svg) 8 | 9 | ## Storage 10 | 11 | ### Data Lake 12 | 13 | The Data Lake is basically a file system on my local computer, but could be easily transfered to a Cloud Blob Storage ( 14 | like AWS S3 or Azure Blob Storage) if needed. The current Data Lake we have two layers: 15 | 16 | - The **Raw Layer**, where the information from the data source are stored in the same file format as ingested (e.g. 17 | HTML or XML). 18 | - The **Cleansed Layer**, where we store the information in Parquet, which means that the information is stored in a 19 | tabular format with well-defined columns. 20 | 21 | ### Data Warehouse 22 | 23 | The Data Warehouse is based on PostgreSQL plus an extension in order to read Parquet files as foreign tables. PostgreSQL 24 | might not be the best choice for a datawarehouse since it is row-column-oriented but in this case we have reduced number 25 | of columns and a relative small data size. 
Another advantage of PostgreSQL is that I can run it easily on my computer 26 | via Docker so that I can avoid cloud service costs. We will divide the data warehouse into 3 schemas: 27 | 28 | - **Staging**, which basically consists of foreign tables referencing the Parquet files on the Data Lake Cleansed Layer. 29 | - **Data Vault**, where the data is modelled and historized using 30 | the [Data Vault Specification](https://danlinstedt.com/wp-content/uploads/2018/06/DVModelingSpecs2-0-1.pdf). 31 | - **Data Mart**, which will be the consuming layer for our BI Tool. 32 | 33 | ### Data Vault Model 34 | 35 | ![Data Vault Model](doc/data-vault-model.drawio.svg) 36 | 37 | ### Mart Model 38 | 39 | ![Mart Model](doc/mart-model.drawio.svg) 40 | 41 | ### Data Lineage (dbt Dag) 42 | 43 | ![Data Lineage (dbt Dag)](doc/dbt-dag.png) 44 | 45 | ## Computing 46 | 47 | In order to compute the data, we use two different approaches. 48 | 49 | - **Python** for the data ingestion, when we crawl and scrape data directly from the data source. And also for the data 50 | transformation from the Raw to the Cleansed layer. All Python code is divided into atomic tasks, and these are 51 | orchestrated by [Airflow](https://airflow.apache.org/). 52 | - **SQL** for the transformations of the data inside the Data Warehouse. The SQL tasks are automated and orchestrated 53 | by [dbt](https://www.getdbt.com/). 54 | 55 | ### Data Source Scraping 56 | 57 | In order to download the data from the data source, we run the following Airflow DAG: 58 | 59 | ![Scrape Data Source Dag](doc/scrape_data_source_dag.png) 60 | 61 | First, we make sure that we are connected to the VPN, then we download and archive the list of the jobs that are online at 62 | the moment from the sitemap, and we list out which job descriptions we have not downloaded yet, and then we download 63 | them via browser automation with [Playwright](https://playwright.dev/). 64 | 65 | ### Data Transformation Orchestration 66 | 67 | The data transformation is orchestrated as an Airflow DAG, which runs on a daily basis and combines Python transformation 68 | jobs and the dbt run to build up the incremental models. 69 | 70 | ![Airflow DAG Daily](doc/airflow_dag_daily.png) 71 | 72 | ## Frequently Asked Questions 73 | 74 | ### What questions can be answered with this project? 75 | 76 | Here are some examples of what we can answer: 77 | 78 | - How long is a job offer online until it is taken offline? 79 | - Which technologies are the most demanded at the moment? 80 | - How does the demand for a particular technology evolve over time? 81 | - How many job offers are remote, and how is this evolving over time? 82 | - When was a particular job offer first published? 83 | 84 | ### Could we answer those questions with a simpler technology stack? 85 | 86 | Yes, we could. But the point of the project is to explore and learn new technologies and concepts, therefore it has been 87 | over-engineered on purpose. 88 | 89 | ### Are you planning to create a public Web or Mobile Application with this? 90 | 91 | No, at least not at the moment. This is just for educational purposes. 92 | 93 | ### Why did you choose Parquet as the file format for the Cleansed Layer in the Data Lake? 94 | 95 | I chose Parquet because it is a column-oriented compressed file type, which has been battle-tested. Good Python 96 | libraries, like [Pyarrow](https://arrow.apache.org/docs/python/parquet.html), are available to write and read it. 97 | 98 | ### Why did you choose PostgreSQL for the Data Warehouse?
99 | 100 | PostgreSQL is a very robust database with standard SQL that can run locally, and its performance is good enough for the 101 | current data size and number of columns. 102 | 103 | ### How big is your data? 104 | 105 | It is around 530 GB in raw format after scraping the data sources since October 2021, and it grows by around 2 GB 106 | every day. After cleansing and compressing to Parquet, the data is around 30 times smaller, since we can get rid of a 107 | great deal of HTML, CSS and JS because they do not provide any extra information for my use cases. 108 | 109 | ![Raw Storage Size in Azure Blob Container](doc/raw-in-azure-blob-storage.png) 110 | -------------------------------------------------------------------------------- /python/.flake8: -------------------------------------------------------------------------------- 1 | # All configuration for plugins and other utils is defined here. 2 | # Read more about `setup.cfg`: 3 | # https://docs.python.org/3/distutils/configfile.html 4 | 5 | 6 | # === Linter configuration === 7 | # You can reuse this configuration in your own projects. 8 | # See: https://wemake-python-stylegui.de/en/latest/pages/usage/integrations/nitpick.html 9 | 10 | [flake8] 11 | # Base flake8 configuration: 12 | # https://flake8.pycqa.org/en/latest/user/configuration.html 13 | format = wemake 14 | show-source = True 15 | statistics = False 16 | doctests = True 17 | 18 | # Plugins: 19 | max-complexity = 6 20 | max-line-length = 120 21 | 22 | # darglint configuration: 23 | # https://github.com/terrencepreilly/darglint 24 | strictness = long 25 | docstring-style = numpy 26 | 27 | # Self settings: 28 | max-imports = 17 29 | 30 | # Excluding some directories: 31 | exclude = 32 | .git 33 | __pycache__ 34 | .venv 35 | .eggs 36 | *.egg 37 | dist 38 | # These folders contain code badly written for reasons: 39 | # Project specific, do not copy.
40 | tests/fixtures/** 41 | tests/**/snapshots/** 42 | 43 | # Exclude some pydoctest checks globally: 44 | ignore = D100, D104, D401, W504, RST303, RST304, DAR103, DAR203, E800, D103, WPS421, WPS305 45 | 46 | per-file-ignores = 47 | # These function names are part of 3d party API: 48 | wemake_python_styleguide/visitors/ast/*.py: N802 49 | # These modules should contain a lot of classes: 50 | wemake_python_styleguide/violations/*.py: WPS202 51 | # Eval is a complex task: 52 | wemake_python_styleguide/logic/safe_eval.py: WPS232 53 | # This module should contain magic numbers: 54 | wemake_python_styleguide/options/defaults.py: WPS432 55 | # Checker has a lot of imports: 56 | wemake_python_styleguide/checker.py: WPS201 57 | # Allows mypy type hinting, `Ellipsis`` usage, multiple methods: 58 | wemake_python_styleguide/types.py: D102, WPS214, WPS220, WPS428 59 | # There are multiple fixtures, `assert`s, and subprocesses in tests: 60 | tests/test_visitors/test_ast/test_naming/conftest.py: WPS202 61 | tests/*.py: S101, S105, S404, S603, S607, WPS211, WPS226, WPS323 62 | # Docs can have the configuration they need: 63 | docs/conf.py: WPS407 64 | # Pytest fixtures 65 | tests/plugins/*.py: WPS442 66 | 67 | 68 | [isort] 69 | # isort configuration: 70 | # https://github.com/timothycrosley/isort/wiki/isort-Settings 71 | include_trailing_comma = true 72 | use_parentheses = true 73 | # See https://github.com/timothycrosley/isort#multi-line-output-modes 74 | multi_line_output = 3 75 | # Is the same as 80 in flake8: 76 | line_length = 120 77 | 78 | # We need these lines for Github Action to work correctly, 79 | # **please** do not copy it to your own configs: 80 | default_section = THIRDPARTY 81 | known_first_party = wemake_python_styleguide* 82 | skip_glob = 83 | # These folders contain code badly written for reasons: 84 | tests/fixtures/** 85 | tests/**/snapshots/** 86 | 87 | 88 | # === Internal tools === 89 | # You are not interested in anything beyond this line. 90 | 91 | [tool:pytest] 92 | # py.test configuration: http://doc.pytest.org/en/latest/customize.html 93 | norecursedirs = tests/fixtures *.egg .eggs dist build docs .tox .git __pycache__ 94 | 95 | filterwarnings = 96 | ignore::DeprecationWarning 97 | 98 | addopts = 99 | --strict 100 | --doctest-modules 101 | --cov=wemake_python_styleguide 102 | --cov-branch 103 | --cov-report=term-missing:skip-covered 104 | --cov-report=html 105 | --cov-report=xml 106 | --cov-fail-under=100 107 | 108 | 109 | [coverage:run] 110 | # Coverage configuration: https://coverage.readthedocs.io/ 111 | 112 | # We don't need to cover some files. They are fully checked with mypy. 113 | # And don't contain any logic. 
114 | omit = 115 | wemake_python_styleguide/types.py 116 | 117 | # Here we specify plugins for coverage to be used: 118 | plugins = 119 | coverage_conditional_plugin 120 | 121 | [coverage:coverage_conditional_plugin] 122 | # Here we specify our pragma rules: 123 | rules = 124 | "sys_version_info < (3, 8)": py-lt-38 125 | "sys_version_info >= (3, 8)": py-gte-38 126 | 127 | "sys_version_info < (3, 9)": py-lt-39 128 | "sys_version_info >= (3, 9)": py-gte-39 129 | 130 | 131 | [mypy] 132 | # The mypy configurations: http://bit.ly/2zEl9WI 133 | allow_redefinition = False 134 | check_untyped_defs = True 135 | disallow_untyped_decorators = True 136 | disallow_any_explicit = True 137 | disallow_any_generics = True 138 | disallow_untyped_calls = True 139 | ignore_errors = False 140 | ignore_missing_imports = True 141 | implicit_reexport = False 142 | local_partial_types = True 143 | strict_optional = True 144 | strict_equality = True 145 | no_implicit_optional = True 146 | warn_unused_ignores = True 147 | warn_redundant_casts = True 148 | warn_unused_configs = True 149 | warn_unreachable = True 150 | warn_no_return = True 151 | 152 | [mypy-wemake_python_styleguide.compat.nodes] 153 | # We allow explicit `Any` only in this file, because of the compatibility: 154 | disallow_any_explicit = False 155 | 156 | [mypy-wemake_python_styleguide.compat.packaging] 157 | # We allow unused `ignore` comments, because we cannot sync it between versions: 158 | warn_unused_ignores = False 159 | 160 | [mypy-wemake_python_styleguide.logic.safe_eval] 161 | # We allow explicit `Any` only in this file, because that's what it does: 162 | disallow_any_explicit = False 163 | 164 | 165 | [doc8] 166 | # doc8 configuration: https://pypi.org/project/doc8/ 167 | ignore-path = docs/_build 168 | max-line-length = 120 169 | sphinx = True -------------------------------------------------------------------------------- /python/tests/data/normalize_job_description/output/test_case_7610188.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Ansible/ServiceNow Experte (m/w/d)", 3 | "online_status": "online", 4 | "is_anonymous": false, 5 | "job_id": 7610188, 6 | "should_display_early_applicant": false, 7 | "location": "Hannover oder Münster", 8 | "contract_type": "Feste Anstellung", 9 | "work_type": "Vollzeit, Home Office möglich", 10 | "online_date": "2021-10-13T15:40:32Z", 11 | "company_name": "Finanz Informatik GmbH & Co. KG", 12 | "description": "Als einer der größten Banken-IT-Dienstleister Europas sind wir der Treiber der Digitalisierung innerhalb der Sparkassen-Finanzgruppe. Mit über 4.000 Mitarbeitern an 3 Standorten machen wir digitales Banking von heute leistungsfähig und entwickeln smarte Finanz-Services von morgen. Dabei bieten wir Ihnen ein breites Aufgabenspektrum, in dem Sie Ihre individuelle Stärke hervorragend einbringen können. Ob App-Entwicklung, Netzwerktechnologien und Serverbetrieb oder Beratung, Schulung und Support – bei uns finden Sie Ihre Berufung! Als Spezialist oder auch als Generalist. 
Alles mit besten Karrierechancen, viel Raum für persönliche Entfaltung und zahlreichen Benefits.\nFür unsere Abteilung Bereitstellung Kommunikationsdienste suchen wir zum nächstmöglichen Zeitpunkt für den Standort Hannover oder Münster Verstärkung als\nAnsible/ServiceNow Experte (m/w/d)\nIhre Aufgaben:\nSie sind unser Experte für die Einführung und kontinuierliche Weiterentwicklung unserer Automationsstrategie\nEntwurf/Programmierung (Python) von Automationsobjekten zur Optimierung des Produktionsablaufes und der Überwachung der Systemplattform \nAufbau von automatisierten Schnittstellen zur umliegenden Serverinfrastruktur\nDurchführung von Programm- und Systemtests und Unterstützung bei der Fehlerbehebung \nDokumentation sowie Pflege und Qualitätssicherung der automatisierten Plattform\nEntwicklung der Automatisierung bei der Bereitstellung neuer Services\n\nIhr Profil:\nAbgeschlossenes technisches Studium vorzugsweise im IT/TK-Umfeld oder eine vergleichbare Ausbildung/Qualifikation\nMehrjährige Erfahrung in der Programmierung und im Umgang mit Skriptsprachen \nErfahrung mit Telefonie-Plattformen und -Systemen, ACD, VoIP-Netzwerkstrukturen \nKenntnisse im Plattformbetrieb von Windows, Unix, Datenbanken sowie VMware\nErfahrungen im Prozess-, Test- und Qualitätsmanagement wünschenswert\nKundenorientierung und gute kommunikative Fähigkeiten \nSie sind ein Teamplayer und ergänzen unser dynamisches Team mit Initiative und Zielstrebigkeit\nBereitschaft zu gelegentlichen Dienstreisen sowie Sondereinsätzen\n\nIhre Benefits:\nAltersvorsorge\nBarrierefrei\nBetriebssport\nFamilienservice\nFirmenevents\nFlexible Arbeitszeiten\nMobiles Arbeiten\nJobticket\nKantine\nTarifvertrag\nWeiterbildung\nFitnessförderung\n\nBei uns erwartet Sie eine attraktive Vergütung basierend auf Ihrer Qualifikation sowie Ihrer relevanten, praktischen Erfahrung.\nKlingt interessant?Dann bewerben Sie sich ganz einfach über unser FI-Karriere-Online-Portal. Wir freuen uns auf Ihre Bewerbung unter Angabe der Kennziffer 341/2021! Sollten Sie vorab weitere Auskünfte zu dieser Stelle wünschen, steht Ihnen gerne Herr Malte Kurz zur Verfügung. Sie erreichen Malte Kurz unter Tel. 0511 5102-24958 oder per E-Mail unter karriere@f-i.de.", 13 | "description_introduction": "Als einer der größten Banken-IT-Dienstleister Europas sind wir der Treiber der Digitalisierung innerhalb der Sparkassen-Finanzgruppe. Mit über 4.000 Mitarbeitern an 3 Standorten machen wir digitales Banking von heute leistungsfähig und entwickeln smarte Finanz-Services von morgen. Dabei bieten wir Ihnen ein breites Aufgabenspektrum, in dem Sie Ihre individuelle Stärke hervorragend einbringen können. Ob App-Entwicklung, Netzwerktechnologien und Serverbetrieb oder Beratung, Schulung und Support – bei uns finden Sie Ihre Berufung! Als Spezialist oder auch als Generalist. 
Alles mit besten Karrierechancen, viel Raum für persönliche Entfaltung und zahlreichen Benefits.\nFür unsere Abteilung Bereitstellung Kommunikationsdienste suchen wir zum nächstmöglichen Zeitpunkt für den Standort Hannover oder Münster Verstärkung als\nAnsible/ServiceNow Experte (m/w/d)", 14 | "description_responsabilities": "Sie sind unser Experte für die Einführung und kontinuierliche Weiterentwicklung unserer Automationsstrategie\nEntwurf/Programmierung (Python) von Automationsobjekten zur Optimierung des Produktionsablaufes und der Überwachung der Systemplattform \nAufbau von automatisierten Schnittstellen zur umliegenden Serverinfrastruktur\nDurchführung von Programm- und Systemtests und Unterstützung bei der Fehlerbehebung \nDokumentation sowie Pflege und Qualitätssicherung der automatisierten Plattform\nEntwicklung der Automatisierung bei der Bereitstellung neuer Services", 15 | "description_requirements": "Abgeschlossenes technisches Studium vorzugsweise im IT/TK-Umfeld oder eine vergleichbare Ausbildung/Qualifikation\nMehrjährige Erfahrung in der Programmierung und im Umgang mit Skriptsprachen \nErfahrung mit Telefonie-Plattformen und -Systemen, ACD, VoIP-Netzwerkstrukturen \nKenntnisse im Plattformbetrieb von Windows, Unix, Datenbanken sowie VMware\nErfahrungen im Prozess-, Test- und Qualitätsmanagement wünschenswert\nKundenorientierung und gute kommunikative Fähigkeiten \nSie sind ein Teamplayer und ergänzen unser dynamisches Team mit Initiative und Zielstrebigkeit\nBereitschaft zu gelegentlichen Dienstreisen sowie Sondereinsätzen", 16 | "description_perks": "Altersvorsorge\nBarrierefrei\nBetriebssport\nFamilienservice\nFirmenevents\nFlexible Arbeitszeiten\nMobiles Arbeiten\nJobticket\nKantine\nTarifvertrag\nWeiterbildung\nFitnessförderung\n\nBei uns erwartet Sie eine attraktive Vergütung basierend auf Ihrer Qualifikation sowie Ihrer relevanten, praktischen Erfahrung." 
17 | } -------------------------------------------------------------------------------- /python/simplescraper/tasks/curate_job_descriptions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | 5 | from common.entity import JOB, JOB_LOCATION, JOB_DESCRIPTION, JOB_TECHNOLOGY 6 | from common.logging import configure_logger, logger 7 | from common.storage import get_load_timestamp, get_load_date, load_cleansed_df, save_curated_df 8 | 9 | JOB_DESCRIPTION_SAT_COLUMNS = ['title', 'online_status', 'is_anonymous', 'should_display_early_applicant', 10 | 'contract_type', 'work_type', 'online_date', 'company_name', 'description_introduction', 11 | 'description_responsabilities', 'description_requirements', 'description_perks'] 12 | 13 | BASE_COLUMNS = ['year', 'month', 'day', 'job_id', 'load_timestamp'] 14 | 15 | TECHNOLOGIES = [ 16 | 'AI', 17 | 'Airflow', 18 | 'Android', 19 | 'Angular', 20 | 'AWS', 21 | 'Azure', 22 | 'CSS', 23 | 'Couchbase', 24 | 'CouchDB', 25 | 'Cypress', 26 | 'Dagster', 27 | 'Dask', 28 | 'Databricks', 29 | 'dbt', 30 | 'Docker', 31 | 'Duckdb', 32 | 'ELT', 33 | 'ETL', 34 | 'Flink', 35 | 'Flutter', 36 | 'GCP', 37 | 'Go', 38 | 'Golang', 39 | 'Gradle', 40 | 'gRPC', 41 | 'HANA', 42 | 'Java', 43 | 'JavaScript', 44 | 'Keras', 45 | 'Kotlin', 46 | 'Kubernetes', 47 | 'LESS', 48 | 'Maven', 49 | 'ML', 50 | 'MongoDB', 51 | 'MySQL', 52 | 'NLP', 53 | 'Oracle', 54 | 'Pandas', 55 | 'Playwright', 56 | 'PostgreSQL', 57 | 'Prefect', 58 | 'Puppeteer', 59 | 'Purview', 60 | 'Python', 61 | 'PyTorch', 62 | 'React', 63 | 'REST', 64 | 'Rust', 65 | 'Tensorflow', 66 | 'TestCafe', 67 | 'TypeScript', 68 | 'WebAssembly', 69 | 'scikit', 70 | 'Selenium', 71 | 'Snowflake', 72 | 'Snowplow', 73 | 'Spark', 74 | 'Spring', 75 | 'Storm', 76 | 'SAP', 77 | 'SCSS', 78 | 'SQL', 79 | 'SSIS', 80 | 'Synapse', 81 | 'Vue', 82 | ] 83 | 84 | 85 | def process_job_description(df): 86 | df = df.copy() 87 | df = df[df['company_name'].notna()] 88 | df = df[BASE_COLUMNS + JOB_DESCRIPTION_SAT_COLUMNS] 89 | save_curated_df(df, JOB) 90 | 91 | 92 | def process_location(df): 93 | df = df[BASE_COLUMNS + ['location']].copy() 94 | 95 | df['location'] = df['location'].str.replace('Frankfurt (Main)', 'Frankfurt am Main', regex=False) 96 | df['location'] = df['location'].str.replace('Frankfurt a. 
M.', 'Frankfurt am Main', regex=False) 97 | df['location'] = df['location'].str.replace('Frankfurt a.M.', 'Frankfurt am Main', regex=False) 98 | df['location'] = df['location'].str.replace('Frankfurt am Main (60488)', 'Frankfurt am Main', regex=False) 99 | df['location'] = df['location'].str.replace('Frankfurt Am Main', 'Frankfurt am Main', regex=False) 100 | df['location'] = df['location'].str.replace('Frankfurt/M.', 'Frankfurt am Main', regex=False) 101 | df['location'] = df['location'].str.replace('Frankfurt aM', 'Frankfurt am Main', regex=False) 102 | df['location'] = df['location'].str.replace('Frankfurt (am Main)', 'Frankfurt am Main', regex=False) 103 | df['location'] = df['location'].str.replace('Frankfurt Main', 'Frankfurt am Main', regex=False) 104 | df['location'] = df['location'].str.replace('Frankfurt aam Main', 'Frankfurt am Main', regex=False) 105 | 106 | df['location'] = df['location'].str.replace('|'.join([' und ', ' oder ', '/', ';', ' - ', ':']), ',', regex=True) 107 | df['location'] = df['location'].str.replace(' | ', ',', regex=False) 108 | df['location'] = df['location'].str.replace(' .', ',', regex=False) 109 | df['location'] = df['location'].str.replace(' u.a. ', ',', regex=False) 110 | df['location'] = df['location'].str.split(',') 111 | df = df.explode('location').reset_index(drop=True) 112 | 113 | df['location'] = df['location'].str.strip() 114 | 115 | df['location'] = df['location'].replace('Frankfurt', 'Frankfurt am Main') 116 | 117 | df['location'] = df['location'].replace('', np.nan) 118 | df['location'] = df['location'].replace('keine Angabe', np.nan) 119 | df = df.dropna() 120 | 121 | save_curated_df(df, JOB_LOCATION) 122 | 123 | 124 | def process_technology(df): 125 | df = df.copy() 126 | df['description'] = df['title'] + ' ' + \ 127 | df['description_introduction'] + ' ' + \ 128 | df['description_responsabilities'] + ' ' + \ 129 | df['description_requirements'] + ' ' + \ 130 | df['description_perks'] 131 | for technology in TECHNOLOGIES: 132 | df[technology] = df['description'].str.contains(fr'(?i)\b{technology}\b', regex=True) 133 | df['Other'] = ~df[TECHNOLOGIES].any(axis='columns') 134 | df = df.melt(id_vars=BASE_COLUMNS, value_vars=TECHNOLOGIES + ['Other'], var_name='technology') 135 | df = df[df['value'].notna()] 136 | df = df[df['value']] 137 | df = df[BASE_COLUMNS + ['technology']] 138 | 139 | save_curated_df(df, JOB_TECHNOLOGY) 140 | 141 | 142 | def curate_job_descriptions(load_timestamp, load_date): 143 | configure_logger(load_timestamp) 144 | logger.info(f'Start curate_job_descriptions: {load_timestamp} {load_date}') 145 | 146 | df = load_cleansed_df(JOB_DESCRIPTION, load_date=load_date) 147 | 148 | df = df.dropna(subset=['job_id']) 149 | df['job_id'] = df['job_id'].astype('int') 150 | df = df.sort_values(by=['job_id']) 151 | 152 | process_job_description(df) 153 | process_location(df) 154 | process_technology(df) 155 | 156 | logger.info(f'End curate_job_descriptions: {load_timestamp} {load_date}') 157 | 158 | 159 | if __name__ == "__main__": 160 | _load_timestamp = sys.argv[1] if len(sys.argv) > 1 else get_load_timestamp() 161 | _load_date = sys.argv[2] if len(sys.argv) > 2 else get_load_date() 162 | curate_job_descriptions(_load_timestamp, _load_date) 163 | -------------------------------------------------------------------------------- /python/tests/data/normalize_job_description/output/test_case_7609275.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Account Manager (m/w/d)", 3 | 
"online_status": "online", 4 | "is_anonymous": false, 5 | "job_id": 7609275, 6 | "should_display_early_applicant": false, 7 | "location": "bundesweit", 8 | "contract_type": "Feste Anstellung", 9 | "work_type": "Vollzeit, Home Office möglich", 10 | "online_date": "2021-10-13T13:22:15Z", 11 | "company_name": "Quentic GmbH", 12 | "description": "Passionate people for sustainable softwareQuentic ist einer der führenden Lösungsanbieter für Software as a Service (SaaS) im europäischen HSE- und CSR-Markt. Das Unternehmen hat seinen Hauptsitz in Berlin und beschäftigt über 250 Mitarbeitende. Niederlassungen befinden sich in Deutschland, Österreich und der Schweiz sowie in Finnland, Belgien, Dänemark, Schweden, den Niederlanden, Spanien und Italien.\nÜber 800 Kunden stärken ihr HSE- und CSR-Management mit den Quentic Software-Lösungen. Mit neun individuell kombinierbaren Modulen umfasst die Online-Plattform Arbeitssicherheit, Risks & Audits, Control of Work, Gefahrstoffe, Legal Compliance, Online-Unterweisungen, Prozesse sowie Umweltmanagement und Nachhaltigkeit. Quentic vernetzt Daten, verbindet alle HSE- und CSR-Akteure und begeistert für das gesamte Aufgabenfeld - via Browser oder per App. Da Aufgaben über Abteilungen, Standorte und Länder hinweg ineinandergreifen, lassen sich Unternehmensprozesse effizient nach gesetzlichen Vorgaben steuern.\nDeine Aufgaben\nDu betreust überwiegend Bestandkunden im Medium und Large Enterprise Business der Industrie im DACH-Raum\nDu erkennst Up- & Cross-Selling-Potentiale und schöpfst sie aus\nDu führst Verhandlungen über Preise und Vertragsverlängerungen\nDu präsentierst unser Leistungsversprechen unseren Bestandskunden und analysierst ihren Bedarf\nDu repräsentierst Quentic auf Roadshows und Messen\nDu pflegst unser CRM-System und reportest regelmäßig an unsere Head of Account Management\nDu arbeitest mit externen Dienstleistern zusammen\nDu sicherst und erhöhst die Kundenzufriedenheit\n\nDeine Qualifikationen\nDu hast bereits umfangreiche Berufserfahrung in der Bestandkundenbetreuung im B2B Software-Bereich\nBegriffe wie Buying Center, Tender und Complex Sales sind Dir geläufig\nDu bist technikaffin und hast Interesse an den Themen Arbeitssicherheit, Nachhaltigkeit und Umweltschutz\nMit Empathie und Geschick gelingt es Dir, komplexe Sachverhalte verständlich zu präsentieren\nDu bist argumentationssicher und verhandlungsstark und kannst so unsere Business Software online und vor Ort sicher präsentieren \nDu sprichst fließend Deutsch und Englisch, weitere europäische Sprachen sind ein Plus\nDu bist bereit, innerhalb der DACH-Region zu reisen (i.d.R. 1-2 Tage pro Woche innerhalb Deines lokalen Vertriebgebiets)\n\nDeine Aussichten\nNicht gesättigtes Marktumfeld mit steigender Nachfrage\nUnterstützung durch ein starkes Marketing sowie unsere Consultants bei der Kundenbetreuung\nAttraktive Vergütung aus einem Fixgehalt und einer transparenten Variablen je nach Zielvereinbarung\nFirmen-Kreditkarte und ein mobiles Büro\nStrukturierte Einarbeitung und Betreuung durch Mentoren\nFlache Hierarchien mit offenen Türen in einer lockeren, professionellen Atmosphäre\nRegelmäßige Teamevents und ein besonderes Augenmerk auf die Work-Life-Balance (flexible Arbeitszeiten, Bezuschussung Fitness-Studio u. v. m.)\n\nWeitere InformationenWenn du die Welt ein bisschen sicherer machen und mehr über die Themen Umweltschutz, Arbeitssicherheit und Nachhaltigkeit erfahren möchtest, bist du bei uns genau richtig! 
Wer wir sind und wie wir arbeiten, siehst du hier", 13 | "description_introduction": "Passionate people for sustainable softwareQuentic ist einer der führenden Lösungsanbieter für Software as a Service (SaaS) im europäischen HSE- und CSR-Markt. Das Unternehmen hat seinen Hauptsitz in Berlin und beschäftigt über 250 Mitarbeitende. Niederlassungen befinden sich in Deutschland, Österreich und der Schweiz sowie in Finnland, Belgien, Dänemark, Schweden, den Niederlanden, Spanien und Italien.\nÜber 800 Kunden stärken ihr HSE- und CSR-Management mit den Quentic Software-Lösungen. Mit neun individuell kombinierbaren Modulen umfasst die Online-Plattform Arbeitssicherheit, Risks & Audits, Control of Work, Gefahrstoffe, Legal Compliance, Online-Unterweisungen, Prozesse sowie Umweltmanagement und Nachhaltigkeit. Quentic vernetzt Daten, verbindet alle HSE- und CSR-Akteure und begeistert für das gesamte Aufgabenfeld - via Browser oder per App. Da Aufgaben über Abteilungen, Standorte und Länder hinweg ineinandergreifen, lassen sich Unternehmensprozesse effizient nach gesetzlichen Vorgaben steuern.", 14 | "description_responsabilities": "Du betreust überwiegend Bestandkunden im Medium und Large Enterprise Business der Industrie im DACH-Raum\nDu erkennst Up- & Cross-Selling-Potentiale und schöpfst sie aus\nDu führst Verhandlungen über Preise und Vertragsverlängerungen\nDu präsentierst unser Leistungsversprechen unseren Bestandskunden und analysierst ihren Bedarf\nDu repräsentierst Quentic auf Roadshows und Messen\nDu pflegst unser CRM-System und reportest regelmäßig an unsere Head of Account Management\nDu arbeitest mit externen Dienstleistern zusammen\nDu sicherst und erhöhst die Kundenzufriedenheit", 15 | "description_requirements": "Du hast bereits umfangreiche Berufserfahrung in der Bestandkundenbetreuung im B2B Software-Bereich\nBegriffe wie Buying Center, Tender und Complex Sales sind Dir geläufig\nDu bist technikaffin und hast Interesse an den Themen Arbeitssicherheit, Nachhaltigkeit und Umweltschutz\nMit Empathie und Geschick gelingt es Dir, komplexe Sachverhalte verständlich zu präsentieren\nDu bist argumentationssicher und verhandlungsstark und kannst so unsere Business Software online und vor Ort sicher präsentieren \nDu sprichst fließend Deutsch und Englisch, weitere europäische Sprachen sind ein Plus\nDu bist bereit, innerhalb der DACH-Region zu reisen (i.d.R. 1-2 Tage pro Woche innerhalb Deines lokalen Vertriebgebiets)", 16 | "description_perks": "Nicht gesättigtes Marktumfeld mit steigender Nachfrage\nUnterstützung durch ein starkes Marketing sowie unsere Consultants bei der Kundenbetreuung\nAttraktive Vergütung aus einem Fixgehalt und einer transparenten Variablen je nach Zielvereinbarung\nFirmen-Kreditkarte und ein mobiles Büro\nStrukturierte Einarbeitung und Betreuung durch Mentoren\nFlache Hierarchien mit offenen Türen in einer lockeren, professionellen Atmosphäre\nRegelmäßige Teamevents und ein besonderes Augenmerk auf die Work-Life-Balance (flexible Arbeitszeiten, Bezuschussung Fitness-Studio u. v. 
m.)" 17 | } -------------------------------------------------------------------------------- /python/simplescraper/tasks/download_job_descriptions.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | 4 | from playwright.async_api import async_playwright, Error, TimeoutError 5 | 6 | from common.chunking import get_chunk_size 7 | from common.entity import JOB_DESCRIPTION 8 | from common.env_variables import DATA_SOURCE_URL, SEMAPHORE_COUNT, MAX_CHUNK_SIZE, LATEST_LOAD_TIMESTAMP, RUN_HEADLESS, \ 9 | MIN_TO_DOWNLOAD, MAX_TO_DOWNLOAD 10 | from common.logging import logger, configure_logger 11 | from common.storage import save_raw_file, load_temp_df, JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV 12 | 13 | TAB_HITS = 10 14 | 15 | 16 | class PageNotFound(Exception): 17 | pass 18 | 19 | 20 | async def open_first_page(browser): 21 | page = await browser.new_page() 22 | await page.goto(DATA_SOURCE_URL, wait_until='domcontentloaded') 23 | await page.click('#ccmgt_explicit_accept') 24 | for i in range(TAB_HITS * 2): 25 | await page.keyboard.press('Tab') 26 | await page.goto(DATA_SOURCE_URL + 'de/sitemap/', wait_until='domcontentloaded') 27 | for i in range(TAB_HITS * 2): 28 | await page.keyboard.press('Tab') 29 | return page 30 | 31 | 32 | async def download_urls(df, load_timestamp): 33 | if df.empty: 34 | return 35 | async with async_playwright() as p: 36 | chunk_pos = df['chunk_pos'].values[0] 37 | chunk_pos = str(chunk_pos).rjust(2) 38 | num_chunks = df['num_chunks'].values[0] 39 | chunk_size = df['chunk_size'].values[0] 40 | chunk_id = f'{chunk_pos}/{num_chunks}' 41 | browser = await p.chromium.launch(headless=RUN_HEADLESS, slow_mo=250) 42 | try: 43 | logger.info(f'Starting chunk {chunk_id} with size of {chunk_size}') 44 | start_time = time.time() 45 | page = await open_first_page(browser) 46 | url_dicts = df.to_dict('records') 47 | for url_dict in url_dicts: 48 | pos_in_chunk = url_dict['pos_in_chunk'] 49 | url = url_dict['url'] 50 | job_id = url.rsplit('--', 1) 51 | job_id = job_id[1] 52 | job_id = job_id.split('-') 53 | job_id = job_id[0] 54 | file_name = f'{job_id}.html' 55 | try: 56 | logger.debug(f'Chunk {chunk_id}: Downloading ({pos_in_chunk}/{chunk_size}): {url}') 57 | try: 58 | response = await page.goto(url, wait_until='domcontentloaded') 59 | for i in range(TAB_HITS): 60 | await page.keyboard.press('Tab') 61 | if response.status >= 400 and response.status >= 400 < 500: 62 | raise PageNotFound('Page not found') 63 | await page.wait_for_selector('.js-app-ld-ContentBlock', timeout=10000, state='attached') 64 | except TimeoutError as err: 65 | logger.warning( 66 | f'Chunk {chunk_id}: TimeoutError: second try for {url} because of the following error: {err}') 67 | await page.goto(DATA_SOURCE_URL + 'de/sitemap/', wait_until='domcontentloaded') 68 | for i in range(TAB_HITS): 69 | await page.keyboard.press('Tab') 70 | await page.goto(url, wait_until='domcontentloaded') 71 | for i in range(TAB_HITS): 72 | await page.keyboard.press('Tab') 73 | await page.wait_for_selector('.js-app-ld-ContentBlock', timeout=20000, state='attached') 74 | page_content = await page.content() 75 | save_raw_file(page_content, JOB_DESCRIPTION, load_timestamp, file_name) 76 | logger.success(f'Chunk {chunk_id}: Downloaded ({pos_in_chunk}/{chunk_size}): {url}') 77 | except TimeoutError: 78 | logger.warning(f'Chunk {chunk_id}: TimeoutError: Timeout error while requesting the page {url}') 79 | except AttributeError: 80 | logger.warning(f'Chunk {chunk_id}: 
AttributeError: it seems the following URL is gone {url}') 81 | except PageNotFound: 82 | logger.warning(f'Chunk {chunk_id}: PageNotFound: the following URL is no longer available {url}') 83 | except Error as err: 84 | logger.error(f'Chunk {chunk_id}: It seems that the browser crashed because of the following error: {err}') 85 | finally: 86 | await browser.close() 87 | 88 | elapsed_time = time.time() - start_time 89 | logger.info(f'Finished chunk {chunk_id}') 90 | logger.info(f'Elapsed time {chunk_id}: {elapsed_time:.2f} seconds') 91 | logger.info(f'Downloads per second {chunk_id}: {chunk_size / elapsed_time:.2f}') 92 | 93 | 94 | def split_dataframe(df, chunk_size): 95 | chunks = [] 96 | num_chunks = len(df) // chunk_size + 1 97 | for i in range(num_chunks): 98 | chunk = df[i * chunk_size:(i + 1) * chunk_size] 99 | chunk = chunk.reset_index(drop=True) 100 | chunk['chunk_pos'] = i + 1 101 | chunk['num_chunks'] = num_chunks 102 | chunk['pos_in_chunk'] = chunk.index + 1 103 | chunk['chunk_size'] = chunk.shape[0] 104 | chunks.append(chunk) 105 | return chunks 106 | 107 | 108 | async def safe_download_urls(urls, load_timestamp, sem): 109 | async with sem: # semaphore limits num of simultaneous downloads 110 | return await download_urls(urls, load_timestamp) 111 | 112 | 113 | async def run_async_tasks(chunks, load_timestamp): 114 | sem = asyncio.Semaphore(SEMAPHORE_COUNT) 115 | tasks = [ 116 | asyncio.ensure_future(safe_download_urls(chunk, load_timestamp, sem)) # creating task starts coroutine 117 | for chunk 118 | in chunks 119 | ] 120 | await asyncio.gather(*tasks) 121 | 122 | 123 | def download_job_descriptions(load_timestamp, df_to_download=None): 124 | configure_logger(load_timestamp) 125 | df = df_to_download if df_to_download is not None else load_temp_df(load_timestamp, JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV) 126 | 127 | if MAX_TO_DOWNLOAD: 128 | pending_donwnload = df.shape[0] - MAX_TO_DOWNLOAD if df.shape[0] > MAX_TO_DOWNLOAD else 0 129 | df = df.head(MAX_TO_DOWNLOAD) 130 | else: 131 | pending_donwnload = 0 132 | 133 | total_count = df.shape[0] 134 | 135 | if total_count < MIN_TO_DOWNLOAD: 136 | logger.success(f'Not enough to download: {total_count} for the load timestamp {load_timestamp}') 137 | return 138 | 139 | chunk_size = get_chunk_size(total_count, SEMAPHORE_COUNT, MAX_CHUNK_SIZE) 140 | chunks = split_dataframe(df, chunk_size) 141 | 142 | start_time = time.time() 143 | logger.info(f'Starting downloading job descriptions for job: {load_timestamp}') 144 | logger.info(f'Concurrent tasks: {SEMAPHORE_COUNT}') 145 | logger.info(f'Urls to download: {total_count}') 146 | logger.info(f'Pending download: {pending_donwnload}') 147 | 148 | loop = asyncio.SelectorEventLoop() 149 | asyncio.set_event_loop(loop) 150 | try: 151 | loop.run_until_complete(run_async_tasks(chunks, load_timestamp)) 152 | finally: 153 | loop.run_until_complete(loop.shutdown_asyncgens()) 154 | loop.close() 155 | 156 | elapsed_time = time.time() - start_time 157 | logger.info(f'Elapsed time: {elapsed_time:.2f} seconds') 158 | logger.info(f'Downloads per second: {total_count / elapsed_time:.2f}') 159 | logger.success(f'Finished: {total_count} urls for the timestamp {load_timestamp}') 160 | logger.success(f'Pending download: {pending_donwnload} urls for the timestamp {load_timestamp}') 161 | 162 | 163 | if __name__ == '__main__': 164 | download_job_descriptions( 165 | LATEST_LOAD_TIMESTAMP, 166 | load_temp_df(LATEST_LOAD_TIMESTAMP, JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV), 167 | ) 168 | 
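A minimal, self-contained sketch of the chunk-and-semaphore throttling pattern used in download_job_descriptions.py above: the URL list is split into chunks and an asyncio.Semaphore caps how many chunks are downloaded at the same time. Every name and number below (download_chunk, the dummy URLs, chunk size 4, the concurrency limit of 2) is an illustrative assumption standing in for the real get_chunk_size, split_dataframe and SEMAPHORE_COUNT settings, not the project's actual code or values.

import asyncio


async def download_chunk(chunk, chunk_id, sem):
    # The semaphore caps how many chunks are in flight at once,
    # mirroring the safe_download_urls/run_async_tasks pattern above.
    async with sem:
        for url in chunk:
            await asyncio.sleep(0)  # placeholder for the real page download
        return chunk_id


async def main():
    urls = [f'https://example.com/job--{i}-title' for i in range(10)]  # dummy URLs
    chunk_size = 4  # stands in for get_chunk_size(total_count, SEMAPHORE_COUNT, MAX_CHUNK_SIZE)
    chunks = [urls[i:i + chunk_size] for i in range(0, len(urls), chunk_size)]
    sem = asyncio.Semaphore(2)  # stands in for SEMAPHORE_COUNT
    finished = await asyncio.gather(*(download_chunk(c, i + 1, sem) for i, c in enumerate(chunks)))
    print(f'finished chunks: {finished}')


if __name__ == '__main__':
    asyncio.run(main())

With 10 URLs and a chunk size of 4 this yields 3 chunks, of which at most 2 are processed in parallel; the real task applies the same idea with one Playwright browser per chunk.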
-------------------------------------------------------------------------------- /python/simplescraper/common/storage.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module will store the files in the following structure 3 | - root 4 | - 5 | - 6 | - 7 | - 8 | - 9 | """ 10 | import datetime 11 | import glob 12 | import os 13 | import pathlib 14 | 15 | import pandas as pd 16 | import pyarrow as pa 17 | import pyarrow.parquet as pq 18 | from dateutil import parser 19 | from pyarrow import ArrowInvalid 20 | 21 | from common.entity import Entity 22 | from common.env_variables import DATA_SOURCE_NAME, RAW_DIR, CLEANSED_DIR, TEMP_DIR, AZURE_STORAGE_CONNECTION_STRING, \ 23 | AZURE_STORAGE_CONTAINER_NAME, DATA_DIR, UPLOAD_TO_AZURE, BACKUP_DIR, CURATED_DIR 24 | from common.logging import logger 25 | 26 | LOAD_TIMESTAMP_FORMAT = '%Y/%m/%d/%H-%M-%S' 27 | LOAD_DATE_FORMAT = '%Y/%m/%d' 28 | 29 | RAW_LAYER = 'raw' 30 | CLEANSED_LAYER = 'cleansed' 31 | CURATED_LAYER = 'curated' 32 | TEMP_LAYER = 'temp' 33 | 34 | LAYERS = [RAW_LAYER, CLEANSED_LAYER, CURATED_LAYER, TEMP_LAYER] 35 | 36 | LAYER_DIR = { 37 | RAW_LAYER: RAW_DIR, 38 | CLEANSED_LAYER: CLEANSED_DIR, 39 | CURATED_LAYER: CURATED_DIR, 40 | TEMP_LAYER: TEMP_DIR, 41 | } 42 | 43 | DOWNLOADED_JOB_DESCRIPTIONS_CSV = '11_downloaded_job_descriptions.csv' 44 | SITEMAP_URLS_CSV = '12_sitemap_urls.csv' 45 | JOB_DESCRIPTIONS_TO_DOWNLOAD_CSV = '13_job_descriptions_to_download.csv' 46 | PARSED_JOB_DESCRIPTIONS_CSV = '21_parsed_job_descriptions.csv' 47 | JOB_DESCRIPTIONS_TO_PARSE_CSV = '22_job_descriptions_to_parse.csv' 48 | DOWNLOADED_SITEMAPS_CSV = '31_downloaded_sitemaps.csv' 49 | PARSED_SITEMAP_DATES_CSV = '32_parsed_sitemap_dates.csv' 50 | SITEMAPS_TO_PARSE_CSV = '33_sitemaps_to_parse.csv' 51 | 52 | 53 | def list_raw_files(data_source, entity: Entity, load_date=None): 54 | dir_path = os.path.join(RAW_DIR, data_source, entity.name) 55 | if load_date: 56 | dir_path = os.path.join(dir_path, load_date) 57 | file_list = [{ 58 | 'load_timestamp': '/'.join(f.split('/')[-5:-1]), 59 | 'file_name': f.split('/')[-1], 60 | } for f in glob.iglob(dir_path + '/**/*', recursive=True) if os.path.isfile(f) and 'latest' not in f] 61 | return file_list 62 | 63 | 64 | def list_raw_days(data_source, entity: Entity): 65 | dir_path = os.path.join(RAW_DIR, data_source, entity.name) 66 | file_list = [{ 67 | 'date': ''.join(f.split('/')[-3:]), 68 | } for f in glob.iglob(dir_path + '/*/*/*', recursive=True) if os.path.isdir(f) and 'latest' not in f] 69 | return file_list 70 | 71 | 72 | def list_backup_days(data_source, entity: Entity): 73 | dir_path = os.path.join(BACKUP_DIR, data_source, entity.name) 74 | file_list = [{ 75 | 'date': f.split('.')[-3], 76 | } for f in glob.iglob(dir_path + '/**/*', recursive=True) if os.path.isfile(f)] 77 | return file_list 78 | 79 | 80 | def get_load_timestamp(ts=None): 81 | if ts is None: 82 | load_timestamp = datetime.datetime.today().strftime(LOAD_TIMESTAMP_FORMAT) 83 | else: 84 | load_timestamp = parser.parse(ts).strftime(LOAD_TIMESTAMP_FORMAT) 85 | return load_timestamp 86 | 87 | 88 | def get_load_date(ds=None): 89 | if ds is None: 90 | load_date = (datetime.datetime.today() - datetime.timedelta(days=1)).strftime(LOAD_DATE_FORMAT) 91 | else: 92 | load_date = parser.parse(ds).strftime(LOAD_DATE_FORMAT) 93 | return load_date 94 | 95 | 96 | def get_filters_from_load_date(load_date: str): 97 | year, month, day = load_date.split('/', 2) 98 | filters = [ 99 | ('year', '=', int(year)), 100 | ('month', 
'=', int(month)), 101 | ('day', '=', int(day)), 102 | ] 103 | return filters 104 | 105 | 106 | def create_dir(file_path): 107 | dir_path = os.path.dirname(file_path) 108 | pathlib.Path(dir_path).mkdir(parents=True, exist_ok=True) 109 | 110 | 111 | def save_local_file(content, file_path): 112 | create_dir(file_path) 113 | file_type = "w" if isinstance(content, str) else "wb" 114 | with open(file_path, file_type) as f: 115 | f.write(content) 116 | 117 | 118 | def save_remote_file(content, blob_name): 119 | from azure.storage.blob import BlockBlobService 120 | logger.debug(f'save_remote_file start: {blob_name}') 121 | blob_service_client = BlockBlobService(connection_string=AZURE_STORAGE_CONNECTION_STRING) 122 | if isinstance(content, str): 123 | blob_service_client.create_blob_from_text(AZURE_STORAGE_CONTAINER_NAME, blob_name, content) 124 | else: 125 | blob_service_client.create_blob_from_bytes(AZURE_STORAGE_CONTAINER_NAME, blob_name, content) 126 | logger.success(f'save_remote_file end: {blob_name}') 127 | 128 | 129 | def save_raw_file(content, entity: Entity, load_timestamp: str, file_name): 130 | blob_name = os.path.join(RAW_LAYER, DATA_SOURCE_NAME, entity.name, load_timestamp, file_name) 131 | file_path = os.path.join(DATA_DIR, blob_name) 132 | save_local_file(content, file_path) 133 | if UPLOAD_TO_AZURE: 134 | save_remote_file(content, blob_name) 135 | 136 | 137 | def load_raw_file(entity: Entity, load_timestamp, file_name): 138 | file_path = os.path.join(LAYER_DIR[RAW_LAYER], DATA_SOURCE_NAME, entity.name, load_timestamp, file_name) 139 | with open(file_path, 'r') as f: 140 | content = f.read() 141 | return content 142 | 143 | 144 | def save_temp_df(df: pd.DataFrame, load_timestamp: str, file_name: str): 145 | temp_dir = os.path.join(TEMP_DIR, load_timestamp) 146 | if not os.path.exists(temp_dir): 147 | os.makedirs(temp_dir) 148 | # noinspection PyTypeChecker 149 | df.to_csv(os.path.join(temp_dir, file_name), index=False) 150 | 151 | 152 | def load_temp_df(load_timestamp: str, file_name: str) -> pd.DataFrame: 153 | return pd.read_csv(os.path.join(TEMP_DIR, load_timestamp, file_name)) 154 | 155 | 156 | def list_parquet_files(layer, entity: Entity, relative_paths): 157 | dir_path = os.path.join(LAYER_DIR[layer], DATA_SOURCE_NAME, entity.name) 158 | file_list = [f for f in glob.iglob(dir_path + '/**/*.parquet', recursive=True) if os.path.isfile(f)] 159 | if relative_paths: 160 | file_list = [file_path.replace(dir_path + '/', '') for file_path in file_list] 161 | return file_list 162 | 163 | 164 | def list_cleansed_files(entity: Entity, relative_paths=True): 165 | return list_parquet_files(CLEANSED_LAYER, entity, relative_paths) 166 | 167 | 168 | def save_parquet_df(df: pd.DataFrame, layer, entity: Entity): 169 | # noinspection PyArgumentList 170 | table: pa.Table = pa.Table.from_pandas(df, preserve_index=False) 171 | root_path = os.path.join(LAYER_DIR[layer], DATA_SOURCE_NAME, entity.name) 172 | pq.write_to_dataset(table, 173 | root_path, 174 | partition_cols=['year', 'month', 'day'], 175 | basename_template='part-{i}.parquet', 176 | existing_data_behavior='delete_matching', 177 | use_legacy_dataset=False) 178 | 179 | 180 | def save_cleansed_df(df: pd.DataFrame, entity: Entity): 181 | save_parquet_df(df, CLEANSED_LAYER, entity) 182 | 183 | 184 | def save_curated_df(df: pd.DataFrame, entity: Entity): 185 | save_parquet_df(df, CURATED_LAYER, entity) 186 | 187 | 188 | def load_parquet_df(layer, entity: Entity, columns, filters) -> pd.DataFrame: 189 | # noinspection PyArgumentList 190 | 
root_path = os.path.join(LAYER_DIR[layer], DATA_SOURCE_NAME, entity.name) 191 | try: 192 | table = pq.read_table(root_path, columns=columns, filters=filters, use_legacy_dataset=False) 193 | return table.to_pandas() 194 | except (FileNotFoundError, ArrowInvalid): 195 | return pd.DataFrame(columns=columns) 196 | 197 | 198 | def load_cleansed_df(entity: Entity, columns=None, filters=None, load_date=None) -> pd.DataFrame: 199 | if filters is None and load_date is not None: 200 | filters = get_filters_from_load_date(load_date) 201 | return load_parquet_df(CLEANSED_LAYER, entity, columns, filters) 202 | -------------------------------------------------------------------------------- /python/simplescraper/explore/explore_dwh_mart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "pycharm": { 8 | "name": "#%%\n" 9 | } 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "from common.explore import display_sql" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/plain": " job_id total\n0 7543521 12\n1 7369771 10\n2 7723680 9\n3 7599993 8\n4 7571802 8", 23 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
job_idtotal
0754352112
1736977110
277236809
375999938
475718028
\n
" 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "display_sql(f'''\n", 32 | "SELECT job_id,\n", 33 | " COUNT(1) AS total\n", 34 | " FROM curated.job\n", 35 | " GROUP BY 1\n", 36 | " ORDER BY 2 DESC\n", 37 | " LIMIT 5\n", 38 | "''')" 39 | ], 40 | "metadata": { 41 | "collapsed": false, 42 | "pycharm": { 43 | "name": "#%%\n" 44 | } 45 | } 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "pycharm": { 52 | "name": "#%%\n" 53 | } 54 | }, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": " load_timestamp title \\\n0 2022-01-26 16:26:20 Vertriebsmitarbeiter/in Innendienst (m/w/d) \n1 2022-01-20 10:00:00 Innendienst Vertrieb Ausstellung (m/w/d) \n2 2022-01-10 19:00:00 Mitarbeiter/in Vertrieb Ausstellung (m/w/d) \n3 2022-01-08 13:00:00 Berater Ausstellung (m/w/d) \n4 2021-12-18 14:00:00 Verkaufsberater Ausstellung (m/w/d) \n5 2021-11-20 11:00:00 Berater Ausstellung (m/w/d) \n6 2021-11-12 16:00:00 Fachberater Ausstellung (m/w/d) \n7 2021-11-10 17:00:00 Fachberater - Glaser / Schreiner (m/w/d) \n8 2021-10-14 21:00:00 Kaufmännische/r Angestellte/r (m/w/d) \n9 2021-10-07 08:00:00 Kaufmännische/r Angestellte/r (m/w/d) \n10 2021-10-06 11:00:00 Kaufmännischer Angestellter (m/w/d) \n11 2021-10-05 08:00:00 Kaufmännischer Angestellter (m/w/d) \n\n online_date \n0 2022-01-02T13:03:06Z \n1 2022-01-02T13:03:06Z \n2 2022-01-02T13:03:06Z \n3 2022-01-02T13:03:06Z \n4 2021-12-18T13:03:05Z \n5 2021-11-13T17:03:10Z \n6 2021-10-29T15:30:01Z \n7 2021-10-29T15:30:01Z \n8 2021-10-06T15:03:04Z \n9 2021-10-06T15:03:04Z \n10 2021-09-21T14:32:36Z \n11 2021-09-21T14:32:36Z ", 59 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
load_timestamptitleonline_date
02022-01-26 16:26:20Vertriebsmitarbeiter/in Innendienst (m/w/d)2022-01-02T13:03:06Z
12022-01-20 10:00:00Innendienst Vertrieb Ausstellung (m/w/d)2022-01-02T13:03:06Z
22022-01-10 19:00:00Mitarbeiter/in Vertrieb Ausstellung (m/w/d)2022-01-02T13:03:06Z
32022-01-08 13:00:00Berater Ausstellung (m/w/d)2022-01-02T13:03:06Z
42021-12-18 14:00:00Verkaufsberater Ausstellung (m/w/d)2021-12-18T13:03:05Z
52021-11-20 11:00:00Berater Ausstellung (m/w/d)2021-11-13T17:03:10Z
62021-11-12 16:00:00Fachberater Ausstellung (m/w/d)2021-10-29T15:30:01Z
72021-11-10 17:00:00Fachberater - Glaser / Schreiner (m/w/d)2021-10-29T15:30:01Z
82021-10-14 21:00:00Kaufmännische/r Angestellte/r (m/w/d)2021-10-06T15:03:04Z
92021-10-07 08:00:00Kaufmännische/r Angestellte/r (m/w/d)2021-10-06T15:03:04Z
102021-10-06 11:00:00Kaufmännischer Angestellter (m/w/d)2021-09-21T14:32:36Z
112021-10-05 08:00:00Kaufmännischer Angestellter (m/w/d)2021-09-21T14:32:36Z
\n
" 60 | }, 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "display_sql(f'''\n", 68 | "SELECT load_timestamp,\n", 69 | " title,\n", 70 | " online_date\n", 71 | " FROM curated.job\n", 72 | " WHERE job_id = 7543521\n", 73 | " ORDER BY load_timestamp DESC\n", 74 | " LIMIT 20\n", 75 | "''')" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": " job_id load_timestamp\n0 7543521 2022-01-26 16:26:20", 85 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
job_idload_timestamp
075435212022-01-26 16:26:20
\n
" 86 | }, 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "display_sql(f'''\n", 94 | "SELECT job_id,\n", 95 | " load_timestamp\n", 96 | "FROM (\n", 97 | " SELECT j.*,\n", 98 | " row_number()\n", 99 | " OVER (\n", 100 | " PARTITION BY job_id ORDER BY load_timestamp DESC\n", 101 | " ) AS seqnum\n", 102 | " FROM curated.job j\n", 103 | " WHERE job_id = 7543521\n", 104 | ") j\n", 105 | "WHERE seqnum = 1;\n", 106 | "''')\n" 107 | ], 108 | "metadata": { 109 | "collapsed": false, 110 | "pycharm": { 111 | "name": "#%%\n" 112 | } 113 | } 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3 (ipykernel)", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.10.6" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 1 137 | } -------------------------------------------------------------------------------- /python/simplescraper/flasky.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | from flask import Flask, request, Request 5 | 6 | from common.env_variables import SOURCE_DIR 7 | from common.logging import logger 8 | from common.storage import get_load_timestamp, get_load_date 9 | from tasks.cleanse_job_descriptions import cleanse_job_descriptions 10 | from tasks.cleanse_sitemaps import cleanse_sitemaps 11 | from tasks.curate_job_descriptions import curate_job_descriptions 12 | from tasks.curate_sitemaps import curate_sitemaps 13 | from tasks.download_job_descriptions import download_job_descriptions 14 | from tasks.download_sitemap import download_sitemap 15 | from tasks.list_downloaded_job_descriptions import list_downloaded_job_descriptions 16 | from tasks.list_job_descriptions_to_download import list_job_descriptions_to_download 17 | from tasks.prune_old_raw import prune_old_raw 18 | 19 | SUCCESS_RETURN_CODE = 0 20 | 21 | DEFAULT_DATA_INTERVAL_END = '2022-09-08T00:00:00+00:00' 22 | DEFAULT_DS = '2022-09-07' 23 | 24 | SUCCESS = {'result_status': 'success', }, 200 25 | 26 | HTML_FORM = f''' 27 |
28 |
29 |
30 | 31 |
32 | ''' 33 | 34 | 35 | def is_connected_to_vpn(): 36 | return os.system('/usr/sbin/scutil --nc list | grep Connected | grep vpn') == 0 37 | 38 | 39 | class RequestParams: 40 | def __init__(self, _request: Request): 41 | form = _request.form 42 | self.load_timestamp = get_load_timestamp(form.get('data_interval_end')) 43 | self.load_date = get_load_date(form.get('ds')) 44 | logger.info(self.__dict__) 45 | 46 | 47 | app = Flask(__name__) 48 | 49 | 50 | @app.route('/') 51 | def index(): 52 | return 'Check VPN Status
' \ 53 | 'List Downloaded Descriptions
' \ 54 | 'Download Sitemap
' \ 55 | 'List Job Descriptions to Download
' \ 56 | 'Download Job Descriptions
' \ 57 | 'Cleanse Sitemap
' \ 58 | 'Cleanse Job Descriptions
' \ 59 | 'Do dbt run
' \ 60 | 'Do Day Backup
' \ 61 | 'Validate Day Backup
' \ 62 | 'Test
' 63 | 64 | 65 | @app.route('/do/check_vpn_status') 66 | def do_check_vpn_status(): 67 | logger.info('is_connected_to_vpn: start') 68 | is_connected = is_connected_to_vpn() 69 | logger.info('is_connected_to_vpn: end') 70 | if is_connected: 71 | return SUCCESS 72 | else: 73 | return {'result_status': 'failed'}, 400 74 | 75 | 76 | @app.route('/do/list_downloaded_job_descriptions', methods=['GET', 'POST']) 77 | def do_list_downloaded_urls(): 78 | if request.method == 'POST': 79 | params = RequestParams(request) 80 | list_downloaded_job_descriptions(params.load_timestamp) 81 | return SUCCESS 82 | elif request.method == 'GET': 83 | return HTML_FORM 84 | 85 | 86 | @app.route('/do/download_sitemap', methods=['GET', 'POST']) 87 | def do_download_sitemap(): 88 | if request.method == 'POST': 89 | if is_connected_to_vpn(): 90 | params = RequestParams(request) 91 | download_sitemap(params.load_timestamp) 92 | return {'result_status': 'success'}, 200 93 | else: 94 | return {'result_status': 'failed'}, 400 95 | elif request.method == 'GET': 96 | return HTML_FORM 97 | 98 | 99 | @app.route('/do/list_job_descriptions_to_download', methods=['GET', 'POST']) 100 | def do_list_job_descriptions_to_download(): 101 | if request.method == 'POST': 102 | if is_connected_to_vpn(): 103 | params = RequestParams(request) 104 | list_job_descriptions_to_download(params.load_timestamp) 105 | return SUCCESS 106 | else: 107 | return {'result_status': 'failed'}, 400 108 | elif request.method == 'GET': 109 | return HTML_FORM 110 | 111 | 112 | @app.route('/do/download_job_descriptions', methods=['GET', 'POST']) 113 | def do_download_job_descriptions(): 114 | if request.method == 'POST': 115 | if is_connected_to_vpn(): 116 | params = RequestParams(request) 117 | download_job_descriptions(params.load_timestamp) 118 | return SUCCESS 119 | else: 120 | return {'result_status': 'failed'}, 400 121 | elif request.method == 'GET': 122 | return HTML_FORM 123 | 124 | 125 | @app.route('/do/cleanse_sitemaps', methods=['GET', 'POST']) 126 | def do_cleanse_sitemaps(): 127 | if request.method == 'POST': 128 | params = RequestParams(request) 129 | cleanse_sitemaps(params.load_timestamp, params.load_date) 130 | return SUCCESS 131 | elif request.method == 'GET': 132 | return HTML_FORM 133 | 134 | 135 | @app.route('/do/cleanse_job_descriptions', methods=['GET', 'POST']) 136 | def do_cleanse_job_descriptions(): 137 | if request.method == 'POST': 138 | params = RequestParams(request) 139 | cleanse_job_descriptions(params.load_timestamp, params.load_date) 140 | return SUCCESS 141 | elif request.method == 'GET': 142 | return HTML_FORM 143 | 144 | 145 | @app.route('/do/curate_sitemaps', methods=['GET', 'POST']) 146 | def do_curate_sitemaps(): 147 | if request.method == 'POST': 148 | params = RequestParams(request) 149 | curate_sitemaps(params.load_timestamp, params.load_date) 150 | return SUCCESS 151 | elif request.method == 'GET': 152 | return HTML_FORM 153 | 154 | 155 | @app.route('/do/curate_job_descriptions', methods=['GET', 'POST']) 156 | def do_curate_job_descriptions(): 157 | if request.method == 'POST': 158 | params = RequestParams(request) 159 | curate_job_descriptions(params.load_timestamp, params.load_date) 160 | return SUCCESS 161 | elif request.method == 'GET': 162 | return HTML_FORM 163 | 164 | 165 | @app.route('/do/do_day_backup', methods=['GET', 'POST']) 166 | def do_do_day_backup(): 167 | if request.method == 'POST': 168 | params = RequestParams(request) 169 | year, month, day = params.load_date.split('/') 170 | result = 
subprocess.run([f'{SOURCE_DIR}/simplescraper/do_day_backup.sh', year, month, day]) 171 | if result.returncode == SUCCESS_RETURN_CODE: 172 | return SUCCESS 173 | else: 174 | return { 175 | 'result_status': 'error', 176 | }, 400 177 | elif request.method == 'GET': 178 | return HTML_FORM 179 | 180 | 181 | @app.route('/do/do_dbt_run', methods=['GET', 'POST']) 182 | def do_dbt_run(): 183 | if request.method == 'POST': 184 | _ = RequestParams(request) 185 | result = subprocess.run([f'{SOURCE_DIR}/simplescraper/do_dbt_run.sh']) 186 | if result.returncode == SUCCESS_RETURN_CODE: 187 | return SUCCESS 188 | else: 189 | return { 190 | 'result_status': 'error', 191 | }, 400 192 | elif request.method == 'GET': 193 | return HTML_FORM 194 | 195 | 196 | @app.route('/do/verify_day_backup', methods=['GET', 'POST']) 197 | def do_verify_day_backup(): 198 | if request.method == 'POST': 199 | params = RequestParams(request) 200 | year, month, day = params.load_date.split('/') 201 | result = subprocess.run([f'{SOURCE_DIR}/simplescraper/verify_day_backup.sh', year, month, day]) 202 | if result.returncode == SUCCESS_RETURN_CODE: 203 | return SUCCESS 204 | else: 205 | return { 206 | 'result_status': 'error', 207 | }, 400 208 | elif request.method == 'GET': 209 | return HTML_FORM 210 | 211 | 212 | @app.route('/do/prune_old_raw', methods=['GET', 'POST']) 213 | def do_prune_old_raw(): 214 | if request.method == 'POST': 215 | params = RequestParams(request) 216 | prune_old_raw(params.load_timestamp, params.load_date) 217 | return SUCCESS 218 | elif request.method == 'GET': 219 | return HTML_FORM 220 | 221 | 222 | @app.route('/do/test', methods=['GET', 'POST']) 223 | def do_test(): 224 | if request.method == 'POST': 225 | params = RequestParams(request) 226 | return { 227 | 'result_status': 'success', 228 | 'load_timestamp': params.load_timestamp, 229 | 'load_date': params.load_date, 230 | }, 200 231 | elif request.method == 'GET': 232 | return HTML_FORM 233 | -------------------------------------------------------------------------------- /python/simplescraper/explore/explore_dwh_mart_dim_time.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "pycharm": { 8 | "name": "#%%\n" 9 | } 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "from common.explore import display_sql" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/plain": " date_key year month day month_name year_week day_of_week \\\n0 2021-10-09 2021 10 9 October 202140 6 \n1 2021-10-10 2021 10 10 October 202140 7 \n2 2021-10-11 2021 10 11 October 202141 1 \n3 2021-10-12 2021 10 12 October 202141 2 \n4 2021-10-13 2021 10 13 October 202141 3 \n.. ... ... ... ... ... ... ... \n353 2022-09-27 2022 9 27 September 202239 2 \n354 2022-09-28 2022 9 28 September 202239 3 \n355 2022-09-29 2022 9 29 September 202239 4 \n356 2022-09-30 2022 9 30 September 202239 5 \n357 2022-10-01 2022 10 1 October 202239 6 \n\n day_of_week_name \n0 Saturday \n1 Sunday \n2 Monday \n3 Tuesday \n4 Wednesday \n.. ... \n353 Tuesday \n354 Wednesday \n355 Thursday \n356 Friday \n357 Saturday \n\n[358 rows x 8 columns]", 23 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
date_keyyearmonthdaymonth_nameyear_weekday_of_weekday_of_week_name
02021-10-092021109October2021406Saturday
12021-10-1020211010October2021407Sunday
22021-10-1120211011October2021411Monday
32021-10-1220211012October2021412Tuesday
42021-10-1320211013October2021413Wednesday
...........................
3532022-09-272022927September2022392Tuesday
3542022-09-282022928September2022393Wednesday
3552022-09-292022929September2022394Thursday
3562022-09-302022930September2022395Friday
3572022-10-012022101October2022396Saturday
\n

358 rows × 8 columns

\n
" 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "display_sql(f'''\n", 32 | "WITH unique_online_at AS (\n", 33 | " SELECT DISTINCT online_at\n", 34 | " FROM curated.online_job\n", 35 | " ORDER BY 1\n", 36 | ")\n", 37 | "SELECT online_at as date_key,\n", 38 | " date_part('year', online_at) as year,\n", 39 | " date_part('month', online_at) as month,\n", 40 | " date_part('day', online_at) as day,\n", 41 | " monthname(online_at) as month_name,\n", 42 | " date_part('yearweek', online_at) as year_week,\n", 43 | " date_part('isodow', online_at) as day_of_week,\n", 44 | " dayname(online_at) as day_of_week_name\n", 45 | " FROM unique_online_at\n", 46 | "''')\n" 47 | ], 48 | "metadata": { 49 | "collapsed": false, 50 | "pycharm": { 51 | "name": "#%%\n" 52 | } 53 | } 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": " date_key year month day month_name year_week day_of_week \\\n0 2021-10-09 2021 10 9 October 202140 6 \n1 2021-10-10 2021 10 10 October 202140 7 \n2 2021-10-11 2021 10 11 October 202141 1 \n3 2021-10-12 2021 10 12 October 202141 2 \n4 2021-10-13 2021 10 13 October 202141 3 \n.. ... ... ... ... ... ... ... \n353 2022-09-27 2022 9 27 September 202239 2 \n354 2022-09-28 2022 9 28 September 202239 3 \n355 2022-09-29 2022 9 29 September 202239 4 \n356 2022-09-30 2022 9 30 September 202239 5 \n357 2022-10-01 2022 10 1 October 202239 6 \n\n day_of_week_name \n0 Saturday \n1 Sunday \n2 Monday \n3 Tuesday \n4 Wednesday \n.. ... \n353 Tuesday \n354 Wednesday \n355 Thursday \n356 Friday \n357 Saturday \n\n[358 rows x 8 columns]", 62 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
date_keyyearmonthdaymonth_nameyear_weekday_of_weekday_of_week_name
02021-10-092021109October2021406Saturday
12021-10-1020211010October2021407Sunday
22021-10-1120211011October2021411Monday
32021-10-1220211012October2021412Tuesday
42021-10-1320211013October2021413Wednesday
...........................
3532022-09-272022927September2022392Tuesday
3542022-09-282022928September2022393Wednesday
3552022-09-292022929September2022394Thursday
3562022-09-302022930September2022395Friday
3572022-10-012022101October2022396Saturday
\n

358 rows × 8 columns

\n
" 63 | }, 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "display_sql(f'''\n", 71 | "SELECT *\n", 72 | " FROM dim_time\n", 73 | "''')" 74 | ], 75 | "metadata": { 76 | "collapsed": false, 77 | "pycharm": { 78 | "name": "#%%\n" 79 | } 80 | } 81 | } 82 | ], 83 | "metadata": { 84 | "kernelspec": { 85 | "display_name": "Python 3 (ipykernel)", 86 | "language": "python", 87 | "name": "python3" 88 | }, 89 | "language_info": { 90 | "codemirror_mode": { 91 | "name": "ipython", 92 | "version": 3 93 | }, 94 | "file_extension": ".py", 95 | "mimetype": "text/x-python", 96 | "name": "python", 97 | "nbconvert_exporter": "python", 98 | "pygments_lexer": "ipython3", 99 | "version": "3.10.6" 100 | } 101 | }, 102 | "nbformat": 4, 103 | "nbformat_minor": 1 104 | } -------------------------------------------------------------------------------- /python/simplescraper/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.10 3 | # To update, run: 4 | # 5 | # pip-compile --allow-unsafe requirements.in 6 | # 7 | appnope==0.1.3 8 | # via 9 | # ipykernel 10 | # ipython 11 | argon2-cffi==21.3.0 12 | # via notebook 13 | argon2-cffi-bindings==21.2.0 14 | # via argon2-cffi 15 | astor==0.8.1 16 | # via wemake-python-styleguide 17 | asttokens==2.0.8 18 | # via stack-data 19 | attrs==22.1.0 20 | # via 21 | # flake8-bugbear 22 | # flake8-eradicate 23 | # jsonschema 24 | # pytest 25 | # wemake-python-styleguide 26 | azure-common==1.1.28 27 | # via 28 | # azure-storage-blob 29 | # azure-storage-common 30 | azure-storage-blob==2.1.0 31 | # via -r requirements.in 32 | azure-storage-common==2.1.0 33 | # via azure-storage-blob 34 | backcall==0.2.0 35 | # via ipython 36 | bandit==1.7.4 37 | # via flake8-bandit 38 | beautifulsoup4==4.11.1 39 | # via 40 | # -r requirements.in 41 | # nbconvert 42 | bleach==5.0.1 43 | # via nbconvert 44 | build==0.8.0 45 | # via pip-tools 46 | certifi==2022.6.15.1 47 | # via requests 48 | cffi==1.15.1 49 | # via 50 | # argon2-cffi-bindings 51 | # cryptography 52 | charset-normalizer==2.1.1 53 | # via requests 54 | click==8.1.3 55 | # via 56 | # flask 57 | # pip-tools 58 | cryptography==38.0.1 59 | # via azure-storage-common 60 | darglint==1.8.1 61 | # via wemake-python-styleguide 62 | debugpy==1.6.3 63 | # via ipykernel 64 | decorator==5.1.1 65 | # via ipython 66 | defusedxml==0.7.1 67 | # via nbconvert 68 | docutils==0.19 69 | # via restructuredtext-lint 70 | duckdb==0.7.0 71 | # via -r requirements.in 72 | entrypoints==0.4 73 | # via jupyter-client 74 | eradicate==2.1.0 75 | # via flake8-eradicate 76 | executing==1.0.0 77 | # via stack-data 78 | fastjsonschema==2.16.1 79 | # via nbformat 80 | flake8==4.0.1 81 | # via 82 | # flake8-bandit 83 | # flake8-broken-line 84 | # flake8-bugbear 85 | # flake8-commas 86 | # flake8-comprehensions 87 | # flake8-debugger 88 | # flake8-docstrings 89 | # flake8-eradicate 90 | # flake8-isort 91 | # flake8-polyfill 92 | # flake8-quotes 93 | # flake8-rst-docstrings 94 | # flake8-string-format 95 | # pep8-naming 96 | # wemake-python-styleguide 97 | flake8-bandit==3.0.0 98 | # via wemake-python-styleguide 99 | flake8-broken-line==0.4.0 100 | # via wemake-python-styleguide 101 | flake8-bugbear==22.9.11 102 | # via wemake-python-styleguide 103 | flake8-commas==2.1.0 104 | # via wemake-python-styleguide 105 | flake8-comprehensions==3.10.0 106 | # via wemake-python-styleguide 107 | 
flake8-debugger==4.1.2 108 | # via wemake-python-styleguide 109 | flake8-docstrings==1.6.0 110 | # via wemake-python-styleguide 111 | flake8-eradicate==1.3.0 112 | # via wemake-python-styleguide 113 | flake8-isort==4.2.0 114 | # via wemake-python-styleguide 115 | flake8-polyfill==1.0.2 116 | # via 117 | # flake8-bandit 118 | # pep8-naming 119 | flake8-quotes==3.3.1 120 | # via wemake-python-styleguide 121 | flake8-rst-docstrings==0.2.7 122 | # via wemake-python-styleguide 123 | flake8-string-format==0.3.0 124 | # via wemake-python-styleguide 125 | flask==2.2.2 126 | # via -r requirements.in 127 | gitdb==4.0.9 128 | # via gitpython 129 | gitpython==3.1.27 130 | # via bandit 131 | greenlet==2.0.1 132 | # via playwright 133 | gunicorn==20.1.0 134 | # via -r requirements.in 135 | idna==3.3 136 | # via requests 137 | iniconfig==1.1.1 138 | # via pytest 139 | ipykernel==6.15.2 140 | # via 141 | # ipywidgets 142 | # jupyter 143 | # jupyter-console 144 | # notebook 145 | # qtconsole 146 | ipython==8.5.0 147 | # via 148 | # ipykernel 149 | # ipywidgets 150 | # jupyter-console 151 | ipython-genutils==0.2.0 152 | # via 153 | # notebook 154 | # qtconsole 155 | ipywidgets==8.0.2 156 | # via jupyter 157 | isort==5.10.1 158 | # via flake8-isort 159 | itsdangerous==2.1.2 160 | # via flask 161 | jedi==0.18.1 162 | # via ipython 163 | jinja2==3.1.2 164 | # via 165 | # flask 166 | # nbconvert 167 | # notebook 168 | jsonschema==4.16.0 169 | # via nbformat 170 | jupyter==1.0.0 171 | # via -r requirements.in 172 | jupyter-client==7.3.5 173 | # via 174 | # ipykernel 175 | # jupyter-console 176 | # nbclient 177 | # notebook 178 | # qtconsole 179 | jupyter-console==6.4.4 180 | # via jupyter 181 | jupyter-core==4.11.1 182 | # via 183 | # jupyter-client 184 | # nbconvert 185 | # nbformat 186 | # notebook 187 | # qtconsole 188 | jupyterlab-pygments==0.2.2 189 | # via nbconvert 190 | jupyterlab-widgets==3.0.3 191 | # via ipywidgets 192 | kaleido==0.2.1 193 | # via -r requirements.in 194 | loguru==0.6.0 195 | # via -r requirements.in 196 | lxml==4.9.1 197 | # via 198 | # -r requirements.in 199 | # nbconvert 200 | markupsafe==2.1.1 201 | # via 202 | # jinja2 203 | # nbconvert 204 | # werkzeug 205 | matplotlib-inline==0.1.6 206 | # via 207 | # ipykernel 208 | # ipython 209 | mccabe==0.6.1 210 | # via flake8 211 | mistune==2.0.4 212 | # via nbconvert 213 | nbclient==0.6.8 214 | # via nbconvert 215 | nbconvert==7.0.0 216 | # via 217 | # jupyter 218 | # notebook 219 | nbformat==5.4.0 220 | # via 221 | # nbclient 222 | # nbconvert 223 | # notebook 224 | nest-asyncio==1.5.5 225 | # via 226 | # ipykernel 227 | # jupyter-client 228 | # nbclient 229 | # notebook 230 | notebook==6.4.12 231 | # via jupyter 232 | numpy==1.23.3 233 | # via 234 | # pandas 235 | # patsy 236 | # plotly-calplot 237 | # plotly-express 238 | # pyarrow 239 | # scipy 240 | # statsmodels 241 | packaging==21.3 242 | # via 243 | # build 244 | # ipykernel 245 | # nbconvert 246 | # pytest 247 | # qtpy 248 | # statsmodels 249 | pandas==1.4.4 250 | # via 251 | # -r requirements.in 252 | # plotly-calplot 253 | # plotly-express 254 | # statsmodels 255 | pandocfilters==1.5.0 256 | # via nbconvert 257 | parso==0.8.3 258 | # via jedi 259 | patsy==0.5.2 260 | # via 261 | # plotly-express 262 | # statsmodels 263 | pbr==5.10.0 264 | # via stevedore 265 | pep517==0.13.0 266 | # via build 267 | pep8-naming==0.12.1 268 | # via wemake-python-styleguide 269 | pexpect==4.8.0 270 | # via ipython 271 | pickleshare==0.7.5 272 | # via ipython 273 | pip-tools==6.8.0 274 | # via -r 
requirements.in 275 | playwright==1.30.0 276 | # via -r requirements.in 277 | plotly==5.10.0 278 | # via 279 | # plotly-calplot 280 | # plotly-express 281 | plotly-calplot==0.1.12 282 | # via -r requirements.in 283 | plotly-express==0.4.1 284 | # via -r requirements.in 285 | pluggy==1.0.0 286 | # via pytest 287 | prometheus-client==0.14.1 288 | # via notebook 289 | prompt-toolkit==3.0.31 290 | # via 291 | # ipython 292 | # jupyter-console 293 | psutil==5.9.2 294 | # via ipykernel 295 | ptyprocess==0.7.0 296 | # via 297 | # pexpect 298 | # terminado 299 | pure-eval==0.2.2 300 | # via stack-data 301 | py==1.11.0 302 | # via pytest 303 | pyarrow==9.0.0 304 | # via -r requirements.in 305 | pycodestyle==2.8.0 306 | # via 307 | # flake8 308 | # flake8-bandit 309 | # flake8-debugger 310 | pycparser==2.21 311 | # via cffi 312 | pydocstyle==6.1.1 313 | # via flake8-docstrings 314 | pyee==9.0.4 315 | # via playwright 316 | pyflakes==2.4.0 317 | # via flake8 318 | pygments==2.13.0 319 | # via 320 | # flake8-rst-docstrings 321 | # ipython 322 | # jupyter-console 323 | # nbconvert 324 | # qtconsole 325 | # wemake-python-styleguide 326 | pyparsing==3.0.9 327 | # via packaging 328 | pyrsistent==0.18.1 329 | # via jsonschema 330 | pytest==7.1.3 331 | # via -r requirements.in 332 | python-dateutil==2.8.2 333 | # via 334 | # azure-storage-common 335 | # jupyter-client 336 | # pandas 337 | python-dotenv==0.21.0 338 | # via -r requirements.in 339 | pytz==2022.2.1 340 | # via pandas 341 | pyyaml==6.0 342 | # via bandit 343 | pyzmq==23.2.1 344 | # via 345 | # ipykernel 346 | # jupyter-client 347 | # notebook 348 | # qtconsole 349 | qtconsole==5.3.2 350 | # via jupyter 351 | qtpy==2.2.0 352 | # via qtconsole 353 | requests==2.28.1 354 | # via 355 | # -r requirements.in 356 | # azure-storage-common 357 | restructuredtext-lint==1.4.0 358 | # via flake8-rst-docstrings 359 | scipy==1.9.1 360 | # via 361 | # plotly-express 362 | # statsmodels 363 | send2trash==1.8.0 364 | # via notebook 365 | six==1.16.0 366 | # via 367 | # asttokens 368 | # bleach 369 | # patsy 370 | # python-dateutil 371 | smmap==5.0.0 372 | # via gitdb 373 | snowballstemmer==2.2.0 374 | # via pydocstyle 375 | soupsieve==2.3.2.post1 376 | # via beautifulsoup4 377 | stack-data==0.5.0 378 | # via ipython 379 | statsmodels==0.13.2 380 | # via plotly-express 381 | stevedore==4.0.0 382 | # via bandit 383 | tenacity==8.0.1 384 | # via plotly 385 | terminado==0.15.0 386 | # via notebook 387 | tinycss2==1.1.1 388 | # via nbconvert 389 | tomli==2.0.1 390 | # via 391 | # build 392 | # pep517 393 | # pytest 394 | tornado==6.2 395 | # via 396 | # ipykernel 397 | # jupyter-client 398 | # notebook 399 | # terminado 400 | traitlets==5.4.0 401 | # via 402 | # ipykernel 403 | # ipython 404 | # ipywidgets 405 | # jupyter-client 406 | # jupyter-core 407 | # matplotlib-inline 408 | # nbclient 409 | # nbconvert 410 | # nbformat 411 | # notebook 412 | # qtconsole 413 | typing-extensions==4.3.0 414 | # via 415 | # pyee 416 | # wemake-python-styleguide 417 | urllib3==1.26.12 418 | # via requests 419 | wcwidth==0.2.5 420 | # via prompt-toolkit 421 | webencodings==0.5.1 422 | # via 423 | # bleach 424 | # tinycss2 425 | wemake-python-styleguide==0.16.1 426 | # via -r requirements.in 427 | werkzeug==2.2.2 428 | # via flask 429 | wheel==0.37.1 430 | # via pip-tools 431 | widgetsnbextension==4.0.3 432 | # via ipywidgets 433 | xmltodict==0.13.0 434 | # via -r requirements.in 435 | 436 | # The following packages are considered to be unsafe in a requirements file: 437 | 
pip==22.2.2 438 | # via pip-tools 439 | setuptools==65.3.0 440 | # via 441 | # flake8-eradicate 442 | # gunicorn 443 | # pip-tools 444 | -------------------------------------------------------------------------------- /docker/airflow/docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.3.4 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 31 | # 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 33 | # Default: airflow 34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 35 | # Default: airflow 36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 37 | # Default: '' 38 | # 39 | # Feel free to modify this file to suit your needs. 40 | --- 41 | version: '3' 42 | x-airflow-common: 43 | &airflow-common 44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 46 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 47 | image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.3.4} 48 | # build: . 
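  # A minimal sketch of such an extended-image Dockerfile (hypothetical example,
  # not part of this repository; adjust the requirements path to your layout before use):
  #   FROM apache/airflow:2.3.4
  #   COPY requirements.txt /requirements.txt
  #   RUN pip install --no-cache-dir -r /requirements.txt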
49 | environment: 50 | &airflow-common-env 51 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 52 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 53 | # For backward compatibility, with Airflow <2.3 54 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 55 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 56 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 57 | AIRFLOW__CORE__FERNET_KEY: ${AIRFLOW_FERNET_KEY} 58 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 59 | AIRFLOW__CORE__LOAD_EXAMPLES: ${AIRFLOW__CORE__LOAD_EXAMPLES} 60 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth' 61 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 62 | AIRFLOW_CONN_HTTP_DEFAULT: ${AIRFLOW_CONN_HTTP_DEFAULT} 63 | volumes: 64 | - ${AIRFLOW_DAGS_VOLUME}:/opt/airflow/dags 65 | - ${AIRFLOW_LOGS_VOLUME}:/opt/airflow/logs 66 | - ${AIRFLOW_PLUGINS_VOLUME}:/opt/airflow/plugins 67 | user: "${AIRFLOW_UID:-50000}:0" 68 | depends_on: 69 | &airflow-common-depends-on 70 | redis: 71 | condition: service_healthy 72 | postgres: 73 | condition: service_healthy 74 | 75 | services: 76 | postgres: 77 | image: postgres:13 78 | environment: 79 | POSTGRES_USER: airflow 80 | POSTGRES_PASSWORD: airflow 81 | POSTGRES_DB: airflow 82 | volumes: 83 | - postgres-db-volume:/var/lib/postgresql/data 84 | healthcheck: 85 | test: [ "CMD", "pg_isready", "-U", "airflow" ] 86 | interval: 5s 87 | retries: 5 88 | restart: always 89 | 90 | redis: 91 | image: redis:latest 92 | expose: 93 | - 6379 94 | healthcheck: 95 | test: [ "CMD", "redis-cli", "ping" ] 96 | interval: 5s 97 | timeout: 30s 98 | retries: 50 99 | restart: always 100 | 101 | airflow-webserver: 102 | <<: *airflow-common 103 | command: webserver 104 | ports: 105 | - 8080:8080 106 | healthcheck: 107 | test: [ "CMD", "curl", "--fail", "http://localhost:8080/health" ] 108 | interval: 10s 109 | timeout: 10s 110 | retries: 5 111 | restart: always 112 | depends_on: 113 | <<: *airflow-common-depends-on 114 | airflow-init: 115 | condition: service_completed_successfully 116 | volumes: 117 | - ${AIRFLOW_WEBSERVER_VOLUME}:/opt/airflow 118 | 119 | airflow-scheduler: 120 | <<: *airflow-common 121 | command: scheduler 122 | healthcheck: 123 | test: [ "CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"' ] 124 | interval: 10s 125 | timeout: 10s 126 | retries: 5 127 | restart: always 128 | depends_on: 129 | <<: *airflow-common-depends-on 130 | airflow-init: 131 | condition: service_completed_successfully 132 | 133 | airflow-worker: 134 | <<: *airflow-common 135 | command: celery worker 136 | healthcheck: 137 | test: 138 | - "CMD-SHELL" 139 | - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 140 | interval: 10s 141 | timeout: 10s 142 | retries: 5 143 | environment: 144 | <<: *airflow-common-env 145 | # Required to handle warm shutdown of the celery workers properly 146 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 147 | DUMB_INIT_SETSID: "0" 148 | restart: always 149 | depends_on: 150 | <<: *airflow-common-depends-on 151 | airflow-init: 152 | condition: service_completed_successfully 153 | 154 | airflow-triggerer: 155 | <<: *airflow-common 156 | command: triggerer 157 | healthcheck: 158 | test: [ "CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"' ] 159 | interval: 10s 160 | timeout: 10s 161 | retries: 5 
162 | restart: always 163 | depends_on: 164 | <<: *airflow-common-depends-on 165 | airflow-init: 166 | condition: service_completed_successfully 167 | 168 | airflow-init: 169 | <<: *airflow-common 170 | entrypoint: /bin/bash 171 | # yamllint disable rule:line-length 172 | command: 173 | - -c 174 | - | 175 | function ver() { 176 | printf "%04d%04d%04d%04d" $${1//./ } 177 | } 178 | airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version) 179 | airflow_version_comparable=$$(ver $${airflow_version}) 180 | min_airflow_version=2.2.0 181 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 182 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 183 | echo 184 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 185 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 186 | echo 187 | exit 1 188 | fi 189 | if [[ -z "${AIRFLOW_UID}" ]]; then 190 | echo 191 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 192 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 193 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 194 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 195 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user" 196 | echo 197 | fi 198 | one_meg=1048576 199 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 200 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 201 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 202 | warning_resources="false" 203 | if (( mem_available < 4000 )) ; then 204 | echo 205 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 206 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 207 | echo 208 | warning_resources="true" 209 | fi 210 | if (( cpus_available < 2 )); then 211 | echo 212 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 213 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 214 | echo 215 | warning_resources="true" 216 | fi 217 | if (( disk_available < one_meg * 10 )); then 218 | echo 219 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 220 | echo "At least 10 GBs recommended. 
You have $$(numfmt --to iec $$((disk_available * 1024 )))" 221 | echo 222 | warning_resources="true" 223 | fi 224 | if [[ $${warning_resources} == "true" ]]; then 225 | echo 226 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 227 | echo "Please follow the instructions to increase amount of resources available:" 228 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin" 229 | echo 230 | fi 231 | mkdir -p /sources/logs /sources/dags /sources/plugins 232 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 233 | exec /entrypoint airflow version 234 | # yamllint enable rule:line-length 235 | environment: 236 | <<: *airflow-common-env 237 | _AIRFLOW_DB_UPGRADE: 'true' 238 | _AIRFLOW_WWW_USER_CREATE: 'true' 239 | _AIRFLOW_WWW_USER_USERNAME: ${AIRFLOW_USERNAME:-airflow} 240 | _AIRFLOW_WWW_USER_PASSWORD: ${AIRFLOW_PASSWORD:-airflow} 241 | _PIP_ADDITIONAL_REQUIREMENTS: '' 242 | user: "0:0" 243 | volumes: 244 | - .:/sources 245 | 246 | airflow-cli: 247 | <<: *airflow-common 248 | profiles: 249 | - debug 250 | environment: 251 | <<: *airflow-common-env 252 | CONNECTION_CHECK_MAX_COUNT: "0" 253 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 254 | command: 255 | - bash 256 | - -c 257 | - airflow 258 | 259 | # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up 260 | # or by explicitly targeted on the command line e.g. docker-compose up flower. 261 | # See: https://docs.docker.com/compose/profiles/ 262 | flower: 263 | <<: *airflow-common 264 | command: celery flower 265 | profiles: 266 | - flower 267 | ports: 268 | - 5555:5555 269 | healthcheck: 270 | test: [ "CMD", "curl", "--fail", "http://localhost:5555/" ] 271 | interval: 10s 272 | timeout: 10s 273 | retries: 5 274 | restart: always 275 | depends_on: 276 | <<: *airflow-common-depends-on 277 | airflow-init: 278 | condition: service_completed_successfully 279 | 280 | volumes: 281 | postgres-db-volume: -------------------------------------------------------------------------------- /python/simplescraper/explore/explore_dwh_location.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "outputs": [], 7 | "source": [ 8 | "import duckdb\n", 9 | "import pandas as pd\n", 10 | "import plotly.express as px\n", 11 | "from plotly_calplot import calplot\n", 12 | "\n", 13 | "from common.env_variables import DUCKDB_DWH_FILE" 14 | ], 15 | "metadata": { 16 | "collapsed": false, 17 | "pycharm": { 18 | "name": "#%%\n" 19 | } 20 | } 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "outputs": [], 26 | "source": [ 27 | "def display_df(df):\n", 28 | " with pd.option_context('display.max_rows', None, 'display.max_columns', None, \"expand_frame_repr\", False, \"display.float_format\", '${:,.2f}'.format):\n", 29 | " display(df.fillna('.'))" 30 | ], 31 | "metadata": { 32 | "collapsed": false, 33 | "pycharm": { 34 | "name": "#%%\n" 35 | } 36 | } 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "outputs": [], 42 | "source": [ 43 | "conn = duckdb.connect(DUCKDB_DWH_FILE, read_only=True)" 44 | ], 45 | "metadata": { 46 | "collapsed": false, 47 | "pycharm": { 48 | "name": "#%%\n" 49 | } 50 | } 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": " location 
job_count\n0 Berlin 98461\n1 Hamburg 88763\n2 München 85657\n3 Frankfurt am Main 55276\n4 Stuttgart 44858\n5 Köln 44203\n6 Düsseldorf 42574\n7 Hannover 21827\n8 Nürnberg 18802\n9 Leipzig 16443\n10 Essen 15877\n11 Bremen 14867\n12 Karlsruhe 14226\n13 Mannheim 12509\n14 Dortmund 12068\n15 Bonn 11815\n16 Dresden 11510\n17 Münster 9021\n18 Wiesbaden 7999\n19 Ulm 7942\n20 Bielefeld 6671\n21 Mainz 6622\n22 Augsburg 6620\n23 Heidelberg 6426\n24 Kiel 6137\n25 Duisburg 6050\n26 bundesweit 5759\n27 Regensburg 5731\n28 Darmstadt 5592\n29 Braunschweig 5492\n30 Aachen 5183\n31 Neckarsulm 5086\n32 Bochum 4981\n33 Erfurt 4973\n34 Ingolstadt 4836\n35 Kassel 4659\n36 Wolfsburg 4471\n37 Würzburg 4439\n38 Freiburg 4310\n39 Lübeck 4276\n40 Kreisfreie Stadt 4098\n41 Gütersloh 4083\n42 Home-Office 3971\n43 Osnabrück 3851\n44 Magdeburg 3825\n45 Rostock 3763\n46 Heilbronn 3728\n47 Potsdam 3670\n48 Koblenz 3550\n49 Wuppertal 3415\n50 Freiburg im Breisgau 3394\n51 Reutlingen 3381\n52 Krefeld 3370\n53 Jena 3363\n54 Sindelfingen 3260\n55 Chemnitz 3234\n56 Mönchengladbach 3130\n57 Saarbrücken 3034\n58 Ludwigsburg 2982\n59 Oldenburg 2739\n60 Neuss 2739\n61 Erlangen 2553\n62 Pforzheim 2552\n63 Göttingen 2536\n64 Ratingen 2489\n65 Paderborn 2460\n66 deutschlandweit 2376\n67 Tübingen 2363\n68 Norderstedt 2317\n69 Leverkusen 2244\n70 Eschborn 2189\n71 Main 2172\n72 Homeoffice 2159\n73 Oberkochen 2140\n74 Ludwigshafen 2097\n75 Oberhausen 2082\n76 Böblingen 2075\n77 Leinfelden-Echterdingen 2037\n78 Bayreuth 1997\n79 Offenburg 1967\n80 Halle (Saale) 1949\n81 Hanau 1851\n82 Minden 1782\n83 Kaiserslautern 1759\n84 Fulda 1680\n85 Fürth 1678\n86 Gelsenkirchen 1669\n87 Baden-Baden 1655\n88 Bamberg 1654\n89 Hildesheim 1627\n90 Munich 1618\n91 Gießen 1611\n92 Landshut 1604\n93 Konstanz 1602\n94 Friedrichshafen 1588\n95 Hagen 1588\n96 Baden-Württemberg 1557\n97 Neu-Isenburg 1553\n98 Flensburg 1493\n99 Trier 1483", 59 | "text/html": "
[HTML rendering of the same dataframe as the text/plain output above: the top 100 locations by job_count, Berlin (98461) through Trier (1483)]
" 60 | }, 61 | "metadata": {}, 62 | "output_type": "display_data" 63 | } 64 | ], 65 | "source": [ 66 | "df = conn.execute(f'''\n", 67 | "SELECT * FROM location\n", 68 | "LIMIT 100\n", 69 | "''').df()\n", 70 | "display_df(df)" 71 | ], 72 | "metadata": { 73 | "collapsed": false, 74 | "pycharm": { 75 | "name": "#%%\n" 76 | } 77 | } 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "outputs": [], 83 | "source": [ 84 | "conn.close()" 85 | ], 86 | "metadata": { 87 | "collapsed": false, 88 | "pycharm": { 89 | "name": "#%%\n" 90 | } 91 | } 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 2 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython2", 110 | "version": "2.7.6" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 0 115 | } --------------------------------------------------------------------------------