├── .dockerignore ├── .env.template ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation.md │ └── feature_request.md ├── pull_request_template.md └── workflows │ ├── dockerimage.yml │ └── python.yml ├── .gitignore ├── Dockerfile ├── Dockerfile.test ├── Makefile ├── README.md ├── airflow.cfg ├── contrib ├── README.md ├── data │ ├── corporate-attendees-2018.csv │ ├── corporate-attendees-2019.csv │ ├── corporate-attendees-2020.csv │ ├── individual-attendees-2018.csv │ ├── individual-attendees-2019.csv │ ├── individual-attendees-2020.csv │ ├── reserved-attendees-2018.csv │ ├── reserved-attendees-2019.csv │ └── reserved-attendees-2020.csv ├── kktix_bq_etl.sh ├── survey_cake │ ├── udfs │ │ └── survey_cake_csv_uploader.py │ └── upload-survey-cake-csv-to-bigquery.py ├── upload-kktix-ticket-csv-to-bigquery.py └── upload-kktix-ticket-csv-to-bigquery.sh ├── dags ├── airflow-log-cleanup.py ├── airlfow-db-cleanup.py ├── app │ ├── channel_reminder │ │ ├── __init__.py │ │ ├── dag.py │ │ └── udf.py │ ├── discord.py │ ├── finance_bot │ │ ├── __init__.py │ │ ├── dag.py │ │ └── udf.py │ ├── proposal_reminder │ │ ├── __init__.py │ │ ├── dag.py │ │ └── udf.py │ ├── team_registration_bot │ │ ├── __init__.py │ │ ├── dag.py │ │ └── udf.py │ └── twitter_post_notification_bot │ │ ├── dag.py │ │ └── udf.py ├── dwd │ └── __init__.py ├── dws │ └── __init__.py ├── fixtures │ └── data_questionnaire.csv ├── ods │ ├── fb_post_insights │ │ ├── dag.py │ │ └── udfs.py │ ├── google_search_console │ │ ├── dag.py │ │ └── udfs │ │ │ └── google_search.py │ ├── ig_post_insights │ │ ├── dags.py │ │ └── udfs.py │ ├── kktix_ticket_orders │ │ ├── kktix_dag.py │ │ ├── kktix_refund_dag.py │ │ ├── klaviyo_backfill_dag.py │ │ ├── sqls │ │ │ └── create_table.sql │ │ └── udfs │ │ │ ├── batch_kktix2mailer.py │ │ │ ├── bigquery_loader.py │ │ │ ├── gather_town_loader.py │ │ │ ├── kktix_api.py │ │ │ ├── kktix_bq_dwd_etl.py │ │ │ ├── kktix_loader.py │ │ │ ├── kktix_refund.py │ │ │ ├── kktix_transformer.py │ │ │ ├── klaviyo_loader.py │ │ │ └── klaviyo_mailer.py │ ├── linkedin_post_insights │ │ ├── dags.py │ │ └── udfs.py │ ├── opening_crawler │ │ ├── dags │ │ │ └── cakeresume_crawler.py │ │ └── udfs │ │ │ └── crawlers.py │ ├── survey_cake │ │ ├── dags │ │ │ └── questionnaire_2_bigquery.py │ │ └── udfs │ │ │ └── survey_cake_csv_uploader.py │ ├── twitter_post_insights │ │ ├── dags.py │ │ └── udfs.py │ └── youtube │ │ ├── dags │ │ └── dag.py │ │ ├── sqls │ │ └── create_table.sql │ │ └── udfs │ │ └── youtube_api.py └── utils │ └── hook_related.py ├── docker-compose-dev.yml ├── docker-compose.yml ├── docs ├── CONTRIBUTING.md ├── DEPLOYMENT.md ├── MAINTENANCE.md ├── airflow.png ├── kktix.png └── youtube-connection.png ├── entrypoint.sh ├── pyproject.toml ├── setup.cfg ├── tests ├── __init__.py ├── conftest.py ├── data_questionnaire.csv ├── kktix_ticket_orders │ ├── test_klaviyo_loader.py │ └── test_transformer.py ├── test_cakeresume_uploader.py └── test_crawler.py └── uv.lock /.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .mypy_cache 3 | .pytest_cache 4 | .vscode 5 | bower_components 6 | venv 7 | node_modules 8 | .git 9 | service-account.json -------------------------------------------------------------------------------- /.env.template: -------------------------------------------------------------------------------- 1 | AIRFLOW_HOME=/opt/airflow 2 | BIGQUERY_PROJECT=pycontw-225217 3 | GOOGLE_APPLICATION_CREDENTIALS=/opt/airflow/service-account.json 4 | 
AIRFLOW__CORE__FERNET_KEY=paste-your-fernet-key-here -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 🛠 Bug report 3 | about: Create a report to help us improve 4 | title: "[Bug Report] Good bug title tells us about precise symptom, not about the root cause." 5 | labels: "bug" 6 | assignees: "" 7 | --- 8 | 9 | ## Description 10 | 11 | 12 | ## {{ cookiecutter.project_name }} version 13 | 14 | 15 | ## Steps to Reproduce 16 | 23 | 24 | ## Expected Behavior 25 | 31 | 32 | ## Actual Behavior 33 | 34 | 35 | ## More Information 36 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 📖 Documentation 3 | about: Suggest an improvement for the documentation of this project 4 | title: "[Documentation] Content to be added or fixed" 5 | labels: "documentation" 6 | assignees: "" 7 | --- 8 | 9 | ## Type 10 | 11 | * [ ] Content inaccurate 12 | * [ ] Content missing 13 | * [ ] Typo 14 | 15 | ## URL 16 | 17 | 18 | ## Description 19 | 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 🚀 Feature request 3 | about: Suggest an idea for this project 4 | title: "[Feature Request] " 5 | labels: "" 6 | assignees: "" 7 | --- 8 | 9 | ## Description 10 | 11 | 12 | ## Possible Solution 13 | 14 | 15 | ## Additional context 16 | 17 | 18 | ## Related Issue 19 | 20 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Types of changes 4 | 5 | 6 | - **Bugfix** 7 | - **New feature** 8 | - **Refactoring** 9 | - **Breaking change** (any change that would cause existing functionality to not work as expected) 10 | - **Documentation Update** 11 | - **Other (please describe)** 12 | 13 | ## Description 14 | 15 | 16 | ## Checklist 17 | 18 | - [ ] Add test cases to all the changes you introduce 19 | - [ ] Run `make lint` and `make test` locally to ensure all linter checks and testing pass 20 | - [ ] Update the documentation if necessary 21 | 22 | ## Steps to Test This Pull Request 23 | 29 | 30 | ## Expected behavior 31 | 32 | 33 | ## Related Issue 34 | 35 | 36 | ## Additional context 37 | 38 | -------------------------------------------------------------------------------- /.github/workflows/dockerimage.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | on: 3 | push: 4 | branches: [ master, prod ] 5 | pull_request: 6 | branches: [ master, prod ] 7 | env: 8 | RC_NAME: asia-east1-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/data-team/pycon-etl 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Authenticate to Google Cloud 15 | uses: google-github-actions/auth@v1 16 | with: 17 | credentials_json: ${{ secrets.GCP_SERVICE_ACCOUNT_KEY }} 18 | - name: Configure docker to use gcloud command-line tool as a credential helper 19 | run: | 20 | gcloud auth configure-docker asia-east1-docker.pkg.dev 21 | - name: Pull cache 22 | run: | 23 | docker pull 
${RC_NAME}:cache || true 24 | - name: Build the Docker image 25 | run: | 26 | docker build -t ${RC_NAME}:cache --cache-from ${RC_NAME}:cache . 27 | docker build -t ${RC_NAME}:test --cache-from ${RC_NAME}:cache -f Dockerfile.test . 28 | - name: Run test 29 | run: | 30 | docker run -d --rm -p 8080:8080 --name airflow -v $(pwd)/dags:/opt/airflow/dags -v $(pwd)/fixtures:/opt/airflow/fixtures ${RC_NAME}:test webserver 31 | sleep 10 32 | - name: Push cache to Google Container Registry 33 | if: success() 34 | run: | 35 | docker push ${RC_NAME}:cache 36 | - name: Push staging to Google Container Registry 37 | if: github.ref == 'refs/heads/master' && success() 38 | run: | 39 | docker tag ${RC_NAME}:cache ${RC_NAME}:staging 40 | docker push ${RC_NAME}:staging 41 | - name: Push prod version to Google Container Registry 42 | if: github.ref == 'refs/heads/prod' && success() 43 | run: | 44 | docker tag ${RC_NAME}:cache ${RC_NAME}:latest 45 | docker push ${RC_NAME}:latest 46 | -------------------------------------------------------------------------------- /.github/workflows/python.yml: -------------------------------------------------------------------------------- 1 | name: Python CI 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: [master] 8 | env: 9 | AIRFLOW_TEST_MODE: true 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | timeout-minutes: 10 14 | steps: 15 | - name: Check out 16 | uses: actions/checkout@v4 17 | with: 18 | fetch-depth: 0 19 | 20 | - name: Install the latest version of uv 21 | uses: astral-sh/setup-uv@v5 22 | with: 23 | enable-cache: true 24 | version: "latest" 25 | 26 | - name: Install dependencies 27 | run: | 28 | uv sync --group dev 29 | 30 | - name: Run linters 31 | run: make lint 32 | 33 | - name: Run test 34 | run: make test 35 | 36 | - name: Coverage 37 | run: make coverage 38 | 39 | # CD part 40 | # - name: Push dags to GCS 41 | # not implemented yet 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # project stuff 2 | .env.production 3 | .env.staging 4 | env 5 | env.sh 6 | client_secret_google_search_console* 7 | *.csv 8 | !dags/fixtures/*.csv 9 | service-account.json 10 | PyConTW2019/ 11 | PyConTW2020-CCIP-DB-dump/ 12 | dags_data-venue-booth-checking-in_PyConTW2019-20200906T164504Z-001.zip 13 | dags_data-venue-booth-checking-in_PyConTW2020-CCIP-DB-dump.zip 14 | .env 15 | PyConTW-ab56b4c31ba4-bigquery-data-strat-owned-by-tai.json 16 | PyConTW-ab56b4c31ba4-bigquery-data-strat-owned-by-tai.json.gz 17 | 18 | # npm stuff 19 | node_modules/ 20 | 21 | # ipython notebooks 22 | .ipynb_checkpoints/ 23 | 24 | # mypy stuff 25 | .mypy_cache/ 26 | 27 | # vscode stuff 28 | .vscode/ 29 | 30 | # virtualenv 31 | venv 32 | 33 | # System stuff 34 | [Oo]bj 35 | [Bb]in 36 | [Tt]emp 37 | *.user 38 | *.suo 39 | *.[Cc]ache 40 | *.bak 41 | *.log 42 | *.DS_Store 43 | [Tt]est[Rr]esult* 44 | [Tt]humbs.db 45 | _ReSharper.* 46 | *.resharper 47 | Ankh.NoLoad 48 | 49 | # Byte-compiled / optimized / DLL files 50 | __pycache__/ 51 | *.py[cod] 52 | 53 | # C extensions 54 | *.so 55 | 56 | # Distribution / packaging 57 | .Python 58 | env/ 59 | build/ 60 | develop-eggs/ 61 | dist/ 62 | downloads/ 63 | eggs/ 64 | .eggs/ 65 | lib/ 66 | lib64/ 67 | parts/ 68 | sdist/ 69 | var/ 70 | *.egg-info/ 71 | .installed.cfg 72 | *.egg 73 | 74 | # PyInstaller 75 | # Usually these files are written by a python script from a template 76 | # before PyInstaller 
builds the exe, so as to inject date/other infos into it. 77 | *.manifest 78 | *.spec 79 | 80 | # Installer logs 81 | pip-log.txt 82 | pip-delete-this-directory.txt 83 | 84 | # Unit test / coverage reports 85 | htmlcov/ 86 | .tox/ 87 | .coverage 88 | .coverage.* 89 | .cache 90 | nosetests.xml 91 | coverage.xml 92 | *,cover 93 | 94 | # Translations 95 | *.mo 96 | *.pot 97 | 98 | # Django stuff: 99 | *.log 100 | 101 | # Sphinx documentation 102 | docs/_build/ 103 | 104 | # PyBuilder 105 | target/ 106 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG AIRFLOW_VERSION=1.10.15 2 | ARG PYTHON_VERSION=3.8 3 | ARG PLATFORM=linux/amd64 4 | 5 | FROM --platform=${PLATFORM} ghcr.io/astral-sh/uv:python${PYTHON_VERSION}-bookworm-slim AS builder 6 | 7 | ENV UV_COMPILE_BYTECODE=1 \ 8 | UV_LINK_MODE=copy \ 9 | UV_PYTHON_DOWNLOADS=0 10 | 11 | RUN apt-get update && \ 12 | apt-get install -y gcc libc6-dev --no-install-recommends 13 | 14 | WORKDIR /app 15 | 16 | COPY ./pyproject.toml . 17 | COPY ./uv.lock . 18 | 19 | RUN --mount=type=cache,target=/root/.cache/uv \ 20 | --mount=type=bind,source=uv.lock,target=uv.lock \ 21 | --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ 22 | uv sync --frozen --no-install-project --no-dev 23 | 24 | RUN --mount=type=cache,target=/root/.cache/uv \ 25 | uv sync --frozen --no-install-project --no-dev 26 | 27 | 28 | FROM --platform=${PLATFORM} apache/airflow:${AIRFLOW_VERSION}-python${PYTHON_VERSION} 29 | 30 | USER root 31 | 32 | RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 467B942D3A79BD29 \ 33 | && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys B7B3B788A8D3785C \ 34 | && apt-get update \ 35 | && apt-get install -y --no-install-recommends git \ 36 | # 1. if you don't need postgres, remember to remove postgresql-dev and sqlalchemy 37 | # 2. libglib2.0-0 libsm6 libxext6 libxrender-dev libgl1-mesa-dev are required by opencv 38 | # 3. git is required by pip install git+https 39 | && apt-get clean \ 40 | && rm -rf /var/lib/apt/lists/* 41 | 42 | COPY entrypoint.sh /entrypoint.sh 43 | 44 | RUN chmod +x /entrypoint.sh 45 | 46 | USER airflow 47 | 48 | COPY --from=builder --chown=airflow:airflow /app /app 49 | ENV PATH="/app/.venv/bin:$PATH" 50 | 51 | COPY airflow.cfg ${AIRFLOW_HOME}/airflow.cfg 52 | COPY --chown=airflow:root dags ${AIRFLOW_HOME}/dags 53 | 54 | ENTRYPOINT ["/entrypoint.sh"] 55 | -------------------------------------------------------------------------------- /Dockerfile.test: -------------------------------------------------------------------------------- 1 | ARG AIRFLOW_VERSION=1.10.15 2 | ARG PYTHON_VERSION=3.8 3 | ARG PLATFORM=linux/amd64 4 | 5 | FROM --platform=${PLATFORM} ghcr.io/astral-sh/uv:python${PYTHON_VERSION}-bookworm-slim AS builder 6 | 7 | ENV UV_COMPILE_BYTECODE=1 \ 8 | UV_LINK_MODE=copy \ 9 | UV_PYTHON_DOWNLOADS=0 10 | 11 | RUN apt-get update && \ 12 | apt-get install -y gcc libc6-dev --no-install-recommends 13 | 14 | WORKDIR /app 15 | 16 | COPY ./pyproject.toml . 17 | COPY ./uv.lock . 
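# Unlike the production Dockerfile, the two `uv sync` runs below include the `dev`
# dependency group needed by the test image: the first installs only the locked
# dependencies (--no-install-project), the second installs the project itself on top.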
18 | 19 | RUN --mount=type=cache,target=/root/.cache/uv \ 20 | --mount=type=bind,source=uv.lock,target=uv.lock \ 21 | --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ 22 | uv sync --frozen --no-install-project --group dev 23 | 24 | RUN --mount=type=cache,target=/root/.cache/uv \ 25 | uv sync --frozen --group dev 26 | 27 | FROM --platform=${PLATFORM} apache/airflow:${AIRFLOW_VERSION}-python${PYTHON_VERSION} 28 | 29 | USER root 30 | 31 | RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 467B942D3A79BD29 \ 32 | && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys B7B3B788A8D3785C \ 33 | && apt-get update \ 34 | && apt-get install -y --no-install-recommends git \ 35 | # 1. if you don't need postgres, remember to remove postgresql-dev and sqlalchemy 36 | # 2. libglib2.0-0 libsm6 libxext6 libxrender-dev libgl1-mesa-dev are required by opencv 37 | # 3. git is required by pip install git+https 38 | && apt-get clean \ 39 | && rm -rf /var/lib/apt/lists/* 40 | 41 | COPY entrypoint.sh /entrypoint.sh 42 | 43 | RUN chmod +x /entrypoint.sh 44 | 45 | USER airflow 46 | 47 | COPY --from=builder --chown=airflow:airflow /app /app 48 | ENV PATH="/app/.venv/bin:$PATH" 49 | 50 | COPY airflow.cfg ${AIRFLOW_HOME}/airflow.cfg 51 | COPY --chown=airflow:root dags ${AIRFLOW_HOME}/dags 52 | 53 | ENV AIRFLOW_TEST_MODE=True 54 | 55 | ENTRYPOINT ["/entrypoint.sh"] 56 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | lint: 2 | uv run ruff check . 3 | uv run ruff format . 4 | uv run mypy dags/ tests/ 5 | 6 | format: 7 | uv run ruff check . --fix 8 | uv run ruff format . 9 | 10 | test: 11 | PYTHONPATH=./dags uv run pytest 12 | 13 | coverage: 14 | PYTHONPATH=./dags uv run pytest --cov=dags tests 15 | 16 | build-dev: 17 | docker-compose -f ./docker-compose-dev.yml build 18 | 19 | deploy-dev: 20 | docker-compose -f ./docker-compose-dev.yml up -d 21 | 22 | down-dev: 23 | docker-compose -f ./docker-compose-dev.yml down 24 | 25 | deploy-prod: 26 | docker-compose -f ./docker-compose.yml up -d 27 | 28 | down-prod: 29 | docker-compose -f ./docker-compose.yml down 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyConTW ETL 2 | 3 | ![Python CI](https://github.com/pycontw/PyCon-ETL/workflows/Python%20CI/badge.svg) 4 | ![Docker Image CI](https://github.com/pycontw/PyCon-ETL/workflows/Docker%20Image%20CI/badge.svg) 5 | 6 | Using Airflow to implement our ETL pipelines. 7 | 8 | ## Table of Contents 9 | 10 | - [Prerequisites](#prerequisites) 11 | - [Installation](#installation) 12 | - [Configuration](#configuration) 13 | - [BigQuery (Optional)](#bigquery-optional) 14 | - [Running the Project](#running-the-project) 15 | - [Local Environment with Docker](#local-environment-with-docker) 16 | - [Production](#production) 17 | - [Contact](#contact) 18 | 19 | ## Prerequisites 20 | 21 | - [Python 3.8+](https://www.python.org/downloads/release/python-3811/) 22 | - [Docker](https://docs.docker.com/get-docker/) 23 | - [Git](https://git-scm.com/book/zh-tw/v2/%E9%96%8B%E5%A7%8B-Git-%E5%AE%89%E8%A3%9D%E6%95%99%E5%AD%B8) 24 | - [uv] 25 | 26 | ## Installation 27 | 28 | We use [uv] to manage dependencies and virtual environment. 29 | 30 | Below are the steps to create a virtual environment using [uv]: 31 | 32 | 1. 
Create a Virtual Environment with Dependencies Installed 33 | 34 | To create a virtual environment, run the following command: 35 | 36 | ```bash 37 | uv sync 38 | ``` 39 | 40 | By default, [uv] sets up the virtual environment in `.venv` 41 | 42 | 2. Activate the Virtual Environment 43 | 44 | After creating the virtual environment, activate it using the following command: 45 | 46 | ```bash 47 | source .venv/bin/activate 48 | ``` 49 | 50 | 3. Deactivate the Virtual Environment 51 | 52 | When you're done working in the virtual environment, you can deactivate it with: 53 | 54 | ```bash 55 | deactivate 56 | ``` 57 | 58 | ## Configuration 59 | 60 | 1. For development or testing, run `cp .env.template .env.staging`. For production, run `cp .env.template .env.production`. 61 | 62 | 2. Follow the instructions in `.env.` and fill in your secrets. 63 | If you are running the staging instance for development as a sandbox and do not need to access any specific third-party services, leaving `.env.staging` as-is should be fine. 64 | 65 | > Contact the maintainer if you don't have these secrets. 66 | 67 | > **⚠ WARNING: About .env** 68 | > Please do not use the .env file for local development, as it might affect the production tables. 69 | 70 | ### BigQuery (Optional) 71 | 72 | Set up the Authentication for GCP: 73 | 74 | - After running `gcloud auth application-default login`, you will get a credentials.json file located at `$HOME/.config/gcloud/application_default_credentials.json`. Run `export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"` if you have it. 75 | - service-account.json: Please contact @david30907d via email or Discord. You do not need this json file if you are running the sandbox staging instance for development. 76 | 77 | ## Running the Project 78 | 79 | If you are a developer 👨‍💻, please check the [Contributing Guide](./docs/CONTRIBUTING.md). 80 | 81 | If you are a maintainer 👨‍🔧, please check the [Maintenance Guide](./docs/MAINTENANCE.md). 82 | 83 | ### Local Environment with Docker 84 | 85 | For development/testing: 86 | 87 | ```bash 88 | # Build the local dev/test image 89 | make build-dev 90 | 91 | # Start dev/test services 92 | make deploy-dev 93 | 94 | # Stop dev/test services 95 | make down-dev 96 | ``` 97 | 98 | > The difference between production and dev/test compose files is that the dev/test compose file uses a locally built image, while the production compose file uses the image from Docker Hub. 99 | 100 | If you are an authorized maintainer, you can pull the image from the GCP Artifact Registry. 101 | 102 | Docker client must be configured to use the GCP Artifact Registry. 103 | 104 | ```bash 105 | gcloud auth configure-docker asia-east1-docker.pkg.dev 106 | ``` 107 | 108 | Then, pull the image: 109 | 110 | ```bash 111 | docker pull asia-east1-docker.pkg.dev/pycontw-225217/data-team/pycon-etl:{tag} 112 | ``` 113 | 114 | There are several tags available: 115 | 116 | - `cache`: cache the image for faster deployment 117 | - `test`: for testing purposes, including the test dependencies 118 | - `staging`: when pushing to the staging environment 119 | - `latest`: when pushing to the production environment 120 | 121 | ### Production 122 | 123 | Please check the [Production Deployment Guide](./docs/DEPLOYMENT.md). 
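
For reference, below is a minimal sketch of a production rollout using the `make` targets shipped in this repository. It assumes `.env.production` and `service-account.json` are already in place and that `docker-compose.yml` points at the image tag you intend to run; the Production Deployment Guide linked above remains the authoritative reference.

```bash
# Authenticate Docker against the GCP Artifact Registry (authorized maintainers only)
gcloud auth configure-docker asia-east1-docker.pkg.dev

# Pull the production image
docker pull asia-east1-docker.pkg.dev/pycontw-225217/data-team/pycon-etl:latest

# Start the production services defined in docker-compose.yml
make deploy-prod

# Stop them again when needed
make down-prod
```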
124 | 125 | ## Contact 126 | 127 | [PyCon TW Volunteer Data Team - Discord](https://discord.com/channels/752904426057892052/900721883383758879) 128 | 129 | [uv]: https://docs.astral.sh/uv/ 130 | -------------------------------------------------------------------------------- /airflow.cfg: -------------------------------------------------------------------------------- 1 | [core] 2 | # The folder where your airflow pipelines live, most likely a 3 | # subfolder in a code repository. This path must be absolute. 4 | dags_folder = /opt/airflow/dags 5 | 6 | # The folder where airflow should store its log files 7 | # This path must be absolute 8 | base_log_folder = /opt/airflow/logs 9 | 10 | # Log format for when Colored logs is enabled 11 | colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {{%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d}} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s 12 | 13 | # Format of Log line 14 | log_format = [%%(asctime)s] {{%%(filename)s:%%(lineno)d}} %%(levelname)s - %%(message)s 15 | 16 | dag_processor_manager_log_location = /opt/airflow/logs/dag_processor_manager/dag_processor_manager.log 17 | 18 | # The SqlAlchemy connection string to the metadata database. 19 | # SqlAlchemy supports many different database engine, more information 20 | # their website 21 | # sql_alchemy_conn = sqlite:////tmp/airflow.db 22 | 23 | # The amount of parallelism as a setting to the executor. This defines 24 | # the max number of task instances that should run simultaneously 25 | # on this airflow installation 26 | parallelism = 256 27 | 28 | # The number of task instances allowed to run concurrently by the scheduler 29 | dag_concurrency = 64 30 | 31 | # Whether to load the examples that ship with Airflow. It's good to 32 | # get started, but you probably want to set this to False in a production 33 | # environment 34 | load_examples = False 35 | 36 | # Where your Airflow plugins are stored 37 | plugins_folder = /opt/airflow/plugins 38 | 39 | # Secret key to save connection passwords in the db 40 | fernet_key = $FERNET_KEY 41 | 42 | # How long before timing out a python file import 43 | dagbag_import_timeout = 600 44 | 45 | # How long before timing out a DagFileProcessor, which processes a dag file 46 | dag_file_processor_timeout = 600 47 | 48 | [api] 49 | # How to authenticate users of the API 50 | auth_backend = airflow.api.auth.backend.default 51 | 52 | 53 | [webserver] 54 | # Number of seconds the webserver waits before killing gunicorn master that doesn't respond 55 | web_server_master_timeout = 600 56 | 57 | # Number of seconds the gunicorn webserver waits before timing out on a worker 58 | web_server_worker_timeout = 600 59 | 60 | # Secret key used to run your flask app 61 | # It should be as random as possible 62 | secret_key = l\xba,\xc3\x023\xca\x04\xdb\xf2\xf7\xfa\xb8#\xee> 63 | 64 | # Number of workers to run the Gunicorn web server 65 | workers = 2 66 | 67 | # Expose the configuration file in the web server 68 | expose_config = True 69 | 70 | # Allow the UI to be rendered in a frame 71 | x_frame_enabled = True 72 | 73 | # Minutes of non-activity before logged out from UI 74 | # 0 means never get forcibly logged out 75 | force_log_out_after = 0 76 | 77 | authenticate = False 78 | auth_backend = airflow.api.auth.backend.default 79 | 80 | 81 | [celery] 82 | # The concurrency that will be used when starting workers with the 83 | # ``airflow celery worker`` command. 
This defines the number of task instances that 84 | # a worker will take, so size up your workers based on the resources on 85 | # your worker box and the nature of your tasks 86 | worker_concurrency = 32 87 | 88 | # The maximum and minimum concurrency that will be used when starting workers with the 89 | # ``airflow celery worker`` command (always keep minimum processes, but grow 90 | # to maximum if necessary). Note the value should be max_concurrency,min_concurrency 91 | # Pick these numbers based on resources on worker box and the nature of the task. 92 | # If autoscale option is available, worker_concurrency will be ignored. 93 | # http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale 94 | # Example: worker_autoscale = 16,12 95 | worker_autoscale = 32,12 96 | 97 | # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally 98 | # a sqlalchemy database. Refer to the Celery documentation for more 99 | # information. 100 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#broker-settings 101 | broker_url = redis://redis:6379/1 102 | 103 | # The Celery result_backend. When a job finishes, it needs to update the 104 | # metadata of the job. Therefore it will post a message on a message bus, 105 | # or insert it into a database (depending of the backend) 106 | # This status is used by the scheduler to update the state of the task 107 | # The use of a database is highly recommended 108 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings 109 | result_backend = db+postgresql://airflow:airflow@postgres/airflow 110 | 111 | [scheduler] 112 | child_process_log_directory = /opt/airflow/logs/scheduler 113 | 114 | 115 | # Format of the log_id, which is used to query for a given tasks logs 116 | log_id_template = {{dag_id}}-{{task_id}}-{{execution_date}}-{{try_number}} 117 | 118 | [kubernetes] 119 | # Keyword parameters to pass while calling a kubernetes client core_v1_api methods 120 | # from Kubernetes Executor provided as a single line formatted JSON dictionary string. 121 | # List of supported params are similar for all core_v1_apis, hence a single config 122 | # variable for all apis. 123 | # See: 124 | # https://raw.githubusercontent.com/kubernetes-client/python/master/kubernetes/client/apis/core_v1_api.py 125 | # Note that if no _request_timeout is specified, the kubernetes client will wait indefinitely 126 | # for kubernetes api responses, which will cause the scheduler to hang. 127 | # The timeout is specified as [connect timeout, read timeout] 128 | kube_client_request_args = {{"_request_timeout" : [60,60] }} 129 | 130 | # Specifies the uid to run the first process of the worker pods containers as 131 | run_as_user = 132 | 133 | # ref: https://airflow.apache.org/docs/apache-airflow/1.10.1/security.html#setting-up-google-authentication 134 | [google] 135 | client_id = 136 | client_secret = 137 | oauth_callback_route = /oauth2callback 138 | domain = localhost,pycon.tw 139 | prompt = select_account 140 | -------------------------------------------------------------------------------- /contrib/README.md: -------------------------------------------------------------------------------- 1 | # Contrib 2 | 3 | ## Upload KKTIX 4 | 5 | ![](../docs/kktix.png) 6 | 7 | 1. Navigate to KKTIX's attendees page 8 | 2. Download the CSV 9 | 3. 
`upload-kktix-ticket-csv-to-bigquery.py -p pycontw-225217 -d ods -t ods_kktix_ticket__attendees --upload` 10 | 11 | ## Survey Cake 12 | 13 | [Demo Video](https://www.loom.com/share/4c494f1d3ce443c6a43ed514c53b70ff) 14 | 1. download CSV from survey cake (account: data-strategy-registration-survey-cake@pycon.tw) 15 | 2. `. ./.env.sh ` 16 | 2. `cd contrib/survey_cake` 17 | 3. `python upload-survey-cake-csv-to-bigquery.py --year=<20xx> -c ` 18 | 1. it would upload data to Bigquery's `test` dataset 19 | 2. If everything looks good, you can `copy` the `fact table` and `dimension table` first 20 | 3. Then run `python upload-survey-cake-csv-to-bigquery.py --year=<20xx> -p`. `-p` stands for `production` 21 | 22 | ## KKTIX BigQuery Transform 23 | 1. Background: Start from 2022, we extract the KKTIX data via KKTIX API and load to "pycontw-225217.ods.ods_kktix_attendeeId_datetime". However most of the data are store in the ATTENDEE_INFO column with json format. To use metabase with SQL, users need to extract the data by json_extract with the knowledge kktix format instead of flat database. And we also need to rewrite all the SQLs build for current databases. 24 | 2. Solution: Transform the tables in backend that we could keep the same user experience by using Metabase. 25 | 3. Run: 26 | - for 3 tables in single bash script: `./kktix_bq_etl.sh 2023` 27 | -------------------------------------------------------------------------------- /contrib/data/corporate-attendees-2018.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,# invoice policy #,Nickname / 暱稱 (Shown on Badge),Invoiced Company Name / 發票抬頭,Unified Business No. / 發票統編,Dietary Habit / 餐點偏好,Need Shuttle Bus Service? 會議兩天的早上 要搭交通車到會場嗎?,Size of T-shirt / T恤尺寸,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Gender / 性別,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 12345678,111111111,23,EEEE,Regular 原價,paid,,37d2639c885c499493608e51797b45c1,2018-04-17,5000,,PyConTW1,PyConTW1 ltd.,11111111,Normal / 一般,"Yes, Wait for the Bus at Nangang Expo Center. 需要, 會在南港展覽館候車",XS / 胸寬(F.W.): 46cm / 衣長(C.L.): 66cm,1 年以內,"AI, Machine Learning, Fintech, Business, Internet of Things, Education, Human Resource, Sustainability, Health & Wellness",PyConTW1 ltd.,資深開發研究員,Taiwan 台灣,Female / 女性,pycontwcontact1,pycontwemail1@pycon.tw,912345678, 3 | 12345679,111111112,52,9999,Regular 原價,paid,,37d2639c885c499493608e51797b45c2,2018-04-16,5000,,PyConTW2,PyConTW2 ltd.,22222222,Normal / 一般,No. 不需要,M / 胸寬(F.W.): 50cm / 衣長(C.L.): 71cm,1 到 5 年,"AI, Machine Learning, Science, Technology, Engineering & Mathematics, Entrepreneurship, Startup",PyConTW2 ltd.,資料工程師,,Male / 男性,pycontwcontact2,pycontwemail2@pycon.tw,912345666, 4 | -------------------------------------------------------------------------------- /contrib/data/corporate-attendees-2019.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,# invoice policy #,Nickname / 暱稱 (Shown on Badge),Invoiced Company Name / 發票抬頭 (Optional),Unified Business No. / 發票統編 (Optional),Dietary Habit / 餐點偏好,Need Shuttle Bus Service? 
會議兩天的早上 要搭交通車到會場嗎?,Size of T-shirt / T恤尺寸,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Gender / 生理性別,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 20000000,120000000,1,DDDD,Regular 原價,paid,,37d2639c885c499493608e51797b45c1,2019-07-31,5500,,PyConTW1,PyConTWLTD1,88888888,Normal / 一般,"Yes, Wait for the Bus at Nangang Expo Center. 需要, 會在南港展覽館候車",M / 胸寬(F.W.): 50cm / 衣長(C.L.): 71cm,1 到 5 年,"AI, Machine Learning, Fintech, Business, DevOps, Security, Web Technology, Science, Technology, Engineering & Mathematics, Entrepreneurship, Startup",PyConTWLTD1,System engineer,Taiwan 台灣,Other / 其它,PyConTW1,pycontwemail1@pycon.tw,921000001, 3 | 20000001,120000001,4,FFFF,Regular 原價,paid,,37d2639c885c499493608e51797b45c2,2019-07-31,5500,,PyConTW2,PyConTWLTD2,88889999,Normal / 一般,No. 不需要,L / 胸寬(F.W.): 53cm / 衣長(C.L.): 73cm,5 到 10 年,"AI, Machine Learning, DevOps, Security, Web Technology, Internet of Things",PyConTWLTD2,Founder/CEO,Taiwan 台灣,Male / 男性,PyConTW2,pycontwemail2@pycon.tw,921000002, 4 | -------------------------------------------------------------------------------- /contrib/data/corporate-attendees-2020.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,# invoice policy #,Nickname / 暱稱 (Shown on Badge),Invoiced Company Name / 發票抬頭 (Optional),Unified Business No. / 發票統編 (Optional),Dietary Habit / 餐點偏好,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Departure from (Regions) / 出發區域,How did you find out PyCon TW? / 如何得知 PyCon TW?,Have you ever attended PyCon TW?/ 是否曾參加 PyCon TW?,Do you know we have Financial Aid this year? 
/ 請問您知道今年有財務補助嗎?,Gender / 生理性別,PyNight 參加意願僅供統計人數,實際是否舉辦需由官方另行公告,PyNight 參加意願,是否願意收到贊助商轉發 Email 訊息,是否願意提供 Email 給贊助商,Privacy Policy of PyCon TW 2020 / PyCon TW 2020 個人資料保護聲明 bit.ly/3eipAut,I've already read and I accept the Privacy Policy of PyCon TW 2020 / 我已閱讀並同意 PyCon TW 2020 個人資料保護聲明,Epidemic Prevention of PyCon TW 2020 / PyCon TW 2020 COVID-19 防疫守則 bit.ly/3fcnhu2,I've already read and I accept the Epidemic Prevention of PyCon TW 2020 / 我已閱讀並同意 PyCon TW 2020 COVID-19 防疫守則,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 20000000,120000000,1,EEEE,Regular 原價,paid,,37d2639c885c499493608e51797b45c1,2020-07-13,5500,,PyConTW1,PyConTWLTD1,88888888,Normal / 一般,1 到 5 年,"DevOps, Security, Web Technology",PyConTWLTD1,工程師,Taiwan 台灣,Northern Taiwan / 北部,,,No,Male / 男性,,Yes,No,No,,Yes,,Yes,PyConTW1,pycontwemail1@pycon.tw,921000001, 3 | 20000001,120000001,2,9A9E,Regular 原價,paid,,37d2639c885c499493608e51797b45c2,2020-07-15,5500,,PyConTW2,PyConTWLTD2,88888889,Normal / 一般,1 到 5 年,"AI, Machine Learning, Internet of Things",PyConTWLTD2,,Taiwan 台灣,Northern Taiwan / 北部,公司報名,Yes,No,Male / 男性,,Yes,No,No,,Yes,,Yes,PyConTW2,pycontwemail2@pycon.tw,886921000002, 4 | -------------------------------------------------------------------------------- /contrib/data/individual-attendees-2018.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,# invoice policy #,Nickname / 暱稱 (Shown on Badge),Dietary Habit / 餐點偏好,Need Shuttle Bus Service? 會議兩天的早上 要搭交通車到會場嗎?,Size of T-shirt / T恤尺寸,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Gender / 性別,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 50000000,110000000,1,EEEE,"EarlyBird, Discount 優惠價",paid,,37d2639c885c499493608e51797b45c1,2018-04-15,2500,,alice,Normal / 一般,"Yes, Wait for the Bus at Nangang Expo Center. 需要, 會在南港展覽館候車",L / 胸寬(F.W.): 53cm / 衣長(C.L.): 73cm,1 到 5 年,"AI, Machine Learning, Science, Technology, Engineering & Mathematics",,,Taiwan 台灣,Male / 男性,alice,alice@pycon.tw,933123456, 3 | 50000001,110000001,2,9999,"EarlyBird, Discount 優惠價",paid,,37d2639c885c499493608e51797b45c2,2018-04-15,2500,,bob,Normal / 一般,No. 不需要,L / 胸寬(F.W.): 53cm / 衣長(C.L.): 73cm,1 到 5 年,"DevOps, Security, Web Technology, Entrepreneurship, Startup",,,,Male / 男性,bob,bob@pycon.tw,922123456, 4 | -------------------------------------------------------------------------------- /contrib/data/individual-attendees-2019.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,Nickname / 暱稱 (Shown on Badge),Dietary Habit / 餐點偏好,Need Shuttle Bus Service? 會議三天的早上 要搭交通車到會場嗎?,Size of T-shirt / T恤尺寸,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Gender / 生理性別,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 30000000,130000000,7,GGGG,Discount 優惠價,paid,,37d2639c885c499493608e51797b45c1,2019-07-31,2600,Alice,Normal / 一般,No. 
不需要,M / 胸寬(F.W.): 50cm / 衣長(C.L.): 71cm,1 到 5 年,"AI, Machine Learning, Web Technology, Internet of Things, Sustainability, Health & Wellness",PyConTW1,Software Engineer,Taiwan 台灣,Male / 男性,Alice,alice@pycon.tw,921123456, 3 | 30000001,130000001,9,HHHH,Discount 優惠價,paid,,37d2639c885c499493608e51797b45c2,2019-07-31,2600,Bob,Normal / 一般,No. 不需要,M / 胸寬(F.W.): 50cm / 衣長(C.L.): 71cm,1 到 5 年,"AI, Machine Learning, Web Technology, Science, Technology, Engineering & Mathematics, Internet of Things",PyConTW2,CTO,Taiwan 台灣,Male / 男性,Bob,bob@pycon.tw,921456123, 4 | -------------------------------------------------------------------------------- /contrib/data/individual-attendees-2020.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,Nickname / 暱稱 (Shown on Badge),Dietary Habit / 餐點偏好,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Departure from (Regions) / 出發區域,Gender / 生理性別,How did you find out PyCon TW? / 如何得知 PyCon TW,Have you ever attended PyCon TW?/ 是否曾參加 PyCon TW? /,Do you know we have Financial Aid this year? / 請問您知道今年有財務補助嗎?,PyNight 參加意願僅供統計人數,實際是否舉辦需由官方另行公告,PyNight 參加意願,是否願意收到贊助商轉發 Email 訊息,是否願意提供 Email 給贊助商,Privacy Policy of PyCon TW 2020 / PyCon TW 2020 個人資料保護聲明 bit.ly/3eipAut,I've already read and I accept the Privacy Policy of PyConTW 2020 / 我已閱讀並同意 PyCon TW 2020 個人資料保護聲明,Epidemic Prevention of PyCon TW 2020 / PyCon TW 2020 COVID-19 防疫守則 bit.ly/3fcnhu2,I've already read and I accept the Epidemic Prevention of PyCon TW 2020 / 我已閱讀並同意 PyCon TW 2020 COVID-19 防疫守則,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 20000000,130000001,1,FFFF,Discount 優惠價,paid,,37d2639c885c499493608e51797b45c1,2020-07-13,2600,PyConTW1,Normal / 一般,1 到 5 年,"AI, Machine Learning, DevOps, Security, Web Technology, Science, Technology, Engineering & Mathematics, Arts, Design, Multimedia, Sustainability, Health & Wellness",PyConTWLTD1,士官長,Taiwan 台灣,,Female / 女性,,,,,Yes,No,No,,,,,PyConTW1,pycontwemail1@pycon.tw,921000003, 3 | 20000001,130000002,2,GGGG,Discount 優惠價,paid,,37d2639c885c499493608e51797b45c2,2020-07-13,2600,PyConTW2,Normal / 一般,,"DevOps, Security, Internet of Things",PyConTWLTD2,Senior Test Engineer,Taiwan 台灣,,Male / 男性,,,,,Yes,No,No,,,,,PyConTW2,pycontwemail2@pycon.tw,886921000004, 4 | -------------------------------------------------------------------------------- /contrib/data/reserved-attendees-2018.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,# invoice policy #,Nickname / 暱稱 (Shown on Badge),Invoiced Company Name / 發票抬頭 (Optional),Unified Business No. / 發票統編 (Optional),Dietary Habit / 餐點偏好,Need Shuttle Bus Service? 會議兩天的早上 要搭交通車到會場嗎?,Size of T-shirt / T恤尺寸,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Gender / 性別,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 57000000,113100000,3,6666,"Contributor 工作人員, 貢獻者",paid,,37d2639c885c499493608e51797b45c1,2018-04-14,1800,,alice,,,Normal / 一般,No. 
不需要,M / 胸寬(F.W.): 50cm / 衣長(C.L.): 71cm,5 到 10 年,"AI, Machine Learning, Entrepreneurship, Startup, Internet of Things",,Student,Taiwan 台灣,Male / 男性,alice,alice@pycon.tw,977123456, 3 | 57000001,113100001,12,8888,Sponsor 贊助夥伴,paid,,37d2639c885c499493608e51797b45c3,2018-04-28,0,,bob,,,Normal / 一般,No. 不需要,M / 胸寬(F.W.): 50cm / 衣長(C.L.): 71cm,1 年以內,"AI, Machine Learning, Web Technology, Science, Technology, Engineering & Mathematics, Education, Human Resource, Arts, Design, Multimedia",,,Taiwan 台灣,Male / 男性,bob,bob@pycon.tw,977456123, 4 | -------------------------------------------------------------------------------- /contrib/data/reserved-attendees-2019.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,# invoice policy #,Nickname / 暱稱 (Shown on Badge),Dietary Habit / 餐點偏好,Need Shuttle Bus Service? 會議三天的早上 要搭交通車到會場嗎?,Size of T-shirt / T恤尺寸,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Gender / 性別,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 66666666,120000000,2,HHHH,Fanatic 鐵粉票,paid,,37d2639c885c499493608e51797b45c1,2019-04-09,1800,,alice,Vegetarian / 素食者,No. 不需要,2XL / 胸寬(F.W.): 58cm / 衣長(C.L.): 78cm,5 到 10 年,"Web Technology, Education, Human Resource, Sustainability, Health & Wellness",pycontw1,Vice-Chair,Japan 日本,Male / 男性,alice,alice@pycon.tw,1234123456, 3 | 66666667,120000001,6,KKKK,Invited 邀請票,paid,,37d2639c885c499493608e51797b45c2,2019-08-03,0,,bob,Normal / 一般,No. 不需要,M / 胸寬(F.W.): 50cm / 衣長(C.L.): 71cm,10 到 20 年,"AI, Machine Learning, DevOps, Security, Science, Technology, Engineering & Mathematics, Internet of Things, Arts, Design, Multimedia, Sustainability, Health & Wellness",pycontw2,Senior Test Engineer,Taiwan 台灣,Male / 男性,bob,bob@pycon.tw,1234987654, 4 | -------------------------------------------------------------------------------- /contrib/data/reserved-attendees-2020.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,Nickname / 暱稱 (Shown on Badge),Dietary Habit / 餐點偏好,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Departure from (Regions) / 出發區域,Gender / 性別,是否願意收到贊助商轉發 Email 訊息,是否願意提供 Email 給贊助商,Privacy Policy of PyCon TW 2020,I've already read and I accept the Privacy Policy of PyConTW 2020 / 我已閱讀並同意 PyCon TW 2020 個人資料保護聲明,Epidemic Prevention of PyCon TW 2020,I've already read and I accept the Epidemic Prevention of PyCon TW 2020 / 我已閱讀並同意 PyCon TW 2020 COVID-19 防疫守則,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 76000001,123421001,1,CCCC,Invited 邀請票,paid,,37d2639c885c499493608e51797b45c1,2020-08-08,0,Alice,Normal / 一般,5 到 10 年,"DevOps, Security, Web Technology, Entrepreneurship, Startup",pycontw1,Student,Japan 日本,Overseas / 海外,Male / 男性,No,No,,Yes,,Yes,Alice,alice@pycon.tw,12345678, 3 | 76000002,123421002,2,BBBB,Contributor 貢獻者票,paid,,37d2639c885c499493608e51797b45c2,2020-08-08,0,Bob,Normal / 一般,1 到 5 年,"DevOps, Security, Web Technology, Entrepreneurship, Startup, Internet of Things, Arts, Design, 
Multimedia",pycontw2,Engineer,Taiwan 台灣,Northern Taiwan / 北部,Male / 男性,Yes,No,,Yes,,Yes,Bob,bob@pycon.tw,87654321, 4 | 76000003,123421003,3,AAAA,Speaker 講者票,paid,,37d2639c885c499493608e51797b45c3,2020-08-08,0,Chris,Normal / 一般,1 到 5 年,"AI, Machine Learning, Web Technology, Science, Technology, Engineering & Mathematics, Internet of Things, Education, Human Resource, Arts, Design, Multimedia, Sustainability, Health & Wellness",pycontw3,Assistant professor,United States 美國,Overseas / 海外,Male / 男性,Yes,No,,Yes,,Yes,Chris,chris@pycon.tw,11223344, 5 | -------------------------------------------------------------------------------- /contrib/kktix_bq_etl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # export GOOGLE_APPLICATION_CREDENTIALS="" 4 | # 5 | project_id="pycontw-225217" 6 | cmd=${PWD}/../dags/ods/kktix_ticket_orders/udfs/kktix_bq_dwd_etl.py 7 | 8 | 9 | for ticket_type in corporate individual reserved 10 | do 11 | suffix=${ticket_type}_attendees$2 12 | cmd_args="-p ${project_id} -d dwd -t kktix_ticket_${suffix} -k ${ticket_type} -y $1 --upload" 13 | echo ${cmd_args} 14 | ${cmd} ${cmd_args} 15 | done 16 | -------------------------------------------------------------------------------- /contrib/survey_cake/udfs/survey_cake_csv_uploader.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from pathlib import Path 4 | from typing import Dict, List 5 | 6 | from google.cloud import bigquery 7 | 8 | 9 | class SurveyCakeCSVUploader: 10 | USELESS_COLUMNS = { 11 | "額滿結束註記", 12 | "使用者紀錄", 13 | "會員時間", 14 | "會員編號", 15 | "自訂ID", 16 | "備註", 17 | } 18 | 19 | def __init__(self, year: int, filename: str): 20 | self._year = year 21 | self.filename = Path(filename) 22 | if not bool(os.getenv("AIRFLOW_TEST_MODE")): 23 | self.client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 24 | self.existing_question_and_id_dict = self._get_existing_question_and_id() 25 | else: 26 | self.existing_question_and_id_dict = {"placeholder": 1} 27 | self.facttable_filepath = ( 28 | self.filename.parent / f"{self.filename.stem}_facttable.csv" 29 | ) 30 | self.dimension_table_filepath = ( 31 | self.filename.parent / f"{self.filename.stem}_dimension.csv" 32 | ) 33 | 34 | @property 35 | def year(self): 36 | return self._year 37 | 38 | @property 39 | def bigquery_project(self): 40 | return os.getenv("BIGQUERY_PROJECT") 41 | 42 | def _get_existing_question_and_id(self): 43 | query = """ 44 | SELECT 45 | question, question_id 46 | FROM 47 | dim.dim_questionnaire_questionId_year; 48 | """ 49 | query_job = self.client.query(query) 50 | return {row["question"]: row["question_id"] for row in query_job} 51 | 52 | def transform(self): 53 | def _export_facttable(header_of_fact_table): 54 | with open(self.facttable_filepath, "w") as target: 55 | writer = csv.writer(target) 56 | writer.writerow(header_of_fact_table) 57 | for row in rows_of_fact_table: 58 | row_with_year = row + (self.year,) 59 | writer.writerow(row_with_year) 60 | 61 | def _export_dimension_table(question_id_dimension_table): 62 | with open(self.dimension_table_filepath, "w") as target: 63 | writer = csv.writer(target) 64 | writer.writerow(("question_id", "question", "year")) 65 | for question_id, question in question_id_dimension_table.items(): 66 | # need to filter out existing question_id, otherwise we would end up having duplicate question_id in BigQuery 67 | if question not in self.existing_question_and_id_dict.keys(): 68 | 
writer.writerow((question_id, question, self.year)) 69 | 70 | def _get_question_ids_of_this_year( 71 | header: List, question_id_dimension_table: Dict 72 | ) -> List: 73 | reversed_question_id_dimension_table = { 74 | question: question_id 75 | for question_id, question in question_id_dimension_table.items() 76 | } 77 | return [ 78 | reversed_question_id_dimension_table[column] 79 | for column in header 80 | if column not in self.USELESS_COLUMNS 81 | ] 82 | 83 | with open(Path(self.filename), encoding="utf-8-sig") as csvfile: 84 | rows = csv.reader(csvfile) 85 | # skip header 86 | header = [column.strip() for column in next(iter(rows))] 87 | question_id_dimension_table = self._generate_question_id_dimension_table( 88 | header 89 | ) 90 | 91 | question_ids = _get_question_ids_of_this_year( 92 | header, question_id_dimension_table 93 | ) 94 | header_of_fact_table = ("ip", "question_id", "answer", "year") 95 | rows_of_fact_table = self._transform_raw_data_to_fact_table_format( 96 | rows, question_id_dimension_table, question_ids 97 | ) 98 | 99 | _export_facttable(header_of_fact_table) 100 | _export_dimension_table(question_id_dimension_table) 101 | 102 | def upload( 103 | self, 104 | facttable_or_dimension_table, 105 | data_layer, 106 | data_domain, 107 | primary_key, 108 | time_dimension, 109 | ): 110 | if facttable_or_dimension_table == "fact": 111 | self._upload_2_bigquery( 112 | self.facttable_filepath, 113 | f"{self.bigquery_project}.{data_layer}.{data_layer}_{data_domain}_{primary_key}_{time_dimension}", 114 | ) 115 | elif facttable_or_dimension_table == "dim": 116 | self._upload_2_bigquery( 117 | self.dimension_table_filepath, 118 | f"{self.bigquery_project}.{data_layer}.{data_layer}_{data_domain}_{primary_key}_{time_dimension}", 119 | ) 120 | 121 | def _upload_2_bigquery(self, file_path, table_id): 122 | job_config = bigquery.LoadJobConfig( 123 | source_format=bigquery.SourceFormat.CSV, 124 | skip_leading_rows=1, 125 | autodetect=True, 126 | allow_quoted_newlines=True, 127 | write_disposition="WRITE_APPEND", 128 | schema_update_options="ALLOW_FIELD_ADDITION", 129 | ) 130 | with open(file_path, "rb") as source_file: 131 | job = self.client.load_table_from_file( 132 | source_file, table_id, job_config=job_config 133 | ) 134 | 135 | job.result() # Waits for the job to complete. 136 | 137 | table = self.client.get_table(table_id) # Make an API request. 138 | print( 139 | f"There's {table.num_rows} rows and {len(table.schema)} columns in {table_id} now!" 
140 | ) 141 | 142 | def _generate_question_id_dimension_table(self, header): 143 | max_existing_question_id = int(max(self.existing_question_and_id_dict.values())) 144 | question_id_dim_table = {} 145 | for index, column in enumerate(header, start=max_existing_question_id): 146 | if column in self.USELESS_COLUMNS: 147 | continue 148 | if column in self.existing_question_and_id_dict: 149 | question_id_dim_table[self.existing_question_and_id_dict[column]] = ( 150 | column 151 | ) 152 | else: 153 | question_id_dim_table[float(index)] = column 154 | return question_id_dim_table 155 | 156 | @staticmethod 157 | def _transform_raw_data_to_fact_table_format( 158 | rows, question_id_dimension_table, question_ids 159 | ): 160 | result = [] 161 | for row in rows: 162 | row_dict = dict(zip(question_ids, row)) 163 | question_id_of_primary_key = [ 164 | key 165 | for key, value in question_id_dimension_table.items() 166 | if value == "IP紀錄" 167 | ][0] 168 | primary_key = row_dict[question_id_of_primary_key] 169 | for question_id, answer in row_dict.items(): 170 | result.append((primary_key, question_id, answer)) 171 | return result 172 | -------------------------------------------------------------------------------- /contrib/survey_cake/upload-survey-cake-csv-to-bigquery.py: -------------------------------------------------------------------------------- 1 | """ 2 | A crawler which would crawl the openings 3 | """ 4 | 5 | import argparse 6 | 7 | from udfs.survey_cake_csv_uploader import SurveyCakeCSVUploader 8 | 9 | TEST_DATA_LAYER = "test" 10 | FILENAMES = { 11 | "data_questionnaire.csv": { 12 | "data_domain": "questionnaire", 13 | "primary_key": "ip", 14 | "time_dimension": "datetime", 15 | }, 16 | "data_sponsor_questionnaire.csv": { 17 | "data_domain": "sponsorQuestionnaire", 18 | "primary_key": "ip", 19 | "time_dimension": "datetime", 20 | }, 21 | } 22 | if __name__ == "__main__": 23 | PARSER = argparse.ArgumentParser() 24 | PARSER.add_argument("-y", "--year", type=int, required=True) 25 | PARSER.add_argument( 26 | "-c", 27 | "--contributor", 28 | type=str, 29 | help="input your name please! You'll find a table with your name in Bigquery.test dataset", 30 | required=True, 31 | ) 32 | PARSER.add_argument("-p", "--prod", action="store_true") 33 | ARGS = PARSER.parse_args() 34 | print( 35 | "HINT: the default mode would load data to dataset `test`. To load data to bigquery's `ods` dataset, please add `--prod` flag!" 
36 | ) 37 | for filename, metadata in FILENAMES.items(): 38 | SURVEY_CAKE_CSV_UPLOADER = SurveyCakeCSVUploader( 39 | year=ARGS.year, filename=filename 40 | ) 41 | SURVEY_CAKE_CSV_UPLOADER.transform() 42 | SURVEY_CAKE_CSV_UPLOADER.upload( 43 | facttable_or_dimension_table="fact", 44 | data_layer="ods" if ARGS.prod else TEST_DATA_LAYER, 45 | data_domain=metadata["data_domain"] 46 | if ARGS.prod 47 | else f"{ARGS.contributor}_{metadata['data_domain']}", 48 | primary_key=metadata["primary_key"], 49 | time_dimension=metadata["time_dimension"], 50 | ) 51 | SURVEY_CAKE_CSV_UPLOADER.upload( 52 | facttable_or_dimension_table="dim", 53 | data_layer="dim" if ARGS.prod else TEST_DATA_LAYER, 54 | data_domain=metadata["data_domain"] 55 | if ARGS.prod 56 | else f"{ARGS.contributor}_{metadata['data_domain']}", 57 | primary_key="questionId", 58 | time_dimension="year", 59 | ) 60 | -------------------------------------------------------------------------------- /contrib/upload-kktix-ticket-csv-to-bigquery.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # export GOOGLE_APPLICATION_CREDENTIALS="" 4 | # 5 | root_pycon_etl=${HOME}/work-my-projects/pycontw-projects/PyCon-ETL 6 | root_upload_data=${HOME}/work-my-projects/pycontw-projects/PyCon-ETL-working 7 | project_id="pycontw-225217" 8 | cmd_upload=${root_pycon_etl}/contrib/upload-kktix-ticket-csv-to-bigquery.py 9 | 10 | 11 | for year in 2018 2019 2020 12 | do 13 | for ticket_type in corporate individual reserved 14 | do 15 | suffix=${ticket_type}_attendees_${year} 16 | cmd_args="${root_upload_data}/${suffix}.csv -p ${project_id} -d ods -t ods_kktix_ticket_${suffix} --upload" 17 | echo ${cmd_args} 18 | ${cmd_upload} ${cmd_args} 19 | done 20 | done 21 | -------------------------------------------------------------------------------- /dags/airflow-log-cleanup.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | """ 3 | A maintenance workflow that you can deploy into Airflow to periodically clean 4 | out the task logs to avoid those getting too big. 5 | airflow trigger_dag --conf '[curly-braces]"maxLogAgeInDays":30[curly-braces]' airflow-log-cleanup 6 | --conf options: 7 | maxLogAgeInDays: - Optional 8 | """ 9 | 10 | import logging 11 | import os 12 | from datetime import timedelta 13 | 14 | import airflow 15 | import jinja2 16 | from airflow.configuration import conf 17 | from airflow.models import DAG, Variable 18 | from airflow.operators.bash_operator import BashOperator 19 | from airflow.operators.dummy_operator import DummyOperator 20 | 21 | # airflow-log-cleanup 22 | DAG_ID = os.path.basename(__file__).replace(".pyc", "").replace(".py", "") 23 | START_DATE = airflow.utils.dates.days_ago(1) 24 | try: 25 | BASE_LOG_FOLDER = conf.get("core", "BASE_LOG_FOLDER").rstrip("/") 26 | except Exception: 27 | BASE_LOG_FOLDER = conf.get("logging", "BASE_LOG_FOLDER").rstrip("/") 28 | # How often to Run. @daily - Once a day at Midnight 29 | SCHEDULE_INTERVAL = "@daily" 30 | # Who is listed as the owner of this DAG in the Airflow Web Server 31 | DAG_OWNER_NAME = "operations" 32 | # List of email address to send email alerts to if this job fails 33 | ALERT_EMAIL_ADDRESSES = ["davidtnfsh@gmail.com"] 34 | # Length to retain the log files if not already provided in the conf. 
If this 35 | # is set to 30, the job will remove those files that are 30 days old or older 36 | DEFAULT_MAX_LOG_AGE_IN_DAYS = Variable.get( 37 | "airflow_log_cleanup__max_log_age_in_days", 3 38 | ) 39 | # Whether the job should delete the logs or not. Included if you want to 40 | # temporarily avoid deleting the logs 41 | ENABLE_DELETE = True 42 | # The number of worker nodes you have in Airflow. Will attempt to run this 43 | # process for however many workers there are so that each worker gets its 44 | # logs cleared. 45 | NUMBER_OF_WORKERS = 1 46 | DIRECTORIES_TO_DELETE = [BASE_LOG_FOLDER] 47 | ENABLE_DELETE_CHILD_LOG = Variable.get( 48 | "airflow_log_cleanup__enable_delete_child_log", "True" 49 | ) 50 | LOG_CLEANUP_PROCESS_LOCK_FILE = "/tmp/airflow_log_cleanup_worker.lock" 51 | logging.info("ENABLE_DELETE_CHILD_LOG " + ENABLE_DELETE_CHILD_LOG) 52 | 53 | if not BASE_LOG_FOLDER or BASE_LOG_FOLDER.strip() == "": 54 | raise ValueError( 55 | "BASE_LOG_FOLDER variable is empty in airflow.cfg. It can be found " 56 | "under the [core] (<2.0.0) section or [logging] (>=2.0.0) in the cfg file. " 57 | "Kindly provide an appropriate directory path." 58 | ) 59 | 60 | if ENABLE_DELETE_CHILD_LOG.lower() == "true": 61 | try: 62 | CHILD_PROCESS_LOG_DIRECTORY = conf.get( 63 | "scheduler", "CHILD_PROCESS_LOG_DIRECTORY" 64 | ) 65 | if CHILD_PROCESS_LOG_DIRECTORY != " ": 66 | DIRECTORIES_TO_DELETE.append(CHILD_PROCESS_LOG_DIRECTORY) 67 | except Exception as e: 68 | logging.exception( 69 | "Could not obtain CHILD_PROCESS_LOG_DIRECTORY from " 70 | + "Airflow Configurations: " 71 | + str(e) 72 | ) 73 | 74 | default_args = { 75 | "owner": DAG_OWNER_NAME, 76 | "depends_on_past": False, 77 | "email": ALERT_EMAIL_ADDRESSES, 78 | "email_on_failure": True, 79 | "email_on_retry": False, 80 | "start_date": START_DATE, 81 | "retries": 1, 82 | "retry_delay": timedelta(minutes=1), 83 | } 84 | 85 | dag = DAG( 86 | DAG_ID, 87 | default_args=default_args, 88 | schedule_interval=SCHEDULE_INTERVAL, 89 | start_date=START_DATE, 90 | tags=["teamclairvoyant", "airflow-maintenance-dags"], 91 | template_undefined=jinja2.Undefined, 92 | ) 93 | if hasattr(dag, "doc_md"): 94 | dag.doc_md = __doc__ 95 | if hasattr(dag, "catchup"): 96 | dag.catchup = False 97 | 98 | start = DummyOperator(task_id="start", dag=dag) 99 | 100 | log_cleanup = ( 101 | """ 102 | 103 | echo "Getting Configurations..." 104 | BASE_LOG_FOLDER="{{params.directory}}" 105 | WORKER_SLEEP_TIME="{{params.sleep_time}}" 106 | 107 | sleep ${WORKER_SLEEP_TIME}s 108 | 109 | MAX_LOG_AGE_IN_DAYS="{{dag_run.conf.maxLogAgeInDays}}" 110 | if [ "${MAX_LOG_AGE_IN_DAYS}" == "" ]; then 111 | echo "maxLogAgeInDays conf variable isn't included. Using Default '""" 112 | + str(DEFAULT_MAX_LOG_AGE_IN_DAYS) 113 | + """'." 
114 | MAX_LOG_AGE_IN_DAYS='""" 115 | + str(DEFAULT_MAX_LOG_AGE_IN_DAYS) 116 | + """' 117 | fi 118 | ENABLE_DELETE=""" 119 | + str("true" if ENABLE_DELETE else "false") 120 | + """ 121 | echo "Finished Getting Configurations" 122 | echo "" 123 | 124 | echo "Configurations:" 125 | echo "BASE_LOG_FOLDER: '${BASE_LOG_FOLDER}'" 126 | echo "MAX_LOG_AGE_IN_DAYS: '${MAX_LOG_AGE_IN_DAYS}'" 127 | echo "ENABLE_DELETE: '${ENABLE_DELETE}'" 128 | 129 | cleanup() { 130 | echo "Executing Find Statement: $1" 131 | FILES_MARKED_FOR_DELETE=`eval $1` 132 | echo "Process will be Deleting the following File(s)/Directory(s):" 133 | echo "${FILES_MARKED_FOR_DELETE}" 134 | echo "Process will be Deleting `echo "${FILES_MARKED_FOR_DELETE}" | \ 135 | grep -v '^$' | wc -l` File(s)/Directory(s)" \ 136 | # "grep -v '^$'" - removes empty lines. 137 | # "wc -l" - Counts the number of lines 138 | echo "" 139 | if [ "${ENABLE_DELETE}" == "true" ]; 140 | then 141 | if [ "${FILES_MARKED_FOR_DELETE}" != "" ]; 142 | then 143 | echo "Executing Delete Statement: $2" 144 | eval $2 145 | DELETE_STMT_EXIT_CODE=$? 146 | if [ "${DELETE_STMT_EXIT_CODE}" != "0" ]; then 147 | echo "Delete process failed with exit code \ 148 | '${DELETE_STMT_EXIT_CODE}'" 149 | 150 | echo "Removing lock file..." 151 | rm -f """ 152 | + str(LOG_CLEANUP_PROCESS_LOCK_FILE) 153 | + """ 154 | if [ "${REMOVE_LOCK_FILE_EXIT_CODE}" != "0" ]; then 155 | echo "Error removing the lock file. \ 156 | Check file permissions.\ 157 | To re-run the DAG, ensure that the lock file has been \ 158 | deleted (""" 159 | + str(LOG_CLEANUP_PROCESS_LOCK_FILE) 160 | + """)." 161 | exit ${REMOVE_LOCK_FILE_EXIT_CODE} 162 | fi 163 | exit ${DELETE_STMT_EXIT_CODE} 164 | fi 165 | else 166 | echo "WARN: No File(s)/Directory(s) to Delete" 167 | fi 168 | else 169 | echo "WARN: You're opted to skip deleting the File(s)/Directory(s)!!!" 170 | fi 171 | } 172 | 173 | 174 | if [ ! -f """ 175 | + str(LOG_CLEANUP_PROCESS_LOCK_FILE) 176 | + """ ]; then 177 | 178 | echo "Lock file not found on this node! \ 179 | Creating it to prevent collisions..." 180 | touch """ 181 | + str(LOG_CLEANUP_PROCESS_LOCK_FILE) 182 | + """ 183 | CREATE_LOCK_FILE_EXIT_CODE=$? 184 | if [ "${CREATE_LOCK_FILE_EXIT_CODE}" != "0" ]; then 185 | echo "Error creating the lock file. \ 186 | Check if the airflow user can create files under tmp directory. \ 187 | Exiting..." 188 | exit ${CREATE_LOCK_FILE_EXIT_CODE} 189 | fi 190 | 191 | echo "" 192 | echo "Running Cleanup Process..." 193 | 194 | FIND_STATEMENT="find ${BASE_LOG_FOLDER}/*/* -type f -mtime \ 195 | +${MAX_LOG_AGE_IN_DAYS}" 196 | DELETE_STMT="${FIND_STATEMENT} -exec rm -f {} \;" 197 | 198 | cleanup "${FIND_STATEMENT}" "${DELETE_STMT}" 199 | CLEANUP_EXIT_CODE=$? 200 | 201 | FIND_STATEMENT="find ${BASE_LOG_FOLDER}/*/* -type d -empty" 202 | DELETE_STMT="${FIND_STATEMENT} -prune -exec rm -rf {} \;" 203 | 204 | cleanup "${FIND_STATEMENT}" "${DELETE_STMT}" 205 | CLEANUP_EXIT_CODE=$? 206 | 207 | FIND_STATEMENT="find ${BASE_LOG_FOLDER}/* -type d -empty" 208 | DELETE_STMT="${FIND_STATEMENT} -prune -exec rm -rf {} \;" 209 | 210 | cleanup "${FIND_STATEMENT}" "${DELETE_STMT}" 211 | CLEANUP_EXIT_CODE=$? 212 | 213 | echo "Finished Running Cleanup Process" 214 | 215 | echo "Deleting lock file..." 216 | rm -f """ 217 | + str(LOG_CLEANUP_PROCESS_LOCK_FILE) 218 | + """ 219 | REMOVE_LOCK_FILE_EXIT_CODE=$? 220 | if [ "${REMOVE_LOCK_FILE_EXIT_CODE}" != "0" ]; then 221 | echo "Error removing the lock file. Check file permissions. 
To re-run the DAG, ensure that the lock file has been deleted (""" 222 | + str(LOG_CLEANUP_PROCESS_LOCK_FILE) 223 | + """)." 224 | exit ${REMOVE_LOCK_FILE_EXIT_CODE} 225 | fi 226 | 227 | else 228 | echo "Another task is already deleting logs on this worker node. \ 229 | Skipping it!" 230 | echo "If you believe you're receiving this message in error, kindly check \ 231 | if """ 232 | + str(LOG_CLEANUP_PROCESS_LOCK_FILE) 233 | + """ exists and delete it." 234 | exit 0 235 | fi 236 | 237 | """ 238 | ) 239 | 240 | for log_cleanup_id in range(1, NUMBER_OF_WORKERS + 1): 241 | for dir_id, directory in enumerate(DIRECTORIES_TO_DELETE): 242 | log_cleanup_op = BashOperator( 243 | task_id="log_cleanup_worker_num_" 244 | + str(log_cleanup_id) 245 | + "_dir_" 246 | + str(dir_id), 247 | bash_command=log_cleanup, 248 | params={"directory": str(directory), "sleep_time": int(log_cleanup_id) * 3}, 249 | dag=dag, 250 | ) 251 | 252 | log_cleanup_op.set_upstream(start) 253 | -------------------------------------------------------------------------------- /dags/airlfow-db-cleanup.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | """ 3 | A maintenance workflow that you can deploy into Airflow to periodically clean 4 | out the DagRun, TaskInstance, Log, XCom, Job DB and SlaMiss entries to avoid 5 | having too much data in your Airflow MetaStore. 6 | 7 | airflow trigger_dag --conf '[curly-braces]"maxDBEntryAgeInDays":30[curly-braces]' airflow-db-cleanup 8 | 9 | --conf options: 10 | maxDBEntryAgeInDays: - Optional 11 | 12 | """ 13 | 14 | import airflow 15 | from airflow import settings 16 | from airflow.configuration import conf 17 | from airflow.models import ( 18 | DAG, 19 | DagModel, 20 | DagRun, 21 | DagTag, 22 | Log, 23 | SlaMiss, 24 | TaskInstance, 25 | Variable, 26 | XCom, 27 | ) 28 | 29 | try: 30 | from airflow.jobs import BaseJob 31 | except Exception as e: 32 | from airflow.jobs.base_job import BaseJob 33 | 34 | import logging 35 | import os 36 | from datetime import datetime, timedelta 37 | 38 | import dateutil.parser 39 | from airflow.operators.python_operator import PythonOperator 40 | from sqlalchemy import and_, func 41 | from sqlalchemy.exc import ProgrammingError 42 | from sqlalchemy.orm import load_only 43 | 44 | try: 45 | # airflow.utils.timezone is available from v1.10 onwards 46 | from airflow.utils import timezone 47 | 48 | now = timezone.utcnow 49 | except ImportError: 50 | now = datetime.utcnow 51 | 52 | # airflow-db-cleanup 53 | DAG_ID = os.path.basename(__file__).replace(".pyc", "").replace(".py", "") 54 | START_DATE = airflow.utils.dates.days_ago(1) 55 | # How often to Run. @daily - Once a day at Midnight (UTC) 56 | SCHEDULE_INTERVAL = "@daily" 57 | # Who is listed as the owner of this DAG in the Airflow Web Server 58 | DAG_OWNER_NAME = "operations" 59 | # List of email address to send email alerts to if this job fails 60 | ALERT_EMAIL_ADDRESSES = ["henry410213028@gmail.com"] 61 | # Length to retain the log files if not already provided in the conf. If this 62 | # is set to 30, the job will remove those files that arE 30 days old or older. 63 | 64 | DEFAULT_MAX_DB_ENTRY_AGE_IN_DAYS = int( 65 | Variable.get("airflow_db_cleanup__max_db_entry_age_in_days", 30) 66 | ) 67 | # Prints the database entries which will be getting deleted; set to False to avoid printing large lists and slowdown process 68 | PRINT_DELETES = True 69 | # Whether the job should delete the db entries or not. 
Included if you want to 70 | # temporarily avoid deleting the db entries. 71 | ENABLE_DELETE = True 72 | 73 | # get dag model last schedule run 74 | try: 75 | dag_model_last_scheduler_run = DagModel.last_scheduler_run 76 | except AttributeError: 77 | dag_model_last_scheduler_run = DagModel.last_parsed_time 78 | 79 | # List of all the objects that will be deleted. Comment out the DB objects you 80 | # want to skip. 81 | DATABASE_OBJECTS = [ 82 | { 83 | "airflow_db_model": BaseJob, 84 | "age_check_column": BaseJob.latest_heartbeat, 85 | "keep_last": False, 86 | "keep_last_filters": None, 87 | "keep_last_group_by": None, 88 | }, 89 | { 90 | "airflow_db_model": DagRun, 91 | "age_check_column": DagRun.execution_date, 92 | "keep_last": True, 93 | "keep_last_filters": [DagRun.external_trigger.is_(False)], 94 | "keep_last_group_by": DagRun.dag_id, 95 | }, 96 | { 97 | "airflow_db_model": TaskInstance, 98 | "age_check_column": TaskInstance.execution_date, 99 | "keep_last": False, 100 | "keep_last_filters": None, 101 | "keep_last_group_by": None, 102 | }, 103 | { 104 | "airflow_db_model": Log, 105 | "age_check_column": Log.dttm, 106 | "keep_last": False, 107 | "keep_last_filters": None, 108 | "keep_last_group_by": None, 109 | }, 110 | { 111 | "airflow_db_model": XCom, 112 | "age_check_column": XCom.execution_date, 113 | "keep_last": False, 114 | "keep_last_filters": None, 115 | "keep_last_group_by": None, 116 | }, 117 | { 118 | "airflow_db_model": SlaMiss, 119 | "age_check_column": SlaMiss.execution_date, 120 | "keep_last": False, 121 | "keep_last_filters": None, 122 | "keep_last_group_by": None, 123 | }, 124 | { 125 | "airflow_db_model": DagModel, 126 | "age_check_column": dag_model_last_scheduler_run, 127 | "keep_last": False, 128 | "keep_last_filters": None, 129 | "keep_last_group_by": None, 130 | }, 131 | ] 132 | 133 | # Check for TaskReschedule model 134 | try: 135 | from airflow.models import TaskReschedule 136 | 137 | DATABASE_OBJECTS.append( 138 | { 139 | "airflow_db_model": TaskReschedule, 140 | "age_check_column": TaskReschedule.execution_date, 141 | "keep_last": False, 142 | "keep_last_filters": None, 143 | "keep_last_group_by": None, 144 | } 145 | ) 146 | 147 | except Exception as e: 148 | logging.error(e) 149 | 150 | # Check for TaskFail model 151 | try: 152 | from airflow.models import TaskFail 153 | 154 | DATABASE_OBJECTS.append( 155 | { 156 | "airflow_db_model": TaskFail, 157 | "age_check_column": TaskFail.execution_date, 158 | "keep_last": False, 159 | "keep_last_filters": None, 160 | "keep_last_group_by": None, 161 | } 162 | ) 163 | 164 | except Exception as e: 165 | logging.error(e) 166 | 167 | # Check for RenderedTaskInstanceFields model 168 | try: 169 | from airflow.models import RenderedTaskInstanceFields 170 | 171 | DATABASE_OBJECTS.append( 172 | { 173 | "airflow_db_model": RenderedTaskInstanceFields, 174 | "age_check_column": RenderedTaskInstanceFields.execution_date, 175 | "keep_last": False, 176 | "keep_last_filters": None, 177 | "keep_last_group_by": None, 178 | } 179 | ) 180 | 181 | except Exception as e: 182 | logging.error(e) 183 | 184 | # Check for ImportError model 185 | try: 186 | from airflow.models import ImportError 187 | 188 | DATABASE_OBJECTS.append( 189 | { 190 | "airflow_db_model": ImportError, 191 | "age_check_column": ImportError.timestamp, 192 | "keep_last": False, 193 | "keep_last_filters": None, 194 | "keep_last_group_by": None, 195 | } 196 | ) 197 | 198 | except Exception as e: 199 | logging.error(e) 200 | 201 | # Check for celery executor 202 | 
airflow_executor = str(conf.get("core", "executor")) 203 | logging.info("Airflow Executor: " + str(airflow_executor)) 204 | if airflow_executor == "CeleryExecutor": 205 | logging.info("Including Celery Modules") 206 | try: 207 | from celery.backends.database.models import Task, TaskSet 208 | 209 | DATABASE_OBJECTS.extend( 210 | ( 211 | { 212 | "airflow_db_model": Task, 213 | "age_check_column": Task.date_done, 214 | "keep_last": False, 215 | "keep_last_filters": None, 216 | "keep_last_group_by": None, 217 | }, 218 | { 219 | "airflow_db_model": TaskSet, 220 | "age_check_column": TaskSet.date_done, 221 | "keep_last": False, 222 | "keep_last_filters": None, 223 | "keep_last_group_by": None, 224 | }, 225 | ) 226 | ) 227 | 228 | except Exception as e: 229 | logging.error(e) 230 | 231 | session = settings.Session() 232 | 233 | default_args = { 234 | "owner": DAG_OWNER_NAME, 235 | "depends_on_past": False, 236 | "email": ALERT_EMAIL_ADDRESSES, 237 | "email_on_failure": True, 238 | "email_on_retry": False, 239 | "start_date": START_DATE, 240 | "retries": 1, 241 | "retry_delay": timedelta(minutes=1), 242 | } 243 | 244 | dag = DAG( 245 | DAG_ID, 246 | default_args=default_args, 247 | schedule_interval=SCHEDULE_INTERVAL, 248 | start_date=START_DATE, 249 | tags=["teamclairvoyant", "airflow-maintenance-dags"], 250 | ) 251 | if hasattr(dag, "doc_md"): 252 | dag.doc_md = __doc__ 253 | if hasattr(dag, "catchup"): 254 | dag.catchup = False 255 | 256 | 257 | def print_configuration_function(**context): 258 | logging.info("Loading Configurations...") 259 | dag_run_conf = context.get("dag_run").conf 260 | logging.info("dag_run.conf: " + str(dag_run_conf)) 261 | max_db_entry_age_in_days = None 262 | if dag_run_conf: 263 | max_db_entry_age_in_days = dag_run_conf.get("maxDBEntryAgeInDays", None) 264 | logging.info("maxDBEntryAgeInDays from dag_run.conf: " + str(dag_run_conf)) 265 | if max_db_entry_age_in_days is None or max_db_entry_age_in_days < 1: 266 | logging.info( 267 | "maxDBEntryAgeInDays conf variable isn't included or Variable " 268 | + "value is less than 1. 
Using Default '" 269 | + str(DEFAULT_MAX_DB_ENTRY_AGE_IN_DAYS) 270 | + "'" 271 | ) 272 | max_db_entry_age_in_days = DEFAULT_MAX_DB_ENTRY_AGE_IN_DAYS 273 | max_date = now() + timedelta(-max_db_entry_age_in_days) 274 | logging.info("Finished Loading Configurations") 275 | logging.info("") 276 | 277 | logging.info("Configurations:") 278 | logging.info("max_db_entry_age_in_days: " + str(max_db_entry_age_in_days)) 279 | logging.info("max_date: " + str(max_date)) 280 | logging.info("enable_delete: " + str(ENABLE_DELETE)) 281 | logging.info("session: " + str(session)) 282 | logging.info("") 283 | 284 | logging.info("Setting max_execution_date to XCom for Downstream Processes") 285 | context["ti"].xcom_push(key="max_date", value=max_date.isoformat()) 286 | 287 | 288 | print_configuration = PythonOperator( 289 | task_id="print_configuration", 290 | python_callable=print_configuration_function, 291 | provide_context=True, 292 | dag=dag, 293 | ) 294 | 295 | 296 | def cleanup_function(**context): 297 | logging.info("Retrieving max_execution_date from XCom") 298 | max_date = context["ti"].xcom_pull( 299 | task_ids=print_configuration.task_id, key="max_date" 300 | ) 301 | max_date = dateutil.parser.parse(max_date) # stored as iso8601 str in xcom 302 | 303 | airflow_db_model = context["params"].get("airflow_db_model") 304 | state = context["params"].get("state") 305 | age_check_column = context["params"].get("age_check_column") 306 | keep_last = context["params"].get("keep_last") 307 | keep_last_filters = context["params"].get("keep_last_filters") 308 | keep_last_group_by = context["params"].get("keep_last_group_by") 309 | 310 | logging.info("Configurations:") 311 | logging.info("max_date: " + str(max_date)) 312 | logging.info("enable_delete: " + str(ENABLE_DELETE)) 313 | logging.info("session: " + str(session)) 314 | logging.info("airflow_db_model: " + str(airflow_db_model)) 315 | logging.info("state: " + str(state)) 316 | logging.info("age_check_column: " + str(age_check_column)) 317 | logging.info("keep_last: " + str(keep_last)) 318 | logging.info("keep_last_filters: " + str(keep_last_filters)) 319 | logging.info("keep_last_group_by: " + str(keep_last_group_by)) 320 | 321 | logging.info("") 322 | 323 | logging.info("Running Cleanup Process...") 324 | 325 | try: 326 | query = session.query(airflow_db_model).options(load_only(age_check_column)) 327 | 328 | logging.info("INITIAL QUERY : " + str(query)) 329 | 330 | if keep_last: 331 | subquery = session.query(func.max(DagRun.execution_date)) 332 | # workaround for MySQL "table specified twice" issue 333 | # https://github.com/teamclairvoyant/airflow-maintenance-dags/issues/41 334 | if keep_last_filters is not None: 335 | for entry in keep_last_filters: 336 | subquery = subquery.filter(entry) 337 | 338 | logging.info("SUB QUERY [keep_last_filters]: " + str(subquery)) 339 | 340 | if keep_last_group_by is not None: 341 | subquery = subquery.group_by(keep_last_group_by) 342 | logging.info("SUB QUERY [keep_last_group_by]: " + str(subquery)) 343 | 344 | subquery = subquery.from_self() 345 | 346 | query = query.filter( 347 | and_(age_check_column.notin_(subquery)), 348 | and_(age_check_column <= max_date), 349 | ) 350 | 351 | else: 352 | query = query.filter( 353 | age_check_column <= max_date, 354 | ) 355 | 356 | if PRINT_DELETES: 357 | entries_to_delete = query.all() 358 | 359 | logging.info("Query: " + str(query)) 360 | logging.info( 361 | "Process will be Deleting the following " 362 | + str(airflow_db_model.__name__) 363 | + "(s):" 364 | ) 365 | for entry 
in entries_to_delete: 366 | logging.info( 367 | "\tEntry: " 368 | + str(entry) 369 | + ", Date: " 370 | + str(entry.__dict__[str(age_check_column).split(".")[1]]) 371 | ) 372 | 373 | logging.info( 374 | "Process will be Deleting " 375 | + str(len(entries_to_delete)) 376 | + " " 377 | + str(airflow_db_model.__name__) 378 | + "(s)" 379 | ) 380 | else: 381 | logging.warn( 382 | "You've opted to skip printing the db entries to be deleted. Set PRINT_DELETES to True to show entries!!!" 383 | ) 384 | 385 | if ENABLE_DELETE: 386 | logging.info("Performing Delete...") 387 | if airflow_db_model.__name__ == "DagModel": 388 | logging.info("Deleting tags...") 389 | ids_query = query.from_self().with_entities(DagModel.dag_id) 390 | tags_query = session.query(DagTag).filter(DagTag.dag_id.in_(ids_query)) 391 | logging.info("Tags delete Query: " + str(tags_query)) 392 | tags_query.delete(synchronize_session=False) 393 | # using bulk delete 394 | query.delete(synchronize_session=False) 395 | session.commit() 396 | logging.info("Finished Performing Delete") 397 | else: 398 | logging.warn( 399 | "You've opted to skip deleting the db entries. Set ENABLE_DELETE to True to delete entries!!!" 400 | ) 401 | 402 | logging.info("Finished Running Cleanup Process") 403 | 404 | except ProgrammingError as e: 405 | logging.error(e) 406 | logging.error( 407 | str(airflow_db_model) + " is not present in the metadata. Skipping..." 408 | ) 409 | 410 | 411 | for db_object in DATABASE_OBJECTS: 412 | cleanup_op = PythonOperator( 413 | task_id="cleanup_" + str(db_object["airflow_db_model"].__name__), 414 | python_callable=cleanup_function, 415 | params=db_object, 416 | provide_context=True, 417 | dag=dag, 418 | ) 419 | 420 | print_configuration.set_downstream(cleanup_op) 421 | -------------------------------------------------------------------------------- /dags/app/channel_reminder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/dags/app/channel_reminder/__init__.py -------------------------------------------------------------------------------- /dags/app/channel_reminder/dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Send Google Search Report to Discord 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from app.channel_reminder import udf 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "davidtnfsh", 13 | "depends_on_past": False, 14 | "start_date": datetime(2022, 9, 15), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Discord", 18 | } 19 | dag = DAG( 20 | "DISCORD_CHORES_REMINDER", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="@yearly", 23 | max_active_runs=1, 24 | catchup=False, 25 | ) 26 | with dag: 27 | REMINDER_OF_THIS_TEAM = PythonOperator( 28 | task_id="KLAIVYO_REMINDER", python_callable=udf.main 29 | ) 30 | 31 | if __name__ == "__main__": 32 | dag.cli() 33 | -------------------------------------------------------------------------------- /dags/app/channel_reminder/udf.py: -------------------------------------------------------------------------------- 1 | from airflow.models import Variable 2 | from app import discord 3 | 4 | 5 | def main() -> None: 6 | kwargs = { 7 | "webhook_url": Variable.get("DISCORD_CHORES_REMINDER_WEBHOOK"), 8 | 
"username": "Data Team Airflow reminder", 9 | "msg": ( 10 | "<@&790739794148982796> <@&755827317904769184> <@&791157626099859487>\n", 11 | "記得大會結束後,要有一個人負責去取消 Klaviyo 的訂閱,不然我們每個月會一直繳 $NTD2000 喔!", 12 | ), 13 | } 14 | discord.send_webhook_message(**kwargs) 15 | -------------------------------------------------------------------------------- /dags/app/discord.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import tenacity 3 | 4 | session = requests.session() 5 | 6 | RETRY_ARGS = dict( 7 | wait=tenacity.wait_random(min=1, max=10), 8 | stop=tenacity.stop_after_attempt(10), 9 | retry=tenacity.retry_if_exception_type(requests.exceptions.ConnectionError), 10 | ) 11 | 12 | 13 | @tenacity.retry(**RETRY_ARGS) 14 | def send_webhook_message(webhook_url: str, username: str, msg: str) -> None: 15 | session.post( 16 | webhook_url, 17 | json={"username": username, "content": msg}, 18 | ) 19 | -------------------------------------------------------------------------------- /dags/app/finance_bot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/dags/app/finance_bot/__init__.py -------------------------------------------------------------------------------- /dags/app/finance_bot/dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Send Google Search Report to Discord 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from app.finance_bot import udf 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "qchwan", 13 | "depends_on_past": False, 14 | "start_date": datetime(2023, 8, 27), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Discord", 18 | } 19 | dag = DAG( 20 | "DISCORD_FINANCE_REMINDER", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="@daily", 23 | max_active_runs=1, 24 | catchup=False, 25 | ) 26 | with dag: 27 | REMINDER_OF_THIS_TEAM = PythonOperator( 28 | task_id="FINANCE_REMINDER", python_callable=udf.main 29 | ) 30 | 31 | if __name__ == "__main__": 32 | dag.cli() 33 | -------------------------------------------------------------------------------- /dags/app/finance_bot/udf.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pygsheets 6 | import requests 7 | from airflow.models import Variable 8 | from app import discord 9 | from google.cloud import bigquery 10 | 11 | session = requests.session() 12 | 13 | 14 | def main() -> None: 15 | # read xls from google doc to df. 16 | df_xls = read_google_xls_to_df() 17 | # read bigquery to df. 
18 | df_bigquery = read_bigquery_to_df() 19 | # check difference between 2 df 20 | df_diff = df_difference(df_xls, df_bigquery) 21 | # link to bigquery and write xls file 22 | write_to_bigquery(df_diff) 23 | # push to discord 24 | kwargs = { 25 | "webhook_url": Variable.get("discord_data_stratagy_webhook"), 26 | "username": "財務機器人", 27 | "msg": refine_diff_df_to_string(df_diff), 28 | } 29 | if kwargs["msg"] != "no data": 30 | discord.send_webhook_message(**kwargs) 31 | 32 | 33 | def df_difference(df_xls, df_bigquery) -> pd.DataFrame: 34 | merged = pd.merge(df_xls, df_bigquery, how="outer", indicator=True) 35 | return merged[merged["_merge"] == "left_only"].drop("_merge", axis=1) 36 | 37 | 38 | def read_bigquery_to_df() -> pd.DataFrame: 39 | client = bigquery.Client() 40 | query = """ 41 | SELECT * 42 | FROM `pycontw-225217.ods.pycontw_finance` 43 | """ 44 | query_job = client.query(query) 45 | results = query_job.result() 46 | schema = results.schema 47 | column_names = [field.name for field in schema] 48 | data = [list(row.values()) for row in results] 49 | df = pd.DataFrame(data=data, columns=column_names) 50 | 51 | return df 52 | 53 | 54 | def read_google_xls_to_df() -> pd.DataFrame: 55 | gc = pygsheets.authorize(service_file=os.getenv("GOOGLE_APPLICATION_CREDENTIALS")) 56 | sheet = gc.open_by_url(Variable.get("finance_xls_path")) 57 | wks = sheet.sheet1 58 | df = wks.get_as_df(include_tailing_empty=False) 59 | df.replace("", np.nan, inplace=True) 60 | df.dropna(inplace=True) 61 | df = df.astype(str) 62 | df.columns = [ 63 | "Reason", 64 | "Price", 65 | "Remarks", 66 | "Team_name", 67 | "Details", 68 | "To_who", 69 | "Yes_or_No", 70 | ] 71 | return df 72 | 73 | 74 | def write_to_bigquery(df) -> None: 75 | project_id = "pycontw-225217" 76 | dataset_id = "ods" 77 | table_id = "pycontw_finance" 78 | client = bigquery.Client(project=project_id) 79 | table = client.dataset(dataset_id).table(table_id) 80 | schema = [ 81 | bigquery.SchemaField("Reason", "STRING", mode="REQUIRED"), 82 | bigquery.SchemaField("Price", "STRING", mode="REQUIRED"), 83 | bigquery.SchemaField("Remarks", "STRING", mode="REQUIRED"), 84 | bigquery.SchemaField("Team_name", "STRING", mode="REQUIRED"), 85 | bigquery.SchemaField("Details", "STRING", mode="REQUIRED"), 86 | bigquery.SchemaField("To_who", "STRING", mode="REQUIRED"), 87 | bigquery.SchemaField("Yes_or_No", "STRING", mode="REQUIRED"), 88 | ] 89 | job_config = bigquery.LoadJobConfig(schema=schema) 90 | job = client.load_table_from_dataframe(df, table, job_config=job_config) 91 | job.result() 92 | 93 | 94 | def refine_diff_df_to_string(df) -> str: 95 | msg = "" 96 | if df.empty: 97 | return "no data" 98 | else: 99 | for row in df.itertuples(index=False): 100 | msg += f"{row[0]}, 花費: {row[1]}, {row[3]}, {row[4]}\n" 101 | return msg 102 | -------------------------------------------------------------------------------- /dags/app/proposal_reminder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/dags/app/proposal_reminder/__init__.py -------------------------------------------------------------------------------- /dags/app/proposal_reminder/dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Send Proposal Summary to Discord 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from 
app.proposal_reminder import udf 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "Henry Lee", 13 | "depends_on_past": False, 14 | "start_date": datetime(2025, 2, 25), 15 | "end_date": datetime(2025, 4, 9), 16 | "retries": 2, 17 | "retry_delay": timedelta(minutes=5), 18 | } 19 | 20 | with DAG( 21 | "DISCORD_PROPOSAL_REMINDER_v3", 22 | default_args=DEFAULT_ARGS, 23 | schedule_interval="0 16 * * *", # At 16:00 (00:00 +8) 24 | max_active_runs=1, 25 | catchup=False, 26 | ) as dag: 27 | PythonOperator( 28 | task_id="SEND_PROPOSAL_SUMMARY", 29 | python_callable=udf.main, 30 | ) 31 | 32 | if __name__ == "__main__": 33 | dag.cli() 34 | -------------------------------------------------------------------------------- /dags/app/proposal_reminder/udf.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from airflow.models import Variable 3 | from app import discord 4 | 5 | 6 | def main() -> None: 7 | summary = get_proposal_summary() 8 | n_talk = summary["num_proposed_talk"] 9 | n_tutorial = summary["num_proposed_tutorial"] 10 | kwargs = { 11 | "webhook_url": Variable.get("DISCORD_PROGRAM_REMINDER_WEBHOOK"), 12 | "username": "Program talk reminder", 13 | "msg": f"目前投稿議程數: {n_talk}; 課程數: {n_tutorial}", 14 | } 15 | discord.send_webhook_message(**kwargs) 16 | 17 | 18 | def get_proposal_summary() -> dict: 19 | url = "https://tw.pycon.org/prs/api/proposals/summary/" 20 | headers = { 21 | "Content-Type": "application/json", 22 | "authorization": Variable.get("PYCON_API_TOKEN"), 23 | } 24 | response = requests.get(url, headers=headers) 25 | return response.json() 26 | -------------------------------------------------------------------------------- /dags/app/team_registration_bot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/dags/app/team_registration_bot/__init__.py -------------------------------------------------------------------------------- /dags/app/team_registration_bot/dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | send daily ordering metrics to discord channel 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from app.team_registration_bot import udf 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "davidtnfsh@gmail.com", 13 | "depends_on_past": False, 14 | "start_date": datetime(2022, 7, 4), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 18 | } 19 | dag = DAG( 20 | "KKTIX_DISCORD_BOT_FOR_TEAM_REGISTRATION", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="@daily", 23 | max_active_runs=1, 24 | catchup=False, 25 | ) 26 | with dag: 27 | SEND_MSG_TO_DISCORD = PythonOperator( 28 | task_id="LOAD_TO_DISCORD", 29 | python_callable=udf.main, 30 | ) 31 | 32 | if __name__ == "__main__": 33 | dag.cli() 34 | -------------------------------------------------------------------------------- /dags/app/team_registration_bot/udf.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | from typing import Dict 4 | 5 | from airflow.models import Variable 6 | from app import discord 7 | from google.cloud import bigquery 8 | 9 | YEAR = datetime.now().year 10 | 11 | TABLE = f"{os.getenv('BIGQUERY_PROJECT', 
'pycontw-225217')}.ods.ods_kktix_attendeeId_datetime" 12 | 13 | CLIENT = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 14 | 15 | 16 | def main() -> None: 17 | statistics = _get_statistics_from_bigquery() 18 | msg = _compose_discord_msg(statistics) 19 | kwargs = { 20 | "webhook_url": Variable.get("discord_webhook_registration_endpoint"), 21 | "username": "KKTIX order report", 22 | "msg": msg, 23 | } 24 | discord.send_webhook_message(**kwargs) 25 | 26 | 27 | def _get_statistics_from_bigquery() -> Dict: 28 | query_job = CLIENT.query( 29 | f""" 30 | WITH UNIQUE_RECORDS AS ( 31 | SELECT DISTINCT 32 | NAME, 33 | JSON_EXTRACT(ATTENDEE_INFO, '$.id') AS ORDER_ID, 34 | REPLACE(JSON_EXTRACT(ATTENDEE_INFO, '$.ticket_name'), '"', '') AS TICKET_NAME, 35 | FROM 36 | `{TABLE}` 37 | WHERE 38 | ((REFUNDED IS NULL) OR (REFUNDED = FALSE)) AND (NAME LIKE "PyCon TW {YEAR} Registration%") 39 | ) 40 | 41 | SELECT 42 | NAME, 43 | TICKET_NAME, 44 | COUNT(1) AS COUNTS 45 | FROM UNIQUE_RECORDS 46 | GROUP BY 47 | NAME, TICKET_NAME; 48 | """ # nosec 49 | ) 50 | result = query_job.result() 51 | return result 52 | 53 | 54 | ticket_price = { 55 | # please update the price for target year 56 | "企業票 - 一般階段 / Corporate - Regular Stage": 5800, 57 | "企業票 - 晚鳥階段 / Corporate - Final Stage": 6500, 58 | "企業團體票 (歡迎申請) / Group-Buy Corporate (Free to Apply)": 5220, 59 | "優惠票 (含紀念衣服) / Reserved - Community (with T-Shirt)": 2590, 60 | "貢獻者票 (含紀念衣服) / Reserved - Contributor (with T-Shirt)": 1290, 61 | "財務補助票 / Reserved - Financial Aid": 0, 62 | "邀請票 (含紀念衣服) / Reserved - Invited (with T-Shirt)": 0, 63 | "個人贊助票 (含紀念衣服) / Individual - Sponsor (with T-Shirt)": 5500, 64 | "個人票 - 早鳥 (含紀念衣服) / Individual - Early Stage (with T-Shirt)": 2790, 65 | "個人票 - 一般 (含紀念衣服)/ Individual - Regular Stage (with T-Shirt)": 3790, 66 | "個人票 - 晚鳥階段 / Individual - Final Stage": 4290, 67 | "愛心優待票 (含紀念衣服)/ Individual - Concession": 1895, 68 | } 69 | 70 | 71 | def _compose_discord_msg(payload) -> str: 72 | msg = f"Hi 這是今天 {datetime.now().date()} 的票種統計資料,售票期結束後,請 follow README 的 `gcloud` 指令進去把 Airflow DAG 關掉\n\n" 73 | total = 0 74 | total_income = 0 75 | for name, ticket_name, counts in payload: 76 | msg += f" * 票種:{ticket_name}\t{counts}張\n" 77 | total += counts 78 | total_income += ticket_price.get(ticket_name, 0) * counts 79 | total_income = f"{total_income:,}" 80 | msg += f"dashboard: https://metabase.pycon.tw/question/142?year={YEAR}\n" 81 | msg += f"總共賣出 {total} 張喔~ (總收入 TWD${total_income})" 82 | return msg 83 | -------------------------------------------------------------------------------- /dags/app/twitter_post_notification_bot/dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Send Proposal Summary to Discord 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from app.twitter_post_notification_bot import udf 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "David Jr.", 13 | "depends_on_past": False, 14 | "start_date": datetime(2023, 7, 1), 15 | "retries": 1, 16 | "retry_delay": timedelta(minutes=5), 17 | } 18 | 19 | with DAG( 20 | "TWITTER_POST_NOTIFICATION_BOT_V2", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="@daily", 23 | max_active_runs=1, 24 | catchup=False, 25 | ) as dag: 26 | PythonOperator( 27 | task_id="SEND_TWITTER_POST_NOTIFICATION", 28 | python_callable=udf.main, 29 | ) 30 | 31 | if __name__ == "__main__": 32 | dag.cli() 33 | 
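Stepping back to dags/app/team_registration_bot/udf.py above: _compose_discord_msg folds the (NAME, TICKET_NAME, COUNTS) rows returned by the BigQuery query into a total ticket count and an estimated income, looking each ticket name up in the ticket_price dict and falling back to 0 for names it does not know. The snippet below is a minimal standalone sketch of that roll-up; the rows and prices are made up for illustration and are not taken from the repository or from real sales data.

# Hypothetical rows shaped like the query result consumed by _compose_discord_msg
ticket_price = {
    "Individual - Regular Stage": 3790,  # assumed price, illustration only
    "Corporate - Regular Stage": 5800,   # assumed price, illustration only
}
payload = [
    ("PyCon TW 2025 Registration", "Individual - Regular Stage", 3),
    ("PyCon TW 2025 Registration", "Corporate - Regular Stage", 2),
    ("PyCon TW 2025 Registration", "Some Unlisted Ticket", 1),  # unknown name -> priced at 0, still counted
]

total = sum(counts for _, _, counts in payload)  # 6 tickets sold
total_income = sum(ticket_price.get(name, 0) * counts for _, name, counts in payload)  # 3*3790 + 2*5800 = 22970
print(f"sold {total} tickets, income TWD${total_income:,}")  # sold 6 tickets, income TWD$22,970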
-------------------------------------------------------------------------------- /dags/app/twitter_post_notification_bot/udf.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from airflow import settings 3 | from airflow.models import Variable 4 | from sqlalchemy.orm import sessionmaker 5 | 6 | 7 | def main() -> None: 8 | url = "https://twitter135.p.rapidapi.com/v2/UserTweets/" 9 | # 499339900 is PyConTW's twitter id 10 | querystring = {"id": "499339900", "count": "1"} 11 | headers = { 12 | "X-RapidAPI-Key": Variable.get("RAPIDAPIAPI_KEY"), 13 | "X-RapidAPI-Host": "twitter135.p.rapidapi.com", 14 | } 15 | webhook_url = Variable.get("DISCORD_POST_NOTIFICATION_WEBHOOK") 16 | response = requests.get(url, headers=headers, params=querystring) 17 | response_json = response.json() 18 | try: 19 | Session = sessionmaker(bind=settings.engine) 20 | # Update the variable using a context manager 21 | variable_key = "TWITTER_LATEST_REST_ID" 22 | rest_id = response_json["data"]["user"]["result"]["timeline_v2"]["timeline"][ 23 | "instructions" 24 | ][1]["entries"][0]["content"]["itemContent"]["tweet_results"]["result"][ 25 | "rest_id" 26 | ] 27 | full_text = response_json["data"]["user"]["result"]["timeline_v2"]["timeline"][ 28 | "instructions" 29 | ][1]["entries"][0]["content"]["itemContent"]["tweet_results"]["result"][ 30 | "legacy" 31 | ]["full_text"] 32 | rest_id_in_DB = Variable.get(variable_key) 33 | if rest_id_in_DB < rest_id: 34 | # Create a session 35 | session = Session() 36 | 37 | # Query the variable by key 38 | variable = session.query(Variable).filter_by(key=variable_key).first() 39 | 40 | # Update the variable value 41 | variable.set_val(rest_id) 42 | 43 | msg = f"new twitter post: https://twitter.com/PyConTW/status/{rest_id}\n\n{full_text}" 44 | requests.post( 45 | url=webhook_url, 46 | json={"username": "Twitter Post Notification", "content": msg}, 47 | ) 48 | 49 | # Commit the changes to the database 50 | session.commit() 51 | 52 | # Close the session 53 | session.close() 54 | except Exception: 55 | requests.post( 56 | url=webhook_url, 57 | json={ 58 | "username": "Twitter Post Notification", 59 | "content": str(response_json), 60 | }, 61 | ) 62 | -------------------------------------------------------------------------------- /dags/dwd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/dags/dwd/__init__.py -------------------------------------------------------------------------------- /dags/dws/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/dags/dws/__init__.py -------------------------------------------------------------------------------- /dags/ods/fb_post_insights/dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.python_operator import PythonOperator 5 | from ods.fb_post_insights import udfs 6 | 7 | DEFAULT_ARGS = { 8 | "owner": "CHWan", 9 | "depends_on_past": False, 10 | "start_date": datetime(2023, 6, 14, 0), 11 | "retries": 2, 12 | "retry_delay": timedelta(minutes=5), 13 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 14 | } 15 | dag = DAG( 16 | "FB_POST_INSIGHTS_V1", 17 | 
default_args=DEFAULT_ARGS, 18 | schedule_interval="5 8 * * *", 19 | max_active_runs=1, 20 | catchup=False, 21 | ) 22 | with dag: 23 | CREATE_TABLE_IF_NEEDED = PythonOperator( 24 | task_id="CREATE_TABLE_IF_NEEDED", 25 | python_callable=udfs.create_table_if_needed, 26 | ) 27 | 28 | SAVE_FB_POSTS_AND_INSIGHTS = PythonOperator( 29 | task_id="SAVE_FB_POSTS_AND_INSIGHTS", 30 | python_callable=udfs.save_fb_posts_and_insights, 31 | ) 32 | 33 | CREATE_TABLE_IF_NEEDED >> SAVE_FB_POSTS_AND_INSIGHTS 34 | 35 | 36 | if __name__ == "__main__": 37 | dag.cli() 38 | -------------------------------------------------------------------------------- /dags/ods/fb_post_insights/udfs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from datetime import datetime 4 | from typing import List, Optional 5 | 6 | import requests 7 | from airflow.models import Variable 8 | from google.cloud import bigquery 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def create_table_if_needed() -> None: 14 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 15 | post_sql = """ 16 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_fb_posts` ( 17 | id STRING, 18 | created_at TIMESTAMP, 19 | message STRING 20 | ) 21 | """ 22 | client.query(post_sql) 23 | insights_sql = """ 24 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_fb_posts_insights` ( 25 | post_id STRING, 26 | query_time TIMESTAMP, 27 | comments INTEGER, 28 | reactions INTEGER, 29 | share INTEGER 30 | ) 31 | """ 32 | client.query(insights_sql) 33 | 34 | 35 | def save_fb_posts_and_insights() -> None: 36 | posts = request_posts_data() 37 | 38 | last_post = query_last_post() 39 | if last_post is None: 40 | new_posts = posts 41 | else: 42 | new_posts = [ 43 | post 44 | for post in posts 45 | if datetime.strptime( 46 | post["created_time"], "%Y-%m-%dT%H:%M:%S%z" 47 | ).timestamp() 48 | > last_post["created_at"].timestamp() 49 | ] 50 | 51 | if not dump_posts_to_bigquery( 52 | [ 53 | { 54 | "id": post["id"], 55 | "created_at": convert_fb_time(post["created_time"]), 56 | "message": post.get("message", "No message found"), 57 | } 58 | for post in new_posts 59 | ] 60 | ): 61 | raise RuntimeError("Failed to dump posts to BigQuery") 62 | 63 | if not dump_posts_insights_to_bigquery( 64 | [ 65 | { 66 | "post_id": post["id"], 67 | "query_time": datetime.now().timestamp(), 68 | "comments": post["comments"]["summary"]["total_count"], 69 | "reactions": post["reactions"]["summary"]["total_count"], 70 | "share": post.get("shares", {}).get("count", 0), 71 | } 72 | for post in posts 73 | ] 74 | ): 75 | raise RuntimeError("Failed to dump posts insights to BigQuery") 76 | 77 | 78 | def query_last_post() -> Optional[dict]: 79 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 80 | sql = """ 81 | SELECT 82 | created_at 83 | FROM 84 | `pycontw-225217.ods.ods_pycontw_fb_posts` 85 | ORDER BY 86 | created_at DESC 87 | LIMIT 1 88 | """ 89 | result = client.query(sql) 90 | data = list(result) 91 | return data[0] if data else None 92 | 93 | 94 | def request_posts_data() -> List[dict]: 95 | url = "https://graph.facebook.com/v20.0/160712400714277/feed/" 96 | # 160712400714277 is PyConTW's fb id 97 | access_token = Variable.get("FB_ACCESS_KEY") 98 | headers = {"Content-Type": "application/json"} 99 | params = { 100 | "fields": "id,created_time,message,comments.summary(true),reactions.summary(true),shares", 101 | "access_token": access_token, 102 | } 103 | response = 
requests.get(url, headers=headers, params=params) 104 | if response.ok: 105 | return response.json()["data"] 106 | raise RuntimeError(f"Failed to fetch posts data: {response.text}") 107 | 108 | 109 | def dump_posts_to_bigquery(posts: List[dict]) -> bool: 110 | if not posts: 111 | logger.info("No posts to dump!") 112 | return True 113 | 114 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 115 | job_config = bigquery.LoadJobConfig( 116 | schema=[ 117 | bigquery.SchemaField("id", "STRING", mode="REQUIRED"), 118 | bigquery.SchemaField("created_at", "TIMESTAMP", mode="REQUIRED"), 119 | bigquery.SchemaField("message", "STRING", mode="REQUIRED"), 120 | ], 121 | write_disposition="WRITE_APPEND", 122 | ) 123 | try: 124 | job = client.load_table_from_json( 125 | posts, 126 | "pycontw-225217.ods.ods_pycontw_fb_posts", 127 | job_config=job_config, 128 | ) 129 | job.result() 130 | return True 131 | except Exception as e: 132 | logger.error(f"Failed to dump posts to BigQuery: {e}", exc_info=True) 133 | return False 134 | 135 | 136 | def dump_posts_insights_to_bigquery(posts: List[dict]) -> bool: 137 | if not posts: 138 | logger.info("No post insights to dump!") 139 | return True 140 | 141 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 142 | job_config = bigquery.LoadJobConfig( 143 | schema=[ 144 | bigquery.SchemaField("post_id", "STRING", mode="REQUIRED"), 145 | bigquery.SchemaField("query_time", "TIMESTAMP", mode="REQUIRED"), 146 | bigquery.SchemaField("comments", "INTEGER", mode="NULLABLE"), 147 | bigquery.SchemaField("reactions", "INTEGER", mode="NULLABLE"), 148 | bigquery.SchemaField("share", "INTEGER", mode="NULLABLE"), 149 | ], 150 | write_disposition="WRITE_APPEND", 151 | ) 152 | try: 153 | job = client.load_table_from_json( 154 | posts, 155 | "pycontw-225217.ods.ods_pycontw_fb_posts_insights", 156 | job_config=job_config, 157 | ) 158 | job.result() 159 | return True 160 | except Exception as e: 161 | logger.error(f"Failed to dump posts insights to BigQuery: {e}", exc_info=True) 162 | return False 163 | 164 | 165 | def convert_fb_time(time_string: str) -> str: 166 | return ( 167 | datetime.strptime(time_string, "%Y-%m-%dT%H:%M:%S%z") 168 | .strftime("%Y-%m-%d %H:%M:%S%z") 169 | .replace("+0000", "UTC") 170 | ) 171 | -------------------------------------------------------------------------------- /dags/ods/google_search_console/dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Send Google Search Report to Discord 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from ods.google_search_console.udfs.google_search import GoogleSearchConsoleReporter 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "davidtnfsh", 13 | "depends_on_past": False, 14 | "start_date": datetime(2020, 12, 9), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Discord", 18 | } 19 | dag = DAG( 20 | "GOOGLE_SEARCH_REPORT", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval=timedelta(weeks=2), 23 | max_active_runs=1, 24 | catchup=False, 25 | ) 26 | GOOGLE_SEARCH_REPORTER = GoogleSearchConsoleReporter() 27 | with dag: 28 | GET_AND_SEND_REPORT = PythonOperator( 29 | task_id="GET_AND_SEND_REPORT", 30 | python_callable=GOOGLE_SEARCH_REPORTER.main, 31 | ) 32 | 33 | if __name__ == "__main__": 34 | dag.cli() 35 | 
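The convert_fb_time helper in dags/ods/fb_post_insights/udfs.py above parses Facebook's ISO-8601 created_time values (for example 2024-06-01T12:34:56+0000) and re-renders them with a space between date and time, turning the +0000 offset into the literal string UTC. Below is a minimal standalone check of that behaviour, assuming a timestamp with a +0000 offset; it is an illustrative sketch, not one of the tests under tests/.

from datetime import datetime

def convert_fb_time(time_string: str) -> str:
    # Same logic as the helper in dags/ods/fb_post_insights/udfs.py
    return (
        datetime.strptime(time_string, "%Y-%m-%dT%H:%M:%S%z")
        .strftime("%Y-%m-%d %H:%M:%S%z")
        .replace("+0000", "UTC")
    )

# Note there is no space before "UTC": the offset token is replaced in place.
assert convert_fb_time("2024-06-01T12:34:56+0000") == "2024-06-01 12:34:56UTC"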
-------------------------------------------------------------------------------- /dags/ods/google_search_console/udfs/google_search.py: -------------------------------------------------------------------------------- 1 | import heapq 2 | import os 3 | from pathlib import Path 4 | 5 | import requests 6 | import searchconsole 7 | 8 | TOPK = 5 9 | 10 | AIRFLOW_HOME = os.getenv("AIRFLOW_HOME") 11 | 12 | 13 | class GoogleSearchConsoleReporter: 14 | def __init__(self): 15 | self.top_k_ctr = [] 16 | self.top_k_position = [] 17 | self.top_k_clicks = [] 18 | self.top_k_impressions = [] 19 | for top_k_heap in ( 20 | self.top_k_ctr, 21 | self.top_k_position, 22 | self.top_k_clicks, 23 | self.top_k_impressions, 24 | ): 25 | heapq.heapify(top_k_heap) 26 | self.report = None 27 | 28 | def main(self): 29 | report_msg = self._get_report() 30 | self._send_report(report_msg) 31 | 32 | def _get_report(self): 33 | client_config_path = ( 34 | Path(AIRFLOW_HOME) / "dags/client_secret_google_search_console.json" 35 | ) 36 | credentials_path = ( 37 | Path(AIRFLOW_HOME) 38 | / "dags/client_secret_google_search_console_serialized.json" 39 | ) 40 | account = searchconsole.authenticate( 41 | client_config=client_config_path, 42 | credentials=credentials_path, 43 | ) 44 | webproperty = account["https://tw.pycon.org/"] 45 | return webproperty.query.range("today", days=-7).dimension("query").get() 46 | 47 | def _send_report(self, report_msg): 48 | self._maitain_topk_heap(report_msg) 49 | msg_heap_dict = { 50 | f"透過 google 搜尋點進 PyConTW 官網的所有關鍵字中,ctr 最高的前{TOPK}名關鍵字": self.top_k_ctr, 51 | f"透過 google 搜尋點進 PyConTW 官網的所有關鍵字中,官網排名位置越靠前的前{TOPK}名": self.top_k_position, 52 | f"透過 google 搜尋點進 PyConTW 官網的所有關鍵字中,clicks 數最高的前{TOPK}名關鍵字": self.top_k_clicks, 53 | f"透過 google 搜尋點進 PyConTW 官網的所有關鍵字中,impressions 數最高的前{TOPK}名關鍵字": self.top_k_impressions, 54 | } 55 | 56 | for msg, heap in msg_heap_dict.items(): 57 | self._send_msg_to_discord(msg, heap) 58 | self._send_team_msg() 59 | 60 | def _maitain_topk_heap(self, report_msg): 61 | def heappush(heap, item, topk): 62 | heapq.heappush(heap, item) 63 | while len(heap) > topk: 64 | heapq.heappop(heap) 65 | 66 | for row in report_msg.rows: 67 | heappush(self.top_k_ctr, (row.ctr, row.query), TOPK) 68 | heappush(self.top_k_position, (-row.position, row.query), TOPK) 69 | heappush(self.top_k_clicks, (row.clicks, row.query), TOPK) 70 | heappush(self.top_k_impressions, (row.impressions, row.query), TOPK) 71 | 72 | @staticmethod 73 | def _send_msg_to_discord(msg, heap): 74 | def get_topk_from_heap(heap): 75 | def turn_negative_back_to_positive_int(heap): 76 | return [(num if num >= 0 else -num, query) for num, query in heap] 77 | 78 | return turn_negative_back_to_positive_int(sorted(heap, key=lambda x: -x[0])) 79 | 80 | def format_heap_content(topk_heap): 81 | return "\n".join([f'"{query}"\t{num}' for num, query in topk_heap]) 82 | 83 | topk_heap = get_topk_from_heap(heap) 84 | formatted_heap_content = format_heap_content(topk_heap) 85 | requests.post( 86 | os.getenv("DISCORD_WEBHOOK"), 87 | json={ 88 | "username": "Data Team 雙週報", 89 | "content": f"{msg}:\n {formatted_heap_content}\n----------------------\n", 90 | }, 91 | ) 92 | 93 | @staticmethod 94 | def _send_team_msg(): 95 | requests.post( 96 | os.getenv("DISCORD_WEBHOOK"), 97 | json={ 98 | "username": "Data Team 雙週報", 99 | "content": "有任何問題,歡迎敲 data team 任何一位成員~", 100 | }, 101 | ) 102 | 103 | 104 | if __name__ == "__main__": 105 | g = GoogleSearchConsoleReporter() 106 | g.main() 107 | 
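The _maitain_topk_heap method above keeps only the TOPK best entries per metric: each (value, query) pair is pushed onto a min-heap, and whenever the heap grows past TOPK the smallest element is popped, so the K largest values survive; for position, where a lower number is better, the value is pushed negated and flipped back to positive before reporting. A standalone sketch of that bounded-heap pattern with made-up numbers:

import heapq

TOPK = 3

def heappush_topk(heap, item, topk):
    # Same pattern as GoogleSearchConsoleReporter._maitain_topk_heap: keep only the topk largest items
    heapq.heappush(heap, item)
    while len(heap) > topk:
        heapq.heappop(heap)  # discard the current minimum

clicks_heap = []
for clicks, query in [(12, "pycon"), (3, "python conference"), (40, "pycon tw"), (7, "pycontw 2020")]:
    heappush_topk(clicks_heap, (clicks, query), TOPK)
print(sorted(clicks_heap, reverse=True))
# [(40, 'pycon tw'), (12, 'pycon'), (7, 'pycontw 2020')]

position_heap = []  # lower position is better, so push the negated value
for position, query in [(1.2, "pycon tw"), (8.5, "python conference"), (3.4, "pycon")]:
    heappush_topk(position_heap, (-position, query), TOPK)
print([(-neg_position, query) for neg_position, query in sorted(position_heap, reverse=True)])
# [(1.2, 'pycon tw'), (3.4, 'pycon'), (8.5, 'python conference')]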
-------------------------------------------------------------------------------- /dags/ods/ig_post_insights/dags.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.python_operator import PythonOperator 5 | from ods.ig_post_insights import udfs 6 | 7 | DEFAULT_ARGS = { 8 | "owner": "Angus Yang", 9 | "depends_on_past": False, 10 | "start_date": datetime(2023, 6, 14, 0), 11 | "retries": 2, 12 | "retry_delay": timedelta(minutes=5), 13 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 14 | } 15 | dag = DAG( 16 | "IG_POST_INSIGHTS_V1", 17 | default_args=DEFAULT_ARGS, 18 | schedule_interval="5 8 * * *", 19 | max_active_runs=1, 20 | catchup=False, 21 | ) 22 | with dag: 23 | CREATE_TABLE_IF_NEEDED = PythonOperator( 24 | task_id="CREATE_TABLE_IF_NEEDED", 25 | python_callable=udfs.create_table_if_needed, 26 | ) 27 | 28 | SAVE_TWITTER_POSTS_AND_INSIGHTS = PythonOperator( 29 | task_id="SAVE_IG_POSTS_AND_INSIGHTS", 30 | python_callable=udfs.save_posts_and_insights, 31 | ) 32 | 33 | CREATE_TABLE_IF_NEEDED >> SAVE_TWITTER_POSTS_AND_INSIGHTS 34 | 35 | 36 | if __name__ == "__main__": 37 | dag.cli() 38 | -------------------------------------------------------------------------------- /dags/ods/ig_post_insights/udfs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from datetime import datetime 4 | from typing import List, Optional 5 | 6 | import requests 7 | from airflow.models import Variable 8 | from google.cloud import bigquery 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | # IG API docs 14 | # https://developers.facebook.com/docs/instagram-api/reference/ig-user/media?locale=zh_TW 15 | # https://developers.facebook.com/docs/instagram-api/reference/ig-media 16 | 17 | # // get list of media-id 18 | # GET /v20.0/{page-id}/media/?access_token={access_token} 19 | 20 | # // get media detail 21 | # GET /v20.0/{media-id}?access_token={access_token}&fields=id,media_type,caption,timestamp,comments_count,like_count 22 | 23 | # PyConTW IG page-id: 17841405043609765 24 | # ps. 
IG api 目前不提供分享數, 所以只有點讚數和留言數 25 | 26 | # Access Token 27 | # Check Henry 28 | 29 | 30 | def create_table_if_needed() -> None: 31 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 32 | post_sql = """ 33 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_ig_posts` ( 34 | id STRING, 35 | created_at TIMESTAMP, 36 | message STRING 37 | ) 38 | """ 39 | client.query(post_sql) 40 | insights_sql = """ 41 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_ig_posts_insights` ( 42 | post_id STRING, 43 | query_time TIMESTAMP, 44 | period STRING, 45 | favorite INTEGER, 46 | reply INTEGER, 47 | retweet INTEGER, 48 | views INTEGER 49 | ) 50 | """ 51 | client.query(insights_sql) 52 | 53 | 54 | def save_posts_and_insights() -> None: 55 | posts = request_posts_data() 56 | 57 | last_post = query_last_post() 58 | new_posts = ( 59 | [ 60 | post 61 | for post in posts 62 | if post["timestamp"] > last_post["created_at"].timestamp() 63 | ] 64 | if last_post 65 | else posts 66 | ) 67 | 68 | if not dump_posts_to_bigquery( 69 | [ 70 | { 71 | "id": post["id"], 72 | "created_at": post["timestamp"], 73 | "message": post["caption"], 74 | } 75 | for post in new_posts 76 | ] 77 | ): 78 | raise RuntimeError("Failed to dump posts to BigQuery") 79 | 80 | if not dump_posts_insights_to_bigquery( 81 | [ 82 | { 83 | "post_id": post["id"], 84 | "query_time": datetime.now().timestamp(), 85 | "period": "lifetime", 86 | "favorite": post["like_count"], 87 | "reply": post["comments_count"], 88 | "retweet": "0", # API not supported 89 | "views": "0", # API not supported 90 | } 91 | for post in posts 92 | ] 93 | ): 94 | raise RuntimeError("Failed to dump posts insights to BigQuery") 95 | 96 | 97 | def query_last_post() -> Optional[dict]: 98 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 99 | sql = """ 100 | SELECT 101 | created_at 102 | FROM 103 | `pycontw-225217.ods.ods_pycontw_ig_posts` 104 | ORDER BY 105 | created_at DESC 106 | LIMIT 1 107 | """ 108 | result = client.query(sql) 109 | data = list(result) 110 | return data[0] if data else None 111 | 112 | 113 | def request_posts_data() -> List[dict]: 114 | media_list_url = "https://graph.facebook.com/v20.0/17841405043609765/media" 115 | querystring = {"access_token": Variable.get("IG_ACCESS_TOKEN"), "limit": "0"} 116 | headers = {"Content-Type": "application/json"} 117 | 118 | response = requests.get( 119 | media_list_url, headers=headers, params=querystring, timeout=180 120 | ) 121 | if not response.ok: 122 | raise RuntimeError(f"Failed to fetch posts data: {response.text}") 123 | media_list = response.json()["data"] 124 | 125 | media_insight_list = [] 126 | 127 | for media in media_list: 128 | media_insight_url = f"https://graph.facebook.com/v20.0/{media['id']}" 129 | querystring = { 130 | "access_token": Variable.get("IG_ACCESS_TOKEN"), 131 | "fields": "id,media_type,caption,timestamp,comments_count,like_count", 132 | } 133 | response = requests.get( 134 | media_insight_url, headers=headers, params=querystring, timeout=180 135 | ) 136 | if not response.ok: 137 | raise RuntimeError(f"Failed to fetch posts data: {response.text}") 138 | 139 | media_insight = {} 140 | media_res: dict = response.json() 141 | # Error handling, the response may not include the required fields, media id: 17889558458829258, no "caption" 142 | media_insight["id"] = media_res.get("id", "0") 143 | media_insight["timestamp"] = datetime.strptime( 144 | media_res.get("timestamp", "0"), "%Y-%m-%dT%H:%M:%S%z" 145 | ).timestamp() 146 | media_insight["caption"] = 
media_res.get("caption", "No Content") 147 | media_insight["comments_count"] = media_res.get("comments_count", "0") 148 | media_insight["like_count"] = media_res.get("like_count", "0") 149 | media_insight["media_type"] = media_res.get("media_type", "No Content") 150 | 151 | # print(media_insight) 152 | media_insight_list.append(media_insight) 153 | 154 | return media_insight_list 155 | 156 | 157 | def dump_posts_to_bigquery(posts: List[dict]) -> bool: 158 | if not posts: 159 | logger.info("No posts to dump!") 160 | return True 161 | 162 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 163 | job_config = bigquery.LoadJobConfig( 164 | schema=[ 165 | bigquery.SchemaField("id", "STRING", mode="REQUIRED"), 166 | bigquery.SchemaField("created_at", "TIMESTAMP", mode="REQUIRED"), 167 | bigquery.SchemaField("message", "STRING", mode="REQUIRED"), 168 | ], 169 | write_disposition="WRITE_APPEND", 170 | ) 171 | try: 172 | job = client.load_table_from_json( 173 | posts, 174 | "pycontw-225217.ods.ods_pycontw_ig_posts", 175 | job_config=job_config, 176 | ) 177 | job.result() 178 | return True 179 | except Exception as e: 180 | logger.error(f"Failed to dump posts to BigQuery: {e}", exc_info=True) 181 | return False 182 | 183 | 184 | def dump_posts_insights_to_bigquery(posts: List[dict]) -> bool: 185 | if not posts: 186 | logger.info("No post insights to dump!") 187 | return True 188 | 189 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 190 | job_config = bigquery.LoadJobConfig( 191 | schema=[ 192 | bigquery.SchemaField("post_id", "STRING", mode="REQUIRED"), 193 | bigquery.SchemaField("query_time", "TIMESTAMP", mode="REQUIRED"), 194 | bigquery.SchemaField("period", "STRING", mode="REQUIRED"), 195 | bigquery.SchemaField("favorite", "INTEGER", mode="NULLABLE"), 196 | bigquery.SchemaField("reply", "INTEGER", mode="NULLABLE"), 197 | bigquery.SchemaField("retweet", "INTEGER", mode="NULLABLE"), 198 | bigquery.SchemaField("views", "INTEGER", mode="NULLABLE"), 199 | ], 200 | write_disposition="WRITE_APPEND", 201 | ) 202 | try: 203 | job = client.load_table_from_json( 204 | posts, 205 | "pycontw-225217.ods.ods_pycontw_ig_posts_insights", 206 | job_config=job_config, 207 | ) 208 | job.result() 209 | return True 210 | except Exception as e: 211 | logger.error(f"Failed to dump posts insights to BigQuery: {e}", exc_info=True) 212 | return False 213 | 214 | 215 | def test_main(): 216 | create_table_if_needed() 217 | 218 | save_posts_and_insights() 219 | 220 | 221 | if __name__ == "__main__": 222 | test_main() 223 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/kktix_dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ingest KKTIX's data and load them to BigQuery every 5mins 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from ods.kktix_ticket_orders.udfs import bigquery_loader, kktix_api 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "davidtnfsh@gmail.com", 13 | "depends_on_past": False, 14 | "start_date": datetime(2024, 6, 16, 15), # 23 (+8) 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 18 | } 19 | dag = DAG( 20 | "KKTIX_TICKET_ORDERS_V10", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="50 * * * *", 23 | max_active_runs=1, 24 | catchup=True, 25 | ) 26 | with dag: 27 | 
CREATE_TABLE_IF_NEEDED = PythonOperator( 28 | task_id="CREATE_TABLE_IF_NEEDED", 29 | python_callable=bigquery_loader.create_table_if_needed, 30 | ) 31 | 32 | GET_ATTENDEE_INFOS = PythonOperator( 33 | task_id="GET_ATTENDEE_INFOS", 34 | python_callable=kktix_api.main, 35 | provide_context=True, 36 | ) 37 | 38 | CREATE_TABLE_IF_NEEDED >> GET_ATTENDEE_INFOS 39 | 40 | if __name__ == "__main__": 41 | dag.cli() 42 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/kktix_refund_dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Update KKTIX's data if attendee has been refunded 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from ods.kktix_ticket_orders.udfs import kktix_refund 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "henry410213028@gmail.com", 13 | "depends_on_past": False, 14 | "start_date": datetime(2024, 6, 18, 0), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 18 | } 19 | dag = DAG( 20 | "KKTIX_TICKET_REFUND_V3", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="50 23 * * *", # At 23:50 (everyday) 23 | max_active_runs=1, 24 | catchup=True, 25 | ) 26 | with dag: 27 | UPDATE_REFUNDED_ATTENDEE_IDS = PythonOperator( 28 | task_id="UPDATE_REFUNDED_ATTENDEE_IDS", 29 | python_callable=kktix_refund.main, 30 | ) 31 | 32 | UPDATE_REFUNDED_ATTENDEE_IDS 33 | 34 | if __name__ == "__main__": 35 | dag.cli() 36 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/klaviyo_backfill_dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ingest KKTIX's daily data and load them to Mailer 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from ods.kktix_ticket_orders.udfs import batch_kktix2mailer 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "henry410213028@gmail.com", 13 | "depends_on_past": False, 14 | "start_date": datetime(2022, 8, 29), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 18 | } 19 | dag = DAG( 20 | "KLAVIYO_SEND_MAIL_V3", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="0 * * * *", 23 | max_active_runs=1, 24 | catchup=True, 25 | ) 26 | with dag: 27 | GET_ATTENDEE_INFOS = PythonOperator( 28 | task_id="GET_ATTENDEE_INFOS", 29 | python_callable=batch_kktix2mailer.main, 30 | provide_context=True, 31 | ) 32 | 33 | GET_ATTENDEE_INFOS 34 | 35 | if __name__ == "__main__": 36 | dag.cli() 37 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/sqls/create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS `{}` 2 | ( 3 | ID INT64 NOT NULL, 4 | NAME STRING NOT NULL, 5 | ATTENDEE_INFO STRING NOT NULL 6 | ); 7 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/batch_kktix2mailer.py: -------------------------------------------------------------------------------- 1 | from dateutil.parser import parse 2 | from ods.kktix_ticket_orders.udfs import kktix_api, klaviyo_loader 3 | 4 | 5 | def 
main(**context): 6 | """ 7 | Extract user info from kktix api and load to mailer 8 | """ 9 | schedule_interval = context["dag"].schedule_interval 10 | # If we change the schedule_interval, we need to update the logic in condition_filter_callback 11 | assert schedule_interval == "0 * * * *" # nosec 12 | ts_datetime_obj = parse(context["ts"]) 13 | year = ts_datetime_obj.year 14 | timestamp = ts_datetime_obj.timestamp() 15 | event_raw_data_array = kktix_api._extract( 16 | year=year, 17 | timestamp=timestamp, 18 | ) 19 | # load name and email to mailer before data has been hashed 20 | klaviyo_loader.load(event_raw_data_array) 21 | print(f"Batch load {len(event_raw_data_array)} data to downstream task") 22 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/bigquery_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | from google.cloud import bigquery 5 | 6 | TABLE = f"{os.getenv('BIGQUERY_PROJECT', 'pycontw-225217')}.ods.ods_kktix_attendeeId_datetime" 7 | # since backfill would insert duplicate records, we need this dedupe to make it idempotent 8 | DEDUPE_SQL = f""" 9 | CREATE OR REPLACE TABLE 10 | `{TABLE}` AS 11 | SELECT 12 | DISTINCT * 13 | FROM 14 | `{TABLE}` 15 | """ # nosec 16 | AIRFLOW_HOME = os.getenv("AIRFLOW_HOME") 17 | 18 | 19 | def create_table_if_needed() -> None: 20 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 21 | sql_filepath = ( 22 | Path(AIRFLOW_HOME) / "dags/ods/kktix_ticket_orders/sqls/create_table.sql" 23 | ) 24 | sql = sql_filepath.read_text().format(TABLE) 25 | client.query(sql) 26 | client.query(DEDUPE_SQL) 27 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/gather_town_loader.py: -------------------------------------------------------------------------------- 1 | """ 2 | load user's name, email etc into gather town whitelist 3 | please refer to this document for details: https://hackmd.io/PM_sWO5USo6dxMqT1uCrCQ?view 4 | """ 5 | 6 | import requests 7 | import tenacity 8 | from airflow.hooks.http_hook import HttpHook 9 | from airflow.models import Variable 10 | 11 | RETRY_ARGS = dict( 12 | wait=tenacity.wait_none(), 13 | stop=tenacity.stop_after_attempt(3), 14 | retry=tenacity.retry_if_exception_type(requests.exceptions.ConnectionError), 15 | ) 16 | 17 | GATHERTOWN_HTTP_HOOK = HttpHook(http_conn_id="gathertown_api", method="POST") 18 | 19 | 20 | def load(**context): 21 | event_raw_data_array = context["ti"].xcom_pull(task_ids="GET_ATTENDEE_INFOS") 22 | for event_raw_data in event_raw_data_array: 23 | resp = GATHERTOWN_HTTP_HOOK.run_with_advanced_retry( 24 | endpoint="/api/setEmailGuestlist", 25 | _retry_args=RETRY_ARGS, 26 | json={ 27 | "spaceId": Variable.get("gather_town_space_id"), 28 | "apiKey": Variable.get("gather_town_api_key"), 29 | "guestlist": { 30 | event_raw_data["聯絡人 Email"]: { 31 | "name": "", 32 | "role": "guest", 33 | "affiliation": "Attendee", 34 | } 35 | }, 36 | }, 37 | headers={"Accept": "application/json", "Content-Type": "application/json"}, 38 | ).json() 39 | print(resp) 40 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/kktix_api.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Callable, Dict, List 3 | 4 | import requests 5 | import tenacity 6 | from 
airflow.hooks.http_hook import HttpHook 7 | from airflow.models import Variable 8 | from dateutil.parser import parse 9 | from ods.kktix_ticket_orders.udfs import kktix_loader, kktix_transformer 10 | 11 | SCHEDULE_INTERVAL_SECONDS: int = 3600 12 | HTTP_HOOK = HttpHook(http_conn_id="kktix_api", method="GET") 13 | RETRY_ARGS = dict( 14 | wait=tenacity.wait_none(), 15 | stop=tenacity.stop_after_attempt(3), 16 | retry=tenacity.retry_if_exception_type(requests.exceptions.ConnectionError), 17 | ) 18 | 19 | 20 | def main(**context): 21 | """ 22 | ETL pipeline should consists of extract, transform and load 23 | """ 24 | schedule_interval = context["dag"].schedule_interval 25 | # If we change the schedule_interval, we need to update the logic in condition_filter_callback 26 | assert schedule_interval == "50 * * * *" # nosec 27 | ts_datetime_obj = parse(context["ts"]) 28 | year = ts_datetime_obj.year 29 | timestamp = ts_datetime_obj.timestamp() 30 | event_raw_data_array = _extract( 31 | year=year, 32 | timestamp=timestamp, 33 | ) 34 | transformed_event_raw_data_array = kktix_transformer.transform( 35 | copy.deepcopy(event_raw_data_array) 36 | ) 37 | kktix_loader.load(transformed_event_raw_data_array) 38 | print(f"Loaded {len(transformed_event_raw_data_array)} rows to BigQuery!") 39 | 40 | # pass these unhashed data through xcom to next airflow task 41 | return kktix_transformer._extract_sensitive_unhashed_raw_data( 42 | copy.deepcopy(event_raw_data_array) 43 | ) 44 | 45 | 46 | def _extract(year: int, timestamp: float) -> List[Dict]: 47 | """ 48 | get data from KKTIX's API 49 | 1. condition_filter_callb: use this callbacl to filter out unwanted event! 50 | 2. right now schedule_interval_seconds is a hardcoded value! 51 | """ 52 | event_raw_data_array: List[Dict] = [] 53 | 54 | def _condition_filter_callback(event): 55 | return str(year) in event["name"] and "registration" in event["name"].lower() 56 | 57 | event_metadatas = get_event_metadatas(_condition_filter_callback) 58 | for event_metadata in event_metadatas: 59 | event_id = event_metadata["id"] 60 | for attendee_info in get_attendee_infos(event_id, timestamp): 61 | event_raw_data_array.append( 62 | { 63 | "id": event_id, 64 | "name": event_metadata["name"], 65 | "attendee_info": attendee_info, 66 | } 67 | ) 68 | return event_raw_data_array 69 | 70 | 71 | def get_attendee_infos(event_id: int, timestamp: float) -> List: 72 | """ 73 | it's a public wrapper for people to get attendee infos! 
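    A minimal usage sketch (the event id and timestamp here are illustrative):

        attendee_infos = get_attendee_infos(event_id=12345, timestamp=1718496000.0)

    Only paid attendees whose `updated_at` falls within one hour after
    `timestamp` are returned; see `_get_attendee_infos` below.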
74 | """ 75 | attendance_book_id = _get_attendance_book_id(event_id) 76 | attendee_ids = _get_attendee_ids(event_id, attendance_book_id) 77 | attendee_infos = _get_attendee_infos(event_id, attendee_ids, timestamp) 78 | return attendee_infos 79 | 80 | 81 | def get_event_metadatas(condition_filter: Callable) -> List[Dict]: 82 | """ 83 | Fetch all the ongoing events 84 | """ 85 | event_list_resp = HTTP_HOOK.run_with_advanced_retry( 86 | endpoint=f"{Variable.get('kktix_events_endpoint')}?only_not_ended_event=true", 87 | _retry_args=RETRY_ARGS, 88 | ).json() 89 | event_metadatas: List[dict] = [] 90 | for event in event_list_resp["data"]: 91 | if condition_filter(event): 92 | event_metadatas.append(event) 93 | return event_metadatas 94 | 95 | 96 | def _get_attendance_book_id(event_id: int) -> int: 97 | """ 98 | Fetch attendance books 99 | """ 100 | attendance_books_resp = HTTP_HOOK.run_with_advanced_retry( 101 | endpoint=f"{Variable.get('kktix_events_endpoint')}/{event_id}/attendance_books?only_not_ended_event=true", 102 | _retry_args=RETRY_ARGS, 103 | ).json() 104 | return attendance_books_resp[0]["id"] 105 | 106 | 107 | def _get_attendee_ids(event_id: int, attendance_book_id: int) -> List[int]: 108 | """ 109 | get all attendee ids! 110 | """ 111 | attendee_ids = [] 112 | attendees_resp = HTTP_HOOK.run_with_advanced_retry( 113 | endpoint=f"{Variable.get('kktix_events_endpoint')}/{event_id}/attendance_books/{attendance_book_id}", 114 | _retry_args=RETRY_ARGS, 115 | ).json() 116 | for signin_status_tuple in attendees_resp["signin_status"]: 117 | attendee_ids.append(signin_status_tuple[0]) 118 | return attendee_ids 119 | 120 | 121 | def _get_attendee_infos( 122 | event_id: int, attendee_ids: List[int], timestamp: float 123 | ) -> List: 124 | """ 125 | get attendee infos, e.g. email, phonenumber, name and etc 126 | """ 127 | print( 128 | f"Fetching attendee infos between {timestamp} and {timestamp + SCHEDULE_INTERVAL_SECONDS}" 129 | ) 130 | attendee_infos = [] 131 | for attendee_id in attendee_ids: 132 | attendee_info = HTTP_HOOK.run_with_advanced_retry( 133 | endpoint=f"{Variable.get('kktix_events_endpoint')}/{event_id}/attendees/{attendee_id}", 134 | _retry_args=RETRY_ARGS, 135 | ).json() 136 | if not attendee_info["is_paid"]: 137 | continue 138 | if ( 139 | timestamp 140 | < attendee_info["updated_at"] 141 | < timestamp + SCHEDULE_INTERVAL_SECONDS 142 | ): 143 | attendee_infos.append(attendee_info) 144 | return attendee_infos 145 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/kktix_loader.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import os 4 | from typing import Dict, List 5 | 6 | import pandas as pd 7 | from google.cloud import bigquery 8 | from ods.kktix_ticket_orders.udfs import kktix_bq_dwd_etl 9 | from ods.kktix_ticket_orders.udfs.bigquery_loader import TABLE 10 | 11 | SCHEMA = [ 12 | bigquery.SchemaField("id", "INTEGER", mode="REQUIRED"), 13 | bigquery.SchemaField("name", "STRING", mode="REQUIRED"), 14 | bigquery.SchemaField("attendee_info", "STRING", mode="REQUIRED"), 15 | bigquery.SchemaField("refunded", "BOOLEAN", mode="REQUIRED"), 16 | ] 17 | JOB_CONFIG = bigquery.LoadJobConfig(schema=SCHEMA) 18 | 19 | 20 | def load(event_raw_data_array: List): 21 | """ 22 | load data into bigquery! 
23 | """ 24 | # data quality check 25 | if len(event_raw_data_array) == 0: 26 | print("Nothing to load, skip!") 27 | return 28 | payload = [] 29 | for event_raw_data in event_raw_data_array: 30 | sanitized_event_raw_data = _sanitize_payload(event_raw_data) 31 | payload.append(sanitized_event_raw_data) 32 | _load_to_bigquery(payload) 33 | _load_to_bigquery_dwd(payload) 34 | 35 | 36 | def _load_to_bigquery(payload: List[Dict]) -> None: 37 | """ 38 | load data to BigQuery's `TABLE` 39 | """ 40 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 41 | df = pd.DataFrame( 42 | payload, 43 | columns=["id", "name", "attendee_info"], 44 | ) 45 | # for now, these attendees haven't refunded our ticket, yet... 46 | # we don't know if they would refund down the road 47 | df["refunded"] = [False] * len(payload) 48 | job = client.load_table_from_dataframe(df, TABLE, job_config=JOB_CONFIG) 49 | job.result() 50 | 51 | 52 | def _load_to_bigquery_dwd(payload: List[Dict]) -> None: 53 | """ 54 | load data to BigQuery's `TABLE` 55 | """ 56 | # Spilt payload to dict lists by ticket group 57 | ticket_groups = ["corporate", "individual", "reserved"] 58 | dol = collections.defaultdict(list) 59 | for d in payload: 60 | for tg in ticket_groups: 61 | if tg in d["name"].lower(): 62 | dol[tg].append(d) 63 | 64 | print(dol[tg]) 65 | project_id = os.getenv("BIGQUERY_PROJECT") 66 | dataset_id = "dwd" 67 | for tg in ticket_groups: 68 | if len(dol[tg]) > 0: 69 | _, sanitized_df = kktix_bq_dwd_etl.load_to_df_from_list(dol[tg]) 70 | table_id = f"kktix_ticket_{tg}_attendees" 71 | kktix_bq_dwd_etl.upload_dataframe_to_bigquery( 72 | sanitized_df, project_id, dataset_id, table_id 73 | ) 74 | 75 | 76 | def _sanitize_payload(event_raw_data) -> Dict: 77 | """ 78 | BigQuery has some constraints for nested data type 79 | So we put out sanitization/data cleansing logic here! 80 | """ 81 | event_raw_data["attendee_info"] = json.dumps(event_raw_data["attendee_info"]) 82 | return event_raw_data 83 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/kktix_refund.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | from typing import List 4 | 5 | from google.cloud import bigquery 6 | from ods.kktix_ticket_orders.udfs.bigquery_loader import TABLE 7 | from ods.kktix_ticket_orders.udfs.kktix_api import ( 8 | _get_attendance_book_id, 9 | _get_attendee_ids, 10 | ) 11 | 12 | CLIENT = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 13 | 14 | 15 | def main() -> None: 16 | refunded_attendee_ids = _check_if_refunded_ticket_exists() 17 | if refunded_attendee_ids: 18 | _mark_tickets_as_refunded(refunded_attendee_ids) 19 | 20 | 21 | def _check_if_refunded_ticket_exists() -> List[int]: 22 | """ 23 | 1. iterate through all unrefunded tickets 24 | 2. build up a hash map 25 | 3. get the latest attendance book 26 | 4. 
compare the difference, the diff would be refunded attendee ids 27 | """ 28 | refunded_attendee_ids: List[int] = [] 29 | query_job = CLIENT.query( 30 | f""" 31 | SELECT 32 | ID, 33 | CAST(REPLACE(JSON_EXTRACT(ATTENDEE_INFO, 34 | '$.id'), '"', '') AS INT64) AS ATTENDEE_ID 35 | FROM 36 | `{TABLE}` 37 | WHERE 38 | REFUNDED IS NULL OR REFUNDED = FALSE 39 | """ # nosec 40 | ) 41 | event_ids_and_attendee_ids = query_job.result() 42 | 43 | bigquery_side_event_attendee_id_dict = defaultdict(list) 44 | for event_id, attendee_id in event_ids_and_attendee_ids: 45 | bigquery_side_event_attendee_id_dict[event_id].append(attendee_id) 46 | for ( 47 | event_id, 48 | outdated_latest_attendee_ids, 49 | ) in bigquery_side_event_attendee_id_dict.items(): 50 | attendance_book_id = _get_attendance_book_id(event_id) 51 | latest_attendee_ids = _get_attendee_ids(event_id, attendance_book_id) 52 | refunded_attendee_ids_in_this_event = set( 53 | outdated_latest_attendee_ids 54 | ).difference(set(latest_attendee_ids)) 55 | refunded_attendee_ids += list(refunded_attendee_ids_in_this_event) 56 | return refunded_attendee_ids 57 | 58 | 59 | def _mark_tickets_as_refunded(refunded_attendee_ids: List[int]) -> None: 60 | """ 61 | set these attendee info to refunded=true, if we cannot find its attendee_info right now by using KKTIX's API! 62 | """ 63 | query_job = CLIENT.query( 64 | f""" 65 | UPDATE 66 | `{TABLE}` 67 | SET 68 | refunded=TRUE 69 | WHERE 70 | CAST(REPLACE(JSON_EXTRACT(ATTENDEE_INFO, 71 | '$.id'), '"', '') AS INT64) in ({",".join(str(i) for i in refunded_attendee_ids)}) 72 | """ 73 | ) 74 | result = query_job.result() 75 | print(f"Result of _mark_tickets_as_refunded: {result}") 76 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/kktix_transformer.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from typing import Dict, List 3 | 4 | SENSITIVE_KEY_NAME_SET = { 5 | "聯絡人 姓名", 6 | "聯絡人 Email", 7 | "聯絡人 手機", 8 | "Address", 9 | } 10 | 11 | 12 | def transform(event_raw_data_array: List) -> List[Dict]: 13 | """ 14 | de-identify user's email in this block! 15 | """ 16 | for event in event_raw_data_array: 17 | attendee_info = event["attendee_info"] 18 | # search string contains personal information and it's unstructured. Therefore just drop it! 
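        # NOTE (illustrative sketch): after the drop below, every value whose
        # key matches SENSITIVE_KEY_NAME_SET is replaced in place with its
        # SHA-256 hex digest, essentially:
        #     hashlib.sha256(value.encode("utf-8")).hexdigest()
        # which yields a stable 64-character string, so downstream joins on
        # e.g. hashed emails keep working without exposing the raw value.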
19 | del attendee_info["search_string"] 20 | for index, (key, value) in enumerate(attendee_info["data"]): 21 | for key_should_be_hashed in SENSITIVE_KEY_NAME_SET: 22 | if key_should_be_hashed in key: 23 | hashed_value = hashlib.sha256(value.encode("utf-8")).hexdigest() 24 | attendee_info["data"][index][1] = hashed_value 25 | else: 26 | continue 27 | return event_raw_data_array 28 | 29 | 30 | def _extract_sensitive_unhashed_raw_data(event_raw_data_array: List) -> List[Dict]: 31 | """ 32 | only keep these data in xcom and pass them to next Airflow task 33 | """ 34 | sensitive_unhashed_raw_data_array = [] 35 | for event in event_raw_data_array: 36 | attendee_info = event["attendee_info"] 37 | payload = {} 38 | for key, value in attendee_info["data"]: 39 | if key in SENSITIVE_KEY_NAME_SET: 40 | payload[key] = value 41 | sensitive_unhashed_raw_data_array.append(payload) 42 | return sensitive_unhashed_raw_data_array 43 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/klaviyo_loader.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, List 2 | 3 | from airflow.models import Variable 4 | from ods.kktix_ticket_orders.udfs import klaviyo_mailer 5 | 6 | 7 | def _load_raw_data(event_raw_data_array: List) -> Iterable: 8 | for event in event_raw_data_array: 9 | attendee_info = event["attendee_info"] 10 | # search string contains personal information and it's unstructured. Therefore just drop it! 11 | del attendee_info["search_string"] 12 | tmp = { 13 | key: value 14 | for index, (key, value) in enumerate(attendee_info["data"]) 15 | if key in ("聯絡人 Email", "聯絡人 姓名") 16 | } 17 | tmp.update({"qrcode": attendee_info["qrcode"]}) 18 | yield tmp 19 | 20 | 21 | def load(event_raw_data_array: List) -> None: 22 | """ 23 | Send a notify mail for all participants via third-party service 24 | """ 25 | try: 26 | list_id = Variable.get("KLAVIYO_LIST_ID") 27 | campaign_id = Variable.get("KLAVIYO_CAMPAIGN_ID") 28 | except KeyError: 29 | print( 30 | "Skip klaviyo mailer, 'KLAVIYO_LIST_ID' or 'KLAVIYO_CAMPAIGN_ID' variable not found" 31 | ) 32 | return 33 | 34 | datas = [ 35 | { 36 | "email": item["聯絡人 Email"], 37 | "name": item["聯絡人 姓名"], 38 | "qrcode": item["qrcode"], 39 | } 40 | for item in _load_raw_data(event_raw_data_array) 41 | ] 42 | if not datas: 43 | print("Skip klaviyo mailer, no user profiles") 44 | return 45 | 46 | klaviyo_mailer.main( 47 | list_id=list_id, 48 | campaign_id=campaign_id, 49 | campaign_name="隨買即用", 50 | datas=datas, 51 | ) 52 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/klaviyo_mailer.py: -------------------------------------------------------------------------------- 1 | """Send a mail via Klaviyo 2 | 3 | Requirements: 4 | 5 | 1. Create a [Klaviyo List](https://www.klaviyo.com/lists) 6 | 7 | 2. 
Create a template [campaign](https://www.klaviyo.com/campaigns) and set the previous List as target recipients list 8 | 9 | """ 10 | 11 | from datetime import datetime 12 | from typing import List 13 | 14 | import requests 15 | import tenacity 16 | from airflow.hooks.http_hook import HttpHook 17 | from airflow.models import Variable 18 | 19 | SCHEDULE_INTERVAL_SECONDS: int = 300 20 | RETRY_ARGS = dict( 21 | wait=tenacity.wait_none(), 22 | stop=tenacity.stop_after_attempt(3), 23 | retry=tenacity.retry_if_exception_type(requests.exceptions.ConnectionError), 24 | ) 25 | 26 | 27 | def main( 28 | list_id: str, 29 | campaign_id: str, 30 | campaign_name: str, 31 | datas: List[dict], 32 | ): 33 | """ 34 | Args: 35 | list_id (str): Klaviyo list id, that will be save your target recipients 36 | campaign_id (str): A existed campaign you want to copy from 37 | campaign_name (str): A new campaign name 38 | datas (List[dict]): Recipient profile, example like below 39 | 40 | [ 41 | { 42 | "email": "foo@example.com", 43 | "name": "Foo", 44 | "property1": "value1", 45 | "property2": "value2", 46 | }, 47 | { 48 | "email": "bar@example.com", 49 | "name": "Bar", 50 | "property1": "value1", 51 | "property2": "value2", 52 | }, 53 | ] 54 | """ 55 | # check list and compaign existed 56 | assert _klaviyo_get_list_info(list_id) 57 | assert _klaviyo_get_campaign_info(campaign_id) 58 | 59 | # update list members 60 | existed_members = _klaviyo_get_list_members(list_id)["records"] 61 | if existed_members: 62 | _klaviyo_remove_list_members( 63 | list_id, body={"emails": list(map(lambda x: x["email"], existed_members))} 64 | ) 65 | 66 | _klaviyo_add_list_members(list_id, body={"profiles": datas}) 67 | new_members = _klaviyo_get_list_members(list_id)["records"] 68 | assert new_members 69 | 70 | # create a new compaign and send mail immediately 71 | campaign_suffix = f"{datetime.now():%Y-%m-%d_%H:%M:%S}" 72 | response = _klaviyo_clone_campaign( 73 | campaign_id, 74 | name=f"{campaign_name}_{campaign_suffix}", 75 | list_id=list_id, 76 | ) 77 | new_campaign_id = response["id"] 78 | _klaviyo_send_campaign(new_campaign_id) 79 | print(f"Send {len(new_members)} Mails") 80 | 81 | 82 | def _klaviyo_get_list_info(list_id: str) -> dict: 83 | HTTP_HOOK = HttpHook(http_conn_id="klaviyo_api", method="GET") 84 | API_KEY = Variable.get("KLAVIYO_KEY") 85 | return HTTP_HOOK.run_with_advanced_retry( 86 | endpoint=f"/v2/list/{list_id}?api_key={API_KEY}", 87 | _retry_args=RETRY_ARGS, 88 | headers={"Accept": "application/json"}, 89 | ).json() 90 | 91 | 92 | def _klaviyo_get_list_members(list_id: str) -> dict: 93 | HTTP_HOOK = HttpHook(http_conn_id="klaviyo_api", method="GET") 94 | API_KEY = Variable.get("KLAVIYO_KEY") 95 | return HTTP_HOOK.run_with_advanced_retry( 96 | endpoint=f"/v2/group/{list_id}/members/all?api_key={API_KEY}", 97 | _retry_args=RETRY_ARGS, 98 | headers={"Accept": "application/json"}, 99 | ).json() 100 | 101 | 102 | def _klaviyo_remove_list_members(list_id: str, body: dict) -> dict: 103 | HTTP_HOOK = HttpHook(http_conn_id="klaviyo_api", method="DELETE") 104 | API_KEY = Variable.get("KLAVIYO_KEY") 105 | return HTTP_HOOK.run_with_advanced_retry( 106 | endpoint=f"/v2/list/{list_id}/members?api_key={API_KEY}", 107 | _retry_args=RETRY_ARGS, 108 | json=body, 109 | headers={"Content-Type": "application/json"}, 110 | ) 111 | 112 | 113 | def _klaviyo_add_list_members(list_id: str, body: dict) -> dict: 114 | HTTP_HOOK = HttpHook(http_conn_id="klaviyo_api", method="POST") 115 | API_KEY = Variable.get("KLAVIYO_KEY") 116 | return 
HTTP_HOOK.run_with_advanced_retry( 117 | endpoint=f"/v2/list/{list_id}/members?api_key={API_KEY}", 118 | _retry_args=RETRY_ARGS, 119 | json=body, 120 | headers={"Accept": "application/json", "Content-Type": "application/json"}, 121 | ).json() 122 | 123 | 124 | def _klaviyo_get_campaign_info(campaign_id: str) -> dict: 125 | HTTP_HOOK = HttpHook(http_conn_id="klaviyo_api", method="GET") 126 | API_KEY = Variable.get("KLAVIYO_KEY") 127 | return HTTP_HOOK.run_with_advanced_retry( 128 | endpoint=f"/v1/campaign/{campaign_id}?api_key={API_KEY}", 129 | _retry_args=RETRY_ARGS, 130 | headers={"Accept": "application/json"}, 131 | ).json() 132 | 133 | 134 | def _klaviyo_clone_campaign(campaign_id: str, name: str, list_id: str) -> dict: 135 | HTTP_HOOK = HttpHook(http_conn_id="klaviyo_api", method="POST") 136 | API_KEY = Variable.get("KLAVIYO_KEY") 137 | return HTTP_HOOK.run_with_advanced_retry( 138 | endpoint=f"/v1/campaign/{campaign_id}/clone?api_key={API_KEY}", 139 | _retry_args=RETRY_ARGS, 140 | data={"name": name, "list_id": list_id}, 141 | headers={ 142 | "Accept": "application/json", 143 | "Content-Type": "application/x-www-form-urlencoded", 144 | }, 145 | ).json() 146 | 147 | 148 | def _klaviyo_send_campaign(campaign_id: str) -> dict: 149 | HTTP_HOOK = HttpHook(http_conn_id="klaviyo_api", method="POST") 150 | API_KEY = Variable.get("KLAVIYO_KEY") 151 | return HTTP_HOOK.run_with_advanced_retry( 152 | endpoint=f"/v1/campaign/{campaign_id}/send?api_key={API_KEY}", 153 | _retry_args=RETRY_ARGS, 154 | headers={"Accept": "application/json"}, 155 | ).json() 156 | -------------------------------------------------------------------------------- /dags/ods/linkedin_post_insights/dags.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.python_operator import PythonOperator 5 | from ods.linkedin_post_insights import udfs 6 | 7 | DEFAULT_ARGS = { 8 | "owner": "Angus Yang", 9 | "depends_on_past": False, 10 | "start_date": datetime(2023, 6, 14, 0), 11 | "retries": 2, 12 | "retry_delay": timedelta(minutes=5), 13 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 14 | } 15 | dag = DAG( 16 | "LINKEDIN_POST_INSIGHTS_V2", 17 | default_args=DEFAULT_ARGS, 18 | schedule_interval="5 8 */2 * *", 19 | max_active_runs=1, 20 | catchup=False, 21 | ) 22 | with dag: 23 | CREATE_TABLE_IF_NEEDED = PythonOperator( 24 | task_id="CREATE_TABLE_IF_NEEDED", 25 | python_callable=udfs.create_table_if_needed, 26 | ) 27 | 28 | SAVE_TWITTER_POSTS_AND_INSIGHTS = PythonOperator( 29 | task_id="SAVE_LINKEDIN_POSTS_AND_INSIGHTS", 30 | python_callable=udfs.save_posts_and_insights, 31 | ) 32 | 33 | CREATE_TABLE_IF_NEEDED >> SAVE_TWITTER_POSTS_AND_INSIGHTS 34 | 35 | 36 | if __name__ == "__main__": 37 | dag.cli() 38 | -------------------------------------------------------------------------------- /dags/ods/linkedin_post_insights/udfs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from datetime import datetime 4 | from typing import List, Optional 5 | 6 | import requests 7 | from airflow.models import Variable 8 | from google.cloud import bigquery 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def create_table_if_needed() -> None: 14 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 15 | post_sql = """ 16 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_linkedin_posts` ( 17 
| id STRING, 18 | created_at TIMESTAMP, 19 | message STRING 20 | ) 21 | """ 22 | client.query(post_sql) 23 | insights_sql = """ 24 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_linkedin_posts_insights` ( 25 | post_id STRING, 26 | query_time TIMESTAMP, 27 | period STRING, 28 | favorite INTEGER, 29 | reply INTEGER, 30 | retweet INTEGER, 31 | views INTEGER 32 | ) 33 | """ 34 | client.query(insights_sql) 35 | 36 | # Example output from the Rapid API, not all fields will exists for a specific post 37 | # 38 | # { 39 | # "text": "For your kids in senior high.", 40 | # "totalReactionCount": 6, 41 | # "likeCount": 6, 42 | # "repostsCount": 1, 43 | # "empathyCount": 1, 44 | # "commentsCount": 20, 45 | # repostsCount:1, 46 | # "postUrl": "https://www.linkedin.com/feed/update/urn:li:activity:6940542340960763905/", 47 | # "postedAt": "1yr", 48 | # "postedDate": "2022-06-09 05:57:23.126 +0000 UTC", 49 | # "postedDateTimestamp": 1654754243126, 50 | # "urn": "6940542340960763905", 51 | # "author": { 52 | # "firstName": "Angus", 53 | # "lastName": "Yang", 54 | # "username": "angus-yang-8885279a", 55 | # "url": "https://www.linkedin.com/in/angus-yang-8885279a" 56 | # }, 57 | # "company": {}, 58 | # "article": { 59 | # "title": "2022 AWS STEM Summer Camp On The Cloud", 60 | # "subtitle": "pages.awscloud.com • 2 min read", 61 | # "link": "https://pages.awscloud.com/tw-2022-aws-stem-summer-camp-on-the-cloud_registration.html" 62 | # } 63 | # }, 64 | 65 | 66 | def save_posts_and_insights() -> None: 67 | posts = request_posts_data() 68 | 69 | last_post = query_last_post() 70 | new_posts = ( 71 | [ 72 | post 73 | for post in posts 74 | if post["postedDateTimestamp"] > last_post["created_at"].timestamp() 75 | ] 76 | if last_post 77 | else posts 78 | ) 79 | 80 | if not dump_posts_to_bigquery( 81 | [ 82 | { 83 | "id": post["urn"], 84 | "created_at": post["postedDateTimestamp"], 85 | "message": post["text"], 86 | } 87 | for post in new_posts 88 | ] 89 | ): 90 | raise RuntimeError("Failed to dump posts to BigQuery") 91 | 92 | if not dump_posts_insights_to_bigquery( 93 | [ 94 | { 95 | "post_id": post["urn"], 96 | "query_time": datetime.now().timestamp(), 97 | "period": "lifetime", 98 | "favorite": post["likeCount"], 99 | "reply": post["commentsCount"], 100 | "retweet": post["repostsCount"], 101 | "views": "0", # not support by RapidAPI 102 | } 103 | for post in posts 104 | ] 105 | ): 106 | raise RuntimeError("Failed to dump posts insights to BigQuery") 107 | 108 | 109 | def query_last_post() -> Optional[dict]: 110 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 111 | sql = """ 112 | SELECT 113 | created_at 114 | FROM 115 | `pycontw-225217.ods.ods_pycontw_linkedin_posts` 116 | ORDER BY 117 | created_at DESC 118 | LIMIT 1 119 | """ 120 | result = client.query(sql) 121 | data = list(result) 122 | return data[0] if data else None 123 | 124 | 125 | def request_posts_data() -> List[dict]: 126 | # Define the request options 127 | # url = 'https://linkedin-data-api.p.rapidapi.com/get-profile-posts' # for user 128 | url = "https://linkedin-data-api.p.rapidapi.com/get-company-posts" 129 | querystring = {"username": "pycontw"} 130 | headers = { 131 | "X-RapidAPI-Key": Variable.get("LINKEDIN_RAPIDAPI_KEY"), 132 | "X-RapidAPI-Host": "linkedin-data-api.p.rapidapi.com", 133 | } 134 | 135 | response = requests.get(url, headers=headers, params=querystring, timeout=180) 136 | if not response.ok: 137 | raise RuntimeError(f"Failed to fetch posts data: {response.text}") 138 | 139 | media_insight_list = [] 
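    # NOTE (illustrative sketch): the loop below flattens each RapidAPI post
    # into the handful of fields we keep, using .get() defaults because not
    # every post carries every field. Roughly, assuming a response shaped
    # like {"data": [...]}:
    #     raw = {"urn": "6940542340960763905", "postedDateTimestamp": 1654754243126}
    #     insight = {
    #         "urn": raw.get("urn", "0"),
    #         "postedDateTimestamp": raw.get("postedDateTimestamp", 0) / 1000,
    #         "text": raw.get("text", "No Content"),
    #     }
    # An integer fallback (0) is safer than the string "0" used below, since
    # a missing timestamp would otherwise make the division raise TypeError.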
140 | media_res_list = response.json()["data"] 141 | # format handling, the response may not include the required fields 142 | for media_res in media_res_list: 143 | media_insight = {} 144 | media_insight["urn"] = media_res.get("urn", "0") 145 | media_insight["postedDateTimestamp"] = ( 146 | media_res.get("postedDateTimestamp", "0") / 1000 147 | ) 148 | media_insight["text"] = media_res.get("text", "No Content") 149 | media_insight["likeCount"] = media_res.get("totalReactionCount", "0") 150 | media_insight["commentsCount"] = media_res.get("commentsCount", "0") 151 | media_insight["repostsCount"] = media_res.get("repostsCount", "0") 152 | # logger.info(media_insight) 153 | media_insight_list.append(media_insight) 154 | 155 | return media_insight_list 156 | 157 | 158 | def dump_posts_to_bigquery(posts: List[dict]) -> bool: 159 | if not posts: 160 | logger.info("No posts to dump!") 161 | return True 162 | 163 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 164 | job_config = bigquery.LoadJobConfig( 165 | schema=[ 166 | bigquery.SchemaField("id", "STRING", mode="REQUIRED"), 167 | bigquery.SchemaField("created_at", "TIMESTAMP", mode="REQUIRED"), 168 | bigquery.SchemaField("message", "STRING", mode="REQUIRED"), 169 | ], 170 | write_disposition="WRITE_APPEND", 171 | ) 172 | try: 173 | job = client.load_table_from_json( 174 | posts, 175 | "pycontw-225217.ods.ods_pycontw_linkedin_posts", 176 | job_config=job_config, 177 | ) 178 | job.result() 179 | return True 180 | except Exception as e: 181 | logger.error(f"Failed to dump posts to BigQuery: {e}", exc_info=True) 182 | return False 183 | 184 | 185 | def dump_posts_insights_to_bigquery(posts: List[dict]) -> bool: 186 | if not posts: 187 | logger.info("No post insights to dump!") 188 | return True 189 | 190 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 191 | job_config = bigquery.LoadJobConfig( 192 | schema=[ 193 | bigquery.SchemaField("post_id", "STRING", mode="REQUIRED"), 194 | bigquery.SchemaField("query_time", "TIMESTAMP", mode="REQUIRED"), 195 | bigquery.SchemaField("period", "STRING", mode="REQUIRED"), 196 | bigquery.SchemaField("favorite", "INTEGER", mode="NULLABLE"), 197 | bigquery.SchemaField("reply", "INTEGER", mode="NULLABLE"), 198 | bigquery.SchemaField("retweet", "INTEGER", mode="NULLABLE"), 199 | bigquery.SchemaField("views", "INTEGER", mode="NULLABLE"), 200 | ], 201 | write_disposition="WRITE_APPEND", 202 | ) 203 | try: 204 | job = client.load_table_from_json( 205 | posts, 206 | "pycontw-225217.ods.ods_pycontw_linkedin_posts_insights", 207 | job_config=job_config, 208 | ) 209 | job.result() 210 | return True 211 | except Exception as e: 212 | logger.error(f"Failed to dump posts insights to BigQuery: {e}", exc_info=True) 213 | return False 214 | 215 | 216 | def test_main(): 217 | create_table_if_needed() 218 | 219 | # request_posts_data() 220 | 221 | save_posts_and_insights() 222 | 223 | 224 | if __name__ == "__main__": 225 | test_main() 226 | -------------------------------------------------------------------------------- /dags/ods/opening_crawler/dags/cakeresume_crawler.py: -------------------------------------------------------------------------------- 1 | """ 2 | A crawler which would crawl the openings 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from ods.opening_crawler.udfs.crawlers import CakeResumeCrawler 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "davidtnfsh", 13 | 
"depends_on_past": False, 14 | "start_date": datetime(2020, 8, 30), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Telegrame", 18 | } 19 | dag = DAG( 20 | "OPENING_CRAWLER_V1", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="@daily", 23 | max_active_runs=1, 24 | catchup=False, 25 | ) 26 | with dag: 27 | CRAWLER = PythonOperator( 28 | task_id="CRAWLER", 29 | python_callable=CakeResumeCrawler.crawl, 30 | provide_context=True, 31 | op_kwargs={}, 32 | ) 33 | 34 | if __name__ == "__main__": 35 | dag.cli() 36 | -------------------------------------------------------------------------------- /dags/ods/opening_crawler/udfs/crawlers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Crawler of openings 3 | """ 4 | 5 | from abc import ABC, abstractclassmethod 6 | 7 | 8 | class BaseCrawler(ABC): 9 | """ 10 | Abstract cralwer 11 | """ 12 | 13 | @classmethod 14 | @abstractclassmethod 15 | def crawl(cls, **conf): 16 | pass 17 | 18 | 19 | class CakeResumeCrawler(BaseCrawler): 20 | """ 21 | Crawler of cakeresume 22 | """ 23 | 24 | @classmethod 25 | def crawl(cls, **conf): 26 | print("i'm a CakeResume crawler!") 27 | return "i'm a CakeResume crawler!" 28 | -------------------------------------------------------------------------------- /dags/ods/survey_cake/dags/questionnaire_2_bigquery.py: -------------------------------------------------------------------------------- 1 | """ 2 | A crawler which would crawl the openings 3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | import os 8 | from datetime import datetime, timedelta 9 | from pathlib import Path 10 | 11 | from airflow import DAG 12 | from airflow.operators.python_operator import PythonOperator 13 | from ods.survey_cake.udfs.survey_cake_csv_uploader import SurveyCakeCSVUploader 14 | 15 | AIRFLOW_HOME = os.getenv("AIRFLOW_HOME") 16 | 17 | DEFAULT_ARGS = { 18 | "owner": "davidtnfsh", 19 | "depends_on_past": False, 20 | "start_date": datetime(2020, 9, 30), 21 | "retries": 2, 22 | "retry_delay": timedelta(minutes=5), 23 | "on_failure_callback": lambda x: "Need to send notification to Telegrame", 24 | } 25 | dag = DAG( 26 | "QUESTIONNAIRE_2_BIGQUERY", 27 | default_args=DEFAULT_ARGS, 28 | schedule_interval=None, 29 | max_active_runs=1, 30 | catchup=False, 31 | ) 32 | with dag: 33 | if bool(os.getenv("AIRFLOW_TEST_MODE")): 34 | filepath = Path(AIRFLOW_HOME) / "dags/fixtures/data_questionnaire.csv" 35 | FILENAMES: dict[str, dict] = {str(filepath): {}} 36 | else: 37 | FILENAMES = { 38 | "data_questionnaire.csv": { 39 | "data_domain": "questionnaire", 40 | "primary_key": "ip", 41 | "time_dimension": "datetime", 42 | }, 43 | "data_sponsor_questionnaire.csv": { 44 | "data_domain": "sponsorQuestionnaire", 45 | "primary_key": "ip", 46 | "time_dimension": "datetime", 47 | }, 48 | } 49 | for filename, metadata in FILENAMES.items(): 50 | FILENAME_STEM = Path(filename).stem 51 | SURVEY_CAKE_CSV_UPLOADER = SurveyCakeCSVUploader(filename=filename) 52 | TRANSFORM = PythonOperator( 53 | task_id=f"TRANSFORM_{FILENAME_STEM}", 54 | python_callable=SURVEY_CAKE_CSV_UPLOADER.transform, 55 | provide_context=True, 56 | ) 57 | 58 | if not bool(os.getenv("AIRFLOW_TEST_MODE")): 59 | UPLOAD_FACTTABLE = PythonOperator( 60 | task_id=f"UPLOAD_FACTTABLE_{FILENAME_STEM}", 61 | python_callable=SURVEY_CAKE_CSV_UPLOADER.upload, 62 | op_kwargs={ 63 | "facttable_or_dimension_table": "fact", 64 | "data_layer": "ods", 65 | "data_domain": 
metadata["data_domain"], 66 | "primary_key": metadata["primary_key"], 67 | "time_dimension": metadata["time_dimension"], 68 | }, 69 | ) 70 | UPLOAD_DIMENSION_TABLE = PythonOperator( 71 | task_id=f"UPLOAD_DIMENSION_TABLE_{FILENAME_STEM}", 72 | python_callable=SURVEY_CAKE_CSV_UPLOADER.upload, 73 | op_kwargs={ 74 | "facttable_or_dimension_table": "dim", 75 | "data_layer": "dim", 76 | "data_domain": metadata["data_domain"], 77 | "primary_key": "questionId", 78 | "time_dimension": "year", 79 | }, 80 | ) 81 | TRANSFORM >> UPLOAD_FACTTABLE 82 | TRANSFORM >> UPLOAD_DIMENSION_TABLE 83 | 84 | 85 | if __name__ == "__main__": 86 | dag.cli() 87 | -------------------------------------------------------------------------------- /dags/ods/survey_cake/udfs/survey_cake_csv_uploader.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from pathlib import Path 4 | 5 | from google.cloud import bigquery 6 | 7 | AIRFLOW_HOME = os.getenv("AIRFLOW_HOME") 8 | 9 | 10 | class SurveyCakeCSVUploader: 11 | def __init__(self, filename): 12 | self.filename = Path(filename) 13 | self.year = None 14 | if not bool(os.getenv("AIRFLOW_TEST_MODE")): 15 | self.client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 16 | 17 | self.facttable_filepath = ( 18 | self.filename.parent / f"{self.filename.stem}_facttable.csv" 19 | ) 20 | self.dimension_table_filepath = ( 21 | self.filename.parent / f"{self.filename.stem}_dimension.csv" 22 | ) 23 | 24 | @property 25 | def bigquery_project(self): 26 | return os.getenv("BIGQUERY_PROJECT") 27 | 28 | def transform(self, **context): 29 | self.year = context["execution_date"].year 30 | self._transform() 31 | 32 | def _transform(self): 33 | def _export_facttable(header_of_fact_table): 34 | with open(self.facttable_filepath, "w") as target: 35 | writer = csv.writer(target) 36 | writer.writerow(header_of_fact_table) 37 | for row in rows_of_fact_table: 38 | writer.writerow(row) 39 | 40 | def _export_dimension_table(question_id_dienstion_table): 41 | with open(self.dimension_table_filepath, "w") as target: 42 | writer = csv.writer(target) 43 | writer.writerow(("question_id", "question", "year")) 44 | for question_id, question in question_id_dienstion_table.items(): 45 | writer.writerow((question_id, question, self.year)) 46 | 47 | filepath = Path(AIRFLOW_HOME) / "dags" / self.filename 48 | with open(filepath, encoding="utf-8-sig") as csvfile: 49 | rows = csv.reader(csvfile) 50 | # skip header 51 | header = next(iter(rows)) 52 | question_id_dienstion_table = self._generate_question_id_dimension_table( 53 | header 54 | ) 55 | question_ids = sorted(question_id_dienstion_table.keys()) 56 | header_of_fact_table = ("ip", "question_id", "answer") 57 | rows_of_fact_table = self._transform_raw_data_to_fact_table_format( 58 | rows, question_id_dienstion_table, question_ids 59 | ) 60 | 61 | _export_facttable(header_of_fact_table) 62 | _export_dimension_table(question_id_dienstion_table) 63 | 64 | def upload( 65 | self, 66 | facttable_or_dimension_table, 67 | data_layer, 68 | data_domain, 69 | primary_key, 70 | time_dimension, 71 | ): 72 | if facttable_or_dimension_table == "fact": 73 | print(self.facttable_filepath) 74 | print(self.facttable_filepath) 75 | print(self.facttable_filepath) 76 | print(self.facttable_filepath) 77 | self._upload_2_bigquery( 78 | self.facttable_filepath, 79 | f"{self.bigquery_project}.{data_layer}.{data_layer}_{data_domain}_{primary_key}_{time_dimension}", 80 | ) 81 | elif facttable_or_dimension_table == "dim": 
82 | self._upload_2_bigquery( 83 | self.dimension_table_filepath, 84 | f"{self.bigquery_project}.{data_layer}.{data_layer}_{data_domain}_{primary_key}_{time_dimension}", 85 | ) 86 | 87 | def _upload_2_bigquery(self, file_path, table_id): 88 | job_config = bigquery.LoadJobConfig( 89 | source_format=bigquery.SourceFormat.CSV, 90 | skip_leading_rows=1, 91 | autodetect=True, 92 | allow_quoted_newlines=True, 93 | write_disposition="WRITE_TRUNCATE", 94 | ) 95 | with open(file_path, "rb") as source_file: 96 | job = self.client.load_table_from_file( 97 | source_file, table_id, job_config=job_config 98 | ) 99 | 100 | job.result() # Waits for the job to complete. 101 | 102 | table = self.client.get_table(table_id) # Make an API request. 103 | print( 104 | f"Loaded {table.num_rows} rows and {len(table.schema)} columns to {table_id}" 105 | ) 106 | 107 | def _generate_question_id_dimension_table(self, header): 108 | question_id_dim_table = {} 109 | for index, column in enumerate(header): 110 | column = column.strip() 111 | question_id_dim_table[ 112 | index if column != "其他" else self._get_index_of_else_column(index) 113 | ] = column 114 | return question_id_dim_table 115 | 116 | @staticmethod 117 | def _get_index_of_else_column(index): 118 | """ 119 | use 0.1 to represent "其他" 120 | """ 121 | return index - 1 + 0.1 122 | 123 | @staticmethod 124 | def _transform_raw_data_to_fact_table_format( 125 | rows, question_id_dienstion_table, question_ids 126 | ): 127 | result = [] 128 | for row in rows: 129 | row_dict = dict(zip(question_ids, row)) 130 | question_id_of_primary_key = [ 131 | key 132 | for key, value in question_id_dienstion_table.items() 133 | if value == "IP紀錄" 134 | ][0] 135 | primary_key = row_dict[question_id_of_primary_key] 136 | for question_id, answer in row_dict.items(): 137 | result.append((primary_key, question_id, answer)) 138 | return result 139 | -------------------------------------------------------------------------------- /dags/ods/twitter_post_insights/dags.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.python_operator import PythonOperator 5 | from ods.twitter_post_insights import udfs 6 | 7 | DEFAULT_ARGS = { 8 | "owner": "Henry Lee", 9 | "depends_on_past": False, 10 | "start_date": datetime(2023, 6, 14, 0), 11 | "retries": 2, 12 | "retry_delay": timedelta(minutes=5), 13 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 14 | } 15 | dag = DAG( 16 | "TWITTER_POST_INSIGHTS_V1", 17 | default_args=DEFAULT_ARGS, 18 | schedule_interval="5 8 * * *", 19 | max_active_runs=1, 20 | catchup=False, 21 | ) 22 | with dag: 23 | CREATE_TABLE_IF_NEEDED = PythonOperator( 24 | task_id="CREATE_TABLE_IF_NEEDED", 25 | python_callable=udfs.create_table_if_needed, 26 | ) 27 | 28 | SAVE_TWITTER_POSTS_AND_INSIGHTS = PythonOperator( 29 | task_id="SAVE_TWITTER_POSTS_AND_INSIGHTS", 30 | python_callable=udfs.save_twitter_posts_and_insights, 31 | ) 32 | 33 | CREATE_TABLE_IF_NEEDED >> SAVE_TWITTER_POSTS_AND_INSIGHTS 34 | 35 | 36 | if __name__ == "__main__": 37 | dag.cli() 38 | -------------------------------------------------------------------------------- /dags/ods/twitter_post_insights/udfs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from datetime import datetime 4 | from typing import List, Optional 5 | 6 | import requests 7 | from airflow.models import Variable 8 
| from google.cloud import bigquery 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def create_table_if_needed() -> None: 14 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 15 | post_sql = """ 16 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_twitter_posts` ( 17 | id STRING, 18 | created_at TIMESTAMP, 19 | message STRING 20 | ) 21 | """ 22 | client.query(post_sql) 23 | insights_sql = """ 24 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_twitter_posts_insights` ( 25 | post_id STRING, 26 | query_time TIMESTAMP, 27 | period STRING, 28 | favorite INTEGER, 29 | reply INTEGER, 30 | retweet INTEGER, 31 | views INTEGER 32 | ) 33 | """ 34 | client.query(insights_sql) 35 | 36 | 37 | def save_twitter_posts_and_insights() -> None: 38 | posts = request_posts_data() 39 | 40 | last_post = query_last_post() 41 | if last_post is None: 42 | new_posts = posts 43 | else: 44 | new_posts = [ 45 | post 46 | for post in posts 47 | if post["timestamp"] > last_post["created_at"].timestamp() 48 | ] 49 | 50 | if not dump_posts_to_bigquery( 51 | [ 52 | { 53 | "id": post["tweet_id"], 54 | "created_at": post["timestamp"], 55 | "message": post["text"], 56 | } 57 | for post in new_posts 58 | ] 59 | ): 60 | raise RuntimeError("Failed to dump posts to BigQuery") 61 | 62 | if not dump_posts_insights_to_bigquery( 63 | [ 64 | { 65 | "post_id": post["tweet_id"], 66 | "query_time": datetime.now().timestamp(), 67 | "period": "lifetime", 68 | "favorite": post["favorite_count"], 69 | "reply": post["reply_count"], 70 | "retweet": post["retweet_count"], 71 | "views": post["views"], 72 | } 73 | for post in posts 74 | ] 75 | ): 76 | raise RuntimeError("Failed to dump posts insights to BigQuery") 77 | 78 | 79 | def query_last_post() -> Optional[dict]: 80 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 81 | sql = """ 82 | SELECT 83 | created_at 84 | FROM 85 | `pycontw-225217.ods.ods_pycontw_twitter_posts` 86 | ORDER BY 87 | created_at DESC 88 | LIMIT 1 89 | """ 90 | result = client.query(sql) 91 | data = list(result) 92 | return data[0] if data else None 93 | 94 | 95 | def request_posts_data() -> List[dict]: 96 | url = "https://twitter154.p.rapidapi.com/user/tweets" 97 | # 499339900 is PyConTW's twitter id 98 | querystring = { 99 | "username": "pycontw", 100 | "user_id": "499339900", 101 | "limit": "40", 102 | "include_replies": "false", 103 | "include_pinned": "false", 104 | } 105 | headers = { 106 | "X-RapidAPI-Key": Variable.get("RAPIDAPIAPI_KEY"), 107 | "X-RapidAPI-Host": "twitter154.p.rapidapi.com", 108 | } 109 | response = requests.get(url, headers=headers, params=querystring) 110 | if response.ok: 111 | return response.json()["results"] 112 | raise RuntimeError(f"Failed to fetch posts data: {response.text}") 113 | 114 | 115 | def dump_posts_to_bigquery(posts: List[dict]) -> bool: 116 | if not posts: 117 | logger.info("No posts to dump!") 118 | return True 119 | 120 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 121 | job_config = bigquery.LoadJobConfig( 122 | schema=[ 123 | bigquery.SchemaField("id", "STRING", mode="REQUIRED"), 124 | bigquery.SchemaField("created_at", "TIMESTAMP", mode="REQUIRED"), 125 | bigquery.SchemaField("message", "STRING", mode="REQUIRED"), 126 | ], 127 | write_disposition="WRITE_APPEND", 128 | ) 129 | try: 130 | job = client.load_table_from_json( 131 | posts, 132 | "pycontw-225217.ods.ods_pycontw_twitter_posts", 133 | job_config=job_config, 134 | ) 135 | job.result() 136 | return True 137 | except Exception as e: 
138 | logger.error(f"Failed to dump posts to BigQuery: {e}", exc_info=True) 139 | return False 140 | 141 | 142 | def dump_posts_insights_to_bigquery(posts: List[dict]) -> bool: 143 | if not posts: 144 | logger.info("No post insights to dump!") 145 | return True 146 | 147 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 148 | job_config = bigquery.LoadJobConfig( 149 | schema=[ 150 | bigquery.SchemaField("post_id", "STRING", mode="REQUIRED"), 151 | bigquery.SchemaField("query_time", "TIMESTAMP", mode="REQUIRED"), 152 | bigquery.SchemaField("period", "STRING", mode="REQUIRED"), 153 | bigquery.SchemaField("favorite", "INTEGER", mode="NULLABLE"), 154 | bigquery.SchemaField("reply", "INTEGER", mode="NULLABLE"), 155 | bigquery.SchemaField("retweet", "INTEGER", mode="NULLABLE"), 156 | bigquery.SchemaField("views", "INTEGER", mode="NULLABLE"), 157 | ], 158 | write_disposition="WRITE_APPEND", 159 | ) 160 | try: 161 | job = client.load_table_from_json( 162 | posts, 163 | "pycontw-225217.ods.ods_pycontw_twitter_posts_insights", 164 | job_config=job_config, 165 | ) 166 | job.result() 167 | return True 168 | except Exception as e: 169 | logger.error(f"Failed to dump posts insights to BigQuery: {e}", exc_info=True) 170 | return False 171 | -------------------------------------------------------------------------------- /dags/ods/youtube/dags/dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Save view, like count these kind of metrics into BigQuery 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from ods.youtube.udfs import youtube_api 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "davidtnfsh", 13 | "depends_on_past": False, 14 | "start_date": datetime(2021, 9, 19), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Telegram", 18 | } 19 | dag = DAG( 20 | "ODS_YOUTUBE_2_BIGQUERY", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="@daily", 23 | max_active_runs=1, 24 | catchup=False, 25 | ) 26 | with dag: 27 | CREATE_TABLE_IF_NEEDED = PythonOperator( 28 | task_id="CREATE_TABLE_IF_NEEDED", 29 | python_callable=youtube_api.create_table_if_needed, 30 | ) 31 | 32 | GET_VIDEO_IDS = PythonOperator( 33 | task_id="GET_VIDEO_IDS", 34 | python_callable=youtube_api.get_video_ids, 35 | provide_context=True, 36 | ) 37 | 38 | SAVE_STATISTICS_DATA_2_BQ = PythonOperator( 39 | task_id="SAVE_STATISTICS_DATA_2_BQ", 40 | python_callable=youtube_api.save_video_data_2_bq, 41 | provide_context=True, 42 | op_kwargs={"datatype": "statistics"}, 43 | ) 44 | CREATE_TABLE_IF_NEEDED >> GET_VIDEO_IDS >> SAVE_STATISTICS_DATA_2_BQ 45 | 46 | SAVE_INFO_DATA_2_BQ = PythonOperator( 47 | task_id="SAVE_INFO_DATA_2_BQ", 48 | python_callable=youtube_api.save_video_data_2_bq, 49 | provide_context=True, 50 | op_kwargs={"datatype": "info"}, 51 | ) 52 | GET_VIDEO_IDS >> SAVE_INFO_DATA_2_BQ 53 | 54 | if __name__ == "__main__": 55 | dag.cli() 56 | -------------------------------------------------------------------------------- /dags/ods/youtube/sqls/create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS `{}.ods.ods_youtubeStatistics_videoId_datetime` 2 | ( 3 | created_at TIMESTAMP NOT NULL, 4 | videoId STRING NOT NULL, 5 | title STRING NOT NULL, 6 | viewCount INT64 NOT NULL, 7 | likeCount INT64 NOT NULL, 8 | dislikeCount INT64 
NOT NULL, 9 | favoriteCount INT64 NOT NULL, 10 | commentCount INT64 NOT NULL 11 | ); 12 | 13 | CREATE TABLE IF NOT EXISTS `{}.ods.ods_youtubeInfo_videoId_datetime` 14 | ( 15 | created_at TIMESTAMP NOT NULL, 16 | videoId STRING NOT NULL, 17 | title STRING NOT NULL, 18 | image_url STRING NOT NULL, 19 | subtitle STRING NOT NULL, 20 | time TIMESTAMP NOT NULL, 21 | url STRING NOT NULL 22 | ); 23 | 24 | -------------------------------------------------------------------------------- /dags/ods/youtube/udfs/youtube_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | from airflow import macros 7 | from airflow.hooks.http_hook import HttpHook 8 | from airflow.models import Variable 9 | from google.cloud import bigquery 10 | from utils.hook_related import RETRY_ARGS 11 | 12 | # channel id of YouTube is public to everyone, so it's okay to commit this ID into git 13 | CHANNEL_ID = "UCHLnNgRnfGYDzPCCH8qGbQw" 14 | MAX_RESULTS = 50 15 | PROJECT = os.getenv("BIGQUERY_PROJECT") 16 | AIRFLOW_HOME = os.getenv("AIRFLOW_HOME") 17 | 18 | 19 | def create_table_if_needed(): 20 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 21 | sql_filepath = Path(AIRFLOW_HOME) / "dags/ods/youtube/sqls/create_table.sql" 22 | sql = sql_filepath.read_text().format(PROJECT) 23 | client.query(sql) 24 | 25 | 26 | def get_video_ids(**context) -> None: 27 | video_metadatas = [] 28 | execution_date = context["execution_date"].replace(tzinfo=None) 29 | last_year = execution_date - macros.timedelta(days=365) 30 | last_year_RFC_3339_format = f"{last_year.date()}T00:00:00Z" 31 | http_conn = HttpHook(method="GET", http_conn_id="youtube") 32 | base_url = f"/youtube/v3/search?key={Variable.get('YOUTUBE_KEY')}&channelId={CHANNEL_ID}&part=snippet,id&order=date&maxResults={MAX_RESULTS}&publishedAfter={last_year_RFC_3339_format}" 33 | 34 | response_json = http_conn.run_with_advanced_retry( 35 | endpoint=base_url, 36 | _retry_args=RETRY_ARGS, 37 | headers={"Content-Type": "application/json", "Cache-Control": "no-cache"}, 38 | ).json() 39 | video_metadatas += [ 40 | {"videoId": item["id"]["videoId"], "title": item["snippet"]["title"]} 41 | for item in response_json["items"] 42 | if "videoId" in item["id"] 43 | ] 44 | while response_json.get("nextPageToken"): 45 | next_page_token = response_json["nextPageToken"] 46 | response_json = http_conn.run_with_advanced_retry( 47 | endpoint=f"{base_url}&pageToken={next_page_token}", 48 | _retry_args=RETRY_ARGS, 49 | headers={"Content-Type": "application/json", "Cache-Control": "no-cache"}, 50 | ).json() 51 | video_metadatas += [ 52 | {"videoId": item["id"]["videoId"], "title": item["snippet"]["title"]} 53 | for item in response_json["items"] 54 | if "videoId" in item["id"] 55 | ] 56 | task_instance = context["task_instance"] 57 | task_instance.xcom_push("GET_VIDEO_IDS", video_metadatas) 58 | 59 | 60 | def save_video_data_2_bq(**context): 61 | def _init(): 62 | client = bigquery.Client(project=PROJECT) 63 | http_conn = HttpHook(method="GET", http_conn_id="youtube") 64 | execution_date = context["execution_date"].replace(tzinfo=None) 65 | task_instance = context["task_instance"] 66 | datatype = context["datatype"] 67 | video_metadatas = task_instance.xcom_pull("GET_VIDEO_IDS", key="GET_VIDEO_IDS") 68 | result = [] 69 | return ( 70 | client, 71 | http_conn, 72 | execution_date, 73 | task_instance, 74 | datatype, 75 | video_metadatas, 76 | result, 77 | ) 
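    # NOTE (illustrative sketch): _get_statistics() below assumes each
    # videos.list response item carries a "statistics" object, roughly:
    #     {"items": [{"statistics": {"viewCount": "123", "likeCount": "45",
    #                                "favoriteCount": "0", "commentCount": "6"}}]}
    # The counts arrive as strings, hence the int(...) casts, and the public
    # API no longer returns dislikeCount, which is why a literal 0 is stored
    # for that column.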
78 | 79 | def _get_statistics(): 80 | for video_metadata in video_metadatas: 81 | video_id = video_metadata["videoId"] 82 | title = video_metadata["title"] 83 | response_json = http_conn.run_with_advanced_retry( 84 | endpoint=f"/youtube/v3/videos?id={video_id}&key={Variable.get('YOUTUBE_KEY')}&part=statistics", 85 | _retry_args=RETRY_ARGS, 86 | headers={ 87 | "Content-Type": "application/json", 88 | "Cache-Control": "no-cache", 89 | }, 90 | ).json() 91 | print(response_json["items"][0]["statistics"].keys()) 92 | result.append( 93 | ( 94 | execution_date, 95 | video_id, 96 | title, 97 | int(response_json["items"][0]["statistics"]["viewCount"]), 98 | int(response_json["items"][0]["statistics"]["likeCount"]), 99 | 0, # dislikeCount field is not available in statistics API since 2021! 100 | int(response_json["items"][0]["statistics"]["favoriteCount"]), 101 | int(response_json["items"][0]["statistics"]["commentCount"]), 102 | ) 103 | ) 104 | return result 105 | 106 | def _get_info(): 107 | for video_metadata in video_metadatas: 108 | video_id = video_metadata["videoId"] 109 | title = video_metadata["title"] 110 | response_json = http_conn.run_with_advanced_retry( 111 | endpoint=f"/youtube/v3/videos?id={video_id}&key={Variable.get('YOUTUBE_KEY')}&part=snippet", 112 | _retry_args=RETRY_ARGS, 113 | headers={ 114 | "Content-Type": "application/json", 115 | "Cache-Control": "no-cache", 116 | }, 117 | ).json() 118 | result.append( 119 | ( 120 | execution_date, 121 | video_id, 122 | title, 123 | response_json["items"][0]["snippet"]["thumbnails"]["default"][ 124 | "url" 125 | ], 126 | response_json["items"][0]["description"], 127 | datetime.strptime( 128 | response_json["items"][0]["publishedAt"], "%Y-%m-%dT%H:%M:%SZ" 129 | ), 130 | f"https://www.youtube.com/watch?v={response_json['items'][0]['id']}", 131 | ) 132 | ) 133 | return result 134 | 135 | def _transform_to_pandas_dataframe(result): 136 | df = pd.DataFrame( 137 | result, 138 | columns=[ 139 | "created_at", 140 | "videoId", 141 | "title", 142 | "viewCount", 143 | "likeCount", 144 | "dislikeCount", 145 | "favoriteCount", 146 | "commentCount", 147 | ], 148 | ) 149 | return df 150 | 151 | def _insert_to_bq(df, tablename): 152 | TABLE = f"PROJECT.{tablename}" 153 | job = client.load_table_from_dataframe(df, TABLE) 154 | job.result() 155 | 156 | ( 157 | client, 158 | http_conn, 159 | execution_date, 160 | task_instance, 161 | datatype, 162 | video_metadatas, 163 | result, 164 | ) = _init() 165 | 166 | if datatype == "statistics": 167 | tablename = "ods.ods_youtubeStatistics_videoId_datetime" 168 | result = _get_statistics() 169 | elif datatype == "info": 170 | tablename = "ods.ods_youtubeInfo_videoId_datetime" 171 | result = _get_info() 172 | else: 173 | raise RuntimeError(f"Unsupported datatype: {datatype}") 174 | 175 | df = _transform_to_pandas_dataframe(result) 176 | _insert_to_bq(df, tablename) 177 | -------------------------------------------------------------------------------- /dags/utils/hook_related.py: -------------------------------------------------------------------------------- 1 | import tenacity 2 | 3 | RETRY_ARGS = { 4 | "stop": tenacity.stop_after_attempt(10), 5 | "wait": tenacity.wait_fixed(120), 6 | "reraise": True, 7 | } 8 | -------------------------------------------------------------------------------- /docker-compose-dev.yml: -------------------------------------------------------------------------------- 1 | x-docker-common: &docker-common 2 | env_file: .env.staging 3 | image: pycon-etl 4 | build: 5 | context: . 
6 | dockerfile: Dockerfile.test 7 | volumes: 8 | - ./airflow.db:/opt/airflow/airflow.db 9 | # you can comment out the following line if you don't have service-account.json 10 | - ./service-account.json:/opt/airflow/service-account.json 11 | restart: unless-stopped 12 | logging: 13 | driver: json-file 14 | options: 15 | max-size: 10m 16 | 17 | services: 18 | airflow: 19 | <<: *docker-common 20 | container_name: airflow 21 | ports: 22 | - "8080:8080" 23 | command: webserver 24 | 25 | scheduler: 26 | <<: *docker-common 27 | container_name: scheduler 28 | depends_on: 29 | - airflow 30 | command: scheduler 31 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | x-docker-common: &docker-common 2 | env_file: .env.production 3 | image: asia-east1-docker.pkg.dev/pycontw-225217/data-team/pycon-etl:latest 4 | volumes: 5 | - ./service-account.json:/opt/airflow/service-account.json 6 | - ./airflow.db:/opt/airflow/airflow.db 7 | restart: unless-stopped 8 | logging: 9 | driver: json-file 10 | options: 11 | max-size: 10m 12 | 13 | services: 14 | airflow: 15 | <<: *docker-common 16 | container_name: airflow 17 | ports: 18 | - "8080:8080" 19 | command: webserver 20 | 21 | scheduler: 22 | <<: *docker-common 23 | container_name: scheduler 24 | depends_on: 25 | - airflow 26 | command: scheduler 27 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guide 2 | 3 | ## How to Contribute to this Project 4 | 5 | 1. Clone this repository: 6 | 7 | ```bash 8 | git clone https://github.com/pycontw/pycon-etl 9 | ``` 10 | 11 | 2. Create a new branch: 12 | 13 | ```bash 14 | git checkout -b 15 | ``` 16 | 17 | 3. Make your changes. 18 | 19 | > **NOTICE:** We are still using Airflow v1, so please read the official document [Apache Airflow v1.10.15 Documentation](https://airflow.apache.org/docs/apache-airflow/1.10.15/) to ensure your changes are compatible with our current version. 20 | 21 | If your task uses an external service, add the connection and variable in the Airflow UI. 22 | 23 | 4. Test your changes in your local environment: 24 | 25 | - Ensure the DAG file is loaded successfully. 26 | - Verify that the task runs successfully. 27 | - Confirm that your code is correctly formatted and linted. 28 | - Check that all necessary dependencies are included in `requirements.txt`. 29 | 30 | 5. Push your branch: 31 | 32 | ```bash 33 | git push origin 34 | ``` 35 | 36 | 6. Create a Pull Request (PR). 37 | 38 | 7. Wait for the review and merge. 39 | 40 | 8. Write any necessary documentation. 41 | 42 | ## Release Management 43 | 44 | Please use [GitLab Flow](https://about.gitlab.com/topics/version-control/what-is-gitlab-flow/); otherwise, you cannot pass Docker Hub CI. 45 | 46 | ## Dependency Management 47 | 48 | Airflow dependencies are managed by [uv]. For more information, refer to the [Airflow Installation Documentation](https://airflow.apache.org/docs/apache-airflow/1.10.15/installation.html). 49 | 50 | ## Code Convention 51 | 52 | ### Airflow DAG 53 | 54 | - Please refer to [this article](https://medium.com/@davidtnfsh/%E5%A4%A7%E6%95%B0%E6%8D%AE%E4%B9%8B%E8%B7%AF-%E9%98%BF%E9%87%8C%E5%B7%B4%E5%B7%B4%E5%A4%A7%E6%95%B0%E6%8D%AE%E5%AE%9E%E8%B7%B5-%E8%AE%80%E6%9B%B8%E5%BF%83%E5%BE%97-54e795c2b8c) for naming guidelines. 
55 | 56 | - Examples: 57 | 1. `ods/opening_crawler`: Crawlers written by @Rain. These openings can be used for the recruitment board, which was implemented by @tai271828 and @stacy. 58 | 2. `ods/survey_cake`: A manually triggered uploader that uploads questionnaires to BigQuery. The uploader should be invoked after we receive the SurveyCake questionnaire. 59 | 60 | - Table name convention: 61 | ![img](https://miro.medium.com/max/1400/1*bppuEKMnL9gFnvoRHUO8CQ.png) 62 | 63 | ### Format 64 | 65 | Please use `make format` to format your code before committing; otherwise, the CI will fail. 66 | 67 | ### Commit Message 68 | 69 | It is recommended to use [Commitizen](https://commitizen-tools.github.io/commitizen/). 70 | 71 | ### CI/CD 72 | 73 | Please check the [.github/workflows](.github/workflows) directory for details. 74 | 75 | [uv]: https://docs.astral.sh/uv/ -------------------------------------------------------------------------------- /docs/DEPLOYMENT.md: -------------------------------------------------------------------------------- 1 | # Deployment Guide 2 | 3 | 1. Log in to the data team's server: 4 | 1. Run: `gcloud compute ssh --zone "asia-east1-b" "data-team" --project "pycontw-225217"` 5 | 2. Services: 6 | * ETL: `/srv/pycon-etl` 7 | * Metabase: `/mnt/disks/data-team-additional-disk/pycontw-infra-scripts/data_team/metabase_server` 8 | 9 | 2. Pull the latest codebase to this server: `git pull` 10 | 11 | 3. Add credentials to the `.env.production` file (only needs to be done once). 12 | 13 | 4. Start the services: 14 | 15 | ```bash 16 | # Start production services 17 | docker-compose -f ./docker-compose.yml up 18 | 19 | # Stop production services 20 | # docker-compose -f ./docker-compose.yml down 21 | ``` -------------------------------------------------------------------------------- /docs/MAINTENANCE.md: -------------------------------------------------------------------------------- 1 | # Maintenance Guide 2 | 3 | ## Disk Space 4 | 5 | Disk space on the server is currently limited, so please check it before running any ETL jobs. 6 | 7 | This section will be deprecated if we no longer encounter out-of-disk issues. 8 | 9 | 1. Find the largest folders: 10 | ```bash 11 | du -a /var/lib/docker/overlay2 | sort -n -r | head -n 20 12 | ``` 13 | 2. Show the size of a specific folder: 14 | ```bash 15 | du -hs <folder> 16 | ``` 17 | 3. Delete the large folders identified. 18 | 4. Check disk space: 19 | ```bash 20 | df -h 21 | ``` 22 | 23 | ## Token Expiration 24 | 25 | Some API tokens might expire, so please check them regularly (see the sketch at the end of this guide for one way to review them). 26 | 27 | ## Year-to-Year Jobs 28 | 29 | Please refer to [Dev Data Team - Year to Year Jobs - HackMD](https://hackmd.io/R417olqPQSWnQYY1Oc_-Sw?view) for more details.
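As a starting point for the token review mentioned in the Token Expiration section above, the commands below show one way to inspect the tokens that live in Airflow connections and variables. This is only a sketch: it assumes the containers are running with the names defined in `docker-compose.yml` (`airflow`, `scheduler`), it uses the Airflow 1.10 CLI syntax, and the variable shown (`YOUTUBE_KEY`) is just an example; check whichever variables your own DAGs read.

```bash
# List all Airflow connections so the ones with expiring credentials can be reviewed.
docker exec scheduler airflow connections --list

# Spot-check a single Airflow variable that stores an API key (example: the YouTube DAG).
# Note: this prints the secret value, so run it only on the server.
docker exec scheduler airflow variables --get YOUTUBE_KEY
```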
30 | -------------------------------------------------------------------------------- /docs/airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/docs/airflow.png -------------------------------------------------------------------------------- /docs/kktix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/docs/kktix.png -------------------------------------------------------------------------------- /docs/youtube-connection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/docs/youtube-connection.png -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Exit script on first error 3 | set -e 4 | 5 | # Check if the AIRFLOW_HOME variable is set 6 | if [ -z "${AIRFLOW_HOME}" ]; then 7 | echo 'AIRFLOW_HOME not set' 8 | exit 1 9 | fi 10 | 11 | # Create Fernet key if not exists 12 | if [ -z "${AIRFLOW__CORE__FERNET_KEY}" ]; then 13 | echo "Fernet key not set. Generating a new one." 14 | export AIRFLOW__CORE__FERNET_KEY=$(python -c 'from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())') 15 | echo "Fernet key generated and set." 16 | echo "[WARNING] Please save the AIRFLOW__CORE__FERNET_KEY for future use." 17 | else 18 | echo "Fernet key exists." 19 | fi 20 | 21 | # Check if the database exists and initialize it if not 22 | if [ ! -f "${AIRFLOW_HOME}/airflow.db" ]; then 23 | airflow db init 24 | echo 'Database initialized' 25 | else 26 | echo 'Database existed' 27 | fi 28 | 29 | # Check if the GCP service account is provided 30 | if [ -z "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then 31 | echo "No GCP service account provided, set to default path" 32 | export GOOGLE_APPLICATION_CREDENTIALS="${AIRFLOW_HOME}/service-account.json" 33 | fi 34 | 35 | # Check if the command is provided 36 | if [ -z "$1" ]; then 37 | echo "No command provided. Usage: $0 {airflow_command}" 38 | exit 1 39 | fi 40 | 41 | # Execute the provided Airflow command 42 | echo "Running command: airflow $@" 43 | exec airflow "$@" 44 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pycon-etl" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.8,<3.9" 7 | dependencies = [ 8 | # Please use pip to manage airflow dependencies. 9 | "apache-airflow==1.10.15", 10 | # Editable install with no version specified. 
11 | "google-cloud-bigquery", 12 | "pandas", 13 | "pyarrow", 14 | "pydantic<2", 15 | "pygsheets", 16 | "requests", 17 | "searchconsole", 18 | ] 19 | 20 | [dependency-groups] 21 | dev = [ 22 | "bandit", 23 | "ruff", 24 | "mypy", 25 | "pytest", 26 | "pytest-cov", 27 | "coverage[toml]", 28 | "safety", 29 | ] 30 | 31 | 32 | [tool.ruff] 33 | line-length = 88 34 | 35 | [tool.ruff.lint] 36 | extend-select = [ 37 | "I", # Missing required import (auto-fixable) 38 | "UP", # Pyupgrade 39 | "PT", # flake8-pytest-style rules 40 | "TID25", # flake8-tidy-imports rules 41 | ] 42 | 43 | ignore = ["E501", "D1", "D415"] 44 | 45 | [tool.ruff.lint.isort] 46 | combine-as-imports = true 47 | known-first-party = ["tests"] 48 | 49 | [tool.ruff.lint.pydocstyle] 50 | convention = "google" 51 | 52 | 53 | [tool.coverage] 54 | [tool.coverage.report] 55 | show_missing = true 56 | exclude_lines = [ 57 | # Have to re-enable the standard pragma 58 | 'pragma: no cover', 59 | 60 | # Don't complain about missing debug-only code: 61 | 'def __repr__', 62 | 'if self\.debug', 63 | 64 | # Don't complain if tests don't hit defensive assertion code: 65 | 'raise AssertionError', 66 | 'raise NotImplementedError', 67 | 68 | # Don't complain if non-runnable code isn't run: 69 | 'if 0:', 70 | 'if __name__ == .__main__.:', 71 | ] 72 | omit = ['env/*', 'venv/*', '*/virtualenv/*', '*/virtualenvs/*', '*/tests/*'] 73 | 74 | [tool.uv] 75 | constraint-dependencies = [ 76 | # constraints-3.8.txt for apache-airflow==1.10.15 77 | # Editable install with no version control (apache-airflow==1.10.15) 78 | "Babel==2.9.0", 79 | "Flask-Admin==1.5.4", 80 | "Flask-AppBuilder==2.3.4", 81 | "Flask-Babel==1.0.0", 82 | "Flask-Bcrypt==0.7.1", 83 | "Flask-Caching==1.3.3", 84 | "Flask-JWT-Extended==3.25.0", 85 | "Flask-Login==0.4.1", 86 | "Flask-OpenID==1.3.0", 87 | "Flask-SQLAlchemy==2.4.4", 88 | "Flask-WTF==0.14.3", 89 | "Flask==1.1.2", 90 | "GitPython==3.1.11", 91 | "JPype1==0.7.1", 92 | "JayDeBeApi==1.2.3", 93 | "Jinja2==2.11.2", 94 | "Mako==1.1.3", 95 | "Markdown==2.6.11", 96 | "MarkupSafe==1.1.1", 97 | "PyHive==0.6.3", 98 | "PyJWT==1.7.1", 99 | "PyNaCl==1.4.0", 100 | "PySmbClient==0.1.5", 101 | "PyYAML==5.3.1", 102 | "Pygments==2.7.2", 103 | "SQLAlchemy-JSONField==0.9.0", 104 | "SQLAlchemy-Utils==0.36.8", 105 | "SQLAlchemy==1.3.20", 106 | "Sphinx==3.3.1", 107 | "Unidecode==1.1.1", 108 | "WTForms==2.3.3", 109 | "Werkzeug==0.16.1", 110 | "adal==1.2.5", 111 | "aiohttp==3.7.3", 112 | "alabaster==0.7.12", 113 | "alembic==1.4.3", 114 | "amqp==2.6.1", 115 | "analytics-python==1.2.9", 116 | "ansiwrap==0.8.4", 117 | "apispec==1.3.3", 118 | "appdirs==1.4.4", 119 | "argcomplete==1.12.2", 120 | "asn1crypto==1.4.0", 121 | "astroid==2.4.2", 122 | "async-generator==1.10", 123 | "async-timeout==3.0.1", 124 | "atlasclient==1.0.0", 125 | "attrs==20.3.0", 126 | "aws-sam-translator==1.31.0", 127 | "aws-xray-sdk==2.6.0", 128 | "azure-common==1.1.26", 129 | "azure-core==1.9.0", 130 | "azure-cosmos==3.2.0", 131 | "azure-datalake-store==0.0.51", 132 | "azure-identity==1.5.0", 133 | "azure-keyvault-certificates==4.2.1", 134 | "azure-keyvault-keys==4.3.0", 135 | "azure-keyvault-secrets==4.2.0", 136 | "azure-keyvault==4.1.0", 137 | "azure-mgmt-containerinstance==1.5.0", 138 | "azure-mgmt-core==1.2.2", 139 | "azure-mgmt-datalake-nspkg==3.0.1", 140 | "azure-mgmt-datalake-store==0.5.0", 141 | "azure-mgmt-nspkg==3.0.2", 142 | "azure-mgmt-resource==15.0.0", 143 | "azure-nspkg==3.0.2", 144 | "azure-storage-blob==12.6.0", 145 | "azure-storage-common==2.1.0", 146 | "azure-storage==0.36.0", 
147 | "backcall==0.2.0", 148 | "bcrypt==3.2.0", 149 | "beautifulsoup4==4.7.1", 150 | "billiard==3.6.3.0", 151 | "black==20.8b0", 152 | "blinker==1.4", 153 | "boto3==1.10.50", 154 | "boto==2.49.0", 155 | "botocore==1.13.50", 156 | "cached-property==1.5.2", 157 | "cachetools==4.1.1", 158 | "cassandra-driver==3.20.2", 159 | "cattrs==1.1.2", 160 | "celery==4.4.7", 161 | "certifi==2020.11.8", 162 | "cffi==1.14.4", 163 | "cfgv==3.2.0", 164 | "cfn-lint==0.42.0", 165 | "cgroupspy==0.1.6", 166 | "chardet==3.0.4", 167 | "click==6.7", 168 | "cloudant==0.5.10", 169 | "colorama==0.4.4", 170 | "colorlog==4.0.2", 171 | "configparser==3.5.3", 172 | "coverage==5.3", 173 | "croniter==0.3.36", 174 | "cryptography==3.2.1", 175 | "cx-Oracle==8.0.1", 176 | "datadog==0.39.0", 177 | "decorator==4.4.2", 178 | "defusedxml==0.6.0", 179 | "dill==0.3.3", 180 | "distlib==0.3.1", 181 | "dnspython==1.16.0", 182 | "docker-pycreds==0.4.0", 183 | "docker==3.7.3", 184 | "docopt==0.6.2", 185 | "docutils==0.15.2", 186 | "ecdsa==0.14.1", 187 | "elasticsearch-dsl==5.4.0", 188 | "elasticsearch==5.5.3", 189 | "email-validator==1.1.2", 190 | "entrypoints==0.3", 191 | "fastavro==1.2.0", 192 | "filelock==3.0.12", 193 | "flake8-colors==0.1.9", 194 | "flake8==3.8.4", 195 | "flaky==3.7.0", 196 | "flask-swagger==0.2.14", 197 | "flower==0.9.5", 198 | "freezegun==1.0.0", 199 | "fsspec==0.8.4", 200 | "funcsigs==1.0.2", 201 | "future-fstrings==1.2.0", 202 | "future==0.18.2", 203 | "gcsfs==0.7.1", 204 | "gitdb==4.0.5", 205 | "google-api-core==1.23.0", 206 | "google-api-python-client==1.12.8", 207 | "google-auth-httplib2==0.0.4", 208 | "google-auth-oauthlib==0.4.2", 209 | "google-auth==1.23.0", 210 | "google-cloud-bigquery-storage==2.1.0", 211 | "google-cloud-bigquery==2.4.0", 212 | "google-cloud-bigtable==1.6.0", 213 | "google-cloud-container==1.0.1", 214 | "google-cloud-core==1.4.3", 215 | "google-cloud-dlp==1.0.0", 216 | "google-cloud-language==1.3.0", 217 | "google-cloud-secret-manager==1.0.0", 218 | "google-cloud-spanner==1.19.1", 219 | "google-cloud-speech==1.3.2", 220 | "google-cloud-storage==1.33.0", 221 | "google-cloud-texttospeech==1.0.1", 222 | "google-cloud-translate==1.7.0", 223 | "google-cloud-videointelligence==1.16.1", 224 | "google-cloud-vision==1.0.0", 225 | "google-crc32c==1.0.0", 226 | "google-resumable-media==1.1.0", 227 | "googleapis-common-protos==1.52.0", 228 | "graphviz==0.15", 229 | "grpc-google-iam-v1==0.12.3", 230 | "grpcio-gcp==0.2.2", 231 | "grpcio==1.33.2", 232 | "gunicorn==20.0.4", 233 | "hdfs==2.5.8", 234 | "hmsclient==0.1.1", 235 | "httplib2==0.18.1", 236 | "humanize==3.1.0", 237 | "hvac==0.10.5", 238 | "identify==1.5.10", 239 | "idna==2.8", 240 | "imagesize==1.2.0", 241 | "importlib-metadata==2.1.1", 242 | "importlib-resources==1.5.0", 243 | "inflection==0.5.1", 244 | "ipdb==0.13.4", 245 | "ipython-genutils==0.2.0", 246 | "ipython==7.19.0", 247 | "iso8601==0.1.13", 248 | "isodate==0.6.0", 249 | "itsdangerous==1.1.0", 250 | "jedi==0.17.2", 251 | "jira==2.0.0", 252 | "jmespath==0.10.0", 253 | "json-merge-patch==0.2", 254 | "jsondiff==1.1.2", 255 | "jsonpatch==1.27", 256 | "jsonpickle==1.4.1", 257 | "jsonpointer==2.0", 258 | "jsonschema==3.2.0", 259 | "junit-xml==1.9", 260 | "jupyter-client==6.1.7", 261 | "jupyter-core==4.7.0", 262 | "kombu==4.6.11", 263 | "kubernetes==11.0.0", 264 | "lazy-object-proxy==1.4.3", 265 | "ldap3==2.8.1", 266 | "libcst==0.3.14", 267 | "lockfile==0.12.2", 268 | "marshmallow-enum==1.5.1", 269 | "marshmallow-sqlalchemy==0.23.1", 270 | "marshmallow==2.21.0", 271 | "mccabe==0.6.1", 272 | 
"mock==4.0.2", 273 | "mongomock==3.21.0", 274 | "more-itertools==8.6.0", 275 | "moto==1.3.14", 276 | "msal-extensions==0.3.0", 277 | "msal==1.6.0", 278 | "msrest==0.6.19", 279 | "msrestazure==0.6.4", 280 | "multi-key-dict==2.0.3", 281 | "multidict==5.0.2", 282 | "mypy-extensions==0.4.3", 283 | "mypy==0.720", 284 | "mysqlclient==1.3.14", 285 | "natsort==7.1.0", 286 | "nbclient==0.5.1", 287 | "nbformat==5.0.8", 288 | "nest-asyncio==1.4.3", 289 | "networkx==2.5", 290 | "nodeenv==1.5.0", 291 | "nteract-scrapbook==0.4.1", 292 | "ntlm-auth==1.5.0", 293 | "numpy==1.19.4", 294 | "oauthlib==3.1.0", 295 | "oscrypto==1.2.1", 296 | "packaging==20.7", 297 | "pandas-gbq==0.14.1", 298 | "pandas==1.1.4", 299 | "papermill==2.2.2", 300 | "parameterized==0.7.4", 301 | "paramiko==2.7.2", 302 | "parso==0.7.1", 303 | "pathspec==0.8.1", 304 | "pbr==5.5.1", 305 | "pendulum==1.4.4", 306 | "pexpect==4.8.0", 307 | "pickleshare==0.7.5", 308 | "pinotdb==0.1.1", 309 | "pipdeptree==1.0.0", 310 | "pluggy==0.13.1", 311 | "portalocker==1.7.1", 312 | "pre-commit==2.9.2", 313 | "presto-python-client==0.7.0", 314 | "prison==0.1.3", 315 | "prometheus-client==0.8.0", 316 | "prompt-toolkit==3.0.8", 317 | "proto-plus==1.11.0", 318 | "protobuf==3.14.0", 319 | "psutil==5.7.3", 320 | "psycopg2-binary==2.8.6", 321 | "ptyprocess==0.6.0", 322 | "py==1.9.0", 323 | "pyOpenSSL==20.0.0", 324 | "pyarrow==0.17.1", 325 | "pyasn1-modules==0.2.8", 326 | "pyasn1==0.4.8", 327 | "pycodestyle==2.6.0", 328 | "pycparser==2.20", 329 | "pycryptodomex==3.9.9", 330 | "pydata-google-auth==1.1.0", 331 | "pydruid==0.5.8", 332 | "pyflakes==2.2.0", 333 | "pykerberos==1.2.1", 334 | "pymongo==3.10.1", 335 | "pyparsing==2.4.7", 336 | "pyrsistent==0.17.3", 337 | "pysftp==0.2.9", 338 | "pytest-cov==2.10.1", 339 | "pytest-instafail==0.4.2", 340 | "pytest-timeouts==1.2.1", 341 | "pytest==5.4.3", 342 | "python-daemon==2.2.4", 343 | "python-dateutil==2.8.1", 344 | "python-editor==1.0.4", 345 | "python-http-client==3.3.1", 346 | "python-jenkins==1.7.0", 347 | "python-jose==3.2.0", 348 | "python-nvd3==0.15.0", 349 | "python-slugify==4.0.1", 350 | "python3-openid==3.2.0", 351 | "pytz==2020.4", 352 | "pytzdata==2020.1", 353 | "pywinrm==0.4.1", 354 | "pyzmq==20.0.0", 355 | "qds-sdk==1.16.1", 356 | "redis==3.5.3", 357 | "regex==2020.11.13", 358 | "requests-futures==0.9.4", 359 | "requests-kerberos==0.12.0", 360 | "requests-mock==1.8.0", 361 | "requests-ntlm==1.1.0", 362 | "requests-oauthlib==1.3.0", 363 | "requests-toolbelt==0.9.1", 364 | "requests==2.23.0", 365 | "responses==0.12.1", 366 | "rsa==4.6", 367 | "s3transfer==0.2.1", 368 | "sasl==0.2.1", 369 | "sendgrid==5.6.0", 370 | "sentinels==1.0.0", 371 | "sentry-sdk==0.19.4", 372 | "setproctitle==1.2", 373 | "six==1.15.0", 374 | "slackclient==1.3.2", 375 | "smmap==3.0.4", 376 | "snakebite-py3==3.0.5", 377 | "snowballstemmer==2.0.0", 378 | "snowflake-connector-python==2.3.6", 379 | "snowflake-sqlalchemy==1.2.4", 380 | "soupsieve==2.0.1", 381 | "sphinx-argparse==0.2.5", 382 | "sphinx-autoapi==1.0.0", 383 | "sphinx-copybutton==0.3.1", 384 | "sphinx-jinja==1.1.1", 385 | "sphinx-rtd-theme==0.5.0", 386 | "sphinxcontrib-applehelp==1.0.2", 387 | "sphinxcontrib-devhelp==1.0.2", 388 | "sphinxcontrib-dotnetdomain==0.4", 389 | "sphinxcontrib-golangdomain==0.2.0.dev0", 390 | "sphinxcontrib-htmlhelp==1.0.3", 391 | "sphinxcontrib-httpdomain==1.7.0", 392 | "sphinxcontrib-jsmath==1.0.1", 393 | "sphinxcontrib-qthelp==1.0.3", 394 | "sphinxcontrib-serializinghtml==1.1.4", 395 | "sshpubkeys==3.1.0", 396 | "sshtunnel==0.1.5", 397 | 
"tabulate==0.8.7", 398 | "tenacity==4.12.0", 399 | "text-unidecode==1.3", 400 | "textwrap3==0.9.2", 401 | "thrift-sasl==0.4.2", 402 | "thrift==0.13.0", 403 | "toml==0.10.2", 404 | "tornado==5.1.1", 405 | "tqdm==4.54.0", 406 | "traitlets==5.0.5", 407 | "typed-ast==1.4.1", 408 | "typing-extensions==3.7.4.3", 409 | "typing-inspect==0.6.0", 410 | "tzlocal==1.5.1", 411 | "unicodecsv==0.14.1", 412 | "uritemplate==3.0.1", 413 | "urllib3==1.25.11", 414 | "vertica-python==1.0.0", 415 | "vine==1.3.0", 416 | "virtualenv==20.2.1", 417 | "wcwidth==0.2.5", 418 | "websocket-client==0.54.0", 419 | "wrapt==1.12.1", 420 | "xmltodict==0.12.0", 421 | "yamllint==1.25.0", 422 | "yarl==1.6.3", 423 | "zdesk==2.7.1", 424 | "zipp==3.4.0", 425 | "zope.deprecation==4.4.0", 426 | ] 427 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [mypy] 2 | files = dags, tests 3 | ignore_missing_imports = true 4 | follow_imports = silent 5 | warn_redundant_casts = True 6 | warn_unused_ignores = True 7 | warn_unused_configs = True 8 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture 5 | def kktix_api_data(): 6 | return [ 7 | { 8 | "id": 84296, 9 | "name": "PyCon APAC 2022 Registration: Individual【Online Conference】", 10 | "attendee_info": { 11 | "id": 84748358, 12 | "ticket_id": 449116, 13 | "ticket_name": "Regular 一般票(with Pyckage)", 14 | "reg_no": 104, 15 | "state": "activated", 16 | "checkin_code": "BC7B", 17 | "qrcode": "bc7bd846f49d2d2e1g833cc92gdg2cf9", 18 | "is_paid": True, 19 | "price": 2600, 20 | "currency": "TWD", 21 | "payment_method": "WEBSITE", 22 | "data": [ 23 | ["Nickname / 暱稱", "Stanley"], 24 | ["Gender / 生理性別", "Male / 男性"], 25 | [ 26 | "If you buy the ticket with PySafe, remember to fill out correct address and size of t-shirt for us to send the parcel. if you fill the wrong information to cause missed delivery, we will not resend th", 27 | "", 28 | ], 29 | [ 30 | "購買含 Pyckage 票卷者,請務必填寫正確之「Address / 收件地址」和「Size of T-shirt / T恤尺寸 」(僅限台灣及離島區域),以避免 Pyckage 無法送達,如因填寫錯誤致未收到 Pyckage ,報名人須自行負責,大會恕不再另行補寄", 31 | "", 32 | ], 33 | [ 34 | "Address / 收件地址 Ex: No. 128, Sec. 2, Academia Rd., Nangang Dist., Taipei City 115201, Taiwan (R.O.C.) 
/ 115台北市南港區研究院路二段128號", 35 | "新竹市北區天府路一段162號4樓之3", 36 | ], 37 | [ 38 | "Size of T-shirt / T恤尺寸", 39 | "M / 胸寬(F.W.): 49cm / 衣長(C.L.): 70cm", 40 | ], 41 | ["Come From / 國家或地區", "Taiwan 台灣"], 42 | ["Age range / 年齡區間", "36 - 45"], 43 | [ 44 | 'Job Title / 職稱 (If you are a student, fill in "student")', 45 | "全端工程師", 46 | ], 47 | [ 48 | "Company / 服務單位 (For students or teachers, fill in the School + Department Name)", 49 | "雲灣資訊有限公司", 50 | ], 51 | ["Years of Using Python / 使用 Python 多久", "6-10 years"], 52 | [ 53 | "Area of Interest / 興趣領域", 54 | "Web Development, DevOps, Engineering & Mathematics", 55 | ], 56 | [ 57 | "Have you ever attended PyCon TW?/ 是否曾參加 PyCon TW?", 58 | "5-7 times", 59 | ], 60 | [ 61 | "Would you like to receive an email from sponsors?/ 是否願意收到贊助商轉發 Email 訊息?", 62 | "Yes", 63 | ], 64 | [ 65 | "I would like to donate invoice to Open Culture Foundation / 我願意捐贈發票給開放文化基金會 (ref: https://reurl.cc/ZQ6VY6)", 66 | "No", 67 | ], 68 | [ 69 | "Privacy Policy of PyCon APAC 2022 / PyCon APAC 2022 個人資料保護聲明", 70 | "", 71 | ], 72 | [ 73 | "I’ve already read and I accept the Privacy Policy of PyCon APAC 2022 / 我已閱讀並同意 PyCon APAC 2022 個人資料保護聲明", 74 | "Yes", 75 | ], 76 | [ 77 | "I am fully aware of the Gather Privacy Policy, only participants that are over the age of 18 can access to the venue / 我已被告知因為 gather 政策,需滿18歲以上方能登入會議場地", 78 | "", 79 | ], 80 | ["聯絡人 姓名", "李xx"], 81 | ["聯絡人 Email", "xxx@gmail.com"], 82 | ["聯絡人 手機", "0900000000"], 83 | ["標籤", ""], 84 | ], 85 | "kyc": {}, 86 | "id_number": None, 87 | "search_string": "Stanley\nMale", 88 | "updated_at": 1656921502.5667331, 89 | "ticket_type": "qrcode", 90 | "slot": {}, 91 | "order_no": 127666621, 92 | }, 93 | } 94 | ] 95 | -------------------------------------------------------------------------------- /tests/data_questionnaire.csv: -------------------------------------------------------------------------------- 1 | 你是從哪裡過來呢?,填答時間,填答秒數,IP紀錄,額滿結束註記,使用者紀錄,會員時間,會員編號,自訂ID,備註 2 | 台灣南部,"2021-07-03 15:34:50",311,36.226.4.68,,,,',, 3 | 台灣北部,"2021-07-03 16:13:18",5,36.226.4.68,,,,',, 4 | 其他,"2021-07-03 16:55:40",8,2001:b400:e23c:9eb4:90b4:98ef:33:8eee,,,,',, 5 | -------------------------------------------------------------------------------- /tests/kktix_ticket_orders/test_klaviyo_loader.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | from dags.ods.kktix_ticket_orders.udfs import klaviyo_loader 4 | 5 | 6 | def fake_airflow_variable(): 7 | return { 8 | "KLAVIYO_LIST_ID": "abc", 9 | "KLAVIYO_CAMPAIGN_ID": "123", 10 | } 11 | 12 | 13 | @patch("dags.ods.kktix_ticket_orders.udfs.klaviyo_loader.klaviyo_mailer.main") 14 | @patch( 15 | "dags.ods.kktix_ticket_orders.udfs.klaviyo_loader.Variable", 16 | new_callable=fake_airflow_variable, 17 | ) 18 | def test_klaviyo_loader(variable, mailer, kktix_api_data): 19 | klaviyo_loader.load(kktix_api_data) 20 | mailer.assert_called_once_with( 21 | list_id="abc", 22 | campaign_id="123", 23 | campaign_name="隨買即用", 24 | datas=[ 25 | { 26 | "email": "xxx@gmail.com", 27 | "name": "李xx", 28 | "qrcode": "bc7bd846f49d2d2e1g833cc92gdg2cf9", 29 | } 30 | ], 31 | ) 32 | -------------------------------------------------------------------------------- /tests/kktix_ticket_orders/test_transformer.py: -------------------------------------------------------------------------------- 1 | """ 2 | test transformer 3 | """ 4 | 5 | from dags.ods.kktix_ticket_orders.udfs.kktix_transformer import transform 6 | 7 | 8 | def test_transform(kktix_api_data) -> None: 9 
| ground_truth = [ 10 | { 11 | "id": 84296, 12 | "name": "PyCon APAC 2022 Registration: Individual【Online Conference】", 13 | "attendee_info": { 14 | "id": 84748358, 15 | "ticket_id": 449116, 16 | "ticket_name": "Regular 一般票(with Pyckage)", 17 | "reg_no": 104, 18 | "state": "activated", 19 | "checkin_code": "BC7B", 20 | "qrcode": "bc7bd846f49d2d2e1g833cc92gdg2cf9", 21 | "is_paid": True, 22 | "price": 2600, 23 | "currency": "TWD", 24 | "payment_method": "WEBSITE", 25 | "data": [ 26 | ["Nickname / 暱稱", "Stanley"], 27 | ["Gender / 生理性別", "Male / 男性"], 28 | [ 29 | "If you buy the ticket with PySafe, remember to fill out correct address and size of t-shirt for us to send the parcel. if you fill the wrong information to cause missed delivery, we will not resend th", 30 | "", 31 | ], 32 | [ 33 | "購買含 Pyckage 票卷者,請務必填寫正確之「Address / 收件地址」和「Size of T-shirt / T恤尺寸 」(僅限台灣及離島區域),以避免 Pyckage 無法送達,如因填寫錯誤致未收到 Pyckage ,報名人須自行負責,大會恕不再另行補寄", 34 | "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", 35 | ], 36 | [ 37 | "Address / 收件地址 Ex: No. 128, Sec. 2, Academia Rd., Nangang Dist., Taipei City 115201, Taiwan (R.O.C.) / 115台北市南港區研究院路二段128號", 38 | "36190b79eb7396cfb91e413fecef9707bca87f32012fb01fc38caa236fb053d0", 39 | ], 40 | [ 41 | "Size of T-shirt / T恤尺寸", 42 | "M / 胸寬(F.W.): 49cm / 衣長(C.L.): 70cm", 43 | ], 44 | ["Come From / 國家或地區", "Taiwan 台灣"], 45 | ["Age range / 年齡區間", "36 - 45"], 46 | [ 47 | 'Job Title / 職稱 (If you are a student, fill in "student")', 48 | "全端工程師", 49 | ], 50 | [ 51 | "Company / 服務單位 (For students or teachers, fill in the School + Department Name)", 52 | "雲灣資訊有限公司", 53 | ], 54 | ["Years of Using Python / 使用 Python 多久", "6-10 years"], 55 | [ 56 | "Area of Interest / 興趣領域", 57 | "Web Development, DevOps, Engineering & Mathematics", 58 | ], 59 | [ 60 | "Have you ever attended PyCon TW?/ 是否曾參加 PyCon TW?", 61 | "5-7 times", 62 | ], 63 | [ 64 | "Would you like to receive an email from sponsors?/ 是否願意收到贊助商轉發 Email 訊息?", 65 | "Yes", 66 | ], 67 | [ 68 | "I would like to donate invoice to Open Culture Foundation / 我願意捐贈發票給開放文化基金會 (ref: https://reurl.cc/ZQ6VY6)", 69 | "No", 70 | ], 71 | [ 72 | "Privacy Policy of PyCon APAC 2022 / PyCon APAC 2022 個人資料保護聲明", 73 | "", 74 | ], 75 | [ 76 | "I’ve already read and I accept the Privacy Policy of PyCon APAC 2022 / 我已閱讀並同意 PyCon APAC 2022 個人資料保護聲明", 77 | "Yes", 78 | ], 79 | [ 80 | "I am fully aware of the Gather Privacy Policy, only participants that are over the age of 18 can access to the venue / 我已被告知因為 gather 政策,需滿18歲以上方能登入會議場地", 81 | "", 82 | ], 83 | [ 84 | "聯絡人 姓名", 85 | "2150750f32ee8dcd40537be8b5bee7c26e893a77cb23049eb3a0ca49a7512791", 86 | ], 87 | [ 88 | "聯絡人 Email", 89 | "26a695fcd9d98ffa1fba78cb5a1eacf0fbe19e40bf9de0cafa0080cdf4c14514", 90 | ], 91 | [ 92 | "聯絡人 手機", 93 | "86f3abfffd2f714a6d611429f82fac9264e8036b0fb490320bfe3e56c494a0e0", 94 | ], 95 | ["標籤", ""], 96 | ], 97 | "kyc": {}, 98 | "id_number": None, 99 | "updated_at": 1656921502.5667331, 100 | "ticket_type": "qrcode", 101 | "slot": {}, 102 | "order_no": 127666621, 103 | }, 104 | } 105 | ] 106 | if __debug__: 107 | if transform(kktix_api_data) != ground_truth: 108 | raise AssertionError( 109 | "Transform() might forget to de-identify some columns! e.g. 
name, email or phone number" 110 | ) 111 | -------------------------------------------------------------------------------- /tests/test_cakeresume_uploader.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from pathlib import Path 3 | 4 | from contrib.survey_cake.udfs.survey_cake_csv_uploader import SurveyCakeCSVUploader 5 | 6 | 7 | def test_cakeresume_uploader() -> None: 8 | fixtures = { 9 | "tests/data_questionnaire.csv": { 10 | "data_domain": "questionnaire", 11 | "primary_key": "ip", 12 | "time_dimension": "datetime", 13 | } 14 | } 15 | 16 | for filename, metadata in fixtures.items(): 17 | SURVEY_CAKE_CSV_UPLOADER = SurveyCakeCSVUploader(year=2146, filename=filename) 18 | SURVEY_CAKE_CSV_UPLOADER.transform() 19 | with open( 20 | Path("tests/data_questionnaire_dimension.csv") 21 | ) as data_questionnaire_dimension: 22 | rows = csv.reader(data_questionnaire_dimension) 23 | header = next(iter(rows)) 24 | if __debug__: 25 | if header != ["question_id", "question", "year"]: 26 | raise AssertionError("wrong header!") 27 | 28 | with open( 29 | Path("tests/data_questionnaire_facttable.csv") 30 | ) as data_questionnaire_facttable: 31 | rows = csv.reader(data_questionnaire_facttable) 32 | header = next(iter(rows)) 33 | if __debug__: 34 | if header != ["ip", "question_id", "answer", "year"]: 35 | raise AssertionError("wrong header!") 36 | -------------------------------------------------------------------------------- /tests/test_crawler.py: -------------------------------------------------------------------------------- 1 | """ 2 | test crawler 3 | """ 4 | 5 | from dags.ods.opening_crawler.udfs.crawlers import CakeResumeCrawler 6 | 7 | 8 | def test_demo() -> None: 9 | if __debug__: 10 | if CakeResumeCrawler.crawl() != "i'm a CakeResume crawler!": 11 | raise AssertionError("CakeResumeCrawler Error!") 12 | --------------------------------------------------------------------------------