├── .dockerignore ├── .env.template ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation.md │ └── feature_request.md ├── pull_request_template.md └── workflows │ ├── dockerimage.yml │ └── python.yml ├── .gitignore ├── Dockerfile ├── Dockerfile.test ├── Makefile ├── README.md ├── airflow.cfg ├── contrib ├── README.md ├── data │ ├── corporate-attendees-2018.csv │ ├── corporate-attendees-2019.csv │ ├── corporate-attendees-2020.csv │ ├── individual-attendees-2018.csv │ ├── individual-attendees-2019.csv │ ├── individual-attendees-2020.csv │ ├── reserved-attendees-2018.csv │ ├── reserved-attendees-2019.csv │ └── reserved-attendees-2020.csv ├── kktix_bq_etl.sh ├── survey_cake │ ├── udfs │ │ └── survey_cake_csv_uploader.py │ └── upload-survey-cake-csv-to-bigquery.py ├── upload-kktix-ticket-csv-to-bigquery.py └── upload-kktix-ticket-csv-to-bigquery.sh ├── dags ├── airflow-log-cleanup.py ├── airlfow-db-cleanup.py ├── app │ ├── channel_reminder │ │ ├── __init__.py │ │ ├── dag.py │ │ └── udf.py │ ├── discord.py │ ├── finance_bot │ │ ├── __init__.py │ │ ├── dag.py │ │ └── udf.py │ ├── proposal_reminder │ │ ├── __init__.py │ │ ├── dag.py │ │ └── udf.py │ ├── team_registration_bot │ │ ├── __init__.py │ │ ├── dag.py │ │ └── udf.py │ └── twitter_post_notification_bot │ │ ├── dag.py │ │ └── udf.py ├── dwd │ └── __init__.py ├── dws │ └── __init__.py ├── fixtures │ └── data_questionnaire.csv ├── ods │ ├── fb_post_insights │ │ ├── dag.py │ │ └── udfs.py │ ├── google_search_console │ │ ├── dag.py │ │ └── udfs │ │ │ └── google_search.py │ ├── ig_post_insights │ │ ├── dags.py │ │ └── udfs.py │ ├── kktix_ticket_orders │ │ ├── kktix_dag.py │ │ ├── kktix_refund_dag.py │ │ ├── klaviyo_backfill_dag.py │ │ ├── sqls │ │ │ └── create_table.sql │ │ └── udfs │ │ │ ├── batch_kktix2mailer.py │ │ │ ├── bigquery_loader.py │ │ │ ├── gather_town_loader.py │ │ │ ├── kktix_api.py │ │ │ ├── kktix_bq_dwd_etl.py │ │ │ ├── kktix_loader.py │ │ │ ├── kktix_refund.py │ │ │ ├── kktix_transformer.py │ │ │ ├── klaviyo_loader.py │ │ │ └── klaviyo_mailer.py │ ├── linkedin_post_insights │ │ ├── dags.py │ │ └── udfs.py │ ├── opening_crawler │ │ ├── dags │ │ │ └── cakeresume_crawler.py │ │ └── udfs │ │ │ └── crawlers.py │ ├── survey_cake │ │ ├── dags │ │ │ └── questionnaire_2_bigquery.py │ │ └── udfs │ │ │ └── survey_cake_csv_uploader.py │ ├── twitter_post_insights │ │ ├── dags.py │ │ └── udfs.py │ └── youtube │ │ ├── dags │ │ └── dag.py │ │ ├── sqls │ │ └── create_table.sql │ │ └── udfs │ │ └── youtube_api.py └── utils │ └── hook_related.py ├── docker-compose-dev.yml ├── docker-compose.yml ├── docs ├── CONTRIBUTING.md ├── DEPLOYMENT.md ├── MAINTENANCE.md ├── airflow.png ├── kktix.png └── youtube-connection.png ├── entrypoint.sh ├── pyproject.toml ├── setup.cfg ├── tests ├── __init__.py ├── conftest.py ├── data_questionnaire.csv ├── kktix_ticket_orders │ ├── test_klaviyo_loader.py │ └── test_transformer.py ├── test_cakeresume_uploader.py └── test_crawler.py └── uv.lock /.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .mypy_cache 3 | .pytest_cache 4 | .vscode 5 | bower_components 6 | venv 7 | node_modules 8 | .git 9 | service-account.json -------------------------------------------------------------------------------- /.env.template: -------------------------------------------------------------------------------- 1 | AIRFLOW_HOME=/opt/airflow 2 | BIGQUERY_PROJECT=pycontw-225217 3 | GOOGLE_APPLICATION_CREDENTIALS=/opt/airflow/service-account.json 4 | 
AIRFLOW__CORE__FERNET_KEY=paste-your-fernet-key-here -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 🛠 Bug report 3 | about: Create a report to help us improve 4 | title: "[Bug Report] Good bug title tells us about precise symptom, not about the root cause." 5 | labels: "bug" 6 | assignees: "" 7 | --- 8 | 9 | ## Description 10 | 11 | 12 | ## {{ cookiecutter.project_name }} version 13 | 14 | 15 | ## Steps to Reproduce 16 | 23 | 24 | ## Expected Behavior 25 | 31 | 32 | ## Actual Behavior 33 | 34 | 35 | ## More Information 36 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 📖 Documentation 3 | about: Suggest an improvement for the documentation of this project 4 | title: "[Documentation] Content to be added or fixed" 5 | labels: "documentation" 6 | assignees: "" 7 | --- 8 | 9 | ## Type 10 | 11 | * [ ] Content inaccurate 12 | * [ ] Content missing 13 | * [ ] Typo 14 | 15 | ## URL 16 | 17 | 18 | ## Description 19 | 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 🚀 Feature request 3 | about: Suggest an idea for this project 4 | title: "[Feature Request] " 5 | labels: "" 6 | assignees: "" 7 | --- 8 | 9 | ## Description 10 | 11 | 12 | ## Possible Solution 13 | 14 | 15 | ## Additional context 16 | 17 | 18 | ## Related Issue 19 | 20 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Types of changes 4 | 5 | 6 | - **Bugfix** 7 | - **New feature** 8 | - **Refactoring** 9 | - **Breaking change** (any change that would cause existing functionality to not work as expected) 10 | - **Documentation Update** 11 | - **Other (please describe)** 12 | 13 | ## Description 14 | 15 | 16 | ## Checklist 17 | 18 | - [ ] Add test cases to all the changes you introduce 19 | - [ ] Run `make lint` and `make test` locally to ensure all linter checks and testing pass 20 | - [ ] Update the documentation if necessary 21 | 22 | ## Steps to Test This Pull Request 23 | 29 | 30 | ## Expected behavior 31 | 32 | 33 | ## Related Issue 34 | 35 | 36 | ## Additional context 37 | 38 | -------------------------------------------------------------------------------- /.github/workflows/dockerimage.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | on: 3 | push: 4 | branches: [ master, prod ] 5 | pull_request: 6 | branches: [ master, prod ] 7 | env: 8 | RC_NAME: asia-east1-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/data-team/pycon-etl 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Authenticate to Google Cloud 15 | uses: google-github-actions/auth@v1 16 | with: 17 | credentials_json: ${{ secrets.GCP_SERVICE_ACCOUNT_KEY }} 18 | - name: Configure docker to use gcloud command-line tool as a credential helper 19 | run: | 20 | gcloud auth configure-docker asia-east1-docker.pkg.dev 21 | - name: Pull cache 22 | run: | 23 | docker pull 
${RC_NAME}:cache || true 24 | - name: Build the Docker image 25 | run: | 26 | docker build -t ${RC_NAME}:cache --cache-from ${RC_NAME}:cache . 27 | docker build -t ${RC_NAME}:test --cache-from ${RC_NAME}:cache -f Dockerfile.test . 28 | - name: Run test 29 | run: | 30 | docker run -d --rm -p 8080:8080 --name airflow -v $(pwd)/dags:/opt/airflow/dags -v $(pwd)/fixtures:/opt/airflow/fixtures ${RC_NAME}:test webserver 31 | sleep 10 32 | - name: Push cache to Google Container Registry 33 | if: success() 34 | run: | 35 | docker push ${RC_NAME}:cache 36 | - name: Push staging to Google Container Registry 37 | if: github.ref == 'refs/heads/master' && success() 38 | run: | 39 | docker tag ${RC_NAME}:cache ${RC_NAME}:staging 40 | docker push ${RC_NAME}:staging 41 | - name: Push prod version to Google Container Registry 42 | if: github.ref == 'refs/heads/prod' && success() 43 | run: | 44 | docker tag ${RC_NAME}:cache ${RC_NAME}:latest 45 | docker push ${RC_NAME}:latest 46 | -------------------------------------------------------------------------------- /.github/workflows/python.yml: -------------------------------------------------------------------------------- 1 | name: Python CI 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: [master] 8 | env: 9 | AIRFLOW_TEST_MODE: true 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | timeout-minutes: 10 14 | steps: 15 | - name: Check out 16 | uses: actions/checkout@v4 17 | with: 18 | fetch-depth: 0 19 | 20 | - name: Install the latest version of uv 21 | uses: astral-sh/setup-uv@v5 22 | with: 23 | enable-cache: true 24 | version: "latest" 25 | 26 | - name: Install dependencies 27 | run: | 28 | uv sync --group dev 29 | 30 | - name: Run linters 31 | run: make lint 32 | 33 | - name: Run test 34 | run: make test 35 | 36 | - name: Coverage 37 | run: make coverage 38 | 39 | # CD part 40 | # - name: Push dags to GCS 41 | # not implemented yet 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # project stuff 2 | .env.production 3 | .env.staging 4 | env 5 | env.sh 6 | client_secret_google_search_console* 7 | *.csv 8 | !dags/fixtures/*.csv 9 | service-account.json 10 | PyConTW2019/ 11 | PyConTW2020-CCIP-DB-dump/ 12 | dags_data-venue-booth-checking-in_PyConTW2019-20200906T164504Z-001.zip 13 | dags_data-venue-booth-checking-in_PyConTW2020-CCIP-DB-dump.zip 14 | .env 15 | PyConTW-ab56b4c31ba4-bigquery-data-strat-owned-by-tai.json 16 | PyConTW-ab56b4c31ba4-bigquery-data-strat-owned-by-tai.json.gz 17 | 18 | # npm stuff 19 | node_modules/ 20 | 21 | # ipython notebooks 22 | .ipynb_checkpoints/ 23 | 24 | # mypy stuff 25 | .mypy_cache/ 26 | 27 | # vscode stuff 28 | .vscode/ 29 | 30 | # virtualenv 31 | venv 32 | 33 | # System stuff 34 | [Oo]bj 35 | [Bb]in 36 | [Tt]emp 37 | *.user 38 | *.suo 39 | *.[Cc]ache 40 | *.bak 41 | *.log 42 | *.DS_Store 43 | [Tt]est[Rr]esult* 44 | [Tt]humbs.db 45 | _ReSharper.* 46 | *.resharper 47 | Ankh.NoLoad 48 | 49 | # Byte-compiled / optimized / DLL files 50 | __pycache__/ 51 | *.py[cod] 52 | 53 | # C extensions 54 | *.so 55 | 56 | # Distribution / packaging 57 | .Python 58 | env/ 59 | build/ 60 | develop-eggs/ 61 | dist/ 62 | downloads/ 63 | eggs/ 64 | .eggs/ 65 | lib/ 66 | lib64/ 67 | parts/ 68 | sdist/ 69 | var/ 70 | *.egg-info/ 71 | .installed.cfg 72 | *.egg 73 | 74 | # PyInstaller 75 | # Usually these files are written by a python script from a template 76 | # before PyInstaller 
builds the exe, so as to inject date/other infos into it. 77 | *.manifest 78 | *.spec 79 | 80 | # Installer logs 81 | pip-log.txt 82 | pip-delete-this-directory.txt 83 | 84 | # Unit test / coverage reports 85 | htmlcov/ 86 | .tox/ 87 | .coverage 88 | .coverage.* 89 | .cache 90 | nosetests.xml 91 | coverage.xml 92 | *,cover 93 | 94 | # Translations 95 | *.mo 96 | *.pot 97 | 98 | # Django stuff: 99 | *.log 100 | 101 | # Sphinx documentation 102 | docs/_build/ 103 | 104 | # PyBuilder 105 | target/ 106 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG AIRFLOW_VERSION=1.10.15 2 | ARG PYTHON_VERSION=3.8 3 | ARG PLATFORM=linux/amd64 4 | 5 | FROM --platform=${PLATFORM} ghcr.io/astral-sh/uv:python${PYTHON_VERSION}-bookworm-slim AS builder 6 | 7 | ENV UV_COMPILE_BYTECODE=1 \ 8 | UV_LINK_MODE=copy \ 9 | UV_PYTHON_DOWNLOADS=0 10 | 11 | RUN apt-get update && \ 12 | apt-get install -y gcc libc6-dev --no-install-recommends 13 | 14 | WORKDIR /app 15 | 16 | COPY ./pyproject.toml . 17 | COPY ./uv.lock . 18 | 19 | RUN --mount=type=cache,target=/root/.cache/uv \ 20 | --mount=type=bind,source=uv.lock,target=uv.lock \ 21 | --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ 22 | uv sync --frozen --no-install-project --no-dev 23 | 24 | RUN --mount=type=cache,target=/root/.cache/uv \ 25 | uv sync --frozen --no-install-project --no-dev 26 | 27 | 28 | FROM --platform=${PLATFORM} apache/airflow:${AIRFLOW_VERSION}-python${PYTHON_VERSION} 29 | 30 | USER root 31 | 32 | RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 467B942D3A79BD29 \ 33 | && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys B7B3B788A8D3785C \ 34 | && apt-get update \ 35 | && apt-get install -y --no-install-recommends git \ 36 | # 1. if you don't need postgres, remember to remove postgresql-dev and sqlalchemy 37 | # 2. libglib2.0-0 libsm6 libxext6 libxrender-dev libgl1-mesa-dev are required by opencv 38 | # 3. git is required by pip install git+https 39 | && apt-get clean \ 40 | && rm -rf /var/lib/apt/lists/* 41 | 42 | COPY entrypoint.sh /entrypoint.sh 43 | 44 | RUN chmod +x /entrypoint.sh 45 | 46 | USER airflow 47 | 48 | COPY --from=builder --chown=airflow:airflow /app /app 49 | ENV PATH="/app/.venv/bin:$PATH" 50 | 51 | COPY airflow.cfg ${AIRFLOW_HOME}/airflow.cfg 52 | COPY --chown=airflow:root dags ${AIRFLOW_HOME}/dags 53 | 54 | ENTRYPOINT ["/entrypoint.sh"] 55 | -------------------------------------------------------------------------------- /Dockerfile.test: -------------------------------------------------------------------------------- 1 | ARG AIRFLOW_VERSION=1.10.15 2 | ARG PYTHON_VERSION=3.8 3 | ARG PLATFORM=linux/amd64 4 | 5 | FROM --platform=${PLATFORM} ghcr.io/astral-sh/uv:python${PYTHON_VERSION}-bookworm-slim AS builder 6 | 7 | ENV UV_COMPILE_BYTECODE=1 \ 8 | UV_LINK_MODE=copy \ 9 | UV_PYTHON_DOWNLOADS=0 10 | 11 | RUN apt-get update && \ 12 | apt-get install -y gcc libc6-dev --no-install-recommends 13 | 14 | WORKDIR /app 15 | 16 | COPY ./pyproject.toml . 17 | COPY ./uv.lock . 
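# Unlike the production Dockerfile, the two `uv sync` runs below include the `dev`
# dependency group needed by the test image: the first installs only the locked
# dependencies (--no-install-project), the second installs the project itself on top.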
18 | 19 | RUN --mount=type=cache,target=/root/.cache/uv \ 20 | --mount=type=bind,source=uv.lock,target=uv.lock \ 21 | --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ 22 | uv sync --frozen --no-install-project --group dev 23 | 24 | RUN --mount=type=cache,target=/root/.cache/uv \ 25 | uv sync --frozen --group dev 26 | 27 | FROM --platform=${PLATFORM} apache/airflow:${AIRFLOW_VERSION}-python${PYTHON_VERSION} 28 | 29 | USER root 30 | 31 | RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 467B942D3A79BD29 \ 32 | && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys B7B3B788A8D3785C \ 33 | && apt-get update \ 34 | && apt-get install -y --no-install-recommends git \ 35 | # 1. if you don't need postgres, remember to remove postgresql-dev and sqlalchemy 36 | # 2. libglib2.0-0 libsm6 libxext6 libxrender-dev libgl1-mesa-dev are required by opencv 37 | # 3. git is required by pip install git+https 38 | && apt-get clean \ 39 | && rm -rf /var/lib/apt/lists/* 40 | 41 | COPY entrypoint.sh /entrypoint.sh 42 | 43 | RUN chmod +x /entrypoint.sh 44 | 45 | USER airflow 46 | 47 | COPY --from=builder --chown=airflow:airflow /app /app 48 | ENV PATH="/app/.venv/bin:$PATH" 49 | 50 | COPY airflow.cfg ${AIRFLOW_HOME}/airflow.cfg 51 | COPY --chown=airflow:root dags ${AIRFLOW_HOME}/dags 52 | 53 | ENV AIRFLOW_TEST_MODE=True 54 | 55 | ENTRYPOINT ["/entrypoint.sh"] 56 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | lint: 2 | uv run ruff check . 3 | uv run ruff format . 4 | uv run mypy dags/ tests/ 5 | 6 | format: 7 | uv run ruff check . --fix 8 | uv run ruff format . 9 | 10 | test: 11 | PYTHONPATH=./dags uv run pytest 12 | 13 | coverage: 14 | PYTHONPATH=./dags uv run pytest --cov=dags tests 15 | 16 | build-dev: 17 | docker-compose -f ./docker-compose-dev.yml build 18 | 19 | deploy-dev: 20 | docker-compose -f ./docker-compose-dev.yml up -d 21 | 22 | down-dev: 23 | docker-compose -f ./docker-compose-dev.yml down 24 | 25 | deploy-prod: 26 | docker-compose -f ./docker-compose.yml up -d 27 | 28 | down-prod: 29 | docker-compose -f ./docker-compose.yml down 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyConTW ETL 2 | 3 | ![Python CI](https://github.com/pycontw/PyCon-ETL/workflows/Python%20CI/badge.svg) 4 | ![Docker Image CI](https://github.com/pycontw/PyCon-ETL/workflows/Docker%20Image%20CI/badge.svg) 5 | 6 | Using Airflow to implement our ETL pipelines. 7 | 8 | ## Table of Contents 9 | 10 | - [Prerequisites](#prerequisites) 11 | - [Installation](#installation) 12 | - [Configuration](#configuration) 13 | - [BigQuery (Optional)](#bigquery-optional) 14 | - [Running the Project](#running-the-project) 15 | - [Local Environment with Docker](#local-environment-with-docker) 16 | - [Production](#production) 17 | - [Contact](#contact) 18 | 19 | ## Prerequisites 20 | 21 | - [Python 3.8+](https://www.python.org/downloads/release/python-3811/) 22 | - [Docker](https://docs.docker.com/get-docker/) 23 | - [Git](https://git-scm.com/book/zh-tw/v2/%E9%96%8B%E5%A7%8B-Git-%E5%AE%89%E8%A3%9D%E6%95%99%E5%AD%B8) 24 | - [uv] 25 | 26 | ## Installation 27 | 28 | We use [uv] to manage dependencies and virtual environment. 29 | 30 | Below are the steps to create a virtual environment using [uv]: 31 | 32 | 1. 
Create a Virtual Environment with Dependencies Installed 33 | 34 | To create a virtual environment, run the following command: 35 | 36 | ```bash 37 | uv sync 38 | ``` 39 | 40 | By default, [uv] sets up the virtual environment in `.venv` 41 | 42 | 2. Activate the Virtual Environment 43 | 44 | After creating the virtual environment, activate it using the following command: 45 | 46 | ```bash 47 | source .venv/bin/activate 48 | ``` 49 | 50 | 3. Deactivate the Virtual Environment 51 | 52 | When you're done working in the virtual environment, you can deactivate it with: 53 | 54 | ```bash 55 | deactivate 56 | ``` 57 | 58 | ## Configuration 59 | 60 | 1. For development or testing, run `cp .env.template .env.staging`. For production, run `cp .env.template .env.production`. 61 | 62 | 2. Follow the instructions in `.env.` and fill in your secrets. 63 | If you are running the staging instance for development as a sandbox and do not need to access any specific third-party services, leaving `.env.staging` as-is should be fine. 64 | 65 | > Contact the maintainer if you don't have these secrets. 66 | 67 | > **⚠ WARNING: About .env** 68 | > Please do not use the .env file for local development, as it might affect the production tables. 69 | 70 | ### BigQuery (Optional) 71 | 72 | Set up the Authentication for GCP: 73 | 74 | - After running `gcloud auth application-default login`, you will get a credentials.json file located at `$HOME/.config/gcloud/application_default_credentials.json`. Run `export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"` if you have it. 75 | - service-account.json: Please contact @david30907d via email or Discord. You do not need this json file if you are running the sandbox staging instance for development. 76 | 77 | ## Running the Project 78 | 79 | If you are a developer 👨‍💻, please check the [Contributing Guide](./docs/CONTRIBUTING.md). 80 | 81 | If you are a maintainer 👨‍🔧, please check the [Maintenance Guide](./docs/MAINTENANCE.md). 82 | 83 | ### Local Environment with Docker 84 | 85 | For development/testing: 86 | 87 | ```bash 88 | # Build the local dev/test image 89 | make build-dev 90 | 91 | # Start dev/test services 92 | make deploy-dev 93 | 94 | # Stop dev/test services 95 | make down-dev 96 | ``` 97 | 98 | > The difference between production and dev/test compose files is that the dev/test compose file uses a locally built image, while the production compose file uses the image from Docker Hub. 99 | 100 | If you are an authorized maintainer, you can pull the image from the GCP Artifact Registry. 101 | 102 | Docker client must be configured to use the GCP Artifact Registry. 103 | 104 | ```bash 105 | gcloud auth configure-docker asia-east1-docker.pkg.dev 106 | ``` 107 | 108 | Then, pull the image: 109 | 110 | ```bash 111 | docker pull asia-east1-docker.pkg.dev/pycontw-225217/data-team/pycon-etl:{tag} 112 | ``` 113 | 114 | There are several tags available: 115 | 116 | - `cache`: cache the image for faster deployment 117 | - `test`: for testing purposes, including the test dependencies 118 | - `staging`: when pushing to the staging environment 119 | - `latest`: when pushing to the production environment 120 | 121 | ### Production 122 | 123 | Please check the [Production Deployment Guide](./docs/DEPLOYMENT.md). 
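
For reference, below is a minimal sketch of a production rollout using the `make` targets shipped in this repository. It assumes `.env.production` and `service-account.json` are already in place and that `docker-compose.yml` points at the image tag you intend to run; the Production Deployment Guide linked above remains the authoritative reference.

```bash
# Authenticate Docker against the GCP Artifact Registry (authorized maintainers only)
gcloud auth configure-docker asia-east1-docker.pkg.dev

# Pull the production image
docker pull asia-east1-docker.pkg.dev/pycontw-225217/data-team/pycon-etl:latest

# Start the production services defined in docker-compose.yml
make deploy-prod

# Stop them again when needed
make down-prod
```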
124 | 125 | ## Contact 126 | 127 | [PyCon TW Volunteer Data Team - Discord](https://discord.com/channels/752904426057892052/900721883383758879) 128 | 129 | [uv]: https://docs.astral.sh/uv/ 130 | -------------------------------------------------------------------------------- /airflow.cfg: -------------------------------------------------------------------------------- 1 | [core] 2 | # The folder where your airflow pipelines live, most likely a 3 | # subfolder in a code repository. This path must be absolute. 4 | dags_folder = /opt/airflow/dags 5 | 6 | # The folder where airflow should store its log files 7 | # This path must be absolute 8 | base_log_folder = /opt/airflow/logs 9 | 10 | # Log format for when Colored logs is enabled 11 | colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {{%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d}} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s 12 | 13 | # Format of Log line 14 | log_format = [%%(asctime)s] {{%%(filename)s:%%(lineno)d}} %%(levelname)s - %%(message)s 15 | 16 | dag_processor_manager_log_location = /opt/airflow/logs/dag_processor_manager/dag_processor_manager.log 17 | 18 | # The SqlAlchemy connection string to the metadata database. 19 | # SqlAlchemy supports many different database engine, more information 20 | # their website 21 | # sql_alchemy_conn = sqlite:////tmp/airflow.db 22 | 23 | # The amount of parallelism as a setting to the executor. This defines 24 | # the max number of task instances that should run simultaneously 25 | # on this airflow installation 26 | parallelism = 256 27 | 28 | # The number of task instances allowed to run concurrently by the scheduler 29 | dag_concurrency = 64 30 | 31 | # Whether to load the examples that ship with Airflow. It's good to 32 | # get started, but you probably want to set this to False in a production 33 | # environment 34 | load_examples = False 35 | 36 | # Where your Airflow plugins are stored 37 | plugins_folder = /opt/airflow/plugins 38 | 39 | # Secret key to save connection passwords in the db 40 | fernet_key = $FERNET_KEY 41 | 42 | # How long before timing out a python file import 43 | dagbag_import_timeout = 600 44 | 45 | # How long before timing out a DagFileProcessor, which processes a dag file 46 | dag_file_processor_timeout = 600 47 | 48 | [api] 49 | # How to authenticate users of the API 50 | auth_backend = airflow.api.auth.backend.default 51 | 52 | 53 | [webserver] 54 | # Number of seconds the webserver waits before killing gunicorn master that doesn't respond 55 | web_server_master_timeout = 600 56 | 57 | # Number of seconds the gunicorn webserver waits before timing out on a worker 58 | web_server_worker_timeout = 600 59 | 60 | # Secret key used to run your flask app 61 | # It should be as random as possible 62 | secret_key = l\xba,\xc3\x023\xca\x04\xdb\xf2\xf7\xfa\xb8#\xee> 63 | 64 | # Number of workers to run the Gunicorn web server 65 | workers = 2 66 | 67 | # Expose the configuration file in the web server 68 | expose_config = True 69 | 70 | # Allow the UI to be rendered in a frame 71 | x_frame_enabled = True 72 | 73 | # Minutes of non-activity before logged out from UI 74 | # 0 means never get forcibly logged out 75 | force_log_out_after = 0 76 | 77 | authenticate = False 78 | auth_backend = airflow.api.auth.backend.default 79 | 80 | 81 | [celery] 82 | # The concurrency that will be used when starting workers with the 83 | # ``airflow celery worker`` command. 
This defines the number of task instances that 84 | # a worker will take, so size up your workers based on the resources on 85 | # your worker box and the nature of your tasks 86 | worker_concurrency = 32 87 | 88 | # The maximum and minimum concurrency that will be used when starting workers with the 89 | # ``airflow celery worker`` command (always keep minimum processes, but grow 90 | # to maximum if necessary). Note the value should be max_concurrency,min_concurrency 91 | # Pick these numbers based on resources on worker box and the nature of the task. 92 | # If autoscale option is available, worker_concurrency will be ignored. 93 | # http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale 94 | # Example: worker_autoscale = 16,12 95 | worker_autoscale = 32,12 96 | 97 | # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally 98 | # a sqlalchemy database. Refer to the Celery documentation for more 99 | # information. 100 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#broker-settings 101 | broker_url = redis://redis:6379/1 102 | 103 | # The Celery result_backend. When a job finishes, it needs to update the 104 | # metadata of the job. Therefore it will post a message on a message bus, 105 | # or insert it into a database (depending of the backend) 106 | # This status is used by the scheduler to update the state of the task 107 | # The use of a database is highly recommended 108 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings 109 | result_backend = db+postgresql://airflow:airflow@postgres/airflow 110 | 111 | [scheduler] 112 | child_process_log_directory = /opt/airflow/logs/scheduler 113 | 114 | 115 | # Format of the log_id, which is used to query for a given tasks logs 116 | log_id_template = {{dag_id}}-{{task_id}}-{{execution_date}}-{{try_number}} 117 | 118 | [kubernetes] 119 | # Keyword parameters to pass while calling a kubernetes client core_v1_api methods 120 | # from Kubernetes Executor provided as a single line formatted JSON dictionary string. 121 | # List of supported params are similar for all core_v1_apis, hence a single config 122 | # variable for all apis. 123 | # See: 124 | # https://raw.githubusercontent.com/kubernetes-client/python/master/kubernetes/client/apis/core_v1_api.py 125 | # Note that if no _request_timeout is specified, the kubernetes client will wait indefinitely 126 | # for kubernetes api responses, which will cause the scheduler to hang. 127 | # The timeout is specified as [connect timeout, read timeout] 128 | kube_client_request_args = {{"_request_timeout" : [60,60] }} 129 | 130 | # Specifies the uid to run the first process of the worker pods containers as 131 | run_as_user = 132 | 133 | # ref: https://airflow.apache.org/docs/apache-airflow/1.10.1/security.html#setting-up-google-authentication 134 | [google] 135 | client_id = 136 | client_secret = 137 | oauth_callback_route = /oauth2callback 138 | domain = localhost,pycon.tw 139 | prompt = select_account 140 | -------------------------------------------------------------------------------- /contrib/README.md: -------------------------------------------------------------------------------- 1 | # Contrib 2 | 3 | ## Upload KKTIX 4 | 5 | ![](../docs/kktix.png) 6 | 7 | 1. Navigate to KKTIX's attendees page 8 | 2. Download the CSV 9 | 3. 
`upload-kktix-ticket-csv-to-bigquery.py -p pycontw-225217 -d ods -t ods_kktix_ticket__attendees --upload` 10 | 11 | ## Survey Cake 12 | 13 | [Demo Video](https://www.loom.com/share/4c494f1d3ce443c6a43ed514c53b70ff) 14 | 1. download CSV from survey cake (account: data-strategy-registration-survey-cake@pycon.tw) 15 | 2. `. ./.env.sh ` 16 | 2. `cd contrib/survey_cake` 17 | 3. `python upload-survey-cake-csv-to-bigquery.py --year=<20xx> -c ` 18 | 1. it would upload data to Bigquery's `test` dataset 19 | 2. If everything looks good, you can `copy` the `fact table` and `dimension table` first 20 | 3. Then run `python upload-survey-cake-csv-to-bigquery.py --year=<20xx> -p`. `-p` stands for `production` 21 | 22 | ## KKTIX BigQuery Transform 23 | 1. Background: Start from 2022, we extract the KKTIX data via KKTIX API and load to "pycontw-225217.ods.ods_kktix_attendeeId_datetime". However most of the data are store in the ATTENDEE_INFO column with json format. To use metabase with SQL, users need to extract the data by json_extract with the knowledge kktix format instead of flat database. And we also need to rewrite all the SQLs build for current databases. 24 | 2. Solution: Transform the tables in backend that we could keep the same user experience by using Metabase. 25 | 3. Run: 26 | - for 3 tables in single bash script: `./kktix_bq_etl.sh 2023` 27 | -------------------------------------------------------------------------------- /contrib/data/corporate-attendees-2018.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,# invoice policy #,Nickname / 暱稱 (Shown on Badge),Invoiced Company Name / 發票抬頭,Unified Business No. / 發票統編,Dietary Habit / 餐點偏好,Need Shuttle Bus Service? 會議兩天的早上 要搭交通車到會場嗎?,Size of T-shirt / T恤尺寸,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Gender / 性別,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 12345678,111111111,23,EEEE,Regular 原價,paid,,37d2639c885c499493608e51797b45c1,2018-04-17,5000,,PyConTW1,PyConTW1 ltd.,11111111,Normal / 一般,"Yes, Wait for the Bus at Nangang Expo Center. 需要, 會在南港展覽館候車",XS / 胸寬(F.W.): 46cm / 衣長(C.L.): 66cm,1 年以內,"AI, Machine Learning, Fintech, Business, Internet of Things, Education, Human Resource, Sustainability, Health & Wellness",PyConTW1 ltd.,資深開發研究員,Taiwan 台灣,Female / 女性,pycontwcontact1,pycontwemail1@pycon.tw,912345678, 3 | 12345679,111111112,52,9999,Regular 原價,paid,,37d2639c885c499493608e51797b45c2,2018-04-16,5000,,PyConTW2,PyConTW2 ltd.,22222222,Normal / 一般,No. 不需要,M / 胸寬(F.W.): 50cm / 衣長(C.L.): 71cm,1 到 5 年,"AI, Machine Learning, Science, Technology, Engineering & Mathematics, Entrepreneurship, Startup",PyConTW2 ltd.,資料工程師,,Male / 男性,pycontwcontact2,pycontwemail2@pycon.tw,912345666, 4 | -------------------------------------------------------------------------------- /contrib/data/corporate-attendees-2019.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,# invoice policy #,Nickname / 暱稱 (Shown on Badge),Invoiced Company Name / 發票抬頭 (Optional),Unified Business No. / 發票統編 (Optional),Dietary Habit / 餐點偏好,Need Shuttle Bus Service? 
會議兩天的早上 要搭交通車到會場嗎?,Size of T-shirt / T恤尺寸,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Gender / 生理性別,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 20000000,120000000,1,DDDD,Regular 原價,paid,,37d2639c885c499493608e51797b45c1,2019-07-31,5500,,PyConTW1,PyConTWLTD1,88888888,Normal / 一般,"Yes, Wait for the Bus at Nangang Expo Center. 需要, 會在南港展覽館候車",M / 胸寬(F.W.): 50cm / 衣長(C.L.): 71cm,1 到 5 年,"AI, Machine Learning, Fintech, Business, DevOps, Security, Web Technology, Science, Technology, Engineering & Mathematics, Entrepreneurship, Startup",PyConTWLTD1,System engineer,Taiwan 台灣,Other / 其它,PyConTW1,pycontwemail1@pycon.tw,921000001, 3 | 20000001,120000001,4,FFFF,Regular 原價,paid,,37d2639c885c499493608e51797b45c2,2019-07-31,5500,,PyConTW2,PyConTWLTD2,88889999,Normal / 一般,No. 不需要,L / 胸寬(F.W.): 53cm / 衣長(C.L.): 73cm,5 到 10 年,"AI, Machine Learning, DevOps, Security, Web Technology, Internet of Things",PyConTWLTD2,Founder/CEO,Taiwan 台灣,Male / 男性,PyConTW2,pycontwemail2@pycon.tw,921000002, 4 | -------------------------------------------------------------------------------- /contrib/data/corporate-attendees-2020.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,# invoice policy #,Nickname / 暱稱 (Shown on Badge),Invoiced Company Name / 發票抬頭 (Optional),Unified Business No. / 發票統編 (Optional),Dietary Habit / 餐點偏好,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Departure from (Regions) / 出發區域,How did you find out PyCon TW? / 如何得知 PyCon TW?,Have you ever attended PyCon TW?/ 是否曾參加 PyCon TW?,Do you know we have Financial Aid this year? 
/ 請問您知道今年有財務補助嗎?,Gender / 生理性別,PyNight 參加意願僅供統計人數,實際是否舉辦需由官方另行公告,PyNight 參加意願,是否願意收到贊助商轉發 Email 訊息,是否願意提供 Email 給贊助商,Privacy Policy of PyCon TW 2020 / PyCon TW 2020 個人資料保護聲明 bit.ly/3eipAut,I've already read and I accept the Privacy Policy of PyCon TW 2020 / 我已閱讀並同意 PyCon TW 2020 個人資料保護聲明,Epidemic Prevention of PyCon TW 2020 / PyCon TW 2020 COVID-19 防疫守則 bit.ly/3fcnhu2,I've already read and I accept the Epidemic Prevention of PyCon TW 2020 / 我已閱讀並同意 PyCon TW 2020 COVID-19 防疫守則,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 20000000,120000000,1,EEEE,Regular 原價,paid,,37d2639c885c499493608e51797b45c1,2020-07-13,5500,,PyConTW1,PyConTWLTD1,88888888,Normal / 一般,1 到 5 年,"DevOps, Security, Web Technology",PyConTWLTD1,工程師,Taiwan 台灣,Northern Taiwan / 北部,,,No,Male / 男性,,Yes,No,No,,Yes,,Yes,PyConTW1,pycontwemail1@pycon.tw,921000001, 3 | 20000001,120000001,2,9A9E,Regular 原價,paid,,37d2639c885c499493608e51797b45c2,2020-07-15,5500,,PyConTW2,PyConTWLTD2,88888889,Normal / 一般,1 到 5 年,"AI, Machine Learning, Internet of Things",PyConTWLTD2,,Taiwan 台灣,Northern Taiwan / 北部,公司報名,Yes,No,Male / 男性,,Yes,No,No,,Yes,,Yes,PyConTW2,pycontwemail2@pycon.tw,886921000002, 4 | -------------------------------------------------------------------------------- /contrib/data/individual-attendees-2018.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,# invoice policy #,Nickname / 暱稱 (Shown on Badge),Dietary Habit / 餐點偏好,Need Shuttle Bus Service? 會議兩天的早上 要搭交通車到會場嗎?,Size of T-shirt / T恤尺寸,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Gender / 性別,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 50000000,110000000,1,EEEE,"EarlyBird, Discount 優惠價",paid,,37d2639c885c499493608e51797b45c1,2018-04-15,2500,,alice,Normal / 一般,"Yes, Wait for the Bus at Nangang Expo Center. 需要, 會在南港展覽館候車",L / 胸寬(F.W.): 53cm / 衣長(C.L.): 73cm,1 到 5 年,"AI, Machine Learning, Science, Technology, Engineering & Mathematics",,,Taiwan 台灣,Male / 男性,alice,alice@pycon.tw,933123456, 3 | 50000001,110000001,2,9999,"EarlyBird, Discount 優惠價",paid,,37d2639c885c499493608e51797b45c2,2018-04-15,2500,,bob,Normal / 一般,No. 不需要,L / 胸寬(F.W.): 53cm / 衣長(C.L.): 73cm,1 到 5 年,"DevOps, Security, Web Technology, Entrepreneurship, Startup",,,,Male / 男性,bob,bob@pycon.tw,922123456, 4 | -------------------------------------------------------------------------------- /contrib/data/individual-attendees-2019.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,Nickname / 暱稱 (Shown on Badge),Dietary Habit / 餐點偏好,Need Shuttle Bus Service? 會議三天的早上 要搭交通車到會場嗎?,Size of T-shirt / T恤尺寸,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Gender / 生理性別,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 30000000,130000000,7,GGGG,Discount 優惠價,paid,,37d2639c885c499493608e51797b45c1,2019-07-31,2600,Alice,Normal / 一般,No. 
不需要,M / 胸寬(F.W.): 50cm / 衣長(C.L.): 71cm,1 到 5 年,"AI, Machine Learning, Web Technology, Internet of Things, Sustainability, Health & Wellness",PyConTW1,Software Engineer,Taiwan 台灣,Male / 男性,Alice,alice@pycon.tw,921123456, 3 | 30000001,130000001,9,HHHH,Discount 優惠價,paid,,37d2639c885c499493608e51797b45c2,2019-07-31,2600,Bob,Normal / 一般,No. 不需要,M / 胸寬(F.W.): 50cm / 衣長(C.L.): 71cm,1 到 5 年,"AI, Machine Learning, Web Technology, Science, Technology, Engineering & Mathematics, Internet of Things",PyConTW2,CTO,Taiwan 台灣,Male / 男性,Bob,bob@pycon.tw,921456123, 4 | -------------------------------------------------------------------------------- /contrib/data/individual-attendees-2020.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,Nickname / 暱稱 (Shown on Badge),Dietary Habit / 餐點偏好,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Departure from (Regions) / 出發區域,Gender / 生理性別,How did you find out PyCon TW? / 如何得知 PyCon TW,Have you ever attended PyCon TW?/ 是否曾參加 PyCon TW? /,Do you know we have Financial Aid this year? / 請問您知道今年有財務補助嗎?,PyNight 參加意願僅供統計人數,實際是否舉辦需由官方另行公告,PyNight 參加意願,是否願意收到贊助商轉發 Email 訊息,是否願意提供 Email 給贊助商,Privacy Policy of PyCon TW 2020 / PyCon TW 2020 個人資料保護聲明 bit.ly/3eipAut,I've already read and I accept the Privacy Policy of PyConTW 2020 / 我已閱讀並同意 PyCon TW 2020 個人資料保護聲明,Epidemic Prevention of PyCon TW 2020 / PyCon TW 2020 COVID-19 防疫守則 bit.ly/3fcnhu2,I've already read and I accept the Epidemic Prevention of PyCon TW 2020 / 我已閱讀並同意 PyCon TW 2020 COVID-19 防疫守則,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 20000000,130000001,1,FFFF,Discount 優惠價,paid,,37d2639c885c499493608e51797b45c1,2020-07-13,2600,PyConTW1,Normal / 一般,1 到 5 年,"AI, Machine Learning, DevOps, Security, Web Technology, Science, Technology, Engineering & Mathematics, Arts, Design, Multimedia, Sustainability, Health & Wellness",PyConTWLTD1,士官長,Taiwan 台灣,,Female / 女性,,,,,Yes,No,No,,,,,PyConTW1,pycontwemail1@pycon.tw,921000003, 3 | 20000001,130000002,2,GGGG,Discount 優惠價,paid,,37d2639c885c499493608e51797b45c2,2020-07-13,2600,PyConTW2,Normal / 一般,,"DevOps, Security, Internet of Things",PyConTWLTD2,Senior Test Engineer,Taiwan 台灣,,Male / 男性,,,,,Yes,No,No,,,,,PyConTW2,pycontwemail2@pycon.tw,886921000004, 4 | -------------------------------------------------------------------------------- /contrib/data/reserved-attendees-2018.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,# invoice policy #,Nickname / 暱稱 (Shown on Badge),Invoiced Company Name / 發票抬頭 (Optional),Unified Business No. / 發票統編 (Optional),Dietary Habit / 餐點偏好,Need Shuttle Bus Service? 會議兩天的早上 要搭交通車到會場嗎?,Size of T-shirt / T恤尺寸,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Gender / 性別,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 57000000,113100000,3,6666,"Contributor 工作人員, 貢獻者",paid,,37d2639c885c499493608e51797b45c1,2018-04-14,1800,,alice,,,Normal / 一般,No. 
不需要,M / 胸寬(F.W.): 50cm / 衣長(C.L.): 71cm,5 到 10 年,"AI, Machine Learning, Entrepreneurship, Startup, Internet of Things",,Student,Taiwan 台灣,Male / 男性,alice,alice@pycon.tw,977123456, 3 | 57000001,113100001,12,8888,Sponsor 贊助夥伴,paid,,37d2639c885c499493608e51797b45c3,2018-04-28,0,,bob,,,Normal / 一般,No. 不需要,M / 胸寬(F.W.): 50cm / 衣長(C.L.): 71cm,1 年以內,"AI, Machine Learning, Web Technology, Science, Technology, Engineering & Mathematics, Education, Human Resource, Arts, Design, Multimedia",,,Taiwan 台灣,Male / 男性,bob,bob@pycon.tw,977456123, 4 | -------------------------------------------------------------------------------- /contrib/data/reserved-attendees-2019.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,# invoice policy #,Nickname / 暱稱 (Shown on Badge),Dietary Habit / 餐點偏好,Need Shuttle Bus Service? 會議三天的早上 要搭交通車到會場嗎?,Size of T-shirt / T恤尺寸,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Gender / 性別,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 66666666,120000000,2,HHHH,Fanatic 鐵粉票,paid,,37d2639c885c499493608e51797b45c1,2019-04-09,1800,,alice,Vegetarian / 素食者,No. 不需要,2XL / 胸寬(F.W.): 58cm / 衣長(C.L.): 78cm,5 到 10 年,"Web Technology, Education, Human Resource, Sustainability, Health & Wellness",pycontw1,Vice-Chair,Japan 日本,Male / 男性,alice,alice@pycon.tw,1234123456, 3 | 66666667,120000001,6,KKKK,Invited 邀請票,paid,,37d2639c885c499493608e51797b45c2,2019-08-03,0,,bob,Normal / 一般,No. 不需要,M / 胸寬(F.W.): 50cm / 衣長(C.L.): 71cm,10 到 20 年,"AI, Machine Learning, DevOps, Security, Science, Technology, Engineering & Mathematics, Internet of Things, Arts, Design, Multimedia, Sustainability, Health & Wellness",pycontw2,Senior Test Engineer,Taiwan 台灣,Male / 男性,bob,bob@pycon.tw,1234987654, 4 | -------------------------------------------------------------------------------- /contrib/data/reserved-attendees-2020.csv: -------------------------------------------------------------------------------- 1 | Id,Order Number,Registration No.,Checkin Code,Ticket Type,Payment Status,Tags,QR Code Serial No.,Paid Date,Price,Nickname / 暱稱 (Shown on Badge),Dietary Habit / 餐點偏好,Years of Using Python / 使用 Python 多久,Area of Interest / 興趣領域,"Company / 服務單位 (For students or teachers, fill in the School + Department Name)","Job Title / 職稱 (If you are a student, fill in ""student"")",Come From / 國家或地區,Departure from (Regions) / 出發區域,Gender / 性別,是否願意收到贊助商轉發 Email 訊息,是否願意提供 Email 給贊助商,Privacy Policy of PyCon TW 2020,I've already read and I accept the Privacy Policy of PyConTW 2020 / 我已閱讀並同意 PyCon TW 2020 個人資料保護聲明,Epidemic Prevention of PyCon TW 2020,I've already read and I accept the Epidemic Prevention of PyCon TW 2020 / 我已閱讀並同意 PyCon TW 2020 COVID-19 防疫守則,Contact Name,Contact Email,Contact Mobile,Attendance Book 2 | 76000001,123421001,1,CCCC,Invited 邀請票,paid,,37d2639c885c499493608e51797b45c1,2020-08-08,0,Alice,Normal / 一般,5 到 10 年,"DevOps, Security, Web Technology, Entrepreneurship, Startup",pycontw1,Student,Japan 日本,Overseas / 海外,Male / 男性,No,No,,Yes,,Yes,Alice,alice@pycon.tw,12345678, 3 | 76000002,123421002,2,BBBB,Contributor 貢獻者票,paid,,37d2639c885c499493608e51797b45c2,2020-08-08,0,Bob,Normal / 一般,1 到 5 年,"DevOps, Security, Web Technology, Entrepreneurship, Startup, Internet of Things, Arts, Design, 
Multimedia",pycontw2,Engineer,Taiwan 台灣,Northern Taiwan / 北部,Male / 男性,Yes,No,,Yes,,Yes,Bob,bob@pycon.tw,87654321, 4 | 76000003,123421003,3,AAAA,Speaker 講者票,paid,,37d2639c885c499493608e51797b45c3,2020-08-08,0,Chris,Normal / 一般,1 到 5 年,"AI, Machine Learning, Web Technology, Science, Technology, Engineering & Mathematics, Internet of Things, Education, Human Resource, Arts, Design, Multimedia, Sustainability, Health & Wellness",pycontw3,Assistant professor,United States 美國,Overseas / 海外,Male / 男性,Yes,No,,Yes,,Yes,Chris,chris@pycon.tw,11223344, 5 | -------------------------------------------------------------------------------- /contrib/kktix_bq_etl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # export GOOGLE_APPLICATION_CREDENTIALS="" 4 | # 5 | project_id="pycontw-225217" 6 | cmd=${PWD}/../dags/ods/kktix_ticket_orders/udfs/kktix_bq_dwd_etl.py 7 | 8 | 9 | for ticket_type in corporate individual reserved 10 | do 11 | suffix=${ticket_type}_attendees$2 12 | cmd_args="-p ${project_id} -d dwd -t kktix_ticket_${suffix} -k ${ticket_type} -y $1 --upload" 13 | echo ${cmd_args} 14 | ${cmd} ${cmd_args} 15 | done 16 | -------------------------------------------------------------------------------- /contrib/survey_cake/udfs/survey_cake_csv_uploader.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from pathlib import Path 4 | from typing import Dict, List 5 | 6 | from google.cloud import bigquery 7 | 8 | 9 | class SurveyCakeCSVUploader: 10 | USELESS_COLUMNS = { 11 | "額滿結束註記", 12 | "使用者紀錄", 13 | "會員時間", 14 | "會員編號", 15 | "自訂ID", 16 | "備註", 17 | } 18 | 19 | def __init__(self, year: int, filename: str): 20 | self._year = year 21 | self.filename = Path(filename) 22 | if not bool(os.getenv("AIRFLOW_TEST_MODE")): 23 | self.client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 24 | self.existing_question_and_id_dict = self._get_existing_question_and_id() 25 | else: 26 | self.existing_question_and_id_dict = {"placeholder": 1} 27 | self.facttable_filepath = ( 28 | self.filename.parent / f"{self.filename.stem}_facttable.csv" 29 | ) 30 | self.dimension_table_filepath = ( 31 | self.filename.parent / f"{self.filename.stem}_dimension.csv" 32 | ) 33 | 34 | @property 35 | def year(self): 36 | return self._year 37 | 38 | @property 39 | def bigquery_project(self): 40 | return os.getenv("BIGQUERY_PROJECT") 41 | 42 | def _get_existing_question_and_id(self): 43 | query = """ 44 | SELECT 45 | question, question_id 46 | FROM 47 | dim.dim_questionnaire_questionId_year; 48 | """ 49 | query_job = self.client.query(query) 50 | return {row["question"]: row["question_id"] for row in query_job} 51 | 52 | def transform(self): 53 | def _export_facttable(header_of_fact_table): 54 | with open(self.facttable_filepath, "w") as target: 55 | writer = csv.writer(target) 56 | writer.writerow(header_of_fact_table) 57 | for row in rows_of_fact_table: 58 | row_with_year = row + (self.year,) 59 | writer.writerow(row_with_year) 60 | 61 | def _export_dimension_table(question_id_dimension_table): 62 | with open(self.dimension_table_filepath, "w") as target: 63 | writer = csv.writer(target) 64 | writer.writerow(("question_id", "question", "year")) 65 | for question_id, question in question_id_dimension_table.items(): 66 | # need to filter out existing question_id, otherwise we would end up having duplicate question_id in BigQuery 67 | if question not in self.existing_question_and_id_dict.keys(): 68 | 
writer.writerow((question_id, question, self.year)) 69 | 70 | def _get_question_ids_of_this_year( 71 | header: List, question_id_dimension_table: Dict 72 | ) -> List: 73 | reversed_question_id_dimension_table = { 74 | question: question_id 75 | for question_id, question in question_id_dimension_table.items() 76 | } 77 | return [ 78 | reversed_question_id_dimension_table[column] 79 | for column in header 80 | if column not in self.USELESS_COLUMNS 81 | ] 82 | 83 | with open(Path(self.filename), encoding="utf-8-sig") as csvfile: 84 | rows = csv.reader(csvfile) 85 | # skip header 86 | header = [column.strip() for column in next(iter(rows))] 87 | question_id_dimension_table = self._generate_question_id_dimension_table( 88 | header 89 | ) 90 | 91 | question_ids = _get_question_ids_of_this_year( 92 | header, question_id_dimension_table 93 | ) 94 | header_of_fact_table = ("ip", "question_id", "answer", "year") 95 | rows_of_fact_table = self._transform_raw_data_to_fact_table_format( 96 | rows, question_id_dimension_table, question_ids 97 | ) 98 | 99 | _export_facttable(header_of_fact_table) 100 | _export_dimension_table(question_id_dimension_table) 101 | 102 | def upload( 103 | self, 104 | facttable_or_dimension_table, 105 | data_layer, 106 | data_domain, 107 | primary_key, 108 | time_dimension, 109 | ): 110 | if facttable_or_dimension_table == "fact": 111 | self._upload_2_bigquery( 112 | self.facttable_filepath, 113 | f"{self.bigquery_project}.{data_layer}.{data_layer}_{data_domain}_{primary_key}_{time_dimension}", 114 | ) 115 | elif facttable_or_dimension_table == "dim": 116 | self._upload_2_bigquery( 117 | self.dimension_table_filepath, 118 | f"{self.bigquery_project}.{data_layer}.{data_layer}_{data_domain}_{primary_key}_{time_dimension}", 119 | ) 120 | 121 | def _upload_2_bigquery(self, file_path, table_id): 122 | job_config = bigquery.LoadJobConfig( 123 | source_format=bigquery.SourceFormat.CSV, 124 | skip_leading_rows=1, 125 | autodetect=True, 126 | allow_quoted_newlines=True, 127 | write_disposition="WRITE_APPEND", 128 | schema_update_options="ALLOW_FIELD_ADDITION", 129 | ) 130 | with open(file_path, "rb") as source_file: 131 | job = self.client.load_table_from_file( 132 | source_file, table_id, job_config=job_config 133 | ) 134 | 135 | job.result() # Waits for the job to complete. 136 | 137 | table = self.client.get_table(table_id) # Make an API request. 138 | print( 139 | f"There's {table.num_rows} rows and {len(table.schema)} columns in {table_id} now!" 
140 | ) 141 | 142 | def _generate_question_id_dimension_table(self, header): 143 | max_existing_question_id = int(max(self.existing_question_and_id_dict.values())) 144 | question_id_dim_table = {} 145 | for index, column in enumerate(header, start=max_existing_question_id): 146 | if column in self.USELESS_COLUMNS: 147 | continue 148 | if column in self.existing_question_and_id_dict: 149 | question_id_dim_table[self.existing_question_and_id_dict[column]] = ( 150 | column 151 | ) 152 | else: 153 | question_id_dim_table[float(index)] = column 154 | return question_id_dim_table 155 | 156 | @staticmethod 157 | def _transform_raw_data_to_fact_table_format( 158 | rows, question_id_dimension_table, question_ids 159 | ): 160 | result = [] 161 | for row in rows: 162 | row_dict = dict(zip(question_ids, row)) 163 | question_id_of_primary_key = [ 164 | key 165 | for key, value in question_id_dimension_table.items() 166 | if value == "IP紀錄" 167 | ][0] 168 | primary_key = row_dict[question_id_of_primary_key] 169 | for question_id, answer in row_dict.items(): 170 | result.append((primary_key, question_id, answer)) 171 | return result 172 | -------------------------------------------------------------------------------- /contrib/survey_cake/upload-survey-cake-csv-to-bigquery.py: -------------------------------------------------------------------------------- 1 | """ 2 | A crawler which would crawl the openings 3 | """ 4 | 5 | import argparse 6 | 7 | from udfs.survey_cake_csv_uploader import SurveyCakeCSVUploader 8 | 9 | TEST_DATA_LAYER = "test" 10 | FILENAMES = { 11 | "data_questionnaire.csv": { 12 | "data_domain": "questionnaire", 13 | "primary_key": "ip", 14 | "time_dimension": "datetime", 15 | }, 16 | "data_sponsor_questionnaire.csv": { 17 | "data_domain": "sponsorQuestionnaire", 18 | "primary_key": "ip", 19 | "time_dimension": "datetime", 20 | }, 21 | } 22 | if __name__ == "__main__": 23 | PARSER = argparse.ArgumentParser() 24 | PARSER.add_argument("-y", "--year", type=int, required=True) 25 | PARSER.add_argument( 26 | "-c", 27 | "--contributor", 28 | type=str, 29 | help="input your name please! You'll find a table with your name in Bigquery.test dataset", 30 | required=True, 31 | ) 32 | PARSER.add_argument("-p", "--prod", action="store_true") 33 | ARGS = PARSER.parse_args() 34 | print( 35 | "HINT: the default mode would load data to dataset `test`. To load data to bigquery's `ods` dataset, please add `--prod` flag!" 
36 | ) 37 | for filename, metadata in FILENAMES.items(): 38 | SURVEY_CAKE_CSV_UPLOADER = SurveyCakeCSVUploader( 39 | year=ARGS.year, filename=filename 40 | ) 41 | SURVEY_CAKE_CSV_UPLOADER.transform() 42 | SURVEY_CAKE_CSV_UPLOADER.upload( 43 | facttable_or_dimension_table="fact", 44 | data_layer="ods" if ARGS.prod else TEST_DATA_LAYER, 45 | data_domain=metadata["data_domain"] 46 | if ARGS.prod 47 | else f"{ARGS.contributor}_{metadata['data_domain']}", 48 | primary_key=metadata["primary_key"], 49 | time_dimension=metadata["time_dimension"], 50 | ) 51 | SURVEY_CAKE_CSV_UPLOADER.upload( 52 | facttable_or_dimension_table="dim", 53 | data_layer="dim" if ARGS.prod else TEST_DATA_LAYER, 54 | data_domain=metadata["data_domain"] 55 | if ARGS.prod 56 | else f"{ARGS.contributor}_{metadata['data_domain']}", 57 | primary_key="questionId", 58 | time_dimension="year", 59 | ) 60 | -------------------------------------------------------------------------------- /contrib/upload-kktix-ticket-csv-to-bigquery.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # export GOOGLE_APPLICATION_CREDENTIALS="" 4 | # 5 | root_pycon_etl=${HOME}/work-my-projects/pycontw-projects/PyCon-ETL 6 | root_upload_data=${HOME}/work-my-projects/pycontw-projects/PyCon-ETL-working 7 | project_id="pycontw-225217" 8 | cmd_upload=${root_pycon_etl}/contrib/upload-kktix-ticket-csv-to-bigquery.py 9 | 10 | 11 | for year in 2018 2019 2020 12 | do 13 | for ticket_type in corporate individual reserved 14 | do 15 | suffix=${ticket_type}_attendees_${year} 16 | cmd_args="${root_upload_data}/${suffix}.csv -p ${project_id} -d ods -t ods_kktix_ticket_${suffix} --upload" 17 | echo ${cmd_args} 18 | ${cmd_upload} ${cmd_args} 19 | done 20 | done 21 | -------------------------------------------------------------------------------- /dags/airflow-log-cleanup.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | """ 3 | A maintenance workflow that you can deploy into Airflow to periodically clean 4 | out the task logs to avoid those getting too big. 5 | airflow trigger_dag --conf '[curly-braces]"maxLogAgeInDays":30[curly-braces]' airflow-log-cleanup 6 | --conf options: 7 | maxLogAgeInDays: - Optional 8 | """ 9 | 10 | import logging 11 | import os 12 | from datetime import timedelta 13 | 14 | import airflow 15 | import jinja2 16 | from airflow.configuration import conf 17 | from airflow.models import DAG, Variable 18 | from airflow.operators.bash_operator import BashOperator 19 | from airflow.operators.dummy_operator import DummyOperator 20 | 21 | # airflow-log-cleanup 22 | DAG_ID = os.path.basename(__file__).replace(".pyc", "").replace(".py", "") 23 | START_DATE = airflow.utils.dates.days_ago(1) 24 | try: 25 | BASE_LOG_FOLDER = conf.get("core", "BASE_LOG_FOLDER").rstrip("/") 26 | except Exception: 27 | BASE_LOG_FOLDER = conf.get("logging", "BASE_LOG_FOLDER").rstrip("/") 28 | # How often to Run. @daily - Once a day at Midnight 29 | SCHEDULE_INTERVAL = "@daily" 30 | # Who is listed as the owner of this DAG in the Airflow Web Server 31 | DAG_OWNER_NAME = "operations" 32 | # List of email address to send email alerts to if this job fails 33 | ALERT_EMAIL_ADDRESSES = ["davidtnfsh@gmail.com"] 34 | # Length to retain the log files if not already provided in the conf. 
If this 35 | # is set to 30, the job will remove those files that are 30 days old or older 36 | DEFAULT_MAX_LOG_AGE_IN_DAYS = Variable.get( 37 | "airflow_log_cleanup__max_log_age_in_days", 3 38 | ) 39 | # Whether the job should delete the logs or not. Included if you want to 40 | # temporarily avoid deleting the logs 41 | ENABLE_DELETE = True 42 | # The number of worker nodes you have in Airflow. Will attempt to run this 43 | # process for however many workers there are so that each worker gets its 44 | # logs cleared. 45 | NUMBER_OF_WORKERS = 1 46 | DIRECTORIES_TO_DELETE = [BASE_LOG_FOLDER] 47 | ENABLE_DELETE_CHILD_LOG = Variable.get( 48 | "airflow_log_cleanup__enable_delete_child_log", "True" 49 | ) 50 | LOG_CLEANUP_PROCESS_LOCK_FILE = "/tmp/airflow_log_cleanup_worker.lock" 51 | logging.info("ENABLE_DELETE_CHILD_LOG " + ENABLE_DELETE_CHILD_LOG) 52 | 53 | if not BASE_LOG_FOLDER or BASE_LOG_FOLDER.strip() == "": 54 | raise ValueError( 55 | "BASE_LOG_FOLDER variable is empty in airflow.cfg. It can be found " 56 | "under the [core] (<2.0.0) section or [logging] (>=2.0.0) in the cfg file. " 57 | "Kindly provide an appropriate directory path." 58 | ) 59 | 60 | if ENABLE_DELETE_CHILD_LOG.lower() == "true": 61 | try: 62 | CHILD_PROCESS_LOG_DIRECTORY = conf.get( 63 | "scheduler", "CHILD_PROCESS_LOG_DIRECTORY" 64 | ) 65 | if CHILD_PROCESS_LOG_DIRECTORY != " ": 66 | DIRECTORIES_TO_DELETE.append(CHILD_PROCESS_LOG_DIRECTORY) 67 | except Exception as e: 68 | logging.exception( 69 | "Could not obtain CHILD_PROCESS_LOG_DIRECTORY from " 70 | + "Airflow Configurations: " 71 | + str(e) 72 | ) 73 | 74 | default_args = { 75 | "owner": DAG_OWNER_NAME, 76 | "depends_on_past": False, 77 | "email": ALERT_EMAIL_ADDRESSES, 78 | "email_on_failure": True, 79 | "email_on_retry": False, 80 | "start_date": START_DATE, 81 | "retries": 1, 82 | "retry_delay": timedelta(minutes=1), 83 | } 84 | 85 | dag = DAG( 86 | DAG_ID, 87 | default_args=default_args, 88 | schedule_interval=SCHEDULE_INTERVAL, 89 | start_date=START_DATE, 90 | tags=["teamclairvoyant", "airflow-maintenance-dags"], 91 | template_undefined=jinja2.Undefined, 92 | ) 93 | if hasattr(dag, "doc_md"): 94 | dag.doc_md = __doc__ 95 | if hasattr(dag, "catchup"): 96 | dag.catchup = False 97 | 98 | start = DummyOperator(task_id="start", dag=dag) 99 | 100 | log_cleanup = ( 101 | """ 102 | 103 | echo "Getting Configurations..." 104 | BASE_LOG_FOLDER="{{params.directory}}" 105 | WORKER_SLEEP_TIME="{{params.sleep_time}}" 106 | 107 | sleep ${WORKER_SLEEP_TIME}s 108 | 109 | MAX_LOG_AGE_IN_DAYS="{{dag_run.conf.maxLogAgeInDays}}" 110 | if [ "${MAX_LOG_AGE_IN_DAYS}" == "" ]; then 111 | echo "maxLogAgeInDays conf variable isn't included. Using Default '""" 112 | + str(DEFAULT_MAX_LOG_AGE_IN_DAYS) 113 | + """'." 
114 | MAX_LOG_AGE_IN_DAYS='""" 115 | + str(DEFAULT_MAX_LOG_AGE_IN_DAYS) 116 | + """' 117 | fi 118 | ENABLE_DELETE=""" 119 | + str("true" if ENABLE_DELETE else "false") 120 | + """ 121 | echo "Finished Getting Configurations" 122 | echo "" 123 | 124 | echo "Configurations:" 125 | echo "BASE_LOG_FOLDER: '${BASE_LOG_FOLDER}'" 126 | echo "MAX_LOG_AGE_IN_DAYS: '${MAX_LOG_AGE_IN_DAYS}'" 127 | echo "ENABLE_DELETE: '${ENABLE_DELETE}'" 128 | 129 | cleanup() { 130 | echo "Executing Find Statement: $1" 131 | FILES_MARKED_FOR_DELETE=`eval $1` 132 | echo "Process will be Deleting the following File(s)/Directory(s):" 133 | echo "${FILES_MARKED_FOR_DELETE}" 134 | echo "Process will be Deleting `echo "${FILES_MARKED_FOR_DELETE}" | \ 135 | grep -v '^$' | wc -l` File(s)/Directory(s)" \ 136 | # "grep -v '^$'" - removes empty lines. 137 | # "wc -l" - Counts the number of lines 138 | echo "" 139 | if [ "${ENABLE_DELETE}" == "true" ]; 140 | then 141 | if [ "${FILES_MARKED_FOR_DELETE}" != "" ]; 142 | then 143 | echo "Executing Delete Statement: $2" 144 | eval $2 145 | DELETE_STMT_EXIT_CODE=$? 146 | if [ "${DELETE_STMT_EXIT_CODE}" != "0" ]; then 147 | echo "Delete process failed with exit code \ 148 | '${DELETE_STMT_EXIT_CODE}'" 149 | 150 | echo "Removing lock file..." 151 | rm -f """ 152 | + str(LOG_CLEANUP_PROCESS_LOCK_FILE) 153 | + """ 154 | if [ "${REMOVE_LOCK_FILE_EXIT_CODE}" != "0" ]; then 155 | echo "Error removing the lock file. \ 156 | Check file permissions.\ 157 | To re-run the DAG, ensure that the lock file has been \ 158 | deleted (""" 159 | + str(LOG_CLEANUP_PROCESS_LOCK_FILE) 160 | + """)." 161 | exit ${REMOVE_LOCK_FILE_EXIT_CODE} 162 | fi 163 | exit ${DELETE_STMT_EXIT_CODE} 164 | fi 165 | else 166 | echo "WARN: No File(s)/Directory(s) to Delete" 167 | fi 168 | else 169 | echo "WARN: You're opted to skip deleting the File(s)/Directory(s)!!!" 170 | fi 171 | } 172 | 173 | 174 | if [ ! -f """ 175 | + str(LOG_CLEANUP_PROCESS_LOCK_FILE) 176 | + """ ]; then 177 | 178 | echo "Lock file not found on this node! \ 179 | Creating it to prevent collisions..." 180 | touch """ 181 | + str(LOG_CLEANUP_PROCESS_LOCK_FILE) 182 | + """ 183 | CREATE_LOCK_FILE_EXIT_CODE=$? 184 | if [ "${CREATE_LOCK_FILE_EXIT_CODE}" != "0" ]; then 185 | echo "Error creating the lock file. \ 186 | Check if the airflow user can create files under tmp directory. \ 187 | Exiting..." 188 | exit ${CREATE_LOCK_FILE_EXIT_CODE} 189 | fi 190 | 191 | echo "" 192 | echo "Running Cleanup Process..." 193 | 194 | FIND_STATEMENT="find ${BASE_LOG_FOLDER}/*/* -type f -mtime \ 195 | +${MAX_LOG_AGE_IN_DAYS}" 196 | DELETE_STMT="${FIND_STATEMENT} -exec rm -f {} \;" 197 | 198 | cleanup "${FIND_STATEMENT}" "${DELETE_STMT}" 199 | CLEANUP_EXIT_CODE=$? 200 | 201 | FIND_STATEMENT="find ${BASE_LOG_FOLDER}/*/* -type d -empty" 202 | DELETE_STMT="${FIND_STATEMENT} -prune -exec rm -rf {} \;" 203 | 204 | cleanup "${FIND_STATEMENT}" "${DELETE_STMT}" 205 | CLEANUP_EXIT_CODE=$? 206 | 207 | FIND_STATEMENT="find ${BASE_LOG_FOLDER}/* -type d -empty" 208 | DELETE_STMT="${FIND_STATEMENT} -prune -exec rm -rf {} \;" 209 | 210 | cleanup "${FIND_STATEMENT}" "${DELETE_STMT}" 211 | CLEANUP_EXIT_CODE=$? 212 | 213 | echo "Finished Running Cleanup Process" 214 | 215 | echo "Deleting lock file..." 216 | rm -f """ 217 | + str(LOG_CLEANUP_PROCESS_LOCK_FILE) 218 | + """ 219 | REMOVE_LOCK_FILE_EXIT_CODE=$? 220 | if [ "${REMOVE_LOCK_FILE_EXIT_CODE}" != "0" ]; then 221 | echo "Error removing the lock file. Check file permissions. 
To re-run the DAG, ensure that the lock file has been deleted (""" 222 | + str(LOG_CLEANUP_PROCESS_LOCK_FILE) 223 | + """)." 224 | exit ${REMOVE_LOCK_FILE_EXIT_CODE} 225 | fi 226 | 227 | else 228 | echo "Another task is already deleting logs on this worker node. \ 229 | Skipping it!" 230 | echo "If you believe you're receiving this message in error, kindly check \ 231 | if """ 232 | + str(LOG_CLEANUP_PROCESS_LOCK_FILE) 233 | + """ exists and delete it." 234 | exit 0 235 | fi 236 | 237 | """ 238 | ) 239 | 240 | for log_cleanup_id in range(1, NUMBER_OF_WORKERS + 1): 241 | for dir_id, directory in enumerate(DIRECTORIES_TO_DELETE): 242 | log_cleanup_op = BashOperator( 243 | task_id="log_cleanup_worker_num_" 244 | + str(log_cleanup_id) 245 | + "_dir_" 246 | + str(dir_id), 247 | bash_command=log_cleanup, 248 | params={"directory": str(directory), "sleep_time": int(log_cleanup_id) * 3}, 249 | dag=dag, 250 | ) 251 | 252 | log_cleanup_op.set_upstream(start) 253 | -------------------------------------------------------------------------------- /dags/airlfow-db-cleanup.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | """ 3 | A maintenance workflow that you can deploy into Airflow to periodically clean 4 | out the DagRun, TaskInstance, Log, XCom, Job DB and SlaMiss entries to avoid 5 | having too much data in your Airflow MetaStore. 6 | 7 | airflow trigger_dag --conf '[curly-braces]"maxDBEntryAgeInDays":30[curly-braces]' airflow-db-cleanup 8 | 9 | --conf options: 10 | maxDBEntryAgeInDays: - Optional 11 | 12 | """ 13 | 14 | import airflow 15 | from airflow import settings 16 | from airflow.configuration import conf 17 | from airflow.models import ( 18 | DAG, 19 | DagModel, 20 | DagRun, 21 | DagTag, 22 | Log, 23 | SlaMiss, 24 | TaskInstance, 25 | Variable, 26 | XCom, 27 | ) 28 | 29 | try: 30 | from airflow.jobs import BaseJob 31 | except Exception as e: 32 | from airflow.jobs.base_job import BaseJob 33 | 34 | import logging 35 | import os 36 | from datetime import datetime, timedelta 37 | 38 | import dateutil.parser 39 | from airflow.operators.python_operator import PythonOperator 40 | from sqlalchemy import and_, func 41 | from sqlalchemy.exc import ProgrammingError 42 | from sqlalchemy.orm import load_only 43 | 44 | try: 45 | # airflow.utils.timezone is available from v1.10 onwards 46 | from airflow.utils import timezone 47 | 48 | now = timezone.utcnow 49 | except ImportError: 50 | now = datetime.utcnow 51 | 52 | # airflow-db-cleanup 53 | DAG_ID = os.path.basename(__file__).replace(".pyc", "").replace(".py", "") 54 | START_DATE = airflow.utils.dates.days_ago(1) 55 | # How often to Run. @daily - Once a day at Midnight (UTC) 56 | SCHEDULE_INTERVAL = "@daily" 57 | # Who is listed as the owner of this DAG in the Airflow Web Server 58 | DAG_OWNER_NAME = "operations" 59 | # List of email address to send email alerts to if this job fails 60 | ALERT_EMAIL_ADDRESSES = ["henry410213028@gmail.com"] 61 | # Length to retain the log files if not already provided in the conf. If this 62 | # is set to 30, the job will remove those files that arE 30 days old or older. 63 | 64 | DEFAULT_MAX_DB_ENTRY_AGE_IN_DAYS = int( 65 | Variable.get("airflow_db_cleanup__max_db_entry_age_in_days", 30) 66 | ) 67 | # Prints the database entries which will be getting deleted; set to False to avoid printing large lists and slowdown process 68 | PRINT_DELETES = True 69 | # Whether the job should delete the db entries or not. 
Included if you want to 70 | # temporarily avoid deleting the db entries. 71 | ENABLE_DELETE = True 72 | 73 | # get dag model last schedule run 74 | try: 75 | dag_model_last_scheduler_run = DagModel.last_scheduler_run 76 | except AttributeError: 77 | dag_model_last_scheduler_run = DagModel.last_parsed_time 78 | 79 | # List of all the objects that will be deleted. Comment out the DB objects you 80 | # want to skip. 81 | DATABASE_OBJECTS = [ 82 | { 83 | "airflow_db_model": BaseJob, 84 | "age_check_column": BaseJob.latest_heartbeat, 85 | "keep_last": False, 86 | "keep_last_filters": None, 87 | "keep_last_group_by": None, 88 | }, 89 | { 90 | "airflow_db_model": DagRun, 91 | "age_check_column": DagRun.execution_date, 92 | "keep_last": True, 93 | "keep_last_filters": [DagRun.external_trigger.is_(False)], 94 | "keep_last_group_by": DagRun.dag_id, 95 | }, 96 | { 97 | "airflow_db_model": TaskInstance, 98 | "age_check_column": TaskInstance.execution_date, 99 | "keep_last": False, 100 | "keep_last_filters": None, 101 | "keep_last_group_by": None, 102 | }, 103 | { 104 | "airflow_db_model": Log, 105 | "age_check_column": Log.dttm, 106 | "keep_last": False, 107 | "keep_last_filters": None, 108 | "keep_last_group_by": None, 109 | }, 110 | { 111 | "airflow_db_model": XCom, 112 | "age_check_column": XCom.execution_date, 113 | "keep_last": False, 114 | "keep_last_filters": None, 115 | "keep_last_group_by": None, 116 | }, 117 | { 118 | "airflow_db_model": SlaMiss, 119 | "age_check_column": SlaMiss.execution_date, 120 | "keep_last": False, 121 | "keep_last_filters": None, 122 | "keep_last_group_by": None, 123 | }, 124 | { 125 | "airflow_db_model": DagModel, 126 | "age_check_column": dag_model_last_scheduler_run, 127 | "keep_last": False, 128 | "keep_last_filters": None, 129 | "keep_last_group_by": None, 130 | }, 131 | ] 132 | 133 | # Check for TaskReschedule model 134 | try: 135 | from airflow.models import TaskReschedule 136 | 137 | DATABASE_OBJECTS.append( 138 | { 139 | "airflow_db_model": TaskReschedule, 140 | "age_check_column": TaskReschedule.execution_date, 141 | "keep_last": False, 142 | "keep_last_filters": None, 143 | "keep_last_group_by": None, 144 | } 145 | ) 146 | 147 | except Exception as e: 148 | logging.error(e) 149 | 150 | # Check for TaskFail model 151 | try: 152 | from airflow.models import TaskFail 153 | 154 | DATABASE_OBJECTS.append( 155 | { 156 | "airflow_db_model": TaskFail, 157 | "age_check_column": TaskFail.execution_date, 158 | "keep_last": False, 159 | "keep_last_filters": None, 160 | "keep_last_group_by": None, 161 | } 162 | ) 163 | 164 | except Exception as e: 165 | logging.error(e) 166 | 167 | # Check for RenderedTaskInstanceFields model 168 | try: 169 | from airflow.models import RenderedTaskInstanceFields 170 | 171 | DATABASE_OBJECTS.append( 172 | { 173 | "airflow_db_model": RenderedTaskInstanceFields, 174 | "age_check_column": RenderedTaskInstanceFields.execution_date, 175 | "keep_last": False, 176 | "keep_last_filters": None, 177 | "keep_last_group_by": None, 178 | } 179 | ) 180 | 181 | except Exception as e: 182 | logging.error(e) 183 | 184 | # Check for ImportError model 185 | try: 186 | from airflow.models import ImportError 187 | 188 | DATABASE_OBJECTS.append( 189 | { 190 | "airflow_db_model": ImportError, 191 | "age_check_column": ImportError.timestamp, 192 | "keep_last": False, 193 | "keep_last_filters": None, 194 | "keep_last_group_by": None, 195 | } 196 | ) 197 | 198 | except Exception as e: 199 | logging.error(e) 200 | 201 | # Check for celery executor 202 | 
airflow_executor = str(conf.get("core", "executor")) 203 | logging.info("Airflow Executor: " + str(airflow_executor)) 204 | if airflow_executor == "CeleryExecutor": 205 | logging.info("Including Celery Modules") 206 | try: 207 | from celery.backends.database.models import Task, TaskSet 208 | 209 | DATABASE_OBJECTS.extend( 210 | ( 211 | { 212 | "airflow_db_model": Task, 213 | "age_check_column": Task.date_done, 214 | "keep_last": False, 215 | "keep_last_filters": None, 216 | "keep_last_group_by": None, 217 | }, 218 | { 219 | "airflow_db_model": TaskSet, 220 | "age_check_column": TaskSet.date_done, 221 | "keep_last": False, 222 | "keep_last_filters": None, 223 | "keep_last_group_by": None, 224 | }, 225 | ) 226 | ) 227 | 228 | except Exception as e: 229 | logging.error(e) 230 | 231 | session = settings.Session() 232 | 233 | default_args = { 234 | "owner": DAG_OWNER_NAME, 235 | "depends_on_past": False, 236 | "email": ALERT_EMAIL_ADDRESSES, 237 | "email_on_failure": True, 238 | "email_on_retry": False, 239 | "start_date": START_DATE, 240 | "retries": 1, 241 | "retry_delay": timedelta(minutes=1), 242 | } 243 | 244 | dag = DAG( 245 | DAG_ID, 246 | default_args=default_args, 247 | schedule_interval=SCHEDULE_INTERVAL, 248 | start_date=START_DATE, 249 | tags=["teamclairvoyant", "airflow-maintenance-dags"], 250 | ) 251 | if hasattr(dag, "doc_md"): 252 | dag.doc_md = __doc__ 253 | if hasattr(dag, "catchup"): 254 | dag.catchup = False 255 | 256 | 257 | def print_configuration_function(**context): 258 | logging.info("Loading Configurations...") 259 | dag_run_conf = context.get("dag_run").conf 260 | logging.info("dag_run.conf: " + str(dag_run_conf)) 261 | max_db_entry_age_in_days = None 262 | if dag_run_conf: 263 | max_db_entry_age_in_days = dag_run_conf.get("maxDBEntryAgeInDays", None) 264 | logging.info("maxDBEntryAgeInDays from dag_run.conf: " + str(dag_run_conf)) 265 | if max_db_entry_age_in_days is None or max_db_entry_age_in_days < 1: 266 | logging.info( 267 | "maxDBEntryAgeInDays conf variable isn't included or Variable " 268 | + "value is less than 1. 
Using Default '" 269 | + str(DEFAULT_MAX_DB_ENTRY_AGE_IN_DAYS) 270 | + "'" 271 | ) 272 | max_db_entry_age_in_days = DEFAULT_MAX_DB_ENTRY_AGE_IN_DAYS 273 | max_date = now() + timedelta(-max_db_entry_age_in_days) 274 | logging.info("Finished Loading Configurations") 275 | logging.info("") 276 | 277 | logging.info("Configurations:") 278 | logging.info("max_db_entry_age_in_days: " + str(max_db_entry_age_in_days)) 279 | logging.info("max_date: " + str(max_date)) 280 | logging.info("enable_delete: " + str(ENABLE_DELETE)) 281 | logging.info("session: " + str(session)) 282 | logging.info("") 283 | 284 | logging.info("Setting max_execution_date to XCom for Downstream Processes") 285 | context["ti"].xcom_push(key="max_date", value=max_date.isoformat()) 286 | 287 | 288 | print_configuration = PythonOperator( 289 | task_id="print_configuration", 290 | python_callable=print_configuration_function, 291 | provide_context=True, 292 | dag=dag, 293 | ) 294 | 295 | 296 | def cleanup_function(**context): 297 | logging.info("Retrieving max_execution_date from XCom") 298 | max_date = context["ti"].xcom_pull( 299 | task_ids=print_configuration.task_id, key="max_date" 300 | ) 301 | max_date = dateutil.parser.parse(max_date) # stored as iso8601 str in xcom 302 | 303 | airflow_db_model = context["params"].get("airflow_db_model") 304 | state = context["params"].get("state") 305 | age_check_column = context["params"].get("age_check_column") 306 | keep_last = context["params"].get("keep_last") 307 | keep_last_filters = context["params"].get("keep_last_filters") 308 | keep_last_group_by = context["params"].get("keep_last_group_by") 309 | 310 | logging.info("Configurations:") 311 | logging.info("max_date: " + str(max_date)) 312 | logging.info("enable_delete: " + str(ENABLE_DELETE)) 313 | logging.info("session: " + str(session)) 314 | logging.info("airflow_db_model: " + str(airflow_db_model)) 315 | logging.info("state: " + str(state)) 316 | logging.info("age_check_column: " + str(age_check_column)) 317 | logging.info("keep_last: " + str(keep_last)) 318 | logging.info("keep_last_filters: " + str(keep_last_filters)) 319 | logging.info("keep_last_group_by: " + str(keep_last_group_by)) 320 | 321 | logging.info("") 322 | 323 | logging.info("Running Cleanup Process...") 324 | 325 | try: 326 | query = session.query(airflow_db_model).options(load_only(age_check_column)) 327 | 328 | logging.info("INITIAL QUERY : " + str(query)) 329 | 330 | if keep_last: 331 | subquery = session.query(func.max(DagRun.execution_date)) 332 | # workaround for MySQL "table specified twice" issue 333 | # https://github.com/teamclairvoyant/airflow-maintenance-dags/issues/41 334 | if keep_last_filters is not None: 335 | for entry in keep_last_filters: 336 | subquery = subquery.filter(entry) 337 | 338 | logging.info("SUB QUERY [keep_last_filters]: " + str(subquery)) 339 | 340 | if keep_last_group_by is not None: 341 | subquery = subquery.group_by(keep_last_group_by) 342 | logging.info("SUB QUERY [keep_last_group_by]: " + str(subquery)) 343 | 344 | subquery = subquery.from_self() 345 | 346 | query = query.filter( 347 | and_(age_check_column.notin_(subquery)), 348 | and_(age_check_column <= max_date), 349 | ) 350 | 351 | else: 352 | query = query.filter( 353 | age_check_column <= max_date, 354 | ) 355 | 356 | if PRINT_DELETES: 357 | entries_to_delete = query.all() 358 | 359 | logging.info("Query: " + str(query)) 360 | logging.info( 361 | "Process will be Deleting the following " 362 | + str(airflow_db_model.__name__) 363 | + "(s):" 364 | ) 365 | for entry 
in entries_to_delete: 366 | logging.info( 367 | "\tEntry: " 368 | + str(entry) 369 | + ", Date: " 370 | + str(entry.__dict__[str(age_check_column).split(".")[1]]) 371 | ) 372 | 373 | logging.info( 374 | "Process will be Deleting " 375 | + str(len(entries_to_delete)) 376 | + " " 377 | + str(airflow_db_model.__name__) 378 | + "(s)" 379 | ) 380 | else: 381 | logging.warn( 382 | "You've opted to skip printing the db entries to be deleted. Set PRINT_DELETES to True to show entries!!!" 383 | ) 384 | 385 | if ENABLE_DELETE: 386 | logging.info("Performing Delete...") 387 | if airflow_db_model.__name__ == "DagModel": 388 | logging.info("Deleting tags...") 389 | ids_query = query.from_self().with_entities(DagModel.dag_id) 390 | tags_query = session.query(DagTag).filter(DagTag.dag_id.in_(ids_query)) 391 | logging.info("Tags delete Query: " + str(tags_query)) 392 | tags_query.delete(synchronize_session=False) 393 | # using bulk delete 394 | query.delete(synchronize_session=False) 395 | session.commit() 396 | logging.info("Finished Performing Delete") 397 | else: 398 | logging.warn( 399 | "You've opted to skip deleting the db entries. Set ENABLE_DELETE to True to delete entries!!!" 400 | ) 401 | 402 | logging.info("Finished Running Cleanup Process") 403 | 404 | except ProgrammingError as e: 405 | logging.error(e) 406 | logging.error( 407 | str(airflow_db_model) + " is not present in the metadata. Skipping..." 408 | ) 409 | 410 | 411 | for db_object in DATABASE_OBJECTS: 412 | cleanup_op = PythonOperator( 413 | task_id="cleanup_" + str(db_object["airflow_db_model"].__name__), 414 | python_callable=cleanup_function, 415 | params=db_object, 416 | provide_context=True, 417 | dag=dag, 418 | ) 419 | 420 | print_configuration.set_downstream(cleanup_op) 421 | -------------------------------------------------------------------------------- /dags/app/channel_reminder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/dags/app/channel_reminder/__init__.py -------------------------------------------------------------------------------- /dags/app/channel_reminder/dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Send Google Search Report to Discord 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from app.channel_reminder import udf 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "davidtnfsh", 13 | "depends_on_past": False, 14 | "start_date": datetime(2022, 9, 15), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Discord", 18 | } 19 | dag = DAG( 20 | "DISCORD_CHORES_REMINDER", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="@yearly", 23 | max_active_runs=1, 24 | catchup=False, 25 | ) 26 | with dag: 27 | REMINDER_OF_THIS_TEAM = PythonOperator( 28 | task_id="KLAIVYO_REMINDER", python_callable=udf.main 29 | ) 30 | 31 | if __name__ == "__main__": 32 | dag.cli() 33 | -------------------------------------------------------------------------------- /dags/app/channel_reminder/udf.py: -------------------------------------------------------------------------------- 1 | from airflow.models import Variable 2 | from app import discord 3 | 4 | 5 | def main() -> None: 6 | kwargs = { 7 | "webhook_url": Variable.get("DISCORD_CHORES_REMINDER_WEBHOOK"), 8 | 
"username": "Data Team Airflow reminder", 9 | "msg": ( 10 | "<@&790739794148982796> <@&755827317904769184> <@&791157626099859487>\n", 11 | "記得大會結束後,要有一個人負責去取消 Klaviyo 的訂閱,不然我們每個月會一直繳 $NTD2000 喔!", 12 | ), 13 | } 14 | discord.send_webhook_message(**kwargs) 15 | -------------------------------------------------------------------------------- /dags/app/discord.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import tenacity 3 | 4 | session = requests.session() 5 | 6 | RETRY_ARGS = dict( 7 | wait=tenacity.wait_random(min=1, max=10), 8 | stop=tenacity.stop_after_attempt(10), 9 | retry=tenacity.retry_if_exception_type(requests.exceptions.ConnectionError), 10 | ) 11 | 12 | 13 | @tenacity.retry(**RETRY_ARGS) 14 | def send_webhook_message(webhook_url: str, username: str, msg: str) -> None: 15 | session.post( 16 | webhook_url, 17 | json={"username": username, "content": msg}, 18 | ) 19 | -------------------------------------------------------------------------------- /dags/app/finance_bot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/dags/app/finance_bot/__init__.py -------------------------------------------------------------------------------- /dags/app/finance_bot/dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Send Google Search Report to Discord 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from app.finance_bot import udf 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "qchwan", 13 | "depends_on_past": False, 14 | "start_date": datetime(2023, 8, 27), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Discord", 18 | } 19 | dag = DAG( 20 | "DISCORD_FINANCE_REMINDER", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="@daily", 23 | max_active_runs=1, 24 | catchup=False, 25 | ) 26 | with dag: 27 | REMINDER_OF_THIS_TEAM = PythonOperator( 28 | task_id="FINANCE_REMINDER", python_callable=udf.main 29 | ) 30 | 31 | if __name__ == "__main__": 32 | dag.cli() 33 | -------------------------------------------------------------------------------- /dags/app/finance_bot/udf.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pygsheets 6 | import requests 7 | from airflow.models import Variable 8 | from app import discord 9 | from google.cloud import bigquery 10 | 11 | session = requests.session() 12 | 13 | 14 | def main() -> None: 15 | # read xls from google doc to df. 16 | df_xls = read_google_xls_to_df() 17 | # read bigquery to df. 
18 | df_bigquery = read_bigquery_to_df() 19 | # check difference between 2 df 20 | df_diff = df_difference(df_xls, df_bigquery) 21 | # link to bigquery and write xls file 22 | write_to_bigquery(df_diff) 23 | # push to discord 24 | kwargs = { 25 | "webhook_url": Variable.get("discord_data_stratagy_webhook"), 26 | "username": "財務機器人", 27 | "msg": refine_diff_df_to_string(df_diff), 28 | } 29 | if kwargs["msg"] != "no data": 30 | discord.send_webhook_message(**kwargs) 31 | 32 | 33 | def df_difference(df_xls, df_bigquery) -> pd.DataFrame: 34 | merged = pd.merge(df_xls, df_bigquery, how="outer", indicator=True) 35 | return merged[merged["_merge"] == "left_only"].drop("_merge", axis=1) 36 | 37 | 38 | def read_bigquery_to_df() -> pd.DataFrame: 39 | client = bigquery.Client() 40 | query = """ 41 | SELECT * 42 | FROM `pycontw-225217.ods.pycontw_finance` 43 | """ 44 | query_job = client.query(query) 45 | results = query_job.result() 46 | schema = results.schema 47 | column_names = [field.name for field in schema] 48 | data = [list(row.values()) for row in results] 49 | df = pd.DataFrame(data=data, columns=column_names) 50 | 51 | return df 52 | 53 | 54 | def read_google_xls_to_df() -> pd.DataFrame: 55 | gc = pygsheets.authorize(service_file=os.getenv("GOOGLE_APPLICATION_CREDENTIALS")) 56 | sheet = gc.open_by_url(Variable.get("finance_xls_path")) 57 | wks = sheet.sheet1 58 | df = wks.get_as_df(include_tailing_empty=False) 59 | df.replace("", np.nan, inplace=True) 60 | df.dropna(inplace=True) 61 | df = df.astype(str) 62 | df.columns = [ 63 | "Reason", 64 | "Price", 65 | "Remarks", 66 | "Team_name", 67 | "Details", 68 | "To_who", 69 | "Yes_or_No", 70 | ] 71 | return df 72 | 73 | 74 | def write_to_bigquery(df) -> None: 75 | project_id = "pycontw-225217" 76 | dataset_id = "ods" 77 | table_id = "pycontw_finance" 78 | client = bigquery.Client(project=project_id) 79 | table = client.dataset(dataset_id).table(table_id) 80 | schema = [ 81 | bigquery.SchemaField("Reason", "STRING", mode="REQUIRED"), 82 | bigquery.SchemaField("Price", "STRING", mode="REQUIRED"), 83 | bigquery.SchemaField("Remarks", "STRING", mode="REQUIRED"), 84 | bigquery.SchemaField("Team_name", "STRING", mode="REQUIRED"), 85 | bigquery.SchemaField("Details", "STRING", mode="REQUIRED"), 86 | bigquery.SchemaField("To_who", "STRING", mode="REQUIRED"), 87 | bigquery.SchemaField("Yes_or_No", "STRING", mode="REQUIRED"), 88 | ] 89 | job_config = bigquery.LoadJobConfig(schema=schema) 90 | job = client.load_table_from_dataframe(df, table, job_config=job_config) 91 | job.result() 92 | 93 | 94 | def refine_diff_df_to_string(df) -> str: 95 | msg = "" 96 | if df.empty: 97 | return "no data" 98 | else: 99 | for row in df.itertuples(index=False): 100 | msg += f"{row[0]}, 花費: {row[1]}, {row[3]}, {row[4]}\n" 101 | return msg 102 | -------------------------------------------------------------------------------- /dags/app/proposal_reminder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/dags/app/proposal_reminder/__init__.py -------------------------------------------------------------------------------- /dags/app/proposal_reminder/dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Send Proposal Summary to Discord 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from 
app.proposal_reminder import udf 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "Henry Lee", 13 | "depends_on_past": False, 14 | "start_date": datetime(2025, 2, 25), 15 | "end_date": datetime(2025, 4, 9), 16 | "retries": 2, 17 | "retry_delay": timedelta(minutes=5), 18 | } 19 | 20 | with DAG( 21 | "DISCORD_PROPOSAL_REMINDER_v3", 22 | default_args=DEFAULT_ARGS, 23 | schedule_interval="0 16 * * *", # At 16:00 (00:00 +8) 24 | max_active_runs=1, 25 | catchup=False, 26 | ) as dag: 27 | PythonOperator( 28 | task_id="SEND_PROPOSAL_SUMMARY", 29 | python_callable=udf.main, 30 | ) 31 | 32 | if __name__ == "__main__": 33 | dag.cli() 34 | -------------------------------------------------------------------------------- /dags/app/proposal_reminder/udf.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from airflow.models import Variable 3 | from app import discord 4 | 5 | 6 | def main() -> None: 7 | summary = get_proposal_summary() 8 | n_talk = summary["num_proposed_talk"] 9 | n_tutorial = summary["num_proposed_tutorial"] 10 | kwargs = { 11 | "webhook_url": Variable.get("DISCORD_PROGRAM_REMINDER_WEBHOOK"), 12 | "username": "Program talk reminder", 13 | "msg": f"目前投稿議程數: {n_talk}; 課程數: {n_tutorial}", 14 | } 15 | discord.send_webhook_message(**kwargs) 16 | 17 | 18 | def get_proposal_summary() -> dict: 19 | url = "https://tw.pycon.org/prs/api/proposals/summary/" 20 | headers = { 21 | "Content-Type": "application/json", 22 | "authorization": Variable.get("PYCON_API_TOKEN"), 23 | } 24 | response = requests.get(url, headers=headers) 25 | return response.json() 26 | -------------------------------------------------------------------------------- /dags/app/team_registration_bot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/dags/app/team_registration_bot/__init__.py -------------------------------------------------------------------------------- /dags/app/team_registration_bot/dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | send daily ordering metrics to discord channel 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from app.team_registration_bot import udf 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "davidtnfsh@gmail.com", 13 | "depends_on_past": False, 14 | "start_date": datetime(2022, 7, 4), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 18 | } 19 | dag = DAG( 20 | "KKTIX_DISCORD_BOT_FOR_TEAM_REGISTRATION", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="@daily", 23 | max_active_runs=1, 24 | catchup=False, 25 | ) 26 | with dag: 27 | SEND_MSG_TO_DISCORD = PythonOperator( 28 | task_id="LOAD_TO_DISCORD", 29 | python_callable=udf.main, 30 | ) 31 | 32 | if __name__ == "__main__": 33 | dag.cli() 34 | -------------------------------------------------------------------------------- /dags/app/team_registration_bot/udf.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | from typing import Dict 4 | 5 | from airflow.models import Variable 6 | from app import discord 7 | from google.cloud import bigquery 8 | 9 | YEAR = datetime.now().year 10 | 11 | TABLE = f"{os.getenv('BIGQUERY_PROJECT', 
'pycontw-225217')}.ods.ods_kktix_attendeeId_datetime" 12 | 13 | CLIENT = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 14 | 15 | 16 | def main() -> None: 17 | statistics = _get_statistics_from_bigquery() 18 | msg = _compose_discord_msg(statistics) 19 | kwargs = { 20 | "webhook_url": Variable.get("discord_webhook_registration_endpoint"), 21 | "username": "KKTIX order report", 22 | "msg": msg, 23 | } 24 | discord.send_webhook_message(**kwargs) 25 | 26 | 27 | def _get_statistics_from_bigquery() -> Dict: 28 | query_job = CLIENT.query( 29 | f""" 30 | WITH UNIQUE_RECORDS AS ( 31 | SELECT DISTINCT 32 | NAME, 33 | JSON_EXTRACT(ATTENDEE_INFO, '$.id') AS ORDER_ID, 34 | REPLACE(JSON_EXTRACT(ATTENDEE_INFO, '$.ticket_name'), '"', '') AS TICKET_NAME, 35 | FROM 36 | `{TABLE}` 37 | WHERE 38 | ((REFUNDED IS NULL) OR (REFUNDED = FALSE)) AND (NAME LIKE "PyCon TW {YEAR} Registration%") 39 | ) 40 | 41 | SELECT 42 | NAME, 43 | TICKET_NAME, 44 | COUNT(1) AS COUNTS 45 | FROM UNIQUE_RECORDS 46 | GROUP BY 47 | NAME, TICKET_NAME; 48 | """ # nosec 49 | ) 50 | result = query_job.result() 51 | return result 52 | 53 | 54 | ticket_price = { 55 | # please update the price for target year 56 | "企業票 - 一般階段 / Corporate - Regular Stage": 5800, 57 | "企業票 - 晚鳥階段 / Corporate - Final Stage": 6500, 58 | "企業團體票 (歡迎申請) / Group-Buy Corporate (Free to Apply)": 5220, 59 | "優惠票 (含紀念衣服) / Reserved - Community (with T-Shirt)": 2590, 60 | "貢獻者票 (含紀念衣服) / Reserved - Contributor (with T-Shirt)": 1290, 61 | "財務補助票 / Reserved - Financial Aid": 0, 62 | "邀請票 (含紀念衣服) / Reserved - Invited (with T-Shirt)": 0, 63 | "個人贊助票 (含紀念衣服) / Individual - Sponsor (with T-Shirt)": 5500, 64 | "個人票 - 早鳥 (含紀念衣服) / Individual - Early Stage (with T-Shirt)": 2790, 65 | "個人票 - 一般 (含紀念衣服)/ Individual - Regular Stage (with T-Shirt)": 3790, 66 | "個人票 - 晚鳥階段 / Individual - Final Stage": 4290, 67 | "愛心優待票 (含紀念衣服)/ Individual - Concession": 1895, 68 | } 69 | 70 | 71 | def _compose_discord_msg(payload) -> str: 72 | msg = f"Hi 這是今天 {datetime.now().date()} 的票種統計資料,售票期結束後,請 follow README 的 `gcloud` 指令進去把 Airflow DAG 關掉\n\n" 73 | total = 0 74 | total_income = 0 75 | for name, ticket_name, counts in payload: 76 | msg += f" * 票種:{ticket_name}\t{counts}張\n" 77 | total += counts 78 | total_income += ticket_price.get(ticket_name, 0) * counts 79 | total_income = f"{total_income:,}" 80 | msg += f"dashboard: https://metabase.pycon.tw/question/142?year={YEAR}\n" 81 | msg += f"總共賣出 {total} 張喔~ (總收入 TWD${total_income})" 82 | return msg 83 | -------------------------------------------------------------------------------- /dags/app/twitter_post_notification_bot/dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Send Proposal Summary to Discord 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from app.twitter_post_notification_bot import udf 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "David Jr.", 13 | "depends_on_past": False, 14 | "start_date": datetime(2023, 7, 1), 15 | "retries": 1, 16 | "retry_delay": timedelta(minutes=5), 17 | } 18 | 19 | with DAG( 20 | "TWITTER_POST_NOTIFICATION_BOT_V2", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="@daily", 23 | max_active_runs=1, 24 | catchup=False, 25 | ) as dag: 26 | PythonOperator( 27 | task_id="SEND_TWITTER_POST_NOTIFICATION", 28 | python_callable=udf.main, 29 | ) 30 | 31 | if __name__ == "__main__": 32 | dag.cli() 33 | 
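Stepping back to dags/app/team_registration_bot/udf.py above: _compose_discord_msg folds the (NAME, TICKET_NAME, COUNTS) rows returned by the BigQuery query into a total ticket count and an estimated income, looking each ticket name up in the ticket_price dict and falling back to 0 for names it does not know. The snippet below is a minimal standalone sketch of that roll-up; the rows and prices are made up for illustration and are not taken from the repository or from real sales data.

# Hypothetical rows shaped like the query result consumed by _compose_discord_msg
ticket_price = {
    "Individual - Regular Stage": 3790,  # assumed price, illustration only
    "Corporate - Regular Stage": 5800,   # assumed price, illustration only
}
payload = [
    ("PyCon TW 2025 Registration", "Individual - Regular Stage", 3),
    ("PyCon TW 2025 Registration", "Corporate - Regular Stage", 2),
    ("PyCon TW 2025 Registration", "Some Unlisted Ticket", 1),  # unknown name -> priced at 0, still counted
]

total = sum(counts for _, _, counts in payload)  # 6 tickets sold
total_income = sum(ticket_price.get(name, 0) * counts for _, name, counts in payload)  # 3*3790 + 2*5800 = 22970
print(f"sold {total} tickets, income TWD${total_income:,}")  # sold 6 tickets, income TWD$22,970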
-------------------------------------------------------------------------------- /dags/app/twitter_post_notification_bot/udf.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from airflow import settings 3 | from airflow.models import Variable 4 | from sqlalchemy.orm import sessionmaker 5 | 6 | 7 | def main() -> None: 8 | url = "https://twitter135.p.rapidapi.com/v2/UserTweets/" 9 | # 499339900 is PyConTW's twitter id 10 | querystring = {"id": "499339900", "count": "1"} 11 | headers = { 12 | "X-RapidAPI-Key": Variable.get("RAPIDAPIAPI_KEY"), 13 | "X-RapidAPI-Host": "twitter135.p.rapidapi.com", 14 | } 15 | webhook_url = Variable.get("DISCORD_POST_NOTIFICATION_WEBHOOK") 16 | response = requests.get(url, headers=headers, params=querystring) 17 | response_json = response.json() 18 | try: 19 | Session = sessionmaker(bind=settings.engine) 20 | # Update the variable using a context manager 21 | variable_key = "TWITTER_LATEST_REST_ID" 22 | rest_id = response_json["data"]["user"]["result"]["timeline_v2"]["timeline"][ 23 | "instructions" 24 | ][1]["entries"][0]["content"]["itemContent"]["tweet_results"]["result"][ 25 | "rest_id" 26 | ] 27 | full_text = response_json["data"]["user"]["result"]["timeline_v2"]["timeline"][ 28 | "instructions" 29 | ][1]["entries"][0]["content"]["itemContent"]["tweet_results"]["result"][ 30 | "legacy" 31 | ]["full_text"] 32 | rest_id_in_DB = Variable.get(variable_key) 33 | if rest_id_in_DB < rest_id: 34 | # Create a session 35 | session = Session() 36 | 37 | # Query the variable by key 38 | variable = session.query(Variable).filter_by(key=variable_key).first() 39 | 40 | # Update the variable value 41 | variable.set_val(rest_id) 42 | 43 | msg = f"new twitter post: https://twitter.com/PyConTW/status/{rest_id}\n\n{full_text}" 44 | requests.post( 45 | url=webhook_url, 46 | json={"username": "Twitter Post Notification", "content": msg}, 47 | ) 48 | 49 | # Commit the changes to the database 50 | session.commit() 51 | 52 | # Close the session 53 | session.close() 54 | except Exception: 55 | requests.post( 56 | url=webhook_url, 57 | json={ 58 | "username": "Twitter Post Notification", 59 | "content": str(response_json), 60 | }, 61 | ) 62 | -------------------------------------------------------------------------------- /dags/dwd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/dags/dwd/__init__.py -------------------------------------------------------------------------------- /dags/dws/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/dags/dws/__init__.py -------------------------------------------------------------------------------- /dags/ods/fb_post_insights/dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.python_operator import PythonOperator 5 | from ods.fb_post_insights import udfs 6 | 7 | DEFAULT_ARGS = { 8 | "owner": "CHWan", 9 | "depends_on_past": False, 10 | "start_date": datetime(2023, 6, 14, 0), 11 | "retries": 2, 12 | "retry_delay": timedelta(minutes=5), 13 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 14 | } 15 | dag = DAG( 16 | "FB_POST_INSIGHTS_V1", 17 | 
default_args=DEFAULT_ARGS, 18 | schedule_interval="5 8 * * *", 19 | max_active_runs=1, 20 | catchup=False, 21 | ) 22 | with dag: 23 | CREATE_TABLE_IF_NEEDED = PythonOperator( 24 | task_id="CREATE_TABLE_IF_NEEDED", 25 | python_callable=udfs.create_table_if_needed, 26 | ) 27 | 28 | SAVE_FB_POSTS_AND_INSIGHTS = PythonOperator( 29 | task_id="SAVE_FB_POSTS_AND_INSIGHTS", 30 | python_callable=udfs.save_fb_posts_and_insights, 31 | ) 32 | 33 | CREATE_TABLE_IF_NEEDED >> SAVE_FB_POSTS_AND_INSIGHTS 34 | 35 | 36 | if __name__ == "__main__": 37 | dag.cli() 38 | -------------------------------------------------------------------------------- /dags/ods/fb_post_insights/udfs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from datetime import datetime 4 | from typing import List, Optional 5 | 6 | import requests 7 | from airflow.models import Variable 8 | from google.cloud import bigquery 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def create_table_if_needed() -> None: 14 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 15 | post_sql = """ 16 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_fb_posts` ( 17 | id STRING, 18 | created_at TIMESTAMP, 19 | message STRING 20 | ) 21 | """ 22 | client.query(post_sql) 23 | insights_sql = """ 24 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_fb_posts_insights` ( 25 | post_id STRING, 26 | query_time TIMESTAMP, 27 | comments INTEGER, 28 | reactions INTEGER, 29 | share INTEGER 30 | ) 31 | """ 32 | client.query(insights_sql) 33 | 34 | 35 | def save_fb_posts_and_insights() -> None: 36 | posts = request_posts_data() 37 | 38 | last_post = query_last_post() 39 | if last_post is None: 40 | new_posts = posts 41 | else: 42 | new_posts = [ 43 | post 44 | for post in posts 45 | if datetime.strptime( 46 | post["created_time"], "%Y-%m-%dT%H:%M:%S%z" 47 | ).timestamp() 48 | > last_post["created_at"].timestamp() 49 | ] 50 | 51 | if not dump_posts_to_bigquery( 52 | [ 53 | { 54 | "id": post["id"], 55 | "created_at": convert_fb_time(post["created_time"]), 56 | "message": post.get("message", "No message found"), 57 | } 58 | for post in new_posts 59 | ] 60 | ): 61 | raise RuntimeError("Failed to dump posts to BigQuery") 62 | 63 | if not dump_posts_insights_to_bigquery( 64 | [ 65 | { 66 | "post_id": post["id"], 67 | "query_time": datetime.now().timestamp(), 68 | "comments": post["comments"]["summary"]["total_count"], 69 | "reactions": post["reactions"]["summary"]["total_count"], 70 | "share": post.get("shares", {}).get("count", 0), 71 | } 72 | for post in posts 73 | ] 74 | ): 75 | raise RuntimeError("Failed to dump posts insights to BigQuery") 76 | 77 | 78 | def query_last_post() -> Optional[dict]: 79 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 80 | sql = """ 81 | SELECT 82 | created_at 83 | FROM 84 | `pycontw-225217.ods.ods_pycontw_fb_posts` 85 | ORDER BY 86 | created_at DESC 87 | LIMIT 1 88 | """ 89 | result = client.query(sql) 90 | data = list(result) 91 | return data[0] if data else None 92 | 93 | 94 | def request_posts_data() -> List[dict]: 95 | url = "https://graph.facebook.com/v20.0/160712400714277/feed/" 96 | # 160712400714277 is PyConTW's fb id 97 | access_token = Variable.get("FB_ACCESS_KEY") 98 | headers = {"Content-Type": "application/json"} 99 | params = { 100 | "fields": "id,created_time,message,comments.summary(true),reactions.summary(true),shares", 101 | "access_token": access_token, 102 | } 103 | response = 
requests.get(url, headers=headers, params=params) 104 | if response.ok: 105 | return response.json()["data"] 106 | raise RuntimeError(f"Failed to fetch posts data: {response.text}") 107 | 108 | 109 | def dump_posts_to_bigquery(posts: List[dict]) -> bool: 110 | if not posts: 111 | logger.info("No posts to dump!") 112 | return True 113 | 114 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 115 | job_config = bigquery.LoadJobConfig( 116 | schema=[ 117 | bigquery.SchemaField("id", "STRING", mode="REQUIRED"), 118 | bigquery.SchemaField("created_at", "TIMESTAMP", mode="REQUIRED"), 119 | bigquery.SchemaField("message", "STRING", mode="REQUIRED"), 120 | ], 121 | write_disposition="WRITE_APPEND", 122 | ) 123 | try: 124 | job = client.load_table_from_json( 125 | posts, 126 | "pycontw-225217.ods.ods_pycontw_fb_posts", 127 | job_config=job_config, 128 | ) 129 | job.result() 130 | return True 131 | except Exception as e: 132 | logger.error(f"Failed to dump posts to BigQuery: {e}", exc_info=True) 133 | return False 134 | 135 | 136 | def dump_posts_insights_to_bigquery(posts: List[dict]) -> bool: 137 | if not posts: 138 | logger.info("No post insights to dump!") 139 | return True 140 | 141 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 142 | job_config = bigquery.LoadJobConfig( 143 | schema=[ 144 | bigquery.SchemaField("post_id", "STRING", mode="REQUIRED"), 145 | bigquery.SchemaField("query_time", "TIMESTAMP", mode="REQUIRED"), 146 | bigquery.SchemaField("comments", "INTEGER", mode="NULLABLE"), 147 | bigquery.SchemaField("reactions", "INTEGER", mode="NULLABLE"), 148 | bigquery.SchemaField("share", "INTEGER", mode="NULLABLE"), 149 | ], 150 | write_disposition="WRITE_APPEND", 151 | ) 152 | try: 153 | job = client.load_table_from_json( 154 | posts, 155 | "pycontw-225217.ods.ods_pycontw_fb_posts_insights", 156 | job_config=job_config, 157 | ) 158 | job.result() 159 | return True 160 | except Exception as e: 161 | logger.error(f"Failed to dump posts insights to BigQuery: {e}", exc_info=True) 162 | return False 163 | 164 | 165 | def convert_fb_time(time_string: str) -> str: 166 | return ( 167 | datetime.strptime(time_string, "%Y-%m-%dT%H:%M:%S%z") 168 | .strftime("%Y-%m-%d %H:%M:%S%z") 169 | .replace("+0000", "UTC") 170 | ) 171 | -------------------------------------------------------------------------------- /dags/ods/google_search_console/dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Send Google Search Report to Discord 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from ods.google_search_console.udfs.google_search import GoogleSearchConsoleReporter 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "davidtnfsh", 13 | "depends_on_past": False, 14 | "start_date": datetime(2020, 12, 9), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Discord", 18 | } 19 | dag = DAG( 20 | "GOOGLE_SEARCH_REPORT", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval=timedelta(weeks=2), 23 | max_active_runs=1, 24 | catchup=False, 25 | ) 26 | GOOGLE_SEARCH_REPORTER = GoogleSearchConsoleReporter() 27 | with dag: 28 | GET_AND_SEND_REPORT = PythonOperator( 29 | task_id="GET_AND_SEND_REPORT", 30 | python_callable=GOOGLE_SEARCH_REPORTER.main, 31 | ) 32 | 33 | if __name__ == "__main__": 34 | dag.cli() 35 | 
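The convert_fb_time helper in dags/ods/fb_post_insights/udfs.py above parses Facebook's ISO-8601 created_time values (for example 2024-06-01T12:34:56+0000) and re-renders them with a space between date and time, turning the +0000 offset into the literal string UTC. Below is a minimal standalone check of that behaviour, assuming a timestamp with a +0000 offset; it is an illustrative sketch, not one of the tests under tests/.

from datetime import datetime

def convert_fb_time(time_string: str) -> str:
    # Same logic as the helper in dags/ods/fb_post_insights/udfs.py
    return (
        datetime.strptime(time_string, "%Y-%m-%dT%H:%M:%S%z")
        .strftime("%Y-%m-%d %H:%M:%S%z")
        .replace("+0000", "UTC")
    )

# Note there is no space before "UTC": the offset token is replaced in place.
assert convert_fb_time("2024-06-01T12:34:56+0000") == "2024-06-01 12:34:56UTC"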
-------------------------------------------------------------------------------- /dags/ods/google_search_console/udfs/google_search.py: -------------------------------------------------------------------------------- 1 | import heapq 2 | import os 3 | from pathlib import Path 4 | 5 | import requests 6 | import searchconsole 7 | 8 | TOPK = 5 9 | 10 | AIRFLOW_HOME = os.getenv("AIRFLOW_HOME") 11 | 12 | 13 | class GoogleSearchConsoleReporter: 14 | def __init__(self): 15 | self.top_k_ctr = [] 16 | self.top_k_position = [] 17 | self.top_k_clicks = [] 18 | self.top_k_impressions = [] 19 | for top_k_heap in ( 20 | self.top_k_ctr, 21 | self.top_k_position, 22 | self.top_k_clicks, 23 | self.top_k_impressions, 24 | ): 25 | heapq.heapify(top_k_heap) 26 | self.report = None 27 | 28 | def main(self): 29 | report_msg = self._get_report() 30 | self._send_report(report_msg) 31 | 32 | def _get_report(self): 33 | client_config_path = ( 34 | Path(AIRFLOW_HOME) / "dags/client_secret_google_search_console.json" 35 | ) 36 | credentials_path = ( 37 | Path(AIRFLOW_HOME) 38 | / "dags/client_secret_google_search_console_serialized.json" 39 | ) 40 | account = searchconsole.authenticate( 41 | client_config=client_config_path, 42 | credentials=credentials_path, 43 | ) 44 | webproperty = account["https://tw.pycon.org/"] 45 | return webproperty.query.range("today", days=-7).dimension("query").get() 46 | 47 | def _send_report(self, report_msg): 48 | self._maitain_topk_heap(report_msg) 49 | msg_heap_dict = { 50 | f"透過 google 搜尋點進 PyConTW 官網的所有關鍵字中,ctr 最高的前{TOPK}名關鍵字": self.top_k_ctr, 51 | f"透過 google 搜尋點進 PyConTW 官網的所有關鍵字中,官網排名位置越靠前的前{TOPK}名": self.top_k_position, 52 | f"透過 google 搜尋點進 PyConTW 官網的所有關鍵字中,clicks 數最高的前{TOPK}名關鍵字": self.top_k_clicks, 53 | f"透過 google 搜尋點進 PyConTW 官網的所有關鍵字中,impressions 數最高的前{TOPK}名關鍵字": self.top_k_impressions, 54 | } 55 | 56 | for msg, heap in msg_heap_dict.items(): 57 | self._send_msg_to_discord(msg, heap) 58 | self._send_team_msg() 59 | 60 | def _maitain_topk_heap(self, report_msg): 61 | def heappush(heap, item, topk): 62 | heapq.heappush(heap, item) 63 | while len(heap) > topk: 64 | heapq.heappop(heap) 65 | 66 | for row in report_msg.rows: 67 | heappush(self.top_k_ctr, (row.ctr, row.query), TOPK) 68 | heappush(self.top_k_position, (-row.position, row.query), TOPK) 69 | heappush(self.top_k_clicks, (row.clicks, row.query), TOPK) 70 | heappush(self.top_k_impressions, (row.impressions, row.query), TOPK) 71 | 72 | @staticmethod 73 | def _send_msg_to_discord(msg, heap): 74 | def get_topk_from_heap(heap): 75 | def turn_negative_back_to_positive_int(heap): 76 | return [(num if num >= 0 else -num, query) for num, query in heap] 77 | 78 | return turn_negative_back_to_positive_int(sorted(heap, key=lambda x: -x[0])) 79 | 80 | def format_heap_content(topk_heap): 81 | return "\n".join([f'"{query}"\t{num}' for num, query in topk_heap]) 82 | 83 | topk_heap = get_topk_from_heap(heap) 84 | formatted_heap_content = format_heap_content(topk_heap) 85 | requests.post( 86 | os.getenv("DISCORD_WEBHOOK"), 87 | json={ 88 | "username": "Data Team 雙週報", 89 | "content": f"{msg}:\n {formatted_heap_content}\n----------------------\n", 90 | }, 91 | ) 92 | 93 | @staticmethod 94 | def _send_team_msg(): 95 | requests.post( 96 | os.getenv("DISCORD_WEBHOOK"), 97 | json={ 98 | "username": "Data Team 雙週報", 99 | "content": "有任何問題,歡迎敲 data team 任何一位成員~", 100 | }, 101 | ) 102 | 103 | 104 | if __name__ == "__main__": 105 | g = GoogleSearchConsoleReporter() 106 | g.main() 107 | 
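The _maitain_topk_heap method above keeps only the TOPK best entries per metric: each (value, query) pair is pushed onto a min-heap, and whenever the heap grows past TOPK the smallest element is popped, so the K largest values survive; for position, where a lower number is better, the value is pushed negated and flipped back to positive before reporting. A standalone sketch of that bounded-heap pattern with made-up numbers:

import heapq

TOPK = 3

def heappush_topk(heap, item, topk):
    # Same pattern as GoogleSearchConsoleReporter._maitain_topk_heap: keep only the topk largest items
    heapq.heappush(heap, item)
    while len(heap) > topk:
        heapq.heappop(heap)  # discard the current minimum

clicks_heap = []
for clicks, query in [(12, "pycon"), (3, "python conference"), (40, "pycon tw"), (7, "pycontw 2020")]:
    heappush_topk(clicks_heap, (clicks, query), TOPK)
print(sorted(clicks_heap, reverse=True))
# [(40, 'pycon tw'), (12, 'pycon'), (7, 'pycontw 2020')]

position_heap = []  # lower position is better, so push the negated value
for position, query in [(1.2, "pycon tw"), (8.5, "python conference"), (3.4, "pycon")]:
    heappush_topk(position_heap, (-position, query), TOPK)
print([(-neg_position, query) for neg_position, query in sorted(position_heap, reverse=True)])
# [(1.2, 'pycon tw'), (3.4, 'pycon'), (8.5, 'python conference')]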
-------------------------------------------------------------------------------- /dags/ods/ig_post_insights/dags.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.python_operator import PythonOperator 5 | from ods.ig_post_insights import udfs 6 | 7 | DEFAULT_ARGS = { 8 | "owner": "Angus Yang", 9 | "depends_on_past": False, 10 | "start_date": datetime(2023, 6, 14, 0), 11 | "retries": 2, 12 | "retry_delay": timedelta(minutes=5), 13 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 14 | } 15 | dag = DAG( 16 | "IG_POST_INSIGHTS_V1", 17 | default_args=DEFAULT_ARGS, 18 | schedule_interval="5 8 * * *", 19 | max_active_runs=1, 20 | catchup=False, 21 | ) 22 | with dag: 23 | CREATE_TABLE_IF_NEEDED = PythonOperator( 24 | task_id="CREATE_TABLE_IF_NEEDED", 25 | python_callable=udfs.create_table_if_needed, 26 | ) 27 | 28 | SAVE_TWITTER_POSTS_AND_INSIGHTS = PythonOperator( 29 | task_id="SAVE_IG_POSTS_AND_INSIGHTS", 30 | python_callable=udfs.save_posts_and_insights, 31 | ) 32 | 33 | CREATE_TABLE_IF_NEEDED >> SAVE_TWITTER_POSTS_AND_INSIGHTS 34 | 35 | 36 | if __name__ == "__main__": 37 | dag.cli() 38 | -------------------------------------------------------------------------------- /dags/ods/ig_post_insights/udfs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from datetime import datetime 4 | from typing import List, Optional 5 | 6 | import requests 7 | from airflow.models import Variable 8 | from google.cloud import bigquery 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | # IG API docs 14 | # https://developers.facebook.com/docs/instagram-api/reference/ig-user/media?locale=zh_TW 15 | # https://developers.facebook.com/docs/instagram-api/reference/ig-media 16 | 17 | # // get list of media-id 18 | # GET /v20.0/{page-id}/media/?access_token={access_token} 19 | 20 | # // get media detail 21 | # GET /v20.0/{media-id}?access_token={access_token}&fields=id,media_type,caption,timestamp,comments_count,like_count 22 | 23 | # PyConTW IG page-id: 17841405043609765 24 | # ps. 
IG api 目前不提供分享數, 所以只有點讚數和留言數 25 | 26 | # Access Token 27 | # Check Henry 28 | 29 | 30 | def create_table_if_needed() -> None: 31 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 32 | post_sql = """ 33 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_ig_posts` ( 34 | id STRING, 35 | created_at TIMESTAMP, 36 | message STRING 37 | ) 38 | """ 39 | client.query(post_sql) 40 | insights_sql = """ 41 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_ig_posts_insights` ( 42 | post_id STRING, 43 | query_time TIMESTAMP, 44 | period STRING, 45 | favorite INTEGER, 46 | reply INTEGER, 47 | retweet INTEGER, 48 | views INTEGER 49 | ) 50 | """ 51 | client.query(insights_sql) 52 | 53 | 54 | def save_posts_and_insights() -> None: 55 | posts = request_posts_data() 56 | 57 | last_post = query_last_post() 58 | new_posts = ( 59 | [ 60 | post 61 | for post in posts 62 | if post["timestamp"] > last_post["created_at"].timestamp() 63 | ] 64 | if last_post 65 | else posts 66 | ) 67 | 68 | if not dump_posts_to_bigquery( 69 | [ 70 | { 71 | "id": post["id"], 72 | "created_at": post["timestamp"], 73 | "message": post["caption"], 74 | } 75 | for post in new_posts 76 | ] 77 | ): 78 | raise RuntimeError("Failed to dump posts to BigQuery") 79 | 80 | if not dump_posts_insights_to_bigquery( 81 | [ 82 | { 83 | "post_id": post["id"], 84 | "query_time": datetime.now().timestamp(), 85 | "period": "lifetime", 86 | "favorite": post["like_count"], 87 | "reply": post["comments_count"], 88 | "retweet": "0", # API not supported 89 | "views": "0", # API not supported 90 | } 91 | for post in posts 92 | ] 93 | ): 94 | raise RuntimeError("Failed to dump posts insights to BigQuery") 95 | 96 | 97 | def query_last_post() -> Optional[dict]: 98 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 99 | sql = """ 100 | SELECT 101 | created_at 102 | FROM 103 | `pycontw-225217.ods.ods_pycontw_ig_posts` 104 | ORDER BY 105 | created_at DESC 106 | LIMIT 1 107 | """ 108 | result = client.query(sql) 109 | data = list(result) 110 | return data[0] if data else None 111 | 112 | 113 | def request_posts_data() -> List[dict]: 114 | media_list_url = "https://graph.facebook.com/v20.0/17841405043609765/media" 115 | querystring = {"access_token": Variable.get("IG_ACCESS_TOKEN"), "limit": "0"} 116 | headers = {"Content-Type": "application/json"} 117 | 118 | response = requests.get( 119 | media_list_url, headers=headers, params=querystring, timeout=180 120 | ) 121 | if not response.ok: 122 | raise RuntimeError(f"Failed to fetch posts data: {response.text}") 123 | media_list = response.json()["data"] 124 | 125 | media_insight_list = [] 126 | 127 | for media in media_list: 128 | media_insight_url = f"https://graph.facebook.com/v20.0/{media['id']}" 129 | querystring = { 130 | "access_token": Variable.get("IG_ACCESS_TOKEN"), 131 | "fields": "id,media_type,caption,timestamp,comments_count,like_count", 132 | } 133 | response = requests.get( 134 | media_insight_url, headers=headers, params=querystring, timeout=180 135 | ) 136 | if not response.ok: 137 | raise RuntimeError(f"Failed to fetch posts data: {response.text}") 138 | 139 | media_insight = {} 140 | media_res: dict = response.json() 141 | # Error handling, the response may not include the required fields, media id: 17889558458829258, no "caption" 142 | media_insight["id"] = media_res.get("id", "0") 143 | media_insight["timestamp"] = datetime.strptime( 144 | media_res.get("timestamp", "0"), "%Y-%m-%dT%H:%M:%S%z" 145 | ).timestamp() 146 | media_insight["caption"] = 
media_res.get("caption", "No Content") 147 | media_insight["comments_count"] = media_res.get("comments_count", "0") 148 | media_insight["like_count"] = media_res.get("like_count", "0") 149 | media_insight["media_type"] = media_res.get("media_type", "No Content") 150 | 151 | # print(media_insight) 152 | media_insight_list.append(media_insight) 153 | 154 | return media_insight_list 155 | 156 | 157 | def dump_posts_to_bigquery(posts: List[dict]) -> bool: 158 | if not posts: 159 | logger.info("No posts to dump!") 160 | return True 161 | 162 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 163 | job_config = bigquery.LoadJobConfig( 164 | schema=[ 165 | bigquery.SchemaField("id", "STRING", mode="REQUIRED"), 166 | bigquery.SchemaField("created_at", "TIMESTAMP", mode="REQUIRED"), 167 | bigquery.SchemaField("message", "STRING", mode="REQUIRED"), 168 | ], 169 | write_disposition="WRITE_APPEND", 170 | ) 171 | try: 172 | job = client.load_table_from_json( 173 | posts, 174 | "pycontw-225217.ods.ods_pycontw_ig_posts", 175 | job_config=job_config, 176 | ) 177 | job.result() 178 | return True 179 | except Exception as e: 180 | logger.error(f"Failed to dump posts to BigQuery: {e}", exc_info=True) 181 | return False 182 | 183 | 184 | def dump_posts_insights_to_bigquery(posts: List[dict]) -> bool: 185 | if not posts: 186 | logger.info("No post insights to dump!") 187 | return True 188 | 189 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 190 | job_config = bigquery.LoadJobConfig( 191 | schema=[ 192 | bigquery.SchemaField("post_id", "STRING", mode="REQUIRED"), 193 | bigquery.SchemaField("query_time", "TIMESTAMP", mode="REQUIRED"), 194 | bigquery.SchemaField("period", "STRING", mode="REQUIRED"), 195 | bigquery.SchemaField("favorite", "INTEGER", mode="NULLABLE"), 196 | bigquery.SchemaField("reply", "INTEGER", mode="NULLABLE"), 197 | bigquery.SchemaField("retweet", "INTEGER", mode="NULLABLE"), 198 | bigquery.SchemaField("views", "INTEGER", mode="NULLABLE"), 199 | ], 200 | write_disposition="WRITE_APPEND", 201 | ) 202 | try: 203 | job = client.load_table_from_json( 204 | posts, 205 | "pycontw-225217.ods.ods_pycontw_ig_posts_insights", 206 | job_config=job_config, 207 | ) 208 | job.result() 209 | return True 210 | except Exception as e: 211 | logger.error(f"Failed to dump posts insights to BigQuery: {e}", exc_info=True) 212 | return False 213 | 214 | 215 | def test_main(): 216 | create_table_if_needed() 217 | 218 | save_posts_and_insights() 219 | 220 | 221 | if __name__ == "__main__": 222 | test_main() 223 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/kktix_dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ingest KKTIX's data and load them to BigQuery every 5mins 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from ods.kktix_ticket_orders.udfs import bigquery_loader, kktix_api 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "davidtnfsh@gmail.com", 13 | "depends_on_past": False, 14 | "start_date": datetime(2024, 6, 16, 15), # 23 (+8) 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 18 | } 19 | dag = DAG( 20 | "KKTIX_TICKET_ORDERS_V10", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="50 * * * *", 23 | max_active_runs=1, 24 | catchup=True, 25 | ) 26 | with dag: 27 | 
CREATE_TABLE_IF_NEEDED = PythonOperator( 28 | task_id="CREATE_TABLE_IF_NEEDED", 29 | python_callable=bigquery_loader.create_table_if_needed, 30 | ) 31 | 32 | GET_ATTENDEE_INFOS = PythonOperator( 33 | task_id="GET_ATTENDEE_INFOS", 34 | python_callable=kktix_api.main, 35 | provide_context=True, 36 | ) 37 | 38 | CREATE_TABLE_IF_NEEDED >> GET_ATTENDEE_INFOS 39 | 40 | if __name__ == "__main__": 41 | dag.cli() 42 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/kktix_refund_dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Update KKTIX's data if attendee has been refunded 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from ods.kktix_ticket_orders.udfs import kktix_refund 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "henry410213028@gmail.com", 13 | "depends_on_past": False, 14 | "start_date": datetime(2024, 6, 18, 0), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 18 | } 19 | dag = DAG( 20 | "KKTIX_TICKET_REFUND_V3", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="50 23 * * *", # At 23:50 (everyday) 23 | max_active_runs=1, 24 | catchup=True, 25 | ) 26 | with dag: 27 | UPDATE_REFUNDED_ATTENDEE_IDS = PythonOperator( 28 | task_id="UPDATE_REFUNDED_ATTENDEE_IDS", 29 | python_callable=kktix_refund.main, 30 | ) 31 | 32 | UPDATE_REFUNDED_ATTENDEE_IDS 33 | 34 | if __name__ == "__main__": 35 | dag.cli() 36 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/klaviyo_backfill_dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ingest KKTIX's daily data and load them to Mailer 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from ods.kktix_ticket_orders.udfs import batch_kktix2mailer 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "henry410213028@gmail.com", 13 | "depends_on_past": False, 14 | "start_date": datetime(2022, 8, 29), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 18 | } 19 | dag = DAG( 20 | "KLAVIYO_SEND_MAIL_V3", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="0 * * * *", 23 | max_active_runs=1, 24 | catchup=True, 25 | ) 26 | with dag: 27 | GET_ATTENDEE_INFOS = PythonOperator( 28 | task_id="GET_ATTENDEE_INFOS", 29 | python_callable=batch_kktix2mailer.main, 30 | provide_context=True, 31 | ) 32 | 33 | GET_ATTENDEE_INFOS 34 | 35 | if __name__ == "__main__": 36 | dag.cli() 37 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/sqls/create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS `{}` 2 | ( 3 | ID INT64 NOT NULL, 4 | NAME STRING NOT NULL, 5 | ATTENDEE_INFO STRING NOT NULL 6 | ); 7 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/batch_kktix2mailer.py: -------------------------------------------------------------------------------- 1 | from dateutil.parser import parse 2 | from ods.kktix_ticket_orders.udfs import kktix_api, klaviyo_loader 3 | 4 | 5 | def 
main(**context): 6 | """ 7 | Extract user info from kktix api and load to mailer 8 | """ 9 | schedule_interval = context["dag"].schedule_interval 10 | # If we change the schedule_interval, we need to update the logic in condition_filter_callback 11 | assert schedule_interval == "0 * * * *" # nosec 12 | ts_datetime_obj = parse(context["ts"]) 13 | year = ts_datetime_obj.year 14 | timestamp = ts_datetime_obj.timestamp() 15 | event_raw_data_array = kktix_api._extract( 16 | year=year, 17 | timestamp=timestamp, 18 | ) 19 | # load name and email to mailer before data has been hashed 20 | klaviyo_loader.load(event_raw_data_array) 21 | print(f"Batch load {len(event_raw_data_array)} data to downstream task") 22 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/bigquery_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | from google.cloud import bigquery 5 | 6 | TABLE = f"{os.getenv('BIGQUERY_PROJECT', 'pycontw-225217')}.ods.ods_kktix_attendeeId_datetime" 7 | # since backfill would insert duplicate records, we need this dedupe to make it idempotent 8 | DEDUPE_SQL = f""" 9 | CREATE OR REPLACE TABLE 10 | `{TABLE}` AS 11 | SELECT 12 | DISTINCT * 13 | FROM 14 | `{TABLE}` 15 | """ # nosec 16 | AIRFLOW_HOME = os.getenv("AIRFLOW_HOME") 17 | 18 | 19 | def create_table_if_needed() -> None: 20 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 21 | sql_filepath = ( 22 | Path(AIRFLOW_HOME) / "dags/ods/kktix_ticket_orders/sqls/create_table.sql" 23 | ) 24 | sql = sql_filepath.read_text().format(TABLE) 25 | client.query(sql) 26 | client.query(DEDUPE_SQL) 27 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/gather_town_loader.py: -------------------------------------------------------------------------------- 1 | """ 2 | load user's name, email etc into gather town whitelist 3 | please refer to this document for details: https://hackmd.io/PM_sWO5USo6dxMqT1uCrCQ?view 4 | """ 5 | 6 | import requests 7 | import tenacity 8 | from airflow.hooks.http_hook import HttpHook 9 | from airflow.models import Variable 10 | 11 | RETRY_ARGS = dict( 12 | wait=tenacity.wait_none(), 13 | stop=tenacity.stop_after_attempt(3), 14 | retry=tenacity.retry_if_exception_type(requests.exceptions.ConnectionError), 15 | ) 16 | 17 | GATHERTOWN_HTTP_HOOK = HttpHook(http_conn_id="gathertown_api", method="POST") 18 | 19 | 20 | def load(**context): 21 | event_raw_data_array = context["ti"].xcom_pull(task_ids="GET_ATTENDEE_INFOS") 22 | for event_raw_data in event_raw_data_array: 23 | resp = GATHERTOWN_HTTP_HOOK.run_with_advanced_retry( 24 | endpoint="/api/setEmailGuestlist", 25 | _retry_args=RETRY_ARGS, 26 | json={ 27 | "spaceId": Variable.get("gather_town_space_id"), 28 | "apiKey": Variable.get("gather_town_api_key"), 29 | "guestlist": { 30 | event_raw_data["聯絡人 Email"]: { 31 | "name": "", 32 | "role": "guest", 33 | "affiliation": "Attendee", 34 | } 35 | }, 36 | }, 37 | headers={"Accept": "application/json", "Content-Type": "application/json"}, 38 | ).json() 39 | print(resp) 40 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/kktix_api.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Callable, Dict, List 3 | 4 | import requests 5 | import tenacity 6 | from 
airflow.hooks.http_hook import HttpHook 7 | from airflow.models import Variable 8 | from dateutil.parser import parse 9 | from ods.kktix_ticket_orders.udfs import kktix_loader, kktix_transformer 10 | 11 | SCHEDULE_INTERVAL_SECONDS: int = 3600 12 | HTTP_HOOK = HttpHook(http_conn_id="kktix_api", method="GET") 13 | RETRY_ARGS = dict( 14 | wait=tenacity.wait_none(), 15 | stop=tenacity.stop_after_attempt(3), 16 | retry=tenacity.retry_if_exception_type(requests.exceptions.ConnectionError), 17 | ) 18 | 19 | 20 | def main(**context): 21 | """ 22 | ETL pipeline should consists of extract, transform and load 23 | """ 24 | schedule_interval = context["dag"].schedule_interval 25 | # If we change the schedule_interval, we need to update the logic in condition_filter_callback 26 | assert schedule_interval == "50 * * * *" # nosec 27 | ts_datetime_obj = parse(context["ts"]) 28 | year = ts_datetime_obj.year 29 | timestamp = ts_datetime_obj.timestamp() 30 | event_raw_data_array = _extract( 31 | year=year, 32 | timestamp=timestamp, 33 | ) 34 | transformed_event_raw_data_array = kktix_transformer.transform( 35 | copy.deepcopy(event_raw_data_array) 36 | ) 37 | kktix_loader.load(transformed_event_raw_data_array) 38 | print(f"Loaded {len(transformed_event_raw_data_array)} rows to BigQuery!") 39 | 40 | # pass these unhashed data through xcom to next airflow task 41 | return kktix_transformer._extract_sensitive_unhashed_raw_data( 42 | copy.deepcopy(event_raw_data_array) 43 | ) 44 | 45 | 46 | def _extract(year: int, timestamp: float) -> List[Dict]: 47 | """ 48 | get data from KKTIX's API 49 | 1. condition_filter_callb: use this callbacl to filter out unwanted event! 50 | 2. right now schedule_interval_seconds is a hardcoded value! 51 | """ 52 | event_raw_data_array: List[Dict] = [] 53 | 54 | def _condition_filter_callback(event): 55 | return str(year) in event["name"] and "registration" in event["name"].lower() 56 | 57 | event_metadatas = get_event_metadatas(_condition_filter_callback) 58 | for event_metadata in event_metadatas: 59 | event_id = event_metadata["id"] 60 | for attendee_info in get_attendee_infos(event_id, timestamp): 61 | event_raw_data_array.append( 62 | { 63 | "id": event_id, 64 | "name": event_metadata["name"], 65 | "attendee_info": attendee_info, 66 | } 67 | ) 68 | return event_raw_data_array 69 | 70 | 71 | def get_attendee_infos(event_id: int, timestamp: float) -> List: 72 | """ 73 | it's a public wrapper for people to get attendee infos! 
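    A minimal usage sketch (the event id and timestamp here are illustrative):

        attendee_infos = get_attendee_infos(event_id=12345, timestamp=1718496000.0)

    Only paid attendees whose `updated_at` falls within one hour after
    `timestamp` are returned; see `_get_attendee_infos` below.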
74 | """ 75 | attendance_book_id = _get_attendance_book_id(event_id) 76 | attendee_ids = _get_attendee_ids(event_id, attendance_book_id) 77 | attendee_infos = _get_attendee_infos(event_id, attendee_ids, timestamp) 78 | return attendee_infos 79 | 80 | 81 | def get_event_metadatas(condition_filter: Callable) -> List[Dict]: 82 | """ 83 | Fetch all the ongoing events 84 | """ 85 | event_list_resp = HTTP_HOOK.run_with_advanced_retry( 86 | endpoint=f"{Variable.get('kktix_events_endpoint')}?only_not_ended_event=true", 87 | _retry_args=RETRY_ARGS, 88 | ).json() 89 | event_metadatas: List[dict] = [] 90 | for event in event_list_resp["data"]: 91 | if condition_filter(event): 92 | event_metadatas.append(event) 93 | return event_metadatas 94 | 95 | 96 | def _get_attendance_book_id(event_id: int) -> int: 97 | """ 98 | Fetch attendance books 99 | """ 100 | attendance_books_resp = HTTP_HOOK.run_with_advanced_retry( 101 | endpoint=f"{Variable.get('kktix_events_endpoint')}/{event_id}/attendance_books?only_not_ended_event=true", 102 | _retry_args=RETRY_ARGS, 103 | ).json() 104 | return attendance_books_resp[0]["id"] 105 | 106 | 107 | def _get_attendee_ids(event_id: int, attendance_book_id: int) -> List[int]: 108 | """ 109 | get all attendee ids! 110 | """ 111 | attendee_ids = [] 112 | attendees_resp = HTTP_HOOK.run_with_advanced_retry( 113 | endpoint=f"{Variable.get('kktix_events_endpoint')}/{event_id}/attendance_books/{attendance_book_id}", 114 | _retry_args=RETRY_ARGS, 115 | ).json() 116 | for signin_status_tuple in attendees_resp["signin_status"]: 117 | attendee_ids.append(signin_status_tuple[0]) 118 | return attendee_ids 119 | 120 | 121 | def _get_attendee_infos( 122 | event_id: int, attendee_ids: List[int], timestamp: float 123 | ) -> List: 124 | """ 125 | get attendee infos, e.g. email, phonenumber, name and etc 126 | """ 127 | print( 128 | f"Fetching attendee infos between {timestamp} and {timestamp + SCHEDULE_INTERVAL_SECONDS}" 129 | ) 130 | attendee_infos = [] 131 | for attendee_id in attendee_ids: 132 | attendee_info = HTTP_HOOK.run_with_advanced_retry( 133 | endpoint=f"{Variable.get('kktix_events_endpoint')}/{event_id}/attendees/{attendee_id}", 134 | _retry_args=RETRY_ARGS, 135 | ).json() 136 | if not attendee_info["is_paid"]: 137 | continue 138 | if ( 139 | timestamp 140 | < attendee_info["updated_at"] 141 | < timestamp + SCHEDULE_INTERVAL_SECONDS 142 | ): 143 | attendee_infos.append(attendee_info) 144 | return attendee_infos 145 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/kktix_loader.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import os 4 | from typing import Dict, List 5 | 6 | import pandas as pd 7 | from google.cloud import bigquery 8 | from ods.kktix_ticket_orders.udfs import kktix_bq_dwd_etl 9 | from ods.kktix_ticket_orders.udfs.bigquery_loader import TABLE 10 | 11 | SCHEMA = [ 12 | bigquery.SchemaField("id", "INTEGER", mode="REQUIRED"), 13 | bigquery.SchemaField("name", "STRING", mode="REQUIRED"), 14 | bigquery.SchemaField("attendee_info", "STRING", mode="REQUIRED"), 15 | bigquery.SchemaField("refunded", "BOOLEAN", mode="REQUIRED"), 16 | ] 17 | JOB_CONFIG = bigquery.LoadJobConfig(schema=SCHEMA) 18 | 19 | 20 | def load(event_raw_data_array: List): 21 | """ 22 | load data into bigquery! 
23 | """ 24 | # data quality check 25 | if len(event_raw_data_array) == 0: 26 | print("Nothing to load, skip!") 27 | return 28 | payload = [] 29 | for event_raw_data in event_raw_data_array: 30 | sanitized_event_raw_data = _sanitize_payload(event_raw_data) 31 | payload.append(sanitized_event_raw_data) 32 | _load_to_bigquery(payload) 33 | _load_to_bigquery_dwd(payload) 34 | 35 | 36 | def _load_to_bigquery(payload: List[Dict]) -> None: 37 | """ 38 | load data to BigQuery's `TABLE` 39 | """ 40 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 41 | df = pd.DataFrame( 42 | payload, 43 | columns=["id", "name", "attendee_info"], 44 | ) 45 | # for now, these attendees haven't refunded our ticket, yet... 46 | # we don't know if they would refund down the road 47 | df["refunded"] = [False] * len(payload) 48 | job = client.load_table_from_dataframe(df, TABLE, job_config=JOB_CONFIG) 49 | job.result() 50 | 51 | 52 | def _load_to_bigquery_dwd(payload: List[Dict]) -> None: 53 | """ 54 | load data to BigQuery's `TABLE` 55 | """ 56 | # Spilt payload to dict lists by ticket group 57 | ticket_groups = ["corporate", "individual", "reserved"] 58 | dol = collections.defaultdict(list) 59 | for d in payload: 60 | for tg in ticket_groups: 61 | if tg in d["name"].lower(): 62 | dol[tg].append(d) 63 | 64 | print(dol[tg]) 65 | project_id = os.getenv("BIGQUERY_PROJECT") 66 | dataset_id = "dwd" 67 | for tg in ticket_groups: 68 | if len(dol[tg]) > 0: 69 | _, sanitized_df = kktix_bq_dwd_etl.load_to_df_from_list(dol[tg]) 70 | table_id = f"kktix_ticket_{tg}_attendees" 71 | kktix_bq_dwd_etl.upload_dataframe_to_bigquery( 72 | sanitized_df, project_id, dataset_id, table_id 73 | ) 74 | 75 | 76 | def _sanitize_payload(event_raw_data) -> Dict: 77 | """ 78 | BigQuery has some constraints for nested data type 79 | So we put out sanitization/data cleansing logic here! 80 | """ 81 | event_raw_data["attendee_info"] = json.dumps(event_raw_data["attendee_info"]) 82 | return event_raw_data 83 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/kktix_refund.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | from typing import List 4 | 5 | from google.cloud import bigquery 6 | from ods.kktix_ticket_orders.udfs.bigquery_loader import TABLE 7 | from ods.kktix_ticket_orders.udfs.kktix_api import ( 8 | _get_attendance_book_id, 9 | _get_attendee_ids, 10 | ) 11 | 12 | CLIENT = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 13 | 14 | 15 | def main() -> None: 16 | refunded_attendee_ids = _check_if_refunded_ticket_exists() 17 | if refunded_attendee_ids: 18 | _mark_tickets_as_refunded(refunded_attendee_ids) 19 | 20 | 21 | def _check_if_refunded_ticket_exists() -> List[int]: 22 | """ 23 | 1. iterate through all unrefunded tickets 24 | 2. build up a hash map 25 | 3. get the latest attendance book 26 | 4. 
compare the difference, the diff would be refunded attendee ids 27 | """ 28 | refunded_attendee_ids: List[int] = [] 29 | query_job = CLIENT.query( 30 | f""" 31 | SELECT 32 | ID, 33 | CAST(REPLACE(JSON_EXTRACT(ATTENDEE_INFO, 34 | '$.id'), '"', '') AS INT64) AS ATTENDEE_ID 35 | FROM 36 | `{TABLE}` 37 | WHERE 38 | REFUNDED IS NULL OR REFUNDED = FALSE 39 | """ # nosec 40 | ) 41 | event_ids_and_attendee_ids = query_job.result() 42 | 43 | bigquery_side_event_attendee_id_dict = defaultdict(list) 44 | for event_id, attendee_id in event_ids_and_attendee_ids: 45 | bigquery_side_event_attendee_id_dict[event_id].append(attendee_id) 46 | for ( 47 | event_id, 48 | outdated_latest_attendee_ids, 49 | ) in bigquery_side_event_attendee_id_dict.items(): 50 | attendance_book_id = _get_attendance_book_id(event_id) 51 | latest_attendee_ids = _get_attendee_ids(event_id, attendance_book_id) 52 | refunded_attendee_ids_in_this_event = set( 53 | outdated_latest_attendee_ids 54 | ).difference(set(latest_attendee_ids)) 55 | refunded_attendee_ids += list(refunded_attendee_ids_in_this_event) 56 | return refunded_attendee_ids 57 | 58 | 59 | def _mark_tickets_as_refunded(refunded_attendee_ids: List[int]) -> None: 60 | """ 61 | set these attendee info to refunded=true, if we cannot find its attendee_info right now by using KKTIX's API! 62 | """ 63 | query_job = CLIENT.query( 64 | f""" 65 | UPDATE 66 | `{TABLE}` 67 | SET 68 | refunded=TRUE 69 | WHERE 70 | CAST(REPLACE(JSON_EXTRACT(ATTENDEE_INFO, 71 | '$.id'), '"', '') AS INT64) in ({",".join(str(i) for i in refunded_attendee_ids)}) 72 | """ 73 | ) 74 | result = query_job.result() 75 | print(f"Result of _mark_tickets_as_refunded: {result}") 76 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/kktix_transformer.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from typing import Dict, List 3 | 4 | SENSITIVE_KEY_NAME_SET = { 5 | "聯絡人 姓名", 6 | "聯絡人 Email", 7 | "聯絡人 手機", 8 | "Address", 9 | } 10 | 11 | 12 | def transform(event_raw_data_array: List) -> List[Dict]: 13 | """ 14 | de-identify user's email in this block! 15 | """ 16 | for event in event_raw_data_array: 17 | attendee_info = event["attendee_info"] 18 | # search string contains personal information and it's unstructured. Therefore just drop it! 
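        # NOTE (illustrative sketch): after the drop below, every value whose
        # key matches SENSITIVE_KEY_NAME_SET is replaced in place with its
        # SHA-256 hex digest, essentially:
        #     hashlib.sha256(value.encode("utf-8")).hexdigest()
        # which yields a stable 64-character string, so downstream joins on
        # e.g. hashed emails keep working without exposing the raw value.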
19 | del attendee_info["search_string"] 20 | for index, (key, value) in enumerate(attendee_info["data"]): 21 | for key_should_be_hashed in SENSITIVE_KEY_NAME_SET: 22 | if key_should_be_hashed in key: 23 | hashed_value = hashlib.sha256(value.encode("utf-8")).hexdigest() 24 | attendee_info["data"][index][1] = hashed_value 25 | else: 26 | continue 27 | return event_raw_data_array 28 | 29 | 30 | def _extract_sensitive_unhashed_raw_data(event_raw_data_array: List) -> List[Dict]: 31 | """ 32 | only keep these data in xcom and pass them to next Airflow task 33 | """ 34 | sensitive_unhashed_raw_data_array = [] 35 | for event in event_raw_data_array: 36 | attendee_info = event["attendee_info"] 37 | payload = {} 38 | for key, value in attendee_info["data"]: 39 | if key in SENSITIVE_KEY_NAME_SET: 40 | payload[key] = value 41 | sensitive_unhashed_raw_data_array.append(payload) 42 | return sensitive_unhashed_raw_data_array 43 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/klaviyo_loader.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, List 2 | 3 | from airflow.models import Variable 4 | from ods.kktix_ticket_orders.udfs import klaviyo_mailer 5 | 6 | 7 | def _load_raw_data(event_raw_data_array: List) -> Iterable: 8 | for event in event_raw_data_array: 9 | attendee_info = event["attendee_info"] 10 | # search string contains personal information and it's unstructured. Therefore just drop it! 11 | del attendee_info["search_string"] 12 | tmp = { 13 | key: value 14 | for index, (key, value) in enumerate(attendee_info["data"]) 15 | if key in ("聯絡人 Email", "聯絡人 姓名") 16 | } 17 | tmp.update({"qrcode": attendee_info["qrcode"]}) 18 | yield tmp 19 | 20 | 21 | def load(event_raw_data_array: List) -> None: 22 | """ 23 | Send a notify mail for all participants via third-party service 24 | """ 25 | try: 26 | list_id = Variable.get("KLAVIYO_LIST_ID") 27 | campaign_id = Variable.get("KLAVIYO_CAMPAIGN_ID") 28 | except KeyError: 29 | print( 30 | "Skip klaviyo mailer, 'KLAVIYO_LIST_ID' or 'KLAVIYO_CAMPAIGN_ID' variable not found" 31 | ) 32 | return 33 | 34 | datas = [ 35 | { 36 | "email": item["聯絡人 Email"], 37 | "name": item["聯絡人 姓名"], 38 | "qrcode": item["qrcode"], 39 | } 40 | for item in _load_raw_data(event_raw_data_array) 41 | ] 42 | if not datas: 43 | print("Skip klaviyo mailer, no user profiles") 44 | return 45 | 46 | klaviyo_mailer.main( 47 | list_id=list_id, 48 | campaign_id=campaign_id, 49 | campaign_name="隨買即用", 50 | datas=datas, 51 | ) 52 | -------------------------------------------------------------------------------- /dags/ods/kktix_ticket_orders/udfs/klaviyo_mailer.py: -------------------------------------------------------------------------------- 1 | """Send a mail via Klaviyo 2 | 3 | Requirements: 4 | 5 | 1. Create a [Klaviyo List](https://www.klaviyo.com/lists) 6 | 7 | 2. 
Create a template [campaign](https://www.klaviyo.com/campaigns) and set the previous List as target recipients list 8 | 9 | """ 10 | 11 | from datetime import datetime 12 | from typing import List 13 | 14 | import requests 15 | import tenacity 16 | from airflow.hooks.http_hook import HttpHook 17 | from airflow.models import Variable 18 | 19 | SCHEDULE_INTERVAL_SECONDS: int = 300 20 | RETRY_ARGS = dict( 21 | wait=tenacity.wait_none(), 22 | stop=tenacity.stop_after_attempt(3), 23 | retry=tenacity.retry_if_exception_type(requests.exceptions.ConnectionError), 24 | ) 25 | 26 | 27 | def main( 28 | list_id: str, 29 | campaign_id: str, 30 | campaign_name: str, 31 | datas: List[dict], 32 | ): 33 | """ 34 | Args: 35 | list_id (str): Klaviyo list id, that will be save your target recipients 36 | campaign_id (str): A existed campaign you want to copy from 37 | campaign_name (str): A new campaign name 38 | datas (List[dict]): Recipient profile, example like below 39 | 40 | [ 41 | { 42 | "email": "foo@example.com", 43 | "name": "Foo", 44 | "property1": "value1", 45 | "property2": "value2", 46 | }, 47 | { 48 | "email": "bar@example.com", 49 | "name": "Bar", 50 | "property1": "value1", 51 | "property2": "value2", 52 | }, 53 | ] 54 | """ 55 | # check list and compaign existed 56 | assert _klaviyo_get_list_info(list_id) 57 | assert _klaviyo_get_campaign_info(campaign_id) 58 | 59 | # update list members 60 | existed_members = _klaviyo_get_list_members(list_id)["records"] 61 | if existed_members: 62 | _klaviyo_remove_list_members( 63 | list_id, body={"emails": list(map(lambda x: x["email"], existed_members))} 64 | ) 65 | 66 | _klaviyo_add_list_members(list_id, body={"profiles": datas}) 67 | new_members = _klaviyo_get_list_members(list_id)["records"] 68 | assert new_members 69 | 70 | # create a new compaign and send mail immediately 71 | campaign_suffix = f"{datetime.now():%Y-%m-%d_%H:%M:%S}" 72 | response = _klaviyo_clone_campaign( 73 | campaign_id, 74 | name=f"{campaign_name}_{campaign_suffix}", 75 | list_id=list_id, 76 | ) 77 | new_campaign_id = response["id"] 78 | _klaviyo_send_campaign(new_campaign_id) 79 | print(f"Send {len(new_members)} Mails") 80 | 81 | 82 | def _klaviyo_get_list_info(list_id: str) -> dict: 83 | HTTP_HOOK = HttpHook(http_conn_id="klaviyo_api", method="GET") 84 | API_KEY = Variable.get("KLAVIYO_KEY") 85 | return HTTP_HOOK.run_with_advanced_retry( 86 | endpoint=f"/v2/list/{list_id}?api_key={API_KEY}", 87 | _retry_args=RETRY_ARGS, 88 | headers={"Accept": "application/json"}, 89 | ).json() 90 | 91 | 92 | def _klaviyo_get_list_members(list_id: str) -> dict: 93 | HTTP_HOOK = HttpHook(http_conn_id="klaviyo_api", method="GET") 94 | API_KEY = Variable.get("KLAVIYO_KEY") 95 | return HTTP_HOOK.run_with_advanced_retry( 96 | endpoint=f"/v2/group/{list_id}/members/all?api_key={API_KEY}", 97 | _retry_args=RETRY_ARGS, 98 | headers={"Accept": "application/json"}, 99 | ).json() 100 | 101 | 102 | def _klaviyo_remove_list_members(list_id: str, body: dict) -> dict: 103 | HTTP_HOOK = HttpHook(http_conn_id="klaviyo_api", method="DELETE") 104 | API_KEY = Variable.get("KLAVIYO_KEY") 105 | return HTTP_HOOK.run_with_advanced_retry( 106 | endpoint=f"/v2/list/{list_id}/members?api_key={API_KEY}", 107 | _retry_args=RETRY_ARGS, 108 | json=body, 109 | headers={"Content-Type": "application/json"}, 110 | ) 111 | 112 | 113 | def _klaviyo_add_list_members(list_id: str, body: dict) -> dict: 114 | HTTP_HOOK = HttpHook(http_conn_id="klaviyo_api", method="POST") 115 | API_KEY = Variable.get("KLAVIYO_KEY") 116 | return 
HTTP_HOOK.run_with_advanced_retry( 117 | endpoint=f"/v2/list/{list_id}/members?api_key={API_KEY}", 118 | _retry_args=RETRY_ARGS, 119 | json=body, 120 | headers={"Accept": "application/json", "Content-Type": "application/json"}, 121 | ).json() 122 | 123 | 124 | def _klaviyo_get_campaign_info(campaign_id: str) -> dict: 125 | HTTP_HOOK = HttpHook(http_conn_id="klaviyo_api", method="GET") 126 | API_KEY = Variable.get("KLAVIYO_KEY") 127 | return HTTP_HOOK.run_with_advanced_retry( 128 | endpoint=f"/v1/campaign/{campaign_id}?api_key={API_KEY}", 129 | _retry_args=RETRY_ARGS, 130 | headers={"Accept": "application/json"}, 131 | ).json() 132 | 133 | 134 | def _klaviyo_clone_campaign(campaign_id: str, name: str, list_id: str) -> dict: 135 | HTTP_HOOK = HttpHook(http_conn_id="klaviyo_api", method="POST") 136 | API_KEY = Variable.get("KLAVIYO_KEY") 137 | return HTTP_HOOK.run_with_advanced_retry( 138 | endpoint=f"/v1/campaign/{campaign_id}/clone?api_key={API_KEY}", 139 | _retry_args=RETRY_ARGS, 140 | data={"name": name, "list_id": list_id}, 141 | headers={ 142 | "Accept": "application/json", 143 | "Content-Type": "application/x-www-form-urlencoded", 144 | }, 145 | ).json() 146 | 147 | 148 | def _klaviyo_send_campaign(campaign_id: str) -> dict: 149 | HTTP_HOOK = HttpHook(http_conn_id="klaviyo_api", method="POST") 150 | API_KEY = Variable.get("KLAVIYO_KEY") 151 | return HTTP_HOOK.run_with_advanced_retry( 152 | endpoint=f"/v1/campaign/{campaign_id}/send?api_key={API_KEY}", 153 | _retry_args=RETRY_ARGS, 154 | headers={"Accept": "application/json"}, 155 | ).json() 156 | -------------------------------------------------------------------------------- /dags/ods/linkedin_post_insights/dags.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.python_operator import PythonOperator 5 | from ods.linkedin_post_insights import udfs 6 | 7 | DEFAULT_ARGS = { 8 | "owner": "Angus Yang", 9 | "depends_on_past": False, 10 | "start_date": datetime(2023, 6, 14, 0), 11 | "retries": 2, 12 | "retry_delay": timedelta(minutes=5), 13 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 14 | } 15 | dag = DAG( 16 | "LINKEDIN_POST_INSIGHTS_V2", 17 | default_args=DEFAULT_ARGS, 18 | schedule_interval="5 8 */2 * *", 19 | max_active_runs=1, 20 | catchup=False, 21 | ) 22 | with dag: 23 | CREATE_TABLE_IF_NEEDED = PythonOperator( 24 | task_id="CREATE_TABLE_IF_NEEDED", 25 | python_callable=udfs.create_table_if_needed, 26 | ) 27 | 28 | SAVE_TWITTER_POSTS_AND_INSIGHTS = PythonOperator( 29 | task_id="SAVE_LINKEDIN_POSTS_AND_INSIGHTS", 30 | python_callable=udfs.save_posts_and_insights, 31 | ) 32 | 33 | CREATE_TABLE_IF_NEEDED >> SAVE_TWITTER_POSTS_AND_INSIGHTS 34 | 35 | 36 | if __name__ == "__main__": 37 | dag.cli() 38 | -------------------------------------------------------------------------------- /dags/ods/linkedin_post_insights/udfs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from datetime import datetime 4 | from typing import List, Optional 5 | 6 | import requests 7 | from airflow.models import Variable 8 | from google.cloud import bigquery 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def create_table_if_needed() -> None: 14 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 15 | post_sql = """ 16 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_linkedin_posts` ( 17 
| id STRING, 18 | created_at TIMESTAMP, 19 | message STRING 20 | ) 21 | """ 22 | client.query(post_sql) 23 | insights_sql = """ 24 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_linkedin_posts_insights` ( 25 | post_id STRING, 26 | query_time TIMESTAMP, 27 | period STRING, 28 | favorite INTEGER, 29 | reply INTEGER, 30 | retweet INTEGER, 31 | views INTEGER 32 | ) 33 | """ 34 | client.query(insights_sql) 35 | 36 | # Example output from the Rapid API, not all fields will exists for a specific post 37 | # 38 | # { 39 | # "text": "For your kids in senior high.", 40 | # "totalReactionCount": 6, 41 | # "likeCount": 6, 42 | # "repostsCount": 1, 43 | # "empathyCount": 1, 44 | # "commentsCount": 20, 45 | # repostsCount:1, 46 | # "postUrl": "https://www.linkedin.com/feed/update/urn:li:activity:6940542340960763905/", 47 | # "postedAt": "1yr", 48 | # "postedDate": "2022-06-09 05:57:23.126 +0000 UTC", 49 | # "postedDateTimestamp": 1654754243126, 50 | # "urn": "6940542340960763905", 51 | # "author": { 52 | # "firstName": "Angus", 53 | # "lastName": "Yang", 54 | # "username": "angus-yang-8885279a", 55 | # "url": "https://www.linkedin.com/in/angus-yang-8885279a" 56 | # }, 57 | # "company": {}, 58 | # "article": { 59 | # "title": "2022 AWS STEM Summer Camp On The Cloud", 60 | # "subtitle": "pages.awscloud.com • 2 min read", 61 | # "link": "https://pages.awscloud.com/tw-2022-aws-stem-summer-camp-on-the-cloud_registration.html" 62 | # } 63 | # }, 64 | 65 | 66 | def save_posts_and_insights() -> None: 67 | posts = request_posts_data() 68 | 69 | last_post = query_last_post() 70 | new_posts = ( 71 | [ 72 | post 73 | for post in posts 74 | if post["postedDateTimestamp"] > last_post["created_at"].timestamp() 75 | ] 76 | if last_post 77 | else posts 78 | ) 79 | 80 | if not dump_posts_to_bigquery( 81 | [ 82 | { 83 | "id": post["urn"], 84 | "created_at": post["postedDateTimestamp"], 85 | "message": post["text"], 86 | } 87 | for post in new_posts 88 | ] 89 | ): 90 | raise RuntimeError("Failed to dump posts to BigQuery") 91 | 92 | if not dump_posts_insights_to_bigquery( 93 | [ 94 | { 95 | "post_id": post["urn"], 96 | "query_time": datetime.now().timestamp(), 97 | "period": "lifetime", 98 | "favorite": post["likeCount"], 99 | "reply": post["commentsCount"], 100 | "retweet": post["repostsCount"], 101 | "views": "0", # not support by RapidAPI 102 | } 103 | for post in posts 104 | ] 105 | ): 106 | raise RuntimeError("Failed to dump posts insights to BigQuery") 107 | 108 | 109 | def query_last_post() -> Optional[dict]: 110 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 111 | sql = """ 112 | SELECT 113 | created_at 114 | FROM 115 | `pycontw-225217.ods.ods_pycontw_linkedin_posts` 116 | ORDER BY 117 | created_at DESC 118 | LIMIT 1 119 | """ 120 | result = client.query(sql) 121 | data = list(result) 122 | return data[0] if data else None 123 | 124 | 125 | def request_posts_data() -> List[dict]: 126 | # Define the request options 127 | # url = 'https://linkedin-data-api.p.rapidapi.com/get-profile-posts' # for user 128 | url = "https://linkedin-data-api.p.rapidapi.com/get-company-posts" 129 | querystring = {"username": "pycontw"} 130 | headers = { 131 | "X-RapidAPI-Key": Variable.get("LINKEDIN_RAPIDAPI_KEY"), 132 | "X-RapidAPI-Host": "linkedin-data-api.p.rapidapi.com", 133 | } 134 | 135 | response = requests.get(url, headers=headers, params=querystring, timeout=180) 136 | if not response.ok: 137 | raise RuntimeError(f"Failed to fetch posts data: {response.text}") 138 | 139 | media_insight_list = [] 
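    # NOTE (illustrative sketch): the loop below flattens each RapidAPI post
    # into the handful of fields we keep, using .get() defaults because not
    # every post carries every field. Roughly, assuming a response shaped
    # like {"data": [...]}:
    #     raw = {"urn": "6940542340960763905", "postedDateTimestamp": 1654754243126}
    #     insight = {
    #         "urn": raw.get("urn", "0"),
    #         "postedDateTimestamp": raw.get("postedDateTimestamp", 0) / 1000,
    #         "text": raw.get("text", "No Content"),
    #     }
    # An integer fallback (0) is safer than the string "0" used below, since
    # a missing timestamp would otherwise make the division raise TypeError.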
140 | media_res_list = response.json()["data"] 141 | # format handling, the response may not include the required fields 142 | for media_res in media_res_list: 143 | media_insight = {} 144 | media_insight["urn"] = media_res.get("urn", "0") 145 | media_insight["postedDateTimestamp"] = ( 146 | media_res.get("postedDateTimestamp", "0") / 1000 147 | ) 148 | media_insight["text"] = media_res.get("text", "No Content") 149 | media_insight["likeCount"] = media_res.get("totalReactionCount", "0") 150 | media_insight["commentsCount"] = media_res.get("commentsCount", "0") 151 | media_insight["repostsCount"] = media_res.get("repostsCount", "0") 152 | # logger.info(media_insight) 153 | media_insight_list.append(media_insight) 154 | 155 | return media_insight_list 156 | 157 | 158 | def dump_posts_to_bigquery(posts: List[dict]) -> bool: 159 | if not posts: 160 | logger.info("No posts to dump!") 161 | return True 162 | 163 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 164 | job_config = bigquery.LoadJobConfig( 165 | schema=[ 166 | bigquery.SchemaField("id", "STRING", mode="REQUIRED"), 167 | bigquery.SchemaField("created_at", "TIMESTAMP", mode="REQUIRED"), 168 | bigquery.SchemaField("message", "STRING", mode="REQUIRED"), 169 | ], 170 | write_disposition="WRITE_APPEND", 171 | ) 172 | try: 173 | job = client.load_table_from_json( 174 | posts, 175 | "pycontw-225217.ods.ods_pycontw_linkedin_posts", 176 | job_config=job_config, 177 | ) 178 | job.result() 179 | return True 180 | except Exception as e: 181 | logger.error(f"Failed to dump posts to BigQuery: {e}", exc_info=True) 182 | return False 183 | 184 | 185 | def dump_posts_insights_to_bigquery(posts: List[dict]) -> bool: 186 | if not posts: 187 | logger.info("No post insights to dump!") 188 | return True 189 | 190 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 191 | job_config = bigquery.LoadJobConfig( 192 | schema=[ 193 | bigquery.SchemaField("post_id", "STRING", mode="REQUIRED"), 194 | bigquery.SchemaField("query_time", "TIMESTAMP", mode="REQUIRED"), 195 | bigquery.SchemaField("period", "STRING", mode="REQUIRED"), 196 | bigquery.SchemaField("favorite", "INTEGER", mode="NULLABLE"), 197 | bigquery.SchemaField("reply", "INTEGER", mode="NULLABLE"), 198 | bigquery.SchemaField("retweet", "INTEGER", mode="NULLABLE"), 199 | bigquery.SchemaField("views", "INTEGER", mode="NULLABLE"), 200 | ], 201 | write_disposition="WRITE_APPEND", 202 | ) 203 | try: 204 | job = client.load_table_from_json( 205 | posts, 206 | "pycontw-225217.ods.ods_pycontw_linkedin_posts_insights", 207 | job_config=job_config, 208 | ) 209 | job.result() 210 | return True 211 | except Exception as e: 212 | logger.error(f"Failed to dump posts insights to BigQuery: {e}", exc_info=True) 213 | return False 214 | 215 | 216 | def test_main(): 217 | create_table_if_needed() 218 | 219 | # request_posts_data() 220 | 221 | save_posts_and_insights() 222 | 223 | 224 | if __name__ == "__main__": 225 | test_main() 226 | -------------------------------------------------------------------------------- /dags/ods/opening_crawler/dags/cakeresume_crawler.py: -------------------------------------------------------------------------------- 1 | """ 2 | A crawler which would crawl the openings 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from ods.opening_crawler.udfs.crawlers import CakeResumeCrawler 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "davidtnfsh", 13 | 
"depends_on_past": False, 14 | "start_date": datetime(2020, 8, 30), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Telegrame", 18 | } 19 | dag = DAG( 20 | "OPENING_CRAWLER_V1", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="@daily", 23 | max_active_runs=1, 24 | catchup=False, 25 | ) 26 | with dag: 27 | CRAWLER = PythonOperator( 28 | task_id="CRAWLER", 29 | python_callable=CakeResumeCrawler.crawl, 30 | provide_context=True, 31 | op_kwargs={}, 32 | ) 33 | 34 | if __name__ == "__main__": 35 | dag.cli() 36 | -------------------------------------------------------------------------------- /dags/ods/opening_crawler/udfs/crawlers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Crawler of openings 3 | """ 4 | 5 | from abc import ABC, abstractclassmethod 6 | 7 | 8 | class BaseCrawler(ABC): 9 | """ 10 | Abstract cralwer 11 | """ 12 | 13 | @classmethod 14 | @abstractclassmethod 15 | def crawl(cls, **conf): 16 | pass 17 | 18 | 19 | class CakeResumeCrawler(BaseCrawler): 20 | """ 21 | Crawler of cakeresume 22 | """ 23 | 24 | @classmethod 25 | def crawl(cls, **conf): 26 | print("i'm a CakeResume crawler!") 27 | return "i'm a CakeResume crawler!" 28 | -------------------------------------------------------------------------------- /dags/ods/survey_cake/dags/questionnaire_2_bigquery.py: -------------------------------------------------------------------------------- 1 | """ 2 | A crawler which would crawl the openings 3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | import os 8 | from datetime import datetime, timedelta 9 | from pathlib import Path 10 | 11 | from airflow import DAG 12 | from airflow.operators.python_operator import PythonOperator 13 | from ods.survey_cake.udfs.survey_cake_csv_uploader import SurveyCakeCSVUploader 14 | 15 | AIRFLOW_HOME = os.getenv("AIRFLOW_HOME") 16 | 17 | DEFAULT_ARGS = { 18 | "owner": "davidtnfsh", 19 | "depends_on_past": False, 20 | "start_date": datetime(2020, 9, 30), 21 | "retries": 2, 22 | "retry_delay": timedelta(minutes=5), 23 | "on_failure_callback": lambda x: "Need to send notification to Telegrame", 24 | } 25 | dag = DAG( 26 | "QUESTIONNAIRE_2_BIGQUERY", 27 | default_args=DEFAULT_ARGS, 28 | schedule_interval=None, 29 | max_active_runs=1, 30 | catchup=False, 31 | ) 32 | with dag: 33 | if bool(os.getenv("AIRFLOW_TEST_MODE")): 34 | filepath = Path(AIRFLOW_HOME) / "dags/fixtures/data_questionnaire.csv" 35 | FILENAMES: dict[str, dict] = {str(filepath): {}} 36 | else: 37 | FILENAMES = { 38 | "data_questionnaire.csv": { 39 | "data_domain": "questionnaire", 40 | "primary_key": "ip", 41 | "time_dimension": "datetime", 42 | }, 43 | "data_sponsor_questionnaire.csv": { 44 | "data_domain": "sponsorQuestionnaire", 45 | "primary_key": "ip", 46 | "time_dimension": "datetime", 47 | }, 48 | } 49 | for filename, metadata in FILENAMES.items(): 50 | FILENAME_STEM = Path(filename).stem 51 | SURVEY_CAKE_CSV_UPLOADER = SurveyCakeCSVUploader(filename=filename) 52 | TRANSFORM = PythonOperator( 53 | task_id=f"TRANSFORM_{FILENAME_STEM}", 54 | python_callable=SURVEY_CAKE_CSV_UPLOADER.transform, 55 | provide_context=True, 56 | ) 57 | 58 | if not bool(os.getenv("AIRFLOW_TEST_MODE")): 59 | UPLOAD_FACTTABLE = PythonOperator( 60 | task_id=f"UPLOAD_FACTTABLE_{FILENAME_STEM}", 61 | python_callable=SURVEY_CAKE_CSV_UPLOADER.upload, 62 | op_kwargs={ 63 | "facttable_or_dimension_table": "fact", 64 | "data_layer": "ods", 65 | "data_domain": 
metadata["data_domain"], 66 | "primary_key": metadata["primary_key"], 67 | "time_dimension": metadata["time_dimension"], 68 | }, 69 | ) 70 | UPLOAD_DIMENSION_TABLE = PythonOperator( 71 | task_id=f"UPLOAD_DIMENSION_TABLE_{FILENAME_STEM}", 72 | python_callable=SURVEY_CAKE_CSV_UPLOADER.upload, 73 | op_kwargs={ 74 | "facttable_or_dimension_table": "dim", 75 | "data_layer": "dim", 76 | "data_domain": metadata["data_domain"], 77 | "primary_key": "questionId", 78 | "time_dimension": "year", 79 | }, 80 | ) 81 | TRANSFORM >> UPLOAD_FACTTABLE 82 | TRANSFORM >> UPLOAD_DIMENSION_TABLE 83 | 84 | 85 | if __name__ == "__main__": 86 | dag.cli() 87 | -------------------------------------------------------------------------------- /dags/ods/survey_cake/udfs/survey_cake_csv_uploader.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from pathlib import Path 4 | 5 | from google.cloud import bigquery 6 | 7 | AIRFLOW_HOME = os.getenv("AIRFLOW_HOME") 8 | 9 | 10 | class SurveyCakeCSVUploader: 11 | def __init__(self, filename): 12 | self.filename = Path(filename) 13 | self.year = None 14 | if not bool(os.getenv("AIRFLOW_TEST_MODE")): 15 | self.client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 16 | 17 | self.facttable_filepath = ( 18 | self.filename.parent / f"{self.filename.stem}_facttable.csv" 19 | ) 20 | self.dimension_table_filepath = ( 21 | self.filename.parent / f"{self.filename.stem}_dimension.csv" 22 | ) 23 | 24 | @property 25 | def bigquery_project(self): 26 | return os.getenv("BIGQUERY_PROJECT") 27 | 28 | def transform(self, **context): 29 | self.year = context["execution_date"].year 30 | self._transform() 31 | 32 | def _transform(self): 33 | def _export_facttable(header_of_fact_table): 34 | with open(self.facttable_filepath, "w") as target: 35 | writer = csv.writer(target) 36 | writer.writerow(header_of_fact_table) 37 | for row in rows_of_fact_table: 38 | writer.writerow(row) 39 | 40 | def _export_dimension_table(question_id_dienstion_table): 41 | with open(self.dimension_table_filepath, "w") as target: 42 | writer = csv.writer(target) 43 | writer.writerow(("question_id", "question", "year")) 44 | for question_id, question in question_id_dienstion_table.items(): 45 | writer.writerow((question_id, question, self.year)) 46 | 47 | filepath = Path(AIRFLOW_HOME) / "dags" / self.filename 48 | with open(filepath, encoding="utf-8-sig") as csvfile: 49 | rows = csv.reader(csvfile) 50 | # skip header 51 | header = next(iter(rows)) 52 | question_id_dienstion_table = self._generate_question_id_dimension_table( 53 | header 54 | ) 55 | question_ids = sorted(question_id_dienstion_table.keys()) 56 | header_of_fact_table = ("ip", "question_id", "answer") 57 | rows_of_fact_table = self._transform_raw_data_to_fact_table_format( 58 | rows, question_id_dienstion_table, question_ids 59 | ) 60 | 61 | _export_facttable(header_of_fact_table) 62 | _export_dimension_table(question_id_dienstion_table) 63 | 64 | def upload( 65 | self, 66 | facttable_or_dimension_table, 67 | data_layer, 68 | data_domain, 69 | primary_key, 70 | time_dimension, 71 | ): 72 | if facttable_or_dimension_table == "fact": 73 | print(self.facttable_filepath) 74 | print(self.facttable_filepath) 75 | print(self.facttable_filepath) 76 | print(self.facttable_filepath) 77 | self._upload_2_bigquery( 78 | self.facttable_filepath, 79 | f"{self.bigquery_project}.{data_layer}.{data_layer}_{data_domain}_{primary_key}_{time_dimension}", 80 | ) 81 | elif facttable_or_dimension_table == "dim": 
82 | self._upload_2_bigquery( 83 | self.dimension_table_filepath, 84 | f"{self.bigquery_project}.{data_layer}.{data_layer}_{data_domain}_{primary_key}_{time_dimension}", 85 | ) 86 | 87 | def _upload_2_bigquery(self, file_path, table_id): 88 | job_config = bigquery.LoadJobConfig( 89 | source_format=bigquery.SourceFormat.CSV, 90 | skip_leading_rows=1, 91 | autodetect=True, 92 | allow_quoted_newlines=True, 93 | write_disposition="WRITE_TRUNCATE", 94 | ) 95 | with open(file_path, "rb") as source_file: 96 | job = self.client.load_table_from_file( 97 | source_file, table_id, job_config=job_config 98 | ) 99 | 100 | job.result() # Waits for the job to complete. 101 | 102 | table = self.client.get_table(table_id) # Make an API request. 103 | print( 104 | f"Loaded {table.num_rows} rows and {len(table.schema)} columns to {table_id}" 105 | ) 106 | 107 | def _generate_question_id_dimension_table(self, header): 108 | question_id_dim_table = {} 109 | for index, column in enumerate(header): 110 | column = column.strip() 111 | question_id_dim_table[ 112 | index if column != "其他" else self._get_index_of_else_column(index) 113 | ] = column 114 | return question_id_dim_table 115 | 116 | @staticmethod 117 | def _get_index_of_else_column(index): 118 | """ 119 | use 0.1 to represent "其他" 120 | """ 121 | return index - 1 + 0.1 122 | 123 | @staticmethod 124 | def _transform_raw_data_to_fact_table_format( 125 | rows, question_id_dienstion_table, question_ids 126 | ): 127 | result = [] 128 | for row in rows: 129 | row_dict = dict(zip(question_ids, row)) 130 | question_id_of_primary_key = [ 131 | key 132 | for key, value in question_id_dienstion_table.items() 133 | if value == "IP紀錄" 134 | ][0] 135 | primary_key = row_dict[question_id_of_primary_key] 136 | for question_id, answer in row_dict.items(): 137 | result.append((primary_key, question_id, answer)) 138 | return result 139 | -------------------------------------------------------------------------------- /dags/ods/twitter_post_insights/dags.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.python_operator import PythonOperator 5 | from ods.twitter_post_insights import udfs 6 | 7 | DEFAULT_ARGS = { 8 | "owner": "Henry Lee", 9 | "depends_on_past": False, 10 | "start_date": datetime(2023, 6, 14, 0), 11 | "retries": 2, 12 | "retry_delay": timedelta(minutes=5), 13 | "on_failure_callback": lambda x: "Need to send notification to Discord!", 14 | } 15 | dag = DAG( 16 | "TWITTER_POST_INSIGHTS_V1", 17 | default_args=DEFAULT_ARGS, 18 | schedule_interval="5 8 * * *", 19 | max_active_runs=1, 20 | catchup=False, 21 | ) 22 | with dag: 23 | CREATE_TABLE_IF_NEEDED = PythonOperator( 24 | task_id="CREATE_TABLE_IF_NEEDED", 25 | python_callable=udfs.create_table_if_needed, 26 | ) 27 | 28 | SAVE_TWITTER_POSTS_AND_INSIGHTS = PythonOperator( 29 | task_id="SAVE_TWITTER_POSTS_AND_INSIGHTS", 30 | python_callable=udfs.save_twitter_posts_and_insights, 31 | ) 32 | 33 | CREATE_TABLE_IF_NEEDED >> SAVE_TWITTER_POSTS_AND_INSIGHTS 34 | 35 | 36 | if __name__ == "__main__": 37 | dag.cli() 38 | -------------------------------------------------------------------------------- /dags/ods/twitter_post_insights/udfs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from datetime import datetime 4 | from typing import List, Optional 5 | 6 | import requests 7 | from airflow.models import Variable 8 
| from google.cloud import bigquery 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def create_table_if_needed() -> None: 14 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 15 | post_sql = """ 16 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_twitter_posts` ( 17 | id STRING, 18 | created_at TIMESTAMP, 19 | message STRING 20 | ) 21 | """ 22 | client.query(post_sql) 23 | insights_sql = """ 24 | CREATE TABLE IF NOT EXISTS `pycontw-225217.ods.ods_pycontw_twitter_posts_insights` ( 25 | post_id STRING, 26 | query_time TIMESTAMP, 27 | period STRING, 28 | favorite INTEGER, 29 | reply INTEGER, 30 | retweet INTEGER, 31 | views INTEGER 32 | ) 33 | """ 34 | client.query(insights_sql) 35 | 36 | 37 | def save_twitter_posts_and_insights() -> None: 38 | posts = request_posts_data() 39 | 40 | last_post = query_last_post() 41 | if last_post is None: 42 | new_posts = posts 43 | else: 44 | new_posts = [ 45 | post 46 | for post in posts 47 | if post["timestamp"] > last_post["created_at"].timestamp() 48 | ] 49 | 50 | if not dump_posts_to_bigquery( 51 | [ 52 | { 53 | "id": post["tweet_id"], 54 | "created_at": post["timestamp"], 55 | "message": post["text"], 56 | } 57 | for post in new_posts 58 | ] 59 | ): 60 | raise RuntimeError("Failed to dump posts to BigQuery") 61 | 62 | if not dump_posts_insights_to_bigquery( 63 | [ 64 | { 65 | "post_id": post["tweet_id"], 66 | "query_time": datetime.now().timestamp(), 67 | "period": "lifetime", 68 | "favorite": post["favorite_count"], 69 | "reply": post["reply_count"], 70 | "retweet": post["retweet_count"], 71 | "views": post["views"], 72 | } 73 | for post in posts 74 | ] 75 | ): 76 | raise RuntimeError("Failed to dump posts insights to BigQuery") 77 | 78 | 79 | def query_last_post() -> Optional[dict]: 80 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 81 | sql = """ 82 | SELECT 83 | created_at 84 | FROM 85 | `pycontw-225217.ods.ods_pycontw_twitter_posts` 86 | ORDER BY 87 | created_at DESC 88 | LIMIT 1 89 | """ 90 | result = client.query(sql) 91 | data = list(result) 92 | return data[0] if data else None 93 | 94 | 95 | def request_posts_data() -> List[dict]: 96 | url = "https://twitter154.p.rapidapi.com/user/tweets" 97 | # 499339900 is PyConTW's twitter id 98 | querystring = { 99 | "username": "pycontw", 100 | "user_id": "499339900", 101 | "limit": "40", 102 | "include_replies": "false", 103 | "include_pinned": "false", 104 | } 105 | headers = { 106 | "X-RapidAPI-Key": Variable.get("RAPIDAPIAPI_KEY"), 107 | "X-RapidAPI-Host": "twitter154.p.rapidapi.com", 108 | } 109 | response = requests.get(url, headers=headers, params=querystring) 110 | if response.ok: 111 | return response.json()["results"] 112 | raise RuntimeError(f"Failed to fetch posts data: {response.text}") 113 | 114 | 115 | def dump_posts_to_bigquery(posts: List[dict]) -> bool: 116 | if not posts: 117 | logger.info("No posts to dump!") 118 | return True 119 | 120 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 121 | job_config = bigquery.LoadJobConfig( 122 | schema=[ 123 | bigquery.SchemaField("id", "STRING", mode="REQUIRED"), 124 | bigquery.SchemaField("created_at", "TIMESTAMP", mode="REQUIRED"), 125 | bigquery.SchemaField("message", "STRING", mode="REQUIRED"), 126 | ], 127 | write_disposition="WRITE_APPEND", 128 | ) 129 | try: 130 | job = client.load_table_from_json( 131 | posts, 132 | "pycontw-225217.ods.ods_pycontw_twitter_posts", 133 | job_config=job_config, 134 | ) 135 | job.result() 136 | return True 137 | except Exception as e: 
138 | logger.error(f"Failed to dump posts to BigQuery: {e}", exc_info=True) 139 | return False 140 | 141 | 142 | def dump_posts_insights_to_bigquery(posts: List[dict]) -> bool: 143 | if not posts: 144 | logger.info("No post insights to dump!") 145 | return True 146 | 147 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 148 | job_config = bigquery.LoadJobConfig( 149 | schema=[ 150 | bigquery.SchemaField("post_id", "STRING", mode="REQUIRED"), 151 | bigquery.SchemaField("query_time", "TIMESTAMP", mode="REQUIRED"), 152 | bigquery.SchemaField("period", "STRING", mode="REQUIRED"), 153 | bigquery.SchemaField("favorite", "INTEGER", mode="NULLABLE"), 154 | bigquery.SchemaField("reply", "INTEGER", mode="NULLABLE"), 155 | bigquery.SchemaField("retweet", "INTEGER", mode="NULLABLE"), 156 | bigquery.SchemaField("views", "INTEGER", mode="NULLABLE"), 157 | ], 158 | write_disposition="WRITE_APPEND", 159 | ) 160 | try: 161 | job = client.load_table_from_json( 162 | posts, 163 | "pycontw-225217.ods.ods_pycontw_twitter_posts_insights", 164 | job_config=job_config, 165 | ) 166 | job.result() 167 | return True 168 | except Exception as e: 169 | logger.error(f"Failed to dump posts insights to BigQuery: {e}", exc_info=True) 170 | return False 171 | -------------------------------------------------------------------------------- /dags/ods/youtube/dags/dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Save view, like count these kind of metrics into BigQuery 3 | """ 4 | 5 | from datetime import datetime, timedelta 6 | 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator 9 | from ods.youtube.udfs import youtube_api 10 | 11 | DEFAULT_ARGS = { 12 | "owner": "davidtnfsh", 13 | "depends_on_past": False, 14 | "start_date": datetime(2021, 9, 19), 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": lambda x: "Need to send notification to Telegram", 18 | } 19 | dag = DAG( 20 | "ODS_YOUTUBE_2_BIGQUERY", 21 | default_args=DEFAULT_ARGS, 22 | schedule_interval="@daily", 23 | max_active_runs=1, 24 | catchup=False, 25 | ) 26 | with dag: 27 | CREATE_TABLE_IF_NEEDED = PythonOperator( 28 | task_id="CREATE_TABLE_IF_NEEDED", 29 | python_callable=youtube_api.create_table_if_needed, 30 | ) 31 | 32 | GET_VIDEO_IDS = PythonOperator( 33 | task_id="GET_VIDEO_IDS", 34 | python_callable=youtube_api.get_video_ids, 35 | provide_context=True, 36 | ) 37 | 38 | SAVE_STATISTICS_DATA_2_BQ = PythonOperator( 39 | task_id="SAVE_STATISTICS_DATA_2_BQ", 40 | python_callable=youtube_api.save_video_data_2_bq, 41 | provide_context=True, 42 | op_kwargs={"datatype": "statistics"}, 43 | ) 44 | CREATE_TABLE_IF_NEEDED >> GET_VIDEO_IDS >> SAVE_STATISTICS_DATA_2_BQ 45 | 46 | SAVE_INFO_DATA_2_BQ = PythonOperator( 47 | task_id="SAVE_INFO_DATA_2_BQ", 48 | python_callable=youtube_api.save_video_data_2_bq, 49 | provide_context=True, 50 | op_kwargs={"datatype": "info"}, 51 | ) 52 | GET_VIDEO_IDS >> SAVE_INFO_DATA_2_BQ 53 | 54 | if __name__ == "__main__": 55 | dag.cli() 56 | -------------------------------------------------------------------------------- /dags/ods/youtube/sqls/create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS `{}.ods.ods_youtubeStatistics_videoId_datetime` 2 | ( 3 | created_at TIMESTAMP NOT NULL, 4 | videoId STRING NOT NULL, 5 | title STRING NOT NULL, 6 | viewCount INT64 NOT NULL, 7 | likeCount INT64 NOT NULL, 8 | dislikeCount INT64 
NOT NULL, 9 | favoriteCount INT64 NOT NULL, 10 | commentCount INT64 NOT NULL 11 | ); 12 | 13 | CREATE TABLE IF NOT EXISTS `{}.ods.ods_youtubeInfo_videoId_datetime` 14 | ( 15 | created_at TIMESTAMP NOT NULL, 16 | videoId STRING NOT NULL, 17 | title STRING NOT NULL, 18 | image_url STRING NOT NULL, 19 | subtitle STRING NOT NULL, 20 | time TIMESTAMP NOT NULL, 21 | url STRING NOT NULL 22 | ); 23 | 24 | -------------------------------------------------------------------------------- /dags/ods/youtube/udfs/youtube_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | from airflow import macros 7 | from airflow.hooks.http_hook import HttpHook 8 | from airflow.models import Variable 9 | from google.cloud import bigquery 10 | from utils.hook_related import RETRY_ARGS 11 | 12 | # channel id of YouTube is public to everyone, so it's okay to commit this ID into git 13 | CHANNEL_ID = "UCHLnNgRnfGYDzPCCH8qGbQw" 14 | MAX_RESULTS = 50 15 | PROJECT = os.getenv("BIGQUERY_PROJECT") 16 | AIRFLOW_HOME = os.getenv("AIRFLOW_HOME") 17 | 18 | 19 | def create_table_if_needed(): 20 | client = bigquery.Client(project=os.getenv("BIGQUERY_PROJECT")) 21 | sql_filepath = Path(AIRFLOW_HOME) / "dags/ods/youtube/sqls/create_table.sql" 22 | sql = sql_filepath.read_text().format(PROJECT) 23 | client.query(sql) 24 | 25 | 26 | def get_video_ids(**context) -> None: 27 | video_metadatas = [] 28 | execution_date = context["execution_date"].replace(tzinfo=None) 29 | last_year = execution_date - macros.timedelta(days=365) 30 | last_year_RFC_3339_format = f"{last_year.date()}T00:00:00Z" 31 | http_conn = HttpHook(method="GET", http_conn_id="youtube") 32 | base_url = f"/youtube/v3/search?key={Variable.get('YOUTUBE_KEY')}&channelId={CHANNEL_ID}&part=snippet,id&order=date&maxResults={MAX_RESULTS}&publishedAfter={last_year_RFC_3339_format}" 33 | 34 | response_json = http_conn.run_with_advanced_retry( 35 | endpoint=base_url, 36 | _retry_args=RETRY_ARGS, 37 | headers={"Content-Type": "application/json", "Cache-Control": "no-cache"}, 38 | ).json() 39 | video_metadatas += [ 40 | {"videoId": item["id"]["videoId"], "title": item["snippet"]["title"]} 41 | for item in response_json["items"] 42 | if "videoId" in item["id"] 43 | ] 44 | while response_json.get("nextPageToken"): 45 | next_page_token = response_json["nextPageToken"] 46 | response_json = http_conn.run_with_advanced_retry( 47 | endpoint=f"{base_url}&pageToken={next_page_token}", 48 | _retry_args=RETRY_ARGS, 49 | headers={"Content-Type": "application/json", "Cache-Control": "no-cache"}, 50 | ).json() 51 | video_metadatas += [ 52 | {"videoId": item["id"]["videoId"], "title": item["snippet"]["title"]} 53 | for item in response_json["items"] 54 | if "videoId" in item["id"] 55 | ] 56 | task_instance = context["task_instance"] 57 | task_instance.xcom_push("GET_VIDEO_IDS", video_metadatas) 58 | 59 | 60 | def save_video_data_2_bq(**context): 61 | def _init(): 62 | client = bigquery.Client(project=PROJECT) 63 | http_conn = HttpHook(method="GET", http_conn_id="youtube") 64 | execution_date = context["execution_date"].replace(tzinfo=None) 65 | task_instance = context["task_instance"] 66 | datatype = context["datatype"] 67 | video_metadatas = task_instance.xcom_pull("GET_VIDEO_IDS", key="GET_VIDEO_IDS") 68 | result = [] 69 | return ( 70 | client, 71 | http_conn, 72 | execution_date, 73 | task_instance, 74 | datatype, 75 | video_metadatas, 76 | result, 77 | ) 
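    # NOTE (illustrative sketch): _get_statistics() below assumes each
    # videos.list response item carries a "statistics" object, roughly:
    #     {"items": [{"statistics": {"viewCount": "123", "likeCount": "45",
    #                                "favoriteCount": "0", "commentCount": "6"}}]}
    # The counts arrive as strings, hence the int(...) casts, and the public
    # API no longer returns dislikeCount, which is why a literal 0 is stored
    # for that column.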
78 | 79 | def _get_statistics(): 80 | for video_metadata in video_metadatas: 81 | video_id = video_metadata["videoId"] 82 | title = video_metadata["title"] 83 | response_json = http_conn.run_with_advanced_retry( 84 | endpoint=f"/youtube/v3/videos?id={video_id}&key={Variable.get('YOUTUBE_KEY')}&part=statistics", 85 | _retry_args=RETRY_ARGS, 86 | headers={ 87 | "Content-Type": "application/json", 88 | "Cache-Control": "no-cache", 89 | }, 90 | ).json() 91 | print(response_json["items"][0]["statistics"].keys()) 92 | result.append( 93 | ( 94 | execution_date, 95 | video_id, 96 | title, 97 | int(response_json["items"][0]["statistics"]["viewCount"]), 98 | int(response_json["items"][0]["statistics"]["likeCount"]), 99 | 0, # dislikeCount field is not available in statistics API since 2021! 100 | int(response_json["items"][0]["statistics"]["favoriteCount"]), 101 | int(response_json["items"][0]["statistics"]["commentCount"]), 102 | ) 103 | ) 104 | return result 105 | 106 | def _get_info(): 107 | for video_metadata in video_metadatas: 108 | video_id = video_metadata["videoId"] 109 | title = video_metadata["title"] 110 | response_json = http_conn.run_with_advanced_retry( 111 | endpoint=f"/youtube/v3/videos?id={video_id}&key={Variable.get('YOUTUBE_KEY')}&part=snippet", 112 | _retry_args=RETRY_ARGS, 113 | headers={ 114 | "Content-Type": "application/json", 115 | "Cache-Control": "no-cache", 116 | }, 117 | ).json() 118 | result.append( 119 | ( 120 | execution_date, 121 | video_id, 122 | title, 123 | response_json["items"][0]["snippet"]["thumbnails"]["default"][ 124 | "url" 125 | ], 126 | response_json["items"][0]["description"], 127 | datetime.strptime( 128 | response_json["items"][0]["publishedAt"], "%Y-%m-%dT%H:%M:%SZ" 129 | ), 130 | f"https://www.youtube.com/watch?v={response_json['items'][0]['id']}", 131 | ) 132 | ) 133 | return result 134 | 135 | def _transform_to_pandas_dataframe(result): 136 | df = pd.DataFrame( 137 | result, 138 | columns=[ 139 | "created_at", 140 | "videoId", 141 | "title", 142 | "viewCount", 143 | "likeCount", 144 | "dislikeCount", 145 | "favoriteCount", 146 | "commentCount", 147 | ], 148 | ) 149 | return df 150 | 151 | def _insert_to_bq(df, tablename): 152 | TABLE = f"PROJECT.{tablename}" 153 | job = client.load_table_from_dataframe(df, TABLE) 154 | job.result() 155 | 156 | ( 157 | client, 158 | http_conn, 159 | execution_date, 160 | task_instance, 161 | datatype, 162 | video_metadatas, 163 | result, 164 | ) = _init() 165 | 166 | if datatype == "statistics": 167 | tablename = "ods.ods_youtubeStatistics_videoId_datetime" 168 | result = _get_statistics() 169 | elif datatype == "info": 170 | tablename = "ods.ods_youtubeInfo_videoId_datetime" 171 | result = _get_info() 172 | else: 173 | raise RuntimeError(f"Unsupported datatype: {datatype}") 174 | 175 | df = _transform_to_pandas_dataframe(result) 176 | _insert_to_bq(df, tablename) 177 | -------------------------------------------------------------------------------- /dags/utils/hook_related.py: -------------------------------------------------------------------------------- 1 | import tenacity 2 | 3 | RETRY_ARGS = { 4 | "stop": tenacity.stop_after_attempt(10), 5 | "wait": tenacity.wait_fixed(120), 6 | "reraise": True, 7 | } 8 | -------------------------------------------------------------------------------- /docker-compose-dev.yml: -------------------------------------------------------------------------------- 1 | x-docker-common: &docker-common 2 | env_file: .env.staging 3 | image: pycon-etl 4 | build: 5 | context: . 
6 | dockerfile: Dockerfile.test 7 | volumes: 8 | - ./airflow.db:/opt/airflow/airflow.db 9 | # you can comment out the following line if you don't have service-account.json 10 | - ./service-account.json:/opt/airflow/service-account.json 11 | restart: unless-stopped 12 | logging: 13 | driver: json-file 14 | options: 15 | max-size: 10m 16 | 17 | services: 18 | airflow: 19 | <<: *docker-common 20 | container_name: airflow 21 | ports: 22 | - "8080:8080" 23 | command: webserver 24 | 25 | scheduler: 26 | <<: *docker-common 27 | container_name: scheduler 28 | depends_on: 29 | - airflow 30 | command: scheduler 31 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | x-docker-common: &docker-common 2 | env_file: .env.production 3 | image: asia-east1-docker.pkg.dev/pycontw-225217/data-team/pycon-etl:latest 4 | volumes: 5 | - ./service-account.json:/opt/airflow/service-account.json 6 | - ./airflow.db:/opt/airflow/airflow.db 7 | restart: unless-stopped 8 | logging: 9 | driver: json-file 10 | options: 11 | max-size: 10m 12 | 13 | services: 14 | airflow: 15 | <<: *docker-common 16 | container_name: airflow 17 | ports: 18 | - "8080:8080" 19 | command: webserver 20 | 21 | scheduler: 22 | <<: *docker-common 23 | container_name: scheduler 24 | depends_on: 25 | - airflow 26 | command: scheduler 27 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guide 2 | 3 | ## How to Contribute to this Project 4 | 5 | 1. Clone this repository: 6 | 7 | ```bash 8 | git clone https://github.com/pycontw/pycon-etl 9 | ``` 10 | 11 | 2. Create a new branch: 12 | 13 | ```bash 14 | git checkout -b 15 | ``` 16 | 17 | 3. Make your changes. 18 | 19 | > **NOTICE:** We are still using Airflow v1, so please read the official document [Apache Airflow v1.10.15 Documentation](https://airflow.apache.org/docs/apache-airflow/1.10.15/) to ensure your changes are compatible with our current version. 20 | 21 | If your task uses an external service, add the connection and variable in the Airflow UI. 22 | 23 | 4. Test your changes in your local environment: 24 | 25 | - Ensure the DAG file is loaded successfully. 26 | - Verify that the task runs successfully. 27 | - Confirm that your code is correctly formatted and linted. 28 | - Check that all necessary dependencies are included in `requirements.txt`. 29 | 30 | 5. Push your branch: 31 | 32 | ```bash 33 | git push origin 34 | ``` 35 | 36 | 6. Create a Pull Request (PR). 37 | 38 | 7. Wait for the review and merge. 39 | 40 | 8. Write any necessary documentation. 41 | 42 | ## Release Management 43 | 44 | Please use [GitLab Flow](https://about.gitlab.com/topics/version-control/what-is-gitlab-flow/); otherwise, you cannot pass Docker Hub CI. 45 | 46 | ## Dependency Management 47 | 48 | Airflow dependencies are managed by [uv]. For more information, refer to the [Airflow Installation Documentation](https://airflow.apache.org/docs/apache-airflow/1.10.15/installation.html). 49 | 50 | ## Code Convention 51 | 52 | ### Airflow DAG 53 | 54 | - Please refer to [this article](https://medium.com/@davidtnfsh/%E5%A4%A7%E6%95%B0%E6%8D%AE%E4%B9%8B%E8%B7%AF-%E9%98%BF%E9%87%8C%E5%B7%B4%E5%B7%B4%E5%A4%A7%E6%95%B0%E6%8D%AE%E5%AE%9E%E8%B7%B5-%E8%AE%80%E6%9B%B8%E5%BF%83%E5%BE%97-54e795c2b8c) for naming guidelines. 
55 | 56 | - Examples: 57 | 1. `ods/opening_crawler`: Crawlers written by @Rain. These openings can be used for the recruitment board, which was implemented by @tai271828 and @stacy. 58 | 2. `ods/survey_cake`: A manually triggered uploader that uploads questionnaires to BigQuery. The uploader should be invoked after we receive the SurveyCake questionnaire. 59 | 60 | - Table name convention: 61 | ![img](https://miro.medium.com/max/1400/1*bppuEKMnL9gFnvoRHUO8CQ.png) 62 | 63 | ### Format 64 | 65 | Please use `make format` to format your code before committing; otherwise, the CI will fail. 66 | 67 | ### Commit Message 68 | 69 | It is recommended to use [Commitizen](https://commitizen-tools.github.io/commitizen/). 70 | 71 | ### CI/CD 72 | 73 | Please check the [.github/workflows](.github/workflows) directory for details. 74 | 75 | [uv]: https://docs.astral.sh/uv/ -------------------------------------------------------------------------------- /docs/DEPLOYMENT.md: -------------------------------------------------------------------------------- 1 | # Deployment Guide 2 | 3 | 1. Log in to the data team's server: 4 | 1. Run: `gcloud compute ssh --zone "asia-east1-b" "data-team" --project "pycontw-225217"` 5 | 2. Services: 6 | * ETL: `/srv/pycon-etl` 7 | * Metabase: `/mnt/disks/data-team-additional-disk/pycontw-infra-scripts/data_team/metabase_server` 8 | 9 | 2. Pull the latest codebase to this server: `git pull` 10 | 11 | 3. Add credentials to the `.env.production` file (only needs to be done once). 12 | 13 | 4. Start the services: 14 | 15 | ```bash 16 | # Start production services 17 | docker-compose -f ./docker-compose.yml up 18 | 19 | # Stop production services 20 | # docker-compose -f ./docker-compose.yml down 21 | ``` -------------------------------------------------------------------------------- /docs/MAINTENANCE.md: -------------------------------------------------------------------------------- 1 | # Maintenance Guide 2 | 3 | ## Disk Space 4 | 5 | Disk space on the server is currently limited, so please check it before running any ETL jobs. 6 | 7 | This section will be deprecated if we no longer encounter out-of-disk issues. 8 | 9 | 1. Find the largest folders: 10 | ```bash 11 | du -a /var/lib/docker/overlay2 | sort -n -r | head -n 20 12 | ``` 13 | 2. Show the size of a specific folder: 14 | ```bash 15 | du -hs <folder> 16 | ``` 17 | 3. Delete the large folders identified. 18 | 4. Check disk space: 19 | ```bash 20 | df -h 21 | ``` 22 | 23 | ## Token Expiration 24 | 25 | Some API tokens might expire, so please check them regularly (see the sketch at the end of this guide for one way to review them). 26 | 27 | ## Year-to-Year Jobs 28 | 29 | Please refer to [Dev Data Team - Year to Year Jobs - HackMD](https://hackmd.io/R417olqPQSWnQYY1Oc_-Sw?view) for more details.
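As a starting point for the token review mentioned in the Token Expiration section above, the commands below show one way to inspect the tokens that live in Airflow connections and variables. This is only a sketch: it assumes the containers are running with the names defined in `docker-compose.yml` (`airflow`, `scheduler`), it uses the Airflow 1.10 CLI syntax, and the variable shown (`YOUTUBE_KEY`) is just an example; check whichever variables your own DAGs read.

```bash
# List all Airflow connections so the ones with expiring credentials can be reviewed.
docker exec scheduler airflow connections --list

# Spot-check a single Airflow variable that stores an API key (example: the YouTube DAG).
# Note: this prints the secret value, so run it only on the server.
docker exec scheduler airflow variables --get YOUTUBE_KEY
```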
30 | -------------------------------------------------------------------------------- /docs/airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/docs/airflow.png -------------------------------------------------------------------------------- /docs/kktix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/docs/kktix.png -------------------------------------------------------------------------------- /docs/youtube-connection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/docs/youtube-connection.png -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Exit script on first error 3 | set -e 4 | 5 | # Check if the AIRFLOW_HOME variable is set 6 | if [ -z "${AIRFLOW_HOME}" ]; then 7 | echo 'AIRFLOW_HOME not set' 8 | exit 1 9 | fi 10 | 11 | # Create Fernet key if not exists 12 | if [ -z "${AIRFLOW__CORE__FERNET_KEY}" ]; then 13 | echo "Fernet key not set. Generating a new one." 14 | export AIRFLOW__CORE__FERNET_KEY=$(python -c 'from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())') 15 | echo "Fernet key generated and set." 16 | echo "[WARNING] Please save the AIRFLOW__CORE__FERNET_KEY for future use." 17 | else 18 | echo "Fernet key exists." 19 | fi 20 | 21 | # Check if the database exists and initialize it if not 22 | if [ ! -f "${AIRFLOW_HOME}/airflow.db" ]; then 23 | airflow db init 24 | echo 'Database initialized' 25 | else 26 | echo 'Database existed' 27 | fi 28 | 29 | # Check if the GCP service account is provided 30 | if [ -z "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then 31 | echo "No GCP service account provided, set to default path" 32 | export GOOGLE_APPLICATION_CREDENTIALS="${AIRFLOW_HOME}/service-account.json" 33 | fi 34 | 35 | # Check if the command is provided 36 | if [ -z "$1" ]; then 37 | echo "No command provided. Usage: $0 {airflow_command}" 38 | exit 1 39 | fi 40 | 41 | # Execute the provided Airflow command 42 | echo "Running command: airflow $@" 43 | exec airflow "$@" 44 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pycon-etl" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.8,<3.9" 7 | dependencies = [ 8 | # Please use pip to manage airflow dependencies. 9 | "apache-airflow==1.10.15", 10 | # Editable install with no version specified. 
11 | "google-cloud-bigquery", 12 | "pandas", 13 | "pyarrow", 14 | "pydantic<2", 15 | "pygsheets", 16 | "requests", 17 | "searchconsole", 18 | ] 19 | 20 | [dependency-groups] 21 | dev = [ 22 | "bandit", 23 | "ruff", 24 | "mypy", 25 | "pytest", 26 | "pytest-cov", 27 | "coverage[toml]", 28 | "safety", 29 | ] 30 | 31 | 32 | [tool.ruff] 33 | line-length = 88 34 | 35 | [tool.ruff.lint] 36 | extend-select = [ 37 | "I", # Missing required import (auto-fixable) 38 | "UP", # Pyupgrade 39 | "PT", # flake8-pytest-style rules 40 | "TID25", # flake8-tidy-imports rules 41 | ] 42 | 43 | ignore = ["E501", "D1", "D415"] 44 | 45 | [tool.ruff.lint.isort] 46 | combine-as-imports = true 47 | known-first-party = ["tests"] 48 | 49 | [tool.ruff.lint.pydocstyle] 50 | convention = "google" 51 | 52 | 53 | [tool.coverage] 54 | [tool.coverage.report] 55 | show_missing = true 56 | exclude_lines = [ 57 | # Have to re-enable the standard pragma 58 | 'pragma: no cover', 59 | 60 | # Don't complain about missing debug-only code: 61 | 'def __repr__', 62 | 'if self\.debug', 63 | 64 | # Don't complain if tests don't hit defensive assertion code: 65 | 'raise AssertionError', 66 | 'raise NotImplementedError', 67 | 68 | # Don't complain if non-runnable code isn't run: 69 | 'if 0:', 70 | 'if __name__ == .__main__.:', 71 | ] 72 | omit = ['env/*', 'venv/*', '*/virtualenv/*', '*/virtualenvs/*', '*/tests/*'] 73 | 74 | [tool.uv] 75 | constraint-dependencies = [ 76 | # constraints-3.8.txt for apache-airflow==1.10.15 77 | # Editable install with no version control (apache-airflow==1.10.15) 78 | "Babel==2.9.0", 79 | "Flask-Admin==1.5.4", 80 | "Flask-AppBuilder==2.3.4", 81 | "Flask-Babel==1.0.0", 82 | "Flask-Bcrypt==0.7.1", 83 | "Flask-Caching==1.3.3", 84 | "Flask-JWT-Extended==3.25.0", 85 | "Flask-Login==0.4.1", 86 | "Flask-OpenID==1.3.0", 87 | "Flask-SQLAlchemy==2.4.4", 88 | "Flask-WTF==0.14.3", 89 | "Flask==1.1.2", 90 | "GitPython==3.1.11", 91 | "JPype1==0.7.1", 92 | "JayDeBeApi==1.2.3", 93 | "Jinja2==2.11.2", 94 | "Mako==1.1.3", 95 | "Markdown==2.6.11", 96 | "MarkupSafe==1.1.1", 97 | "PyHive==0.6.3", 98 | "PyJWT==1.7.1", 99 | "PyNaCl==1.4.0", 100 | "PySmbClient==0.1.5", 101 | "PyYAML==5.3.1", 102 | "Pygments==2.7.2", 103 | "SQLAlchemy-JSONField==0.9.0", 104 | "SQLAlchemy-Utils==0.36.8", 105 | "SQLAlchemy==1.3.20", 106 | "Sphinx==3.3.1", 107 | "Unidecode==1.1.1", 108 | "WTForms==2.3.3", 109 | "Werkzeug==0.16.1", 110 | "adal==1.2.5", 111 | "aiohttp==3.7.3", 112 | "alabaster==0.7.12", 113 | "alembic==1.4.3", 114 | "amqp==2.6.1", 115 | "analytics-python==1.2.9", 116 | "ansiwrap==0.8.4", 117 | "apispec==1.3.3", 118 | "appdirs==1.4.4", 119 | "argcomplete==1.12.2", 120 | "asn1crypto==1.4.0", 121 | "astroid==2.4.2", 122 | "async-generator==1.10", 123 | "async-timeout==3.0.1", 124 | "atlasclient==1.0.0", 125 | "attrs==20.3.0", 126 | "aws-sam-translator==1.31.0", 127 | "aws-xray-sdk==2.6.0", 128 | "azure-common==1.1.26", 129 | "azure-core==1.9.0", 130 | "azure-cosmos==3.2.0", 131 | "azure-datalake-store==0.0.51", 132 | "azure-identity==1.5.0", 133 | "azure-keyvault-certificates==4.2.1", 134 | "azure-keyvault-keys==4.3.0", 135 | "azure-keyvault-secrets==4.2.0", 136 | "azure-keyvault==4.1.0", 137 | "azure-mgmt-containerinstance==1.5.0", 138 | "azure-mgmt-core==1.2.2", 139 | "azure-mgmt-datalake-nspkg==3.0.1", 140 | "azure-mgmt-datalake-store==0.5.0", 141 | "azure-mgmt-nspkg==3.0.2", 142 | "azure-mgmt-resource==15.0.0", 143 | "azure-nspkg==3.0.2", 144 | "azure-storage-blob==12.6.0", 145 | "azure-storage-common==2.1.0", 146 | "azure-storage==0.36.0", 
147 | "backcall==0.2.0", 148 | "bcrypt==3.2.0", 149 | "beautifulsoup4==4.7.1", 150 | "billiard==3.6.3.0", 151 | "black==20.8b0", 152 | "blinker==1.4", 153 | "boto3==1.10.50", 154 | "boto==2.49.0", 155 | "botocore==1.13.50", 156 | "cached-property==1.5.2", 157 | "cachetools==4.1.1", 158 | "cassandra-driver==3.20.2", 159 | "cattrs==1.1.2", 160 | "celery==4.4.7", 161 | "certifi==2020.11.8", 162 | "cffi==1.14.4", 163 | "cfgv==3.2.0", 164 | "cfn-lint==0.42.0", 165 | "cgroupspy==0.1.6", 166 | "chardet==3.0.4", 167 | "click==6.7", 168 | "cloudant==0.5.10", 169 | "colorama==0.4.4", 170 | "colorlog==4.0.2", 171 | "configparser==3.5.3", 172 | "coverage==5.3", 173 | "croniter==0.3.36", 174 | "cryptography==3.2.1", 175 | "cx-Oracle==8.0.1", 176 | "datadog==0.39.0", 177 | "decorator==4.4.2", 178 | "defusedxml==0.6.0", 179 | "dill==0.3.3", 180 | "distlib==0.3.1", 181 | "dnspython==1.16.0", 182 | "docker-pycreds==0.4.0", 183 | "docker==3.7.3", 184 | "docopt==0.6.2", 185 | "docutils==0.15.2", 186 | "ecdsa==0.14.1", 187 | "elasticsearch-dsl==5.4.0", 188 | "elasticsearch==5.5.3", 189 | "email-validator==1.1.2", 190 | "entrypoints==0.3", 191 | "fastavro==1.2.0", 192 | "filelock==3.0.12", 193 | "flake8-colors==0.1.9", 194 | "flake8==3.8.4", 195 | "flaky==3.7.0", 196 | "flask-swagger==0.2.14", 197 | "flower==0.9.5", 198 | "freezegun==1.0.0", 199 | "fsspec==0.8.4", 200 | "funcsigs==1.0.2", 201 | "future-fstrings==1.2.0", 202 | "future==0.18.2", 203 | "gcsfs==0.7.1", 204 | "gitdb==4.0.5", 205 | "google-api-core==1.23.0", 206 | "google-api-python-client==1.12.8", 207 | "google-auth-httplib2==0.0.4", 208 | "google-auth-oauthlib==0.4.2", 209 | "google-auth==1.23.0", 210 | "google-cloud-bigquery-storage==2.1.0", 211 | "google-cloud-bigquery==2.4.0", 212 | "google-cloud-bigtable==1.6.0", 213 | "google-cloud-container==1.0.1", 214 | "google-cloud-core==1.4.3", 215 | "google-cloud-dlp==1.0.0", 216 | "google-cloud-language==1.3.0", 217 | "google-cloud-secret-manager==1.0.0", 218 | "google-cloud-spanner==1.19.1", 219 | "google-cloud-speech==1.3.2", 220 | "google-cloud-storage==1.33.0", 221 | "google-cloud-texttospeech==1.0.1", 222 | "google-cloud-translate==1.7.0", 223 | "google-cloud-videointelligence==1.16.1", 224 | "google-cloud-vision==1.0.0", 225 | "google-crc32c==1.0.0", 226 | "google-resumable-media==1.1.0", 227 | "googleapis-common-protos==1.52.0", 228 | "graphviz==0.15", 229 | "grpc-google-iam-v1==0.12.3", 230 | "grpcio-gcp==0.2.2", 231 | "grpcio==1.33.2", 232 | "gunicorn==20.0.4", 233 | "hdfs==2.5.8", 234 | "hmsclient==0.1.1", 235 | "httplib2==0.18.1", 236 | "humanize==3.1.0", 237 | "hvac==0.10.5", 238 | "identify==1.5.10", 239 | "idna==2.8", 240 | "imagesize==1.2.0", 241 | "importlib-metadata==2.1.1", 242 | "importlib-resources==1.5.0", 243 | "inflection==0.5.1", 244 | "ipdb==0.13.4", 245 | "ipython-genutils==0.2.0", 246 | "ipython==7.19.0", 247 | "iso8601==0.1.13", 248 | "isodate==0.6.0", 249 | "itsdangerous==1.1.0", 250 | "jedi==0.17.2", 251 | "jira==2.0.0", 252 | "jmespath==0.10.0", 253 | "json-merge-patch==0.2", 254 | "jsondiff==1.1.2", 255 | "jsonpatch==1.27", 256 | "jsonpickle==1.4.1", 257 | "jsonpointer==2.0", 258 | "jsonschema==3.2.0", 259 | "junit-xml==1.9", 260 | "jupyter-client==6.1.7", 261 | "jupyter-core==4.7.0", 262 | "kombu==4.6.11", 263 | "kubernetes==11.0.0", 264 | "lazy-object-proxy==1.4.3", 265 | "ldap3==2.8.1", 266 | "libcst==0.3.14", 267 | "lockfile==0.12.2", 268 | "marshmallow-enum==1.5.1", 269 | "marshmallow-sqlalchemy==0.23.1", 270 | "marshmallow==2.21.0", 271 | "mccabe==0.6.1", 272 | 
"mock==4.0.2", 273 | "mongomock==3.21.0", 274 | "more-itertools==8.6.0", 275 | "moto==1.3.14", 276 | "msal-extensions==0.3.0", 277 | "msal==1.6.0", 278 | "msrest==0.6.19", 279 | "msrestazure==0.6.4", 280 | "multi-key-dict==2.0.3", 281 | "multidict==5.0.2", 282 | "mypy-extensions==0.4.3", 283 | "mypy==0.720", 284 | "mysqlclient==1.3.14", 285 | "natsort==7.1.0", 286 | "nbclient==0.5.1", 287 | "nbformat==5.0.8", 288 | "nest-asyncio==1.4.3", 289 | "networkx==2.5", 290 | "nodeenv==1.5.0", 291 | "nteract-scrapbook==0.4.1", 292 | "ntlm-auth==1.5.0", 293 | "numpy==1.19.4", 294 | "oauthlib==3.1.0", 295 | "oscrypto==1.2.1", 296 | "packaging==20.7", 297 | "pandas-gbq==0.14.1", 298 | "pandas==1.1.4", 299 | "papermill==2.2.2", 300 | "parameterized==0.7.4", 301 | "paramiko==2.7.2", 302 | "parso==0.7.1", 303 | "pathspec==0.8.1", 304 | "pbr==5.5.1", 305 | "pendulum==1.4.4", 306 | "pexpect==4.8.0", 307 | "pickleshare==0.7.5", 308 | "pinotdb==0.1.1", 309 | "pipdeptree==1.0.0", 310 | "pluggy==0.13.1", 311 | "portalocker==1.7.1", 312 | "pre-commit==2.9.2", 313 | "presto-python-client==0.7.0", 314 | "prison==0.1.3", 315 | "prometheus-client==0.8.0", 316 | "prompt-toolkit==3.0.8", 317 | "proto-plus==1.11.0", 318 | "protobuf==3.14.0", 319 | "psutil==5.7.3", 320 | "psycopg2-binary==2.8.6", 321 | "ptyprocess==0.6.0", 322 | "py==1.9.0", 323 | "pyOpenSSL==20.0.0", 324 | "pyarrow==0.17.1", 325 | "pyasn1-modules==0.2.8", 326 | "pyasn1==0.4.8", 327 | "pycodestyle==2.6.0", 328 | "pycparser==2.20", 329 | "pycryptodomex==3.9.9", 330 | "pydata-google-auth==1.1.0", 331 | "pydruid==0.5.8", 332 | "pyflakes==2.2.0", 333 | "pykerberos==1.2.1", 334 | "pymongo==3.10.1", 335 | "pyparsing==2.4.7", 336 | "pyrsistent==0.17.3", 337 | "pysftp==0.2.9", 338 | "pytest-cov==2.10.1", 339 | "pytest-instafail==0.4.2", 340 | "pytest-timeouts==1.2.1", 341 | "pytest==5.4.3", 342 | "python-daemon==2.2.4", 343 | "python-dateutil==2.8.1", 344 | "python-editor==1.0.4", 345 | "python-http-client==3.3.1", 346 | "python-jenkins==1.7.0", 347 | "python-jose==3.2.0", 348 | "python-nvd3==0.15.0", 349 | "python-slugify==4.0.1", 350 | "python3-openid==3.2.0", 351 | "pytz==2020.4", 352 | "pytzdata==2020.1", 353 | "pywinrm==0.4.1", 354 | "pyzmq==20.0.0", 355 | "qds-sdk==1.16.1", 356 | "redis==3.5.3", 357 | "regex==2020.11.13", 358 | "requests-futures==0.9.4", 359 | "requests-kerberos==0.12.0", 360 | "requests-mock==1.8.0", 361 | "requests-ntlm==1.1.0", 362 | "requests-oauthlib==1.3.0", 363 | "requests-toolbelt==0.9.1", 364 | "requests==2.23.0", 365 | "responses==0.12.1", 366 | "rsa==4.6", 367 | "s3transfer==0.2.1", 368 | "sasl==0.2.1", 369 | "sendgrid==5.6.0", 370 | "sentinels==1.0.0", 371 | "sentry-sdk==0.19.4", 372 | "setproctitle==1.2", 373 | "six==1.15.0", 374 | "slackclient==1.3.2", 375 | "smmap==3.0.4", 376 | "snakebite-py3==3.0.5", 377 | "snowballstemmer==2.0.0", 378 | "snowflake-connector-python==2.3.6", 379 | "snowflake-sqlalchemy==1.2.4", 380 | "soupsieve==2.0.1", 381 | "sphinx-argparse==0.2.5", 382 | "sphinx-autoapi==1.0.0", 383 | "sphinx-copybutton==0.3.1", 384 | "sphinx-jinja==1.1.1", 385 | "sphinx-rtd-theme==0.5.0", 386 | "sphinxcontrib-applehelp==1.0.2", 387 | "sphinxcontrib-devhelp==1.0.2", 388 | "sphinxcontrib-dotnetdomain==0.4", 389 | "sphinxcontrib-golangdomain==0.2.0.dev0", 390 | "sphinxcontrib-htmlhelp==1.0.3", 391 | "sphinxcontrib-httpdomain==1.7.0", 392 | "sphinxcontrib-jsmath==1.0.1", 393 | "sphinxcontrib-qthelp==1.0.3", 394 | "sphinxcontrib-serializinghtml==1.1.4", 395 | "sshpubkeys==3.1.0", 396 | "sshtunnel==0.1.5", 397 | 
"tabulate==0.8.7", 398 | "tenacity==4.12.0", 399 | "text-unidecode==1.3", 400 | "textwrap3==0.9.2", 401 | "thrift-sasl==0.4.2", 402 | "thrift==0.13.0", 403 | "toml==0.10.2", 404 | "tornado==5.1.1", 405 | "tqdm==4.54.0", 406 | "traitlets==5.0.5", 407 | "typed-ast==1.4.1", 408 | "typing-extensions==3.7.4.3", 409 | "typing-inspect==0.6.0", 410 | "tzlocal==1.5.1", 411 | "unicodecsv==0.14.1", 412 | "uritemplate==3.0.1", 413 | "urllib3==1.25.11", 414 | "vertica-python==1.0.0", 415 | "vine==1.3.0", 416 | "virtualenv==20.2.1", 417 | "wcwidth==0.2.5", 418 | "websocket-client==0.54.0", 419 | "wrapt==1.12.1", 420 | "xmltodict==0.12.0", 421 | "yamllint==1.25.0", 422 | "yarl==1.6.3", 423 | "zdesk==2.7.1", 424 | "zipp==3.4.0", 425 | "zope.deprecation==4.4.0", 426 | ] 427 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [mypy] 2 | files = dags, tests 3 | ignore_missing_imports = true 4 | follow_imports = silent 5 | warn_redundant_casts = True 6 | warn_unused_ignores = True 7 | warn_unused_configs = True 8 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pycontw/pycon-etl/e593ee0037d7035e7412f7b88d3690e594cc0bb0/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture 5 | def kktix_api_data(): 6 | return [ 7 | { 8 | "id": 84296, 9 | "name": "PyCon APAC 2022 Registration: Individual【Online Conference】", 10 | "attendee_info": { 11 | "id": 84748358, 12 | "ticket_id": 449116, 13 | "ticket_name": "Regular 一般票(with Pyckage)", 14 | "reg_no": 104, 15 | "state": "activated", 16 | "checkin_code": "BC7B", 17 | "qrcode": "bc7bd846f49d2d2e1g833cc92gdg2cf9", 18 | "is_paid": True, 19 | "price": 2600, 20 | "currency": "TWD", 21 | "payment_method": "WEBSITE", 22 | "data": [ 23 | ["Nickname / 暱稱", "Stanley"], 24 | ["Gender / 生理性別", "Male / 男性"], 25 | [ 26 | "If you buy the ticket with PySafe, remember to fill out correct address and size of t-shirt for us to send the parcel. if you fill the wrong information to cause missed delivery, we will not resend th", 27 | "", 28 | ], 29 | [ 30 | "購買含 Pyckage 票卷者,請務必填寫正確之「Address / 收件地址」和「Size of T-shirt / T恤尺寸 」(僅限台灣及離島區域),以避免 Pyckage 無法送達,如因填寫錯誤致未收到 Pyckage ,報名人須自行負責,大會恕不再另行補寄", 31 | "", 32 | ], 33 | [ 34 | "Address / 收件地址 Ex: No. 128, Sec. 2, Academia Rd., Nangang Dist., Taipei City 115201, Taiwan (R.O.C.) 
/ 115台北市南港區研究院路二段128號", 35 | "新竹市北區天府路一段162號4樓之3", 36 | ], 37 | [ 38 | "Size of T-shirt / T恤尺寸", 39 | "M / 胸寬(F.W.): 49cm / 衣長(C.L.): 70cm", 40 | ], 41 | ["Come From / 國家或地區", "Taiwan 台灣"], 42 | ["Age range / 年齡區間", "36 - 45"], 43 | [ 44 | 'Job Title / 職稱 (If you are a student, fill in "student")', 45 | "全端工程師", 46 | ], 47 | [ 48 | "Company / 服務單位 (For students or teachers, fill in the School + Department Name)", 49 | "雲灣資訊有限公司", 50 | ], 51 | ["Years of Using Python / 使用 Python 多久", "6-10 years"], 52 | [ 53 | "Area of Interest / 興趣領域", 54 | "Web Development, DevOps, Engineering & Mathematics", 55 | ], 56 | [ 57 | "Have you ever attended PyCon TW?/ 是否曾參加 PyCon TW?", 58 | "5-7 times", 59 | ], 60 | [ 61 | "Would you like to receive an email from sponsors?/ 是否願意收到贊助商轉發 Email 訊息?", 62 | "Yes", 63 | ], 64 | [ 65 | "I would like to donate invoice to Open Culture Foundation / 我願意捐贈發票給開放文化基金會 (ref: https://reurl.cc/ZQ6VY6)", 66 | "No", 67 | ], 68 | [ 69 | "Privacy Policy of PyCon APAC 2022 / PyCon APAC 2022 個人資料保護聲明", 70 | "", 71 | ], 72 | [ 73 | "I’ve already read and I accept the Privacy Policy of PyCon APAC 2022 / 我已閱讀並同意 PyCon APAC 2022 個人資料保護聲明", 74 | "Yes", 75 | ], 76 | [ 77 | "I am fully aware of the Gather Privacy Policy, only participants that are over the age of 18 can access to the venue / 我已被告知因為 gather 政策,需滿18歲以上方能登入會議場地", 78 | "", 79 | ], 80 | ["聯絡人 姓名", "李xx"], 81 | ["聯絡人 Email", "xxx@gmail.com"], 82 | ["聯絡人 手機", "0900000000"], 83 | ["標籤", ""], 84 | ], 85 | "kyc": {}, 86 | "id_number": None, 87 | "search_string": "Stanley\nMale", 88 | "updated_at": 1656921502.5667331, 89 | "ticket_type": "qrcode", 90 | "slot": {}, 91 | "order_no": 127666621, 92 | }, 93 | } 94 | ] 95 | -------------------------------------------------------------------------------- /tests/data_questionnaire.csv: -------------------------------------------------------------------------------- 1 | 你是從哪裡過來呢?,填答時間,填答秒數,IP紀錄,額滿結束註記,使用者紀錄,會員時間,會員編號,自訂ID,備註 2 | 台灣南部,"2021-07-03 15:34:50",311,36.226.4.68,,,,',, 3 | 台灣北部,"2021-07-03 16:13:18",5,36.226.4.68,,,,',, 4 | 其他,"2021-07-03 16:55:40",8,2001:b400:e23c:9eb4:90b4:98ef:33:8eee,,,,',, 5 | -------------------------------------------------------------------------------- /tests/kktix_ticket_orders/test_klaviyo_loader.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | from dags.ods.kktix_ticket_orders.udfs import klaviyo_loader 4 | 5 | 6 | def fake_airflow_variable(): 7 | return { 8 | "KLAVIYO_LIST_ID": "abc", 9 | "KLAVIYO_CAMPAIGN_ID": "123", 10 | } 11 | 12 | 13 | @patch("dags.ods.kktix_ticket_orders.udfs.klaviyo_loader.klaviyo_mailer.main") 14 | @patch( 15 | "dags.ods.kktix_ticket_orders.udfs.klaviyo_loader.Variable", 16 | new_callable=fake_airflow_variable, 17 | ) 18 | def test_klaviyo_loader(variable, mailer, kktix_api_data): 19 | klaviyo_loader.load(kktix_api_data) 20 | mailer.assert_called_once_with( 21 | list_id="abc", 22 | campaign_id="123", 23 | campaign_name="隨買即用", 24 | datas=[ 25 | { 26 | "email": "xxx@gmail.com", 27 | "name": "李xx", 28 | "qrcode": "bc7bd846f49d2d2e1g833cc92gdg2cf9", 29 | } 30 | ], 31 | ) 32 | -------------------------------------------------------------------------------- /tests/kktix_ticket_orders/test_transformer.py: -------------------------------------------------------------------------------- 1 | """ 2 | test transformer 3 | """ 4 | 5 | from dags.ods.kktix_ticket_orders.udfs.kktix_transformer import transform 6 | 7 | 8 | def test_transform(kktix_api_data) -> None: 9 
| ground_truth = [ 10 | { 11 | "id": 84296, 12 | "name": "PyCon APAC 2022 Registration: Individual【Online Conference】", 13 | "attendee_info": { 14 | "id": 84748358, 15 | "ticket_id": 449116, 16 | "ticket_name": "Regular 一般票(with Pyckage)", 17 | "reg_no": 104, 18 | "state": "activated", 19 | "checkin_code": "BC7B", 20 | "qrcode": "bc7bd846f49d2d2e1g833cc92gdg2cf9", 21 | "is_paid": True, 22 | "price": 2600, 23 | "currency": "TWD", 24 | "payment_method": "WEBSITE", 25 | "data": [ 26 | ["Nickname / 暱稱", "Stanley"], 27 | ["Gender / 生理性別", "Male / 男性"], 28 | [ 29 | "If you buy the ticket with PySafe, remember to fill out correct address and size of t-shirt for us to send the parcel. if you fill the wrong information to cause missed delivery, we will not resend th", 30 | "", 31 | ], 32 | [ 33 | "購買含 Pyckage 票卷者,請務必填寫正確之「Address / 收件地址」和「Size of T-shirt / T恤尺寸 」(僅限台灣及離島區域),以避免 Pyckage 無法送達,如因填寫錯誤致未收到 Pyckage ,報名人須自行負責,大會恕不再另行補寄", 34 | "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", 35 | ], 36 | [ 37 | "Address / 收件地址 Ex: No. 128, Sec. 2, Academia Rd., Nangang Dist., Taipei City 115201, Taiwan (R.O.C.) / 115台北市南港區研究院路二段128號", 38 | "36190b79eb7396cfb91e413fecef9707bca87f32012fb01fc38caa236fb053d0", 39 | ], 40 | [ 41 | "Size of T-shirt / T恤尺寸", 42 | "M / 胸寬(F.W.): 49cm / 衣長(C.L.): 70cm", 43 | ], 44 | ["Come From / 國家或地區", "Taiwan 台灣"], 45 | ["Age range / 年齡區間", "36 - 45"], 46 | [ 47 | 'Job Title / 職稱 (If you are a student, fill in "student")', 48 | "全端工程師", 49 | ], 50 | [ 51 | "Company / 服務單位 (For students or teachers, fill in the School + Department Name)", 52 | "雲灣資訊有限公司", 53 | ], 54 | ["Years of Using Python / 使用 Python 多久", "6-10 years"], 55 | [ 56 | "Area of Interest / 興趣領域", 57 | "Web Development, DevOps, Engineering & Mathematics", 58 | ], 59 | [ 60 | "Have you ever attended PyCon TW?/ 是否曾參加 PyCon TW?", 61 | "5-7 times", 62 | ], 63 | [ 64 | "Would you like to receive an email from sponsors?/ 是否願意收到贊助商轉發 Email 訊息?", 65 | "Yes", 66 | ], 67 | [ 68 | "I would like to donate invoice to Open Culture Foundation / 我願意捐贈發票給開放文化基金會 (ref: https://reurl.cc/ZQ6VY6)", 69 | "No", 70 | ], 71 | [ 72 | "Privacy Policy of PyCon APAC 2022 / PyCon APAC 2022 個人資料保護聲明", 73 | "", 74 | ], 75 | [ 76 | "I’ve already read and I accept the Privacy Policy of PyCon APAC 2022 / 我已閱讀並同意 PyCon APAC 2022 個人資料保護聲明", 77 | "Yes", 78 | ], 79 | [ 80 | "I am fully aware of the Gather Privacy Policy, only participants that are over the age of 18 can access to the venue / 我已被告知因為 gather 政策,需滿18歲以上方能登入會議場地", 81 | "", 82 | ], 83 | [ 84 | "聯絡人 姓名", 85 | "2150750f32ee8dcd40537be8b5bee7c26e893a77cb23049eb3a0ca49a7512791", 86 | ], 87 | [ 88 | "聯絡人 Email", 89 | "26a695fcd9d98ffa1fba78cb5a1eacf0fbe19e40bf9de0cafa0080cdf4c14514", 90 | ], 91 | [ 92 | "聯絡人 手機", 93 | "86f3abfffd2f714a6d611429f82fac9264e8036b0fb490320bfe3e56c494a0e0", 94 | ], 95 | ["標籤", ""], 96 | ], 97 | "kyc": {}, 98 | "id_number": None, 99 | "updated_at": 1656921502.5667331, 100 | "ticket_type": "qrcode", 101 | "slot": {}, 102 | "order_no": 127666621, 103 | }, 104 | } 105 | ] 106 | if __debug__: 107 | if transform(kktix_api_data) != ground_truth: 108 | raise AssertionError( 109 | "Transform() might forget to de-identify some columns! e.g. 
name, email or phone number" 110 | ) 111 | -------------------------------------------------------------------------------- /tests/test_cakeresume_uploader.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from pathlib import Path 3 | 4 | from contrib.survey_cake.udfs.survey_cake_csv_uploader import SurveyCakeCSVUploader 5 | 6 | 7 | def test_cakeresume_uploader() -> None: 8 | fixtures = { 9 | "tests/data_questionnaire.csv": { 10 | "data_domain": "questionnaire", 11 | "primary_key": "ip", 12 | "time_dimension": "datetime", 13 | } 14 | } 15 | 16 | for filename, metadata in fixtures.items(): 17 | SURVEY_CAKE_CSV_UPLOADER = SurveyCakeCSVUploader(year=2146, filename=filename) 18 | SURVEY_CAKE_CSV_UPLOADER.transform() 19 | with open( 20 | Path("tests/data_questionnaire_dimension.csv") 21 | ) as data_questionnaire_dimension: 22 | rows = csv.reader(data_questionnaire_dimension) 23 | header = next(iter(rows)) 24 | if __debug__: 25 | if header != ["question_id", "question", "year"]: 26 | raise AssertionError("wrong header!") 27 | 28 | with open( 29 | Path("tests/data_questionnaire_facttable.csv") 30 | ) as data_questionnaire_facttable: 31 | rows = csv.reader(data_questionnaire_facttable) 32 | header = next(iter(rows)) 33 | if __debug__: 34 | if header != ["ip", "question_id", "answer", "year"]: 35 | raise AssertionError("wrong header!") 36 | -------------------------------------------------------------------------------- /tests/test_crawler.py: -------------------------------------------------------------------------------- 1 | """ 2 | test crawler 3 | """ 4 | 5 | from dags.ods.opening_crawler.udfs.crawlers import CakeResumeCrawler 6 | 7 | 8 | def test_demo() -> None: 9 | if __debug__: 10 | if CakeResumeCrawler.crawl() != "i'm a CakeResume crawler!": 11 | raise AssertionError("CakeResumeCrawler Error!") 12 | --------------------------------------------------------------------------------