├── .github └── workflows │ ├── codeql.yml │ ├── dockerimage.yml │ └── test.yml ├── .gitignore ├── Dockerfile ├── Dockerfile.metadata_service ├── Dockerfile.migration_service ├── Dockerfile.service.test ├── Dockerfile.ui_service ├── LICENSE ├── MANIFEST.in ├── README.md ├── RELEASE.md ├── docker-compose.development.yml ├── docker-compose.test.yml ├── docker-compose.yml ├── migration_tools.py ├── pytest.ini ├── requirements.dev.txt ├── requirements.txt ├── run_goose.py ├── services ├── __init__.py ├── data │ ├── __init__.py │ ├── db_utils.py │ ├── models.py │ ├── postgres_async_db.py │ ├── service_configs.py │ └── tagging_utils.py ├── metadata_service │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── admin.py │ │ ├── artifact.py │ │ ├── flow.py │ │ ├── metadata.py │ │ ├── run.py │ │ ├── step.py │ │ ├── task.py │ │ └── utils.py │ ├── requirements.txt │ ├── server.py │ └── tests │ │ ├── __init__.py │ │ ├── integration_tests │ │ ├── __init__.py │ │ ├── artifact_test.py │ │ ├── flow_test.py │ │ ├── metadata_test.py │ │ ├── run_test.py │ │ ├── step_test.py │ │ ├── task_test.py │ │ └── utils.py │ │ └── unit_tests │ │ ├── __init__.py │ │ ├── api_util_test.py │ │ └── task_test.py ├── migration_service │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── admin.py │ │ └── utils.py │ ├── data │ │ ├── __init__.py │ │ └── postgres_async_db.py │ ├── get_virtual_env.py │ ├── migration_config.py │ ├── migration_files │ │ ├── 1_create_tables.sql │ │ ├── 20200603104139_add_str_id_cols.sql │ │ ├── 20201002000616_update_metadata_primary_key.sql │ │ ├── 20210202145952_add_runs_idx_ts_epoch_flow_id.sql │ │ ├── 20210260056859_add_tasks_idx_on_.sql │ │ ├── 20211202100726_add_str_id_indices.sql │ │ ├── 20220503175500_add_run_epoch_index.sql │ │ └── 20230118020300_drop_partial_indexes.sql │ ├── migration_server.py │ ├── requirements.txt │ └── run_script.py ├── ui_backend_service │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── admin.py │ │ ├── artifact.py │ │ ├── autocomplete.py │ │ ├── card.py │ │ ├── config.py │ │ ├── dag.py │ │ ├── features.py │ │ ├── flow.py │ │ ├── heartbeat_monitor.py │ │ ├── log.py │ │ ├── metadata.py │ │ ├── notify.py │ │ ├── plugins.py │ │ ├── run.py │ │ ├── search.py │ │ ├── step.py │ │ ├── tag.py │ │ ├── task.py │ │ ├── utils.py │ │ └── ws.py │ ├── data │ │ ├── __init__.py │ │ ├── cache │ │ │ ├── __init__.py │ │ │ ├── card_cache_manager.py │ │ │ ├── card_cache_service.py │ │ │ ├── client │ │ │ │ ├── __init__.py │ │ │ │ ├── cache_action.py │ │ │ │ ├── cache_async_client.py │ │ │ │ ├── cache_client.py │ │ │ │ ├── cache_server.py │ │ │ │ ├── cache_store.py │ │ │ │ └── cache_worker.py │ │ │ ├── custom_flowgraph.py │ │ │ ├── generate_dag_action.py │ │ │ ├── get_artifacts_action.py │ │ │ ├── get_data_action.py │ │ │ ├── get_log_file_action.py │ │ │ ├── get_parameters_action.py │ │ │ ├── get_task_action.py │ │ │ ├── search_artifacts_action.py │ │ │ ├── store.py │ │ │ └── utils.py │ │ ├── db │ │ │ ├── __init__.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── artifact_row.py │ │ │ │ ├── base_row.py │ │ │ │ ├── flow_row.py │ │ │ │ ├── metadata_row.py │ │ │ │ ├── run_row.py │ │ │ │ ├── step_row.py │ │ │ │ └── task_row.py │ │ │ ├── postgres_async_db.py │ │ │ ├── tables │ │ │ │ ├── __init__.py │ │ │ │ ├── artifact.py │ │ │ │ ├── base.py │ │ │ │ ├── flow.py │ │ │ │ ├── metadata.py │ │ │ │ ├── run.py │ │ │ │ ├── step.py │ │ │ │ └── task.py │ │ │ └── utils.py │ │ └── refiner │ │ │ ├── __init__.py │ │ │ ├── artifact_refiner.py │ │ │ ├── 
parameter_refiner.py │ │ │ ├── refinery.py │ │ │ └── task_refiner.py │ ├── doc.py │ ├── docs │ │ ├── README.md │ │ ├── api.md │ │ ├── architecture.md │ │ ├── environment.md │ │ ├── images │ │ │ ├── cache_architecture.png │ │ │ ├── heartbeat_monitoring.png │ │ │ └── websocket_communication.png │ │ ├── plugins.md │ │ └── websockets.md │ ├── download_ui.sh │ ├── example.custom_quicklinks.json │ ├── example.notifications.json │ ├── example.plugins.json │ ├── features.py │ ├── frontend.py │ ├── plugins │ │ ├── __init__.py │ │ ├── installed │ │ │ └── .gitignore │ │ └── plugin.py │ ├── requirements.txt │ ├── tests │ │ ├── __init__.py │ │ ├── integration_tests │ │ │ ├── __init__.py │ │ │ ├── admin_test.py │ │ │ ├── artifact_test.py │ │ │ ├── autocomplete_test.py │ │ │ ├── card_test.py │ │ │ ├── features_test.py │ │ │ ├── flows_test.py │ │ │ ├── grouped_runs_test.py │ │ │ ├── log_test.py │ │ │ ├── metadata_test.py │ │ │ ├── notify_test.py │ │ │ ├── plugins_test.py │ │ │ ├── runs_test.py │ │ │ ├── status_attempts_test.py │ │ │ ├── status_runs_test.py │ │ │ ├── status_tasks_test.py │ │ │ ├── steps_test.py │ │ │ ├── tasks_test.py │ │ │ ├── utils.py │ │ │ └── ws_test.py │ │ └── unit_tests │ │ │ ├── __init__.py │ │ │ ├── cache_utils_test.py │ │ │ ├── custom_flowgraph_test.py │ │ │ ├── data_test.py │ │ │ ├── get_artifacts_action_test.py │ │ │ ├── get_log_file_action_test.py │ │ │ ├── search_artifacts_action_test.py │ │ │ ├── search_test.py │ │ │ └── utils_test.py │ ├── ui │ │ ├── .dockerignore │ │ ├── .gitignore │ │ └── static │ │ │ └── .gitignore │ └── ui_server.py └── utils │ ├── __init__.py │ └── tests │ ├── __init__.py │ └── unit_tests │ └── utils_test.py ├── setup.cfg ├── setup.py ├── tox.ini └── wait-for-postgres.sh /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "master" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "master" ] 20 | schedule: 21 | - cron: '22 12 * * 5' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Use only 'java' to analyze code written in Java, Kotlin or both 38 | # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both 39 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 40 | 41 | steps: 42 | - name: Checkout repository 43 | uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2 44 | 45 | # Initializes the CodeQL tools for scanning. 
46 | - name: Initialize CodeQL 47 | uses: github/codeql-action/init@7df0ce34898d659f95c0c4a09eaa8d4e32ee64db # v2.2.12 48 | with: 49 | languages: ${{ matrix.language }} 50 | # If you wish to specify custom queries, you can do so here or in a config file. 51 | # By default, queries listed here will override any specified in a config file. 52 | # Prefix the list here with "+" to use these queries and those in the config file. 53 | 54 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 55 | # queries: security-extended,security-and-quality 56 | 57 | 58 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). 59 | # If this step fails, then you should remove it and run the build manually (see below) 60 | - name: Autobuild 61 | uses: github/codeql-action/autobuild@7df0ce34898d659f95c0c4a09eaa8d4e32ee64db # v2.2.12 62 | 63 | # ℹ️ Command-line programs to run using the OS shell. 64 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 65 | 66 | # If the Autobuild fails above, remove it and uncomment the following three lines. 67 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 68 | 69 | # - run: | 70 | # echo "Run, Build Application using script" 71 | # ./location_of_script_within_repo/buildscript.sh 72 | 73 | - name: Perform CodeQL Analysis 74 | uses: github/codeql-action/analyze@7df0ce34898d659f95c0c4a09eaa8d4e32ee64db # v2.2.12 75 | with: 76 | category: "/language:${{matrix.language}}" 77 | -------------------------------------------------------------------------------- /.github/workflows/dockerimage.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | release: 5 | branches: [ master ] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - 12 | name: Checkout 13 | uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 14 | - 15 | name: Docker meta 16 | id: meta 17 | uses: docker/metadata-action@818d4b7b91585d195f67373fd9cb0332e31a7175 # v4.6.0 18 | with: 19 | images: | 20 | netflixoss/metaflow_metadata_service 21 | tags: | 22 | type=semver,pattern={{raw}} 23 | type=sha 24 | type=raw,value=latest 25 | - 26 | name: Login to Docker Hub 27 | uses: docker/login-action@465a07811f14bebb1938fbed4728c6a1ff8901fc # v2.2.0 28 | with: 29 | username: ${{ secrets.DOCKER_USERNAME_NETFLIX_OSS }} 30 | password: ${{ secrets.DOCKER_AUTH_TOKEN_NETFLIX_OSS }} 31 | - 32 | name: Build and push # We have a single-platform build, so use of setup-buildx-action is currently omitted. 33 | uses: docker/build-push-action@2eb1c1961a95fc15694676618e422e8ba1d63825 # v4.1.1 34 | with: 35 | context: . 
36 | push: true 37 | tags: ${{ steps.meta.outputs.tags }} 38 | labels: ${{ steps.meta.outputs.labels }} 39 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | branches: 8 | - master 9 | jobs: 10 | codestyle: 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | python-version: [3.11] 16 | 17 | steps: 18 | - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@75f3110429a8c05be0e1bf360334e4cced2b63fa # v2.3.3 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install Python ${{ matrix.python-version }} dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | python -m pip install pycodestyle 27 | - name: Run Python PEP8 code style checks 28 | run: pycodestyle 29 | 30 | pylint: 31 | runs-on: ubuntu-latest 32 | 33 | strategy: 34 | matrix: 35 | python-version: [3.11] 36 | 37 | steps: 38 | - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0 39 | - name: Set up Python ${{ matrix.python-version }} 40 | uses: actions/setup-python@75f3110429a8c05be0e1bf360334e4cced2b63fa # v2.3.3 41 | with: 42 | python-version: ${{ matrix.python-version }} 43 | - name: Install Python ${{ matrix.python-version }} dependencies 44 | run: | 45 | python -m pip install --upgrade pip 46 | python -m pip install tox pylint 47 | - name: Run Tox (pylint) 48 | run: tox -e pylint 49 | 50 | unit: 51 | runs-on: ubuntu-latest 52 | 53 | strategy: 54 | matrix: 55 | python-version: [3.11] 56 | 57 | steps: 58 | - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0 59 | - name: Set up Python ${{ matrix.python-version }} 60 | uses: actions/setup-python@75f3110429a8c05be0e1bf360334e4cced2b63fa # v2.3.3 61 | with: 62 | python-version: ${{ matrix.python-version }} 63 | - name: Install Python ${{ matrix.python-version }} dependencies 64 | run: | 65 | python -m pip install --upgrade pip 66 | python -m pip install tox 67 | - name: Run Tox 68 | run: tox -e unit 69 | 70 | integration: 71 | runs-on: ubuntu-latest 72 | 73 | services: 74 | db_test: # This will be the hostname 75 | image: postgres:11 76 | env: 77 | POSTGRES_USER: test 78 | POSTGRES_PASSWORD: test 79 | POSTGRES_DB: test 80 | options: >- 81 | --health-cmd "pg_isready -d test -U test" 82 | --health-interval 10s 83 | --health-timeout 5s 84 | --health-retries 5 85 | ports: 86 | - 5432:5432 87 | 88 | strategy: 89 | matrix: 90 | python-version: [3.11] 91 | golang-version: ["^1.14.5"] 92 | 93 | env: 94 | MF_METADATA_DB_HOST: db_test 95 | MF_METADATA_DB_PORT: 5432 96 | MF_METADATA_DB_USER: test 97 | MF_METADATA_DB_PSWD: test 98 | MF_METADATA_DB_NAME: test 99 | 100 | steps: 101 | - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0 102 | - uses: actions/setup-go@bfdd3570ce990073878bf10f6b2d79082de49492 # v2.2.0 103 | with: 104 | go-version: ${{ matrix.golang-version }} 105 | - name: Install goose migration tool 106 | run: go install github.com/pressly/goose/v3/cmd/goose@v3.5.3 107 | - name: Set up Python ${{ matrix.python-version }} 108 | uses: actions/setup-python@75f3110429a8c05be0e1bf360334e4cced2b63fa # v2.3.3 109 | with: 110 | python-version: ${{ matrix.python-version }} 111 | - name: Install Python ${{ matrix.python-version }} dependencies 112 | 
run: | 113 | python -m pip install --upgrade pip 114 | python -m pip install tox 115 | - name: Add required test DB alias name for localhost 116 | run: echo "127.0.0.1 db_test" | sudo tee -a /etc/hosts 117 | - name: Run Tox 118 | run: tox -v -e integration 119 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Distribution / packaging 2 | .Python 3 | env3/ 4 | env/ 5 | bin/ 6 | build/ 7 | develop-eggs/ 8 | dist/ 9 | eggs/ 10 | lib/ 11 | lib64/ 12 | parts/ 13 | sdist/ 14 | var/ 15 | *.egg-info/ 16 | .installed.cfg 17 | *.egg 18 | *.eggs 19 | *.zip 20 | 21 | # Test artifacts 22 | .tox/ 23 | .coverage 24 | 25 | # Byte-compiled / optimized / DLL files 26 | __pycache__/ 27 | *.py[cod] 28 | 29 | # C extensions 30 | *.so 31 | 32 | # jetbrains 33 | .idea 34 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.20.2-buster as amd64-golang 2 | FROM arm64v8/golang:1.20.2-buster as arm64-golang 3 | 4 | FROM ${TARGETARCH}-golang as goose 5 | RUN go install github.com/pressly/goose/v3/cmd/goose@v3.9.0 6 | 7 | FROM python:3.11.6-slim-bookworm 8 | COPY --from=goose /go/bin/goose /usr/local/bin/ 9 | 10 | ARG BUILD_TIMESTAMP 11 | ARG BUILD_COMMIT_HASH 12 | ENV BUILD_TIMESTAMP=$BUILD_TIMESTAMP 13 | ENV BUILD_COMMIT_HASH=$BUILD_COMMIT_HASH 14 | 15 | ARG UI_ENABLED="1" 16 | ARG UI_VERSION="v1.3.13" 17 | ENV UI_ENABLED=$UI_ENABLED 18 | ENV UI_VERSION=$UI_VERSION 19 | 20 | ENV FEATURE_RUN_GROUPS=0 21 | ENV FEATURE_DEBUG_VIEW=1 22 | 23 | RUN apt-get update -y \ 24 | && apt-get -y install libpq-dev unzip gcc curl 25 | 26 | RUN pip3 install virtualenv requests 27 | 28 | # TODO: possibly unused virtualenv. See if it can be removed 29 | RUN virtualenv /opt/v_1_0_1 -p python3 30 | # All of the official deployment templates reference this virtualenv for launching services. 31 | RUN virtualenv /opt/latest -p python3 32 | 33 | RUN /opt/v_1_0_1/bin/pip install https://github.com/Netflix/metaflow-service/archive/1.0.1.zip 34 | 35 | ADD services/__init__.py /root/services/ 36 | ADD services/data/service_configs.py /root/services/ 37 | ADD services/data /root/services/data 38 | ADD services/metadata_service /root/services/metadata_service 39 | ADD services/ui_backend_service /root/services/ui_backend_service 40 | ADD services/utils /root/services/utils 41 | ADD setup.py setup.cfg run_goose.py /root/ 42 | WORKDIR /root 43 | RUN /opt/latest/bin/pip install . 
44 | 45 | # Install Netflix/metaflow-ui release artifact 46 | RUN /root/services/ui_backend_service/download_ui.sh 47 | 48 | # Migration Service 49 | ADD services/migration_service /root/services/migration_service 50 | RUN pip3 install -r /root/services/migration_service/requirements.txt 51 | 52 | RUN chmod 777 /root/services/migration_service/run_script.py 53 | CMD python3 services/migration_service/run_script.py 54 | -------------------------------------------------------------------------------- /Dockerfile.metadata_service: -------------------------------------------------------------------------------- 1 | FROM python:3.11.7-bookworm 2 | 3 | RUN apt-get update -y \ 4 | && apt-get -y install libpq-dev gcc 5 | 6 | ADD services/__init__.py /root/services/ 7 | ADD services/data /root/services/data 8 | ADD services/utils /root/services/utils 9 | ADD services/metadata_service /root/services/metadata_service 10 | ADD setup.py setup.cfg /root/ 11 | WORKDIR /root 12 | RUN pip install --editable . 13 | CMD metadata_service -------------------------------------------------------------------------------- /Dockerfile.migration_service: -------------------------------------------------------------------------------- 1 | FROM golang:1.20.2 AS goose 2 | RUN go install github.com/pressly/goose/v3/cmd/goose@v3.9.0 3 | 4 | FROM python:3.11.7-bookworm 5 | COPY --from=goose /go/bin/goose /usr/local/bin/ 6 | 7 | RUN apt-get update -y \ 8 | && apt-get -y install libpq-dev 9 | 10 | ADD services/__init__.py /root/services/__init__.py 11 | ADD services/utils /root/services/utils 12 | ADD services/migration_service /root/services/migration_service 13 | ADD setup.py setup.cfg run_goose.py /root/ 14 | WORKDIR /root 15 | RUN pip install --editable . 16 | CMD migration_service -------------------------------------------------------------------------------- /Dockerfile.service.test: -------------------------------------------------------------------------------- 1 | FROM golang:1.20.2 AS goose 2 | RUN go install github.com/pressly/goose/v3/cmd/goose@v3.9.0 3 | 4 | FROM python:3.11.7-bookworm 5 | COPY --from=goose /go/bin/goose /usr/local/bin/ 6 | 7 | RUN apt-get update -y \ 8 | && apt-get -y install libpq-dev gcc 9 | 10 | RUN pip install tox 11 | 12 | COPY . /app 13 | WORKDIR /app 14 | 15 | CMD /app/wait-for-postgres.sh tox -------------------------------------------------------------------------------- /Dockerfile.ui_service: -------------------------------------------------------------------------------- 1 | FROM python:3.11.7-bookworm 2 | 3 | ARG UI_ENABLED="1" 4 | ARG UI_VERSION="v1.3.13" 5 | ENV UI_ENABLED=$UI_ENABLED 6 | ENV UI_VERSION=$UI_VERSION 7 | 8 | ARG BUILD_TIMESTAMP 9 | ARG BUILD_COMMIT_HASH 10 | 11 | ENV BUILD_TIMESTAMP=$BUILD_TIMESTAMP 12 | ENV BUILD_COMMIT_HASH=$BUILD_COMMIT_HASH 13 | 14 | ARG CUSTOM_QUICKLINKS 15 | 16 | ENV CUSTOM_QUICKLINKS=$CUSTOM_QUICKLINKS 17 | 18 | RUN apt-get update -y \ 19 | && apt-get -y install libpq-dev unzip gcc curl 20 | 21 | ADD services/__init__.py /root/services/__init__.py 22 | ADD services/data /root/services/data 23 | ADD services/utils /root/services/utils 24 | ADD services/metadata_service /root/services/metadata_service 25 | ADD services/ui_backend_service /root/services/ui_backend_service 26 | ADD setup.py setup.cfg /root/ 27 | 28 | WORKDIR /root 29 | 30 | # Install Netflix/metaflow-ui release artifact 31 | RUN /root/services/ui_backend_service/download_ui.sh 32 | 33 | RUN pip install --editable . 
34 | 35 | CMD ui_backend_service 36 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include services/metadata_service/requirements.txt 2 | include services/migration_service/requirements.txt 3 | include services/ui_backend_service/requirements.txt 4 | include requirements.dev.txt 5 | include requirements.txt 6 | include README.md -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # Release process 2 | 3 | We follow [Semantic Versioning Specification 2.0.0](https://semver.org/spec/v2.0.0.html). 4 | 5 | In short, given a version number MAJOR.MINOR.PATCH, increment the: 6 | 7 | 1. MAJOR version when you make incompatible API changes, 8 | 2. MINOR version when you add functionality in a backwards compatible manner, and 9 | 3. PATCH version when you make backwards compatible bug fixes. 10 | 11 | Additional labels for pre-release and build metadata are available as extensions to the MAJOR.MINOR.PATCH format. 12 | 13 | ## Shipping a new version 14 | 15 | The release process is mostly automated via Github Actions, however a few manual steps are required: 16 | 17 | - [ ] [Edit `setup.py`](https://github.com/Netflix/metaflow-service/edit/master/setup.py) version in `master` branch (e.g. `"version": "1.0.0"`) 18 | - [ ] [Edit `Dockerfile.ui_service`](https://github.com/Netflix/metaflow-service/edit/master/Dockerfile.ui_service) and [edit `Dockerfile`](https://github.com/Netflix/metaflow-service/edit/master/Dockerfile) to set `ARG UI_VERSION="v7.7.7"` to the _latest version of `metaflow-ui`_ (if changed) 19 | - [ ] Create new tag from `master` branch (e.g. `git tag v1.0.0`, note the `v` -prefix) 20 | - [ ] Push tag to remote (e.g. `git push origin v1.0.0`) 21 | - [ ] Create a new release draft in [releases](https://github.com/Netflix/metaflow-service/releases) 22 | - [ ] Edit release draft 23 | - [ ] Make sure current and previous version are correct 24 | - [ ] Edit `Compatibility` section (Correct [Netflix/metaflow-service](https://github.com/Netflix/metaflow-service/releases) release versions) 25 | - [ ] Edit/remove `Additional resources` section 26 | - [ ] Make sure release artifact is uploaded 27 | - [ ] Publish release draft 28 | 29 | GitHub Actions will automatically publish the docker image to [netflixoss/metaflow_metadata_service](https://hub.docker.com/r/netflixoss/metaflow_metadata_service) 30 | -------------------------------------------------------------------------------- /docker-compose.development.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | ui_backend: 4 | platform: linux/amd64 5 | build: 6 | context: . 7 | dockerfile: Dockerfile.ui_service 8 | args: 9 | UI_ENABLED: 1 10 | ports: 11 | - "${MF_UI_METADATA_PORT:-8083}:${MF_UI_METADATA_PORT:-8083}" 12 | volumes: 13 | - ./services:/root/services 14 | - ${HOME}/.aws:/root/.aws 15 | # Add container capability for benchmarking processes. 
required for py-spy 16 | cap_add: 17 | - SYS_PTRACE 18 | environment: 19 | - MF_METADATA_DB_HOST=db 20 | - MF_METADATA_DB_PORT=5432 21 | - MF_METADATA_DB_USER=postgres 22 | - MF_METADATA_DB_PSWD=postgres 23 | - MF_METADATA_DB_NAME=postgres 24 | - MF_UI_METADATA_PORT=${MF_UI_METADATA_PORT:-8083} 25 | - MF_UI_METADATA_HOST=${MF_UI_METADATA_HOST:-0.0.0.0} 26 | - MF_METADATA_DB_POOL_MIN=1 27 | - MF_METADATA_DB_POOL_MAX=10 28 | - METAFLOW_S3_RETRY_COUNT=0 29 | - LOGLEVEL=INFO 30 | - AIOPG_ECHO=0 31 | - UI_ENABLED=0 32 | - PREFETCH_RUNS_SINCE=2592000 # 30 days in seconds 33 | - PREFETCH_RUNS_LIMIT=1 # Prefetch only one run 34 | - S3_NUM_WORKERS=2 35 | - CACHE_ARTIFACT_MAX_ACTIONS=1 36 | - CACHE_DAG_MAX_ACTIONS=1 37 | - CACHE_LOG_MAX_ACTIONS=1 38 | - CACHE_ARTIFACT_STORAGE_LIMIT=16000000 39 | - CACHE_DAG_STORAGE_LIMIT=16000000 40 | - WS_POSTPROCESS_CONCURRENCY_LIMIT=8 41 | - FEATURE_PREFETCH_DISABLE=0 42 | - FEATURE_CACHE_DISABLE=0 43 | - FEATURE_S3_DISABLE=0 44 | - FEATURE_REFINE_DISABLE=0 45 | - FEATURE_WS_DISABLE=0 46 | - FEATURE_HEARTBEAT_DISABLE=0 47 | - FEATURE_DB_LISTEN_DISABLE=0 48 | - FEATURE_ARTIFACT_SEARCH=1 49 | - FEATURE_FOREACH_VAR_SEARCH=1 50 | - FEATURE_ARTIFACT_TABLE=1 51 | - CUSTOM_QUICKLINKS=$CUSTOM_QUICKLINKS 52 | - NOTIFICATIONS=$NOTIFICATIONS 53 | - GA_TRACKING_ID=none 54 | - PLUGINS=$PLUGINS 55 | - AWS_PROFILE=$AWS_PROFILE 56 | depends_on: 57 | - migration 58 | metadata: 59 | platform: linux/amd64 60 | build: 61 | context: . 62 | dockerfile: Dockerfile.metadata_service 63 | ports: 64 | - "${MF_METADATA_PORT:-8080}:${MF_METADATA_PORT:-8080}" 65 | volumes: 66 | - ./services:/root/services 67 | environment: 68 | - LOGLEVEL=WARNING 69 | - MF_METADATA_DB_HOST=db 70 | - MF_METADATA_DB_PORT=5432 71 | - MF_METADATA_DB_USER=postgres 72 | - MF_METADATA_DB_PSWD=postgres 73 | - MF_METADATA_DB_NAME=postgres 74 | - MF_METADATA_PORT=${MF_METADATA_PORT:-8080} 75 | - MF_METADATA_HOST=${MF_METADATA_HOST:-0.0.0.0} 76 | - MF_MIGRATION_PORT=${MF_MIGRATION_PORT:-8082} 77 | depends_on: 78 | - migration 79 | migration: 80 | command: ["python", "/root/run_goose.py"] 81 | platform: linux/amd64 82 | build: 83 | context: . 84 | dockerfile: Dockerfile.migration_service 85 | volumes: 86 | - ./services:/root/services 87 | environment: 88 | - MF_METADATA_DB_HOST=db 89 | - MF_METADATA_DB_PORT=5432 90 | - MF_METADATA_DB_USER=postgres 91 | - MF_METADATA_DB_PSWD=postgres 92 | - MF_METADATA_DB_NAME=postgres 93 | - MF_METADATA_PORT=${MF_METADATA_PORT:-8080} 94 | - MF_METADATA_HOST=${MF_METADATA_HOST:-0.0.0.0} 95 | - MF_MIGRATION_ENDPOINTS_ENABLED=1 96 | - MF_MIGRATION_PORT=${MF_MIGRATION_PORT:-8082} 97 | depends_on: 98 | - db 99 | db: 100 | image: "postgres:11" 101 | command: ["postgres", "-c", "log_statement=none", "-c", "wal_level=logical"] 102 | environment: 103 | POSTGRES_USER: postgres 104 | POSTGRES_PASSWORD: postgres 105 | POSTGRES_DB: postgres 106 | ports: 107 | - "5432:5432" 108 | volumes: 109 | - db_dev_data:/var/lib/postgresql/data2 110 | healthcheck: 111 | test: ["CMD-SHELL", "pg_isready -U postgres"] 112 | interval: 10s 113 | timeout: 5s 114 | retries: 5 115 | volumes: 116 | db_dev_data: 117 | -------------------------------------------------------------------------------- /docker-compose.test.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | service_test: 4 | container_name: service_test 5 | build: 6 | context: . 
7 | dockerfile: Dockerfile.service.test 8 | volumes: 9 | - .:/app 10 | environment: 11 | - MF_METADATA_DB_HOST=db_test 12 | - MF_METADATA_DB_PORT=5432 13 | - MF_METADATA_DB_USER=test 14 | - MF_METADATA_DB_PSWD=test 15 | - MF_METADATA_DB_NAME=test 16 | - MF_MIGRATION_ENDPOINTS_ENABLED=1 17 | depends_on: 18 | - db_test 19 | db_test: 20 | container_name: db_test 21 | image: "postgres:11" 22 | environment: 23 | POSTGRES_USER: test 24 | POSTGRES_PASSWORD: test 25 | POSTGRES_DB: test 26 | ports: 27 | - "5432:5432" 28 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | metadata: 4 | image: "metadata_service:latest" 5 | restart: always 6 | container_name: "metadata_service" 7 | ports: 8 | - "${MF_MIGRATION_PORT:-8082}:${MF_MIGRATION_PORT:-8082}" 9 | - "${MF_METADATA_PORT:-8080}:${MF_METADATA_PORT:-8080}" 10 | volumes: 11 | - .:/code 12 | environment: 13 | - MF_METADATA_DB_HOST=db 14 | - MF_METADATA_DB_PORT=5432 15 | - MF_METADATA_DB_USER=postgres 16 | - MF_METADATA_DB_PSWD=postgres 17 | - MF_METADATA_DB_NAME=postgres 18 | - MF_MIGRATION_ENDPOINTS_ENABLED=1 19 | - MF_METADATA_PORT=${MF_METADATA_PORT:-8080} 20 | - MF_METADATA_HOST=${MF_METADATA_HOST:-0.0.0.0} 21 | - MF_MIGRATION_PORT=${MF_MIGRATION_PORT:-8082} 22 | links: 23 | - db 24 | db: 25 | image: "postgres:11" 26 | restart: always 27 | container_name: "my_postgres" 28 | environment: 29 | POSTGRES_USER: postgres 30 | POSTGRES_PASSWORD: postgres 31 | POSTGRES_DB: postgres 32 | ports: 33 | - "5432:5432" 34 | volumes: 35 | - my_dbdata:/var/lib/postgresql/data2 36 | volumes: 37 | my_dbdata: 38 | -------------------------------------------------------------------------------- /migration_tools.py: -------------------------------------------------------------------------------- 1 | import click 2 | import requests 3 | 4 | 5 | @click.group() 6 | def tools(): 7 | pass 8 | 9 | 10 | @tools.command() 11 | @click.option('--base-url', 12 | default=None, 13 | required=True, 14 | help='url to migration service ex: http://localhost:8082') 15 | def upgrade(base_url): 16 | """Upgrade to latest db schema""" 17 | url = base_url + "/upgrade" 18 | response = requests.patch(url) 19 | print(response.text) 20 | 21 | 22 | @tools.command() 23 | @click.option('--base-url', 24 | default=None, 25 | required=True, 26 | help='url to migration service ex: http://localhost:8082') 27 | def db_status(base_url): 28 | """Get the status of the current db schema""" 29 | url = base_url + "/db_schema_status" 30 | response = requests.get(url) 31 | print(response.json()) 32 | 33 | 34 | @tools.command() 35 | @click.option('--base-url', 36 | default=None, 37 | required=True, 38 | help='url to metadata service ex: http://localhost:8080') 39 | def metadata_service_version(base_url): 40 | """Get the version of the metadata service""" 41 | url = base_url + "/version" 42 | response = requests.get(url) 43 | print(response.text) 44 | 45 | 46 | cli = click.CommandCollection(sources=[tools]) 47 | 48 | 49 | if __name__ == "__main__": 50 | cli() 51 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | unit_tests: Unit tests (deselect with '-m "not unit_tests"') 4 | integration_tests: Integration tests (deselect with '-m "not integration_tests"') 5 | asyncio_mode=auto 6 |
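# The markers above can be selected directly with pytest's -m flag; a local sketch
# (CI drives the same split through `tox -e unit` and `tox -e integration`):
#   pytest -m unit_tests                 # run only the unit test suite
#   pytest -m "not integration_tests"    # skip tests that need a running Postgres instance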
-------------------------------------------------------------------------------- /requirements.dev.txt: -------------------------------------------------------------------------------- 1 | tox 2 | pylint 3 | pytest 4 | pytest-cov 5 | pytest-aiohttp >= 1.0.3, < 2 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r services/metadata_service/requirements.txt 2 | -r services/migration_service/requirements.txt 3 | -r services/ui_backend_service/requirements.txt 4 | -------------------------------------------------------------------------------- /run_goose.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import argparse 5 | from subprocess import Popen 6 | from urllib.parse import quote 7 | import psycopg2 8 | import psycopg2.errorcodes 9 | 10 | 11 | DB_SCHEMA_NAME = os.environ.get("DB_SCHEMA_NAME", "public") 12 | 13 | 14 | def check_if_goose_table_exists(db_connection_string: str): 15 | conn = psycopg2.connect(db_connection_string) 16 | cur = conn.cursor() 17 | try: 18 | cur.execute("SELECT schemaname,tablename FROM pg_tables") 19 | tables = [name for schema, name in cur.fetchall() if schema == DB_SCHEMA_NAME] 20 | if "goose_db_version" not in tables: 21 | print( 22 | f"Goose migration table not found among tables in schema {DB_SCHEMA_NAME}. Found: {', '.join(tables)}", 23 | file=sys.stderr, 24 | ) 25 | return False 26 | else: 27 | print(f"Goose migration table found in schema {DB_SCHEMA_NAME}", file=sys.stderr) 28 | return True 29 | finally: 30 | conn.close() 31 | 32 | 33 | def wait_for_postgres(db_connection_string: str, timeout_seconds: int): 34 | deadline = time.time() + timeout_seconds 35 | while True: 36 | try: 37 | conn = psycopg2.connect(db_connection_string) 38 | conn.close() 39 | return 40 | except psycopg2.OperationalError as e: 41 | if time.time() < deadline: 42 | print(f"Failed to connect to postgres ({e}), sleeping", file=sys.stderr) 43 | time.sleep(.5) 44 | else: 45 | raise 46 | 47 | 48 | def main(): 49 | parser = argparse.ArgumentParser(description="Run goose migrations") 50 | parser.add_argument("--only-if-empty-db", default=False, action="store_true") 51 | parser.add_argument("--wait", type=int, default=30, help="Wait for connection for X seconds") 52 | args = parser.parse_args() 53 | 54 | db_connection_string = f'postgresql://{quote(os.environ["MF_METADATA_DB_USER"])}:'\ 55 | f'{quote(os.environ["MF_METADATA_DB_PSWD"])}@{os.environ["MF_METADATA_DB_HOST"]}:'\ 56 | f'{os.environ["MF_METADATA_DB_PORT"]}/{os.environ["MF_METADATA_DB_NAME"]}' 57 | 58 | ssl_mode = os.environ.get("MF_METADATA_DB_SSL_MODE") 59 | ssl_cert_path = os.environ.get("MF_METADATA_DB_SSL_CERT_PATH") 60 | ssl_key_path = os.environ.get("MF_METADATA_DB_SSL_KEY_PATH") 61 | ssl_root_cert_path = os.environ.get("MF_METADATA_DB_SSL_ROOT_CERT") 62 | 63 | if ssl_mode in ['allow', 'prefer', 'require', 'verify-ca', 'verify-full']: 64 | ssl_query = f'sslmode={ssl_mode}' 65 | if ssl_cert_path is not None: 66 | ssl_query = f'{ssl_query}&sslcert={ssl_cert_path}' 67 | if ssl_key_path is not None: 68 | ssl_query = f'{ssl_query}&sslkey={ssl_key_path}' 69 | if ssl_root_cert_path is not None: 70 | ssl_query = f'{ssl_query}&sslrootcert={ssl_root_cert_path}' 71 | else: 72 | ssl_query = f'sslmode=disable' 73 | 74 | db_connection_string = f'{db_connection_string}?{ssl_query}' 75 | 76 | if args.wait: 77 | 
wait_for_postgres(db_connection_string, timeout_seconds=args.wait) 78 | 79 | if args.only_if_empty_db: 80 | if check_if_goose_table_exists(db_connection_string): 81 | print( 82 | f"Skipping migrations since --only-if-empty-db flag is used", 83 | file=sys.stderr, 84 | ) 85 | sys.exit(0) 86 | 87 | p = Popen( 88 | [ 89 | "goose", 90 | "-dir", 91 | "/root/services/migration_service/migration_files/", 92 | "postgres", 93 | db_connection_string, 94 | "up", 95 | ] 96 | ) 97 | if p.wait() != 0: 98 | raise Exception("Failed to run initial migration") 99 | 100 | 101 | if __name__ == "__main__": 102 | main() 103 | -------------------------------------------------------------------------------- /services/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Netflix/metaflow-service/9e47d2d85e127d2673d457dde7ae535a3341de0f/services/__init__.py -------------------------------------------------------------------------------- /services/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import FlowRow, RunRow, StepRow, TaskRow, ArtifactRow, MetadataRow 2 | -------------------------------------------------------------------------------- /services/data/db_utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import List, Dict, Any 3 | import psycopg2 4 | import collections 5 | import datetime 6 | import time 7 | import json 8 | 9 | 10 | DBResponse = collections.namedtuple("DBResponse", "response_code body") 11 | 12 | DBPagination = collections.namedtuple("DBPagination", "limit offset count page") 13 | 14 | 15 | def aiopg_exception_handling(exception): 16 | err_msg = str(exception) 17 | body = {"err_msg": err_msg} 18 | if isinstance(exception, asyncio.TimeoutError): 19 | body = { 20 | "err_msg": { 21 | "type": "timeout error", 22 | } 23 | } 24 | elif isinstance(exception, psycopg2.Error): 25 | # this means that this is a psycopg2 exception 26 | # since this is of type `psycopg2.Error` we can use https://www.psycopg.org/docs/module.html#psycopg2.Error 27 | body = { 28 | "err_msg": { 29 | "pgerror": exception.pgerror, 30 | "pgcode": exception.pgcode, 31 | "diag": None 32 | if exception.diag is None 33 | else { 34 | "message_primary": exception.diag.message_primary, 35 | "severity": exception.diag.severity, 36 | }, 37 | } 38 | } 39 | 40 | if isinstance(exception, psycopg2.IntegrityError): 41 | if "duplicate key" in err_msg: 42 | return DBResponse(response_code=409, body=json.dumps(body)) 43 | elif "foreign key" in err_msg: 44 | return DBResponse(response_code=404, body=json.dumps(body)) 45 | else: 46 | return DBResponse(response_code=500, body=json.dumps(body)) 47 | elif isinstance(exception, psycopg2.errors.UniqueViolation): 48 | return DBResponse(response_code=409, body=json.dumps(body)) 49 | elif isinstance(exception, IndexError): 50 | return DBResponse(response_code=404, body={}) 51 | else: 52 | return DBResponse(response_code=500, body=json.dumps(body)) 53 | 54 | 55 | def get_db_ts_epoch_str(): 56 | return str(int(round(time.time() * 1000))) 57 | 58 | 59 | def new_heartbeat_ts(): 60 | return int(datetime.datetime.utcnow().timestamp()) 61 | 62 | 63 | def translate_run_key(v: str): 64 | value = str(v) 65 | return "run_number" if value.isnumeric() else "run_id", value 66 | 67 | 68 | def translate_task_key(v: str): 69 | value = str(v) 70 | return "task_id" if value.isnumeric() else "task_name", value 71 
| 72 | 73 | def get_exposed_run_id(run_number, run_id): 74 | if run_id is not None: 75 | return run_id 76 | return run_number 77 | 78 | 79 | def get_exposed_task_id(task_id, task_name): 80 | if task_name is not None: 81 | return task_name 82 | return task_id 83 | 84 | 85 | def get_latest_attempt_id_for_tasks(artifacts): 86 | attempt_ids = {} 87 | for artifact in artifacts: 88 | attempt_ids[artifact["task_id"]] = max( 89 | artifact["attempt_id"], attempt_ids.get(artifact["task_id"], 0) 90 | ) 91 | return attempt_ids 92 | 93 | 94 | def filter_artifacts_for_latest_attempt( 95 | artifacts: List[Dict[str, Any]] 96 | ) -> List[Dict[str, Any]]: 97 | # `artifacts` is a `list` of dictionaries where each item in the list 98 | # consists of `ArtifactRow` in a dictionary form 99 | attempt_ids = get_latest_attempt_id_for_tasks(artifacts) 100 | return filter_artifacts_by_attempt_id_for_tasks(artifacts, attempt_ids) 101 | 102 | 103 | def filter_artifacts_by_attempt_id_for_tasks( 104 | artifacts: List[Dict[str, Any]], attempt_for_tasks: Dict[str, Any] 105 | ) -> List[dict]: 106 | # `artifacts` is a `list` of dictionaries where each item in the list 107 | # consists of `ArtifactRow` in a dictionary form 108 | # `attempt_for_tasks` is a dictionary for form : {task_id:attempt_id} 109 | result = [] 110 | for artifact in artifacts: 111 | if artifact["attempt_id"] == attempt_for_tasks[artifact["task_id"]]: 112 | result.append(artifact) 113 | return result 114 | -------------------------------------------------------------------------------- /services/data/service_configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | max_connection_retires = int(os.environ.get("MF_SERVICE_CONNECTION_RETRIES", 3)) 4 | connection_retry_wait_time_seconds = int(os.environ.get("MF_SERVICE_CONNECTION_RETRY_WAITTIME_SECONDS", 1)) 5 | max_startup_retries = int(os.environ.get("MF_SERVICE_STARTUP_RETRIES", 5)) 6 | startup_retry_wait_time_seconds = int(os.environ.get("MF_SERVICE_STARTUP_WAITTIME_SECONDS", 1)) 7 | -------------------------------------------------------------------------------- /services/data/tagging_utils.py: -------------------------------------------------------------------------------- 1 | from services.data.db_utils import DBResponse 2 | import copy 3 | 4 | 5 | async def apply_run_tags_to_db_response(flow_id, run_number, run_table_postgres, db_response: DBResponse) -> DBResponse: 6 | """ 7 | We want read APIs to return steps, tasks and artifact objects with tags 8 | and system_tags set to their ancestral Run. 9 | 10 | This is a prerequisite for supporting Run-based tag mutation. 
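For example (hypothetical values): if the ancestral run carries tags=["a_tag"] and system_tags=["runtime:test"], every step, task and artifact dict in the returned body will expose exactly those two lists, regardless of the tags stored on the child rows themselves.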
11 | """ 12 | # we will return a modified copy of db_response 13 | new_db_response = copy.deepcopy(db_response) 14 | # Only replace tags if response code is legit 15 | # Object creation ought to return 201 (let's prepare for that) 16 | if new_db_response.response_code not in (200, 201): 17 | return new_db_response 18 | if isinstance(new_db_response.body, list): 19 | items_to_modify = new_db_response.body 20 | else: 21 | items_to_modify = [new_db_response.body] 22 | if not items_to_modify: 23 | return new_db_response 24 | # items_to_modify now references all the items we want to modify 25 | 26 | # The ancestral run must be successfully read from DB 27 | db_response_for_run = await run_table_postgres.get_run(flow_id, run_number) 28 | if db_response_for_run.response_code != 200: 29 | return DBResponse(response_code=500, body=db_response_for_run.body) 30 | run = db_response_for_run.body 31 | for item_as_dict in items_to_modify: 32 | item_as_dict['tags'] = run['tags'] 33 | item_as_dict['system_tags'] = run['system_tags'] 34 | return new_db_response 35 | -------------------------------------------------------------------------------- /services/metadata_service/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Netflix/metaflow-service/9e47d2d85e127d2673d457dde7ae535a3341de0f/services/metadata_service/__init__.py -------------------------------------------------------------------------------- /services/metadata_service/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Netflix/metaflow-service/9e47d2d85e127d2673d457dde7ae535a3341de0f/services/metadata_service/api/__init__.py -------------------------------------------------------------------------------- /services/metadata_service/api/flow.py: -------------------------------------------------------------------------------- 1 | from services.data import FlowRow 2 | from services.data.postgres_async_db import AsyncPostgresDB 3 | from services.utils import read_body 4 | from services.metadata_service.api.utils import format_response, \ 5 | handle_exceptions 6 | import asyncio 7 | 8 | 9 | class FlowApi(object): 10 | _flow_table = None 11 | lock = asyncio.Lock() 12 | 13 | def __init__(self, app): 14 | app.router.add_route("GET", "/flows", self.get_all_flows) 15 | app.router.add_route("GET", "/flows/{flow_id}", self.get_flow) 16 | app.router.add_route("POST", "/flows/{flow_id}", self.create_flow) 17 | self._async_table = AsyncPostgresDB.get_instance().flow_table_postgres 18 | 19 | @format_response 20 | @handle_exceptions 21 | async def create_flow(self, request): 22 | """ 23 | --- 24 | description: create/register a flow 25 | tags: 26 | - Flow 27 | parameters: 28 | - name: "flow_id" 29 | in: "path" 30 | description: "flow_id" 31 | required: true 32 | type: "string" 33 | - name: "body" 34 | in: "body" 35 | description: "body" 36 | required: true 37 | schema: 38 | type: object 39 | properties: 40 | user_name: 41 | type: string 42 | tags: 43 | type: object 44 | system_tags: 45 | type: object 46 | 47 | produces: 48 | - 'text/plain' 49 | responses: 50 | "200": 51 | description: successfully created flow row 52 | "409": 53 | description: CONFLICT record exists 54 | """ 55 | flow_name = request.match_info.get("flow_id") 56 | 57 | body = await read_body(request.content) 58 | user = body.get("user_name") 59 | tags = body.get("tags") 60 | system_tags = body.get("system_tags") 61 | flow = FlowRow( 62 | 
flow_id=flow_name, user_name=user, tags=tags, system_tags=system_tags 63 | ) 64 | return await self._async_table.add_flow(flow) 65 | 66 | @format_response 67 | @handle_exceptions 68 | async def get_flow(self, request): 69 | """ 70 | --- 71 | description: Get flow by id 72 | tags: 73 | - Flow 74 | parameters: 75 | - name: "flow_id" 76 | in: "path" 77 | description: "flow_id" 78 | required: true 79 | type: "string" 80 | produces: 81 | - text/plain 82 | responses: 83 | "200": 84 | description: successful operation. Return flow 85 | "404": 86 | description: flow not found 87 | "405": 88 | description: invalid HTTP Method 89 | """ 90 | 91 | flow_name = request.match_info.get("flow_id") 92 | return await self._async_table.get_flow(flow_name) 93 | 94 | @format_response 95 | @handle_exceptions 96 | async def get_all_flows(self, request): 97 | """ 98 | --- 99 | description: Get all flows 100 | tags: 101 | - Flow 102 | produces: 103 | - text/plain 104 | responses: 105 | "200": 106 | description: successful operation. Returned all registered flows 107 | "405": 108 | description: invalid HTTP Method 109 | """ 110 | return await self._async_table.get_all_flows() 111 | -------------------------------------------------------------------------------- /services/metadata_service/api/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | from functools import wraps 3 | 4 | import collections 5 | from aiohttp import web 6 | from multidict import MultiDict 7 | from importlib import metadata 8 | 9 | from services.utils import get_traceback_str 10 | 11 | version = metadata.version("metadata_service") 12 | METADATA_SERVICE_VERSION = version 13 | METADATA_SERVICE_HEADER = 'METADATA_SERVICE_VERSION' 14 | 15 | ServiceResponse = collections.namedtuple("ServiceResponse", "response_code body") 16 | 17 | 18 | def format_response(func): 19 | """handle formatting""" 20 | 21 | @wraps(func) 22 | async def wrapper(*args, **kwargs): 23 | db_response = await func(*args, **kwargs) 24 | return web.Response(status=db_response.response_code, 25 | body=json.dumps(db_response.body), 26 | headers=MultiDict( 27 | {METADATA_SERVICE_HEADER: METADATA_SERVICE_VERSION})) 28 | 29 | return wrapper 30 | 31 | 32 | def web_response(status: int, body): 33 | return web.Response(status=status, 34 | body=json.dumps(body), 35 | headers=MultiDict( 36 | {"Content-Type": "application/json", 37 | METADATA_SERVICE_HEADER: METADATA_SERVICE_VERSION})) 38 | 39 | 40 | def http_500(msg, traceback_str=None): 41 | # NOTE: worth considering if we want to expose tracebacks in the future in the api messages. 
42 | if traceback_str is None: 43 | traceback_str = get_traceback_str() 44 | body = { 45 | 'traceback': traceback_str, 46 | 'detail': msg, 47 | 'status': 500, 48 | 'title': 'Internal Server Error', 49 | 'type': 'about:blank' 50 | } 51 | 52 | return ServiceResponse(500, body) 53 | 54 | 55 | def handle_exceptions(func): 56 | """Catch exceptions and return appropriate HTTP error.""" 57 | 58 | @wraps(func) 59 | async def wrapper(*args, **kwargs): 60 | try: 61 | return await func(*args, **kwargs) 62 | except web.HTTPClientError as ex: 63 | return ServiceResponse(ex.status_code, ex.reason) 64 | except Exception as err: 65 | return http_500(str(err)) 66 | 67 | return wrapper 68 | -------------------------------------------------------------------------------- /services/metadata_service/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp >= 3.8.1, < 4 2 | packaging 3 | psycopg2 4 | boto3 5 | aiopg 6 | -------------------------------------------------------------------------------- /services/metadata_service/server.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | 4 | from aiohttp import web 5 | 6 | from .api.run import RunApi 7 | from .api.flow import FlowApi 8 | 9 | from .api.step import StepApi 10 | from .api.task import TaskApi 11 | from .api.artifact import ArtificatsApi 12 | from .api.admin import AuthApi 13 | 14 | from .api.metadata import MetadataApi 15 | from services.data.postgres_async_db import AsyncPostgresDB 16 | from services.utils import DBConfiguration 17 | 18 | PATH_PREFIX = os.environ.get("PATH_PREFIX", "") 19 | 20 | 21 | def app(loop=None, db_conf: DBConfiguration = None, middlewares=None, path_prefix=""): 22 | 23 | loop = loop or asyncio.get_event_loop() 24 | 25 | _app = web.Application(loop=loop) 26 | app = web.Application(loop=loop) if path_prefix else _app 27 | async_db = AsyncPostgresDB() 28 | loop.run_until_complete(async_db._init(db_conf)) 29 | FlowApi(app) 30 | RunApi(app) 31 | StepApi(app) 32 | TaskApi(app) 33 | MetadataApi(app) 34 | ArtificatsApi(app) 35 | AuthApi(app) 36 | 37 | if path_prefix: 38 | _app.add_subapp(path_prefix, app) 39 | if middlewares: 40 | _app.middlewares.extend(middlewares) 41 | return _app 42 | 43 | 44 | def main(): 45 | loop = asyncio.get_event_loop() 46 | the_app = app(loop, DBConfiguration(), path_prefix=PATH_PREFIX) 47 | handler = web.AppRunner(the_app) 48 | loop.run_until_complete(handler.setup()) 49 | 50 | port = os.environ.get("MF_METADATA_PORT", 8080) 51 | host = str(os.environ.get("MF_METADATA_HOST", "0.0.0.0")) 52 | f = loop.create_server(handler.server, host, port) 53 | 54 | srv = loop.run_until_complete(f) 55 | print("serving on", srv.sockets[0].getsockname()) 56 | try: 57 | loop.run_forever() 58 | except KeyboardInterrupt: 59 | pass 60 | 61 | 62 | if __name__ == "__main__": 63 | main() 64 | -------------------------------------------------------------------------------- /services/metadata_service/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Netflix/metaflow-service/9e47d2d85e127d2673d457dde7ae535a3341de0f/services/metadata_service/tests/__init__.py -------------------------------------------------------------------------------- /services/metadata_service/tests/integration_tests/__init__.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # we need to register 
the utils helper for assert rewriting in order to get descriptive assertion errors. 4 | pytest.register_assert_rewrite("services.metadata_service.tests.integration_tests.utils") 5 | -------------------------------------------------------------------------------- /services/metadata_service/tests/integration_tests/flow_test.py: -------------------------------------------------------------------------------- 1 | from .utils import ( 2 | cli, db, 3 | assert_api_get_response, assert_api_post_response, compare_partial, 4 | add_flow 5 | ) 6 | import pytest 7 | 8 | pytestmark = [pytest.mark.integration_tests] 9 | 10 | 11 | async def test_flows_post(cli, db): 12 | payload = { 13 | "user_name": "test_user", 14 | "tags": ["a_tag", "b_tag"], 15 | "system_tags": ["runtime:test"] 16 | } 17 | await assert_api_post_response( 18 | cli, 19 | path="/flows/{}".format("TestFlow"), 20 | payload=payload, 21 | status=200 # why 200 instead of 201? 22 | ) 23 | 24 | # Record should be found in DB 25 | _flow = (await db.flow_table_postgres.get_flow(flow_id="TestFlow")).body 26 | 27 | compare_partial(_flow, payload) 28 | 29 | # Second post should fail as flow already exists. 30 | await assert_api_post_response( 31 | cli, 32 | path="/flows/{}".format("TestFlow"), 33 | payload=payload, 34 | status=409 35 | ) 36 | 37 | 38 | async def test_flows_get(cli, db): 39 | # create a few flows for test 40 | _first_flow = (await add_flow(db, flow_id="TestFlow", user_name="test_user-1", tags=["a_tag", "b_tag"], system_tags=["runtime:test"])).body 41 | _second_flow = (await add_flow(db, flow_id="AnotherTestFlow", user_name="test_user-1")).body 42 | 43 | # try to get all the created flows 44 | await assert_api_get_response(cli, "/flows", data=[_first_flow, _second_flow], data_is_unordered_list_of_dicts=True) 45 | 46 | 47 | async def test_flow_get(cli, db): 48 | # create flow for test 49 | _flow = (await add_flow(db, flow_id="TestFlow", user_name="test_user-1")).body 50 | 51 | # try to get created flow 52 | await assert_api_get_response(cli, "/flows/TestFlow", data=_flow) 53 | 54 | # non-existent flow should return 404 55 | await assert_api_get_response(cli, "/flows/AnotherFlow", status=404) 56 | -------------------------------------------------------------------------------- /services/metadata_service/tests/integration_tests/step_test.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from .utils import ( 4 | cli, db, 5 | assert_api_get_response, assert_api_post_response, compare_partial, 6 | add_flow, add_run, add_step, update_objects_with_run_tags 7 | ) 8 | import pytest 9 | 10 | pytestmark = [pytest.mark.integration_tests] 11 | 12 | 13 | async def test_step_post(cli, db): 14 | # create flow and run to add steps for. 15 | _flow = (await add_flow(db)).body 16 | _run = (await add_run(db, flow_id=_flow["flow_id"])).body 17 | 18 | payload = { 19 | "user_name": "test_user", 20 | "tags": ["a_tag", "b_tag"], 21 | "system_tags": ["runtime:test"] 22 | } 23 | 24 | # Check all fields from payload match what we get back from POST, 25 | # except for tags, which should match run tags instead. 
26 | def _check_response_body(body): 27 | payload_cp = copy.deepcopy(payload) 28 | payload_cp["tags"] = _run["tags"] 29 | payload_cp["system_tags"] = _run["system_tags"] 30 | compare_partial(body, payload_cp) 31 | 32 | _step = await assert_api_post_response( 33 | cli, 34 | path="/flows/{flow_id}/runs/{run_number}/steps/test_step/step".format(**_run), 35 | payload=payload, 36 | status=200, # why 200 instead of 201? 37 | check_fn=_check_response_body 38 | ) 39 | 40 | # Record should be found in DB 41 | _found = (await db.step_table_postgres.get_step(_step["flow_id"], _step["run_number"], _step["step_name"])).body 42 | 43 | compare_partial(_found, {"step_name": "test_step", **payload}) 44 | 45 | # Duplicate step names should not be accepted for a run 46 | await assert_api_post_response( 47 | cli, 48 | path="/flows/{flow_id}/runs/{run_number}/steps/test_step/step".format(**_run), 49 | payload=payload, 50 | status=409 51 | ) 52 | 53 | # Posting on a non-existent flow_id should result in error 54 | await assert_api_post_response( 55 | cli, 56 | path="/flows/NonExistentFlow/runs/{run_number}/steps/test_step/step".format(**_run), 57 | payload=payload, 58 | status=500 59 | ) 60 | 61 | # posting on a non-existent run number should result in an error 62 | await assert_api_post_response( 63 | cli, 64 | path="/flows/{flow_id}/runs/1234/steps/test_step/step".format(**_run), 65 | payload=payload, 66 | status=500 67 | ) 68 | 69 | 70 | async def test_steps_get(cli, db): 71 | # create a flow and run for the test 72 | _flow = (await add_flow(db, "TestFlow", "test_user-1", ["a_tag", "b_tag"], ["runtime:test"])).body 73 | _run = (await add_run(db, flow_id=_flow["flow_id"])).body 74 | 75 | # add steps to the run 76 | _first_step = (await add_step(db, flow_id=_run["flow_id"], run_number=_run["run_number"], step_name="first_step")).body 77 | _second_step = (await add_step(db, flow_id=_run["flow_id"], run_number=_run["run_number"], step_name="second_step")).body 78 | 79 | # expect steps' tags to be overridden by tags of their ancestral run 80 | update_objects_with_run_tags('step', [_first_step, _second_step], _run) 81 | 82 | # try to get all the created steps 83 | await assert_api_get_response(cli, "/flows/{flow_id}/runs/{run_number}/steps".format(**_first_step), 84 | data=[_first_step, _second_step], data_is_unordered_list_of_dicts=True) 85 | 86 | # getting steps for non-existent flow should return empty list 87 | await assert_api_get_response(cli, "/flows/NonExistentFlow/runs/{run_number}/steps".format(**_first_step), status=200, data=[]) 88 | 89 | # getting steps for non-existent run should return empty list 90 | await assert_api_get_response(cli, "/flows/{flow_id}/runs/1234/steps".format(**_first_step), status=200, data=[]) 91 | 92 | 93 | async def test_step_get(cli, db): 94 | # create flow for test 95 | _flow = (await add_flow(db, "TestFlow", "test_user-1", ["a_tag", "b_tag"], ["runtime:test"])).body 96 | _run = (await add_run(db, flow_id=_flow["flow_id"])).body 97 | 98 | # add step to run for testing 99 | _step = (await add_step(db, flow_id=_run["flow_id"], run_number=_run["run_number"], step_name="first_step")).body 100 | 101 | # expect step's tags to be overridden by tags of their ancestral run 102 | update_objects_with_run_tags('step', [_step], _run) 103 | 104 | # try to get created step 105 | await assert_api_get_response(cli, "/flows/{flow_id}/runs/{run_number}/steps/{step_name}".format(**_step), data=_step) 106 | 107 | # non-existent flow, run, or step should return 404 108 | await 
assert_api_get_response(cli, "/flows/NonExistentFlow/runs/{run_number}/steps/{step_name}".format(**_step), status=404) 109 | await assert_api_get_response(cli, "/flows/{flow_id}/runs/1234/steps/{step_name}".format(**_step), status=404) 110 | await assert_api_get_response(cli, "/flows/{flow_id}/runs/{run_number}/steps/nonexistent_step".format(**_step), status=404) 111 | -------------------------------------------------------------------------------- /services/metadata_service/tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Netflix/metaflow-service/9e47d2d85e127d2673d457dde7ae535a3341de0f/services/metadata_service/tests/unit_tests/__init__.py -------------------------------------------------------------------------------- /services/metadata_service/tests/unit_tests/api_util_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | from services.metadata_service.api.utils import handle_exceptions, format_response 3 | 4 | async def test_handle_exceptions(): 5 | 6 | @handle_exceptions 7 | async def do_not_raise(): 8 | return True 9 | 10 | @format_response 11 | @handle_exceptions 12 | async def raise_without_id(): 13 | raise Exception("test") 14 | 15 | # wrapper should not touch successful calls. 16 | assert (await do_not_raise()) 17 | 18 | # NOTE: aiohttp Response StringPayload only has the internal property _value for accessing the payload value. 19 | 20 | response_without_id = await raise_without_id() 21 | assert response_without_id.status == 500 22 | _body = json.loads(response_without_id.body._value) 23 | assert _body['traceback'] is not None 24 | -------------------------------------------------------------------------------- /services/metadata_service/tests/unit_tests/task_test.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | from services.utils import has_heartbeat_capable_version_tag 4 | 5 | 6 | expectations = [ 7 | ([], False), 8 | (["2.2.12"], False), 9 | (["metaflow_version:0.5"], False), 10 | (["metaflow_version:1.13"], False), 11 | (["metaflow_version:1"], False), 12 | (["metaflow_version:1.14.0"], True), 13 | (["metaflow_version:1.22.1"], True), 14 | (["metaflow_version:2.0.0"], False), 15 | (["metaflow_version:2.0"], False), 16 | (["metaflow_version:2"], False), 17 | (["metaflow_version:2.0.5"], False), 18 | (["metaflow_version:2.2.11"], False), 19 | (["metaflow_version:2.2.12"], True), 20 | (["metaflow_version:2.2.12+ab1234"], True), 21 | (["metaflow_version:2.3"], True), 22 | (["metaflow_version:2.3.1"], True), 23 | (["metaflow_version:2.4.1"], True), 24 | (["metaflow_version:2.12.24.post9-git2a5367b+ob(v1)"], True), 25 | (["metaflow_version:2.12.24+inconsequential+trailing-string"], True), 26 | (["metaflow_version:2.12.24.break"], True), 27 | (["metaflow_version:3"], True), 28 | (["metaflow_version:custom-1"], True), 29 | ] 30 | 31 | 32 | @pytest.mark.parametrize("system_tags, expected_boolean", expectations) 33 | async def test_has_heartbeat_capable_version_tag(system_tags, expected_boolean): 34 | _result_bool = has_heartbeat_capable_version_tag(system_tags) 35 | 36 | assert expected_boolean == _result_bool 37 | -------------------------------------------------------------------------------- /services/migration_service/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Netflix/metaflow-service/9e47d2d85e127d2673d457dde7ae535a3341de0f/services/migration_service/__init__.py -------------------------------------------------------------------------------- /services/migration_service/api/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shlex 3 | 4 | version_dict = { 5 | '0': 'v_1_0_1', 6 | '1': 'v_1_0_1', 7 | '20200603104139': '20200603104139', 8 | '20201002000616': '20201002000616', 9 | '20210202145952': '20210202145952', 10 | '20210260056859': '20210260056859', 11 | '20211202100726': '20211202100726', 12 | '20220503175500': '20220503175500', 13 | '20230118020300': 'latest', 14 | } 15 | 16 | latest = "latest" 17 | 18 | 19 | def make_goose_template(conn_str, command): 20 | return ' '.join(shlex.quote(arg) for arg in [ 21 | "goose", 22 | "postgres", 23 | f"{conn_str}", 24 | f"{command}" 25 | ]) 26 | 27 | 28 | path = os.path.dirname(__file__) + "/../migration_files" 29 | 30 | 31 | def make_goose_migration_template(conn_str, command): 32 | return ' '.join(shlex.quote(arg) for arg in [ 33 | "goose", 34 | "-dir", 35 | path, 36 | "postgres", 37 | f"{conn_str}", 38 | f"{command}" 39 | ]) 40 | -------------------------------------------------------------------------------- /services/migration_service/api/admin.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from aiohttp import web 4 | from subprocess import Popen 5 | from multidict import MultiDict 6 | from .utils import ApiUtils 7 | from . import make_goose_migration_template 8 | from services.migration_service.migration_config import db_conf 9 | 10 | 11 | class AdminApi(object): 12 | def __init__(self, app): 13 | app.router.add_route("GET", "/version", self.version) 14 | app.router.add_route("GET", "/ping", self.ping) 15 | app.router.add_route("GET", "/db_schema_status", self.db_schema_status) 16 | 17 | endpoints_enabled = int(os.environ.get("MF_MIGRATION_ENDPOINTS_ENABLED", 18 | 1)) 19 | if endpoints_enabled: 20 | app.router.add_route("PATCH", "/upgrade", self.upgrade) 21 | 22 | async def ping(self, request): 23 | """ 24 | --- 25 | description: This end-point allow to test that service is up. 26 | tags: 27 | - Admin 28 | produces: 29 | - 'text/plain' 30 | responses: 31 | "202": 32 | description: successful operation. Return "pong" text 33 | "405": 34 | description: invalid HTTP Method 35 | """ 36 | return web.Response(text="pong") 37 | 38 | async def version(self, request): 39 | """ 40 | --- 41 | description: This end-point returns the latest compatible version of the 42 | metadata service 43 | tags: 44 | - Admin 45 | produces: 46 | - 'application/json' 47 | responses: 48 | "200": 49 | description: successful operation. Return version text 50 | "405": 51 | description: invalid HTTP Method 52 | """ 53 | version = await ApiUtils.get_latest_compatible_version() 54 | return web.Response(text=version) 55 | 56 | async def upgrade(self, request): 57 | """ 58 | --- 59 | description: This end-point upgrades to the latest available version of 60 | of the schema 61 | tags: 62 | - Admin 63 | produces: 64 | - 'text/plain' 65 | responses: 66 | "200": 67 | description: successful operation. 
Return text 68 | "500": 69 | description: could not upgrade 70 | """ 71 | goose_version_cmd = make_goose_migration_template( 72 | db_conf.connection_string_url(), 73 | "up" 74 | ) 75 | p = Popen(goose_version_cmd, shell=True, 76 | close_fds=True) 77 | p.wait() 78 | if p.returncode == 0: 79 | return web.Response(text="upgrade success") 80 | else: 81 | return web.Response(text="upgrade failed", status=500) 82 | 83 | async def db_schema_status(self, request): 84 | """ 85 | --- 86 | description: This end-point returns varius stats around 87 | tags: 88 | - Admin 89 | produces: 90 | - 'application/json' 91 | responses: 92 | "200": 93 | description: successful operation. returns status of db schema and migrations 94 | "500": 95 | description: could not upgrade 96 | """ 97 | try: 98 | version = await ApiUtils.get_goose_version() 99 | migration_in_progress = await ApiUtils.is_migration_in_progress() 100 | unapplied_migrations = ApiUtils.get_unapplied_migrations(version) 101 | body = { 102 | "is_up_to_date": len(unapplied_migrations) == 0, 103 | "current_version": version, 104 | "migration_in_progress": migration_in_progress, 105 | "db_schema_versions": ApiUtils.list_migrations(), 106 | "unapplied_migrations": unapplied_migrations 107 | } 108 | return web.Response(body=json.dumps(body), 109 | headers=MultiDict({"Content-Type": "application/json"})) 110 | 111 | except Exception as e: 112 | body = { 113 | "detail": repr(e) 114 | } 115 | return web.Response(status=500, body=json.dumps(body), 116 | headers=MultiDict({"Content-Type": "application/json"})) 117 | -------------------------------------------------------------------------------- /services/migration_service/api/utils.py: -------------------------------------------------------------------------------- 1 | from subprocess import Popen, PIPE 2 | from ..data.postgres_async_db import PostgresUtils 3 | from . 
import version_dict, latest, \ 4 | make_goose_migration_template, make_goose_template 5 | from services.migration_service.migration_config import db_conf 6 | import sys 7 | 8 | 9 | class ApiUtils(object): 10 | @staticmethod 11 | def list_migrations(): 12 | migrations_list = list((version_dict.keys())) 13 | migrations_list.sort(key=int) 14 | return migrations_list[1:] 15 | 16 | @staticmethod 17 | def get_unapplied_migrations(current_version): 18 | try: 19 | migrations_list = ApiUtils.list_migrations() 20 | index_version = migrations_list.index(current_version) 21 | return migrations_list[index_version + 1:] 22 | except: 23 | return migrations_list 24 | 25 | @staticmethod 26 | async def get_goose_version(): 27 | # if tables exist but goose doesn't find version table then 28 | goose_version_cmd = make_goose_template(db_conf.connection_string_url(), 'version') 29 | 30 | p = Popen(goose_version_cmd, stdout=PIPE, stderr=PIPE, shell=True, 31 | close_fds=True) 32 | p.wait() 33 | 34 | version = None 35 | std_err = p.stderr.read() 36 | lines_err = std_err.decode("utf-8").split("\n") 37 | for line in lines_err: 38 | if "goose: version" in line: 39 | s = line.split("goose: version ") 40 | version = s[1] 41 | print(line) 42 | break 43 | 44 | if version: 45 | return version 46 | else: 47 | raise Exception( 48 | "unable to get db version via goose: " + std_err.decode("utf-8")) 49 | 50 | @staticmethod 51 | async def get_latest_compatible_version(): 52 | is_present = await PostgresUtils.is_present("flows_v3") 53 | if is_present: 54 | version = await ApiUtils.get_goose_version() 55 | return version_dict[version] 56 | else: 57 | print("Running initial migration..", file=sys.stderr) 58 | goose_version_cmd = make_goose_migration_template(db_conf.connection_string_url(), 'up') 59 | p = Popen(goose_version_cmd, shell=True, 60 | close_fds=True) 61 | if p.wait() != 0: 62 | raise Exception("Failed to run initial migration") 63 | return latest 64 | 65 | @staticmethod 66 | async def is_migration_in_progress(): 67 | goose_version_cmd = make_goose_template( 68 | db_conf.connection_string_url(), "status" 69 | ) 70 | 71 | p = Popen(goose_version_cmd, stdout=PIPE, stderr=PIPE, shell=True, 72 | close_fds=True) 73 | p.wait() 74 | 75 | std_err = p.stderr.read() 76 | lines_err = std_err.decode("utf-8") 77 | if "Pending" in lines_err: 78 | return True 79 | 80 | return False 81 | -------------------------------------------------------------------------------- /services/migration_service/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Netflix/metaflow-service/9e47d2d85e127d2673d457dde7ae535a3341de0f/services/migration_service/data/__init__.py -------------------------------------------------------------------------------- /services/migration_service/data/postgres_async_db.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import aiopg 4 | 5 | from services.utils import DBConfiguration 6 | 7 | 8 | class PostgresUtils(object): 9 | @staticmethod 10 | async def is_present(table_name): 11 | with (await AsyncPostgresDB.get_instance().pool.cursor()) as cur: 12 | await cur.execute( 13 | "select * from information_schema.tables where table_name=%s", 14 | (table_name,), 15 | ) 16 | return bool(cur.rowcount) 17 | 18 | 19 | class AsyncPostgresDB(object): 20 | connection = None 21 | __instance = None 22 | 23 | pool = None 24 | 25 | @staticmethod 26 | def get_instance(): 27 | if 
AsyncPostgresDB.__instance is None: 28 | AsyncPostgresDB() 29 | return AsyncPostgresDB.__instance 30 | 31 | def __init__(self): 32 | if self.__instance is not None: 33 | return 34 | 35 | AsyncPostgresDB.__instance = self 36 | 37 | async def _init(self, db_conf: DBConfiguration): 38 | # todo make poolsize min and max configurable as well as timeout 39 | # todo add retry and better error message 40 | retries = 3 41 | for i in range(retries): 42 | try: 43 | self.pool = await aiopg.create_pool(db_conf.get_dsn(), timeout=db_conf.timeout) 44 | except Exception as e: 45 | print("printing connection exception: " + str(e)) 46 | if retries - i < 1: 47 | raise e 48 | time.sleep(1) 49 | continue 50 | -------------------------------------------------------------------------------- /services/migration_service/get_virtual_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import requests 4 | import socket 5 | import time 6 | from services.data.service_configs import max_startup_retries, \ 7 | startup_retry_wait_time_seconds 8 | 9 | port = int(os.environ.get("MF_MIGRATION_PORT", 8082)) 10 | 11 | try: 12 | s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 13 | retry_count = max_startup_retries 14 | while retry_count > 0: 15 | print(retry_count) 16 | try: 17 | print("connecting") 18 | s.connect(('localhost', port)) 19 | print("Port reachable", port) 20 | break 21 | except socket.error as e: 22 | print("booting...") 23 | print(e) 24 | time.sleep(startup_retry_wait_time_seconds) 25 | except Exception: 26 | print("something broke") 27 | finally: 28 | retry_count = retry_count - 1 29 | # continue 30 | s.close() 31 | if retry_count == 0: 32 | print("ran out of retries to get migration version, exiting") 33 | sys.exit(1) 34 | except Exception as e: 35 | print(e) 36 | sys.exit(1) 37 | 38 | r = requests.get('http://localhost:{0}/version'.format(port)) 39 | r.raise_for_status() 40 | 41 | conf_file = open('/root/services/migration_service/config', 'w') 42 | print(r.text, file=conf_file) 43 | conf_file.close() 44 | -------------------------------------------------------------------------------- /services/migration_service/migration_config.py: -------------------------------------------------------------------------------- 1 | from services.utils import DBConfiguration 2 | db_conf = DBConfiguration() 3 | -------------------------------------------------------------------------------- /services/migration_service/migration_files/1_create_tables.sql: -------------------------------------------------------------------------------- 1 | -- +goose Up 2 | -- +goose StatementBegin 3 | SELECT 'up SQL query'; 4 | CREATE TABLE IF NOT EXISTS flows_v3 ( 5 | flow_id VARCHAR(255) PRIMARY KEY, 6 | user_name VARCHAR(255), 7 | ts_epoch BIGINT NOT NULL, 8 | tags JSONB, 9 | system_tags JSONB 10 | ); 11 | 12 | CREATE TABLE IF NOT EXISTS runs_v3 ( 13 | flow_id VARCHAR(255) NOT NULL, 14 | run_number SERIAL NOT NULL, 15 | user_name VARCHAR(255), 16 | ts_epoch BIGINT NOT NULL, 17 | tags JSONB, 18 | system_tags JSONB, 19 | PRIMARY KEY(flow_id, run_number), 20 | FOREIGN KEY(flow_id) REFERENCES flows_v3 (flow_id) 21 | ); 22 | 23 | CREATE TABLE IF NOT EXISTS steps_v3 ( 24 | flow_id VARCHAR(255) NOT NULL, 25 | run_number BIGINT NOT NULL, 26 | step_name VARCHAR(255) NOT NULL, 27 | user_name VARCHAR(255), 28 | ts_epoch BIGINT NOT NULL, 29 | tags JSONB, 30 | system_tags JSONB, 31 | PRIMARY KEY(flow_id, run_number, step_name), 32 | FOREIGN KEY(flow_id, run_number) REFERENCES 
runs_v3 (flow_id, run_number) 33 | ); 34 | 35 | 36 | CREATE TABLE IF NOT EXISTS tasks_v3 ( 37 | flow_id VARCHAR(255) NOT NULL, 38 | run_number BIGINT NOT NULL, 39 | step_name VARCHAR(255) NOT NULL, 40 | task_id BIGSERIAL PRIMARY KEY, 41 | user_name VARCHAR(255), 42 | ts_epoch BIGINT NOT NULL, 43 | tags JSONB, 44 | system_tags JSONB, 45 | FOREIGN KEY(flow_id, run_number, step_name) REFERENCES steps_v3 (flow_id, run_number, step_name) 46 | ); 47 | 48 | CREATE TABLE IF NOT EXISTS metadata_v3 ( 49 | flow_id VARCHAR(255), 50 | run_number BIGINT NOT NULL, 51 | step_name VARCHAR(255) NOT NULL, 52 | task_id BIGINT NOT NULL, 53 | id BIGSERIAL NOT NULL, 54 | field_name VARCHAR(255) NOT NULL, 55 | value TEXT NOT NULL, 56 | type VARCHAR(255) NOT NULL, 57 | user_name VARCHAR(255), 58 | ts_epoch BIGINT NOT NULL, 59 | tags JSONB, 60 | system_tags JSONB, 61 | PRIMARY KEY(flow_id, run_number, step_name, task_id, field_name) 62 | ); 63 | 64 | CREATE TABLE IF NOT EXISTS artifact_v3 ( 65 | flow_id VARCHAR(255) NOT NULL, 66 | run_number BIGINT NOT NULL, 67 | step_name VARCHAR(255) NOT NULL, 68 | task_id BIGINT NOT NULL, 69 | name VARCHAR(255) NOT NULL, 70 | location VARCHAR(255) NOT NULL, 71 | ds_type VARCHAR(255) NOT NULL, 72 | sha VARCHAR(255), 73 | type VARCHAR(255), 74 | content_type VARCHAR(255), 75 | user_name VARCHAR(255), 76 | attempt_id SMALLINT NOT NULL, 77 | ts_epoch BIGINT NOT NULL, 78 | tags JSONB, 79 | system_tags JSONB, 80 | PRIMARY KEY(flow_id, run_number, step_name, task_id, attempt_id, name) 81 | ); 82 | 83 | -- +goose StatementEnd 84 | 85 | -- +goose Down 86 | -- +goose StatementBegin 87 | SELECT 'down SQL query'; 88 | 89 | -- +goose StatementEnd 90 | -------------------------------------------------------------------------------- /services/migration_service/migration_files/20200603104139_add_str_id_cols.sql: -------------------------------------------------------------------------------- 1 | -- +goose Up 2 | -- +goose StatementBegin 3 | SELECT 'up SQL query'; 4 | ALTER TABLE runs_v3 5 | ADD COLUMN run_id VARCHAR(255); 6 | 7 | ALTER TABLE runs_v3 8 | ADD COLUMN last_heartbeat_ts BIGINT; 9 | 10 | ALTER TABLE runs_v3 11 | ADD CONSTRAINT runs_v3_flow_id_run_id_key UNIQUE (flow_id, run_id); 12 | 13 | ALTER TABLE steps_v3 14 | ADD COLUMN run_id VARCHAR(255); 15 | 16 | ALTER TABLE steps_v3 17 | ADD CONSTRAINT steps_v3_flow_id_run_id_step_name_key UNIQUE (flow_id, run_id, step_name); 18 | 19 | ALTER TABLE tasks_v3 20 | ADD COLUMN run_id VARCHAR(255); 21 | 22 | ALTER TABLE tasks_v3 23 | ADD COLUMN task_name VARCHAR(255); 24 | 25 | ALTER TABLE tasks_v3 26 | ADD COLUMN last_heartbeat_ts BIGINT; 27 | 28 | ALTER TABLE tasks_v3 29 | ADD CONSTRAINT tasks_v3_flow_id_run_number_step_name_task_name_key UNIQUE (flow_id, run_number, step_name, task_name); 30 | 31 | ALTER TABLE metadata_v3 32 | ADD COLUMN run_id VARCHAR(255); 33 | 34 | ALTER TABLE metadata_v3 35 | ADD COLUMN task_name VARCHAR(255); 36 | 37 | ALTER TABLE artifact_v3 38 | ADD COLUMN run_id VARCHAR(255); 39 | 40 | ALTER TABLE artifact_v3 41 | ADD COLUMN task_name VARCHAR(255); 42 | 43 | -- +goose StatementEnd 44 | 45 | -- +goose Down 46 | -- +goose StatementBegin 47 | SELECT 'down SQL query'; 48 | ALTER TABLE artifact_v3 49 | DROP COLUMN task_name; 50 | 51 | ALTER TABLE artifact_v3 52 | DROP COLUMN run_id; 53 | 54 | ALTER TABLE metadata_v3 55 | DROP COLUMN run_id; 56 | 57 | ALTER TABLE metadata_v3 58 | DROP COLUMN task_name; 59 | 60 | ALTER TABLE tasks_v3 61 | DROP CONSTRAINT tasks_v3_flow_id_run_number_step_name_task_name_key; 62 | 63 | ALTER 
TABLE tasks_v3 64 | DROP COLUMN run_id; 65 | 66 | ALTER TABLE tasks_v3 67 | DROP COLUMN task_name; 68 | 69 | ALTER TABLE tasks_v3 70 | DROP COLUMN last_heartbeat_ts; 71 | 72 | ALTER TABLE steps_v3 73 | DROP CONSTRAINT steps_v3_flow_id_run_id_step_name_key; 74 | 75 | ALTER TABLE steps_v3 76 | DROP COLUMN run_id; 77 | 78 | ALTER TABLE runs_v3 79 | DROP CONSTRAINT runs_v3_flow_id_run_id_key; 80 | 81 | ALTER TABLE runs_v3 82 | DROP COLUMN last_heartbeat_ts; 83 | 84 | ALTER TABLE runs_v3 85 | DROP COLUMN run_id; 86 | 87 | -- +goose StatementEnd 88 | -------------------------------------------------------------------------------- /services/migration_service/migration_files/20201002000616_update_metadata_primary_key.sql: -------------------------------------------------------------------------------- 1 | -- +goose Up 2 | -- +goose StatementBegin 3 | SELECT 'up SQL query'; 4 | 5 | ALTER TABLE metadata_v3 6 | ADD CONSTRAINT metadata_v3_primary_key UNIQUE (id,flow_id, run_number, step_name, task_id, field_name); 7 | 8 | CREATE INDEX metadata_v3_akey ON metadata_v3(flow_id, run_number, step_name, task_id, field_name); 9 | 10 | ALTER TABLE metadata_v3 11 | DROP CONSTRAINT metadata_v3_pkey; 12 | 13 | ALTER TABLE metadata_v3 14 | ADD PRIMARY KEY (id,flow_id, run_number, step_name, task_id, field_name); 15 | 16 | ALTER TABLE metadata_v3 17 | DROP CONSTRAINT metadata_v3_primary_key; 18 | -- +goose StatementEnd 19 | 20 | -- +goose Down 21 | -- +goose StatementBegin 22 | SELECT 'down SQL query'; 23 | 24 | -- create index that will become the primary key 25 | ALTER TABLE metadata_v3 26 | ADD CONSTRAINT metadata_v3_primary_key UNIQUE (flow_id, run_number, step_name, task_id, field_name); 27 | 28 | -- drop index created for optimized access 29 | ALTER TABLE metadata_v3 30 | DROP metadata_v3 metadata_v3_akey; 31 | 32 | -- drop primary key 33 | ALTER TABLE metadata_v3 34 | DROP CONSTRAINT metadata_v3_pkey; 35 | 36 | -- set index as primary key 37 | ALTER TABLE metadata_v3 38 | ADD PRIMARY KEY (flow_id, run_number, step_name, task_id, field_name); 39 | 40 | -- drop index 41 | ALTER TABLE metadata_v3 42 | DROP CONSTRAINT metadata_v3_primary_key; 43 | -- +goose StatementEnd 44 | -------------------------------------------------------------------------------- /services/migration_service/migration_files/20210202145952_add_runs_idx_ts_epoch_flow_id.sql: -------------------------------------------------------------------------------- 1 | -- +goose Up 2 | -- +goose StatementBegin 3 | SELECT 'up SQL query'; 4 | 5 | -- Others 6 | 7 | CREATE INDEX IF NOT EXISTS runs_v3_idx_ts_epoch ON runs_v3 (ts_epoch); 8 | 9 | CREATE INDEX IF NOT EXISTS runs_v3_idx_gin_tags_combined ON runs_v3 USING gin ((tags || system_tags)); 10 | 11 | -- flow_id + ts_epoch 12 | 13 | CREATE INDEX IF NOT EXISTS runs_v3_idx_flow_id_asc_ts_epoch_desc ON runs_v3 (flow_id ASC, ts_epoch DESC); 14 | 15 | -- user && ts_epoch 16 | 17 | CREATE INDEX IF NOT EXISTS runs_v3_idx_user_asc_ts_epoch_desc ON runs_v3 ( 18 | (CASE 19 | WHEN system_tags ? 
('user:' || user_name) 20 | THEN user_name 21 | ELSE NULL 22 | END) ASC, ts_epoch DESC 23 | ); 24 | 25 | -- +goose StatementEnd 26 | 27 | -- +goose Down 28 | -- +goose StatementBegin 29 | SELECT 'down SQL query'; 30 | 31 | DROP INDEX IF EXISTS runs_v3_idx_user_asc_ts_epoch_desc; 32 | 33 | DROP INDEX IF EXISTS runs_v3_idx_flow_id_asc_ts_epoch_desc; 34 | 35 | DROP INDEX IF EXISTS runs_v3_idx_gin_tags_combined; 36 | 37 | DROP INDEX IF EXISTS runs_v3_idx_ts_epoch; 38 | 39 | 40 | -- +goose StatementEnd 41 | -------------------------------------------------------------------------------- /services/migration_service/migration_files/20210260056859_add_tasks_idx_on_.sql: -------------------------------------------------------------------------------- 1 | -- +goose NO TRANSACTION 2 | -- +goose Up 3 | -- +goose StatementBegin 4 | 5 | -- tasks on flow_id, run_id, step_name and task_name 6 | CREATE INDEX CONCURRENTLY IF NOT EXISTS tasks_v3_idx_flow_id_run_id_step_name_task_name ON tasks_v3 ( 7 | flow_id, run_id, step_name, task_name) WHERE run_id IS NOT NULL AND task_name IS NOT NULL; 8 | 9 | -- +goose StatementEnd 10 | 11 | -- +goose Down 12 | -- +goose StatementBegin 13 | DROP INDEX IF EXISTS tasks_v3_idx_flow_id_run_id_step_name_task_name; 14 | 15 | -- +goose StatementEnd 16 | -------------------------------------------------------------------------------- /services/migration_service/migration_files/20211202100726_add_str_id_indices.sql: -------------------------------------------------------------------------------- 1 | -- +goose NO TRANSACTION 2 | -- +goose Up 3 | -- runs idx on flow_id, run_id 4 | CREATE INDEX CONCURRENTLY IF NOT EXISTS runs_v3_idx_str_ids_primary_key ON runs_v3 (flow_id, run_id) 5 | WHERE 6 | run_id IS NOT NULL; 7 | 8 | -- steps idx on flow_id, run_id 9 | CREATE INDEX CONCURRENTLY IF NOT EXISTS steps_v3_idx_str_ids_primary_key ON steps_v3 (flow_id, run_id, step_name) 10 | WHERE 11 | run_id IS NOT NULL; 12 | 13 | -- metadata idx on id, flow_id, run_id, step_name and task_name, field_name 14 | CREATE INDEX CONCURRENTLY IF NOT EXISTS metadata_v3_idx_str_ids_primary_key ON metadata_v3 ( 15 | id, 16 | flow_id, 17 | run_id, 18 | step_name, 19 | task_name, 20 | field_name 21 | ) 22 | WHERE 23 | run_id IS NOT NULL 24 | AND task_name IS NOT NULL; 25 | 26 | -- artifact idx on flow_id, run_id, step_name and task_name, attempt_id, name 27 | CREATE INDEX CONCURRENTLY IF NOT EXISTS artifact_v3_idx_str_ids_primary_key ON artifact_v3 ( 28 | flow_id, 29 | run_id, 30 | step_name, 31 | task_name, 32 | attempt_id, 33 | name 34 | ) 35 | WHERE 36 | run_id IS NOT NULL 37 | AND task_name IS NOT NULL; 38 | 39 | -- +goose Down 40 | -- +goose StatementBegin 41 | DROP INDEX IF EXISTS runs_v3_idx_str_ids_primary_key; 42 | 43 | DROP INDEX IF EXISTS steps_v3_idx_str_ids_primary_key; 44 | 45 | DROP INDEX IF EXISTS metadata_v3_idx_str_ids_primary_key; 46 | 47 | DROP INDEX IF EXISTS artifact_v3_idx_str_ids_primary_key; 48 | 49 | -- +goose StatementEnd -------------------------------------------------------------------------------- /services/migration_service/migration_files/20220503175500_add_run_epoch_index.sql: -------------------------------------------------------------------------------- 1 | -- +goose NO TRANSACTION 2 | -- +goose Up 3 | -- +goose StatementBegin 4 | 5 | -- UI requests recent runs a lot, this index helps make those queries go faster. 
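-- (Illustration only, not part of this migration: a typical query of that kind might be SELECT * FROM runs_v3 ORDER BY ts_epoch DESC LIMIT 25, which can read this index in order and stop early.)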
6 | -- (it seems to help it push down LIMITs even if there aren't too many runs in the db) 7 | CREATE INDEX CONCURRENTLY IF NOT EXISTS runs_v3_idx_epoch_ts_desc ON runs_v3 (ts_epoch DESC); 8 | 9 | -- +goose StatementEnd 10 | 11 | -- +goose Down 12 | -- +goose StatementBegin 13 | 14 | DROP INDEX IF EXISTS runs_v3_idx_epoch_ts_desc; 15 | 16 | -- +goose StatementEnd -------------------------------------------------------------------------------- /services/migration_service/migration_files/20230118020300_drop_partial_indexes.sql: -------------------------------------------------------------------------------- 1 | -- +goose NO TRANSACTION 2 | -- +goose Up 3 | 4 | -- Drop partial str_ids indexes created with 5 | -- 20211202100726_add_str_id_indices.sql and 20210260056859_add_tasks_idx_on_.sql 6 | -- and recreate them without the constraining WHERE clause. 7 | -- This is being done as the psql query planner is not using these indexes many times. 8 | -- To avoid perf downtime we first create the new indexes and then drop the old ones. 9 | CREATE INDEX CONCURRENTLY IF NOT EXISTS runs_v3_idx_str_ids_primary_key_v2 10 | ON runs_v3 (flow_id, run_id); 11 | 12 | DROP INDEX CONCURRENTLY IF EXISTS runs_v3_idx_str_ids_primary_key; 13 | 14 | CREATE INDEX CONCURRENTLY IF NOT EXISTS steps_v3_idx_str_ids_primary_key_v2 15 | ON steps_v3 (flow_id, run_id, step_name); 16 | 17 | DROP INDEX CONCURRENTLY IF EXISTS steps_v3_idx_str_ids_primary_key; 18 | 19 | CREATE INDEX CONCURRENTLY IF NOT EXISTS tasks_v3_idx_flow_id_run_id_step_name_task_name_v2 20 | ON tasks_v3(flow_id, run_id, step_name, task_name); 21 | 22 | DROP INDEX CONCURRENTLY IF EXISTS tasks_v3_idx_flow_id_run_id_step_name_task_name; 23 | 24 | CREATE INDEX CONCURRENTLY IF NOT EXISTS metadata_v3_idx_str_ids_a_key 25 | ON metadata_v3 ( 26 | flow_id, 27 | run_id, 28 | step_name, 29 | task_name, 30 | field_name 31 | ); 32 | 33 | CREATE INDEX CONCURRENTLY IF NOT EXISTS metadata_v3_idx_str_ids_a_key_with_task_id 34 | ON metadata_v3 ( 35 | flow_id, 36 | run_id, 37 | step_name, 38 | task_id, 39 | field_name 40 | ); 41 | 42 | DROP INDEX CONCURRENTLY IF EXISTS metadata_v3_idx_str_ids_primary_key; 43 | 44 | CREATE INDEX CONCURRENTLY IF NOT EXISTS artifact_v3_idx_str_ids_primary_key_v2 ON artifact_v3 ( 45 | flow_id, 46 | run_id, 47 | step_name, 48 | task_name, 49 | attempt_id, 50 | name 51 | ); 52 | 53 | CREATE INDEX CONCURRENTLY IF NOT EXISTS artifact_v3_idx_str_ids_primary_key_with_task_id ON artifact_v3 ( 54 | flow_id, 55 | run_id, 56 | step_name, 57 | task_id, 58 | attempt_id, 59 | name 60 | ); 61 | 62 | DROP INDEX CONCURRENTLY IF EXISTS artifact_v3_idx_str_ids_primary_key; 63 | 64 | 65 | -- +goose Down 66 | 67 | -- copy of 20211202100726_add_str_id_indices.sql and 20210260056859_add_tasks_idx_on_.sql 68 | -- runs idx on flow_id, run_id 69 | CREATE INDEX CONCURRENTLY IF NOT EXISTS runs_v3_idx_str_ids_primary_key ON runs_v3 (flow_id, run_id) 70 | WHERE 71 | run_id IS NOT NULL; 72 | 73 | -- steps idx on flow_id, run_id 74 | CREATE INDEX CONCURRENTLY IF NOT EXISTS steps_v3_idx_str_ids_primary_key ON steps_v3 (flow_id, run_id, step_name) 75 | WHERE 76 | run_id IS NOT NULL; 77 | 78 | -- metadata idx on id, flow_id, run_id, step_name and task_name, field_name 79 | CREATE INDEX CONCURRENTLY IF NOT EXISTS metadata_v3_idx_str_ids_primary_key ON metadata_v3 ( 80 | id, 81 | flow_id, 82 | run_id, 83 | step_name, 84 | task_name, 85 | field_name 86 | ) 87 | WHERE 88 | run_id IS NOT NULL 89 | AND task_name IS NOT NULL; 90 | 91 | -- artifact idx on flow_id, run_id, step_name and 
task_name, attempt_id, name 92 | CREATE INDEX CONCURRENTLY IF NOT EXISTS artifact_v3_idx_str_ids_primary_key ON artifact_v3 ( 93 | flow_id, 94 | run_id, 95 | step_name, 96 | task_name, 97 | attempt_id, 98 | name 99 | ) 100 | WHERE 101 | run_id IS NOT NULL 102 | AND task_name IS NOT NULL; 103 | 104 | -- tasks on flow_id, run_id, step_name and task_name 105 | CREATE INDEX CONCURRENTLY IF NOT EXISTS tasks_v3_idx_flow_id_run_id_step_name_task_name ON tasks_v3 ( 106 | flow_id, run_id, step_name, task_name) WHERE run_id IS NOT NULL AND task_name IS NOT NULL; 107 | 108 | 109 | DROP INDEX CONCURRENTLY IF EXISTS runs_v3_idx_str_ids_primary_key_v2; 110 | DROP INDEX CONCURRENTLY IF EXISTS steps_v3_idx_str_ids_primary_key_v2; 111 | DROP INDEX CONCURRENTLY IF EXISTS tasks_v3_idx_flow_id_run_id_step_name_task_name_v2; 112 | DROP INDEX CONCURRENTLY IF EXISTS metadata_v3_idx_str_ids_a_key; 113 | DROP INDEX CONCURRENTLY IF EXISTS metadata_v3_idx_str_ids_a_key_with_task_id; 114 | DROP INDEX CONCURRENTLY IF EXISTS artifact_v3_idx_str_ids_primary_key_v2; 115 | DROP INDEX CONCURRENTLY IF EXISTS artifact_v3_idx_str_ids_primary_key_with_task_id; 116 | -------------------------------------------------------------------------------- /services/migration_service/migration_server.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | 4 | from aiohttp import web 5 | 6 | from .api.admin import AdminApi 7 | 8 | from .data.postgres_async_db import AsyncPostgresDB 9 | from services.utils import DBConfiguration 10 | from .migration_config import db_conf 11 | 12 | 13 | def app(loop=None, db_conf: DBConfiguration = None): 14 | 15 | loop = loop or asyncio.get_event_loop() 16 | app = web.Application(loop=loop) 17 | async_db = AsyncPostgresDB() 18 | loop.run_until_complete(async_db._init(db_conf)) 19 | AdminApi(app) 20 | return app 21 | 22 | 23 | def main(): 24 | loop = asyncio.get_event_loop() 25 | the_app = app(loop, db_conf) 26 | handler = web.AppRunner(the_app) 27 | loop.run_until_complete(handler.setup()) 28 | 29 | port = os.environ.get("MF_MIGRATION_PORT", 8082) 30 | host = str(os.environ.get("MF_METADATA_HOST", "0.0.0.0")) 31 | f = loop.create_server(handler.server, host, port) 32 | 33 | srv = loop.run_until_complete(f) 34 | 35 | print("serving on", srv.sockets[0].getsockname()) 36 | try: 37 | loop.run_forever() 38 | except KeyboardInterrupt: 39 | pass 40 | 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /services/migration_service/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp >= 3.8.1, < 4 2 | packaging 3 | psycopg2 4 | aiopg 5 | -------------------------------------------------------------------------------- /services/migration_service/run_script.py: -------------------------------------------------------------------------------- 1 | from subprocess import Popen 2 | import os 3 | import sys 4 | 5 | 6 | def setup_env(version_value: str): 7 | _env = os.environ 8 | virtual_env_path = '/opt/' + version_value 9 | _env['VIRTUAL_ENV'] = virtual_env_path 10 | path = _env['PATH'] 11 | _env['PATH'] = virtual_env_path + "/bin:" + path 12 | return _env 13 | 14 | 15 | if __name__ == "__main__": 16 | try: 17 | migration_server_process = Popen( 18 | "PYTHONPATH=/ python3 -m services.migration_service.migration_server", 19 | shell=True, 20 | close_fds=True, 21 | env=setup_env('latest') 22 | ) 23 | 24 | get_env_version = 
Popen( 25 | "python3 -m services.migration_service.get_virtual_env", 26 | shell=True, 27 | close_fds=True 28 | ) 29 | 30 | if get_env_version.wait() != 0: 31 | print("Failed to get env version", file=sys.stderr) 32 | sys.exit(1) 33 | 34 | # read in version of metadata service to load 35 | version_value_file = open('/root/services/migration_service/config', 'r') 36 | version_value = str(version_value_file.read()).strip() 37 | 38 | # start proper version of metadata service 39 | metadata_server_process = Popen( 40 | "metadata_service", 41 | shell=True, 42 | close_fds=True, 43 | env=setup_env(version_value) 44 | ) 45 | 46 | rc = metadata_server_process.wait() 47 | if rc != 0: 48 | print("Metadata server exited with non zero status") 49 | sys.exit(rc) 50 | rc = migration_server_process.wait() 51 | if rc != 0: 52 | print("Migration server exited with non zero status") 53 | sys.exit(rc) 54 | except Exception as e: 55 | print(e) 56 | sys.exit(1) 57 | -------------------------------------------------------------------------------- /services/ui_backend_service/.gitignore: -------------------------------------------------------------------------------- 1 | config.* -------------------------------------------------------------------------------- /services/ui_backend_service/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Netflix/metaflow-service/9e47d2d85e127d2673d457dde7ae535a3341de0f/services/ui_backend_service/__init__.py -------------------------------------------------------------------------------- /services/ui_backend_service/api/__init__.py: -------------------------------------------------------------------------------- 1 | # api routes 2 | from .admin import AdminApi 3 | from .autocomplete import AutoCompleteApi 4 | from .artifact import ArtificatsApi 5 | from .search import SearchApi 6 | from .dag import DagApi 7 | from .flow import FlowApi 8 | from .run import RunApi 9 | from .step import StepApi 10 | from .task import TaskApi 11 | from .log import LogApi 12 | from .tag import TagApi 13 | from .metadata import MetadataApi 14 | from .features import FeaturesApi 15 | from .config import ConfigApi 16 | from .plugins import PluginsApi 17 | from .card import CardsApi 18 | 19 | # service processes 20 | from .notify import ListenNotify 21 | from .heartbeat_monitor import RunHeartbeatMonitor, TaskHeartbeatMonitor 22 | from .ws import Websocket 23 | -------------------------------------------------------------------------------- /services/ui_backend_service/api/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from services.utils import handle_exceptions, web_response 3 | 4 | # These environment values will be available to the frontend 5 | ALLOWED_CONFIG_KEYS = [ 6 | 'GA_TRACKING_ID' 7 | ] 8 | 9 | 10 | class ConfigApi(object): 11 | """ 12 | Adds an Api endpoint for fetching required configuration variables for the frontend. 13 | """ 14 | def __init__(self, app): 15 | app.router.add_route('GET', '/config', self.get_config) 16 | 17 | @handle_exceptions 18 | async def get_config(self, request): 19 | """ 20 | --- 21 | description: Get all frontend configuration key-value pairs. 22 | tags: 23 | - Admin 24 | produces: 25 | - application/json 26 | responses: 27 | "200": 28 | description: Returns all allowed configuration key-value pairs for the frontend. 
29 | schema: 30 | type: object 31 | properties: 32 | "ALLOWED_CONFIG_VARIABLE": 33 | type: string 34 | example: "value-to-pass-frontend-1234" 35 | description: "A frontend configuration variable from the server environment. These are exposed based on a whitelist on the server." 36 | "405": 37 | description: invalid HTTP Method 38 | """ 39 | config = {} 40 | for key in ALLOWED_CONFIG_KEYS: 41 | val = os.environ.get(key, None) 42 | if val: 43 | config[key] = val 44 | return web_response(200, config) 45 | -------------------------------------------------------------------------------- /services/ui_backend_service/api/dag.py: -------------------------------------------------------------------------------- 1 | from services.data.db_utils import DBResponse, translate_run_key 2 | from services.utils import handle_exceptions 3 | from .utils import format_response, web_response, query_param_enabled 4 | from services.ui_backend_service.data.db.utils import get_run_dag_data 5 | 6 | 7 | class DagApi(object): 8 | def __init__(self, app, db, cache=None): 9 | self.db = db 10 | app.router.add_route( 11 | "GET", "/flows/{flow_id}/runs/{run_number}/dag", self.get_run_dag 12 | ) 13 | self._dag_store = getattr(cache, "dag_cache", None) 14 | 15 | @handle_exceptions 16 | async def get_run_dag(self, request): 17 | """ 18 | --- 19 | description: Get DAG structure for a run. 20 | tags: 21 | - Run 22 | parameters: 23 | - $ref: '#/definitions/Params/Path/flow_id' 24 | - $ref: '#/definitions/Params/Path/run_number' 25 | - $ref: '#/definitions/Params/Custom/invalidate' 26 | produces: 27 | - application/json 28 | responses: 29 | "200": 30 | description: Return DAG structure for a specific run 31 | schema: 32 | $ref: '#/definitions/ResponsesDag' 33 | "405": 34 | description: invalid HTTP Method 35 | schema: 36 | $ref: '#/definitions/ResponsesError405' 37 | "404": 38 | description: necessary data for DAG generation Not Found 39 | schema: 40 | $ref: '#/definitions/ResponsesError404' 41 | "500": 42 | description: Internal Server Error (with error id) 43 | schema: 44 | $ref: '#/definitions/ResponsesDagError500' 45 | """ 46 | flow_name = request.match_info['flow_id'] 47 | run_number = request.match_info.get("run_number") 48 | # Before running the cache action, we make sure that the run has 49 | # the necessary data to generate a DAG. 50 | db_response = await get_run_dag_data(self.db, flow_name, run_number) 51 | 52 | if not db_response.response_code == 200: 53 | # DAG data was not found, return with the corresponding status. 54 | status, body = format_response(request, db_response) 55 | return web_response(status, body) 56 | 57 | # Prefer run_id over run_number 58 | flow_name = db_response.body['flow_id'] 59 | run_id = db_response.body.get('run_id') or db_response.body['run_number'] 60 | invalidate_cache = query_param_enabled(request, "invalidate") 61 | 62 | dag = await self._dag_store.cache.GenerateDag( 63 | flow_name, run_id, invalidate_cache=invalidate_cache) 64 | 65 | if dag.has_pending_request(): 66 | async for event in dag.stream(): 67 | if event["type"] == "error": 68 | # raise error, there was an exception during processing. 
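# (Streamed events appear to be plain dicts; error events carry at least "message", "id" and "traceback", the fields consumed in the raise below.)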
69 | raise GenerateDAGFailed(event["message"], event["id"], event["traceback"]) 70 | await dag.wait() # wait until results are ready 71 | dag = dag.get() 72 | response = DBResponse(200, dag) 73 | status, body = format_response(request, response) 74 | 75 | return web_response(status, body) 76 | 77 | 78 | class GenerateDAGFailed(Exception): 79 | def __init__(self, msg="Failed to process DAG", id="failed-to-process-dag", traceback_str=None): 80 | self.message = msg 81 | self.id = id 82 | self.traceback_str = traceback_str 83 | 84 | def __str__(self): 85 | return self.message 86 | -------------------------------------------------------------------------------- /services/ui_backend_service/api/features.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from services.utils import handle_exceptions, web_response 4 | 5 | from ..features import get_features 6 | 7 | 8 | class FeaturesApi(object): 9 | """ 10 | Adds an Api endpoint that returns a list of enabled/disabled features for the UI Backend Service 11 | """ 12 | 13 | def __init__(self, app): 14 | app.router.add_route("GET", "/features", self.get_all_features) 15 | 16 | @handle_exceptions 17 | async def get_all_features(self, request): 18 | """ 19 | --- 20 | description: Get all of enabled/disabled features as key-value pairs. 21 | tags: 22 | - Admin 23 | produces: 24 | - application/json 25 | responses: 26 | "200": 27 | description: Returns all features to be enabled or disabled by the frontend. 28 | schema: 29 | type: object 30 | properties: 31 | "FEATURE_*": 32 | type: boolean 33 | example: true 34 | description: "An environment variable from the server with a FEATURE_ prefix, and its value as a boolean" 35 | "405": 36 | description: invalid HTTP Method 37 | """ 38 | return web_response(200, get_features()) 39 | -------------------------------------------------------------------------------- /services/ui_backend_service/api/flow.py: -------------------------------------------------------------------------------- 1 | from services.utils import handle_exceptions 2 | from .utils import find_records 3 | 4 | 5 | class FlowApi(object): 6 | def __init__(self, app, db): 7 | self.db = db 8 | app.router.add_route("GET", "/flows", self.get_all_flows) 9 | app.router.add_route("GET", "/flows/{flow_id}", self.get_flow) 10 | self._async_table = self.db.flow_table_postgres 11 | 12 | @handle_exceptions 13 | async def get_flow(self, request): 14 | """ 15 | --- 16 | description: Get one flow 17 | tags: 18 | - Flow 19 | parameters: 20 | - $ref: '#/definitions/Params/Path/flow_id' 21 | produces: 22 | - application/json 23 | responses: 24 | "200": 25 | description: Returns one flow 26 | schema: 27 | $ref: '#/definitions/ResponsesFlow' 28 | "405": 29 | description: invalid HTTP Method 30 | schema: 31 | $ref: '#/definitions/ResponsesError405' 32 | """ 33 | 34 | flow_name = request.match_info.get("flow_id") 35 | 36 | return await find_records(request, 37 | self._async_table, 38 | fetch_single=True, 39 | initial_conditions=["flow_id = %s"], 40 | initial_values=[flow_name]) 41 | 42 | @handle_exceptions 43 | async def get_all_flows(self, request): 44 | """ 45 | --- 46 | description: Get all flows 47 | tags: 48 | - Flow 49 | parameters: 50 | - $ref: '#/definitions/Params/Builtin/_page' 51 | - $ref: '#/definitions/Params/Builtin/_limit' 52 | - $ref: '#/definitions/Params/Builtin/_order' 53 | - $ref: '#/definitions/Params/Builtin/_tags' 54 | - $ref: '#/definitions/Params/Builtin/_group' 55 | - $ref: 
'#/definitions/Params/Custom/flow_id' 56 | - $ref: '#/definitions/Params/Custom/user_name' 57 | - $ref: '#/definitions/Params/Custom/ts_epoch' 58 | produces: 59 | - application/json 60 | responses: 61 | "200": 62 | description: Returns all flows 63 | schema: 64 | $ref: '#/definitions/ResponsesFlowList' 65 | "405": 66 | description: invalid HTTP Method 67 | schema: 68 | $ref: '#/definitions/ResponsesError405' 69 | """ 70 | 71 | return await find_records(request, 72 | self._async_table, 73 | initial_conditions=[], 74 | initial_values=[], 75 | allowed_order=self._async_table.keys, 76 | allowed_group=self._async_table.keys, 77 | allowed_filters=self._async_table.keys 78 | ) 79 | -------------------------------------------------------------------------------- /services/ui_backend_service/api/plugins.py: -------------------------------------------------------------------------------- 1 | from services.utils import handle_exceptions, web_response 2 | from ..plugins import list_plugins 3 | 4 | 5 | class PluginsApi(object): 6 | """ 7 | Adds an Api endpoint for fetching UI plugins. 8 | """ 9 | 10 | def __init__(self, app): 11 | app.router.add_route('GET', '/plugin', self.get_plugins) 12 | app.router.add_route("GET", "/plugin/{plugin_name}", self.get_plugin) 13 | app.router.add_route("GET", "/plugin/{plugin_name}/{filename:.+}", self.get_plugin_asset) 14 | 15 | @handle_exceptions 16 | async def get_plugins(self, request): 17 | """ 18 | --- 19 | description: List all plugins 20 | tags: 21 | - Plugin 22 | produces: 23 | - application/json 24 | responses: 25 | "200": 26 | description: Returns list of all plugins 27 | schema: 28 | $ref: '#/definitions/ResponsesPluginList' 29 | "405": 30 | description: invalid HTTP Method 31 | schema: 32 | $ref: '#/definitions/ResponsesError405' 33 | """ 34 | plugins = [] 35 | for plugin in list_plugins(): 36 | plugins.append(dict(plugin)) 37 | 38 | return web_response(200, plugins) 39 | 40 | @handle_exceptions 41 | async def get_plugin(self, request): 42 | """ 43 | --- 44 | description: Get one plugin 45 | tags: 46 | - Plugin 47 | parameters: 48 | - $ref: '#/definitions/Params/Path/plugin_name' 49 | produces: 50 | - application/json 51 | responses: 52 | "200": 53 | description: Returns one plugin 54 | schema: 55 | $ref: '#/definitions/ResponsesPlugin' 56 | "405": 57 | description: invalid HTTP Method 58 | schema: 59 | $ref: '#/definitions/ResponsesError405' 60 | """ 61 | plugin = _get_plugin_from_request(request) 62 | if not plugin: 63 | return web_response(404, "Plugin not found") 64 | 65 | return web_response(200, dict(plugin)) 66 | 67 | @handle_exceptions 68 | async def get_plugin_asset(self, request): 69 | """ 70 | --- 71 | description: Serve plugin asset 72 | tags: 73 | - Plugin 74 | parameters: 75 | - $ref: '#/definitions/Params/Path/plugin_name' 76 | - $ref: '#/definitions/Params/Path/plugin_filename' 77 | produces: 78 | - application/json 79 | responses: 80 | "200": 81 | description: Serve plugin asset, e.g. 
dist/index.html 82 | "405": 83 | description: invalid HTTP Method 84 | schema: 85 | $ref: '#/definitions/ResponsesError405' 86 | """ 87 | plugin = _get_plugin_from_request(request) 88 | if not plugin: 89 | return web_response(404, "Plugin not found") 90 | 91 | filename = request.match_info.get("filename") 92 | try: 93 | return plugin.serve(filename) 94 | except: 95 | return web_response(500, "Internal server error") 96 | 97 | 98 | def _get_plugin_from_request(request): 99 | _plugins = list_plugins() 100 | plugin_name = request.match_info.get("plugin_name") 101 | for plugin in _plugins: 102 | if plugin.name == plugin_name: 103 | return plugin 104 | return None 105 | -------------------------------------------------------------------------------- /services/ui_backend_service/api/step.py: -------------------------------------------------------------------------------- 1 | from services.data.db_utils import translate_run_key 2 | from services.utils import handle_exceptions 3 | from .utils import find_records, apply_run_tags_postprocess 4 | 5 | 6 | class StepApi(object): 7 | def __init__(self, app, db): 8 | self.db = db 9 | app.router.add_route( 10 | "GET", "/flows/{flow_id}/runs/{run_number}/steps", self.get_steps 11 | ) 12 | app.router.add_route( 13 | "GET", "/flows/{flow_id}/runs/{run_number}/steps/{step_name}", self.get_step 14 | ) 15 | self._async_table = self.db.step_table_postgres 16 | self._async_run_table = self.db.run_table_postgres 17 | 18 | @handle_exceptions 19 | async def get_steps(self, request): 20 | """ 21 | --- 22 | description: Get all steps of specified run 23 | tags: 24 | - Step 25 | parameters: 26 | - $ref: '#/definitions/Params/Path/flow_id' 27 | - $ref: '#/definitions/Params/Path/run_number' 28 | - $ref: '#/definitions/Params/Builtin/_page' 29 | - $ref: '#/definitions/Params/Builtin/_limit' 30 | - $ref: '#/definitions/Params/Builtin/_order' 31 | - $ref: '#/definitions/Params/Builtin/_tags' 32 | - $ref: '#/definitions/Params/Builtin/_group' 33 | - $ref: '#/definitions/Params/Custom/flow_id' 34 | - $ref: '#/definitions/Params/Custom/run_number' 35 | - $ref: '#/definitions/Params/Custom/step_name' 36 | - $ref: '#/definitions/Params/Custom/user_name' 37 | - $ref: '#/definitions/Params/Custom/ts_epoch' 38 | produces: 39 | - application/json 40 | responses: 41 | "200": 42 | description: Returns all steps of specified run 43 | schema: 44 | $ref: '#/definitions/ResponsesStepList' 45 | "405": 46 | description: invalid HTTP Method 47 | schema: 48 | $ref: '#/definitions/ResponsesError405' 49 | """ 50 | 51 | flow_name = request.match_info.get("flow_id") 52 | run_number = request.match_info.get("run_number") 53 | run_id_key, run_id_value = translate_run_key(run_number) 54 | 55 | return await find_records(request, 56 | self._async_table, 57 | initial_conditions=[ 58 | "flow_id = %s", 59 | "{run_id_key} = %s".format(run_id_key=run_id_key)], 60 | initial_values=[flow_name, run_id_value], 61 | allowed_order=self._async_table.keys, 62 | allowed_group=self._async_table.keys, 63 | allowed_filters=self._async_table.keys, 64 | enable_joins=True, 65 | postprocess=apply_run_tags_postprocess(flow_name, run_number, self._async_run_table)) 66 | 67 | @handle_exceptions 68 | async def get_step(self, request): 69 | """ 70 | --- 71 | description: Get one step 72 | tags: 73 | - Step 74 | parameters: 75 | - $ref: '#/definitions/Params/Path/flow_id' 76 | - $ref: '#/definitions/Params/Path/run_number' 77 | - $ref: '#/definitions/Params/Path/step_name' 78 | produces: 79 | - application/json 80 | responses: 
81 | "200": 82 | description: Returns one step 83 | schema: 84 | $ref: '#/definitions/ResponsesStep' 85 | "405": 86 | description: invalid HTTP Method 87 | schema: 88 | $ref: '#/definitions/ResponsesError405' 89 | """ 90 | 91 | flow_name = request.match_info.get("flow_id") 92 | run_number = request.match_info.get("run_number") 93 | run_id_key, run_id_value = translate_run_key(run_number) 94 | step_name = request.match_info.get("step_name") 95 | 96 | return await find_records(request, 97 | self._async_table, 98 | fetch_single=True, 99 | initial_conditions=[ 100 | "flow_id = %s", 101 | "{run_id_key} = %s".format( 102 | run_id_key=run_id_key), 103 | "step_name = %s"], 104 | initial_values=[ 105 | flow_name, run_id_value, step_name], 106 | enable_joins=True, 107 | postprocess=apply_run_tags_postprocess(flow_name, run_number, self._async_run_table) 108 | ) 109 | -------------------------------------------------------------------------------- /services/ui_backend_service/api/tag.py: -------------------------------------------------------------------------------- 1 | from services.utils import handle_exceptions, web_response 2 | 3 | 4 | class TagApi(object): 5 | def __init__(self, app, db): 6 | self.db = db 7 | app.router.add_route("GET", "/tags", self.get_all_tags) 8 | self._async_table = self.db.run_table_postgres 9 | 10 | @handle_exceptions 11 | async def get_all_tags(self, request): 12 | db_response, _ = await self._async_table.get_tags() 13 | return web_response(db_response.response_code, db_response.body) 14 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Optional 2 | 3 | 4 | # Shared helpers 5 | 6 | 7 | def unpack_processed_value(value) -> Tuple[bool, Optional[str], Optional[str], Optional[str]]: 8 | ''' 9 | Unpack cached value returning tuple of: success, value, detail, stacktrace 10 | 11 | Defaults to None in case values are not defined. 
12 | 13 | Success example: 14 | True, 'foo', None 15 | 16 | Failure examples: 17 | False, 'failure-id', 'error-details', None 18 | False, 'failure-id-without-details', None, None 19 | False, None, None, None 20 | False, 'CustomError', 'Custom failure description', 'stacktrace of error' 21 | 22 | Returns 23 | ------- 24 | tuple : (bool, optional(str), optional(str), optional(str)) 25 | success, value, description, stacktrace 26 | ''' 27 | return (list(value) + [None] * 4)[:4] 28 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/cache/__init__.py: -------------------------------------------------------------------------------- 1 | from .store import CacheStore 2 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/cache/client/__init__.py: -------------------------------------------------------------------------------- 1 | # This module is a copy of an implementation of a cache store 2 | # originally from https://github.com/Netflix/metaflow/pull/316 3 | # TODO: use the metaflow cli cache implementation if the aforementioned PR gets merged 4 | from .cache_action import CacheAction 5 | from .cache_async_client import CacheAsyncClient 6 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/cache/client/cache_action.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import importlib 3 | 4 | LO_PRIO = 'lo_prio' 5 | HI_PRIO = 'hi_prio' 6 | 7 | 8 | class CacheServerInitFailed(Exception): 9 | pass 10 | 11 | 12 | def import_action_class_spec(action_spec): 13 | parts = action_spec.split('.') 14 | package = '.'.join(action_spec.split('.')[:-1]) 15 | action_name = action_spec.split('.')[-1] 16 | return import_action_class('.'.join(parts[:-1]), parts[-1]) 17 | 18 | 19 | def import_action_class(mod, cls): 20 | return getattr(importlib.import_module(mod), cls) 21 | 22 | 23 | class CacheAction(object): 24 | 25 | PRIORITY = LO_PRIO 26 | 27 | @classmethod 28 | def format_request(cls, *args, **kwargs): 29 | """ 30 | Encode the given arguments as a request. This method 31 | is proxied by `cache_client` as a client-facing API 32 | of the action. 33 | 34 | Function returns a tuple: 35 | 1. `message`: an arbitrary JSON-encodable payload that 36 | is passed to `execute`. 37 | 2. `obj_keys`: a list of keys that the action promises 38 | to produce in `execute`. 39 | 3. `stream_key`: an optional key name for a streaming 40 | result of the action. May be `None` if the action 41 | doesn't have any streaming results. 42 | 4. `disposable_keys`: a subset of `obj_keys` that will 43 | be purged from the cache before other objects. 44 | 5. `invalidate_cache`: boolean to indicate if existing 45 | cache keys should be invalidated. 46 | 6. `ephemeral_storage_path` : optional path for persisting files across cache action invocations 47 | """ 48 | # return message, obj_keys, stream_key, disposable_keys, invalidate_cache, ephemeral_storage_path 49 | raise NotImplementedError 50 | 51 | @classmethod 52 | def response(cls, keys_objs): 53 | """ 54 | Decodes and refines `execute` output before it is returned 55 | to the client. The argument `keys_objs` is the return value 56 | of `execute`. This method is called by `cache_client` to 57 | convert serialized, cached results to a client-facing object. 58 | 59 | The function may return anything. 
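For example, an action whose cached values are JSON blobs can simply json.loads them here; GenerateDag.response in generate_dag_action.py does this for its dag:result key.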
60 | """ 61 | raise NotImplementedError 62 | 63 | @classmethod 64 | def stream_response(cls, it): 65 | """ 66 | Iterator that iterates over streamed events in `it`. This 67 | generator is the reader counterpart to the `stream_output` 68 | writer in `execute`. This method is called by `cache_client` 69 | to convert serialized events to client-facing objects. 70 | 71 | If the event is `None`, it should be yield as-is. For other 72 | events, the function may perform any stateful manipulation and 73 | yield zero or more refined objects. 74 | """ 75 | raise NotImplementedError 76 | 77 | @classmethod 78 | def execute(cls, 79 | message=None, 80 | keys=[], 81 | existing_keys={}, 82 | stream_output=None, 83 | invalidate_cache=False): 84 | """ 85 | Execute an action. This method is called by `cache_worker` to 86 | execute the action as a subprocess. 87 | 88 | - `message` is an arbitrary payload produced by format_request. 89 | - `keys` is a list of objects that the action needs to produce. 90 | - `existing_keys` refers to existing values of caches keys, if 91 | available. 92 | - `stream_output` is a function that can be called to produce 93 | an output event to the stream object. 94 | - `invalidate_cache` boolean to indicate whether to invalidate 95 | existing cache keys. 96 | 97 | Returns a dictionary that includes a string/byte result 98 | per key that will be stored in the cache. 99 | """ 100 | raise NotImplementedError 101 | 102 | 103 | class Check(CacheAction): 104 | 105 | PRIORITY = HI_PRIO 106 | 107 | @classmethod 108 | def format_request(cls, *args, **kwargs): 109 | key = 'check-%s' % uuid.uuid4() 110 | return None, [key], None, [key], False, None 111 | 112 | @classmethod 113 | def response(cls, keys_objs): 114 | for key, blob in keys_objs.items(): 115 | if blob != b'works: %s' % key.encode('utf-8'): 116 | raise CacheServerInitFailed() 117 | return True 118 | 119 | @classmethod 120 | def stream_response(cls, it): 121 | pass 122 | 123 | @classmethod 124 | def execute(cls, keys=[], **kwargs): 125 | return {key: 'works: %s' % key for key in keys} 126 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/cache/client/cache_async_client.py: -------------------------------------------------------------------------------- 1 | from json.decoder import JSONDecodeError 2 | import time 3 | import asyncio 4 | import json 5 | from asyncio.subprocess import PIPE, STDOUT 6 | 7 | from .cache_client import CacheClient, CacheServerUnreachable, CacheClientTimeout 8 | 9 | from services.utils import logging 10 | 11 | OP_WORKER_CREATE = "worker_create" 12 | OP_WORKER_TERMINATE = "worker_terminate" 13 | 14 | WAIT_FREQUENCY = 0.2 15 | HEARTBEAT_FREQUENCY = 1 16 | 17 | 18 | class CacheAsyncClient(CacheClient): 19 | _drain_lock = asyncio.Lock() 20 | _restart_requested = False 21 | 22 | async def start_server(self, cmdline, env): 23 | self.logger = logging.getLogger( 24 | "CacheAsyncClient:{root}".format(root=self._root) 25 | ) 26 | 27 | self._proc = await asyncio.create_subprocess_exec( 28 | *cmdline, env=env, stdin=PIPE, stdout=PIPE, stderr=STDOUT, limit=1024000 29 | ) # 1024KB 30 | 31 | asyncio.gather(self._heartbeat(), self.read_stdout()) 32 | 33 | async def _read_pipe(self, src): 34 | while self._is_alive: 35 | line = await src.readline() 36 | if not line: 37 | await asyncio.sleep(WAIT_FREQUENCY) 38 | break 39 | yield line.rstrip().decode("utf-8") 40 | 41 | async def read_stdout(self): 42 | async for line in self._read_pipe(self._proc.stdout): 43 | 
await self.read_message(line) 44 | 45 | async def read_message(self, line: str): 46 | try: 47 | # We check for isEnabledFor because some things may be very long to print 48 | # (in particularly pending_requests) 49 | message = json.loads(line) 50 | if self.logger.isEnabledFor(logging.INFO): 51 | self.logger.info(message) 52 | if message["op"] == OP_WORKER_CREATE: 53 | self.pending_requests.add(message["stream_key"]) 54 | elif message["op"] == OP_WORKER_TERMINATE: 55 | self.pending_requests.remove(message["stream_key"]) 56 | 57 | if self.logger.isEnabledFor(logging.INFO): 58 | self.logger.info( 59 | "Pending stream keys: {}".format(len(list(self.pending_requests))) 60 | ) 61 | except JSONDecodeError as ex: 62 | if self.logger.isEnabledFor(logging.INFO): 63 | self.logger.info("Message: {}".format(line)) 64 | except Exception as ex: 65 | self.logger.exception(ex) 66 | 67 | async def check(self): 68 | ret = await self.Check() # pylint: disable=no-member 69 | await ret.wait() 70 | ret.get() 71 | 72 | async def stop_server(self): 73 | if self._is_alive: 74 | self._is_alive = False 75 | self._proc.terminate() 76 | self.logger.info("Waiting for cache server to terminate") 77 | await self._proc.wait() 78 | 79 | async def send_request(self, blob): 80 | try: 81 | self._proc.stdin.write(blob) 82 | async with self._drain_lock: 83 | await asyncio.wait_for(self._proc.stdin.drain(), timeout=WAIT_FREQUENCY) 84 | except asyncio.TimeoutError: 85 | self.logger.warning( 86 | "StreamWriter.drain timeout, request restart: {}".format( 87 | repr(self._proc.stdin) 88 | ) 89 | ) 90 | # Drain timeout error indicates unrecoverable critical issue, 91 | # essentially the cache functionality remains broken after the first asyncio.TimeoutError. 92 | # Request restart from CacheStore so that normal operation can be resumed. 93 | self._restart_requested = True 94 | except ConnectionResetError: 95 | self._is_alive = False 96 | # This could indicate that the cache worker pool has unexpectedly crashed. 97 | # Request restart from CacheStore so that normal operation can be resumed. 
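# (Unlike the drain-timeout branch above, this branch also re-raises, so the caller sees the failure immediately.)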
98 | self._restart_requested = True 99 | raise CacheServerUnreachable() 100 | 101 | async def wait_iter(self, it, timeout): 102 | end = time.time() + timeout 103 | for obj in it: 104 | if obj is None: 105 | await asyncio.sleep(WAIT_FREQUENCY) 106 | if not self._is_alive: 107 | raise CacheServerUnreachable() 108 | elif time.time() > end: 109 | raise CacheClientTimeout() 110 | else: 111 | yield obj 112 | 113 | async def wait(self, fun, timeout): 114 | def _repeat(): 115 | while True: 116 | yield fun() 117 | 118 | async for obj in self.wait_iter(_repeat(), timeout): 119 | return obj 120 | 121 | async def request_and_return(self, reqs, ret): 122 | for req in reqs: 123 | await req 124 | return ret 125 | 126 | async def _heartbeat(self): 127 | while self._is_alive: 128 | try: 129 | await self.ping() 130 | except CacheServerUnreachable: 131 | self._is_alive = False 132 | await asyncio.sleep(HEARTBEAT_FREQUENCY) 133 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/cache/client/cache_worker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import signal 5 | 6 | from .cache_action import import_action_class_spec 7 | 8 | 9 | def best_effort_read(key_paths): 10 | for key, path in key_paths: 11 | try: 12 | with open(path, 'rb') as f: 13 | yield key, f.read() 14 | except: 15 | pass 16 | 17 | 18 | def execute_action(tempdir, action_spec, request_file, timeout=0): 19 | def timeout_handler(signum, frame): 20 | raise WorkerTimeoutException() 21 | 22 | signal.signal(signal.SIGALRM, timeout_handler) 23 | signal.alarm(timeout) # Activate timeout, 0 = no timeout 24 | 25 | action_cls = import_action_class_spec(action_spec) 26 | with open(os.path.join(tempdir, request_file)) as f: 27 | request = json.load(f) 28 | 29 | execute(tempdir, action_cls, request) 30 | 31 | signal.alarm(0) # Disable timeout 32 | 33 | 34 | def execute(tempdir, action_cls, req): 35 | try: 36 | # prepare stream 37 | stream = None 38 | if req['stream_key']: 39 | stream = open(os.path.join(tempdir, req['stream_key']), 'a', buffering=1) 40 | 41 | def stream_output(obj): 42 | stream.write(json.dumps(obj) + '\n') 43 | else: 44 | stream_output = None 45 | 46 | # prepare keys 47 | keys = list(req['keys']) 48 | ex_keys = dict(best_effort_read(req['existing_keys'].items())) 49 | 50 | # execute action 51 | res = action_cls.execute( 52 | message=req['message'], 53 | keys=keys, 54 | existing_keys=ex_keys, 55 | stream_output=stream_output, 56 | invalidate_cache=req.get('invalidate_cache', False)) 57 | 58 | # write outputs to keys 59 | for key, val in res.items(): 60 | if key in ex_keys and ex_keys[key] == val: 61 | # Reduce disk churn by not unnecessarily writing existing keys 62 | # that have identical values to the newly produced ones. 
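# Note: values in `ex_keys` are raw bytes read from disk, so this comparison only skips rewrites for actions that return bytes; str results are always written.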
63 | continue 64 | blob = val if isinstance(val, bytes) else val.encode('utf-8') 65 | with open(os.path.join(tempdir, req['keys'][key]), 'wb') as f: 66 | f.write(blob) 67 | finally: 68 | # make sure the stream is finalized so clients won't hang even if 69 | # the worker crashes 70 | if stream: 71 | stream.write('\n\n') 72 | stream.close() 73 | 74 | 75 | class WorkerTimeoutException(Exception): 76 | pass 77 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/cache/generate_dag_action.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | 4 | from .client import CacheAction 5 | from .utils import streamed_errors, DAGParsingFailed, DAGUnsupportedFlowLanguage 6 | 7 | from .custom_flowgraph import FlowGraph 8 | 9 | from metaflow import Run, Step, DataArtifact, namespace 10 | from metaflow.exception import MetaflowNotFound 11 | namespace(None) # Always use global namespace by default 12 | 13 | 14 | class GenerateDag(CacheAction): 15 | """ 16 | Generates a DAG for a given Run. 17 | 18 | Parameters 19 | ---------- 20 | flow_id : str 21 | The flow id that this codepackage belongs to. 22 | Required for finding the correct class inside the parser logic. 23 | run_number : str 24 | Run number to construct rest of the pathspec 25 | 26 | Returns 27 | -------- 28 | List or None 29 | example: 30 | [ 31 | boolean, 32 | { 33 | "step_name": { 34 | 'type': string, 35 | 'box_next': boolean, 36 | 'box_ends': string, 37 | 'next': list, 38 | 'doc': string 39 | }, 40 | ... 41 | } 42 | ] 43 | First field conveys whether dag generation was successful. 44 | Second field contains the actual DAG. 45 | """ 46 | 47 | @classmethod 48 | def format_request(cls, flow_id, run_number, invalidate_cache=False): 49 | msg = { 50 | 'flow_id': flow_id, 51 | 'run_number': run_number 52 | } 53 | key_identifier = "{}/{}".format(flow_id, run_number) 54 | result_key = 'dag:result:%s' % hashlib.sha1((key_identifier).encode('utf-8')).hexdigest() 55 | stream_key = 'dag:stream:%s' % hashlib.sha1((key_identifier).encode('utf-8')).hexdigest() 56 | 57 | return msg, \ 58 | [result_key], \ 59 | stream_key, \ 60 | [stream_key], \ 61 | invalidate_cache, \ 62 | None 63 | 64 | @classmethod 65 | def response(cls, keys_objs): 66 | ''' 67 | Returns the generated DAG result 68 | ''' 69 | return [json.loads(val) for key, val in keys_objs.items() if key.startswith('dag:result')][0] 70 | 71 | @classmethod 72 | def stream_response(cls, it): 73 | for msg in it: 74 | yield msg 75 | 76 | @classmethod 77 | def execute(cls, 78 | message=None, 79 | keys=None, 80 | existing_keys={}, 81 | stream_output=None, 82 | invalidate_cache=False, 83 | **kwargs): 84 | results = {} 85 | flow_id = message['flow_id'] 86 | run_number = message['run_number'] 87 | 88 | result_key = [key for key in keys if key.startswith('dag:result')][0] 89 | 90 | with streamed_errors(stream_output): 91 | run = Run("{}/{}".format(flow_id, run_number)) 92 | param_step = Step("{}/_parameters".format(run.pathspec)) 93 | try: 94 | dag = DataArtifact("{}/_graph_info".format(param_step.task.pathspec)).data 95 | except MetaflowNotFound: 96 | dag = generate_dag(run) 97 | 98 | results[result_key] = json.dumps(dag) 99 | 100 | return results 101 | 102 | # Utilities 103 | 104 | 105 | def generate_dag(run: Run): 106 | try: 107 | # Initialize a FlowGraph object 108 | graph = FlowGraph(source=run.code.flowspec, name=run.parent.id) 109 | # Build the DAG based on the DAGNodes given by 
the FlowGraph for the found FlowSpec class. 110 | steps_info, graph_structure = graph.output_steps() 111 | graph_info = { 112 | "steps": steps_info, 113 | "graph_structure": graph_structure, 114 | "doc": graph.doc 115 | } 116 | 117 | return graph_info 118 | except Exception as ex: 119 | if ex.__class__.__name__ == 'KeyError' and "python" in str(ex): 120 | raise DAGUnsupportedFlowLanguage( 121 | 'DAG parsing is not supported for the language used in this Flow.' 122 | ) from None 123 | else: 124 | raise DAGParsingFailed(f"DAG Parsing failed: {str(ex)}") 125 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/cache/get_artifacts_action.py: -------------------------------------------------------------------------------- 1 | from typing import List, Callable 2 | 3 | from .get_data_action import GetData 4 | from .utils import unpack_pathspec_with_attempt_id, artifact_value 5 | 6 | from metaflow import DataArtifact 7 | 8 | 9 | class GetArtifacts(GetData): 10 | @classmethod 11 | def format_request(cls, pathspecs: List[str], invalidate_cache=False): 12 | """ 13 | Cache Action to fetch Artifact values 14 | 15 | Parameters 16 | ---------- 17 | pathspecs : List[str] 18 | List of Artifact pathspecs with attempt id as last component: 19 | ["FlowId/RunNumber/StepName/TaskId/ArtifactName/0"] 20 | invalidate_cache : bool 21 | Force cache invalidation, defaults to False 22 | """ 23 | return super().format_request(targets=pathspecs, invalidate_cache=invalidate_cache) 24 | 25 | @classmethod 26 | def fetch_data(cls, pathspec: str, stream_output: Callable[[str], None]): 27 | """ 28 | Fetch data using Metaflow Client. 29 | 30 | Parameters 31 | ---------- 32 | pathspec : str 33 | Artifact pathspec with attempt id as last component: 34 | "FlowId/RunNumber/StepName/TaskId/ArtifactName/0" 35 | stream_output : Callable[[object], None] 36 | Stream output callable from execute() that accepts a JSON serializable object. 37 | Used for generic messaging. 38 | 39 | Errors can be streamed to cache client using `stream_output` in combination with 40 | the error_event_msg helper. This way failures won't be cached for individual artifacts, 41 | thus making it necessary to retry fetching during next attempt. 42 | (Will add significant overhead/delay). 43 | 44 | Stream error example: 45 | stream_output(error_event_msg(str(ex), "s3-not-found", get_traceback_str())) 46 | """ 47 | pathspec_without_attempt, attempt_id = unpack_pathspec_with_attempt_id(pathspec) 48 | 49 | artifact = DataArtifact(pathspec_without_attempt, attempt=attempt_id) 50 | return artifact_value(artifact) 51 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/cache/get_parameters_action.py: -------------------------------------------------------------------------------- 1 | from typing import List, Callable 2 | 3 | from .get_data_action import GetData 4 | from .utils import MAX_S3_SIZE, streamed_errors 5 | 6 | from metaflow import Step 7 | 8 | 9 | class GetParameters(GetData): 10 | @classmethod 11 | def format_request(cls, pathspecs: List[str], invalidate_cache=False): 12 | """ 13 | Cache Action to fetch Run parameters for list of runs. 
14 | 15 | Parameters 16 | ---------- 17 | pathspecs : List[str] 18 | List of Run pathspecs: ["FlowId/RunNumber"] 19 | invalidate_cache : bool 20 | Force cache invalidation, defaults to False 21 | """ 22 | return super().format_request(targets=pathspecs, invalidate_cache=invalidate_cache) 23 | 24 | @classmethod 25 | def fetch_data(cls, pathspec: str, stream_output: Callable[[object], None]): 26 | """ 27 | Fetch data using Metaflow Client. 28 | 29 | Parameters 30 | ---------- 31 | pathspec : str 32 | Run pathspec: "FlowId/RunNumber" 33 | stream_output : Callable[[object], None] 34 | Stream output callable from execute() that accepts a JSON serializable object. 35 | Used for generic messaging. 36 | 37 | Errors can be streamed to cache client using `stream_output` in combination with 38 | the error_event_msg helper. This way failures won't be cached for individual artifacts, 39 | thus making it necessary to retry fetching during next attempt. 40 | (Will add significant overhead/delay). 41 | 42 | Stream error example: 43 | stream_output(error_event_msg(str(ex), "s3-not-found", get_traceback_str())) 44 | """ 45 | try: 46 | with streamed_errors(stream_output): 47 | step = Step("{}/_parameters".format(pathspec)) 48 | except Exception as ex: 49 | # NOTE: return false in order not to cache this 50 | # since parameters might be available later 51 | return False 52 | 53 | values = {} 54 | for artifact_name, artifact in step.task.artifacts._asdict().items(): 55 | # Exclude following internal only artifacts from results: 56 | # - Artifacts prefixed with underscore (_) 57 | # - Artifacts with 'name' or 'script_name' 58 | if artifact_name.startswith('_') or artifact_name in ['name', 'script_name']: 59 | continue 60 | try: 61 | if artifact.size < MAX_S3_SIZE: 62 | values[artifact_name] = artifact.data 63 | else: 64 | values[artifact_name] = "Artifact too large: {} bytes".format(artifact.size) 65 | except Exception as ex: 66 | values[artifact_name] = str(ex) 67 | 68 | return [True, values] 69 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/cache/get_task_action.py: -------------------------------------------------------------------------------- 1 | from typing import List, Callable 2 | 3 | from .get_data_action import GetData 4 | from .utils import unpack_pathspec_with_attempt_id, MAX_S3_SIZE 5 | 6 | from metaflow import Task 7 | from metaflow.exception import MetaflowNotFound 8 | 9 | 10 | class GetTask(GetData): 11 | @classmethod 12 | def format_request(cls, pathspecs: List[str], invalidate_cache=False): 13 | """ 14 | Cache Action to fetch Task status and foreach labels. 15 | 16 | Parameters 17 | ---------- 18 | pathspecs : List[str] 19 | List of Task pathspecs with attempt id as last component: 20 | ["FlowId/RunNumber/StepName/TaskId/0"] 21 | invalidate_cache : bool 22 | Force cache invalidation, defaults to False 23 | """ 24 | return super().format_request(targets=pathspecs, invalidate_cache=invalidate_cache) 25 | 26 | @classmethod 27 | def fetch_data(cls, pathspec: str, stream_output: Callable[[object], None]): 28 | """ 29 | Fetch data using Metaflow Client. 30 | 31 | Parameters 32 | ---------- 33 | pathspec : str 34 | Task pathspec with attempt id as last component: 35 | "FlowId/RunNumber/StepName/TaskId/0" 36 | stream_output : Callable[[object], None] 37 | Stream output callable from execute() that accepts a JSON serializable object. 38 | Used for generic messaging. 
39 | 40 | Errors can be streamed to cache client using `stream_output` in combination with 41 | the error_event_msg helper. This way failures won't be cached for individual artifacts, 42 | thus making it necessary to retry fetching during next attempt. 43 | (Will add significant overhead/delay). 44 | 45 | Stream error example: 46 | stream_output(error_event_msg(str(ex), "s3-not-found", get_traceback_str())) 47 | """ 48 | try: 49 | pathspec_without_attempt, attempt_id = unpack_pathspec_with_attempt_id(pathspec) 50 | task = Task(pathspec_without_attempt, attempt=attempt_id) 51 | except MetaflowNotFound: 52 | return False # Skip cache persist if Task cannot be found 53 | 54 | if '_task_ok' not in task: 55 | # Skip cache persist if _task_ok artifact cannot be found 56 | return False 57 | 58 | values = {} 59 | for artifact_name in ['_task_ok', '_foreach_stack']: 60 | if artifact_name in task: 61 | artifact = task[artifact_name] 62 | if artifact.size < MAX_S3_SIZE: 63 | values[artifact_name] = artifact.data 64 | else: 65 | return [False, 'artifact-too-large', "{}: {} bytes".format(artifact.pathspec, artifact.size)] 66 | 67 | return [True, values] 68 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/db/__init__.py: -------------------------------------------------------------------------------- 1 | from .postgres_async_db import AsyncPostgresDB 2 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/db/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .flow_row import FlowRow 2 | from .run_row import RunRow 3 | from .step_row import StepRow 4 | from .task_row import TaskRow 5 | from .artifact_row import ArtifactRow 6 | from .metadata_row import MetadataRow 7 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/db/models/artifact_row.py: -------------------------------------------------------------------------------- 1 | from services.data.db_utils import get_exposed_run_id, get_exposed_task_id 2 | from .base_row import BaseRow 3 | import time 4 | 5 | 6 | class ArtifactRow(BaseRow): 7 | flow_id: str = None 8 | run_number: int = None 9 | run_id: str = None 10 | step_name: str = None 11 | task_id: int = None 12 | task_name: str = None 13 | name: str = None 14 | location: str = None 15 | sha: str = None 16 | type: str = None 17 | content_type: str = None 18 | user_name: str = None 19 | attempt_id: int = 0 20 | ts_epoch: int = 0 21 | 22 | def __init__( 23 | self, 24 | flow_id, 25 | run_number, 26 | run_id, 27 | step_name, 28 | task_id, 29 | task_name, 30 | name, 31 | location, 32 | ds_type, 33 | sha, 34 | type, 35 | content_type, 36 | user_name, 37 | attempt_id, 38 | ts_epoch=None, 39 | tags=None, 40 | system_tags=None, 41 | **kwargs 42 | ): 43 | self.flow_id = flow_id 44 | self.run_number = run_number 45 | self.run_id = run_id 46 | self.step_name = step_name 47 | self.task_id = task_id 48 | self.task_name = task_name 49 | self.name = name 50 | self.location = location 51 | self.ds_type = ds_type 52 | self.sha = sha 53 | self.type = type 54 | self.content_type = content_type 55 | self.user_name = user_name 56 | self.attempt_id = attempt_id 57 | if ts_epoch is None: 58 | ts_epoch = int(round(time.time() * 1000)) 59 | 60 | self.ts_epoch = ts_epoch 61 | self.tags = tags 62 | self.system_tags = system_tags 63 | 64 | def serialize(self, expanded: bool = 
False): 65 | if expanded: 66 | return { 67 | "flow_id": self.flow_id, 68 | "run_number": self.run_number, 69 | "run_id": self.run_id, 70 | "step_name": self.step_name, 71 | "task_id": self.task_id, 72 | "task_name": self.task_name, 73 | "name": self.name, 74 | "location": self.location, 75 | "ds_type": self.ds_type, 76 | "sha": self.sha, 77 | "type": self.type, 78 | "content_type": self.content_type, 79 | "user_name": self.user_name, 80 | "attempt_id": self.attempt_id, 81 | "ts_epoch": self.ts_epoch, 82 | "tags": self.tags, 83 | "system_tags": self.system_tags, 84 | } 85 | else: 86 | return { 87 | "flow_id": self.flow_id, 88 | "run_number": str(get_exposed_run_id(self.run_number, self.run_id)), 89 | "step_name": self.step_name, 90 | "task_id": str(get_exposed_task_id(self.task_id, self.task_name)), 91 | "name": self.name, 92 | "location": self.location, 93 | "ds_type": self.ds_type, 94 | "sha": self.sha, 95 | "type": self.type, 96 | "content_type": self.content_type, 97 | "user_name": self.user_name, 98 | "attempt_id": self.attempt_id, 99 | "ts_epoch": self.ts_epoch, 100 | "tags": self.tags, 101 | "system_tags": self.system_tags, 102 | } 103 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/db/models/base_row.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | 4 | class BaseRow(object): 5 | """ 6 | Base class for Row serialization of database query results. 7 | Inherited by all row classes and ensures that serialize() is implemented. 8 | """ 9 | 10 | def serialize(self) -> Dict: 11 | raise NotImplementedError("Row model needs to define a serialize function") 12 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/db/models/flow_row.py: -------------------------------------------------------------------------------- 1 | from .base_row import BaseRow 2 | import time 3 | 4 | 5 | class FlowRow(BaseRow): 6 | flow_id: str = None 7 | user_name: str = None 8 | ts_epoch: int = 0 9 | 10 | def __init__(self, flow_id, user_name, ts_epoch=None, tags=None, system_tags=None, **kwargs): 11 | self.flow_id = flow_id 12 | self.user_name = user_name 13 | if ts_epoch is None: 14 | ts_epoch = int(round(time.time() * 1000)) 15 | self.ts_epoch = ts_epoch 16 | self.tags = tags 17 | self.system_tags = system_tags 18 | 19 | def serialize(self, expanded: bool = False): 20 | return { 21 | "flow_id": self.flow_id, 22 | "user_name": self.user_name, 23 | "ts_epoch": self.ts_epoch, 24 | "tags": self.tags, 25 | "system_tags": self.system_tags, 26 | } 27 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/db/models/metadata_row.py: -------------------------------------------------------------------------------- 1 | from .base_row import BaseRow 2 | from services.data.db_utils import get_exposed_run_id, get_exposed_task_id 3 | import time 4 | 5 | 6 | class MetadataRow(BaseRow): 7 | flow_id: str = None 8 | run_number: int = None 9 | run_id: str = None 10 | step_name: str = None 11 | task_id: int = None 12 | task_name: str = None 13 | attempt_id: int = None 14 | id: int = None # autoincrement 15 | field_name: str = None 16 | value: dict = None 17 | type: str = None 18 | user_name: str = None 19 | ts_epoch: int = 0 20 | tags = None 21 | system_tags = None 22 | 23 | def __init__( 24 | self, 25 | flow_id, 26 | run_number, 27 | run_id, 28 | step_name, 29 | task_id, 30 | 
task_name, 31 | id, 32 | field_name, 33 | value, 34 | type, 35 | user_name, 36 | attempt_id=None, 37 | ts_epoch=None, 38 | tags=None, 39 | system_tags=None, 40 | **kwargs 41 | ): 42 | self.flow_id = flow_id 43 | self.run_number = run_number 44 | self.run_id = run_id 45 | self.step_name = step_name 46 | self.task_id = task_id 47 | self.task_name = task_name 48 | self.attempt_id = attempt_id 49 | self.field_name = field_name 50 | self.value = value 51 | self.type = type 52 | self.user_name = user_name 53 | if ts_epoch is None: 54 | ts_epoch = int(round(time.time() * 1000)) 55 | 56 | self.ts_epoch = ts_epoch 57 | self.id = id 58 | self.tags = tags 59 | self.system_tags = system_tags 60 | 61 | def serialize(self, expanded: bool = False): 62 | if expanded: 63 | return { 64 | "id": self.id, 65 | "flow_id": self.flow_id, 66 | "run_number": self.run_number, 67 | "run_id": self.run_id, 68 | "step_name": self.step_name, 69 | "task_id": self.task_id, 70 | "task_name": self.task_name, 71 | "attempt_id": self.attempt_id, 72 | "field_name": self.field_name, 73 | "value": self.value, 74 | "type": self.type, 75 | "user_name": self.user_name, 76 | "ts_epoch": self.ts_epoch, 77 | "tags": self.tags, 78 | "system_tags": self.system_tags, 79 | } 80 | else: 81 | return { 82 | "id": self.id, 83 | "flow_id": self.flow_id, 84 | "run_number": str(get_exposed_run_id(self.run_number, self.run_id)), 85 | "step_name": self.step_name, 86 | "task_id": str(get_exposed_task_id(self.task_id, self.task_name)), 87 | "attempt_id": self.attempt_id, 88 | "field_name": self.field_name, 89 | "value": self.value, 90 | "type": self.type, 91 | "user_name": self.user_name, 92 | "ts_epoch": self.ts_epoch, 93 | "tags": self.tags, 94 | "system_tags": self.system_tags, 95 | } 96 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/db/models/run_row.py: -------------------------------------------------------------------------------- 1 | from .base_row import BaseRow 2 | import time 3 | from services.data.db_utils import get_exposed_run_id 4 | 5 | 6 | class RunRow(BaseRow): 7 | flow_id: str = None 8 | run_number: int = None 9 | run_id: str = None 10 | run: str = None 11 | user_name: str = None 12 | user: str = None 13 | status: str = None 14 | ts_epoch: int = 0 15 | finished_at: int = None 16 | duration: int = None 17 | 18 | def __init__( 19 | self, 20 | flow_id, 21 | user_name, 22 | user=None, 23 | run_number=None, 24 | run_id=None, 25 | run=None, 26 | status=None, 27 | ts_epoch=None, 28 | finished_at=None, 29 | duration=None, 30 | tags=None, 31 | system_tags=None, 32 | last_heartbeat_ts=None, 33 | **kwargs 34 | ): 35 | self.flow_id = flow_id 36 | self.user_name = user_name 37 | self.user = user 38 | self.run_number = run_number 39 | self.run_id = run_id 40 | self.run = run 41 | self.status = status 42 | self.tags = tags 43 | self.system_tags = system_tags 44 | if ts_epoch is None: 45 | ts_epoch = int(round(time.time() * 1000)) 46 | 47 | self.ts_epoch = ts_epoch 48 | self.last_heartbeat_ts = last_heartbeat_ts 49 | self.finished_at = finished_at 50 | self.duration = duration 51 | self.last_heartbeat_ts = last_heartbeat_ts 52 | 53 | def serialize(self, expanded: bool = False): 54 | if expanded: 55 | return { 56 | "flow_id": self.flow_id, 57 | "run_number": self.run_number, 58 | "run_id": self.run_id, 59 | "user_name": self.user_name, 60 | "user": self.user, 61 | "run": self.run, 62 | "status": self.status, 63 | "ts_epoch": self.ts_epoch, 64 | "finished_at": self.finished_at, 65 
| "duration": self.duration, 66 | "last_heartbeat_ts": self.last_heartbeat_ts, 67 | "tags": self.tags, 68 | "system_tags": self.system_tags 69 | } 70 | else: 71 | return { 72 | "flow_id": self.flow_id, 73 | "run_number": str(get_exposed_run_id(self.run_number, self.run_id)), 74 | "user_name": self.user_name, 75 | "status": self.status, 76 | "ts_epoch": self.ts_epoch, 77 | "finished_at": self.finished_at, 78 | "duration": self.duration, 79 | "last_heartbeat_ts": self.last_heartbeat_ts, 80 | "tags": self.tags, 81 | "system_tags": self.system_tags 82 | } 83 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/db/models/step_row.py: -------------------------------------------------------------------------------- 1 | from .base_row import BaseRow 2 | import time 3 | from services.data.db_utils import get_exposed_run_id 4 | 5 | 6 | class StepRow(BaseRow): 7 | flow_id: str = None 8 | run_number: int = None 9 | run_id: str = None 10 | step_name: str = None 11 | user_name: str = None 12 | ts_epoch: int = 0 13 | duration: int = 0 14 | tags = None 15 | system_tags = None 16 | 17 | def __init__( 18 | self, 19 | flow_id, 20 | run_number, 21 | run_id, 22 | user_name, 23 | step_name, 24 | ts_epoch=None, 25 | duration=None, 26 | tags=None, 27 | system_tags=None, 28 | **kwargs 29 | ): 30 | self.flow_id = flow_id 31 | self.run_number = run_number 32 | 33 | if run_id is None: 34 | run_id = str(run_number) 35 | self.run_id = run_id 36 | 37 | self.step_name = step_name 38 | self.user_name = user_name 39 | if ts_epoch is None: 40 | ts_epoch = int(round(time.time() * 1000)) 41 | 42 | self.ts_epoch = ts_epoch 43 | self.duration = duration 44 | self.tags = tags 45 | self.system_tags = system_tags 46 | 47 | def serialize(self, expanded: bool = False): 48 | if expanded: 49 | return { 50 | "flow_id": self.flow_id, 51 | "run_number": self.run_number, 52 | "run_id": self.run_id, 53 | "step_name": self.step_name, 54 | "user_name": self.user_name, 55 | "ts_epoch": self.ts_epoch, 56 | "duration": self.duration, 57 | "tags": self.tags, 58 | "system_tags": self.system_tags, 59 | } 60 | else: 61 | return { 62 | "flow_id": self.flow_id, 63 | "run_number": str(get_exposed_run_id(self.run_number, self.run_id)), 64 | "step_name": self.step_name, 65 | "user_name": self.user_name, 66 | "ts_epoch": self.ts_epoch, 67 | "duration": self.duration, 68 | "tags": self.tags, 69 | "system_tags": self.system_tags, 70 | } 71 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/db/models/task_row.py: -------------------------------------------------------------------------------- 1 | from .base_row import BaseRow 2 | import time 3 | from services.data.db_utils import get_exposed_task_id, get_exposed_run_id 4 | 5 | 6 | class TaskRow(BaseRow): 7 | flow_id: str = None 8 | run_number: int = None 9 | run_id: str = None 10 | step_name: str = None 11 | task_id: int = None 12 | task_name: str = None 13 | user_name: str = None 14 | status: str = None 15 | task_ok: str = None 16 | ts_epoch: int = 0 17 | started_at: int = None 18 | finished_at: int = None 19 | duration: int = None 20 | attempt_id: int = 0 21 | tags = None 22 | system_tags = None 23 | 24 | def __init__( 25 | self, 26 | flow_id, 27 | run_number, 28 | run_id, 29 | user_name, 30 | step_name, 31 | task_id=None, 32 | task_name=None, 33 | status=None, 34 | task_ok=None, 35 | ts_epoch=None, 36 | started_at=None, 37 | finished_at=None, 38 | duration=None, 39 | 
attempt_id=0, 40 | tags=None, 41 | system_tags=None, 42 | last_heartbeat_ts=None, 43 | **kwargs 44 | ): 45 | self.flow_id = flow_id 46 | self.run_number = run_number 47 | self.run_id = run_id 48 | self.step_name = step_name 49 | self.task_id = task_id 50 | self.task_name = task_name 51 | 52 | self.user_name = user_name 53 | if ts_epoch is None: 54 | ts_epoch = int(round(time.time() * 1000)) 55 | 56 | self.status = status 57 | self.task_ok = task_ok 58 | self.ts_epoch = ts_epoch 59 | self.started_at = started_at 60 | self.finished_at = finished_at 61 | self.duration = duration 62 | self.attempt_id = attempt_id 63 | self.tags = tags 64 | self.system_tags = system_tags 65 | self.last_heartbeat_ts = last_heartbeat_ts 66 | 67 | def serialize(self, expanded: bool = False): 68 | if expanded: 69 | return { 70 | "flow_id": self.flow_id, 71 | "run_number": self.run_number, 72 | "run_id": self.run_id, 73 | "step_name": self.step_name, 74 | "task_id": self.task_id, 75 | "task_name": self.task_name, 76 | "user_name": self.user_name, 77 | "status": self.status, 78 | "task_ok": self.task_ok, 79 | "ts_epoch": self.ts_epoch, 80 | "started_at": self.started_at, 81 | "finished_at": self.finished_at, 82 | "duration": self.duration, 83 | "attempt_id": self.attempt_id, 84 | "tags": self.tags, 85 | "system_tags": self.system_tags, 86 | "last_heartbeat_ts": self.last_heartbeat_ts 87 | } 88 | else: 89 | return { 90 | "flow_id": self.flow_id, 91 | "run_number": str(get_exposed_run_id(self.run_number, self.run_id)), 92 | "step_name": self.step_name, 93 | "task_id": str(get_exposed_task_id(self.task_id, self.task_name)), 94 | "user_name": self.user_name, 95 | "status": self.status, 96 | "task_ok": self.task_ok, 97 | "ts_epoch": self.ts_epoch, 98 | "started_at": self.started_at, 99 | "finished_at": self.finished_at, 100 | "duration": self.duration, 101 | "attempt_id": self.attempt_id, 102 | "tags": self.tags, 103 | "system_tags": self.system_tags, 104 | "last_heartbeat_ts": self.last_heartbeat_ts 105 | } 106 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/db/postgres_async_db.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | from typing import List 5 | 6 | import aiopg 7 | import psycopg2 8 | import psycopg2.extras 9 | # baselevel classes from shared data adapter to inherit from. 10 | from services.data.postgres_async_db import \ 11 | _AsyncPostgresDB as BaseAsyncPostgresDB 12 | from services.utils import DBConfiguration, logging 13 | 14 | from .tables import (AsyncArtifactTablePostgres, AsyncFlowTablePostgres, 15 | AsyncMetadataTablePostgres, AsyncRunTablePostgres, 16 | AsyncStepTablePostgres, AsyncTaskTablePostgres) 17 | 18 | 19 | class AsyncPostgresDB(BaseAsyncPostgresDB): 20 | """ 21 | UI Backend specific database adapter. 22 | Basic functionality is inherited from the classes provided by the shared services.data.postgres_async_db module. 23 | 24 | Parameters 25 | ---------- 26 | name : str (optional) 27 | name for the DB Adapter instance. Used primarily for naming the associated logger. 
28 | """ 29 | connection = None 30 | flow_table_postgres = None 31 | run_table_postgres = None 32 | step_table_postgres = None 33 | task_table_postgres = None 34 | artifact_table_postgres = None 35 | metadata_table_postgres = None 36 | 37 | pool = None 38 | reader_pool = None 39 | db_conf: DBConfiguration = None 40 | 41 | def __init__(self, name='global'): 42 | self.name = name 43 | self.logger = logging.getLogger("AsyncPostgresDB:{name}".format(name=self.name)) 44 | 45 | tables = [] 46 | self.flow_table_postgres = AsyncFlowTablePostgres(self) 47 | self.run_table_postgres = AsyncRunTablePostgres(self) 48 | self.step_table_postgres = AsyncStepTablePostgres(self) 49 | self.task_table_postgres = AsyncTaskTablePostgres(self) 50 | self.artifact_table_postgres = AsyncArtifactTablePostgres(self) 51 | self.metadata_table_postgres = AsyncMetadataTablePostgres(self) 52 | tables.append(self.flow_table_postgres) 53 | tables.append(self.run_table_postgres) 54 | tables.append(self.step_table_postgres) 55 | tables.append(self.task_table_postgres) 56 | tables.append(self.artifact_table_postgres) 57 | tables.append(self.metadata_table_postgres) 58 | self.tables = tables 59 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/db/tables/__init__.py: -------------------------------------------------------------------------------- 1 | from .flow import AsyncFlowTablePostgres 2 | from .run import AsyncRunTablePostgres 3 | from .step import AsyncStepTablePostgres 4 | from .task import AsyncTaskTablePostgres 5 | from .metadata import AsyncMetadataTablePostgres 6 | from .artifact import AsyncArtifactTablePostgres 7 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/db/tables/artifact.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | from .base import AsyncPostgresTable 3 | from .task import AsyncTaskTablePostgres 4 | from ..models import ArtifactRow 5 | # use schema constants from the .data module to keep things consistent 6 | from services.data.postgres_async_db import AsyncArtifactTablePostgres as MetadataArtifactTable 7 | from services.data.db_utils import translate_run_key, DBResponse, DBPagination 8 | 9 | 10 | class AsyncArtifactTablePostgres(AsyncPostgresTable): 11 | _row_type = ArtifactRow 12 | table_name = MetadataArtifactTable.table_name 13 | task_table_name = AsyncTaskTablePostgres.table_name 14 | ordering = ["attempt_id DESC"] 15 | keys = MetadataArtifactTable.keys 16 | primary_keys = MetadataArtifactTable.primary_keys 17 | trigger_keys = None 18 | trigger_operations = None 19 | select_columns = keys 20 | 21 | async def get_run_parameter_artifacts(self, flow_name, run_number, postprocess=None, invalidate_cache=False): 22 | run_id_key, run_id_value = translate_run_key(run_number) 23 | 24 | # '_parameters' step has all the parameters as artifacts. only pick the 25 | # public parameters (no underscore prefix) 26 | return await self.find_records( 27 | conditions=[ 28 | "flow_id = %s", 29 | "{run_id_key} = %s".format(run_id_key=run_id_key), 30 | "step_name = %s", 31 | "name NOT LIKE %s", 32 | "name <> %s", 33 | "name <> %s" 34 | ], 35 | values=[ 36 | flow_name, 37 | run_id_value, 38 | "_parameters", 39 | r"\_%", 40 | "name", # exclude the 'name' parameter as this always exists, and contains the FlowName 41 | "script_name" # exclude the internally used 'script_name' parameter. 
42 | ], 43 | fetch_single=False, 44 | expanded=True, 45 | postprocess=postprocess, 46 | invalidate_cache=invalidate_cache 47 | ) 48 | 49 | async def get_artifact_names(self, conditions: List[str] = [], 50 | values: List[str] = [], limit: int = 0, offset: int = 0) -> Tuple[DBResponse, DBPagination]: 51 | """ 52 | Get a paginated set of artifact names. 53 | 54 | Parameters 55 | ---------- 56 | conditions : List[str] 57 | list of conditions to pass the sql execute, with %s placeholders for values 58 | values : List[str] 59 | list of values to be passed for the sql execute. 60 | limit : int (optional) (default 0) 61 | limit for the number of results 62 | offset : int (optional) (default 0) 63 | offset for the results. 64 | 65 | Returns 66 | ------- 67 | (DBResponse, DBPagination) 68 | """ 69 | sql_template = """ 70 | SELECT name FROM ( 71 | SELECT DISTINCT name, flow_id, run_number, run_id 72 | FROM {table_name} 73 | ) T 74 | {conditions} 75 | {limit} 76 | {offset} 77 | """ 78 | select_sql = sql_template.format( 79 | table_name=self.table_name, 80 | keys=",".join(self.select_columns), 81 | conditions=("WHERE {}".format(" AND ".join(conditions)) if conditions else ""), 82 | limit="LIMIT {}".format(limit) if limit else "", 83 | offset="OFFSET {}".format(offset) if offset else "" 84 | ) 85 | 86 | res, pag = await self.execute_sql(select_sql=select_sql, values=values, fetch_single=False, 87 | expanded=False, 88 | limit=limit, offset=offset, serialize=False) 89 | # process the unserialized DBResponse 90 | _body = [row[0] for row in res.body] 91 | 92 | return DBResponse(res.response_code, _body), pag 93 | 94 | async def get_run_graph_info_artifact(self, flow_name: str, run_id: str) -> DBResponse: 95 | """ 96 | Tries to locate '_graph_info' in run artifacts 97 | """ 98 | run_id_key, run_id_value = translate_run_key(run_id) 99 | 100 | db_response, *_ = await self.find_records( 101 | conditions=[ 102 | "flow_id = %s", 103 | "{run_id_key} = %s".format( 104 | run_id_key=run_id_key), 105 | "step_name = %s", 106 | "name = %s" 107 | ], 108 | values=[ 109 | flow_name, run_id_value, "_parameters", 110 | "_graph_info", 111 | ], 112 | fetch_single=True, expanded=True 113 | ) 114 | 115 | return db_response 116 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/db/tables/flow.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from .base import AsyncPostgresTable 3 | from ..models import FlowRow 4 | from services.data.db_utils import DBResponse, DBPagination 5 | # use schema constants from the .data module to keep things consistent 6 | from services.data.postgres_async_db import AsyncFlowTablePostgres as MetadataFlowTable 7 | 8 | 9 | class AsyncFlowTablePostgres(AsyncPostgresTable): 10 | table_name = MetadataFlowTable.table_name 11 | keys = MetadataFlowTable.keys 12 | primary_keys = MetadataFlowTable.primary_keys 13 | trigger_keys = MetadataFlowTable.trigger_keys 14 | select_columns = keys 15 | _row_type = FlowRow 16 | 17 | async def get_flow_ids(self, conditions: List[str] = [], 18 | values: List[str] = [], limit: int = 0, offset: int = 0) -> (DBResponse, DBPagination): 19 | """ 20 | Get a paginated set of flow ids. 21 | 22 | Parameters 23 | ---------- 24 | conditions : List[str] 25 | list of conditions to pass the sql execute, with %s placeholders for values 26 | values : List[str] 27 | list of values to be passed for the sql execute. 
28 | limit : int (optional) (default 0) 29 | limit for the number of results 30 | offset : int (optional) (default 0) 31 | offset for the results. 32 | 33 | Returns 34 | ------- 35 | (DBResponse, DBPagination) 36 | """ 37 | sql_template = """ 38 | SELECT DISTINCT flow_id 39 | FROM {table_name} 40 | {conditions} 41 | {limit} 42 | {offset} 43 | """ 44 | select_sql = sql_template.format( 45 | table_name=self.table_name, 46 | keys=",".join(self.select_columns), 47 | conditions=("WHERE {}".format(" AND ".join(conditions)) if conditions else ""), 48 | limit="LIMIT {}".format(limit) if limit else "", 49 | offset="OFFSET {}".format(offset) if offset else "" 50 | ) 51 | 52 | res, pag = await self.execute_sql(select_sql=select_sql, values=values, fetch_single=False, 53 | expanded=False, 54 | limit=limit, offset=offset, serialize=False) 55 | # process the unserialized DBResponse 56 | _body = [row[0] for row in res.body] 57 | 58 | return DBResponse(res.response_code, _body), pag 59 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/db/tables/metadata.py: -------------------------------------------------------------------------------- 1 | from services.data.db_utils import DBResponse, translate_run_key 2 | from .base import AsyncPostgresTable 3 | from .task import AsyncTaskTablePostgres 4 | from ..models import MetadataRow 5 | # use schema constants from the .data module to keep things consistent 6 | from services.data.postgres_async_db import AsyncMetadataTablePostgres as MetaserviceMetadataTable 7 | 8 | 9 | class AsyncMetadataTablePostgres(AsyncPostgresTable): 10 | _row_type = MetadataRow 11 | table_name = MetaserviceMetadataTable.table_name 12 | task_table_name = AsyncTaskTablePostgres.table_name 13 | keys = MetaserviceMetadataTable.keys 14 | primary_keys = MetaserviceMetadataTable.primary_keys 15 | trigger_keys = MetaserviceMetadataTable.trigger_keys 16 | trigger_operations = ["INSERT"] 17 | trigger_conditions = [ 18 | "NEW.field_name = 'attempt'", 19 | "NEW.field_name = 'attempt_ok'", 20 | "NEW.field_name = 'code-package'", 21 | "NEW.field_name = 'code-package-url'", 22 | ] 23 | 24 | @property 25 | def select_columns(self): 26 | keys = ["{table_name}.{col} AS {col}".format(table_name=self.table_name, col=k) for k in self.keys] 27 | 28 | # Must use SELECT on the regexp matches in order to include non-matches as well, otherwise 29 | # we won't be able to fill attempt_id with NULL in case no id has been recorded 30 | # (f.ex. run-level metadata) 31 | keys.append( 32 | "(SELECT regexp_matches(tags::text, 'attempt_id:(\\d+)'))[1]::int as attempt_id" 33 | ) 34 | return keys 35 | 36 | async def get_run_codepackage_metadata(self, flow_name: str, run_id: str) -> DBResponse: 37 | """ 38 | Tries to locate 'code-package' or 'code-package-url' in run metadata. 
39 | """ 40 | run_id_key, run_id_value = translate_run_key(run_id) 41 | # 'code-package' value contains json with dstype, sha1 hash and location 42 | # 'code-package-url' value contains only location as a string 43 | db_response, *_ = await self.find_records( 44 | conditions=[ 45 | "flow_id = %s", 46 | "{run_id_key} = %s".format( 47 | run_id_key=run_id_key), 48 | "(field_name = %s OR field_name = %s)" 49 | ], 50 | values=[ 51 | flow_name, run_id_value, 52 | "code-package", "code-package-url" 53 | ], 54 | fetch_single=True, expanded=True 55 | ) 56 | 57 | return db_response 58 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/db/utils.py: -------------------------------------------------------------------------------- 1 | 2 | from services.data.db_utils import DBResponse 3 | from services.ui_backend_service.data.db.postgres_async_db import AsyncPostgresDB 4 | 5 | 6 | async def get_run_dag_data(db: AsyncPostgresDB, flow_name: str, run_number: str) -> DBResponse: 7 | """ 8 | Fetches either a _graph_info artifact, or a code-package metadata entry if the artifact is missing. 9 | Used to determine whether a run can display a DAG. 10 | """ 11 | db_response = await db.artifact_table_postgres.get_run_graph_info_artifact(flow_name, run_number) 12 | if not db_response.response_code == 200: 13 | # Try to look for codepackage if graph artifact is missing 14 | db_response = await db.metadata_table_postgres.get_run_codepackage_metadata(flow_name, run_number) 15 | 16 | return db_response 17 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/refiner/__init__.py: -------------------------------------------------------------------------------- 1 | from .task_refiner import TaskRefiner 2 | from .parameter_refiner import ParameterRefiner 3 | from .artifact_refiner import ArtifactRefiner 4 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/refiner/artifact_refiner.py: -------------------------------------------------------------------------------- 1 | from .refinery import Refinery 2 | 3 | 4 | class ArtifactRefiner(Refinery): 5 | """ 6 | Refiner class for postprocessing Artifact rows. 7 | 8 | Uses Metaflow Client API to refine Artifact's actual content from Metaflow Service and Datastore. 9 | 10 | Parameters 11 | ----------- 12 | cache : AsyncCacheClient 13 | An instance of a cache that implements the GetArtifacts action. 
14 | """ 15 | 16 | def __init__(self, cache): 17 | super().__init__(cache=cache) 18 | 19 | def _action(self): 20 | return self.cache_store.cache.GetArtifacts 21 | 22 | def _record_to_action_input(self, record): 23 | # Prefer run_id over run_number 24 | # Prefer task_name over task_id 25 | return "{flow_id}/{run_id}/{step_name}/{task_name}/{name}/{attempt_id}".format( 26 | flow_id=record['flow_id'], 27 | run_id=record.get('run_id') or record['run_number'], 28 | step_name=record['step_name'], 29 | task_name=record.get('task_name') or record['task_id'], 30 | name=record['name'], 31 | attempt_id=record['attempt_id']) 32 | 33 | async def refine_record(self, record, values): 34 | record['content'] = str(values) 35 | return record 36 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/refiner/parameter_refiner.py: -------------------------------------------------------------------------------- 1 | from .refinery import Refinery 2 | 3 | 4 | class ParameterRefiner(Refinery): 5 | """ 6 | Refiner class for postprocessing Run parameters. 7 | 8 | Uses Metaflow Client API to refine Run parameters from Metaflow Datastore. 9 | 10 | Parameters 11 | ----------- 12 | cache : AsyncCacheClient 13 | An instance of a cache that implements the GetParameters action. 14 | """ 15 | 16 | def __init__(self, cache): 17 | super().__init__(cache=cache) 18 | 19 | def _action(self): 20 | return self.cache_store.cache.GetParameters 21 | 22 | async def fetch_data(self, targets, event_stream=None, invalidate_cache=False): 23 | _res = await self._action()(targets, invalidate_cache=invalidate_cache) 24 | if _res.has_pending_request(): 25 | async for event in _res.stream(): 26 | if event["type"] == "error": 27 | # raise error, there was an exception during processing. 28 | raise GetParametersFailed(event["message"], event["id"], event["traceback"]) 29 | await _res.wait() # wait for results to be ready 30 | return _res.get() or {} # cache get() might return None if no keys are produced. 31 | 32 | def _record_to_action_input(self, record): 33 | # Prefer run_id over run_number 34 | return "{flow_id}/{run_id}".format( 35 | flow_id=record['flow_id'], 36 | run_id=record.get('run_id') or record['run_number']) 37 | 38 | async def refine_record(self, record, values): 39 | return {k: {'value': v} for k, v in values.items()} 40 | 41 | 42 | class GetParametersFailed(Exception): 43 | def __init__(self, msg="Failed to Get Parameters", id="failed-to-get-parameters", traceback_str=None): 44 | self.message = msg 45 | self.id = id 46 | self.traceback_str = traceback_str 47 | 48 | def __str__(self): 49 | return self.message 50 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/refiner/refinery.py: -------------------------------------------------------------------------------- 1 | from services.data.db_utils import DBResponse 2 | from services.ui_backend_service.features import FEATURE_REFINE_DISABLE 3 | from services.ui_backend_service.data import unpack_processed_value 4 | from services.utils import logging 5 | 6 | 7 | class Refinery(object): 8 | """ 9 | Refiner class for postprocessing database rows. 10 | 11 | Uses predefined cache actions to refine database responses with Metaflow Datastore artifacts. 12 | 13 | Parameters 14 | ----------- 15 | cache : AsyncCacheClient 16 | An instance of a cache that implements the GetArtifacts action. 
17 | """ 18 | 19 | def __init__(self, cache): 20 | self.cache_store = cache 21 | self.logger = logging.getLogger(self.__class__.__name__) 22 | 23 | def _action(self): 24 | return self.cache_store.cache.GetData 25 | 26 | async def fetch_data(self, targets, event_stream=None, invalidate_cache=False): 27 | _res = await self._action()(targets, invalidate_cache=invalidate_cache) 28 | if _res.has_pending_request(): 29 | async for event in _res.stream(): 30 | if event["type"] == "error": 31 | if event_stream: 32 | event_stream(event) 33 | await _res.wait() # wait for results to be ready 34 | return _res.get() or {} # cache get() might return None if no keys are produced. 35 | 36 | async def refine_record(self, record, values): 37 | """No refinement necessary here""" 38 | return record 39 | 40 | def _response_to_action_input(self, response: DBResponse): 41 | if isinstance(response.body, list): 42 | return [self._record_to_action_input(task) for task in response.body] 43 | else: 44 | return [self._record_to_action_input(response.body)] 45 | 46 | def _record_to_action_input(self, record): 47 | return "{flow_id}/{run_number}/{step_name}/{task_id}".format(**record) 48 | 49 | async def postprocess(self, response: DBResponse, invalidate_cache=False): 50 | """ 51 | Calls the refiner postprocessing to fetch Metaflow artifacts. 52 | 53 | Parameters 54 | ---------- 55 | response : DBResponse 56 | The DBResponse to be refined 57 | 58 | Returns 59 | ------- 60 | A refined DBResponse, or in case of errors, the original DBResponse 61 | """ 62 | if FEATURE_REFINE_DISABLE: 63 | return response 64 | 65 | if response.response_code != 200 or not response.body: 66 | return response 67 | 68 | input = self._response_to_action_input(response) 69 | 70 | errors = {} 71 | 72 | def _event_stream(event): 73 | if event.get("type") == "error" and event.get("key"): 74 | # Get last element from cache key which usually translates to "target" 75 | target = event["key"].split(':')[-1:][0] 76 | errors[target] = event 77 | 78 | data = await self.fetch_data( 79 | input, event_stream=_event_stream, invalidate_cache=invalidate_cache) 80 | 81 | async def _process(record): 82 | target = self._record_to_action_input(record) 83 | 84 | if target in errors: 85 | # Add streamed postprocess errors if any 86 | record["postprocess_error"] = format_error_body( 87 | errors[target].get("id"), 88 | errors[target].get("message"), 89 | errors[target].get("traceback") 90 | ) 91 | 92 | if target in data: 93 | success, value, detail, trace = unpack_processed_value(data[target]) 94 | if success: 95 | record = await self.refine_record(record, value) 96 | else: 97 | record['postprocess_error'] = format_error_body( 98 | value if value else "artifact-handle-failed", 99 | detail if detail else "Unknown error during postprocessing", 100 | trace 101 | ) 102 | else: 103 | record['postprocess_error'] = format_error_body( 104 | "artifact-value-not-found", 105 | "Artifact value not found" 106 | ) 107 | 108 | return record 109 | 110 | if isinstance(response.body, list): 111 | body = [await _process(task) for task in response.body] 112 | else: 113 | body = await _process(response.body) 114 | 115 | return DBResponse(response_code=response.response_code, body=body) 116 | 117 | 118 | def format_error_body(id=None, detail=None, traceback=None): 119 | ''' 120 | formatter for the "postprocess_error" key added to refined items in case of errors. 
121 | ''' 122 | return { 123 | "id": id or "artifact-refine-failure", 124 | "detail": detail, 125 | "traceback": traceback 126 | } 127 | -------------------------------------------------------------------------------- /services/ui_backend_service/data/refiner/task_refiner.py: -------------------------------------------------------------------------------- 1 | from .refinery import Refinery 2 | 3 | 4 | class TaskRefiner(Refinery): 5 | """ 6 | Refiner class for postprocessing Task rows. 7 | 8 | Uses Metaflow Client API to refine Task's actual status from Metaflow Service and Datastore. 9 | 10 | Parameters 11 | ----------- 12 | cache : AsyncCacheClient 13 | An instance of a cache that implements the GetTask action. 14 | """ 15 | 16 | def __init__(self, cache): 17 | super().__init__(cache=cache) 18 | 19 | def _action(self): 20 | return self.cache_store.cache.GetTask 21 | 22 | def _record_to_action_input(self, record): 23 | # Prefer run_id over run_number 24 | # Prefer task_name over task_id 25 | return "{flow_id}/{run_id}/{step_name}/{task_name}/{attempt_id}".format( 26 | flow_id=record['flow_id'], 27 | run_id=record.get('run_id') or record['run_number'], 28 | step_name=record['step_name'], 29 | task_name=record.get('task_name') or record['task_id'], 30 | attempt_id=record['attempt_id']) 31 | 32 | async def refine_record(self, record, values): 33 | if record['status'] == 'unknown' and values.get('_task_ok') is not None: 34 | value = values['_task_ok'] 35 | if value is False: 36 | record['status'] = 'failed' 37 | elif value is True: 38 | record['status'] = 'completed' 39 | 40 | if values.get('_foreach_stack'): 41 | value = values['_foreach_stack'] 42 | if len(value) > 0 and len(value[0]) >= 4: 43 | # The third one in the tuple is the foreach index. We access this way for backwards compatibility. 44 | record['foreach_label'] = "{}[{}]".format(record['task_id'], value[0][3]) 45 | 46 | return record 47 | -------------------------------------------------------------------------------- /services/ui_backend_service/docs/README.md: -------------------------------------------------------------------------------- 1 | # Metaflow UI Service Documentation 2 | 3 | ## Table of Contents 4 | 5 | - Optional configuration 6 | - [Configurable environment variables](environment.md) 7 | - [Plugin system](plugins.md) 8 | - API Documentation 9 | - [REST API routes](api.md) 10 | - [Realtime resource subscriptions](websockets.md#realtime-state-subscriptions-for-resources) 11 | - [Artifact Search](websockets.md#search-api) 12 | - Architecture descriptions 13 | - [Disk Cache structure](architecture.md#cache) 14 | - [Heartbeat Monitoring of active resources](architecture.md#heartbeat-monitoring) 15 | - [Realtime events through websockets](architecture.md#realtime-events-over-web-sockets) 16 | -------------------------------------------------------------------------------- /services/ui_backend_service/docs/api.md: -------------------------------------------------------------------------------- 1 | # API documentation 2 | 3 | A thorough documentation of the RESTful API routes, responses and types can be accessed through the Swagger docs that the backend serves. 
4 | These are accessible at `example.com/api/doc` 5 | 6 | ## Examples 7 | 8 | ``` 9 | /flows/HelloFlow/runs?_page=4 List page 4 10 | /flows/HelloFlow/runs?_page=2&_limit=10 List page 4, each page contains 10 items 11 | 12 | /flows/HelloFlow/runs?_order=run_number Order by `run_number` in descending order 13 | /flows/HelloFlow/runs?_order=+run_number Order by `run_number` in ascending order 14 | /flows/HelloFlow/runs?_order=-run_number Order by `run_number` in descending order 15 | /flows/HelloFlow/runs?_order=run_number,ts_epoch Order by `run_number` and `ts_epoch` in descending order 16 | 17 | /runs?_tags=user:dipper Filter by one tag 18 | /runs?_tags=user:dipper,runtime:dev Filter by multiple tags (AND) 19 | /runs?_tags:all=user:dipper,runtime:dev Filter by multiple tags (AND) 20 | /runs?_tags:any=user:dipper,runtime:dev Filter by multiple tags (OR) 21 | /runs?_tags:likeall=user:dip,untime:de Filter by multiple tags that contains string (AND) 22 | /runs?_tags:likeany=user:,untime:de Filter by multiple tags that contains string (OR) 23 | 24 | /runs?_group=flow_id Group by `flow_id` 25 | /runs?_group=flow_id,user_name Group by `flow_id` and `user_name` 26 | /runs?_group=user_name&_limit=2 Group by `user_name` and limit each group to `2` runs 27 | /runs?_group=flow_id&_order=flow_id,run_number Group by `flow_id` and order by `flow_id & run_number` 28 | /runs?_group=flow_id&user_name=dipper List runs by `dipper` and group by `flow_id` 29 | /runs?user=null `user` is NULL 30 | 31 | /flows/HelloFlow/runs?run_number=40 `run_number` equals `40` 32 | /flows/HelloFlow/runs?run_number:eq=40 `run_number` equals `40` 33 | /flows/HelloFlow/runs?run_number:ne=40 `run_number` not equals `40` 34 | /flows/HelloFlow/runs?run_number:lt=40 `run_number` less than `40` 35 | /flows/HelloFlow/runs?run_number:le=40 `run_number` less than or equals `40` 36 | /flows/HelloFlow/runs?run_number:gt=40 `run_number` greater than `40` 37 | /flows/HelloFlow/runs?run_number:ge=40 `run_number` greater than equals `40` 38 | 39 | /flows/HelloFlow/runs?user_name:co=atia `user_name` contains `atia` 40 | /flows/HelloFlow/runs?user_name:sw=mati `user_name` starts with `mati` 41 | /flows/HelloFlow/runs?user_name:ew=tias `user_name` ends with `tias` 42 | 43 | /flows?user_name=dipper,mabel `user_name` is either `dipper` OR `mabel` 44 | 45 | /flows/HelloFlow/runs?run_number:lt=60&run_number:gt=40 `run_number` less than 60 and greater than 40 46 | ``` 47 | 48 | ## Available operators 49 | 50 | | URL operator | Description | SQL operator | 51 | |--------------|-------------------------|--------------| 52 | | `eq` | equals | `=` | 53 | | `ne` | not equals | `!=` | 54 | | `lt` | less than | `<` | 55 | | `le` | less than equals | `<=` | 56 | | `gt` | greater than | `>` | 57 | | `ge` | greater than equals | `>=` | 58 | | `co` | contains | `*string*` | 59 | | `sw` | starts with | `^string*` | 60 | | `ew` | ends with | `*string$` | 61 | | `is` | is | `IS` | 62 | | `li` | is like (use with %val) | `ILIKE` | -------------------------------------------------------------------------------- /services/ui_backend_service/docs/architecture.md: -------------------------------------------------------------------------------- 1 | # Architecture documentation for UI Service 2 | 3 | ## Cache 4 | 5 | ![Cache architecture diagram](images/cache_architecture.png) 6 | 7 | The Cache system is split into three main components 8 | - [Async Cache client](#async-cache-client) (cache interface) 9 | - [CacheFuture](#cachefuture) (awaitable cache result) 10 | - [Cache 
server](#cache-server) (worker queueing) 11 | - [Cache worker](#cache-worker) (execution of cache requests) 12 | 13 | ### Async Cache Client 14 | 15 | The Cache client is the interface for accessing cached content. It is responsible for setting up and starting the Cache server as a *subprocess*, and for setting the necessary configuration variables, such as the maximum allowed disk space and the number of cache workers. 16 | 17 | The cache client instance exposes a number of Cache Actions once the server subprocess has successfully started. These actions are the entry points for requesting cached content. The response of a cache action is an awaitable `CacheFuture`. 18 | 19 | #### CacheFuture 20 | 21 | The inner workings of the cache future are best explained with an example. Take the following cache action 22 | 23 | ```python 24 | result = await cache_client_instance.GetArtifacts("s3://location") 25 | ``` 26 | The `result` will be a CacheFuture instance, which checks whether all cache keys required by the request are present on disk (a cache hit). 27 | 28 | In case of a cache miss, the CacheFuture sends a cache request through the Cache Client instance and then waits before checking for the keys on disk again. The cache keys finally appear once a worker has finished processing the action. The future has a very generous timeout, so if the worker, server or client runs into an issue, it will take a while for the future to time out. 29 | 30 | ### Cache Server 31 | 32 | The cache server is responsible for receiving cache requests from the subprocess stdin, queueing the requests, and starting cache workers to process the queue, up to a limit. Note that the cache workers run their cache actions as *subprocesses* of the cache server. 33 | 34 | Each cache server is responsible for maintaining a non-ephemeral cache worker pool. The UI Service has multiple cache worker pools for different types of resources, such as DAGs and artifacts. The size of each pool can be controlled [via environment variables](./environment.md). 35 | 36 | To start a cache worker, the server writes the request payload to disk as a `request.json` tempfile, which the worker process then reads at start. 37 | 38 | ### Cache Worker 39 | 40 | The cache worker is a subprocess whose sole responsibility is to read the request payload from `request.json`, execute the corresponding cache action with the inputs contained in the request, and persist the produced cache keys to disk. 41 | 42 | ## Heartbeat monitoring 43 | 44 | ![Heartbeat monitoring architecture diagram](images/heartbeat_monitoring.png) 45 | 46 | Heartbeat monitoring is required to track in-flight resources that might stop executing without producing any trace of failure. 47 | 48 | ### Basic structure 49 | A heartbeat monitor keeps a list of resources along with their latest heartbeat timestamps. The list is iterated through periodically (heartbeat interval + buffer), and further processing is done on items whose timestamp has expired, for example broadcasting them as failures. 50 | 51 | Adding items for tracking is implemented with the `PyEE` event emitter internally. A `HeartbeatMonitor` class sets up its event listeners for adding and removing tracked items; a minimal sketch of this pattern is shown below.
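To make the add/expire cycle concrete, the following is a minimal sketch of the pattern, not the service's actual `HeartbeatMonitor`: the event names, the interval and the `on_expired` callback are illustrative assumptions, and the `AsyncIOEventEmitter` import path varies between pyee versions.

```python
import asyncio
import time

from pyee import AsyncIOEventEmitter  # newer pyee releases expose this under pyee.asyncio


class MinimalHeartbeatMonitor:
    """Tracks `key -> latest heartbeat (epoch seconds)` and reports expired items."""

    def __init__(self, emitter: AsyncIOEventEmitter, on_expired, interval: int = 10):
        self.watched = {}
        self.on_expired = on_expired
        self.interval = interval
        # Listeners mirror the add/update and remove broadcasts sent by ListenNotify.
        emitter.on("heartbeat-add", self.heartbeat)
        emitter.on("heartbeat-remove", self.remove)

    def heartbeat(self, key, timestamp=None):
        # Add a new item or refresh the timestamp of an already tracked one.
        self.watched[key] = timestamp or time.time()

    def remove(self, key):
        self.watched.pop(key, None)

    async def check_loop(self):
        while True:
            await asyncio.sleep(self.interval)
            cutoff = time.time() - 2 * self.interval  # heartbeat interval + buffer
            for key, ts in list(self.watched.items()):
                if ts < cutoff:
                    self.remove(key)
                    self.on_expired(key)  # e.g. broadcast the resource as failed
```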
Monitoring responsibilities are shared with the `ListenNotify` component as follows: 52 | 53 | [`HeartbeatMonitor`](../api/heartbeat_monitor.py) 54 | - periodically checks for expired heartbeats on tracked items 55 | - manages the list of tracked items (add/update/remove) 56 | 57 | [`ListenNotify`](../api/notify.py) 58 | - broadcasts resources whose heartbeats should be added or updated for tracking 59 | - broadcasts when a resource should be removed from heartbeat tracking (completion events) 60 | 61 | ## Realtime events over web sockets 62 | 63 | ![Websocket architecture diagram](images/websocket_communication.png) 64 | 65 | ### Basic structure 66 | For receiving realtime events regarding specific resources, there are two distinct components that interact over `PyEE` events: `Websocket` and `ListenNotify`. Their respective responsibilities are as follows: 67 | 68 | [`Websocket`](../api/ws.py) 69 | - handles opening web sockets and subscribing to resources 70 | - receives resources for broadcasting to subscribers 71 | - handles loading (from the database) and broadcasting of resources to affected subscriptions 72 | 73 | [`ListenNotify`](../api/notify.py) 74 | - broadcasts resources received from database events -------------------------------------------------------------------------------- /services/ui_backend_service/docs/images/cache_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Netflix/metaflow-service/9e47d2d85e127d2673d457dde7ae535a3341de0f/services/ui_backend_service/docs/images/cache_architecture.png -------------------------------------------------------------------------------- /services/ui_backend_service/docs/images/heartbeat_monitoring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Netflix/metaflow-service/9e47d2d85e127d2673d457dde7ae535a3341de0f/services/ui_backend_service/docs/images/heartbeat_monitoring.png -------------------------------------------------------------------------------- /services/ui_backend_service/docs/images/websocket_communication.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Netflix/metaflow-service/9e47d2d85e127d2673d457dde7ae535a3341de0f/services/ui_backend_service/docs/images/websocket_communication.png -------------------------------------------------------------------------------- /services/ui_backend_service/docs/websockets.md: -------------------------------------------------------------------------------- 1 | # Documentation for Web Socket endpoints 2 | 3 | - Realtime state subscriptions for resources 4 | - [Subscribing and unsubscribing](#subscribing-and-unsubscribing) 5 | - [Resources](#resources) 6 | - Search API 7 | - [Searching](#searching) 8 | - [Search responses](#search-responses) 9 | 10 | ## Realtime state subscriptions for resources 11 | 12 | ### Subscribing and unsubscribing 13 | Subscribing to a RESTful resource's realtime events is done by sending the following message to the 14 | `ws://HOSTNAME/ws` endpoint.
15 | 16 | ```json 17 | { 18 | "type": "SUBSCRIBE", 19 | "resource": "path-to-subscribable-restful-resource", 20 | "uuid": "client-generated-uuid" 21 | } 22 | ``` 23 | 24 | Subscribe to future events and return past data since unix time (seconds): 25 | ```json 26 | { 27 | "type": "SUBSCRIBE", 28 | "resource": "path-to-subscribable-restful-resource", 29 | "uuid": "client-generated-uuid", 30 | "since": 1602752197 31 | } 32 | ``` 33 | 34 | Unsubscribing is done through the same endpoint with the message: 35 | ```json 36 | { 37 | "type": "UNSUBSCRIBE", 38 | "uuid": "existing-client-generated-uuid" 39 | } 40 | ``` 41 | 42 | ### Resources 43 | The subscribable resource endpoints are listed below. All subscriptions also adhere to the corresponding RESTful route's query parameters to further filter received messages. 44 | 45 | ``` 46 | /flow_name/runs/ 47 | /flow_name/runs/run_number 48 | /flow_name/runs/run_number/steps 49 | /flow_name/runs/run_number/steps/step_name 50 | /flow_name/runs/run_number/steps/step_name/tasks 51 | /flow_name/runs/run_number/steps/step_name/tasks/task_id 52 | /flow_name/runs/run_number/steps/step_name/tasks/task_id/logs/out 53 | /flow_name/runs/run_number/steps/step_name/tasks/task_id/logs/err 54 | ``` 55 | 56 | ### Received messages 57 | The web socket client can receive three types of messages for its subscription: 58 | 59 | ```json 60 | { 61 | "type": "type-of-event", 62 | "resource": "path/of/subscribed/resource", 63 | "data": {}, 64 | "uuid": "uuid-of-subscription" 65 | } 66 | ``` 67 | The type can be one of `INSERT`, `UPDATE` or `DELETE`, corresponding to the respective database actions. 68 | The `data` property contains the complete object of the subscribed resource, as it would be received from a basic GET request. 69 | 70 | # Search API 71 | 72 | The Search API provides a way to search which tasks have matching artifacts for a given run. Searching is performed through a websocket connection. 73 | 74 | ## Searching 75 | 76 | The endpoint to perform searches for a given run looks like: 77 | ``` 78 | ws://HOSTNAME/flows/flow_id/runs/run_number/search?key=ARTIFACT_NAME&value=VALUE 79 | ``` 80 | where `ARTIFACT_NAME` is the name of the artifact to look for, and `VALUE` is the artifact content that we are searching for. 81 | 82 | ### Search Responses 83 | When the web socket opens for the search, the client starts receiving messages. These include progress, possible errors, and eventually the results. 84 | 85 | Progress message example: 86 | ```json 87 | { 88 | "event": { 89 | "type": "progress", 90 | "fraction": 1 91 | } 92 | } 93 | ``` 94 | The fraction is the proportion of objects loaded for the search so far, with `1` meaning everything has been loaded. 95 | 96 | Error example: 97 | ```json 98 | { 99 | "event": { 100 | "type": "error", 101 | "message": "error message", 102 | "id": "uniqueErrorId" 103 | } 104 | } 105 | ``` 106 | The unique id is either the class name of the exception or a custom id.
Here are some of the most common ones: 107 | | Error ID | Description | 108 | |---------------------------|-------------------------------------------------------------------| 109 | | `MetaflowS3AccessDenied` | server does not have access to s3 bucket | 110 | | `MetaflowS3NotFound` | s3 404 response | 111 | | `MetaflowS3URLException` | malformed s3 url | 112 | | `MetaflowS3Exception` | something went wrong with s3 access | 113 | | `artifact-handle-failed` | something went wrong with processing the artifact | 114 | 115 | Results example: 116 | ```json 117 | { 118 | "event": { 119 | "type": "result", 120 | "matches": [ 121 | { 122 | "flow_id": "FlowName", 123 | "run_number": 123, 124 | "step_name": "some_step", 125 | "task_id": 456, 126 | "searchable": true 127 | } 128 | ] 129 | } 130 | } 131 | ``` 132 | The `searchable` boolean of a single task conveys whether the task had an artifact that could be included in the search process. 133 | 134 | -------------------------------------------------------------------------------- /services/ui_backend_service/download_ui.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 6 | 7 | FILENAME="metaflow-ui.zip" 8 | DEST=${1:-$DIR/ui} 9 | 10 | UI_RELEASE_URL="https://github.com/Netflix/metaflow-ui/releases/download/${UI_VERSION}/metaflow-ui-${UI_VERSION}.zip" 11 | 12 | if [ $UI_ENABLED = "1" ] 13 | then 14 | echo "Download UI version ${UI_VERSION} from $UI_RELEASE_URL to $DEST" 15 | curl -L $UI_RELEASE_URL -o $FILENAME 16 | unzip -o $FILENAME -d $DEST 17 | rm $FILENAME 18 | else 19 | echo "UI not enabled, skip download." 20 | fi -------------------------------------------------------------------------------- /services/ui_backend_service/example.custom_quicklinks.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "href": "https://docs.metaflow.org/", 4 | "label": "Metaflow documentation" 5 | }, 6 | { 7 | "href": "https://github.com/Netflix/metaflow", 8 | "label": "GitHub" 9 | } 10 | ] -------------------------------------------------------------------------------- /services/ui_backend_service/example.notifications.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "created": 1618404534000, 4 | "message": "Upcoming service maintenance" 5 | } 6 | ] -------------------------------------------------------------------------------- /services/ui_backend_service/example.plugins.json: -------------------------------------------------------------------------------- 1 | { 2 | "plugin-example": "git@github.com:Netflix/metaflow-ui-plugin-example.git" 3 | } -------------------------------------------------------------------------------- /services/ui_backend_service/features.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | FEATURE_ENV_PREFIX = 'FEATURE_' 4 | 5 | 6 | def get_features(): 7 | """ 8 | Get a dict of features that are enabled or disabled for the process 9 | 10 | Returns 11 | ------- 12 | Dict 13 | example: 14 | { 15 | "FEATURE_SOME_FEAT": True 16 | } 17 | """ 18 | features = {} 19 | for key, val in os.environ.items(): 20 | if key.startswith(FEATURE_ENV_PREFIX): 21 | val = val.lower() 22 | features[key] = val != '0' and val != 'false' and val != 'f' 23 | return features 24 | 25 | 26 | FEATURES = get_features() 27 | 28 | FEATURE_PREFETCH_DISABLE = 
FEATURES.get('FEATURE_PREFETCH_DISABLE', False) 29 | FEATURE_CACHE_DISABLE = FEATURES.get('FEATURE_CACHE_DISABLE', False) 30 | FEATURE_S3_DISABLE = FEATURES.get('FEATURE_S3_DISABLE', False) 31 | FEATURE_REFINE_DISABLE = FEATURES.get('FEATURE_REFINE_DISABLE', False) 32 | 33 | FEATURE_PREFETCH_ENABLE = not FEATURE_PREFETCH_DISABLE 34 | FEATURE_CACHE_ENABLE = not FEATURE_CACHE_DISABLE 35 | FEATURE_S3_ENABLE = not FEATURE_S3_DISABLE 36 | FEATURE_REFINE_ENABLE = not FEATURE_REFINE_DISABLE 37 | 38 | FEATURE_WS_DISABLE = FEATURES.get('FEATURE_WS_DISABLE', False) 39 | FEATURE_DB_LISTEN_DISABLE = FEATURES.get('FEATURE_DB_LISTEN_DISABLE', False) 40 | FEATURE_HEARTBEAT_DISABLE = FEATURES.get('FEATURE_HEARTBEAT_DISABLE', False) 41 | 42 | FEATURE_WS_ENABLE = not FEATURE_WS_DISABLE 43 | FEATURE_DB_LISTEN_ENABLE = not FEATURE_DB_LISTEN_DISABLE 44 | FEATURE_HEARTBEAT_ENABLE = not FEATURE_HEARTBEAT_DISABLE 45 | 46 | if FEATURE_S3_DISABLE: 47 | os.environ["AWS_ACCESS_KEY_ID"] = "None" 48 | os.environ["AWS_SECRET_ACCESS_KEY"] = "None" 49 | os.environ["AWS_DEFAULT_REGION"] = "None" 50 | -------------------------------------------------------------------------------- /services/ui_backend_service/frontend.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | 4 | from aiohttp import web 5 | 6 | dirname = os.path.dirname(os.path.realpath(__file__)) 7 | static_ui_path = os.path.join(dirname, "ui") 8 | 9 | 10 | METAFLOW_SERVICE = os.environ.get("METAFLOW_SERVICE", "/") 11 | 12 | METAFLOW_HEAD = os.environ.get("METAFLOW_HEAD", None) 13 | METAFLOW_BODY_BEFORE = os.environ.get("METAFLOW_BODY_BEFORE", None) 14 | METAFLOW_BODY_AFTER = os.environ.get("METAFLOW_BODY_AFTER", None) 15 | 16 | 17 | class Frontend(object): 18 | """ 19 | Provides routes for the static UI webpage. 20 | Require this as the last Api, as it is a catch-all route. 21 | """ 22 | 23 | def __init__(self, app): 24 | app.router.add_static('/static', 25 | path=os.path.join(static_ui_path, "static"), 26 | name='static') 27 | 28 | # serve the root static files separately. 29 | static_files = glob.glob(os.path.join(static_ui_path, "*.*")) 30 | for filepath in static_files: 31 | filename = filepath[len(static_ui_path) + 1:] 32 | app.router.add_route( 33 | 'GET', f'/{filename}', self.serve_file(filename)) 34 | 35 | # catch-all route that unfortunately messes with root static file serving. 36 | # Refreshing SPA pages won't work without the tail. 37 | app.router.add_route('GET', '/{tail:.*}', self.serve_index_html) 38 | 39 | def serve_file(self, filename: str): 40 | "Generator for single static file serving handlers" 41 | async def filehandler(request): 42 | return web.FileResponse(os.path.join(static_ui_path, filename)) 43 | return filehandler 44 | 45 | async def serve_index_html(self, request): 46 | "Serve index.html by injecting `METAFLOW_SERVICE` variable to define API base url." 
47 | try: 48 | with open(os.path.join(static_ui_path, "index.html")) as f: 49 | content = f.read() \ 50 | .replace("</head>", 51 | "<script>window.METAFLOW_SERVICE='{METAFLOW_SERVICE}'</script></head>".format(METAFLOW_SERVICE=METAFLOW_SERVICE)) 52 | 53 | if METAFLOW_HEAD: 54 | content = content.replace("</head>", "{METAFLOW_HEAD}</head>" 55 | .format(METAFLOW_HEAD=METAFLOW_HEAD)) 56 | 57 | if METAFLOW_BODY_BEFORE: 58 | content = content.replace("<body>", "<body>{METAFLOW_BODY_BEFORE}" 59 | .format(METAFLOW_BODY_BEFORE=METAFLOW_BODY_BEFORE)) 60 | 61 | if METAFLOW_BODY_AFTER: 62 | content = content.replace("</body>", "{METAFLOW_BODY_AFTER}</body>" 63 | .format(METAFLOW_BODY_AFTER=METAFLOW_BODY_AFTER)) 64 | 65 | return web.Response(text=content, content_type='text/html') 66 | except Exception as err: 67 | return web.Response(text=str(err), status=500, content_type='text/plain') 68 | -------------------------------------------------------------------------------- /services/ui_backend_service/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | from services.utils import logging 2 | from ..api.utils import get_json_config 3 | 4 | from .plugin import (Plugin, PluginException) 5 | 6 | _PLUGINS = [] 7 | 8 | logger = logging.getLogger("Plugin") 9 | 10 | 11 | def list_plugins(): 12 | global _PLUGINS 13 | return _PLUGINS 14 | 15 | 16 | def init_plugins(): 17 | global _PLUGINS 18 | 19 | logger.info("Init plugins") 20 | 21 | plugins = get_json_config("plugins") 22 | if plugins: 23 | global_auth = None 24 | if "auth" in plugins and isinstance(plugins["auth"], dict): 25 | global_auth = plugins["auth"] 26 | 27 | for identifier, value in plugins.items(): 28 | if isinstance(value, str): 29 | repository = value 30 | ref = None 31 | parameters = {} 32 | paths = None 33 | auth = global_auth 34 | elif identifier == "auth": 35 | continue 36 | elif isinstance(value, dict): 37 | repository = value.get("repository", None) 38 | ref = value.get("ref", None) 39 | parameters = value.get("parameters", {}) 40 | paths = value.get("paths", None) 41 | if "auth" in value: 42 | auth = value.get("auth", None) 43 | else: 44 | auth = global_auth 45 | else: 46 | logger.warning(" [{}] Invalid plugin format, skipping".format(identifier)) 47 | continue 48 | 49 | if paths and isinstance(paths, list): 50 | for path in paths: 51 | _load_plugin(identifier=identifier, repository=repository, ref=ref, parameters=parameters, path=path, auth=auth) 52 | else: 53 | _load_plugin(identifier=identifier, repository=repository, ref=ref, parameters=parameters, auth=auth) 54 | 55 | logger.info("Plugins ready: {}".format(list(map(lambda p: p.identifier, _PLUGINS)))) 56 | 57 | 58 | def _load_plugin(identifier: str, repository: str = None, ref: str = None, parameters: dict = {}, path: str = None, auth: dict = {}): 59 | global _PLUGINS 60 | try: 61 | plugin = Plugin(identifier=identifier, repository=repository, ref=ref, parameters=parameters, path=path, auth=auth) 62 | _PLUGINS.append(plugin.init()) 63 | except PluginException as err: 64 | logger.error(" [{}:{}] PluginException: {}".format(identifier, path, err)) 65 | except Exception as err: 66 | logger.error(" [{}:{}] Unknown error loading plugin {}".format(identifier, path, err)) 67 | 68 | 69 | def _reset_plugins(): 70 | global _PLUGINS 71 | _PLUGINS = [] 72 | -------------------------------------------------------------------------------- /services/ui_backend_service/plugins/installed/.gitignore: -------------------------------------------------------------------------------- 1 | * --------------------------------------------------------------------------------
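Judging from the `init_plugins` parsing logic above, an entry in the plugins config can be either a plain repository string (as in `example.plugins.json`) or an object with `repository`, `ref`, `parameters`, `paths` and `auth` keys, with an optional top-level `auth` object acting as a shared fallback. A hypothetical config combining both forms could look like the sketch below; the field values and the shape of the `auth` objects are illustrative assumptions.

```json
{
  "auth": { "token": "shared-access-token" },
  "plugin-example": "git@github.com:Netflix/metaflow-ui-plugin-example.git",
  "another-plugin": {
    "repository": "https://example.com/org/metaflow-ui-plugins.git",
    "ref": "v1.0.0",
    "paths": ["plugins/first", "plugins/second"],
    "parameters": { "color": "blue" },
    "auth": { "token": "overrides-the-shared-token" }
  }
}
```

Entries listed under `paths` cause `_load_plugin` to be called once per path, so a single repository can provide several plugins.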
/services/ui_backend_service/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp >= 3.8.1, < 4 2 | pyee==8.0.1 3 | throttler==1.2.0 4 | packaging 5 | psycopg2 6 | aiopg 7 | pygit2==1.12.1 8 | aiohttp_cors==0.7.0 9 | metaflow>=2.11.4 10 | click==8.0.3 11 | azure-storage-blob==12.13.1 12 | azure-identity==1.16.1 13 | google-cloud-storage~=2.10.0 14 | -------------------------------------------------------------------------------- /services/ui_backend_service/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Netflix/metaflow-service/9e47d2d85e127d2673d457dde7ae535a3341de0f/services/ui_backend_service/tests/__init__.py -------------------------------------------------------------------------------- /services/ui_backend_service/tests/integration_tests/__init__.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # we need to register the utils helper for assert rewriting in order to get descriptive assertion errors. 4 | pytest.register_assert_rewrite("services.ui_backend_service.tests.integration_tests.utils") 5 | -------------------------------------------------------------------------------- /services/ui_backend_service/tests/integration_tests/features_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from .utils import ( 3 | init_app, set_env, cli 4 | ) 5 | pytestmark = [pytest.mark.integration_tests] 6 | 7 | 8 | async def get_features(cli): 9 | return await (await cli.get('/features')).json() 10 | 11 | 12 | async def expect_features(cli, expected_features={}): 13 | assert await get_features(cli) == expected_features 14 | 15 | 16 | async def test_features_none(cli): 17 | with set_env(): 18 | assert await get_features(cli) == {} 19 | 20 | 21 | async def test_features_true(cli): 22 | with set_env({ 23 | 'FEATURE_ONE': 'true', 24 | 'FEATURE_SECOND': 'foo', 25 | 'FEATURE_THIRD': '1', 26 | 'FEATURE_FOURTH': '', 27 | 'FEATURE_FIFTH': ' ' 28 | }): 29 | await expect_features(cli, { 30 | 'FEATURE_ONE': True, 31 | 'FEATURE_SECOND': True, 32 | 'FEATURE_THIRD': True, 33 | 'FEATURE_FOURTH': True, 34 | 'FEATURE_FIFTH': True 35 | }) 36 | 37 | 38 | async def test_features_false(cli): 39 | with set_env({'FEATURE_ONE': 'false'}): 40 | await expect_features(cli, { 41 | 'FEATURE_ONE': False 42 | }) 43 | 44 | 45 | async def test_features_f(cli): 46 | with set_env({'FEATURE_ONE': 'f'}): 47 | await expect_features(cli, { 48 | 'FEATURE_ONE': False 49 | }) 50 | 51 | 52 | async def test_features_0(cli): 53 | with set_env({'FEATURE_ONE': '0'}): 54 | await expect_features(cli, { 55 | 'FEATURE_ONE': False 56 | }) 57 | 58 | 59 | async def test_features_only(cli): 60 | with set_env({ 61 | 'FEATURE_FOO': 'true', 62 | 'ANOTHER_ENV_VAR': 'bar' 63 | }): 64 | await expect_features(cli, { 65 | 'FEATURE_FOO': True 66 | }) 67 | -------------------------------------------------------------------------------- /services/ui_backend_service/tests/integration_tests/flows_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from .utils import ( 3 | cli, db, 4 | add_flow, 5 | _test_list_resources, _test_single_resource 6 | ) 7 | pytestmark = [pytest.mark.integration_tests] 8 | 9 | 10 | async def test_list_flows(cli, db): 11 | await _test_list_resources(cli, db, "/flows", 200, []) 12 | 13 | _flow = (await add_flow(db, 
flow_id="HelloFlow")).body 14 | 15 | await _test_list_resources(cli, db, "/flows", 200, [_flow]) 16 | 17 | 18 | async def test_single_flow(cli, db): 19 | await _test_single_resource(cli, db, "/flows/HelloFlow", 404, {}) 20 | 21 | _flow = (await add_flow(db, flow_id="HelloFlow")).body 22 | 23 | await _test_single_resource(cli, db, "/flows/{flow_id}".format(**_flow), 200, _flow) 24 | -------------------------------------------------------------------------------- /services/ui_backend_service/tests/integration_tests/grouped_runs_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import time 3 | from .utils import ( 4 | cli, db, 5 | add_flow, add_run, add_artifact, 6 | add_step, add_task, add_metadata, 7 | _test_list_resources, _test_single_resource, get_heartbeat_ts 8 | ) 9 | pytestmark = [pytest.mark.integration_tests] 10 | 11 | @pytest.mark.skip("Test failing due to refactor. TODO: fix later if applicable") 12 | async def test_list_runs_group_by_flow_id(cli, db): 13 | await _test_list_resources(cli, db, "/runs", 200, []) 14 | await _test_list_resources(cli, db, "/runs?_group=flow_id", 200, []) 15 | 16 | first_runs = await create_n_runs(db, 11, "A-FirstFlow") 17 | second_runs = await create_n_runs(db, 11, "B-SecondFlow") 18 | 19 | # default per-group limit should be 10 20 | await _test_list_resources(cli, db, "/runs?_group=flow_id", 200, [*first_runs[:10], *second_runs[:10]], approx_keys=["duration"]) 21 | 22 | # _group_limit should limit number of records returned per group 23 | await _test_list_resources(cli, db, "/runs?_group=flow_id&_group_limit=1", 200, [first_runs[0], second_runs[0]], approx_keys=["duration"]) 24 | 25 | # _limit should limit number of groups, not number of rows. 26 | await _test_list_resources(cli, db, "/runs?_group=flow_id&_group_limit=2&_limit=1&_order=%2Brun_number", 200, first_runs[:2], approx_keys=["duration"]) 27 | 28 | # _order should order within groups. 29 | await _test_list_resources(cli, db, "/runs?_group=flow_id&_order=run_number", 200, [*first_runs[::-1][:10], *second_runs[::-1][:10]], approx_keys=["duration"]) 30 | 31 | @pytest.mark.skip("Test failing due to refactor. TODO: fix later if applicable") 32 | async def test_list_runs_group_by_user(cli, db): 33 | await _test_list_resources(cli, db, "/runs", 200, []) 34 | await _test_list_resources(cli, db, "/runs?_group=user", 200, []) 35 | 36 | first_runs = await create_n_runs(db, 11, "A-Flow", "B-user") 37 | second_runs = await create_n_runs(db, 11, "B-Flow", "A-user") 38 | 39 | # default per-group should be 10. ordering by run_number ASC within group to test sorting, 40 | # and to retain order of test runs list. 41 | await _test_list_resources(cli, db, "/runs?_group=user&_order=%2Brun", 200, [*second_runs[:10], *first_runs[:10]], approx_keys=["duration"]) 42 | 43 | # _group_limit should limit number of records returned per group 44 | await _test_list_resources(cli, db, "/runs?_group=user&&_order=%2Brun&_group_limit=1", 200, [second_runs[0], first_runs[0]], approx_keys=["duration"]) 45 | 46 | # _limit should limit number of groups, not number of rows. 
47 | await _test_list_resources(cli, db, "/runs?_group=user&&_order=%2Brun&_group_limit=2&_limit=1", 200, second_runs[:2], approx_keys=["duration"]) 48 | 49 | 50 | async def create_n_runs(db, n=1, flow_id="TestFlow", user="TestUser"): 51 | await add_flow(db, flow_id=flow_id) 52 | created_runs = [] 53 | for _ in range(n): 54 | _run = (await add_run(db, flow_id=flow_id, user_name=user, system_tags=["runtime:dev", "user:{}".format(user)])).body 55 | _run["run"] = _run["run_number"] 56 | _run["status"] = "running" 57 | _run["duration"] = max(int(round(time.time() * 1000)) - _run["ts_epoch"], 1) # approx assert breaks in the odd case when duration==0 58 | _run["user"] = user 59 | created_runs.append(_run) 60 | return created_runs 61 | -------------------------------------------------------------------------------- /services/ui_backend_service/tests/integration_tests/steps_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from .utils import ( 3 | cli, db, 4 | add_flow, add_run, add_step, add_task, 5 | add_artifact, get_heartbeat_ts, 6 | _test_list_resources, _test_single_resource, update_objects_with_run_tags 7 | ) 8 | pytestmark = [pytest.mark.integration_tests] 9 | 10 | 11 | async def test_list_steps(cli, db): 12 | _flow = (await add_flow(db, flow_id="HelloFlow")).body 13 | _run = (await add_run(db, flow_id=_flow.get("flow_id"))).body 14 | 15 | await _test_list_resources(cli, db, "/flows/{flow_id}/runs/{run_number}/steps".format(**_run), 200, []) 16 | 17 | _step = (await add_step(db, flow_id=_run.get("flow_id"), step_name="step", run_number=_run.get("run_number"), run_id=_run.get("run_id"))).body 18 | 19 | _, data = await _test_list_resources(cli, db, "/flows/{flow_id}/runs/{run_number}/steps".format(**_step), 200, None) 20 | 21 | assert len(data) == 1 22 | assert data[0]['run_number'] == int(_run.get('run_number')) 23 | assert data[0]['step_name'] == 'step' 24 | 25 | 26 | async def test_single_step(cli, db): 27 | await _test_single_resource(cli, db, "/flows/HelloFlow/runs/404/steps/none", 404, {}) 28 | 29 | _flow = (await add_flow(db, flow_id="HelloFlow")).body 30 | _run = (await add_run(db, flow_id=_flow.get("flow_id"))).body 31 | _step = (await add_step(db, flow_id=_run.get("flow_id"), step_name="step", run_number=_run.get("run_number"))).body 32 | 33 | _, data = await _test_single_resource(cli, db, "/flows/{flow_id}/runs/{run_number}/steps/{step_name}".format(**_step), 200, None) 34 | 35 | assert data['run_number'] == int(_run.get('run_number')) 36 | assert data['step_name'] == 'step' 37 | 38 | 39 | async def test_step_duration(cli, db): 40 | _flow = (await add_flow(db, flow_id="HelloFlow")).body 41 | _run = (await add_run(db, flow_id=_flow.get("flow_id"))).body 42 | _step = (await add_step(db, flow_id=_run.get("flow_id"), step_name="step", run_number=_run.get("run_number"))).body 43 | _step['run_id'] = _run['run_number'] 44 | _step['duration'] = 1 # approx step duration for started step 45 | update_objects_with_run_tags('step', [_step], _run) 46 | 47 | # step duration should fallback to current time when no tasks exist. 
48 | await _test_single_resource(cli, db, "/flows/{flow_id}/runs/{run_number}/steps/{step_name}".format(**_step), 200, _step, approx_keys=["duration"]) 49 | 50 | # existing task should have an effect on step duration 51 | _task = (await add_task( 52 | db, 53 | flow_id=_flow.get("flow_id"), 54 | run_number=_run.get("run_number"), 55 | step_name=_step.get("step_name"), 56 | last_heartbeat_ts=get_heartbeat_ts(offset=10) 57 | )).body 58 | 59 | # if only task heartbeat exists, this should be used for the step duration 60 | _step['duration'] = _task['last_heartbeat_ts'] * 1000 - _step['ts_epoch'] 61 | 62 | await _test_single_resource(cli, db, "/flows/{flow_id}/runs/{run_number}/steps/{step_name}".format(**_step), 200, _step) 63 | 64 | # more recent _task_ok artifact timestamp should be used in favor of last_heartbeat if exists. 65 | 66 | _task_ok = (await add_artifact( 67 | db, 68 | flow_id=_flow.get("flow_id"), 69 | run_number=_run.get("run_number"), 70 | step_name=_step.get("step_name"), 71 | task_id=_task.get("task_id"), 72 | artifact={ 73 | "name": "_task_ok", 74 | "location": "location", 75 | "ds_type": "ds_type", 76 | "sha": "sha", 77 | "type": "type", 78 | "content_type": "content_type", 79 | "attempt_id": 0 80 | } 81 | )).body 82 | 83 | # update ts_epoch to be newer than the task heartbeat. 84 | _new_ts = _task['last_heartbeat_ts'] * 1000 + 10 85 | await db.artifact_table_postgres.update_row( 86 | filter_dict={ 87 | "flow_id": _task_ok.get("flow_id"), 88 | "run_number": _task_ok.get("run_number"), 89 | "step_name": _task_ok.get("step_name"), 90 | "task_id": _task_ok.get("task_id") 91 | }, 92 | update_dict={ 93 | "ts_epoch": _new_ts 94 | } 95 | ) 96 | 97 | # _task_ok should be used in favor of heartbeat_ts for step duration. 98 | _step['duration'] = _new_ts - _step['ts_epoch'] 99 | 100 | await _test_single_resource(cli, db, "/flows/{flow_id}/runs/{run_number}/steps/{step_name}".format(**_step), 200, _step) 101 | -------------------------------------------------------------------------------- /services/ui_backend_service/tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Netflix/metaflow-service/9e47d2d85e127d2673d457dde7ae535a3341de0f/services/ui_backend_service/tests/unit_tests/__init__.py -------------------------------------------------------------------------------- /services/ui_backend_service/tests/unit_tests/cache_utils_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from services.ui_backend_service.data.cache.utils import ( 4 | error_event_msg, progress_event_msg, search_result_event_msg, 5 | artifact_location_from_key, artifact_cache_id, unpack_pathspec_with_attempt_id, 6 | streamed_errors, cacheable_artifact_value, artifact_value 7 | ) 8 | 9 | pytestmark = [pytest.mark.unit_tests] 10 | 11 | 12 | def test_error_event_msg(): 13 | assert error_event_msg("test message", "test-id") == \ 14 | {"type": "error", "message": "test message", "id": "test-id", "traceback": None, "key": None} 15 | 16 | assert error_event_msg("test message", "test-id", "test-traceback") == \ 17 | {"type": "error", "message": "test message", "id": "test-id", "traceback": "test-traceback", "key": None} 18 | 19 | assert error_event_msg("test message", "test-id", "test-traceback", "search:artifact:s3://etc") == \ 20 | {"type": "error", "message": "test message", "id": "test-id", "traceback": "test-traceback", "key": "search:artifact:s3://etc"} 21 | 22 | 
23 | def test_progress_event_msg(): 24 | assert progress_event_msg(0.5) == {"type": "progress", "fraction": 0.5} 25 | 26 | 27 | def test_search_result_event_msg(): 28 | assert search_result_event_msg([1, 2, 3]) == {"type": "result", "matches": [1, 2, 3]} 29 | 30 | 31 | def test_artifact_cache_key_and_location_from_key(): 32 | # first generate an artifact cache key with any location 33 | _loc = "s3://test-s3-locations/artifact_location/for/cache/1" 34 | 35 | key = artifact_cache_id(_loc) 36 | 37 | assert _loc in key 38 | 39 | # We need to be able to extract the location from a cache key, to form correctly keyed responses 40 | _extracted_loc = artifact_location_from_key(key) 41 | 42 | assert _extracted_loc == _loc 43 | 44 | 45 | def test_unpack_pathspec_with_attempt_id(): 46 | pathspec = "FlowName/RunNumber/StepName/TaskId/4" 47 | pathspec_without_attempt_id, attempt_id = unpack_pathspec_with_attempt_id(pathspec) 48 | assert pathspec_without_attempt_id == "FlowName/RunNumber/StepName/TaskId" 49 | assert attempt_id == 4 50 | 51 | 52 | def test_streamed_errors_no_op(): 53 | # if nothing raised, callable should not be called 54 | def _called(): 55 | # should not have been called 56 | assert False 57 | try: 58 | with streamed_errors(_called): 59 | pass 60 | except Exception as ex: 61 | assert False # Should not have raised any exception 62 | 63 | 64 | 65 | def test_streamed_errors_exception_output(): 66 | # raised errors should be written to output callable. 67 | def _raised(output): 68 | assert output['type'] == 'error' 69 | assert output['id'] == 'Exception' 70 | assert output['message'] == 'Custom exception' 71 | assert output['traceback'] is not None 72 | 73 | try: 74 | with streamed_errors(_raised): 75 | raise Exception("Custom exception") 76 | assert False # Should never get here due to re-raising of the exception 77 | except Exception as ex: 78 | assert str(ex) == "Custom exception" 79 | 80 | 81 | def test_streamed_errors_exception_output_no_re_raise(): 82 | # should not raise any exception with re_raise set to false. 
83 | def _re_raise(output): 84 | pass 85 | 86 | try: 87 | with streamed_errors(_re_raise, re_raise=False): 88 | raise Exception("Should not be reraised") 89 | except Exception as ex: 90 | assert False # Should not have re-raised exception 91 | 92 | 93 | def test_cacheable_artifact_value(): 94 | artifact = MockArtifact("pathspec/to", 1, "test") 95 | big_artifact = MockArtifact("pathspec/to", 123456789, "test") 96 | 97 | assert cacheable_artifact_value(artifact) == '[true, "test"]' 98 | assert cacheable_artifact_value(big_artifact) == '[false, "artifact-too-large", "pathspec/to: 123456789 bytes"]' 99 | 100 | 101 | def test_artifact_value(): 102 | artifact = MockArtifact("pathspec/to", 1, "test") 103 | big_artifact = MockArtifact("pathspec/to", 123456789, "test") 104 | 105 | assert artifact_value(artifact) == (True, "test") 106 | assert artifact_value(big_artifact) == (False, "artifact-too-large", "pathspec/to: 123456789 bytes") 107 | 108 | 109 | class MockArtifact(): 110 | def __init__(self, pathspec, size, data): 111 | self.pathspec = pathspec 112 | self.size = size 113 | self.data = data 114 | -------------------------------------------------------------------------------- /services/ui_backend_service/tests/unit_tests/data_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from services.ui_backend_service.data import unpack_processed_value 4 | 5 | pytestmark = [pytest.mark.unit_tests] 6 | 7 | 8 | @pytest.mark.parametrize("value, expected", [ 9 | ([True, "test_value"], [True, 'test_value', None, None]), 10 | ([False, "CustomException"], [False, 'CustomException', None, None]), 11 | ([False, "CustomException", "error details"], [False, 'CustomException', "error details", None]), 12 | ([False, "CustomException", "error details", "stacktrace"], [False, 'CustomException', "error details", "stacktrace"]), 13 | ]) 14 | def test_unpack_processed_value_padding(value, expected): 15 | # test that the helper pads the output list with enough None items by default. 
16 | assert unpack_processed_value(value) == expected 17 | -------------------------------------------------------------------------------- /services/ui_backend_service/tests/unit_tests/get_artifacts_action_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from services.ui_backend_service.data.cache.get_data_action import lookup_id 4 | 5 | pytestmark = [pytest.mark.unit_tests] 6 | 7 | 8 | async def test_cache_key_independent_of_location_order(): 9 | locs = ["a", "b", "c"] 10 | a = lookup_id(locs) 11 | b = lookup_id(reversed(locs)) 12 | 13 | assert a == b 14 | -------------------------------------------------------------------------------- /services/ui_backend_service/tests/unit_tests/search_artifacts_action_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from services.ui_backend_service.data.cache.search_artifacts_action import lookup_id 4 | 5 | pytestmark = [pytest.mark.unit_tests] 6 | 7 | 8 | async def test_cache_key_independent_of_location_order(): 9 | locs = ["a", "b", "c"] 10 | a = lookup_id(locs, "test", "eq") 11 | b = lookup_id(reversed(locs), "test", "eq") 12 | 13 | assert a == b 14 | 15 | 16 | async def test_cache_key_dependent_on_searchterm(): 17 | locs = ["a", "b", "c"] 18 | a = lookup_id(locs, "test", "eq") 19 | b = lookup_id(locs, "another test", "eq") 20 | c = lookup_id(locs, "another test", "co") 21 | 22 | assert not a == b 23 | assert not b == c 24 | -------------------------------------------------------------------------------- /services/ui_backend_service/tests/unit_tests/search_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from services.ui_backend_service.api.search import _parse_search_term 4 | 5 | pytestmark = [pytest.mark.unit_tests] 6 | 7 | 8 | async def test_search_term_parsing(): 9 | 10 | op, term = _parse_search_term("\"test term\"") 11 | 12 | assert op == "eq" 13 | assert term == "test term" 14 | 15 | op, term = _parse_search_term("test term") 16 | 17 | assert op == "co" 18 | assert term == "test term" 19 | 20 | op, term = _parse_search_term("test \"term\"") 21 | 22 | assert op == "co" 23 | assert term == "test \"term\"" 24 | -------------------------------------------------------------------------------- /services/ui_backend_service/ui/.dockerignore: -------------------------------------------------------------------------------- 1 | * -------------------------------------------------------------------------------- /services/ui_backend_service/ui/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | !.dockerignore 4 | !static/ -------------------------------------------------------------------------------- /services/ui_backend_service/ui/static/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /services/utils/tests/__init__.py: -------------------------------------------------------------------------------- 1 | from services.utils import DBConfiguration 2 | import pytest 3 | 4 | 5 | def get_test_dbconf(): 6 | """ 7 | Returns a DBConfiguration suitable for the test environment, or exits pytest completely upon failure 8 | """ 9 | db_conf = DBConfiguration(timeout=1) 10 | 11 | if db_conf.get_dsn() != "dbname=test user=test host=db_test port=5432 
password=test": 12 | pytest.exit("The test suite should only be run in a test environment. \n \ 13 | Configured database host is not suited for running tests. \n \ 14 | expected DSN to be: dbname=test user=test host=db_test port=5432 password=test") 15 | 16 | return db_conf 17 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_files = LICENSE 3 | 4 | [pycodestyle] 5 | count = False 6 | exclude = *_test.py,.svn,CVS,.bzr,.hg,.git,__pycache__,.tox,env 7 | ignore = E722,W503 8 | max-line-length = 160 9 | statistics = True 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from os.path import dirname, join, exists 2 | from setuptools import setup, find_packages 3 | 4 | 5 | def open_and_read_if_exists(path: str): 6 | try: 7 | with open(join(dirname(__file__), path)) as f: 8 | return f.read() 9 | except: 10 | return "" 11 | 12 | 13 | requirements = [] 14 | for service in ["metadata_service", "migration_service", "ui_backend_service"]: 15 | requirements += open_and_read_if_exists( 16 | "services/{}/requirements.txt".format(service) 17 | ).splitlines() 18 | 19 | requirements_tests = open_and_read_if_exists("requirements.dev.txt").splitlines() 20 | 21 | long_description = open_and_read_if_exists("README.md") 22 | 23 | setup( 24 | name="metadata_service", 25 | version="2.5.0", 26 | license="Apache License 2.0", 27 | description="Metadata Service: backend service for Metaflow", 28 | long_description=long_description, 29 | author="Machine Learning Infrastructure Team at Netflix", 30 | author_email="help@metaflow.org", 31 | url="https://github.com/Netflix/metaflow-service", 32 | keywords=["metaflow", "machinelearning", "ml"], 33 | py_modules=["services.metadata_service"], 34 | packages=find_packages(exclude=("tests",)), 35 | entry_points=""" 36 | [console_scripts] 37 | metadata_service=services.metadata_service.server:main 38 | migration_service=services.migration_service.migration_server:main 39 | ui_backend_service=services.ui_backend_service.ui_server:main 40 | """, 41 | install_requires=requirements, 42 | tests_require=requirements + requirements_tests, 43 | extras_require={"test": requirements + requirements_tests}, 44 | classifiers=[ 45 | "Development Status :: 5 - Production/Stable", 46 | "Intended Audience :: Developers", 47 | "Topic :: Software Development :: Build Tools", 48 | "License :: OSI Approved :: Apache Software License", 49 | "Programming Language :: Python :: 3.11", 50 | ], 51 | ) 52 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py311 3 | 4 | [testenv] 5 | deps = 6 | -rrequirements.txt 7 | -rrequirements.dev.txt 8 | commands = pytest --cov=services 9 | passenv = MF_METADATA_DB_HOST,MF_METADATA_DB_PORT,MF_METADATA_DB_USER,MF_METADATA_DB_PSWD,MF_METADATA_DB_NAME,MF_UI_METADATA_PORT,MF_UI_METADATA_HOST 10 | extras = tests 11 | 12 | [testenv:pylint] 13 | commands = pylint -E services --ignored-modules=psycopg2,pygit2 14 | 15 | [testenv:unit] 16 | commands = pytest --cov=services -m unit_tests 17 | 18 | [testenv:integration] 19 | commands = pytest --cov=services -m integration_tests 20 | 21 | 
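For reference, the environments above correspond to the following invocations, assuming `tox` is installed; the `unit` and `integration` environments only select the matching pytest markers, and the integration tests additionally expect the test database configuration checked in `services/utils/tests/__init__.py`.

```
tox                  # default py311 environment: full pytest run with coverage
tox -e pylint        # pylint error checking only
tox -e unit          # tests marked unit_tests
tox -e integration   # tests marked integration_tests
```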
-------------------------------------------------------------------------------- /wait-for-postgres.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | RETRIES=1; 3 | MAX_RETRIES=${POSTGRES_WAIT_MAX_RETRIES:=5}; 4 | SLEEP_SECONDS=${POSTGRES_WAIT_SLEEP_SECONDS:=1}; 5 | 6 | # Retry loop for postgres server. 7 | while !