├── .dockerignore ├── .github ├── CODEOWNERS ├── dependabot.yml └── workflows │ └── main.yml ├── .gitignore ├── .python-version ├── LICENSE ├── README.md ├── build-lambda.sh ├── data └── .gitignore ├── dataprep_example ├── __init__.py ├── ingest_retailrocket_dataset.py └── repartition.py ├── docker-compose.yml ├── docker ├── all-in-one.Dockerfile ├── entrypoint.sh └── local-lambda.Dockerfile ├── docs ├── api.md ├── example-dataset.md ├── logo-blue.svg ├── logo-icon-dark-blue.svg ├── logo-icon-light-blue.svg ├── logo-small-blue.svg └── operating.md ├── frocket ├── __init__.py ├── apiserver.py ├── cli.py ├── cli_commands.py ├── common │ ├── __init__.py │ ├── config.py │ ├── dataset.py │ ├── helpers │ │ ├── __init__.py │ │ ├── pandas.py │ │ ├── storage.py │ │ └── utils.py │ ├── metrics.py │ ├── serializable.py │ ├── tasks │ │ ├── __init__.py │ │ ├── async_tracker.py │ │ ├── base.py │ │ ├── query.py │ │ └── registration.py │ └── validation │ │ ├── __init__.py │ │ ├── consts.py │ │ ├── error.py │ │ ├── path_visitor.py │ │ ├── query_validator.py │ │ ├── relation_parser.py │ │ ├── result.py │ │ └── visitor_functions.py ├── datastore │ ├── __init__.py │ ├── blobstore.py │ ├── datastore.py │ ├── redis_store.py │ └── registered_datastores.py ├── engine │ ├── __init__.py │ ├── query_engine.py │ └── relation_to_pandas.py ├── invoker │ ├── __init__.py │ ├── base_invoker.py │ ├── impl │ │ ├── __init__.py │ │ ├── async_invoker.py │ │ ├── aws_lambda_invoker.py │ │ ├── registered_invokers.py │ │ └── work_queue_invoker.py │ ├── invoker_api.py │ ├── jobs │ │ ├── __init__.py │ │ ├── job.py │ │ ├── query_job.py │ │ └── registration_job.py │ ├── metrics_frame.py │ ├── prom_adapter.py │ └── stats_builder.py ├── resources │ └── query_schema.json └── worker │ ├── __init__.py │ ├── impl │ ├── __init__.py │ ├── aws_lambda_metrics.py │ ├── aws_lambda_worker.py │ ├── generic_env_metrics.py │ └── queue_worker.py │ └── runners │ ├── __init__.py │ ├── base_task_runner.py │ ├── part_loader.py │ ├── query_task_runner.py │ ├── registered_runners.py │ └── registration_task_runner.py ├── requirements.txt ├── setup.py ├── test-requirements.txt └── tests ├── __init__.py ├── test_apiserver.py ├── test_cli.py ├── test_invoker_api.py ├── test_part_loader.py ├── test_path_visitor.py ├── test_query_engine.py ├── test_query_job.py ├── test_query_task.py ├── test_query_validator.py ├── test_registration_job.py ├── test_registration_task.py └── utils ├── __init__.py ├── base_query_example.json ├── base_test_utils.py ├── dataset_utils.py ├── lambda_fixture.py ├── mock_s3_utils.py ├── redis_fixture.py └── task_and_job_utils.py /.dockerignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/ 2 | **/*.py[cod] 3 | **/.* 4 | docker/*.Dockerfile 5 | *.so 6 | *.parquet 7 | *.zip 8 | data/ 9 | layers/ 10 | scratch/ 11 | build/ 12 | dist/ 13 | sdist/ 14 | *.egg-info/ 15 | *.egg 16 | venv/ 17 | map/ 18 | reduce/ -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | /frocket @dynamicyield/eladroz 2 | /docker @dynamicyield/omrisk -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: / 5 | schedule: 6 | interval: daily 7 | commit-message: 8 | prefix: 
fix(deps) 9 | - package-ecosystem: pip 10 | directory: / 11 | schedule: 12 | interval: daily 13 | commit-message: 14 | prefix: fix(deps) 15 | - package-ecosystem: docker 16 | directory: /docker 17 | schedule: 18 | interval: daily 19 | commit-message: 20 | prefix: fix(deps) 21 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: [ push ] 3 | jobs: 4 | ci: 5 | runs-on: ubuntu-18.04 6 | timeout-minutes: 10 7 | 8 | steps: 9 | - name: Checkout repo 10 | uses: actions/checkout@v2.3.4 11 | 12 | - name: Setup Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.8' 16 | architecture: 'x64' 17 | 18 | - name: Cache dependencies 19 | uses: actions/cache@v2.1.4 20 | id: cache-venv 21 | with: 22 | path: ./venv/ 23 | key: ${{ runner.os }}-venv-cache-${{ hashFiles('./requirements.txt','./test-requirements.txt','./setup.py') }} 24 | 25 | - name: Build virtual environment and install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | python -m venv venv 29 | source venv/bin/activate 30 | pip install -e . 31 | pip install -r test-requirements.txt 32 | if: steps.cache-venv.outputs.cache-hit != 'true' 33 | 34 | - name: Set up Docker Buildx 35 | id: buildx 36 | uses: docker/setup-buildx-action@master 37 | 38 | - name: Cache Docker layers for all-in-one 39 | uses: actions/cache@v2.1.4 40 | with: 41 | path: /tmp/.buildx-cache-all-in-one 42 | key: ${{ runner.os }}-buildx-all-in-one-${{ github.sha }} 43 | restore-keys: | 44 | ${{ runner.os }}-buildx-all-in-one- 45 | 46 | - name: Docker build all-in-one 47 | id: docker_build_all_in_one 48 | uses: docker/build-push-action@v2 49 | with: 50 | context: . 51 | file: ./docker/all-in-one.Dockerfile 52 | builder: ${{ steps.buildx.outputs.name }} 53 | load: true 54 | tags: frocket/all-in-one:latest 55 | cache-from: type=local,src=/tmp/.buildx-cache-all-in-one 56 | cache-to: type=local,dest=/tmp/.buildx-cache-all-in-one,mode=max 57 | 58 | - name: Cache Docker layers for local-lambda 59 | uses: actions/cache@v2.1.4 60 | with: 61 | path: /tmp/.buildx-cache-local-lambda 62 | key: ${{ runner.os }}-buildx-local-lambda-${{ github.sha }} 63 | restore-keys: | 64 | ${{ runner.os }}-buildx-local-lambda- 65 | 66 | - name: Docker build local-lambda 67 | id: docker_build_all_local_lambda 68 | uses: docker/build-push-action@v2 69 | with: 70 | context: . 
71 | file: ./docker/local-lambda.Dockerfile 72 | builder: ${{ steps.buildx.outputs.name }} 73 | load: true 74 | tags: frocket/local-lambda:latest 75 | cache-from: type=local,src=/tmp/.buildx-cache-local-lambda 76 | cache-to: type=local,dest=/tmp/.buildx-cache-local-lambda,mode=max 77 | 78 | - name: Launch docker-compose 79 | run: | 80 | docker-compose up -d 81 | sleep 2 82 | 83 | - name: Test with pytest 84 | run: | 85 | source venv/bin/activate 86 | export SKIP_SLOW_TESTS=true 87 | pytest --cov=frocket --cov-report=html 88 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/.DS_Store 2 | **/.ipynb_checkpoints/ 3 | **/__pycache__ 4 | **/*.pyc 5 | **/*.zip 6 | **/*.so 7 | **/*.parquet 8 | *.egg-info 9 | .eggs 10 | venv 11 | .idea 12 | *.iml 13 | .awsenv 14 | scratch 15 | map 16 | reduce 17 | build 18 | dist 19 | # Coverage report 20 | htmlcov 21 | .coverage 22 | .vscode -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.8.0 2 | -------------------------------------------------------------------------------- /build-lambda.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | set -e 3 | RED='\033[0;31m' 4 | GREEN='\033[0;32m' 5 | YELLOW='\033[1;33m' 6 | NC='\033[0m' # No Color 7 | GITHASH=`git rev-parse HEAD | cut -c1-8``[[ -z $(git status -s) ]] || echo dirty` 8 | [[ $1 == '--layer' ]] && LAYER=true || LAYER=false 9 | 10 | echo "${YELLOW}==> Building layer: ${LAYER}${NC}" 11 | echo "${YELLOW}==> Git commit hash: ${GITHASH}${NC}" 12 | echo "${YELLOW}==> Running docker build to install packages in Lambda-like image...${NC}" 13 | docker build -f docker/local-lambda.Dockerfile . 
-t frocket/local-lambda:latest 14 | docker run -d --name lambda-builder frocket/local-lambda:latest 15 | 16 | BUILD_DIR=$(mktemp -d -t build-lambda) 17 | echo "${YELLOW}==> Copying files from container to build directory: ${BUILD_DIR}...${NC}" 18 | mkdir -p $BUILD_DIR/function 19 | docker cp lambda-builder:/var/task/frocket $BUILD_DIR/function/frocket 20 | if [ "$LAYER" = true ]; then 21 | mkdir -p $BUILD_DIR/layer 22 | docker cp lambda-builder:/opt/python $BUILD_DIR/layer/python 23 | fi 24 | 25 | echo "${YELLOW}==> Stopping & removing container...${NC}" 26 | docker stop lambda-builder 27 | docker rm lambda-builder 28 | 29 | pushd $BUILD_DIR 30 | echo "${YELLOW}==> Cleaning-up a bit and zipping...${NC}" 31 | FUNCTION_ZIPFILE=lambda-function-${GITHASH}.zip 32 | [ "$LAYER" = true ] && LAYER_ZIPFILE=lambda-layer-${GITHASH}.zip || LAYER_ZIPFILE= 33 | 34 | if [ "$LAYER" = true ]; then 35 | find ./layer/python -type d -name tests | xargs rm -rf 36 | find ./layer/python -type d -name include | xargs rm -rf 37 | (cd layer && zip -qr ../$LAYER_ZIPFILE ./python) 38 | echo "${YELLOW}NOTE: Lambda size limit is 50mb compressed/250mb uncompressed for the function PLUS any layers it uses (unless using containers)${NC}" 39 | echo "${YELLOW}Lambda layer size, uncompressed:${NC}" 40 | du -sh ./layer 41 | echo "${YELLOW}Lambda layer size, zipped:${NC}" 42 | du -h $LAYER_ZIPFILE 43 | fi 44 | 45 | (cd function && zip -qr ../$FUNCTION_ZIPFILE ./frocket) 46 | echo "${YELLOW}Lambda function, zipped:${NC}" 47 | du -h $FUNCTION_ZIPFILE 48 | 49 | popd 50 | # Don't fail if previous files don't exist 51 | rm lambda-function-*.zip || true 52 | cp $BUILD_DIR/$FUNCTION_ZIPFILE . 53 | if [ "$LAYER" = true ]; then 54 | rm lambda-layer-*.zip || true 55 | cp $BUILD_DIR/$LAYER_ZIPFILE ./ 56 | fi 57 | rm -rf $BUILD_DIR 58 | echo "${YELLOW}DONE! 
copied to current dir:${NC}\n${FUNCTION_ZIPFILE} ${LAYER_ZIPFILE}" 59 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | * -------------------------------------------------------------------------------- /dataprep_example/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DynamicYieldProjects/funnel-rocket/70963fddc0881cebdc6da1af2654d412f95d660c/dataprep_example/__init__.py -------------------------------------------------------------------------------- /dataprep_example/ingest_retailrocket_dataset.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import argparse 4 | from pathlib import Path 5 | from contextlib import contextmanager 6 | import pandas as pd 7 | from pandas import DataFrame 8 | 9 | EVENTS_FILE = 'events.csv' 10 | PROPS_FILE_1 = 'item_properties_part1.csv' 11 | PROPS_FILE_2 = 'item_properties_part2.csv' 12 | INPUT_FILENAMES = {EVENTS_FILE, PROPS_FILE_1, PROPS_FILE_2} 13 | ITEM_PROPERTY_COLUMNS = {'categoryid', 'available', '790', '888'} 14 | EXPECTED_EVENT_COUNT = 2_500_516 15 | 16 | 17 | def progress_msg(msg: str): 18 | print(f"\033[33m{msg}\033[0m") # Yellow, just yellow 19 | 20 | 21 | @contextmanager 22 | def timed(caption: str): 23 | start = time.time() 24 | yield 25 | total = time.time() - start 26 | print(f"Time to {caption}: {total:.3f} seconds") 27 | 28 | 29 | # Read item properties files, filter for relevant columns and 'pivot' its structure from rows to columns 30 | def read_item_props(filepath: Path) -> DataFrame: 31 | df = pd.read_csv(filepath) 32 | df = df[df['property'].isin(ITEM_PROPERTY_COLUMNS)] 33 | first_value_per_item = df.groupby(["itemid", "property"])["value"].first() 34 | df = first_value_per_item.to_frame() 35 | df = df.unstack(level=-1) 36 | df.columns = df.columns.droplevel(0) 37 | return df 38 | 39 | 40 | def ingest(path: Path): 41 | with timed("read & transform item properties of all products"): 42 | item_props_tempfile = path / "item_props.parquet" 43 | if item_props_tempfile.exists(): 44 | progress_msg(f"Reading item properties from cached file {item_props_tempfile}") 45 | item_props_df = pd.read_parquet(item_props_tempfile) 46 | else: 47 | progress_msg("Reading item properties... 
(this takes a bit)") 48 | item_props_df1 = read_item_props(path / PROPS_FILE_1) 49 | item_props_df2 = read_item_props(path / PROPS_FILE_2) 50 | item_props_df = item_props_df1.combine_first(item_props_df2) 51 | progress_msg(f"Storing item properties to {item_props_tempfile} for faster re-runs...") 52 | item_props_df.to_parquet(item_props_tempfile) 53 | 54 | with timed("read & transform user events"): 55 | progress_msg("Reading user events...") 56 | events = pd.read_csv(path / EVENTS_FILE) 57 | progress_msg("Joining events with item properties...") 58 | events = pd.merge(events, item_props_df, how='inner', on='itemid') 59 | 60 | progress_msg("Making columns more queryable...") 61 | events['price'] = events['790'].str[1:].astype(float) / 1000 62 | events.drop(columns=['790'], inplace=True) 63 | events['available'] = events['available'].astype(int).astype(bool) 64 | events['categoryid'] = events['categoryid'].astype('category') 65 | events['event'] = events['event'].astype('category') 66 | events.rename(columns={'888': 'cryptic_attrs'}, inplace=True) 67 | progress_msg("Storing 'cryptic_attrs' also as categorical column 'cryptic_attrs_cat'...") 68 | events['cryptic_attrs_cat'] = events['cryptic_attrs'].astype('category') 69 | events.reset_index(drop=True) 70 | 71 | progress_msg("Excerpt from final DataFrame:") 72 | print(events) 73 | progress_msg("Columns types (a.k.a. dtypes):") 74 | print(events.dtypes) 75 | progress_msg("Breakdown of event types:") 76 | print(events['event'].value_counts()) 77 | 78 | if len(events) != EXPECTED_EVENT_COUNT: 79 | progress_msg(f"WARNING: Expected {EXPECTED_EVENT_COUNT} events, but final DataFrame has {len(events)}") 80 | 81 | output_file = path / 'retailrocket.parquet' 82 | events.to_parquet(output_file) 83 | col_memory_sizes = (events.memory_usage(deep=True) / 1024 ** 2).round(decimals=2) 84 | progress_msg(f'Size of DataFrame columns in memory (in MB):') 85 | print(col_memory_sizes) 86 | progress_msg(f"==> Saved output file to: {output_file}, size: {output_file.stat().st_size / 1024 ** 2:.1f}MB") 87 | 88 | with timed("load file - all columns"): 89 | pd.read_parquet(output_file) 90 | 91 | with timed("load file - just the 'cryptic_attrs' column"): 92 | pd.read_parquet(output_file, columns=['cryptic_attrs']) 93 | 94 | with timed("load file - just the 'cryptic_attrs_cat' column"): 95 | pd.read_parquet(output_file, columns=['cryptic_attrs_cat']) 96 | 97 | with timed("load file - all columns *except* these two"): 98 | cols = [col for col in events.dtypes.index 99 | if col not in ['cryptic_attrs', 'cryptic_attrs_cat']] 100 | pd.read_parquet(output_file, columns=cols) 101 | 102 | 103 | if __name__ == '__main__': 104 | parser = argparse.ArgumentParser( 105 | description='Ingest RetailRocket dataset (to download: https://www.kaggle.com/retailrocket/ecommerce-dataset/)') 106 | parser.add_argument( 107 | 'path', type=str, 108 | help='Directory where downloaded dataset files are found and output file will be written') 109 | args = parser.parse_args() 110 | 111 | path = Path(args.path) 112 | if not path.exists() or not path.is_dir(): 113 | sys.exit(f'No such directory: {path}') 114 | files_in_path = {f.name for f in path.iterdir()} 115 | if not files_in_path >= INPUT_FILENAMES: 116 | sys.exit(f'Missing one or more input files: {INPUT_FILENAMES}') 117 | ingest(path) 118 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 
services: 3 | redis: 4 | image: redis:6 5 | ports: 6 | - ${FROCKET_REDIS_PORT:-6379}:${FROCKET_REDIS_PORT:-6379} 7 | entrypoint: [ "redis-server", "--port", "${FROCKET_REDIS_PORT:-6379}" ] 8 | 9 | mock-s3: 10 | image: minio/minio:latest 11 | container_name: mock-s3 12 | ports: 13 | - 9000:9000 14 | environment: 15 | - MINIO_ROOT_USER=testonly 16 | - MINIO_ROOT_PASSWORD=testonly 17 | command: server /data 18 | 19 | frocket-queue-worker: 20 | build: 21 | dockerfile: docker/all-in-one.Dockerfile 22 | context: . 23 | image: frocket/all-in-one:latest 24 | volumes: 25 | - ./data:/app/data:ro,cached 26 | environment: 27 | - FROCKET_REDIS_HOST=redis 28 | - FROCKET_REDIS_PORT=${FROCKET_REDIS_PORT:-6379} 29 | - FROCKET_S3_AWS_ENDPOINT_URL=http://mock-s3:9000 30 | - FROCKET_S3_AWS_ACCESS_KEY_ID=testonly 31 | - FROCKET_S3_AWS_SECRET_ACCESS_KEY=testonly 32 | depends_on: 33 | - redis 34 | - mock-s3 35 | command: worker 36 | 37 | frocket-lambda-worker: 38 | build: 39 | dockerfile: docker/local-lambda.Dockerfile 40 | context: . 41 | image: frocket/local-lambda:latest 42 | container_name: mock-lambda 43 | volumes: 44 | - ./data:/data:ro,cached 45 | environment: 46 | - FROCKET_REDIS_HOST=redis 47 | - FROCKET_REDIS_PORT=${FROCKET_REDIS_PORT:-6379} 48 | - FROCKET_S3_AWS_ENDPOINT_URL=http://mock-s3:9000 49 | - FROCKET_S3_AWS_ACCESS_KEY_ID=testonly 50 | - FROCKET_S3_AWS_SECRET_ACCESS_KEY=testonly 51 | - AWS_REGION=us-east-1 52 | depends_on: 53 | - redis 54 | - mock-s3 55 | ports: 56 | - 9001:9001 57 | command: frocket.worker.impl.aws_lambda_worker.lambda_handler 58 | 59 | frocket-apiserver: 60 | image: frocket/all-in-one:latest 61 | container_name: frocket-apiserver 62 | ports: 63 | - 5000:5000 64 | volumes: 65 | - ./data:/app/data:ro,cached 66 | environment: 67 | - APISERVER_NUM_WORKERS=2 68 | - FROCKET_REDIS_HOST=redis 69 | - FROCKET_REDIS_PORT=${FROCKET_REDIS_PORT:-6379} 70 | - FROCKET_S3_AWS_ENDPOINT_URL=http://mock-s3:9000 71 | - FROCKET_S3_AWS_ACCESS_KEY_ID=testonly 72 | - FROCKET_S3_AWS_SECRET_ACCESS_KEY=testonly 73 | - FROCKET_LAMBDA_AWS_NO_SIGNATURE=true 74 | - FROCKET_LAMBDA_AWS_ENDPOINT_URL=http://mock-lambda:9001 75 | - FROCKET_LAMBDA_AWS_REGION=us-east-1 76 | - FROCKET_INVOKER_LAMBDA_LEGACY_ASYNC=false 77 | - FROCKET_INVOKER_RETRY_FAILED_INTERVAL=0.05 78 | # - FROCKET_INVOKER=aws_lambda 79 | depends_on: 80 | - redis 81 | command: apiserver 82 | -------------------------------------------------------------------------------- /docker/all-in-one.Dockerfile: -------------------------------------------------------------------------------- 1 | # Base Python image with up-to-date OS packages & pip 2 | FROM python:3.8-slim as base 3 | RUN apt-get update && apt-get clean && \ 4 | python -m pip install --upgrade pip 5 | 6 | # Builder image: install packages and then cleanup some un-needed large files and directories 7 | FROM base as package-install 8 | WORKDIR /app 9 | COPY ./requirements.txt . 
10 | RUN pip install --no-cache-dir --no-compile -r requirements.txt -t ./packages 11 | # Delete un-needed big files in pyarrow, tests & include dirs, 12 | # and all directories in botocore/data except for services actually used by frocket 13 | RUN rm ./packages/pyarrow/*flight*.so* \ 14 | ./packages/pyarrow/*plasma*.so* \ 15 | ./packages/pyarrow/plasma-store-server && \ 16 | find ./packages -type d -name tests | xargs rm -rf && \ 17 | find ./packages -type d -name include | xargs rm -rf && \ 18 | find ./packages/botocore/data -type d -mindepth 1 -maxdepth 1 | grep -vE 's3|lambda' | xargs rm -rf 19 | 20 | # This image is based on 'base' again, so it doesn't carry over intermediate fat layers from package-install image. 21 | # It copies over only the pruned packages to the final image. 22 | FROM base 23 | WORKDIR /app 24 | COPY ./docker/entrypoint.sh . 25 | RUN chmod +x ./entrypoint.sh 26 | RUN useradd -ms /bin/bash frocket 27 | COPY --from=package-install /app/packages packages 28 | # The most frequently-changing file set - the source code itself, is copied last so previous layers are unaffected 29 | COPY ./requirements.txt . 30 | COPY ./test-requirements.txt . 31 | COPY ./setup.py . 32 | COPY ./frocket frocket 33 | COPY ./tests tests 34 | RUN pip install --no-cache-dir --no-compile --no-deps . -t ./packages 35 | USER frocket 36 | ENV PYTHONPATH=/app/packages 37 | ENTRYPOINT ["./entrypoint.sh"] 38 | -------------------------------------------------------------------------------- /docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | case "$1" in 3 | worker) 4 | echo "Starting Funnel Rocket queue-based worker" 5 | python -m frocket.worker.impl.queue_worker 6 | ;; 7 | apiserver) 8 | PORT=${APISERVER_PORT:-5000} 9 | NUM_WORKERS=${APISERVER_NUM_WORKERS:-8} 10 | echo "Starting Funnel Rocket API server with $NUM_WORKERS workers on port $PORT" 11 | python -m gunicorn.app.wsgiapp frocket.apiserver:app --bind=0.0.0.0:$PORT --workers=$NUM_WORKERS 12 | ;; 13 | *) 14 | echo "Invalid command supplied" 15 | exit 1 16 | ;; 17 | esac 18 | -------------------------------------------------------------------------------- /docker/local-lambda.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTHON_VERSION=3.8 2 | # Note: not using multi-stage build here, in contrary to all-in-one image. 3 | # This has the pro of very fast incremental builds locally, and the con of large image size - ok for tests. 4 | # Since we're switching to root during build, 5 | # need to return to default Lambda user afterwards (as defined in base image) 6 | ARG RUN_USER=sbx_user1051 7 | FROM lambci/lambda:python3.8 8 | # Lambda function code should be in /var/task 9 | WORKDIR /var/task 10 | COPY ./setup.py . 11 | COPY ./requirements.txt . 12 | # Lambda layer(s) (useful for holding all big & infrequently changing dependencies) 13 | # should be located under /opt, which is only writable by root. 
14 | # Don't install boto3/botocore, which is vendored by AWS in its most appropriate version 15 | USER root 16 | RUN grep -v boto requirements.txt > lambda_requirements.txt 17 | RUN mkdir /opt/python && pip install --no-compile --no-cache-dir -r lambda_requirements.txt -t /opt/python 18 | # Clean-up some big files 19 | RUN rm /opt/python/pyarrow/*flight*.so* \ 20 | /opt/python/pyarrow/*plasma*.so* \ 21 | /opt/python/pyarrow/plasma-store-server \ 22 | setup.py requirements.txt lambda_requirements.txt 23 | # Go back to user & workdir of base image 24 | USER ${RUN_USER} 25 | # Copy package source code, which is frequently changing, only at end of Dockerfile 26 | COPY ./frocket /var/task/frocket 27 | WORKDIR /var/task 28 | # These values are for running tests, not production usage 29 | ENV DOCKER_LAMBDA_STAY_OPEN=1 \ 30 | AWS_LAMBDA_FUNCTION_NAME=frocket \ 31 | AWS_LAMBDA_FUNCTION_TIMEOUT=15 \ 32 | AWS_LAMBDA_FUNCTION_MEMORY_SIZE=256 33 | -------------------------------------------------------------------------------- /docs/logo-blue.svg: -------------------------------------------------------------------------------- (SVG markup not preserved in this dump) -------------------------------------------------------------------------------- /docs/logo-icon-dark-blue.svg: -------------------------------------------------------------------------------- (SVG markup not preserved in this dump) -------------------------------------------------------------------------------- /docs/logo-icon-light-blue.svg: -------------------------------------------------------------------------------- (SVG markup not preserved in this dump) -------------------------------------------------------------------------------- /docs/logo-small-blue.svg: -------------------------------------------------------------------------------- (SVG markup not preserved in this dump) -------------------------------------------------------------------------------- /frocket/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple CLI for Funnel Rocket. 
3 | 4 | This is currently a wrapper over invoker_api directly (meaning that the CLI process is the invoker), rather than 5 | calling an API server - so it does not rely on a running server, but needs the same permissions (listing files 6 | in remote storage, access to Redis as datastore, optionally being able to invoke Lambdas). 7 | 8 | This makes the CLI more suitable for onboarding and evaluation, but in production it's preferable to use the API 9 | (for a better permissions model and centralized monitoring/logging, if nothing else). 10 | 11 | The CLI does provide a few optional flags which make it also suitable for automating jobs: 12 | * --nopretty returns JSON object/s without any captions 13 | * --notrim and --nocolor prevent data from being shortened or surrounded by ANSI color codes 14 | * The log level is controllable, and all log lines have a prefix making them easy to ignore. 15 | """ 16 | # Copyright 2021 The Funnel Rocket Maintainers 17 | # 18 | # Licensed under the Apache License, Version 2.0 (the "License"); 19 | # you may not use this file except in compliance with the License. 20 | # You may obtain a copy of the License at 21 | # 22 | # http://www.apache.org/licenses/LICENSE-2.0 23 | # 24 | # Unless required by applicable law or agreed to in writing, software 25 | # distributed under the License is distributed on an "AS IS" BASIS, 26 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 27 | # See the License for the specific language governing permissions and 28 | # limitations under the License. 29 | 30 | import argparse 31 | # TODO backlog don't import any frocket modules but a carefully selected set which does not then import heavy packages 32 | # or initialize mechanisms. This is only partially done now (see import at end of file). 33 | from frocket.common.config import config 34 | from frocket.common.tasks.registration import DatasetValidationMode, REGISTER_DEFAULT_VALIDATION_MODE, \ 35 | REGISTER_DEFAULT_FILENAME_PATTERN, REGISTER_DEFAULT_VALIDATE_UNIQUES 36 | 37 | REGISTER_VALIDATION_MODE_CHOICES = [e.value.lower() for e in DatasetValidationMode] 38 | LOG_LEVEL_CHOICES = ['debug', 'info', 'warning', 'error', 'critical'] 39 | LOG_LINE_PREFIX = '[Log ' 40 | LOG_FORMAT = LOG_LINE_PREFIX + '%(levelname)s %(name)s] %(message)s' 41 | 42 | 43 | def build_parser() -> argparse.ArgumentParser: 44 | parser = argparse.ArgumentParser(description='Simple CLI for Funnel Rocket', 45 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 46 | parser.add_argument('--notrim', action='store_true', help='Don\'t trim any text') 47 | parser.add_argument('--nocolor', action='store_true', help='Don\'t colorize any text') 48 | parser.add_argument('--nopretty', action='store_true', help='Don\'t pretty-print the response') 49 | parser.add_argument('--loglevel', type=str.lower, choices=LOG_LEVEL_CHOICES, 50 | help=f'Set log level {LOG_LEVEL_CHOICES}') 51 | subparsers = parser.add_subparsers(dest='command', title='commands') 52 | subparsers.required = True 53 | 54 | register_parser = subparsers.add_parser('register', help='Register a dataset', 55 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 56 | register_parser.add_argument('name', type=str, help='Dataset name') 57 | register_parser.add_argument('basepath', type=str, 58 | help='The path all files are directly under. Local and s3://... paths supported.') 59 | register_parser.add_argument('group_id_column', type=str, 60 | help='The column to group rows by, e.g. "userId", "userHash". 
' 61 | 'This column is required and no values can be missing. Each part (file) in the ' 62 | 'dataset should have a distinct set of values for this column.') 63 | register_parser.add_argument( 64 | 'timestamp_column', type=str, 65 | help='The column holding the timestamp of each row, e.g. "timestamp", "ts". ' 66 | 'Must be a numeric column with no values missing. Using a unix timestamp is advised - ' 67 | 'with or without sub-second resolution based on your needs, either as int or float.') 68 | register_parser.add_argument('--pattern', type=str, default=REGISTER_DEFAULT_FILENAME_PATTERN, 69 | help='Filename pattern. Sub-directories are currently not supported.') 70 | register_parser.add_argument('--validation', type=str.lower, 71 | choices=REGISTER_VALIDATION_MODE_CHOICES, 72 | default=REGISTER_DEFAULT_VALIDATION_MODE.value.lower(), 73 | help=f"Validation mode to use {REGISTER_VALIDATION_MODE_CHOICES}", 74 | metavar='MODE') 75 | register_parser.add_argument('--skip-uniques', action='store_true', 76 | default=not REGISTER_DEFAULT_VALIDATE_UNIQUES, 77 | help='Skip validation of group_id_column values uniqueness across files ' 78 | '(the set of files to test is determined by --validation argument)') 79 | 80 | list_parser = subparsers.add_parser('list', help='List datasets') 81 | 82 | run_query_parser = subparsers.add_parser('run', help='Run query') 83 | run_query_parser.add_argument('dataset') 84 | query_sources_group = run_query_parser.add_mutually_exclusive_group(required=True) 85 | query_sources_group.add_argument('--file', '-f', type=str, help='Run query stored in file', dest='filename') 86 | query_sources_group.add_argument('--empty', '-e', action='store_true', 87 | help='Run an empty query with no conditions') 88 | query_sources_group.add_argument('--string', '-s', type=str, 89 | help='Run the following query string', dest='query_string') 90 | 91 | info_parser = subparsers.add_parser('info', help='Show dataset information') 92 | info_parser.add_argument('dataset', type=str) 93 | info_parser.add_argument('--full', action='store_true', help='Show full schema') 94 | 95 | unreg_parser = subparsers.add_parser('unregister', help='Unregister a dataset') 96 | unreg_parser.add_argument('dataset', type=str) 97 | unreg_parser.add_argument('--force', action='store_true', 98 | help='Unregister a dataset even if it\'s currently in use') 99 | 100 | config_parser = subparsers.add_parser('config', help='Show configuration') 101 | return parser 102 | 103 | 104 | def run_from_args(args: argparse.Namespace): 105 | config['log.format'] = LOG_FORMAT if args.nocolor else f"\033[33m{LOG_FORMAT}\033[0m" 106 | if args.loglevel: 107 | config['log.level'] = args.loglevel 108 | config.init_logging(force_console_output=True) 109 | 110 | # invoker_api isn't loaded (or logging implicitly initialized) till arguments are validated and log level is set 111 | from frocket.cli_commands import run_command 112 | run_command(args.command, args) 113 | 114 | 115 | if __name__ == '__main__': 116 | parser = build_parser() 117 | args = parser.parse_args() 118 | run_from_args(args) 119 | -------------------------------------------------------------------------------- /frocket/cli_commands.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of CLI commands. 3 | """ 4 | # Copyright 2021 The Funnel Rocket Maintainers 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import argparse 19 | import json 20 | import sys 21 | from json.decoder import JSONDecodeError 22 | from pathlib import Path 23 | from typing import Any 24 | from tabulate import tabulate 25 | from frocket.common.config import config 26 | from frocket.common.serializable import SerializableDataClass 27 | from frocket.common.tasks.base import BaseApiResult 28 | from frocket.common.tasks.registration import DatasetValidationMode, RegisterArgs 29 | from frocket.invoker import invoker_api 30 | 31 | DATE_FORMAT = '%Y-%m-%d %H:%M:%S %Z' 32 | 33 | 34 | def run_command(cmd: str, args: argparse.Namespace): 35 | mapping = { 36 | 'register': register_dataset_cmd, 37 | 'unregister': unregister_dataset_cmd, 38 | 'list': list_datasets_cmd, 39 | 'run': run_query_cmd, 40 | 'info': dataset_info_cmd, 41 | 'config': show_config_cmd 42 | } 43 | mapping[cmd](args) 44 | 45 | 46 | def fail_missing_dataset(name: str): 47 | sys.exit(f"Dataset '{name}' not found!") 48 | 49 | 50 | def trim_column(s: str, args: argparse.Namespace, maxwidth: int) -> str: 51 | if args.notrim or args.nopretty or len(s) <= maxwidth: 52 | return s 53 | else: 54 | return s[:maxwidth - 3] + '...' 55 | 56 | 57 | def print_json(name: str, o: Any, pretty_print: bool): 58 | def to_json(o: Any, indent: int = None) -> str: 59 | return o.to_json(indent=indent) if isinstance(o, SerializableDataClass) else json.dumps(o, indent=indent) 60 | 61 | if pretty_print: 62 | print(name + ':', to_json(o, indent=2)) 63 | else: 64 | print(to_json(o)) 65 | 66 | 67 | def handle_api_result(res: BaseApiResult, pretty_print: bool): 68 | print_json('API Result', res, pretty_print) 69 | if not res.success: 70 | sys.exit('FAILED' if pretty_print else 1) 71 | 72 | 73 | def register_dataset_cmd(args): 74 | validation_mode = DatasetValidationMode[args.validation.upper()] 75 | register_args = RegisterArgs(name=args.name, 76 | basepath=args.basepath, 77 | group_id_column=args.group_id_column, 78 | timestamp_column=args.timestamp_column, 79 | pattern=args.pattern, 80 | validation_mode=validation_mode, 81 | validate_uniques=not args.skip_uniques) 82 | res = invoker_api.register_dataset(register_args) 83 | handle_api_result(res, pretty_print=not args.nopretty) 84 | 85 | 86 | def unregister_dataset_cmd(args): 87 | res = invoker_api.unregister_dataset(args.dataset, force=args.force) 88 | handle_api_result(res, pretty_print=not args.nopretty) 89 | 90 | 91 | def list_datasets_cmd(args): 92 | datasets = sorted(invoker_api.list_datasets(), key=lambda ds: ds.id.registered_at, reverse=True) 93 | display_datasets = [{'name': trim_column(ds.id.name, args, maxwidth=30), 94 | 'registered at': ds.id.registered_at.strftime(DATE_FORMAT), 95 | 'parts': ds.total_parts, 96 | 'group id': ds.group_id_column, 97 | 'timestamp': ds.timestamp_column, 98 | 'path': trim_column(ds.basepath, args, maxwidth=50)} 99 | for ds in datasets] 100 | if args.nopretty: 101 | print(json.dumps(display_datasets)) 102 | else: 103 | if len(datasets) == 0: 104 | print('No datasets registered yet') 105 | else: 106 | print(tabulate(display_datasets, headers='keys')) 107 
| 108 | 109 | def json_parse(s: str) -> dict: 110 | try: 111 | return json.loads(s) 112 | except JSONDecodeError as e: 113 | sys.exit(f'JSON Error: {e}') 114 | 115 | 116 | def run_query_cmd(args): 117 | ds_info = invoker_api.get_dataset(args.dataset) 118 | if not ds_info: 119 | fail_missing_dataset(args.dataset) 120 | query = None 121 | if args.empty: 122 | query = {} 123 | elif args.query_string: 124 | query = json_parse(args.query_string) 125 | elif args.filename: 126 | filepath = Path(args.filename) 127 | if not filepath.exists(): 128 | sys.exit(f'File not found: {args.filename}') 129 | else: 130 | query_str = filepath.read_text(encoding='utf-8') 131 | query = json_parse(query_str) 132 | else: 133 | sys.exit('Unknown mode') 134 | 135 | try: 136 | res = invoker_api.run_query(ds_info, query) 137 | handle_api_result(res, pretty_print=not args.nopretty) 138 | except Exception as e: 139 | sys.exit(f'Error: {e}') 140 | 141 | 142 | def dataset_info_cmd(args): 143 | show_full = args.full 144 | ds_info = invoker_api.get_dataset(args.dataset) 145 | if not ds_info: 146 | fail_missing_dataset(args.dataset) 147 | parts_info = invoker_api.get_dataset_parts(ds_info) 148 | schema_info = invoker_api.get_dataset_schema(ds_info, full=show_full) 149 | print_json('Basic information', ds_info, pretty_print=not args.nopretty) 150 | print_json('Parts', parts_info, pretty_print=not args.nopretty) 151 | print_json(f'Schema (full: {show_full})', schema_info, pretty_print=not args.nopretty) 152 | 153 | 154 | def show_config_cmd(args): 155 | print_json(f'Configuration', config, pretty_print=not args.nopretty) 156 | -------------------------------------------------------------------------------- /frocket/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/common/dataset.py: -------------------------------------------------------------------------------- 1 | """Base classes for registered datasets and their metadata.""" 2 | # Copyright 2021 The Funnel Rocket Maintainers 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
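# Illustrative sketch: how the metadata classes defined in this module compose. All dataset, file and
# column names below are made-up examples, not values taken from any real dataset:
#
#   parts = DatasetPartsInfo(naming_method=PartNamingMethod.LIST, total_parts=2, total_size=4096,
#                            filenames=['part-00000.parquet', 'part-00001.parquet'])
#   ds = DatasetInfo(basepath='s3://mybucket/mydataset', total_parts=2, id=DatasetId.now('mydataset'),
#                    group_id_column='userId', timestamp_column='timestamp')
#   parts.fullpaths(ds)  # -> ['s3://mybucket/mydataset/part-00000.parquet',
#                        #     's3://mybucket/mydataset/part-00001.parquet']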
15 | 16 | import logging 17 | from enum import auto 18 | from datetime import datetime, timezone 19 | from typing import Optional, List, Dict 20 | from dataclasses import dataclass, field 21 | from frocket.common.serializable import SerializableDataClass, AutoNamedEnum 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class PartNamingMethod(AutoNamedEnum): 27 | """ 28 | For future use: currently the full list of dataset filenames is stored as metadata, however, if a consistent 29 | numbering pattern can be identified, it may be useful both for more compact metadata and for a more predictable 30 | part index -> filename mapping. 31 | """ 32 | RUNNING_NUMBER = auto() 33 | LIST = auto() 34 | 35 | 36 | @dataclass(frozen=True) 37 | class DatasetId(SerializableDataClass): 38 | """ 39 | The main reason why this class exists: datasets can be re-registered multiple times with the same name, but any 40 | caching behavior should be sensitive to the registered date and become invalid on re-registration. 41 | In concrete terms, caching should be based on DatasetId keys (which are immutable) rather than a dataset name. 42 | 43 | Re-registering a dataset is useful in cases such as: 44 | 1. When you don't need to manage revisions yourself (via specifying a new dataset name and un-registering old ones). 45 | 2. As an alias to the current version (datasets are only metadata, you can register the same physical files N times) 46 | 3. If the datafiles were found to be incomplete/invalid, and after fixing the issue you want to invalidate caching. 47 | """ 48 | name: str 49 | registered_at: datetime 50 | 51 | @classmethod 52 | def now(cls, name: str): 53 | return DatasetId(name, registered_at=datetime.now(tz=timezone.utc)) 54 | 55 | 56 | @dataclass(frozen=True) 57 | class DatasetPartId(SerializableDataClass): 58 | """Specifies a single part (file) in a dataset version (see documentation for DatasetId above!).""" 59 | dataset_id: DatasetId 60 | path: str 61 | part_idx: int 62 | 63 | 64 | @dataclass(frozen=True) 65 | class DatasetInfo(SerializableDataClass): 66 | """ 67 | Basic metadata for a dataset. 68 | 69 | This class should be kept pretty small, as it's passed along in task requests. 70 | More detailed metadata is found in the data schema object, which is stored separately and read when needed 71 | (and also exists in both short and full versions) 72 | """ 73 | basepath: str 74 | total_parts: int 75 | id: DatasetId 76 | group_id_column: str  # The column by which the dataset is partitioned, and grouping is done. 77 | timestamp_column: str  # The column by which timeframe conditions and funnels are run. 78 | 79 | 80 | @dataclass(frozen=True) 81 | class DatasetPartsInfo(SerializableDataClass): 82 | """Holds the list of files in the dataset. 
Separate from DatasetInfo only due to size (this data usually does not 83 | need to be sent in network calls).""" 84 | naming_method: PartNamingMethod 85 | total_parts: int 86 | total_size: int 87 | running_number_pattern: Optional[str] = field(default=None) 88 | filenames: Optional[List[str]] = field(default=None) 89 | 90 | def fullpaths(self, parent: DatasetInfo) -> List[str]: 91 | parentpath = parent.basepath if parent.basepath.endswith('/') else parent.basepath + '/' 92 | 93 | if self.naming_method == PartNamingMethod.LIST: 94 | assert (self.filenames and len(self.filenames) == parent.total_parts) 95 | return [parentpath + filename for filename in self.filenames] 96 | else: 97 | assert self.running_number_pattern 98 | return [parentpath + self.running_number_pattern.format(idx) 99 | for idx in range(parent.total_parts)] 100 | 101 | 102 | class DatasetColumnType(AutoNamedEnum): 103 | INT = auto() 104 | FLOAT = auto() 105 | BOOL = auto() 106 | # Categorical columns are not a separate type to the query engine. That designation exists and is used separately. 107 | STRING = auto() 108 | 109 | 110 | @dataclass(frozen=True) 111 | class DatasetColumnAttributes(SerializableDataClass): 112 | """ 113 | The 'full' information on each column. TODO backlog use polymorphism? (needs support in de-serialization) 114 | 115 | For columns which were either saved by Pandas as categoricals, or are identified during registration to be such, 116 | store a mapping of top N values (configurable) to their normalized share in the dataset. Since registration 117 | does not read all files but only a sample, that ratio cannot be an absolute number or the exact ratio - but still 118 | useful for clients. 119 | 120 | cat_unique_ratio is the ratio of unique value count to all values (or: series.nunique()/len(series)), and may be 121 | a useful rough indicator of how much RAM is saved (and str.match() operations sped-up!) by the categorical 122 | representation. Columns are determined to be loaded as categorical if this value is lower than configured. 123 | Loading of columns as categoricals is also usually much faster, but that greatly depends on whether a dictionary 124 | was saved for that column in the Parquet file or not - so it depends on the tool used to create these files. 125 | """ 126 | numeric_min: Optional[float] = None 127 | numeric_max: Optional[float] = None 128 | categorical: bool = False 129 | cat_top_values: Optional[Dict[str, float]] = None 130 | cat_unique_ratio: Optional[float] = None 131 | 132 | 133 | @dataclass(frozen=True) 134 | class DatasetColumn(SerializableDataClass): 135 | name: str 136 | dtype_name: str 137 | coltype: DatasetColumnType 138 | colattrs: DatasetColumnAttributes 139 | 140 | 141 | @dataclass(frozen=True) 142 | class DatasetShortSchema(SerializableDataClass): 143 | """Schema, the short version - typically all you may need.""" 144 | columns: Dict[str, DatasetColumnType] 145 | min_timestamp: float 146 | max_timestamp: float 147 | # In files created by Pandas with its metadata intact in the Parquet file, columns marked as categoricals. 148 | source_categoricals: List[str] = field(default=None) 149 | # Columns detected during registration to be good candidates for explicitly loading as categoricals (by PyArrow). 
150 | potential_categoricals: List[str] = field(default=None) 151 | 152 | 153 | @dataclass(frozen=True) 154 | class DatasetSchema(SerializableDataClass): 155 | group_id_column: str 156 | timestamp_column: str 157 | columns: Dict[str, DatasetColumn] 158 | # Just the names->dtypes of all columns not (currently) supported. 159 | unsupported_columns: Dict[str, str] 160 | 161 | def short(self) -> DatasetShortSchema: 162 | """Make short from full.""" 163 | cols = {name: col.coltype for name, col in self.columns.items()} 164 | source_categoricals = [] 165 | potential_categoricals = [] 166 | for name, col in self.columns.items(): 167 | if col.colattrs.categorical: 168 | if col.dtype_name == 'category': 169 | source_categoricals.append(name) 170 | else: 171 | potential_categoricals.append(name) 172 | ts_attrs = self.columns[self.timestamp_column].colattrs 173 | min_ts = ts_attrs.numeric_min 174 | max_ts = ts_attrs.numeric_max 175 | 176 | return DatasetShortSchema(columns=cols, 177 | source_categoricals=source_categoricals, 178 | potential_categoricals=potential_categoricals, 179 | min_timestamp=min_ts, max_timestamp=max_ts) 180 | -------------------------------------------------------------------------------- /frocket/common/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/common/helpers/pandas.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Iterable 16 | import pandas as pd 17 | import numpy as np 18 | 19 | 20 | def filter_by_isin(df: pd.DataFrame, column: str, values: Iterable) -> pd.DataFrame: 21 | """ 22 | For the given DataFrame, return only rows where df[column] is in the given values. 23 | This is a surprisingly faster alternative to built-in Pandas/NumPy functions: df[np.isin(df[column], values)] 24 | A value can appear in multiple rows (e.g. the same user ID appearing multiple rows) 25 | 26 | TODO Merge a [Numba-based isin()](https://stackoverflow.com/questions/53046473/numpy-isin-performance-improvement) 27 | function, compiled AOT for relevant array dtypes. 
This would be arch-dependent and optional (with fallback) 28 | """ 29 | # First, create a "map" series from all possible values in the column => whether they should pass the filter 30 | all_ids = df[column].unique() 31 | is_id_relevant = pd.Series(np.zeros(len(all_ids)), index=all_ids).astype('bool') # Default false 32 | is_id_relevant.loc[values] = True 33 | 34 | # Create a boolean mask for column, based on the mapping above. Grab the raw array. 35 | mask = is_id_relevant[df[column]].values 36 | # Apply mask 37 | return df[mask] 38 | 39 | 40 | def add_column_by_value_map(df: pd.DataFrame, keys_column: str, values_map_series: pd.Series, new_column: str) -> None: 41 | """ 42 | Add a new column to the given df. For each row, df[new_column] will be set to an appropriate value from 43 | values_map_series: the value whose index is df[keys_column] in that row. 44 | 45 | e.g. given a DF of user activities having a userId column (with potentially multiple rows per user), and a 46 | values_map_series whose unique index is a User ID, and its values are the age of that user, the function will add 47 | a new column to the given DF with the age of that row's user ID 48 | 49 | If a value in keys_column does not have a matching index in values_map_series, the cell value would be NaN. 50 | This function is optimized for performance. 51 | 52 | The given DF is modified inplace. 53 | """ 54 | # Create a new mapping between ALL unique values of IDs of df[keys_column] and their matching value (or NaN) 55 | unique_keys = df[keys_column].unique() 56 | key_to_value = pd.Series(data=np.nan, index=unique_keys) 57 | key_to_value.loc[values_map_series.index] = values_map_series 58 | 59 | # Now we can create the new column, using the mapping 60 | df[new_column] = key_to_value[df[keys_column]].values 61 | -------------------------------------------------------------------------------- /frocket/common/helpers/storage.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple abstraction of local & remote filesystems. 3 | 4 | Currently supports either a local filesystem (for non-distributed usage, or potentially a fast network share) 5 | and S3 (or S3-compatible object stores such as MinIO, which is used for running tests). 6 | Additional protocols are welcome. 7 | 8 | TODO backlog: support pagination for S3 listing (so more than 1,000 files per dataset) 9 | TODO backlog: support auto-identification of numbering pattern in dataset files, so the full list of filenames 10 | would not have to reside in the datastore 11 | """ 12 | # Copyright 2021 The Funnel Rocket Maintainers 13 | # 14 | # Licensed under the Apache License, Version 2.0 (the "License"); 15 | # you may not use this file except in compliance with the License. 16 | # You may obtain a copy of the License at 17 | # 18 | # http://www.apache.org/licenses/LICENSE-2.0 19 | # 20 | # Unless required by applicable law or agreed to in writing, software 21 | # distributed under the License is distributed on an "AS IS" BASIS, 22 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 23 | # See the License for the specific language governing permissions and 24 | # limitations under the License. 
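# Illustrative usage sketch of this module, as used when registering a dataset and loading its parts.
# The bucket, path and pattern below are made-up examples:
#
#   handler = storage_handler_for('s3://mybucket/mydataset')  # paths without a protocol map to the local filesystem
#   parts_info = handler.discover_files(pattern='*.parquet')  # DatasetPartsInfo with matching filenames and total size
#   local_copy = handler.get_local_path('s3://mybucket/mydataset/part-00000.parquet')  # remote files are fetched to a temp path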
25 | 26 | import logging 27 | import re 28 | import tempfile 29 | import uuid 30 | from abc import abstractmethod 31 | from enum import Enum, auto 32 | from fnmatch import fnmatch 33 | from pathlib import Path 34 | from typing import NamedTuple, Optional, List 35 | import boto3 36 | from frocket.common.config import config 37 | from frocket.common.dataset import DatasetPartsInfo, PartNamingMethod 38 | 39 | logger = logging.getLogger(__name__) 40 | 41 | 42 | class StorageHandler: 43 | """Simple abstraction of a storage protocol.""" 44 | class FileBaseInfo(NamedTuple): 45 | relpath: str 46 | size: int 47 | 48 | def __init__(self, path: str): 49 | assert self.valid(path) 50 | self._path = path 51 | 52 | @classmethod 53 | def valid(cls, path: str) -> bool: 54 | """For validation of a path prior to instantiating the handler - a nicety instead of exceptions later, 55 | to be overriden where appropriate.""" 56 | return True 57 | 58 | @property 59 | @abstractmethod 60 | def remote(self) -> bool: 61 | """This affects the caching behavior used by workers (see part_loader.py).""" 62 | pass 63 | 64 | @abstractmethod 65 | def _list_files(self, pattern: str) -> List[FileBaseInfo]: 66 | """Override in subclasses""" 67 | pass 68 | 69 | def discover_files(self, pattern: str) -> DatasetPartsInfo: 70 | files = self._list_files(pattern) 71 | files.sort(key=lambda fi: fi.relpath) 72 | # TODO backlog implement PartNamingMethod.RUNNING_NUMBER for compact metadata in large datasets 73 | parts_info = DatasetPartsInfo(naming_method=PartNamingMethod.LIST, 74 | total_parts=len(files), 75 | total_size=sum([fi.size for fi in files]), 76 | filenames=[fi.relpath for fi in files], 77 | running_number_pattern=None) 78 | return parts_info 79 | 80 | @abstractmethod 81 | def _local_path(self, fullpath: str) -> str: 82 | """ 83 | If the filesystem is remote, download and return a local copy. 84 | Files should be cleaned-up by the caller which controls the caching behavior. 
85 | """ 86 | pass 87 | 88 | def get_local_path(self, fullpath: str) -> str: 89 | if not fullpath.startswith(self._path): 90 | raise Exception(f"Given full path {fullpath} is not under handler's path {self._path}") 91 | 92 | return self._local_path(fullpath) 93 | 94 | 95 | class FileStorageHanler(StorageHandler): 96 | """Super-simple local filesystem handler""" 97 | @property 98 | def remote(self): 99 | return False 100 | 101 | def _list_files(self, pattern): 102 | paths = Path(self._path).iterdir() 103 | files = [StorageHandler.FileBaseInfo(path.name, path.stat().st_size) 104 | for path in paths 105 | if fnmatch(path.name, pattern)] 106 | return files 107 | 108 | def _local_path(self, fullpath): 109 | if not Path(fullpath).is_file(): 110 | raise Exception(f"Path is missing/not a file: {fullpath}") 111 | return fullpath 112 | 113 | 114 | class S3StorageHanler(StorageHandler): 115 | """S3 filesystem handler, supports datasets directly under the bucket or within a sub-directory.""" 116 | S3_PATH_REGEX = re.compile(r"^s3://([a-zA-Z0-9_\-.]+)/([a-zA-Z0-9_\-./]*)$") 117 | 118 | def __init__(self, path: str): 119 | super().__init__(path) 120 | path_parts = self.S3_PATH_REGEX.match(path) 121 | self._bucket = path_parts.group(1) 122 | self._path_in_bucket = path_parts.group(2) 123 | no_trailing_slash = self._path_in_bucket and self._path_in_bucket[-1:] != '/' 124 | self._path_in_bucket_normalized = self._path_in_bucket + ('/' if no_trailing_slash else '') 125 | 126 | @classmethod 127 | def valid(cls, path): 128 | return True if cls.S3_PATH_REGEX.match(path) else False 129 | 130 | @property 131 | def remote(self): 132 | return True 133 | 134 | def _list_files(self, pattern): 135 | path_in_bucket = self._path_in_bucket_normalized 136 | logger.info(f"Listing files in S3 with bucket {self._bucket} and prefix {path_in_bucket}...") 137 | # TODO backlog support pagination 138 | s3response = self._client().list_objects_v2(Bucket=self._bucket, Prefix=path_in_bucket) 139 | 140 | filename_start_idx = len(path_in_bucket) 141 | path_to_size = {obj['Key'][filename_start_idx:]: obj['Size'] for obj in s3response['Contents']} 142 | files = [StorageHandler.FileBaseInfo(path, size) 143 | for path, size in path_to_size.items() 144 | if fnmatch(path, pattern)] 145 | return files 146 | 147 | def _local_path(self, fullpath): 148 | localpath = str(Path(tempfile.gettempdir()) / str(uuid.uuid4())) 149 | logger.info(f"Downloading {fullpath} to {localpath}...") 150 | self._client().download_file(self._bucket, self._path_in_bucket, localpath) 151 | return localpath 152 | 153 | @classmethod 154 | def _client(cls): 155 | if not hasattr(cls, '_s3client'): 156 | cls._s3client = boto3.client('s3', **config.aws_client_settings(service='s3')) 157 | return cls._s3client 158 | 159 | 160 | class StorageProtocol(Enum): 161 | FILE = auto() 162 | S3 = auto() 163 | 164 | @classmethod 165 | def get(cls, name: str): 166 | return cls.__members__.get(name.upper()) 167 | 168 | @classmethod 169 | def names(cls) -> List[str]: 170 | return list(cls.__members__.keys()) 171 | 172 | 173 | PATH_WITH_PROTOCOL_RE = r'(\w+)://(.+)$' 174 | PROTOCOL_TO_HANDLER = { 175 | StorageProtocol.FILE: FileStorageHanler, 176 | StorageProtocol.S3: S3StorageHanler 177 | } 178 | 179 | 180 | def storage_handler_for(path: str, throw_if_missing: bool = True) -> Optional[StorageHandler]: 181 | """ 182 | Instantiate the appropriate handler for the given path. 183 | Paths without explicit protocol are considered local. 
184 | """ 185 | path_and_protocol = re.match(PATH_WITH_PROTOCOL_RE, path) 186 | if path_and_protocol: 187 | method_name = path_and_protocol.groups()[0] 188 | method = StorageProtocol.get(method_name) 189 | if not method: 190 | if throw_if_missing: 191 | raise Exception(f"Storage protocol '{method_name}' is not in supported list: {StorageProtocol.names()}") 192 | else: 193 | return None 194 | elif method == StorageProtocol.FILE: 195 | path = path_and_protocol.groups()[1] 196 | else: 197 | method = StorageProtocol.FILE 198 | 199 | handler_cls = PROTOCOL_TO_HANDLER[method] 200 | if not handler_cls.valid(path): 201 | raise Exception(f"Invalid path: {path} (protocol: {method.name})") 202 | return handler_cls(path) 203 | -------------------------------------------------------------------------------- /frocket/common/helpers/utils.py: -------------------------------------------------------------------------------- 1 | """For everything but the kitchen sink.""" 2 | # Copyright 2021 The Funnel Rocket Maintainers 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import functools 17 | import math 18 | import random 19 | import uuid 20 | import time 21 | from io import BytesIO 22 | from typing import Optional, List 23 | import pandas as pd 24 | import pyarrow.feather as feather 25 | import numpy as np 26 | 27 | 28 | def terminal_red(message: str) -> str: 29 | return f"\033[31m{message}\033[0m" 30 | 31 | 32 | def terminal_green(message: str) -> str: 33 | return f"\033[32m{message}\033[0m" 34 | 35 | 36 | def memoize(obj): 37 | """Standard issue memoization decorator for caching function results (which don't need invalidation).""" 38 | cache = obj._cache = {} 39 | 40 | @functools.wraps(obj) 41 | def memoizer(*args, **kwargs): 42 | key = str(args) + str(kwargs) 43 | if key not in cache: 44 | cache[key] = obj(*args, **kwargs) 45 | return cache[key] 46 | 47 | return memoizer 48 | 49 | 50 | def sample_from_range(range_max: int, 51 | sample_ratio: float, 52 | max_samples: int, 53 | preselected: Optional[List[int]]) -> List[int]: 54 | """ 55 | Given a range of numbers in 0..range_max, return random samples. 56 | Count of samples is set by sample_ratio, up to max_samples. 57 | If preselected is passed, include these indexes first. 
58 | """ 59 | available_indexes = list(range(range_max)) 60 | sample_count = min(math.floor(range_max * sample_ratio), max_samples) 61 | 62 | if preselected: 63 | chosen = list(preselected) 64 | for i in preselected: 65 | available_indexes.remove(i) 66 | sample_count = max(sample_count - len(preselected), 0) 67 | else: 68 | chosen = [] 69 | 70 | if sample_count > 0: 71 | chosen += random.choices(available_indexes, k=sample_count) 72 | return chosen 73 | 74 | 75 | def timestamped_uuid(prefix: str = None) -> str: 76 | return f"{prefix or ''}{math.floor(time.time())}-{str(uuid.uuid4())[:8]}" 77 | 78 | 79 | def ndarray_to_bytes(arr: np.ndarray) -> bytes: 80 | """Use PyArrow's feather format as a compute- and space-efficient format for serializing NumPy arrays.""" 81 | df = pd.DataFrame(data={'arr': arr}) 82 | buf = BytesIO() 83 | # noinspection PyTypeChecker 84 | feather.write_feather(df, buf) 85 | buf.seek(0) 86 | return buf.read() 87 | 88 | 89 | def bytes_to_ndarray(data: bytes) -> np.ndarray: 90 | df = feather.read_feather(BytesIO(data)) 91 | return df['arr'] 92 | -------------------------------------------------------------------------------- /frocket/common/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/common/tasks/async_tracker.py: -------------------------------------------------------------------------------- 1 | """ 2 | AsyncJobTracker object is handed by invoker_api to clients that launch a job in a non-blocking fashion. 3 | It enables either periodic polling or blocking on updates. Updates are guaranteed to be atomic - that is, 4 | there may be further updates, but the status you have in hand is consistent. 5 | """ 6 | # Copyright 2021 The Funnel Rocket Maintainers 7 | # 8 | # Licensed under the Apache License, Version 2.0 (the "License"); 9 | # you may not use this file except in compliance with the License. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 
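# Editor's note: a minimal polling sketch of how a client might consume the tracker
# defined below (hypothetical usage; `run_job_async` stands in for whichever
# invoker_api call hands back an AsyncJobTracker):
#
#   tracker = run_job_async(...)
#   for status in tracker.generator():               # blocks until each update arrives
#       print(status.stage.name, status.message, status.task_counters)
#   final_result = tracker.status.result             # set once stage == AsyncJobStage.DONE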
19 | 20 | import logging 21 | import time 22 | from abc import ABCMeta, abstractmethod 23 | from dataclasses import dataclass 24 | from enum import auto 25 | from queue import Queue, Empty 26 | from typing import Optional, Dict, Generator 27 | from frocket.common.serializable import AutoNamedEnum 28 | from frocket.common.tasks.base import BaseJobResult, TaskStatus 29 | 30 | logger = logging.getLogger(__name__) 31 | 32 | 33 | class AsyncJobStage(AutoNamedEnum): 34 | STARTING = auto() 35 | RUNNING = auto() 36 | FINISHING = auto() 37 | DONE = auto() 38 | 39 | 40 | @dataclass(frozen=True) 41 | class AsyncJobStatus: 42 | stage: AsyncJobStage 43 | message: Optional[str] = None # The job may set descriptive text for what it's doing 44 | result: Optional[BaseJobResult] = None # Only available on stage=AsyncJobStage.DONE 45 | task_counters: Optional[Dict[TaskStatus, int]] = None 46 | 47 | 48 | class JobTimeoutError(Exception): 49 | pass 50 | 51 | 52 | class AsyncJobTracker(metaclass=ABCMeta): 53 | """The interface as known to clients""" 54 | 55 | @property 56 | @abstractmethod 57 | def status(self) -> AsyncJobStatus: 58 | """Get the latest status - as a consistent object which will not be mutated while using it""" 59 | pass 60 | 61 | @property 62 | @abstractmethod 63 | def elapsed_time(self) -> float: 64 | pass 65 | 66 | @property 67 | @abstractmethod 68 | def wait_time_remaining(self) -> Optional[float]: 69 | """ 70 | If a tracker object was initialized with a timeout value by its creator (the invoker_api, 71 | based on configuration), then time remaining till timeout is known and can be returned. 72 | """ 73 | pass 74 | 75 | @abstractmethod 76 | def wait(self, timeout: float = None) -> bool: 77 | """ 78 | Blocking wait for updates with the given timeout, in seconds - but always capped to max wait time if set. 79 | By default, timeout is None - meaning wait up till max wait time (or indefinitely, in case it wasn't set). 80 | Assuming wait time is set, this is a good choice since no busy loop or semi-busy loop is needed. 81 | """ 82 | pass 83 | 84 | def generator(self) -> Generator[AsyncJobStatus, None, None]: 85 | """ 86 | Returns updates as they come for easier consumption, if blocking behavior is ok. 87 | This generator does not rely on any private attributes. 88 | """ 89 | while True: 90 | update_available = self.wait() 91 | if not self.wait_time_remaining: 92 | raise JobTimeoutError() 93 | 94 | status_snapshot = self.status 95 | if status_snapshot.result: 96 | break 97 | 98 | if update_available: 99 | yield status_snapshot 100 | 101 | yield status_snapshot 102 | 103 | 104 | class AsyncJobStatusUpdater(AsyncJobTracker): 105 | """ 106 | Implementation of AsyncJobTracker, which is only created within invoker_api and updated by invoker/job code. 107 | 108 | The one curiousity here is the blocking wait() mechanism which is based on a Queue instance. 109 | How it works: the client's wait() call blocks on waiting for a queue item. If there's already one, 110 | it's immediately returned. Once consumed, the queue is empty again and a subsequent wait() will repeat 111 | the process. Typically, the queue should have either zero or a only single item - see _signal_update() below. 
112 | """ 113 | def __init__(self, max_wait: float = None): 114 | self._status: AsyncJobStatus = AsyncJobStatus(stage=AsyncJobStage.STARTING) 115 | self._update_queue = Queue() 116 | self._max_wait = max_wait 117 | self._start_time = time.time() 118 | 119 | @property 120 | def elapsed_time(self) -> float: 121 | return time.time() - self._start_time 122 | 123 | @property 124 | def wait_time_remaining(self) -> Optional[float]: 125 | assert self._max_wait 126 | remaining = self._max_wait - self.elapsed_time 127 | return remaining if remaining > 0 else 0 128 | 129 | @property 130 | def status(self) -> AsyncJobStatus: 131 | return self._status 132 | 133 | def _update_status(self, new_status: AsyncJobStatus) -> None: 134 | """Only signal an update if there was actually any change.""" 135 | modified = self._status != new_status 136 | self._status = new_status 137 | if modified: 138 | if logger.isEnabledFor(logging.DEBUG): 139 | logger.debug(f"Updated async status from\n:{self._status} to:\n{new_status}") 140 | self._signal_update() 141 | pass 142 | 143 | def update(self, stage: AsyncJobStage = None, message: str = None, task_counters: Dict[TaskStatus, int] = None): 144 | # Asserts are used here as the invoker/job classes are internal to the invoker_api, and are expected to conform 145 | # to this class' requirements. If not, it's probably a bug. 146 | assert stage != AsyncJobStage.DONE # To move to DONE stage, done() should be explicitly called 147 | assert self._status.stage != AsyncJobStage.DONE # No more updates after DONE was called once 148 | stage = stage or self._status.stage 149 | task_counters = task_counters or self._status.task_counters 150 | # Automatically cleanup message when moving in stages 151 | message = message or (self._status.message if (stage == self._status.stage) else None) 152 | 153 | self._update_status(AsyncJobStatus(stage=stage, message=message, 154 | task_counters=task_counters)) 155 | 156 | def done(self, result: BaseJobResult): 157 | self._update_status(AsyncJobStatus(stage=AsyncJobStage.DONE, result=result, 158 | task_counters=self._status.task_counters)) 159 | 160 | def _signal_update(self): 161 | if self._update_queue.empty(): 162 | # If the client *already* has an update waiting for it, no need to do anything - it will read the latest 163 | # state anyway when it gets to consume it (the queue item itself doesn't hold any information). 164 | # In case of more than single updater thread, there might momentarily be more than a single item. 165 | # However, this is not currently used in this way, and it seems that having multiple items would not 166 | # have any detrimental effect (i.e. break correctness) if it actually occurs in other edge cases. 167 | # TODO backlog to ensure a single item always, consider a lock here and re-test empty() within that lock. 
168 | self._update_queue.put(object()) 169 | 170 | def wait(self, timeout=None): 171 | assert timeout is None or timeout > 0 172 | try: 173 | should_block = True 174 | if self._max_wait: 175 | remaining = self.wait_time_remaining 176 | if remaining == 0: 177 | # No more blocking wait - immediately return what's in the queue (or None) 178 | should_block = False 179 | timeout = None 180 | elif timeout: 181 | timeout = min(timeout, remaining) 182 | else: 183 | timeout = remaining 184 | 185 | self._update_queue.get(block=should_block, timeout=timeout) 186 | return True 187 | except Empty: 188 | return False 189 | -------------------------------------------------------------------------------- /frocket/common/tasks/query.py: -------------------------------------------------------------------------------- 1 | """ 2 | Query job's task classes 3 | """ 4 | # Copyright 2021 The Funnel Rocket Maintainers 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from dataclasses import dataclass 19 | from enum import auto 20 | from typing import Optional, List, Dict, Union, cast 21 | import inflection 22 | from frocket.common.dataset import DatasetInfo, DatasetPartId 23 | from frocket.common.serializable import AutoNamedEnum, enveloped, SerializableDataClass, reducable 24 | from frocket.common.tasks.base import BaseTaskRequest, BaseTaskResult, BaseJobResult 25 | 26 | 27 | class PartSelectionMode(AutoNamedEnum): 28 | """Whether the invoker sets the task_index or the worker selects it from available tasks in the datastore.""" 29 | SET_BY_INVOKER = auto() 30 | SELECTED_BY_WORKER = auto() 31 | 32 | 33 | @enveloped 34 | @dataclass(frozen=True) 35 | class QueryTaskRequest(BaseTaskRequest): 36 | dataset: DatasetInfo 37 | # String columns to load as Pandas categoricals, as performance optimization. These columns are detected during 38 | # dataset registration. Not needed for columns already of categorical type in files saved by Pandas. 39 | load_as_categoricals: Optional[List[str]] 40 | mode: PartSelectionMode 41 | # If (and only if) mode=SET_BY_INVOKER, the invoker also sets the dataset part index to query 42 | # Note that task_index not necessarily equals part ID 43 | invoker_set_part: Optional[DatasetPartId] 44 | used_columns: List[str] # Which columns to actually load (as optimization), as analyzed by QueryValidator. 
45 | query: dict 46 | 47 | 48 | class AggregationType(AutoNamedEnum): 49 | # noinspection PyUnusedLocal 50 | def __init__(self, *args): 51 | if not hasattr(self.__class__, '_camels'): 52 | self.__class__._camels = {} 53 | 54 | self.camelized = inflection.camelize(self.name.lower(), uppercase_first_letter=False) 55 | self.__class__._camels[self.camelized] = self 56 | self.value_is_dict = self.name.endswith("_PER_VALUE") 57 | 58 | COUNT = auto() 59 | COUNT_PER_VALUE = auto() 60 | GROUPS_PER_VALUE = auto() 61 | SUM_PER_VALUE = auto() 62 | MEAN_PER_VALUE = auto() 63 | 64 | @classmethod 65 | def from_camelcase(cls, camelcase_name: str) -> AutoNamedEnum: 66 | return cls._camels[camelcase_name] 67 | 68 | 69 | AggrValue = Union[int, float] 70 | AggrValueMap = Dict[str, AggrValue] 71 | 72 | 73 | @reducable 74 | @dataclass(frozen=True) 75 | class AggregationResult(SerializableDataClass): 76 | column: str 77 | type: str 78 | # For some aggregation types ('count') the value is a single number. In others (the 'perValue' ones), value is 79 | # a dict of column value->aggregated number 80 | value: Optional[Union[AggrValue, AggrValueMap]] 81 | top: Optional[int] # Relevant for values of type dict 82 | name: Optional[str] # Only set if the user has set a custom name for this aggregation 83 | 84 | @classmethod 85 | def _reduce_fields(cls, serializables): 86 | """See: SerializableDataClass.""" 87 | all_values = [e.value for e in cast(List[AggregationResult], serializables)] 88 | # Reduce either a primitive values or a dicts of counters 89 | if isinstance(all_values[0], dict): 90 | reduced_value = cls.reduce_counter_dicts(all_values, top_count=cast(cls, serializables[0]).top) 91 | else: 92 | reduced_value = sum(all_values) 93 | return {'value': reduced_value} 94 | 95 | 96 | @reducable 97 | @dataclass(frozen=True) 98 | class QueryConditionsResult(SerializableDataClass): 99 | matching_groups: int # e.g. 
user ID 100 | matching_group_rows: int # All rows of the matching groups, whether that row matches a condition or not 101 | aggregations: Optional[List[AggregationResult]] 102 | 103 | @classmethod 104 | def _reduce_fields(cls, serializables): 105 | results = cast(List[cls], serializables) 106 | return {'matching_groups': sum([e.matching_groups for e in results]), 107 | 'matching_group_rows': sum([e.matching_group_rows for e in results]), 108 | 'aggregations': cls.reduce_lists([e.aggregations for e in results])} 109 | 110 | 111 | @reducable 112 | @dataclass(frozen=True) 113 | class FunnelResult(SerializableDataClass): 114 | sequence: List[QueryConditionsResult] 115 | end_aggregations: Optional[List[AggregationResult]] 116 | 117 | @classmethod 118 | def _reduce_fields(cls, serializables): 119 | funnel_results = cast(List[cls], serializables) 120 | return {'sequence': cls.reduce_lists([e.sequence for e in funnel_results]), 121 | 'end_aggregations': cls.reduce_lists([e.end_aggregations for e in funnel_results])} 122 | 123 | 124 | @reducable 125 | @dataclass(frozen=True) 126 | class QueryResult(SerializableDataClass): 127 | query: QueryConditionsResult 128 | funnel: Optional[FunnelResult] 129 | 130 | @classmethod 131 | def _reduce_fields(cls, serializables): 132 | query_results = cast(List[cls], serializables) 133 | return {'query': QueryConditionsResult.reduce([e.query for e in query_results]), 134 | 'funnel': FunnelResult.reduce([e.funnel for e in query_results])} 135 | 136 | 137 | @enveloped 138 | @dataclass(frozen=True) 139 | class QueryTaskResult(BaseTaskResult): 140 | query_result: Optional[QueryResult] # Not set if query failed (when success=False) 141 | 142 | 143 | @dataclass(frozen=True) 144 | class QueryJobResult(BaseJobResult): 145 | query: Optional[QueryConditionsResult] 146 | funnel: Optional[FunnelResult] 147 | -------------------------------------------------------------------------------- /frocket/common/tasks/registration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Task request/response classes for the registration job (discovering, validating and storing metadata for a dataset) 3 | """ 4 | # Copyright 2021 The Funnel Rocket Maintainers 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from dataclasses import dataclass 19 | from enum import auto 20 | from typing import Optional 21 | from frocket.common.dataset import DatasetInfo, DatasetPartId, DatasetSchema 22 | from frocket.common.serializable import SerializableDataClass, AutoNamedEnum, enveloped 23 | from frocket.common.tasks.base import BaseTaskRequest, BaseTaskResult, BlobId, BaseJobResult, BaseApiResult 24 | 25 | 26 | class DatasetValidationMode(AutoNamedEnum): 27 | SINGLE = auto() # Only validate a single file in the dataset (meaning no cross-file consistency checks are done!) 
28 | FIRST_LAST = auto() # Validate only first and last files (by lexicographic sorting) and cross-check them 29 | SAMPLE = auto() # Takes a sample of files, proportional to the no.o of files and up to a configured maximum. 30 | 31 | 32 | REGISTER_DEFAULT_FILENAME_PATTERN = '*.parquet' # Ignore files such as '_SUCCESS' and the like in discovery 33 | REGISTER_DEFAULT_VALIDATION_MODE = DatasetValidationMode.SAMPLE 34 | REGISTER_DEFAULT_VALIDATE_UNIQUES = True 35 | 36 | 37 | @dataclass(frozen=True) 38 | class RegisterArgs(SerializableDataClass): 39 | """Parameters collected by the CLI / API server for the registration job""" 40 | name: str 41 | basepath: str 42 | group_id_column: str 43 | timestamp_column: str 44 | pattern: str = REGISTER_DEFAULT_FILENAME_PATTERN 45 | validation_mode: DatasetValidationMode = REGISTER_DEFAULT_VALIDATION_MODE 46 | validate_uniques: bool = REGISTER_DEFAULT_VALIDATE_UNIQUES 47 | 48 | 49 | @enveloped 50 | @dataclass(frozen=True) 51 | class RegistrationTaskRequest(BaseTaskRequest): 52 | dataset: DatasetInfo 53 | part_id: DatasetPartId 54 | # If RegisterArgs.validate_uniques=true, task should return all group IDs in file 55 | return_group_ids: bool 56 | 57 | 58 | @enveloped 59 | @dataclass(frozen=True) 60 | class RegistrationTaskResult(BaseTaskResult): 61 | dataset_schema: Optional[DatasetSchema] # None on failures 62 | part_id: DatasetPartId 63 | # If RegistrationTaskRequest.return_group_ids=true, a reference to the blob with the group IDs 64 | group_ids_blob_id: Optional[BlobId] 65 | 66 | 67 | @dataclass(frozen=True) 68 | class RegistrationJobResult(BaseJobResult): 69 | dataset: DatasetInfo 70 | 71 | 72 | @dataclass(frozen=True) 73 | class UnregisterApiResult(BaseApiResult): 74 | dataset_found: bool 75 | dataset_last_used: Optional[float] 76 | -------------------------------------------------------------------------------- /frocket/common/validation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/common/validation/consts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Consts and types for the query validation package 3 | TODO backlog create a nice enum for all query keywords 4 | """ 5 | # Copyright 2021 The Funnel Rocket Maintainers 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | import json 20 | import os 21 | import re 22 | from pathlib import Path 23 | from typing import Dict, NamedTuple 24 | from frocket.common.dataset import DatasetColumnType 25 | from frocket.common.validation.path_visitor import PathVisitor 26 | 27 | # JSON Schema file 28 | QUERY_SCHEMA_LOCATION = Path(os.path.dirname(__file__)) / '../../resources/query_schema.json' 29 | QUERY_SCHEMA = json.load(open(QUERY_SCHEMA_LOCATION, 'r')) 30 | 31 | TARGET_TYPES_WITH_INCLUDE_ZERO = ['count'] 32 | TARGET_OPS_SUPPORTING_INCLUDE_ZERO = ['<', '<=', '==', '!=', '>='] 33 | TARGET_TYPES_WITH_OTHER_COLUMN = ['sum'] 34 | AGGR_TYPES_WITH_OTHER_COLUMN = ['sumPerValue', 'meanPerValue'] 35 | DEFAULT_TARGET = {'type': 'count', 'op': '>=', 'value': 1} 36 | DEFAULT_AGGREGATIONS = ['count', 'countPerValue', 'groupsPerValue'] 37 | AGGREGATIONS_PATHS = ['query.aggregations', 38 | 'funnel.stepAggregations', 39 | 'funnel.endAggregations'] 40 | SINGLE_FILTER_PATHS = ['query.conditions.filter', 41 | 'query.conditions.sequence.filter', 42 | 'funnel.sequence.filter'] 43 | FILTER_ARRAY_PATHS = ['query.conditions.filters', 44 | 'query.conditions.sequence.filters', 45 | 'funnel.sequence.filters'] 46 | 47 | VALID_IDENTIFIER_PATTERN = re.compile(r'[A-Z][A-Z_0-9]*$', re.IGNORECASE) 48 | UNIQUE_IDENTIFIER_SCOPES = ['query.conditions.name'] + \ 49 | [f"{path}.name" for path in AGGREGATIONS_PATHS] 50 | 51 | EQUALITY_OPERATORS = ['==', '!='] 52 | NUMERIC_OPERATORS = [*EQUALITY_OPERATORS, '>', '>=', '<', '<='] 53 | STRING_OPERATORS = [*EQUALITY_OPERATORS, 'contains', 'regex'] 54 | OPERATORS_BY_COLTYPE = { 55 | DatasetColumnType.INT: NUMERIC_OPERATORS, 56 | DatasetColumnType.FLOAT: NUMERIC_OPERATORS, 57 | DatasetColumnType.BOOL: EQUALITY_OPERATORS, 58 | DatasetColumnType.STRING: STRING_OPERATORS 59 | } 60 | VALUE_TYPES_BY_COLTYPE = { 61 | DatasetColumnType.INT: [int], 62 | DatasetColumnType.FLOAT: [int, float], 63 | DatasetColumnType.BOOL: [bool], 64 | DatasetColumnType.STRING: [str] 65 | } 66 | NUMERIC_COLTYPES = [DatasetColumnType.INT, DatasetColumnType.FLOAT] 67 | 68 | RELATION_OPS = ['and', 'or', '||', '&&'] 69 | DEFAULT_RELATION_OP = 'and' 70 | CONDITION_COLUMN_PREFIX = "__cond_" 71 | 72 | 73 | class QueryConditionsMap(NamedTuple): 74 | count: int 75 | names: Dict[str, int] 76 | 77 | 78 | def map_condition_names(query: dict) -> QueryConditionsMap: 79 | """Map named conditions (which is optional) to the condition ID (index in conditions list).""" 80 | conditions = PathVisitor(query, 'query.conditions').list() 81 | names = {cond['name'].strip().lower(): i 82 | for i, cond in enumerate(conditions) if 'name' in cond} 83 | return QueryConditionsMap(count=len(conditions), names=names) 84 | -------------------------------------------------------------------------------- /frocket/common/validation/error.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from enum import auto 16 | from frocket.common.serializable import AutoNamedEnum 17 | 18 | 19 | class ValidationErrorKind(AutoNamedEnum): 20 | """Distinguish between types of validation issues in query""" 21 | INVALID_ARGUMENTS = auto() # Validator given wrong arguments 22 | SCHEMA = auto() # Failure at JSON Schema level 23 | TYPE_MISMATCH = auto() # Operator or value type don't match each other, or the context 24 | DATASET_MISMATCH = auto() # Column names, types, etc. do not match the schema of the given dataset 25 | RELATION = auto() # query.relation expression found invalid by relation_parser.py 26 | # Note for unexpected errors: unlike other kinds, the message associated with this kind may leak sensitive data 27 | # if it was returned to the caller - so it is not returned by the API server in PUBLIC mode. 28 | UNEXPECTED = auto() 29 | 30 | 31 | class QueryValidationError(Exception): 32 | def __init__(self, message: str, kind: ValidationErrorKind = None): 33 | self.message = message 34 | self.kind = kind or ValidationErrorKind.UNEXPECTED # Default, but should be rare. 35 | 36 | @staticmethod 37 | def wrap(e: Exception, kind: ValidationErrorKind = None): 38 | return QueryValidationError(str(e), kind) 39 | 40 | def __str__(self): 41 | return f"ValidationError({self.kind.value}: {self.message})" 42 | -------------------------------------------------------------------------------- /frocket/common/validation/path_visitor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Callable, Any, Optional 16 | 17 | PathVisitorCallback = Callable[[Any], Optional[Any]] 18 | 19 | 20 | class PathVisitor: 21 | """ 22 | A helper class for safely fetching nested attributes in a dictionary. 23 | It is used extensively by the QueryValidator to extract and transform nested attributes. 24 | 25 | The class is instantiated with a root dict and a dot-delimited string path (e.g. 'attr.sub_attr.sub_sub'). 26 | Then, visit() can be called once (or more) to run code over the matching value/s, if any. If the key is not found, 27 | no error is thrown. list() is a convenience method which visits the elements and returns them as a list, 28 | returning an empty list on no matches. 29 | 30 | By default, if the leaf key is a list, the visitor function is called for each element. 31 | However, if the list itself is what you need, pass list_to_items=False on init. 32 | 33 | Modifying attributes *below* the visited value is safe (be it a dict, a list, an object), however sometimes you 34 | may want to replace the whole value itself being itereated. For example, the QueryValidator replaces shorthand- 35 | notation objects, which are lists, into full-notation dicts. 
36 | To support that, init the object with modifiable=true and return the replacement value from the visitor function, 37 | or None to keep the value. 38 | 39 | For usage examples, see test_path_visitor.py. 40 | """ 41 | _KEY_NOT_FOUND = object() 42 | 43 | def __init__(self, root: dict, path: str, modifiable: bool = False, list_to_items: bool = True): 44 | assert (isinstance(root, dict)) 45 | self._root = root 46 | self._paths = path.strip().split(".") 47 | self._modifiable = modifiable 48 | self._list_to_items = list_to_items 49 | 50 | def visit(self, func: PathVisitorCallback): 51 | if len(self._paths) > 0: 52 | self._visit_dict(self._root, 0, func) 53 | 54 | def list(self) -> list: 55 | result = [] 56 | self.visit(lambda v: result.append(v)) 57 | return result 58 | 59 | def _visit_dict(self, d: dict, depth: int, func: PathVisitorCallback): 60 | v = d.get(self._paths[depth], self._KEY_NOT_FOUND) # Differentiate a None value from an inexisting key 61 | if v == self._KEY_NOT_FOUND: 62 | return # Bumped into a wall 63 | 64 | if isinstance(v, list) and self._list_to_items: 65 | self._visit_list(v, depth + 1, func) 66 | return 67 | 68 | if depth == len(self._paths) - 1: 69 | replacement = func(v) # Includes None 70 | if self._modifiable and replacement: 71 | d[self._paths[depth]] = replacement 72 | else: 73 | if not v: 74 | return 75 | elif isinstance(v, dict): 76 | self._visit_dict(v, depth + 1, func) 77 | elif isinstance(v, list): 78 | self._visit_list(v, depth + 1, func) 79 | else: 80 | return # Can't go further 81 | 82 | def _visit_list(self, lst: list, depth: int, func: PathVisitorCallback): 83 | if depth == len(self._paths): 84 | assert self._list_to_items 85 | for i, elem in enumerate(lst): 86 | replacement = func(elem) 87 | if self._modifiable and replacement: 88 | lst[i] = replacement 89 | else: 90 | for i, elem in enumerate(lst): 91 | # Note: depth is not incremented in this case, since elements are at the same 'path depth' as the list 92 | if isinstance(elem, dict): 93 | self._visit_dict(elem, depth, func) 94 | elif isinstance(elem, list): 95 | self._visit_list(elem, depth, func) 96 | -------------------------------------------------------------------------------- /frocket/common/validation/relation_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | While the query schema is generally JSON-based (good for machines) rather then textual (like SQL, 3 | supposedly human-friendly or at least more concise), there's one exception: an optional 'relation' expression allowing 4 | to specify arbitrarily complex and/or relations between conditions, rather than just and/or over all. 5 | 6 | The RelationParser class validates and breaks down the expression to a list of elements. However, it does not transform 7 | them back into a Pandas query or similar - that is the query engine's responsibility and may change independently. 8 | 9 | Note that conditions may be represented either by index ($0, $3, etc.) or by name - for named conditions. 10 | """ 11 | # Copyright 2021 The Funnel Rocket Maintainers 12 | # 13 | # Licensed under the Apache License, Version 2.0 (the "License"); 14 | # you may not use this file except in compliance with the License. 
15 | # You may obtain a copy of the License at 16 | # 17 | # http://www.apache.org/licenses/LICENSE-2.0 18 | # 19 | # Unless required by applicable law or agreed to in writing, software 20 | # distributed under the License is distributed on an "AS IS" BASIS, 21 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 | # See the License for the specific language governing permissions and 23 | # limitations under the License. 24 | 25 | import logging 26 | from typing import Type, List, Optional 27 | from parsimonious.grammar import Grammar, NodeVisitor 28 | from parsimonious.nodes import Node 29 | from dataclasses import dataclass 30 | from parsimonious.exceptions import ParseError, VisitationError 31 | from abc import ABCMeta 32 | from frocket.common.validation.consts import RELATION_OPS, map_condition_names, CONDITION_COLUMN_PREFIX 33 | from frocket.common.validation.path_visitor import PathVisitor 34 | from frocket.common.validation.error import ValidationErrorKind, QueryValidationError 35 | from frocket.common.tasks.base import ErrorMessage 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | # TODO backlog fix the grammar to require whitespace between conditions and wordy-operators (and,or), 40 | # but not around symbol ops (&&, ||) 41 | # TODO backlog fix "DeprecationWarning: invalid escape sequence \$" 42 | RELATION_EXPRESSION_GRAMMAR = Grammar( 43 | """ 44 | expression = (identifier / (open_paren ws? expression ws? close_paren)) (ws? op ws? expression)* 45 | identifier = condition_name / condition_id 46 | condition_name = ~r"\$[A-Z][A-Z_0-9]*"i 47 | condition_id = ~r"\$[0-9]+" 48 | op = "and" / "or" / "&&" / "||" 49 | ws = ~r"\s*" 50 | open_paren = "(" 51 | close_paren = ")" 52 | """) 53 | 54 | 55 | @dataclass(frozen=True) 56 | class RelationParserContext: 57 | condition_count: int 58 | named_conditions: dict 59 | column_prefix: str 60 | 61 | 62 | @dataclass 63 | class RBaseElement(metaclass=ABCMeta): 64 | text: str 65 | ctx: RelationParserContext 66 | condition_id: Optional[int] = None 67 | 68 | def validate(self) -> Optional[ErrorMessage]: 69 | pass 70 | 71 | def __str__(self): 72 | return f"{self.__class__.__name__}('{self.text}')" 73 | 74 | 75 | @dataclass 76 | class RTextElement(RBaseElement): 77 | pass 78 | 79 | 80 | @dataclass 81 | class RConditionBaseElement(RBaseElement): 82 | pass 83 | 84 | 85 | @dataclass 86 | class RConditionId(RConditionBaseElement): 87 | def validate(self): 88 | cid = int(self.text[1:]) 89 | if cid >= self.ctx.condition_count: 90 | return f"Condition no. {cid} does not exist" 91 | self.condition_id = cid 92 | 93 | 94 | @dataclass 95 | class RConditionName(RConditionBaseElement): 96 | def validate(self): 97 | cname = self.text[1:] 98 | cid = self.ctx.named_conditions.get(cname, None) 99 | if cid is not None: # Can be zero 100 | self.condition_id = cid 101 | else: 102 | return f"Condition named {self.text[1:]} does not exist" 103 | 104 | 105 | @dataclass 106 | class ROperator(RBaseElement): 107 | def validate(self): 108 | if self.text not in RELATION_OPS: 109 | return f"Operator {self.text} not in {RELATION_OPS}" 110 | 111 | 112 | # noinspection PyMethodMayBeStatic,PyUnusedLocal 113 | @dataclass 114 | class RelationExpressionVisitor(NodeVisitor): 115 | """ 116 | Used by the RelationParser to build the element list. 117 | Note that while the grammar is hierarchical, the resulting list isn't (no need, currently). 
118 | """ 119 | ctx: RelationParserContext 120 | 121 | def _build_element(self, node: Node, cls: Type[RBaseElement]): 122 | # noinspection PyArgumentList 123 | return cls(node.text, self.ctx) 124 | 125 | def visit_ws(self, node: Node, visited_children): 126 | return None # Ignore whitespaces 127 | 128 | def visit_op(self, node: Node, visited_children): 129 | return self._build_element(node, ROperator) 130 | 131 | def visit_open_paren(self, node: Node, visited_children): 132 | return self._build_element(node, RTextElement) 133 | 134 | def visit_close_paren(self, node: Node, visited_children): 135 | return self._build_element(node, RTextElement) 136 | 137 | def visit_identifier(self, node: Node, visited_children): 138 | """Return the actual condition name / ID element (see grammar: identifier wraps conditions).""" 139 | return visited_children[0] 140 | 141 | def visit_condition_name(self, node: Node, visited_children): 142 | return self._build_element(node, RConditionName) 143 | 144 | def visit_condition_id(self, node: Node, visited_children): 145 | return self._build_element(node, RConditionId) 146 | 147 | def generic_visit(self, node: Node, visited_children): 148 | """Ignore current node, but return children (if any) as a flat list.""" 149 | flat_result = [] 150 | for child in visited_children: 151 | if type(child) is list: 152 | flat_result += child # Unpack child array 153 | elif child: 154 | flat_result.append(child) 155 | return flat_result if len(flat_result) > 0 else None 156 | 157 | 158 | class RelationParser: 159 | def __init__(self, query: dict): 160 | self._query = query 161 | self._condition_mapping = map_condition_names(query) 162 | self._used_conditions = None 163 | 164 | found_relations = PathVisitor(self._query, 'query.relation').list() 165 | assert len(found_relations) in [0, 1] 166 | self._relation = found_relations[0].strip().lower() if found_relations else None 167 | 168 | def parse(self) -> List[RBaseElement]: 169 | if not self._relation: 170 | return [] 171 | 172 | ctx = RelationParserContext(condition_count=self._condition_mapping.count, 173 | named_conditions=self._condition_mapping.names, 174 | column_prefix=CONDITION_COLUMN_PREFIX) 175 | try: 176 | tree = RELATION_EXPRESSION_GRAMMAR.parse(self._relation) 177 | except ParseError as pe: 178 | # Adopted from within the ParseError class, but without the sometimes-confusing issue 179 | excerpt = pe.text[pe.pos:pe.pos + 20] if (pe.text and pe.pos is not None) else None 180 | if excerpt: 181 | message = f"Query relation is invalid around '{excerpt}' " 182 | else: 183 | message = f"Query relation '{self._relation}' is invalid" 184 | raise QueryValidationError(message, kind=ValidationErrorKind.RELATION) 185 | 186 | try: 187 | elements = RelationExpressionVisitor(ctx).visit(tree) 188 | except VisitationError as ve: 189 | logger.exception('Unexpected error while visiting parse tree') 190 | raise QueryValidationError(message=str(ve), kind=ValidationErrorKind.UNEXPECTED) 191 | 192 | for e in elements: 193 | error_message = e.validate() 194 | if error_message: 195 | raise QueryValidationError(message=error_message, kind=ValidationErrorKind.RELATION) 196 | 197 | self._used_conditions = [e.condition_id for e in elements if e.condition_id is not None] 198 | return elements 199 | 200 | @property 201 | def used_conditions(self) -> List[str]: 202 | return self._used_conditions 203 | -------------------------------------------------------------------------------- /frocket/common/validation/result.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | from typing import Optional, List, cast, Dict 17 | from frocket.common.serializable import SerializableDataClass 18 | from frocket.common.validation.error import ValidationErrorKind, QueryValidationError 19 | from frocket.common.validation.relation_parser import RBaseElement 20 | 21 | 22 | @dataclass(frozen=True) 23 | class QueryValidationResult(SerializableDataClass): 24 | success: bool 25 | source_query: dict 26 | error_message: Optional[str] = None 27 | error_kind: Optional[ValidationErrorKind] = None 28 | expanded_query: Optional[dict] = None 29 | # TODO backlog support non-critical warning/hints to user (e.g. conditions unused by relation expression) 30 | warnings: Optional[List[str]] = None 31 | used_columns: Optional[List[str]] = None 32 | used_conditions: Optional[List[str]] = None 33 | named_conditions: Optional[Dict[str, int]] = None 34 | relation_elements: Optional[List[RBaseElement]] = None 35 | 36 | @staticmethod 37 | def from_exception(e: Exception, source_query: dict): 38 | if type(e) is QueryValidationError: 39 | error_kind = cast(QueryValidationError, e).kind 40 | else: 41 | error_kind = ValidationErrorKind.UNEXPECTED 42 | return QueryValidationResult(success=False, error_message=str(e), error_kind=error_kind, 43 | source_query=source_query) 44 | -------------------------------------------------------------------------------- /frocket/common/validation/visitor_functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | A collection of callback functions which the QueryValidator uses to extarct, validate and transform query elements, 3 | with the kind help of PathVisitor class. 4 | 5 | Functions which return a value are used to replace the given object with a different one, 6 | which is handled by PathVisitor in its 'modifiable' mode. 7 | 8 | Since callbacks are regular functions (not methods), and there's a bunch of them, they're in a separate file from 9 | the QueryValidator class. 10 | 11 | asserts are used where processing elements which should be already validated (so failures should be bugs). 12 | """ 13 | # Copyright 2021 The Funnel Rocket Maintainers 14 | # 15 | # Licensed under the Apache License, Version 2.0 (the "License"); 16 | # you may not use this file except in compliance with the License. 17 | # You may obtain a copy of the License at 18 | # 19 | # http://www.apache.org/licenses/LICENSE-2.0 20 | # 21 | # Unless required by applicable law or agreed to in writing, software 22 | # distributed under the License is distributed on an "AS IS" BASIS, 23 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 24 | # See the License for the specific language governing permissions and 25 | # limitations under the License. 
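# Editor's note: a small illustration (made-up column names and values) of the
# shorthand-to-verbose conversions performed by _to_verbose_filter() and
# _to_verbose_target() below:
#
#   filter  ["price", ">=", 100]          ->  {"column": "price", "op": ">=", "value": 100}
#   target  ["count", ">=", 2]            ->  {"type": "count", "op": ">=", "value": 2}
#   target  ["sum", "price", ">", 500]    ->  {"type": "sum", "column": "price", "op": ">", "value": 500}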
26 | 27 | from typing import Optional 28 | from frocket.common.validation.consts import DEFAULT_TARGET, AGGR_TYPES_WITH_OTHER_COLUMN, \ 29 | DEFAULT_AGGREGATIONS, TARGET_TYPES_WITH_INCLUDE_ZERO, TARGET_OPS_SUPPORTING_INCLUDE_ZERO 30 | from frocket.common.validation.error import ValidationErrorKind, QueryValidationError 31 | 32 | 33 | def _to_verbose_filter(fltr) -> Optional[dict]: 34 | """If a condition filter is in short-hand notation (list), convert to verbose notation.""" 35 | assert type(fltr) in [list, dict] 36 | if type(fltr) is list: 37 | assert len(fltr) == 3 38 | return {'column': fltr[0], 'op': fltr[1], 'value': fltr[2]} 39 | 40 | 41 | def _to_verbose_target(target) -> Optional[dict]: 42 | """If a condition target is in short-hand notation (list), convert to verbose notation.""" 43 | assert type(target) in [list, dict] 44 | if type(target) is list: 45 | assert len(target) in [3, 4] 46 | if len(target) == 3: 47 | return {'type': target[0], 'op': target[1], 'value': target[2]} 48 | elif len(target) == 4: 49 | return {'type': target[0], 'column': target[1], 'op': target[2], 'value': target[3]} 50 | 51 | 52 | def _add_default_target(cond: dict) -> None: 53 | assert type(cond) is dict 54 | # (Modification is done on a key under the given object, so no need to return a modified dict) 55 | if ('filter' in cond or 'filters' in cond) and 'target' not in cond: # Don't touch sequence conditions 56 | cond['target'] = DEFAULT_TARGET 57 | 58 | 59 | def _validate_aggregation(aggr: dict) -> None: 60 | assert type(aggr) is dict 61 | aggr_type = aggr.get('type', None) 62 | other_column_required = aggr_type in AGGR_TYPES_WITH_OTHER_COLUMN 63 | other_column_found = 'otherColumn' in aggr 64 | 65 | if other_column_required != other_column_found: 66 | message = f"For aggregation {aggr} with type '{aggr_type}', other column name is " 67 | if other_column_required: 68 | message += 'required but was not found' 69 | else: 70 | message += 'not relevant but was given' 71 | raise QueryValidationError(message, kind=ValidationErrorKind.SCHEMA) 72 | 73 | 74 | def _expand_aggregations(col_aggregations: list) -> Optional[list]: 75 | assert type(col_aggregations) is list 76 | result = [] 77 | for aggr in col_aggregations: 78 | if aggr.get('type', None): 79 | result.append(aggr) 80 | else: 81 | if 'name' in aggr: 82 | message = f"Aggregation {aggr} expands into multiple default aggregations, " \ 83 | f"and thus a name attributeis not supported" 84 | raise QueryValidationError(message, kind=ValidationErrorKind.SCHEMA) 85 | for added_type in DEFAULT_AGGREGATIONS: 86 | result.append({**aggr, 'type': added_type}) 87 | 88 | return result 89 | 90 | 91 | def _validate_or_set_include_zero(cond: dict) -> None: 92 | """ 93 | 'includeZero' attribute of conditions may be tricky to get right. 94 | This function validates that its usage makes sense, and sets the correct default where it's ommitted. 
95 | """ 96 | assert type(cond) is dict 97 | if not ('filter' in cond or 'filters' in cond): 98 | return # Skip sequence condition (and possibly other future types without a target) 99 | 100 | # This should run after _to_verbose_target() and _add_default_target() have already ran, ensuring target exists 101 | target_type = cond['target']['type'] 102 | target_op = cond['target']['op'] 103 | target_value = cond['target']['value'] 104 | include_zero_value = cond.get('includeZero', None) 105 | target_as_string = f"{target_type} {target_op} {target_value}" 106 | 107 | if target_type not in TARGET_TYPES_WITH_INCLUDE_ZERO: 108 | if include_zero_value: # Exists and set to True 109 | raise QueryValidationError( 110 | message=f"'includeZero' is not applicable for target type '{target_type}'. In condition: {cond}", 111 | kind=ValidationErrorKind.TYPE_MISMATCH) 112 | else: 113 | assert type(target_value) is int 114 | assert target_value >= 0 115 | 116 | if include_zero_value: # Exists and set to True 117 | # Operator never relevant for includeZero=True 118 | if target_op not in TARGET_OPS_SUPPORTING_INCLUDE_ZERO: 119 | raise QueryValidationError( 120 | message=f"For target operator '{target_op}', 'includeZero' cannot be true. In condition: {cond}", 121 | kind=ValidationErrorKind.TYPE_MISMATCH) 122 | 123 | # Additional check when an operator is *potentially* relevant for includeZero=True 124 | if target_op == '<' and target_value == 0: 125 | raise QueryValidationError( 126 | message=f"Target implies a negative value. In condition: {cond}", 127 | kind=ValidationErrorKind.TYPE_MISMATCH) 128 | 129 | if (target_op == '!=' and target_value == 0) or \ 130 | (target_op in ['==', '>='] and target_value != 0): 131 | message = f"Target {target_as_string} explicitly precludes zero, and thus 'includeZero' " \ 132 | f"cannot be true. In condition: {cond}" 133 | raise QueryValidationError(message, kind=ValidationErrorKind.TYPE_MISMATCH) 134 | else: 135 | if target_op == '==' and target_value == 0: 136 | if include_zero_value is None: 137 | # Explicitly set includeZero when target is count == 0 138 | # Note: modifying a key under the given object, so no need to return a modified dict 139 | cond['includeZero'] = True 140 | elif not include_zero_value: 141 | message = f"When using a target of {target_as_string}, 'includeZero' cannot be false. " \ 142 | f"Condition: {cond}" 143 | raise QueryValidationError(message, kind=ValidationErrorKind.TYPE_MISMATCH) 144 | -------------------------------------------------------------------------------- /frocket/datastore/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | -------------------------------------------------------------------------------- /frocket/datastore/blobstore.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import abstractmethod 16 | from typing import Optional 17 | from frocket.common.config import config 18 | from frocket.common.tasks.base import BlobId 19 | 20 | BLOB_DEFAULT_TTL = config.int('blobstore.default.ttl') 21 | BLOB_MAX_TTL = config.int('blobstore.max.ttl') 22 | 23 | 24 | class Blobstore: 25 | """Simple interface for storing and fetching arbitrary binary data, for ephemeral transport over the network. 26 | The data is assumed to always have a default TTL - it's not a permanent or big data store.""" 27 | @abstractmethod 28 | def write_blob(self, data: bytes, ttl: int = None, tag: str = None) -> BlobId: 29 | pass 30 | 31 | @abstractmethod 32 | def read_blob(self, blobid: BlobId) -> Optional[bytes]: 33 | pass 34 | 35 | @abstractmethod 36 | def delete_blob(self, blobid: BlobId) -> bool: 37 | pass 38 | -------------------------------------------------------------------------------- /frocket/datastore/datastore.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
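# Editor's note: a rough sketch of an invoker-side flow over the Datastore interface
# defined below (illustration only; `store` would typically come from
# registered_datastores.get_datastore(), and `reqid`/`requests` are assumed inputs):
#
#   store.write_dataset_info(dataset, parts, schema)   # on dataset registration
#   store.enqueue(requests)                            # work_queue invoker: push task requests
#   statuses = store.tasks_status(reqid)               # poll per-attempt status updates
#   results = store.task_results(reqid)                # collect BaseTaskResult objects when done
#   store.cleanup_request_data(reqid)                  # drop the job's transient keys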
14 | 15 | from abc import abstractmethod, ABCMeta 16 | from dataclasses import dataclass 17 | from typing import List, Dict, Set, Optional, Union 18 | from frocket.common.tasks.base import TaskStatus, BaseTaskResult, TaskAttemptId, TaskStatusUpdate, BaseTaskRequest 19 | from frocket.common.dataset import DatasetInfo, DatasetPartsInfo, DatasetPartId, DatasetShortSchema, DatasetSchema 20 | from frocket.common.serializable import SerializableDataClass 21 | 22 | DEFAULT_QUEUE = 'default' 23 | DEFAULT_DEQUEUE_WAIT_TIME = 60 24 | 25 | 26 | # Used in PartSelectionMode.SELECTED_BY_WORKER 27 | @dataclass(frozen=True) 28 | class WorkerSelectedPart(SerializableDataClass): 29 | part_id: DatasetPartId 30 | random: bool 31 | task_attempt_no: int 32 | 33 | 34 | class Datastore(metaclass=ABCMeta): 35 | """ 36 | Interface to the data store, which holds: 37 | 38 | * The list, metadata and schema of all registered datasets 39 | * For running jobs: 40 | * Task statuses and results 41 | * Atomic attempt counter for retried tasks 42 | * For jobs running in mode PartSelectionMode.SELECTED_BY_WORKER, the manifest of available tasks to select from. 43 | * When the system is configured to use the 'work_queue' invoker (rather than 'aws_lambda'), the datastore also 44 | provides the queue through which tasks are enqueued by the invoker and picked up by the workers, like a very 45 | simplistic queue management system. 46 | 47 | The datastore is not for storing the actual dataset or other persistent large data. 48 | """ 49 | @abstractmethod 50 | def write_dataset_info(self, dataset: DatasetInfo, parts: DatasetPartsInfo, schema: DatasetSchema) -> None: 51 | pass 52 | 53 | @abstractmethod 54 | def remove_dataset_info(self, name: str) -> bool: 55 | pass 56 | 57 | @abstractmethod 58 | def dataset_info(self, name: str) -> DatasetInfo: 59 | pass 60 | 61 | @abstractmethod 62 | def dataset_parts_info(self, ds: DatasetInfo) -> DatasetPartsInfo: 63 | pass 64 | 65 | @abstractmethod 66 | def schema(self, ds: DatasetInfo) -> DatasetSchema: 67 | pass 68 | 69 | @abstractmethod 70 | def short_schema(self, ds: DatasetInfo) -> DatasetShortSchema: 71 | pass 72 | 73 | @abstractmethod 74 | def last_used(self, ds: DatasetInfo) -> int: 75 | pass 76 | 77 | @abstractmethod 78 | def mark_used(self, ds: DatasetInfo): 79 | pass 80 | 81 | @abstractmethod 82 | def datasets(self) -> List[DatasetInfo]: 83 | pass 84 | 85 | @abstractmethod 86 | def enqueue(self, requests: List[BaseTaskRequest], queue: str = DEFAULT_QUEUE) -> None: 87 | pass 88 | 89 | @abstractmethod 90 | def dequeue(self, queue: str = DEFAULT_QUEUE, timeout: int = DEFAULT_DEQUEUE_WAIT_TIME) -> BaseTaskRequest: 91 | pass 92 | 93 | @abstractmethod 94 | def update_task_status(self, reqid: str, 95 | tasks: Union[TaskAttemptId, List[TaskAttemptId]], status: TaskStatus) -> None: 96 | pass 97 | 98 | @abstractmethod 99 | def tasks_status(self, reqid: str) -> Dict[TaskAttemptId, TaskStatusUpdate]: 100 | pass 101 | 102 | @abstractmethod 103 | def write_task_result(self, reqid: str, taskid: TaskAttemptId, result: BaseTaskResult) -> None: 104 | pass 105 | 106 | @abstractmethod 107 | def task_results(self, reqid: str) -> Dict[TaskAttemptId, BaseTaskResult]: 108 | pass 109 | 110 | @abstractmethod 111 | def increment_attempt(self, reqid: str, part_idx: int) -> int: 112 | pass 113 | 114 | @abstractmethod 115 | def publish_for_worker_selection(self, reqid: str, attempt_round: int, parts: Set[DatasetPartId]) -> None: 116 | pass 117 | 118 | @abstractmethod 119 | def self_select_part(self, reqid: str, 
attempt_round: int, 120 | candidates: Set[DatasetPartId] = None) -> Optional[WorkerSelectedPart]: 121 | pass 122 | 123 | @abstractmethod 124 | def cleanup_request_data(self, reqid: str) -> None: 125 | pass 126 | -------------------------------------------------------------------------------- /frocket/datastore/registered_datastores.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | from frocket.common.config import config 17 | from frocket.common.helpers.utils import memoize 18 | from frocket.datastore.datastore import Datastore 19 | from frocket.datastore.blobstore import Blobstore 20 | from frocket.datastore.redis_store import RedisStore 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | DATASTORE_CLASSES = { 25 | "redis": RedisStore, 26 | } 27 | 28 | BLOBSTORE_CLASSES = { 29 | "redis": RedisStore, 30 | } 31 | 32 | 33 | # TODO backlog consider thread-safety here: while RedisStore is thread-safe and having more than one is ok, future 34 | # implementations may not be? (or should be required to) 35 | def _get_store(store_kind: str, store_mapping: dict): 36 | store_class = store_mapping[config.get(store_kind).lower()] 37 | store = store_class(role=store_kind) 38 | logger.info(f"Initialized {store}") 39 | return store 40 | 41 | 42 | @memoize 43 | def get_datastore() -> Datastore: 44 | return _get_store("datastore", DATASTORE_CLASSES) 45 | 46 | 47 | @memoize 48 | def get_blobstore() -> Blobstore: 49 | return _get_store("blobstore", BLOBSTORE_CLASSES) 50 | -------------------------------------------------------------------------------- /frocket/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/engine/relation_to_pandas.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Dict, Type, Callable, List, cast 16 | from frocket.common.validation.relation_parser import RBaseElement, RTextElement, RConditionBaseElement, ROperator 17 | 18 | 19 | def relation_to_pandas_query(elements: List[RBaseElement], column_prefix: str) -> str: 20 | """Convert the generic parsed representation of a query.relation expression (as returned by QueryValidator or its 21 | helper class RelationParser) into a Pandas query string.""" 22 | 23 | # Mapping of generic element type to a lambda function constructing the Pandas equivalent. Note below that not 24 | # every concrete element type needs an entry here, as the code also looks up its superclasses 25 | etype_to_handler: Dict[Type[RBaseElement], Callable[[RBaseElement], str]] = { 26 | RTextElement: lambda v: v.text, 27 | RConditionBaseElement: lambda v: f"{column_prefix}{v.condition_id}", 28 | ROperator: lambda v: " & " if v.text in ["and", "&&"] else " | " 29 | } 30 | 31 | transformed = [] 32 | for e in elements: 33 | func = None 34 | # Either there's a handler above for this element type, or go up the superclass chain to find one. 35 | class_and_supers = cast(List[Type[RBaseElement]], type(e).mro()) 36 | for cls in class_and_supers: 37 | func = etype_to_handler.get(cls, None) 38 | if func: 39 | break 40 | if not func: 41 | raise Exception(f"{e} has no handler for any of its superclasses: {class_and_supers}") 42 | transformed.append(func(e)) 43 | return "".join(transformed) 44 | -------------------------------------------------------------------------------- /frocket/invoker/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/invoker/impl/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/invoker/impl/aws_lambda_invoker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Invoke tasks by invoking an AWS Lambda function asynchronously. 3 | 4 | This is a great feature of Lambda, which implicitly manages a queue of invocations for you with configurable retention 5 | (probably based on SQS). As long as the concurrent invocations limit of your account/burst limit of the AWS region are 6 | not reached, AWS will launch Lambdas for queued invocations immediately, with no meaningful delay. This also prevents 7 | getting rate-limited on momentary invocation spikes. 8 | 9 | A few important notes: 10 | 11 | 1. As noted in the setup guide, the retry count for the Lambda function *should be set to zero*, as it's the invoker's 12 | job to launch retries with slightly different arguments, based on its own configuration, with logic that is agnostic 13 | to whether the actual invoker is using Lambdas or anything else (which may not have Lambda's optional retry feature). 14 | 15 | 2. Unfortunately, there's no API for batch Lambda invocation, so we're invoking one by one with multiple threads - 16 | and still the time to invoke all tasks can add up to 1-2 seconds or more. 17 | TODO backlog optimize! This also hurts caching, as not all tasks get their fair chance to pick a locally cached part. 18 | 19 | 3. The InvokeAsync() Lambda API is considered deprecated and replaced by the 'InvocationType' parameter in Invoke(). 20 | However, the InvokeAsync API currently seems to take about half the time to return! Which one to use is configurable. 21 | 22 | TODO backlog stress-test queue limits till reaching rate limiting (status 429). 23 | TODO backlog for each invocation, add its actual invoke time as parameter 24 | (now we only measure time since invocation of all tasks started) 25 | """ 26 | # Copyright 2021 The Funnel Rocket Maintainers 27 | # 28 | # Licensed under the Apache License, Version 2.0 (the "License"); 29 | # you may not use this file except in compliance with the License. 30 | # You may obtain a copy of the License at 31 | # 32 | # http://www.apache.org/licenses/LICENSE-2.0 33 | # 34 | # Unless required by applicable law or agreed to in writing, software 35 | # distributed under the License is distributed on an "AS IS" BASIS, 36 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 37 | # See the License for the specific language governing permissions and 38 | # limitations under the License.
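# --- Illustrative sketch (added for clarity, not part of the original module) ---
# Note 1 in the docstring above says the Lambda function's own async-invocation retries should be
# set to zero, since the invoker performs its own retries. A minimal, hedged example of doing that
# with boto3; the function name is a placeholder and this helper does not exist in this repo:
def _example_disable_lambda_retries(function_name: str = "my-frocket-worker") -> None:
    import boto3  # local import so the sketch stays self-contained
    client = boto3.client("lambda")
    client.put_function_event_invoke_config(
        FunctionName=function_name,
        MaximumRetryAttempts=0,          # let the invoker's own retry logic take over
        MaximumEventAgeInSeconds=60)     # drop stale queued invocations quickly
# --- End of sketch ---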
39 | 40 | import logging 41 | import time 42 | import concurrent.futures 43 | from typing import cast 44 | import boto3 45 | from botocore.client import BaseClient 46 | from botocore.config import Config 47 | from frocket.common.serializable import Envelope 48 | from frocket.common.tasks.base import BaseTaskRequest, BaseApiResult 49 | from frocket.invoker.impl.async_invoker import AsyncInvoker 50 | from frocket.common.config import config 51 | 52 | logger = logging.getLogger(__name__) 53 | 54 | DEBUG_PRINT_PAYLOADS = config.bool("invoker.lambda.debug.payload") 55 | LAMBDA_ASYNC_OK_STATUS = 202 56 | 57 | 58 | def _worker_task(req: BaseTaskRequest, client: BaseClient, lambda_name: str) -> BaseApiResult: 59 | """Run by the thread pool below.""" 60 | # noinspection PyBroadException 61 | try: 62 | result = None 63 | json_payload = Envelope.seal_to_json(req) # Encodes the actual object and its type, for correct decoding later. 64 | if DEBUG_PRINT_PAYLOADS: 65 | logger.debug(json_payload) 66 | 67 | legacy_invoke_async = config.bool("invoker.lambda.legacy.async") 68 | status_field = 'Status' if legacy_invoke_async else 'StatusCode' 69 | 70 | if legacy_invoke_async: 71 | response = client.invoke_async(FunctionName=lambda_name, InvokeArgs=json_payload) 72 | else: 73 | response = client.invoke(FunctionName=lambda_name, InvocationType='Event', Payload=json_payload) 74 | 75 | if response[status_field] == LAMBDA_ASYNC_OK_STATUS: 76 | result = BaseApiResult(success=True, error_message=None) 77 | else: 78 | message = f"Response status differs from expected ({LAMBDA_ASYNC_OK_STATUS}): {response}" 79 | result = BaseApiResult(success=False, error_message=message) 80 | except Exception as e: 81 | result = BaseApiResult(success=False, error_message=f"Failed to invoke lambda function '{lambda_name}': {e}") 82 | return result 83 | 84 | 85 | class AwsLambdaInvoker(AsyncInvoker): 86 | def _enqueue(self, requests) -> None: 87 | lambda_name = config.get('invoker.lambda.name') 88 | num_threads = config.int('invoker.lambda.threads') 89 | boto_config = Config(**config.aws_config_dict(service='lambda')) 90 | client = boto3.client('lambda', 91 | **config.aws_client_settings(service='lambda'), 92 | config=boto_config) 93 | logger.debug(f"Invoking lambdas, name: {lambda_name}, no. of invocations: {len(requests)}" 94 | f", no. of invoker threads: {num_threads}") 95 | futures = [] 96 | start_invoke_time = time.time() 97 | # TODO backlog consider lifecycle of the thread pool 98 | with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor: 99 | for req in requests: 100 | futures.append(executor.submit(_worker_task, req, client, lambda_name)) 101 | futures = concurrent.futures.as_completed(futures) # Wait till all complete! 
102 | executor.shutdown() 103 | 104 | error_message = None 105 | for f in futures: 106 | assert f.done() 107 | if f.cancelled(): 108 | error_message = "Lambda invocation interrupted" 109 | elif f.exception(): 110 | error_message = f"Invocation failed with error: {f.exception()}" 111 | else: 112 | result = f.result() 113 | if not result or type(result) is not BaseApiResult: 114 | error_message = f"Invocation returned with response: {result}" 115 | result = cast(BaseApiResult, result) 116 | if not result.success: 117 | error_message = result.error_message 118 | if error_message: 119 | break 120 | 121 | if error_message: 122 | logger.error(error_message) 123 | raise Exception(error_message) 124 | else: 125 | logger.info(f"Async invocation done in {time.time() - start_invoke_time:.3f}") 126 | -------------------------------------------------------------------------------- /frocket/invoker/impl/registered_invokers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | from enum import Enum, auto 17 | from frocket.common.config import config 18 | from frocket.invoker.base_invoker import BaseInvoker 19 | from frocket.invoker.jobs.job import Job 20 | from frocket.invoker.impl.aws_lambda_invoker import AwsLambdaInvoker 21 | from frocket.invoker.impl.work_queue_invoker import WorkQueueInvoker 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class InvocationType(Enum): 27 | WORK_QUEUE = auto() 28 | AWS_LAMBDA = auto() 29 | 30 | 31 | INVOKER_CLASSES = { 32 | InvocationType.WORK_QUEUE: WorkQueueInvoker, 33 | InvocationType.AWS_LAMBDA: AwsLambdaInvoker 34 | } 35 | 36 | 37 | def new_invoker(request_builder: Job) -> BaseInvoker: 38 | invoker_type = InvocationType[config.get("invoker").upper()] 39 | invoker_class = INVOKER_CLASSES[invoker_type] 40 | logger.info(f"Creating invoker type: {invoker_class.__name__}, for request builder type: {type(request_builder)}") 41 | return invoker_class(request_builder) 42 | -------------------------------------------------------------------------------- /frocket/invoker/impl/work_queue_invoker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Invoke tasks by enqueing them in the datastore. Not much to do here :-) 3 | """ 4 | # Copyright 2021 The Funnel Rocket Maintainers 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from frocket.invoker.impl.async_invoker import AsyncInvoker 19 | 20 | 21 | class WorkQueueInvoker(AsyncInvoker): 22 | def _enqueue(self, requests) -> None: 23 | self._datastore.enqueue(requests) 24 | -------------------------------------------------------------------------------- /frocket/invoker/invoker_api.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the "Funnel Rocket" frontend API - wrapped by the CLI & API server, and may be embedded in other apps. 3 | Clients are not expected to bypass this API (call the datastore directly, initialize an invoker, etc.) 4 | """ 5 | # Copyright 2021 The Funnel Rocket Maintainers 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | import concurrent.futures 20 | import logging 21 | import time 22 | from typing import List, Optional, cast, Union 23 | from frocket.common.config import config 24 | from frocket.common.dataset import DatasetInfo, DatasetShortSchema, DatasetSchema, DatasetPartsInfo 25 | from frocket.common.tasks.registration import RegistrationJobResult, RegisterArgs, UnregisterApiResult 26 | from frocket.common.tasks.query import QueryJobResult 27 | from frocket.common.tasks.async_tracker import AsyncJobTracker, AsyncJobStatusUpdater 28 | from frocket.common.validation.query_validator import QueryValidator 29 | from frocket.common.validation.result import QueryValidationResult 30 | from frocket.datastore.registered_datastores import get_datastore 31 | from frocket.invoker.jobs.query_job import QueryJob 32 | from frocket.invoker.jobs.registration_job import RegistrationJob 33 | from frocket.invoker.impl.registered_invokers import new_invoker 34 | 35 | logger = logging.getLogger(__name__) 36 | executor = concurrent.futures.ThreadPoolExecutor() 37 | 38 | # TODO backlog allow configurable timeout per job type (async or not) 39 | ASYNC_MAX_WAIT = config.int("invoker.run.timeout") * 1.1 # Adding a bit of grace around the invoker 40 | 41 | 42 | def _unregister_safety_interval() -> int: 43 | """How long after a dataset was last used unregistering is blocked (can be set to zero, or overridden with force=True).""" 44 | interval = config.get('unregister.last.used.interval', None) 45 | if not interval: # Not defined, or empty string (explicit '0' is truthy) 46 | interval = config.int('invoker.run.timeout') * 2 47 | else: 48 | interval = int(interval) 49 | return interval 50 | 51 | 52 | def register_dataset(args: RegisterArgs) -> RegistrationJobResult: 53 | request_builder = RegistrationJob(args) 54 | invoker = new_invoker(request_builder) 55 | result = cast(RegistrationJobResult, invoker.run()) 56 | logger.info(f"Registration {'successful' if result.success else f'failed! 
{result.error_message}'}") 57 | return result 58 | 59 | 60 | def register_dataset_async(args: RegisterArgs, set_max_wait: bool = True) -> AsyncJobTracker: 61 | """The async version starts the invoker in a separate thread and then returns, handing back 62 | an AsyncJobTracker to poll for progress/completion.""" 63 | def worker(register_args, async_status): 64 | invoker = new_invoker(RegistrationJob(register_args)) 65 | return invoker.run(async_status) 66 | 67 | async_status = AsyncJobStatusUpdater(max_wait=(ASYNC_MAX_WAIT if set_max_wait else None)) 68 | executor.submit(worker, args, async_status) 69 | logger.info(f"Submitted async registration for dataset named {args.name} in basepath {args.basepath}") 70 | return async_status 71 | 72 | 73 | def get_dataset(name: str, throw_if_missing: bool = False) -> Optional[DatasetInfo]: 74 | dataset = get_datastore().dataset_info(name) 75 | if not dataset and throw_if_missing: 76 | raise Exception(f"Dataset '{name}' not found") 77 | return dataset 78 | 79 | 80 | def get_dataset_schema(dataset: DatasetInfo, full: bool = False) -> Union[DatasetSchema, DatasetShortSchema]: 81 | return get_datastore().schema(dataset) if full else get_datastore().short_schema(dataset) 82 | 83 | 84 | def get_dataset_parts(dataset: DatasetInfo) -> DatasetPartsInfo: 85 | return get_datastore().dataset_parts_info(dataset) 86 | 87 | 88 | def unregister_dataset(name: str, force: bool = False) -> UnregisterApiResult: 89 | dataset = get_dataset(name=name) 90 | if not dataset: 91 | return UnregisterApiResult(success=True, error_message=None, 92 | dataset_found=False, dataset_last_used=None) 93 | 94 | datastore = get_datastore() 95 | last_used = datastore.last_used(dataset) 96 | if last_used: 97 | time_since_used = int(time.time() - last_used) 98 | safety_interval = _unregister_safety_interval() 99 | message = f"Dataset was last used {time_since_used} seconds ago, which is less than safety interval " \ 100 | f"{safety_interval}. Use the 'force' parameter to unregister anyway." 
101 | if safety_interval > time_since_used and not force: 102 | return UnregisterApiResult(success=False, error_message=message, 103 | dataset_found=True, dataset_last_used=last_used) 104 | 105 | get_datastore().remove_dataset_info(name) 106 | return UnregisterApiResult(success=True, error_message=None, 107 | dataset_found=True, dataset_last_used=last_used) 108 | 109 | 110 | def expand_and_validate_query(dataset: DatasetInfo, query: dict) -> QueryValidationResult: 111 | short_schema = get_dataset_schema(dataset) 112 | return QueryValidator(query, dataset, short_schema).expand_and_validate() 113 | 114 | 115 | def _build_query_job(dataset: DatasetInfo, 116 | query: dict, 117 | validation_result: QueryValidationResult) -> QueryJob: 118 | """If the query was already validated, skip re-validating.""" 119 | if validation_result: 120 | assert validation_result.success 121 | assert query in [validation_result.source_query, validation_result.expanded_query] 122 | else: 123 | validation_result = expand_and_validate_query(dataset, query) 124 | if not validation_result.success: 125 | raise Exception(f"Query validation failed: {validation_result.error_message}") 126 | 127 | get_datastore().mark_used(dataset) 128 | dataset_parts = get_datastore().dataset_parts_info(dataset) 129 | short_schema = get_datastore().short_schema(dataset) 130 | return QueryJob(dataset, dataset_parts, short_schema, 131 | validation_result.expanded_query, validation_result.used_columns) 132 | 133 | 134 | def run_query(dataset: DatasetInfo, 135 | query: dict, 136 | validation_result: QueryValidationResult = None) -> QueryJobResult: 137 | job_builder = _build_query_job(dataset, query, validation_result) 138 | invoker = new_invoker(job_builder) 139 | result = cast(QueryJobResult, invoker.run()) 140 | if result.success: 141 | logger.info("Query completed successfully") 142 | else: 143 | logger.error(f"Query failed with message: {result.error_message}") 144 | return result 145 | 146 | 147 | def run_query_async(dataset: DatasetInfo, 148 | query: dict, 149 | set_max_wait: bool = True, 150 | validation_result: QueryValidationResult = None) -> AsyncJobTracker: 151 | """The async version starts the invoker in a separate thread and then returns, handing back 152 | an AsyncJobTracker to poll for progress/completion.""" 153 | def worker(job_builder, async_status): 154 | invoker = new_invoker(job_builder) 155 | return invoker.run(async_status) 156 | 157 | job_builder = _build_query_job(dataset, query, validation_result) 158 | async_status = AsyncJobStatusUpdater(max_wait=(ASYNC_MAX_WAIT if set_max_wait else None)) 159 | executor.submit(worker, job_builder, async_status) 160 | logger.info(f"Submitted async query for dataset '{dataset.id.name}'") 161 | return async_status 162 | 163 | 164 | def list_datasets() -> List[DatasetInfo]: 165 | datasets = get_datastore().datasets() 166 | return datasets 167 | -------------------------------------------------------------------------------- /frocket/invoker/jobs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/invoker/jobs/job.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import abstractmethod, ABCMeta 16 | from typing import List, Optional, Set 17 | from frocket.common.dataset import DatasetPartId, DatasetPartsInfo 18 | from frocket.common.metrics import LabelsDict 19 | from frocket.common.tasks.base import BaseTaskRequest, BaseTaskResult, BaseJobResult, JobStatus, ErrorMessage 20 | from frocket.common.tasks.async_tracker import AsyncJobStatusUpdater 21 | 22 | 23 | class Job(metaclass=ABCMeta): 24 | """ 25 | For each job type (registration, query, and future ones) there is a concrete subclass. 26 | That concrete class is handed to the invoker object, which is agnostic to the job details but calls the 27 | job's methods in a set order. 28 | 29 | The flow, at a high level: 30 | 1. On prerun(), the job validates its arguments (and can fail by returning an error message) and can prepare data 31 | for building tasks. 32 | 33 | 2. When build_tasks() is called by the invoker - return a list of concrete task request objects, 34 | all with attempt no. 0. 35 | 36 | 3. If the job supports task self-selection by workers, it should override dataset_parts_to_publish() and 37 | return a list of parts to be consumed by workers (workers would try to select parts they have cached locally). 38 | This list is published via the datastore before tasks are invoked. 39 | 40 | 4. In case the invoker decides to retry a task, it calls build_retry_task() to create a specific retry task. 41 | 42 | 5. After all tasks have completed, either successfully or not, complete() is called to run any validations on the 43 | final results of all tasks, and perform any needed aggregations. The job may fail at this stage if the results of 44 | tasks, taken together, are invalid. 45 | 46 | 6. Lastly, build_result() is called to construct the final job result. 47 | At this stage, the final success status of the job should not change.
48 | """ 49 | _request_id = None 50 | _labels = {} 51 | 52 | @property 53 | def request_id(self) -> Optional[str]: 54 | return self._request_id 55 | 56 | @request_id.setter 57 | def request_id(self, request_id: str): 58 | self._request_id = request_id 59 | 60 | def prerun(self, async_updater: AsyncJobStatusUpdater = None) -> Optional[ErrorMessage]: 61 | pass 62 | 63 | @abstractmethod 64 | def build_tasks(self) -> List[BaseTaskRequest]: 65 | pass 66 | 67 | def dataset_parts_to_publish(self) -> Optional[Set[DatasetPartId]]: 68 | return None 69 | 70 | @abstractmethod 71 | def total_tasks(self) -> int: 72 | pass 73 | 74 | @abstractmethod 75 | def build_retry_task(self, attempt_no: int, task_index: int) -> BaseTaskRequest: 76 | pass 77 | 78 | def complete(self, 79 | tasks_final_status: JobStatus, 80 | latest_task_results: List[BaseTaskResult], 81 | async_updater: AsyncJobStatusUpdater = None) -> JobStatus: 82 | return tasks_final_status 83 | 84 | @abstractmethod 85 | def build_result(self, 86 | base_attributes: dict, 87 | final_status: JobStatus, 88 | latest_task_results: List[BaseTaskResult]) -> BaseJobResult: 89 | pass 90 | 91 | @property 92 | def metric_labels(self) -> LabelsDict: 93 | return self._labels 94 | 95 | @abstractmethod 96 | def parts_info(self) -> Optional[DatasetPartsInfo]: 97 | pass 98 | -------------------------------------------------------------------------------- /frocket/invoker/jobs/query_job.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import time 16 | from typing import List, cast 17 | from frocket.common.config import config 18 | from frocket.common.dataset import DatasetInfo, DatasetPartId, DatasetPartsInfo, DatasetShortSchema 19 | from frocket.common.metrics import JobTypeLabel, DATASET_LABEL 20 | from frocket.common.tasks.query import PartSelectionMode, QueryTaskRequest, QueryTaskResult, QueryJobResult, QueryResult 21 | from frocket.invoker.jobs.job import Job 22 | 23 | 24 | class QueryJob(Job): 25 | def __init__(self, dataset: DatasetInfo, parts: DatasetPartsInfo, 26 | short_schema: DatasetShortSchema, query: dict, used_columns: List[str], 27 | worker_can_select_part: bool = None): 28 | self._dataset = dataset 29 | self._parts = parts 30 | self._query = query 31 | self._used_columns = used_columns 32 | self._paths = parts.fullpaths(parent=dataset) 33 | self._worker_can_select_part = worker_can_select_part \ 34 | if worker_can_select_part is not None else config.bool('worker.self.select.enabled') 35 | if config.bool('dataset.categorical.potential.use'): 36 | self._load_as_categoricals = short_schema.potential_categoricals 37 | else: 38 | self._load_as_categoricals = None 39 | self._labels = { 40 | JobTypeLabel.QUERY.label_name: JobTypeLabel.QUERY.label_value, 41 | DATASET_LABEL: self._dataset.id.name 42 | } 43 | 44 | def parts_info(self): 45 | return self._parts 46 | 47 | def total_tasks(self): 48 | return len(self._paths) 49 | 50 | def build_tasks(self): 51 | if self._worker_can_select_part: 52 | mode = PartSelectionMode.SELECTED_BY_WORKER 53 | else: 54 | mode = PartSelectionMode.SET_BY_INVOKER 55 | 56 | requests = [self._build_task(mode, i) for i in range(self.total_tasks())] 57 | return requests 58 | 59 | def dataset_parts_to_publish(self): 60 | if self._worker_can_select_part: 61 | parts_to_publish = {DatasetPartId(self._dataset.id, path, part_index) 62 | for part_index, path in enumerate(self._paths)} 63 | return parts_to_publish 64 | else: 65 | return None 66 | 67 | def build_retry_task(self, attempt_no, task_index): 68 | return self._build_task(PartSelectionMode.SET_BY_INVOKER, 69 | part_index=task_index, 70 | attempt_no=attempt_no) 71 | 72 | def _build_task(self, mode: PartSelectionMode, part_index: int, attempt_no: int = 0) -> QueryTaskRequest: 73 | if mode == PartSelectionMode.SET_BY_INVOKER: 74 | invoker_set_part = DatasetPartId(dataset_id=self._dataset.id, 75 | path=self._paths[part_index], 76 | part_idx=part_index) 77 | task_index = part_index 78 | elif mode == PartSelectionMode.SELECTED_BY_WORKER: 79 | assert attempt_no == 0 80 | invoker_set_part = None 81 | task_index = None 82 | else: 83 | raise Exception("Unknown mode {mode}") 84 | 85 | request = QueryTaskRequest( 86 | request_id=self._request_id, 87 | invoke_time=time.time(), 88 | dataset=self._dataset, 89 | load_as_categoricals=self._load_as_categoricals, 90 | query=self._query, 91 | invoker_set_task_index=task_index, 92 | attempt_no=attempt_no, 93 | mode=mode, 94 | invoker_set_part=invoker_set_part, 95 | used_columns=self._used_columns) 96 | return request 97 | 98 | def build_result(self, base_attributes, final_status, latest_task_results): 99 | aggregated_query_result = None 100 | # Only if query was successful, aggregate query results (for each task - from a single successful attempt) 101 | if final_status.success: 102 | latest_task_results = cast(List[QueryTaskResult], latest_task_results) 103 | query_results = [task_result.query_result for task_result in latest_task_results] 104 | aggregated_query_result = cast(QueryResult, 105 | 
QueryResult.reduce(query_results)) 106 | 107 | result = QueryJobResult( 108 | **base_attributes, 109 | query=aggregated_query_result.query if aggregated_query_result else None, 110 | funnel=aggregated_query_result.funnel if aggregated_query_result else None 111 | ) 112 | return result 113 | -------------------------------------------------------------------------------- /frocket/invoker/metrics_frame.py: -------------------------------------------------------------------------------- 1 | """ 2 | Transform a given list of metrics from multiple sources (invoker, workers) into one DataFrame, for easy analysis. 3 | Export the data to a file and/or Prometheus, depending on configuration. 4 | """ 5 | # Copyright 2021 The Funnel Rocket Maintainers 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | import logging 20 | from typing import List, Dict, Union 21 | import pandas as pd 22 | from pandas import DataFrame 23 | from frocket.common.config import config 24 | from frocket.common.metrics import SourceAndMetricTuple, ALL_LABEL_NAMES 25 | from frocket.invoker import prom_adapter 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | METRIC_SOURCE_COLUMN = 'source' 30 | METRIC_NAME_COLUMN = 'metric' 31 | METRIC_VALUE_COLUMN = 'value' 32 | 33 | PANDAS_FLOAT_FORMAT = '{:.5f}' # No pesky scientific notation ;-) 34 | pd.options.display.float_format = PANDAS_FLOAT_FORMAT.format 35 | 36 | # The 'last run' file, if defined, stores the most recent job's metrics as a file in CSV or Parquet format (by extension) 37 | EXPORT_LASTRUN_FILE = config.get('metrics.export.lastrun', None) 38 | EXPORT_TO_PROMETHEUS = config.bool('metrics.export.prometheus') 39 | 40 | if EXPORT_TO_PROMETHEUS: 41 | prom_adapter.init_prom_metrics() 42 | 43 | 44 | class MetricsFrame: 45 | def __init__(self, source_and_metrics: List[SourceAndMetricTuple]): 46 | self._sources = [ms.source for ms in source_and_metrics] 47 | self._metrics = [ms.metric for ms in source_and_metrics] 48 | self._build_df() 49 | 50 | def _build_df(self): 51 | """ 52 | Build the DataFrame: each row is one reported metric, but the DF is created with columns. Hence, we're creating 53 | columns here rather than rows. 54 | """ 55 | metric_source_column = self._sources 56 | metric_name_column = [m.name.name for m in self._metrics] # Metric names column 57 | metric_value_column = [m.value for m in self._metrics] # Metric values column 58 | 59 | # Init empty columns for all possible label names. 
60 | # Cells not filled (see below) will remain empty (possibly even entire columns) 61 | label_columns: Dict[str, List[Union[str, None]]] = {} 62 | for label_name in ALL_LABEL_NAMES: 63 | label_columns[label_name] = [None] * len(self._metrics) 64 | 65 | # Fill label columns with whichever labels are actually set per metric 66 | for i, metric in enumerate(self._metrics): 67 | for label_name, label_value in metric.labels.items(): 68 | label_columns[label_name][i] = label_value 69 | 70 | df_columns = {METRIC_SOURCE_COLUMN: metric_source_column, 71 | METRIC_NAME_COLUMN: metric_name_column, 72 | METRIC_VALUE_COLUMN: metric_value_column, 73 | **label_columns} 74 | self._df = pd.DataFrame(data=df_columns) 75 | # logger.debug(f"Types: {self._df.dtypes.index.tolist()}, data:\n{self._df}") # If needed 76 | 77 | def export(self) -> None: 78 | if EXPORT_LASTRUN_FILE: 79 | self._to_lastrun_file(EXPORT_LASTRUN_FILE) 80 | if EXPORT_TO_PROMETHEUS: 81 | self._to_prometheus() 82 | 83 | def _to_prometheus(self) -> None: 84 | prom_adapter.update(self._metrics) 85 | 86 | def _to_lastrun_file(self, filename: str) -> None: 87 | if filename.lower().endswith('.parquet'): 88 | self._df.to_parquet(filename, index=False) 89 | else: 90 | self._df.to_csv(filename, float_format=PANDAS_FLOAT_FORMAT, index=False) 91 | 92 | @property 93 | def dataframe(self) -> DataFrame: 94 | return self._df 95 | -------------------------------------------------------------------------------- /frocket/invoker/prom_adapter.py: -------------------------------------------------------------------------------- 1 | """ 2 | While metrics support in Funnel Rocket is built with Prometheus (or more generally OpenMetrics) in mind, 3 | all Prometheus-specific code is in this module. 4 | 5 | TODO backlog support a help string (documentation) for each member in the MetricName enum 6 | """ 7 | # Copyright 2021 The Funnel Rocket Maintainers 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License.
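# --- Illustrative sketch (added for clarity, not part of the original module) ---
# MetricsFrame above yields one DataFrame row per reported metric: 'source', 'metric' and 'value'
# columns plus one column per possible label name (labels not set for a metric stay None).
# A hypothetical example of what such a frame might look like; the rows, source names and label
# columns below are made up for illustration only:
#
#        source            metric                  value      ...label columns...
#   0    invoker           ASYNC_ENQUEUE_SECONDS   0.42000     ...
#   1    task-0-attempt-0  TASK_TOTAL_RUN_SECONDS  1.25000     ...
#   2    task-1-attempt-0  SCANNED_ROWS            80000.00000 ...
# --- End of sketch ---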
20 | 21 | from typing import List, Dict, Type 22 | from prometheus_client import Counter, Histogram 23 | from prometheus_client.metrics import MetricWrapperBase 24 | from frocket.common.config import config 25 | from frocket.common.helpers.utils import memoize 26 | from frocket.common.metrics import MetricName, MeasuredUnit, supported_label_names, MetricData, empty_label_names 27 | 28 | prom_counters: Dict[MetricName, Counter] = {} 29 | prom_histograms: Dict[MetricName, Histogram] = {} 30 | 31 | 32 | @memoize 33 | def buckets_by_unit(unit: MeasuredUnit) -> List[float]: 34 | """Each unit (seconds, bytes, dollars) may have its own buckets configured, or fallback to the default.""" 35 | assert unit is not MeasuredUnit.COUNT # COUNT should not use a histogram 36 | buckets_string = config.get_with_fallbacks(f'metrics.buckets.{unit.name.lower()}', 'metrics.buckets.default') 37 | buckets = [float(b) for b in buckets_string.split(',')] 38 | return buckets 39 | 40 | 41 | def unit_to_metric_type(unit: MeasuredUnit) -> Type[MetricWrapperBase]: 42 | """The type of Prometheus metric is automatically derived from the type of measured unit.""" 43 | if unit is MeasuredUnit.COUNT: 44 | return Counter 45 | else: 46 | return Histogram 47 | 48 | 49 | def init_prom_metrics(): 50 | """In Prometheus clients, all metrics should be defined only once before use, along with their possible labels. 51 | This is not a technical limitation of Prometheus itself, but rather enforced by official clients.""" 52 | for e in MetricName: 53 | base_args = {'name': e.name.lower(), 54 | 'documentation': e.name, 55 | 'labelnames': supported_label_names(e)} 56 | metric_type = unit_to_metric_type(e) 57 | if metric_type == Counter: 58 | prom_counters[e] = Counter(**base_args) 59 | elif metric_type == Histogram: 60 | prom_histograms[e] = Histogram(**base_args, buckets=buckets_by_unit(e.unit)) 61 | 62 | 63 | def update(metrics: List[MetricData]): 64 | """Update (increment/observe) new values after a job completes, etc.""" 65 | for md in metrics: 66 | empty_labels = empty_label_names(md.name) 67 | all_labels = {**empty_labels, **md.labels} 68 | metric_type = unit_to_metric_type(md.name.unit) 69 | if metric_type == Counter: 70 | prom_counters[md.name].labels(**all_labels).inc(md.value) 71 | elif metric_type == Histogram: 72 | prom_histograms[md.name].labels(**all_labels).observe(md.value) 73 | -------------------------------------------------------------------------------- /frocket/invoker/stats_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Build JobStats (returned to the client after job completion) - based mostly on the DataFrame of collected metrics from 3 | the invoker and all workers. 4 | """ 5 | # Copyright 2021 The Funnel Rocket Maintainers 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | import logging 20 | import sys 21 | from typing import Optional, Union, List, Dict 22 | import pandas 23 | import numpy as np 24 | from pandas import DataFrame 25 | from frocket.common.config import config 26 | from frocket.common.dataset import DatasetPartsInfo, PartNamingMethod 27 | from frocket.common.tasks.base import JobStats, JobDatasetStats, JobInvokerStats, TimingStats, JobWorkerStats 28 | from frocket.invoker.metrics_frame import MetricsFrame, METRIC_NAME_COLUMN, METRIC_VALUE_COLUMN, METRIC_SOURCE_COLUMN 29 | from frocket.common.metrics import MetricName, ComponentLabel, SUCCESS_LABEL, MetricLabelEnum, \ 30 | WorkerStartupLabel, LoadFromLabel 31 | 32 | logger = logging.getLogger(__name__) 33 | 34 | TASK_COMPLETION_GRANULARITY_SECONDS = 0.25 # Data series of task success over time is measured in this resolution 35 | TIMING_PERCENTILES = [float(pct) for pct in config.get('stats.timing.percentiles').split(',')] 36 | MIN_METRICS_FOR_PERCENTILES = 20 # Below this sample count, don't return percentiles 37 | MIN_METRICS_FOR_99_PERCENTILE = 100 # Below this count, don't return 99th percentile 38 | # List of keys to pull from Pandas' describe() 39 | TIMING_DESCRIBE_KEYS = ['min', 'mean', 'max'] + [f"{int(pct*100)}%" for pct in TIMING_PERCENTILES] 40 | 41 | 42 | def build_stats(frame: MetricsFrame, parts_info: DatasetPartsInfo = None) -> JobStats: 43 | df = frame.dataframe 44 | if df is None: # In job failure cases 45 | return JobStats() 46 | 47 | if parts_info: 48 | ds_stats = JobDatasetStats(total_size=parts_info.total_size, parts=parts_info.total_parts) 49 | else: 50 | ds_stats = None 51 | 52 | # Invoker stats 53 | all_task_rows_df = _filter_by_label(df, ComponentLabel.WORKER) 54 | successful_task_rows_df = _filter_by_success(all_task_rows_df) 55 | total_tasks = _count_tasks(all_task_rows_df) 56 | failed_tasks = total_tasks - _count_tasks(successful_task_rows_df) 57 | 58 | invoker_stats = JobInvokerStats( 59 | enqueue_time=_sum_value(df, MetricName.ASYNC_ENQUEUE_SECONDS, single_value=True), 60 | poll_time=_sum_value(df, MetricName.ASYNC_POLL_SECONDS, single_value=True), 61 | total_tasks=total_tasks, 62 | failed_tasks=failed_tasks, 63 | task_success_over_time=_task_success_over_time(successful_task_rows_df) 64 | # TODO backlog add: lost_task_retries as counted by the invoker; support sync. invokers? 
65 | ) 66 | 67 | # Worker stats 68 | worker_stats = JobWorkerStats( 69 | cold_tasks=_count_tasks(_filter_by_label(successful_task_rows_df, WorkerStartupLabel.COLD)), 70 | warm_tasks=_count_tasks(_filter_by_label(successful_task_rows_df, WorkerStartupLabel.WARM)), 71 | scanned_rows=_sum_value(successful_task_rows_df, MetricName.SCANNED_ROWS, as_int=True), 72 | scanned_groups=_sum_value(successful_task_rows_df, MetricName.SCANNED_GROUPS, as_int=True), 73 | cache=_cache_performance(successful_task_rows_df), 74 | invoke_latency=_timing_stats(successful_task_rows_df, MetricName.INVOKE_TO_RUN_SECONDS), 75 | load_time=_timing_stats(successful_task_rows_df, MetricName.TASK_TOTAL_LOAD_SECONDS), 76 | total_time=_timing_stats(successful_task_rows_df, MetricName.TASK_TOTAL_RUN_SECONDS) 77 | # TODO backlog add: loaded_column_types - mapping of column type to count, which affects load time 78 | ) 79 | 80 | job_stats = JobStats( 81 | total_time=_sum_value(df, MetricName.INVOKER_TOTAL_SECONDS, single_value=True), 82 | cost=_total_cost(df), 83 | dataset=ds_stats, 84 | invoker=invoker_stats, 85 | worker=worker_stats) 86 | return job_stats 87 | 88 | 89 | def _task_success_over_time(task_rows_df: DataFrame) -> Dict[float, int]: 90 | """Return a sparse series of data points - for each time slot (e.g. 0.25 secs) since the job started, return how 91 | many tasks completed successfully in that slot. Non-cumulative, does not include zeros.""" 92 | task_duration_rows = _filter_by_metrics( 93 | task_rows_df, metrics=[MetricName.INVOKE_TO_RUN_SECONDS, MetricName.TASK_TOTAL_RUN_SECONDS]) 94 | task_durations = task_duration_rows.groupby(METRIC_SOURCE_COLUMN)[METRIC_VALUE_COLUMN].sum() 95 | quantized_task_durations = \ 96 | np.ceil(task_durations / TASK_COMPLETION_GRANULARITY_SECONDS) * TASK_COMPLETION_GRANULARITY_SECONDS 97 | return quantized_task_durations.value_counts().sort_index().to_dict() 98 | 99 | 100 | def _cache_performance(task_rows_df: DataFrame) -> Dict[str, int]: 101 | return { 102 | # Note the 'source' is always the case for locally-loaded files, in which case caching is N/A. 
103 | 'source': _count_tasks(_filter_by_label(task_rows_df, LoadFromLabel.SOURCE)), 104 | 'diskCache': _count_tasks(_filter_by_label(task_rows_df, LoadFromLabel.DISK_CACHE)) 105 | } 106 | 107 | 108 | def _sum_value(df: DataFrame, metric: MetricName, 109 | single_value: bool = False, 110 | as_int: bool = False) -> Union[float, int, None]: 111 | df = _filter_by_metrics(df, metric) 112 | if single_value: 113 | assert len(df) <= 1 114 | if df.empty: 115 | return None 116 | else: 117 | values_sum = df[METRIC_VALUE_COLUMN].sum() 118 | return int(values_sum) if as_int else float(values_sum) 119 | 120 | 121 | def _count(df: DataFrame, metric: MetricName) -> int: 122 | return _filter_by_metrics(df, metric)[METRIC_VALUE_COLUMN].count() 123 | 124 | 125 | def _timing_stats(task_rows_df: DataFrame, metric: MetricName) -> TimingStats: 126 | values_df = _filter_by_metrics(task_rows_df, metric)[METRIC_VALUE_COLUMN] 127 | if len(values_df) < MIN_METRICS_FOR_PERCENTILES: 128 | percentiles = [0.5] 129 | else: 130 | percentiles = TIMING_PERCENTILES 131 | if len(values_df) < MIN_METRICS_FOR_99_PERCENTILE: 132 | percentiles = [pct for pct in percentiles if pct < 0.99] 133 | 134 | raw_stats = values_df.describe(percentiles=percentiles).to_dict() 135 | return {k: v for k, v in raw_stats.items() 136 | if k in TIMING_DESCRIBE_KEYS and not np.isnan(v)} 137 | 138 | 139 | def _filter_by_metrics(df: DataFrame, metrics: Union[MetricName, List[MetricName]]) -> DataFrame: 140 | if type(metrics) is MetricName: 141 | return df[df[METRIC_NAME_COLUMN] == metrics.name] 142 | else: 143 | return df[df[METRIC_NAME_COLUMN].isin([m.name for m in metrics])] 144 | 145 | 146 | def _filter_by_label(df: DataFrame, label: MetricLabelEnum) -> DataFrame: 147 | return df[df[label.label_name] == label.label_value.lower()] 148 | 149 | 150 | def _filter_by_success(df: DataFrame, value: bool = True) -> DataFrame: 151 | return df[df[SUCCESS_LABEL] == str(value)] 152 | 153 | 154 | def _count_tasks(task_rows_df: DataFrame) -> int: 155 | """Each task attempt (e.g. task index 117, attempt 2) has a unique name in the source column, which ofc appears in 156 | multiple rows. 
This counts the number of unique task attempt IDs in the given DF.""" 157 | return task_rows_df[METRIC_SOURCE_COLUMN].nunique() 158 | 159 | 160 | def _total_cost(df: DataFrame) -> Optional[float]: 161 | cost_reports_df = _filter_by_metrics(df, MetricName.COST_DOLLARS) 162 | num_reports = len(cost_reports_df) 163 | if num_reports == 0: 164 | logger.debug(f"Total cost: no metrics found") 165 | return None 166 | else: 167 | total_cost = float(cost_reports_df[METRIC_VALUE_COLUMN].sum()) 168 | logger.debug(f"Total cost: ${total_cost:.6f} (sum of {num_reports} metric reports)") 169 | return total_cost 170 | 171 | 172 | # Stand-alone testing 173 | if __name__ == "__main__": 174 | config.init_logging(force_level=logging.DEBUG, force_console_output=True) 175 | filename = config.get('metrics.export.lastrun', None) 176 | if not filename: 177 | sys.exit('No lastrun file defined') 178 | 179 | df = pandas.read_parquet(filename) 180 | dummy_frame = MetricsFrame([]) 181 | dummy_frame._df = df 182 | dummy_parts_info = DatasetPartsInfo(naming_method=PartNamingMethod.LIST, total_parts=4, total_size=1024) 183 | build_stats(dummy_frame, dummy_parts_info) 184 | -------------------------------------------------------------------------------- /frocket/worker/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/worker/impl/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/worker/impl/aws_lambda_metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Calculate physical memory & cost for AWS Lambda-based workers. 3 | 4 | Important note re. Lambda billing: although this is not explicitly stated and is subject to change, you are not charged for 5 | the duration in which a cold-started Lambda loads up till the point when the actual handler is called - 6 | meaning, all imports are "free"! This means that cold-started Lambdas mainly impact clock-time latency but typically 7 | won't inflate cost to a similar degree.
This is in line with how the task duration is measured w/o cold-start imports. 8 | """ 9 | # Copyright 2021 The Funnel Rocket Maintainers 10 | # 11 | # Licensed under the Apache License, Version 2.0 (the "License"); 12 | # you may not use this file except in compliance with the License. 13 | # You may obtain a copy of the License at 14 | # 15 | # http://www.apache.org/licenses/LICENSE-2.0 16 | # 17 | # Unless required by applicable law or agreed to in writing, software 18 | # distributed under the License is distributed on an "AS IS" BASIS, 19 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 20 | # See the License for the specific language governing permissions and 21 | # limitations under the License. 22 | 23 | import logging 24 | import math 25 | import re 26 | from frocket.common.metrics import MetricName, EnvironmentMetricsProvider, MetricData 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | # TODO backlog set up a recurring task to check for pricing changes, so this can be updated. 31 | DEFAULT_PRICE_GB_SEC = 0.0000166667 32 | REGION_PRICING = { 33 | "eu-south-1": 0.0000195172, # Milan 34 | "me-south-1": 0.0000206667, # Bahrain 35 | "ap-northeast-3": 0.00002153, # Osaka 36 | "af-south-1": 0.0000221, # Cape Town 37 | "ap-east-1": 0.00002292 # Hong Kong 38 | } 39 | # Assume the actual run takes this many seconds more than what's been measured, 40 | # e.g. time spent decoding the task request, and time still to be spent on writing results (incl. these metrics...) 41 | # to the datastore. 42 | LAMBDA_TIME_OVERHEAD = 0.008 # 8ms, a conservative value based on a few observations 43 | 44 | 45 | class AwsLambdaMetricsProvider(EnvironmentMetricsProvider): 46 | def __init__(self, lambda_context): 47 | # See https://docs.aws.amazon.com/lambda/latest/dg/python-context.html 48 | assert lambda_context.__class__.__name__ == 'LambdaContext' 49 | self._lambda_context = lambda_context 50 | 51 | # What region are we in? 
figure out by the full ARN in the context 52 | # (ARN example: arn:aws:lambda:us-west-2:123456789012:function:my-function) 53 | arn_parts = lambda_context.invoked_function_arn.split(':') 54 | region = arn_parts[3] 55 | if re.match(r'\w+-\w+-\d+', region): 56 | self._region = region 57 | else: 58 | self._region = None 59 | logger.warning(f"Seems like an invalid region: '{region}' in ARN: {lambda_context.invoked_function_arn}, " 60 | f"not calculating cost") 61 | 62 | def _memory_bytes(self): 63 | mem_bytes = int(self._lambda_context.memory_limit_in_mb) * (1024 ** 2) 64 | return MetricData(MetricName.MACHINE_MEMORY_BYTES, mem_bytes) 65 | 66 | def _cost_dollars(self, duration=None): 67 | if not duration or not self._region: 68 | return None 69 | 70 | # noinspection PyBroadException 71 | try: 72 | memory_gb = self._memory_bytes().value / (1024 ** 3) 73 | # Lambdas are currently billed in 1ms granularity, so rounding up 74 | rounded_duration = duration + LAMBDA_TIME_OVERHEAD 75 | rounded_duration = math.ceil(rounded_duration * 1000) / 1000 76 | 77 | gb_second_units = rounded_duration * memory_gb 78 | cost_per_unit = REGION_PRICING.get(self._region, DEFAULT_PRICE_GB_SEC) 79 | cost = gb_second_units * cost_per_unit 80 | message = \ 81 | f"Cost: original duration: {duration: .4f} sec, rounded duration: {rounded_duration:.3f}, memory: " \ 82 | f"{memory_gb}GB, GB/second units: {gb_second_units}, unit cost for region {self._region}: " \ 83 | f"${cost_per_unit:.10f} => total run cost is ${cost:.10f}" 84 | logger.debug(message) 85 | return MetricData(MetricName.COST_DOLLARS, cost) 86 | except Exception: 87 | logger.exception("Failed calculating cost") 88 | return None 89 | -------------------------------------------------------------------------------- /frocket/worker/impl/aws_lambda_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | lambda_handler() in this module is the AWS Lambda's defined entrypoint. 3 | There's minimal code here that's Lambda-specific (== a good thing). 4 | """ 5 | # Copyright 2021 The Funnel Rocket Maintainers 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | import logging 20 | from typing import cast 21 | from frocket.common.serializable import Envelope 22 | from frocket.common.tasks.base import BaseTaskRequest 23 | from frocket.common.metrics import MetricsBag, WorkerStartupLabel, ComponentLabel 24 | from frocket.worker.impl.aws_lambda_metrics import AwsLambdaMetricsProvider 25 | from frocket.worker.runners.base_task_runner import BaseTaskRunner, TaskRunnerContext 26 | from frocket.common.config import config 27 | from frocket.worker.runners.registered_runners import REGISTERED_RUNNERS 28 | 29 | config.init_lambda_logging() # Adapted to the logger being already-inited by the Lambda runtime 30 | logger = logging.getLogger(__name__) 31 | 32 | # This flag only set when a new Lambda instance is cold-started. Warm lambdas would go straight to the handler function. 
33 | cold_start_flag = True 34 | 35 | 36 | def is_cold_start(): 37 | global cold_start_flag 38 | if cold_start_flag: 39 | cold_start_flag = False # For next invocation 40 | return True 41 | else: 42 | return False 43 | 44 | 45 | def init_task_metrics(lambda_context) -> MetricsBag: 46 | metrics = MetricsBag(component=ComponentLabel.WORKER, 47 | env_metrics_provider=AwsLambdaMetricsProvider(lambda_context)) 48 | if is_cold_start(): 49 | metrics.set_label_enum(WorkerStartupLabel.COLD) 50 | else: 51 | metrics.set_label_enum(WorkerStartupLabel.WARM) 52 | return metrics 53 | 54 | 55 | def lambda_handler(event, context): 56 | metrics = init_task_metrics(context) 57 | # The event JSON was already parsed to dict by the Lambda runtime - 58 | # now read from that dict that actual task request object 59 | envelope = Envelope.from_dict(event) 60 | req = cast(BaseTaskRequest, envelope.open(expected_superclass=BaseTaskRequest)) 61 | logger.info(f"Got request: {req}") 62 | 63 | result = None 64 | should_run, reject_reason = BaseTaskRunner.should_run(req) 65 | if should_run: 66 | runner_class = REGISTERED_RUNNERS[type(req)] 67 | runner = runner_class(req, TaskRunnerContext(metrics)) 68 | result = runner.run() 69 | 70 | """ 71 | A note about the Lambda response: unlike most request/response Lambdas, Funnel Rocket's invoker does not rely on the 72 | function's result coming from the Lambda directly (as it's invoked async.) but rather always through the datastore. 73 | The retry mechanism is also based on polling the tasks' status and result payload in the datastore, hence the 74 | Lambda itself should not normally return a non-200 status (unless it crashed unexpectedly), and the Lambda should 75 | be configured to have no retries at the AWS level. 76 | """ 77 | 78 | lambda_response = { 79 | 'statusCode': 200, 80 | } 81 | 82 | # Getting the result object in the Lambda response is still useful for manual testing 83 | if logger.isEnabledFor(logging.DEBUG): 84 | if result: 85 | lambda_response['result'] = result.to_json() 86 | else: 87 | lambda_response['reject_reason'] = reject_reason 88 | return lambda_response 89 | -------------------------------------------------------------------------------- /frocket/worker/impl/generic_env_metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the most generic implementation for getting runtime-environment metrics: 3 | it does not assume we know the cost of the host machine for the request duration, 4 | and getting physical memory size should generally work on Linux variants and OS X versions. 5 | """ 6 | # Copyright 2021 The Funnel Rocket Maintainers 7 | # 8 | # Licensed under the Apache License, Version 2.0 (the "License"); 9 | # you may not use this file except in compliance with the License. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 
19 | 20 | import logging 21 | import os 22 | from frocket.common.metrics import EnvironmentMetricsProvider, MetricData, MetricName 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | class GenericEnvMetricsProvider(EnvironmentMetricsProvider): 28 | def _memory_bytes(self): 29 | # Tested on Linux and OS X 30 | try: 31 | mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') 32 | except ValueError: 33 | # Fallback to sysctl in case that os.sysconf('SC_PHYS_PAGES') fails on OS X (seems version specific) 34 | # noinspection PyBroadException 35 | try: 36 | stream = os.popen('sysctl hw.memsize') 37 | mem_bytes = int(stream.read().split(' ')[1]) 38 | except Exception as e: 39 | logger.warning(f"Can't detect machine memory: {e}") 40 | return None 41 | 42 | return MetricData(MetricName.MACHINE_MEMORY_BYTES, mem_bytes) 43 | 44 | def _cost_dollars(self, duration=None): 45 | return None 46 | -------------------------------------------------------------------------------- /frocket/worker/impl/queue_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | A worker that gets its tasks by a blocking dequeue from the datastore. Doesn't get any simpler - 3 | but is easily scalable, and requires no load balancer or orchestrator (except for the queue's atomic guarantees). 4 | 5 | TODO backlog having a cache-friendly task assignment would require more work, if it makes sense to do. 6 | """ 7 | # Copyright 2021 The Funnel Rocket Maintainers 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | 21 | import logging 22 | from frocket.common.metrics import MetricsBag, WorkerStartupLabel, ComponentLabel 23 | from frocket.common.tasks.base import BaseTaskRequest 24 | from frocket.datastore.registered_datastores import get_datastore 25 | from frocket.worker.impl.generic_env_metrics import GenericEnvMetricsProvider 26 | from frocket.worker.runners.base_task_runner import BaseTaskRunner, TaskRunnerContext 27 | from frocket.common.config import config 28 | from frocket.worker.runners.registered_runners import REGISTERED_RUNNERS 29 | 30 | config.init_logging() 31 | logger = logging.getLogger(__name__) 32 | datastore = get_datastore() 33 | 34 | 35 | def handle(req: BaseTaskRequest) -> None: 36 | metrics = MetricsBag(component=ComponentLabel.WORKER, 37 | env_metrics_provider=GenericEnvMetricsProvider()) 38 | metrics.set_label_enum(WorkerStartupLabel.WARM) # Always warm this worker is, uhmmhmmhmmhmm 39 | 40 | runner_class = REGISTERED_RUNNERS[type(req)] 41 | runner = runner_class(req, TaskRunnerContext(metrics)) 42 | result = runner.run() 43 | if logger.isEnabledFor(logging.DEBUG): 44 | logger.debug(result.to_json()) 45 | 46 | 47 | def main_loop(): 48 | # TODO backlog currently workers that encounter an unexpected data format will crash rather than continuing to 49 | # consume and (probably) fail. 
This has a pro (outdated worker versions fail fast), but of course also cons - 50 | # consider the desired/configurable behavior (e.g. crash after N unexpected errors?) 51 | try: 52 | while True: 53 | logger.info('Waiting for work...') 54 | req: BaseTaskRequest = datastore.dequeue() 55 | if req: 56 | logger.info(f"Got request: {req}") 57 | 58 | should_run, reject_reason = BaseTaskRunner.should_run(req) 59 | if should_run: 60 | handle(req) 61 | else: 62 | logger.warning(f"Request rejected: {reject_reason}") 63 | except KeyboardInterrupt: 64 | logger.info('Bye') 65 | 66 | 67 | main_loop() 68 | -------------------------------------------------------------------------------- /frocket/worker/runners/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/worker/runners/base_task_runner.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for running a task in a worker - to be subclassed for concrete task runners. 3 | """ 4 | # Copyright 2021 The Funnel Rocket Maintainers 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import logging 19 | import time 20 | from abc import abstractmethod 21 | from typing import Optional 22 | from frocket.common.config import config 23 | from frocket.common.metrics import MetricName, MetricsBag 24 | from frocket.common.tasks.base import TaskStatus, BaseTaskRequest, BaseTaskResult, TaskAttemptId 25 | from frocket.datastore.datastore import Datastore 26 | from frocket.datastore.blobstore import Blobstore 27 | from frocket.datastore.registered_datastores import get_datastore, get_blobstore 28 | from frocket.worker.runners.part_loader import PartLoader, shared_part_loader 29 | 30 | logger = logging.getLogger(__name__) 31 | REQUEST_MAX_AGE = int(config.get("worker.reject.age")) 32 | DEFAULT_PREFLIGHT_DURATION_MS = config.int("part.selection.preflight.ms") 33 | 34 | 35 | class TaskRunnerContext: 36 | """simple dependency provider... (for easier testing).""" 37 | def __init__(self, 38 | metrics: MetricsBag, 39 | private_part_loader: PartLoader = None, 40 | preflight_duration_ms: int = None): 41 | self._metrics = metrics 42 | # By default, files are loaded and cached by a re-usable loader. 
43 | # Having a 'private' one allows testing in isolation 44 | self._part_loader = private_part_loader or shared_part_loader() 45 | if preflight_duration_ms is None: 46 | preflight_duration_ms = DEFAULT_PREFLIGHT_DURATION_MS 47 | self._preflight_duration_seconds = preflight_duration_ms / 1000 48 | 49 | @property 50 | def metrics(self) -> MetricsBag: 51 | return self._metrics 52 | 53 | # The underlying get_datastore and get_blobstore are memoized - initialized on demand 54 | @property 55 | def datastore(self) -> Datastore: 56 | return get_datastore() 57 | 58 | @property 59 | def blobstore(self) -> Blobstore: 60 | return get_blobstore() 61 | 62 | @property 63 | def part_loader(self) -> PartLoader: 64 | return self._part_loader 65 | 66 | @property 67 | def preflight_duration_seconds(self) -> float: 68 | return self._preflight_duration_seconds 69 | 70 | 71 | class BaseTaskRunner: 72 | # Returns (should_run, reject_reason) 73 | @classmethod 74 | def should_run(cls, req: BaseTaskRequest) -> (bool, str): 75 | if cls.time_since_invocation(req) > REQUEST_MAX_AGE: 76 | return False, f"request is more than {REQUEST_MAX_AGE} seconds old" 77 | else: 78 | return True, None 79 | 80 | @staticmethod 81 | def time_since_invocation(req: BaseTaskRequest): 82 | return time.time() - req.invoke_time 83 | 84 | def __init__(self, req: BaseTaskRequest, 85 | ctx: TaskRunnerContext): 86 | self._req = req 87 | self._ctx = ctx 88 | # TODO backlog initialize the attempt_id on init, if available (n/a here in self-select part mode) 89 | self._task_attempt_id: Optional[TaskAttemptId] = None 90 | 91 | def run(self) -> BaseTaskResult: 92 | error_message, engine_result = None, None 93 | with self._ctx.metrics.measure(MetricName.TASK_TOTAL_RUN_SECONDS): 94 | try: 95 | self._ctx.metrics.set_metric(MetricName.INVOKE_TO_RUN_SECONDS, 96 | self.time_since_invocation(self._req)) 97 | 98 | self._do_run() # Call concrete class to do the actual work 99 | final_status = TaskStatus.ENDED_SUCCESS 100 | except Exception as e: 101 | final_status = TaskStatus.ENDED_FAILED 102 | error_message = str(e) 103 | logger.exception('Task FAILED!') 104 | 105 | # Post-run: extracting the task metrics, building the concrete result object 106 | final_metrics = self._ctx.metrics.finalize(success=(final_status == TaskStatus.ENDED_SUCCESS)) 107 | # First, set the base attributes in a dict as kind of a 'skeleton' response - then pass it to the concrete 108 | # task runner to pass as **args to the concrete result class 109 | base_attributes = BaseTaskResult( 110 | task_index=self._task_attempt_id.task_index, 111 | status=final_status, 112 | error_message=error_message, 113 | metrics=final_metrics).shallowdict(include_none=True) 114 | result = self._build_result(base_attributes) # Call concrete class 115 | 116 | # If the job failed to get a task attempt ID assigned to it (self-select failed), 117 | # or if the datastore is not available - task status and result cannot be written 118 | # TODO backlog consider having an optional secondary channel to report such failures 119 | # (aside from centralized logging?) 
120 | if self._task_attempt_id: 121 | self._ctx.datastore.write_task_result(self._req.request_id, self._task_attempt_id, result) 122 | else: 123 | logger.error("Can't report result: no part was selected for loading") 124 | 125 | if logger.isEnabledFor(logging.DEBUG): 126 | logger.debug(result) 127 | return result 128 | 129 | def _update_status(self, status: TaskStatus): 130 | self._ctx.datastore.update_task_status(self._req.request_id, self._task_attempt_id, status) 131 | 132 | @abstractmethod 133 | def _do_run(self): 134 | pass 135 | 136 | @abstractmethod 137 | def _build_result(self, base_attributes: dict): 138 | """This method is still called by run() above even if _do_run() has raised an exception - having a sane 139 | result object is important even if a failed one.""" 140 | pass 141 | -------------------------------------------------------------------------------- /frocket/worker/runners/part_loader.py: -------------------------------------------------------------------------------- 1 | """ 2 | Load and cache parts (data files). 3 | """ 4 | # Copyright 2021 The Funnel Rocket Maintainers 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import logging 19 | import time 20 | import os 21 | from pathlib import Path 22 | from typing import List, Dict, Optional, Set, NamedTuple, Union 23 | from pandas import DataFrame 24 | import pyarrow.parquet 25 | from frocket.common.config import config 26 | from frocket.common.helpers.storage import storage_handler_for 27 | from frocket.common.helpers.utils import memoize 28 | from frocket.common.metrics import MetricName, LoadFromLabel, MetricsBag 29 | from frocket.common.dataset import DatasetPartId, DatasetId 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | 34 | # Just a little typed nicety over tuples which PyArrow accepts as predicate pushdown filters 35 | class FilterPredicate(NamedTuple): 36 | column: str 37 | op: str 38 | value: Union[str, int, float, bool] 39 | 40 | 41 | class CacheEntry: 42 | local_path: str 43 | size_mb: float 44 | last_used: float 45 | 46 | 47 | class PartLoader: 48 | _cache: Dict[DatasetPartId, CacheEntry] = None # DatasetPartId is a dataclass with proper hash & equality 49 | _disk_cache_max_size: float = None 50 | 51 | def __init__(self): 52 | self._setup() 53 | 54 | # Support re-initialization and overriding the configured size, for testing 55 | def _setup(self, disk_cache_max_size: float = None): 56 | if self._cache: 57 | for entry in self._cache.values(): 58 | os.remove(entry.local_path) 59 | self._cache = {} 60 | self._disk_cache_max_size = disk_cache_max_size if disk_cache_max_size is not None \ 61 | else config.float('worker.disk.cache.size.mb') 62 | 63 | @property 64 | def cache_current_size_mb(self) -> float: 65 | return sum(entry.size_mb for entry in self._cache.values()) 66 | 67 | @property 68 | def cache_len(self) -> int: 69 | return len(self._cache) 70 | 71 | def _prune_cache(self) -> None: 72 | curr_size_mb = self.cache_current_size_mb 73 | while 
curr_size_mb > 0 and curr_size_mb > self._disk_cache_max_size: 74 | logger.info(f"Current cache size is {curr_size_mb}mb, more than the configured " 75 | f"{self._disk_cache_max_size}mb") 76 | lru_key = min(self._cache, key=lambda k: self._cache[k].last_used) 77 | lru_entry = self._cache[lru_key] 78 | logger.info(f"Deleting LRU entry of dataset: {lru_key.dataset_id.name} " 79 | f"source path: {lru_key.path}, " 80 | f"last used {time.time() - lru_entry.last_used:.1f} seconds ago") 81 | try: 82 | os.remove(lru_entry.local_path) 83 | except OSError: 84 | logger.exception('Failed to delete file!') # TODO backlog consider disabling any further caching 85 | del self._cache[lru_key] 86 | curr_size_mb = self.cache_current_size_mb 87 | 88 | def load_dataframe(self, 89 | file_id: DatasetPartId, 90 | metrics: MetricsBag, 91 | needed_columns: List[str] = None, 92 | filters: List[FilterPredicate] = None, 93 | load_as_categoricals: List[str] = None) -> DataFrame: 94 | self._prune_cache() 95 | loaded_from: Optional[LoadFromLabel] = LoadFromLabel.SOURCE 96 | handler = storage_handler_for(file_id.path) 97 | is_source_remote = handler.remote 98 | 99 | local_path = None 100 | if not is_source_remote: 101 | local_path = file_id.path # No caching for local files 102 | else: 103 | if file_id in self._cache: 104 | local_path = self._cache[file_id].local_path 105 | loaded_from = LoadFromLabel.DISK_CACHE 106 | self._cache[file_id].last_used = time.time() 107 | logger.info("File is locally cached, yay") 108 | 109 | if not local_path: 110 | with metrics.measure(MetricName.TASK_DOWNLOAD_SECONDS): 111 | local_path = str(handler.get_local_path(file_id.path)) # Download to a local temp file 112 | 113 | entry = CacheEntry() 114 | entry.local_path = local_path 115 | entry.size_mb = Path(local_path).stat().st_size / 1024 ** 2 116 | entry.last_used = time.time() 117 | self._cache[file_id] = entry 118 | 119 | with metrics.measure(MetricName.TASK_LOAD_FILE_SECONDS): 120 | # Using PyArrow directly (rather than wrapped through Pandas) allows specifying column names to explicitly 121 | # load as 'dictionary' type, which then translates to categoricals in Pandas. 122 | # If the file was created with Pandas, categorical columns are loaded back as such - but we go beyond 123 | # that to detect 'potential categorical' string columns and load them as such. 124 | # Except for the memory usage saving, there is a performance gain here if the Parquet file already has a 125 | # dictionary for the column. Otherwise, PyArrow will create one - but without a performance gain. 
126 | df = pyarrow.parquet.read_table(local_path, 127 | columns=needed_columns, 128 | filters=filters, 129 | read_dictionary=load_as_categoricals).to_pandas() 130 | 131 | metrics.set_label_enum(loaded_from) 132 | return df 133 | 134 | def get_cached_candidates(self, dataset_id: DatasetId) -> Optional[Set[DatasetPartId]]: 135 | """Do we have cached parts for this DatasetId, that can be used to self-select parts?""" 136 | logger.debug(f"Looking for cached candidates matching: {dataset_id}") 137 | candidates = None 138 | if self._cache: 139 | candidates = {part_id for part_id in self._cache.keys() if part_id.dataset_id == dataset_id} 140 | 141 | logger.debug(f"Found candidates: {candidates}") 142 | return candidates if (candidates and len(candidates) > 0) else None 143 | 144 | 145 | @memoize 146 | def shared_part_loader() -> PartLoader: 147 | """This is used by default, but can be overridden in tests.""" 148 | return PartLoader() 149 | -------------------------------------------------------------------------------- /frocket/worker/runners/query_task_runner.py: -------------------------------------------------------------------------------- 1 | """ 2 | Execute a single query task. 3 | """ 4 | # Copyright 2021 The Funnel Rocket Maintainers 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | import logging 19 | import time 20 | from typing import List, cast, Optional 21 | from pandas import DataFrame 22 | from frocket.common.dataset import DatasetPartId 23 | from frocket.common.metrics import MetricName, PartSelectMethodLabel 24 | from frocket.common.tasks.base import TaskStatus, TaskAttemptId, BaseTaskRequest 25 | from frocket.common.tasks.query import PartSelectionMode, QueryTaskRequest, QueryResult, QueryTaskResult 26 | from frocket.engine.query_engine import QueryEngine 27 | from frocket.worker.runners.base_task_runner import BaseTaskRunner, TaskRunnerContext 28 | from frocket.worker.runners.part_loader import FilterPredicate 29 | 30 | logger = logging.getLogger(__name__) 31 | 32 | 33 | class QueryTaskRunner(BaseTaskRunner): 34 | def __init__(self, req: BaseTaskRequest, ctx: TaskRunnerContext): 35 | super().__init__(req, ctx) 36 | self._req = cast(QueryTaskRequest, req) # Avoid type warnings 37 | self._dataset_part_id: Optional[DatasetPartId] = None 38 | self._query_result: Optional[QueryResult] = None 39 | 40 | def _do_run(self): 41 | self._set_part_to_load() 42 | self._update_status(TaskStatus.LOADING_DATA) 43 | with self._ctx.metrics.measure(MetricName.TASK_TOTAL_LOAD_SECONDS): 44 | df = self._load(needed_columns=self._req.used_columns, 45 | load_as_categoricals=self._req.load_as_categoricals) 46 | 47 | self._update_status(TaskStatus.RUNNING_QUERY) 48 | with self._ctx.metrics.measure(MetricName.TASK_RUN_QUERY_SECONDS): 49 | engine = QueryEngine(self._req.dataset.group_id_column, self._req.dataset.timestamp_column) 50 | engine_result = engine.run(df, self._req.query) 51 | self._query_result = engine_result 52 | 53 | def _set_part_to_load(self) -> None: 54 | task_attempt_no = self._req.attempt_no 55 | if self._req.mode == PartSelectionMode.SET_BY_INVOKER: 56 | part_id = self._req.invoker_set_part 57 | actual_select_method = PartSelectMethodLabel.SET_BY_INVOKER 58 | elif self._req.mode == PartSelectionMode.SELECTED_BY_WORKER: 59 | actual_select_method, part_id = self._select_part_myself() 60 | logger.info(f"Worker selected part: method: {actual_select_method}, file ID: {part_id}, " 61 | f"task attempt no.: {task_attempt_no}") 62 | else: 63 | raise Exception(f"Don't know how to handle request mode: {self._req.mode}") 64 | 65 | if not part_id: 66 | raise Exception("No part to load") 67 | 68 | self._ctx.metrics.set_label_enum(actual_select_method) 69 | self._dataset_part_id = part_id 70 | self._task_attempt_id = TaskAttemptId(part_id.part_idx, task_attempt_no) 71 | 72 | def _select_part_myself(self): 73 | """See configuration guide for 'preflight' concept. 
In general, that's a configurable time period in self-select 74 | part mode, where 'warm' workers can select the candidates they wish without interruption.""" 75 | time_left_in_preflight = self._ctx.preflight_duration_seconds - BaseTaskRunner.time_since_invocation(self._req) 76 | candidates = self._ctx.part_loader.get_cached_candidates(self._req.dataset.id) 77 | sleep_time = 0 78 | if not candidates and time_left_in_preflight > 0: 79 | logger.info("Got no candidates but we're still in preflight" 80 | f", so sleeping for {time_left_in_preflight} seconds") 81 | sleep_time = time_left_in_preflight 82 | 83 | if sleep_time: 84 | time.sleep(time_left_in_preflight) 85 | self._ctx.metrics.set_metric(MetricName.TASK_PREFLIGHT_SLEEP_SECONDS, sleep_time) 86 | 87 | # If a worker got some candidates, we're still gonna try to grab them even if preflight time has ended 88 | selected_part = self._ctx.datastore.self_select_part(self._req.request_id, self._req.attempt_no, candidates) 89 | if not selected_part.part_id: 90 | # Not supposed to happen, unless there's a retry mechanism gone awry 91 | raise Exception("Got no part for me!") 92 | 93 | if candidates: 94 | if not selected_part.random: 95 | actual_select_method = PartSelectMethodLabel.SPECIFIC_CANDIDATE 96 | else: 97 | actual_select_method = PartSelectMethodLabel.RANDOM_CANDIDATES_TAKEN 98 | else: 99 | actual_select_method = PartSelectMethodLabel.RANDOM_NO_CANDIDATES 100 | 101 | return actual_select_method, selected_part.part_id 102 | 103 | def _load(self, needed_columns: List[str] = None, load_as_categoricals: List[str] = None) -> DataFrame: 104 | filters = self._predicate_pushdown_filters() 105 | if logger.isEnabledFor(logging.DEBUG): 106 | logger.debug(f"Filters used when loading: {filters}") 107 | logger.debug(f"Columns to explicitly load as categorical: {load_as_categoricals}") 108 | 109 | df = self._ctx.part_loader.load_dataframe(file_id=self._dataset_part_id, metrics=self._ctx.metrics, 110 | needed_columns=needed_columns, filters=filters, 111 | load_as_categoricals=load_as_categoricals) 112 | self._ctx.metrics.set_metric(MetricName.SCANNED_ROWS, len(df)) 113 | self._ctx.metrics.set_metric(MetricName.SCANNED_GROUPS, df[self._req.dataset.group_id_column].nunique()) 114 | return df 115 | 116 | def _predicate_pushdown_filters(self): 117 | """ 118 | Build PyArrow-compatible pushdown predicates to pass to the part loader. 119 | An important reminder here is that any filter applied would affect not just conditions/sequences, but also 120 | any defined aggregations - meaning it's suitable for limiting scope to the (optional) query timeframe, 121 | but should be evaluated carefully for any other optimizations. 
122 | """ 123 | filters = [] 124 | timeframe = self._req.query.get('timeframe', None) 125 | if timeframe: 126 | fromtime = timeframe.get('from', None) 127 | if fromtime is not None: 128 | filters.append(FilterPredicate(column=self._req.dataset.timestamp_column, op='>=', value=fromtime)) 129 | totime = timeframe.get('to', None) 130 | if totime is not None: 131 | filters.append(FilterPredicate(column=self._req.dataset.timestamp_column, op='<', value=totime)) 132 | 133 | return filters if len(filters) > 0 else None 134 | 135 | def _build_result(self, base_attributes): 136 | return QueryTaskResult( 137 | **base_attributes, 138 | query_result=self._query_result) 139 | -------------------------------------------------------------------------------- /frocket/worker/runners/registered_runners.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Dict, Type 16 | from frocket.common.tasks.base import BaseTaskRequest 17 | from frocket.common.tasks.registration import RegistrationTaskRequest 18 | from frocket.common.tasks.query import QueryTaskRequest 19 | from frocket.worker.runners.base_task_runner import BaseTaskRunner 20 | from frocket.worker.runners.query_task_runner import QueryTaskRunner 21 | from frocket.worker.runners.registration_task_runner import RegistrationTaskRunner 22 | 23 | REGISTERED_RUNNERS: Dict[Type[BaseTaskRequest], Type[BaseTaskRunner]] = { 24 | QueryTaskRequest: QueryTaskRunner, 25 | RegistrationTaskRequest: RegistrationTaskRunner 26 | } 27 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyarrow>=2.0.0 2 | pandas>=1.2.0 3 | boto3>=1.16.0 4 | redis>=3.5.0 5 | tabulate>=0.8.0 6 | prometheus_client>=0.9.0 7 | flask>=1.1.0 8 | jsonschema>=3.2.0 9 | dataclasses-json>=0.5.2 10 | inflection>=0.5.0 11 | parsimonious>=0.8.0 12 | gunicorn>=20.0.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import setuptools 3 | 4 | this_dir = pathlib.Path(__file__).parent 5 | requirements_file = this_dir / "requirements.txt" 6 | readme_file = this_dir / "README.md" 7 | 8 | install_requires = requirements_file.read_text().splitlines() 9 | long_description = readme_file.read_text() if readme_file.exists() else '' 10 | 11 | setuptools.setup( 12 | name="funnel-rocket", 13 | version="0.5.3", 14 | author="Elad Rosenheim, Avshalom Manevich", 15 | author_email="elad@dynamicyield.com", 16 | description="Cloud native distributed funnel queries", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | url="https://github.com/DynamicYieldProjects/funnel-rocket-oss", 20 | 
packages=setuptools.find_packages(), 21 | package_data={ 22 | "frocket": ["resources/*.*"], 23 | }, 24 | classifiers=[ 25 | "Programming Language :: Python :: 3.8", 26 | "Programming Language :: Python :: 3.9", 27 | "License :: OSI Approved :: Apache Software License", 28 | "Operating System :: OS Independent", 29 | ], 30 | python_requires='>=3.8', 31 | install_requires=install_requires 32 | ) 33 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest>=6.2.0 2 | pytest-cov>=2.11.0 3 | icdiff>=0.5.0 4 | requests>=2.25.0 -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | -------------------------------------------------------------------------------- /tests/utils/base_query_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "timeframe": { 3 | "from": 1590918400516, 4 | "to": 1618918400516 5 | }, 6 | "query": { 7 | "relation": "( $1 and $2) || $seq || (( $made_multiple_purchases ) ) ", 8 | "conditions": [ 9 | { 10 | "name": "made_multiple_purchases", 11 | "filter": [ 12 | "eventId", 13 | "==", 14 | 18765 15 | ], 16 | "target": [ 17 | "count", 18 | ">=", 19 | 0 20 | ], 21 | "includeZero": true 22 | }, 23 | { 24 | "name": "made_multiple_purchases2", 25 | "filter": [ 26 | "eventId", 27 | "==", 28 | 18766 29 | ], 30 | "includeZero": false 31 | }, 32 | { 33 | "filter": { 34 | "column": "eventId", 35 | "op": "==", 36 | "value": 18767 37 | }, 38 | "target": { 39 | "type": "sum", 40 | "column": "eventValue", 41 | "op": "<", 42 | "value": 350 43 | } 44 | }, 45 | { 46 | "filter": { 47 | "column": "eventId", 48 | "op": "==", 49 | "value": 18768 50 | }, 51 | "target": [ 52 | "sum", 53 | "eventValue", 54 | "<", 55 | 350 56 | ] 57 | }, 58 | { 59 | "filters": [ 60 | { 61 | "column": "eventType", 62 | "op": "==", 63 | "value": "purchase" 64 | }, 65 | { 66 | "column": "goalValue", 67 | "op": ">=", 68 | "value": 3 69 | } 70 | ], 71 | "target": [ 72 | "sum", 73 | "eventValue", 74 | "<", 75 | 350 76 | ], 77 | "includeZero": false 78 | }, 79 | { 80 | "name": "seq", 81 | "sequence": [ 82 | { 83 | "filter": [ 84 | "eventType", 85 | "==", 86 | "addToCart" 87 | ] 88 | }, 89 | { 90 | "filters": [ 91 | { 92 | "column": "eventType", 93 | "op": "==", 94 | "value": "purchase" 95 | }, 96 | { 97 | "column": "goalValue", 98 | "op": ">=", 99 | "value": 3 100 | } 101 | ] 102 | }, 103 | { 104 | "rowFound": false, 105 | "filter": { 106 | "column": "eventType", 107 | "op": "==", 108 | "value": "signToClub" 109 | } 110 | } 111 | ], 112 | "maxDuration": 23443 113 | } 114 | ], 115 | "aggregations": [ 116 | { 117 | "column": "device" 118 | }, 119 | { 120 | "column": "transactionId", 121 | "type": "count", 122 | "name": "purchase_count" 123 | }, 124 | { 125 | "column": "goalId" 126 | }, 127 | { 128 | "column": "goalId", 129 | "type": "sumPerValue", 130 | "otherColumn": "goalValue", 131 | "name": "hoola" 132 | } 133 | ] 134 | }, 135 | "funnel": { 136 | "sequence": [ 137 | { 138 | "filter": [ 139 | "eventType", 140 | "==", 141 | "addToCart" 142 | ] 143 | }, 144 | { 145 | "filter": { 146 | "column": "eventId", 147 | "op": "==", 148 | "value": 18765 149 | } 150 | } 151 | ], 152 | "maxDuration": 23443, 153 | "stepAggregations": [ 154 | { 155 | "column": "goalId", 156 | "type": "count", 157 | "name": "mosh" 158 | }, 159 | { 160 | "column": "eventId", 161 | "type": "groupsPerValue", 162 | "name": "mosh2" 163 | } 164 | ], 165 | "endAggregations": [ 166 | { 167 | "column": "goalId" 168 | } 169 | ] 170 | } 171 | } -------------------------------------------------------------------------------- /tests/utils/base_test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import tempfile 17 | from typing import List, Type 18 | from frocket.common.metrics import MetricName, MetricData, MetricLabelEnum 19 | 20 | SKIP_SLOW_TESTS = os.environ.get('SKIP_SLOW_TESTS', "False").lower() == 'true' 21 | SKIP_LAMBDA_TESTS = os.environ.get('SKIP_LAMBDA_TESTS', "False").lower() == 'true' 22 | # noinspection PyProtectedMember,PyUnresolvedReferences 23 | TEMP_DIR = tempfile._get_default_tempdir() 24 | 25 | 26 | # noinspection PyProtectedMember,PyUnresolvedReferences 27 | def temp_filename(suffix='', with_dir: bool = True): 28 | fname = next(tempfile._get_candidate_names()) + suffix 29 | return f"{TEMP_DIR}/{fname}" if with_dir else fname 30 | 31 | 32 | # A mixin to allow defining utility classes named "Test" without pytest trying to collect test cases in them, 33 | # which results in warnings (and without needing a pytest.ini entry). See https://stackoverflow.com/a/46199666 34 | class DisablePyTestCollectionMixin(object): 35 | __test__ = False 36 | 37 | 38 | def get_metric_value(metrics: List[MetricData], name: MetricName) -> float: 39 | assert metrics 40 | metric = next(filter(lambda metric: metric.name == name, metrics), None) 41 | assert metric is not None 42 | return metric.value 43 | 44 | 45 | def assert_metric_value(metrics: List[MetricData], name: MetricName, value: float): 46 | assert get_metric_value(metrics, name) == value 47 | 48 | 49 | def find_first_label_value(metrics: List[MetricData], label_type: Type[MetricLabelEnum]) -> str: 50 | assert metrics 51 | found_metric = next(filter(lambda metric: label_type.label_name in metric.labels, metrics), None) 52 | return found_metric.labels[label_type.label_name] 53 | 54 | 55 | def assert_label_value_exists(metrics: List[MetricData], label: MetricLabelEnum): 56 | assert find_first_label_value(metrics, label.__class__) == label.label_value 57 | -------------------------------------------------------------------------------- /tests/utils/lambda_fixture.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | from frocket.common.config import config 17 | 18 | 19 | @pytest.fixture(scope="session", autouse=True) 20 | def init_mock_lambda_settings(): 21 | config['lambda.aws.endpoint.url'] = config.get('lambda.aws.endpoint.url', 'http://localhost:9001') 22 | config['lambda.aws.region'] = config.get('lambda.aws.region', 'us-east-1') 23 | config['lambda.aws.no.signature'] = 'true' 24 | config['invoker.lambda.legacy.async'] = 'false' 25 | -------------------------------------------------------------------------------- /tests/utils/mock_s3_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import boto3 17 | from frocket.common.config import config, ConfigDict 18 | from frocket.common.helpers.utils import timestamped_uuid, memoize 19 | 20 | SKIP_S3_TESTS = os.environ.get('SKIP_S3_TESTS', "False").lower() == 'true' 21 | 22 | 23 | @memoize 24 | def _init_mock_s3_config(): 25 | if SKIP_S3_TESTS: 26 | print(f"Skipping mock S3 config") 27 | config['s3.aws.endpoint.url'] = \ 28 | os.environ.get('MOCK_S3_URL', config.get('s3.aws.endpoint.url', 'http://localhost:9000')) 29 | config['s3.aws.access.key.id'] = \ 30 | os.environ.get('MOCK_S3_USER', config.get('s3.aws.access.key.id', 'testonly')) 31 | config['s3.aws.secret.access.key'] = \ 32 | os.environ.get('MOCK_S3_SECRET', config.get('s3.aws.secret.access.key', 'testonly')) 33 | 34 | 35 | def mock_s3_env_variables(): 36 | _init_mock_s3_config() 37 | return { 38 | ConfigDict.to_env_variable(key): config.get(key) 39 | for key in ['s3.aws.endpoint.url', 's3.aws.access.key.id', 's3.aws.secret.access.key'] 40 | } 41 | 42 | 43 | def new_mock_s3_bucket(): 44 | if SKIP_S3_TESTS: 45 | return None 46 | _init_mock_s3_config() 47 | 48 | bucket_name = timestamped_uuid('testbucket-') 49 | s3 = boto3.resource('s3', **config.aws_client_settings(service='s3')) 50 | bucket = s3.Bucket(bucket_name) 51 | bucket.create() 52 | print(f"Bucket '{bucket_name}' created") 53 | return bucket 54 | -------------------------------------------------------------------------------- /tests/utils/redis_fixture.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import pytest 17 | from frocket.common.config import config, ConfigDict 18 | from frocket.datastore.registered_datastores import get_datastore, get_blobstore 19 | 20 | 21 | @pytest.fixture(scope="session", autouse=True) 22 | def init_test_redis_settings(): 23 | config['redis.host'] = os.environ.get('TEST_REDIS_HOST', config['redis.host']) 24 | config['redis.port'] = os.environ.get('TEST_REDIS_PORT', config['redis.port']) 25 | config['redis.db'] = os.environ.get('TEST_REDIS_DB', config['redis.db']) 26 | print(get_datastore(), get_blobstore()) # Fail on no connection, print connection details 27 | 28 | 29 | def get_test_redis_env_variables(): 30 | return { 31 | ConfigDict.to_env_variable(key): config.get(key) 32 | for key in ['redis.host', 'redis.port', 'redis.db', 'datastore.redis.prefix'] 33 | } 34 | --------------------------------------------------------------------------------