├── .dockerignore
├── .github
│   ├── CODEOWNERS
│   ├── dependabot.yml
│   └── workflows
│       └── main.yml
├── .gitignore
├── .python-version
├── LICENSE
├── README.md
├── build-lambda.sh
├── data
│   └── .gitignore
├── dataprep_example
│   ├── __init__.py
│   ├── ingest_retailrocket_dataset.py
│   └── repartition.py
├── docker-compose.yml
├── docker
│   ├── all-in-one.Dockerfile
│   ├── entrypoint.sh
│   └── local-lambda.Dockerfile
├── docs
│   ├── api.md
│   ├── example-dataset.md
│   ├── logo-blue.svg
│   ├── logo-icon-dark-blue.svg
│   ├── logo-icon-light-blue.svg
│   ├── logo-small-blue.svg
│   └── operating.md
├── frocket
│   ├── __init__.py
│   ├── apiserver.py
│   ├── cli.py
│   ├── cli_commands.py
│   ├── common
│   │   ├── __init__.py
│   │   ├── config.py
│   │   ├── dataset.py
│   │   ├── helpers
│   │   │   ├── __init__.py
│   │   │   ├── pandas.py
│   │   │   ├── storage.py
│   │   │   └── utils.py
│   │   ├── metrics.py
│   │   ├── serializable.py
│   │   ├── tasks
│   │   │   ├── __init__.py
│   │   │   ├── async_tracker.py
│   │   │   ├── base.py
│   │   │   ├── query.py
│   │   │   └── registration.py
│   │   └── validation
│   │       ├── __init__.py
│   │       ├── consts.py
│   │       ├── error.py
│   │       ├── path_visitor.py
│   │       ├── query_validator.py
│   │       ├── relation_parser.py
│   │       ├── result.py
│   │       └── visitor_functions.py
│   ├── datastore
│   │   ├── __init__.py
│   │   ├── blobstore.py
│   │   ├── datastore.py
│   │   ├── redis_store.py
│   │   └── registered_datastores.py
│   ├── engine
│   │   ├── __init__.py
│   │   ├── query_engine.py
│   │   └── relation_to_pandas.py
│   ├── invoker
│   │   ├── __init__.py
│   │   ├── base_invoker.py
│   │   ├── impl
│   │   │   ├── __init__.py
│   │   │   ├── async_invoker.py
│   │   │   ├── aws_lambda_invoker.py
│   │   │   ├── registered_invokers.py
│   │   │   └── work_queue_invoker.py
│   │   ├── invoker_api.py
│   │   ├── jobs
│   │   │   ├── __init__.py
│   │   │   ├── job.py
│   │   │   ├── query_job.py
│   │   │   └── registration_job.py
│   │   ├── metrics_frame.py
│   │   ├── prom_adapter.py
│   │   └── stats_builder.py
│   ├── resources
│   │   └── query_schema.json
│   └── worker
│       ├── __init__.py
│       ├── impl
│       │   ├── __init__.py
│       │   ├── aws_lambda_metrics.py
│       │   ├── aws_lambda_worker.py
│       │   ├── generic_env_metrics.py
│       │   └── queue_worker.py
│       └── runners
│           ├── __init__.py
│           ├── base_task_runner.py
│           ├── part_loader.py
│           ├── query_task_runner.py
│           ├── registered_runners.py
│           └── registration_task_runner.py
├── requirements.txt
├── setup.py
├── test-requirements.txt
└── tests
    ├── __init__.py
    ├── test_apiserver.py
    ├── test_cli.py
    ├── test_invoker_api.py
    ├── test_part_loader.py
    ├── test_path_visitor.py
    ├── test_query_engine.py
    ├── test_query_job.py
    ├── test_query_task.py
    ├── test_query_validator.py
    ├── test_registration_job.py
    ├── test_registration_task.py
    └── utils
        ├── __init__.py
        ├── base_query_example.json
        ├── base_test_utils.py
        ├── dataset_utils.py
        ├── lambda_fixture.py
        ├── mock_s3_utils.py
        ├── redis_fixture.py
        └── task_and_job_utils.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | **/__pycache__/
2 | **/*.py[cod]
3 | **/.*
4 | docker/*.Dockerfile
5 | *.so
6 | *.parquet
7 | *.zip
8 | data/
9 | layers/
10 | scratch/
11 | build/
12 | dist/
13 | sdist/
14 | *.egg-info/
15 | *.egg
16 | venv/
17 | map/
18 | reduce/
--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | /frocket @dynamicyield/eladroz
2 | /docker @dynamicyield/omrisk
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: github-actions
4 | directory: /
5 | schedule:
6 | interval: daily
7 | commit-message:
8 | prefix: fix(deps)
9 | - package-ecosystem: pip
10 | directory: /
11 | schedule:
12 | interval: daily
13 | commit-message:
14 | prefix: fix(deps)
15 | - package-ecosystem: docker
16 | directory: /docker
17 | schedule:
18 | interval: daily
19 | commit-message:
20 | prefix: fix(deps)
21 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 | on: [ push ]
3 | jobs:
4 | ci:
5 | runs-on: ubuntu-18.04
6 | timeout-minutes: 10
7 |
8 | steps:
9 | - name: Checkout repo
10 | uses: actions/checkout@v2.3.4
11 |
12 | - name: Setup Python
13 | uses: actions/setup-python@v2
14 | with:
15 | python-version: '3.8'
16 | architecture: 'x64'
17 |
18 | - name: Cache dependencies
19 | uses: actions/cache@v2.1.4
20 | id: cache-venv
21 | with:
22 | path: ./venv/
23 | key: ${{ runner.os }}-venv-cache-${{ hashFiles('./requirements.txt','./test-requirements.txt','./setup.py') }}
24 |
25 | - name: Build virtual environment and install dependencies
26 | run: |
27 | python -m pip install --upgrade pip
28 | python -m venv venv
29 | source venv/bin/activate
30 | pip install -e .
31 | pip install -r test-requirements.txt
32 | if: steps.cache-venv.outputs.cache-hit != 'true'
33 |
34 | - name: Set up Docker Buildx
35 | id: buildx
36 | uses: docker/setup-buildx-action@master
37 |
38 | - name: Cache Docker layers for all-in-one
39 | uses: actions/cache@v2.1.4
40 | with:
41 | path: /tmp/.buildx-cache-all-in-one
42 | key: ${{ runner.os }}-buildx-all-in-one-${{ github.sha }}
43 | restore-keys: |
44 | ${{ runner.os }}-buildx-all-in-one-
45 |
46 | - name: Docker build all-in-one
47 | id: docker_build_all_in_one
48 | uses: docker/build-push-action@v2
49 | with:
50 | context: .
51 | file: ./docker/all-in-one.Dockerfile
52 | builder: ${{ steps.buildx.outputs.name }}
53 | load: true
54 | tags: frocket/all-in-one:latest
55 | cache-from: type=local,src=/tmp/.buildx-cache-all-in-one
56 | cache-to: type=local,dest=/tmp/.buildx-cache-all-in-one,mode=max
57 |
58 | - name: Cache Docker layers for local-lambda
59 | uses: actions/cache@v2.1.4
60 | with:
61 | path: /tmp/.buildx-cache-local-lambda
62 | key: ${{ runner.os }}-buildx-local-lambda-${{ github.sha }}
63 | restore-keys: |
64 | ${{ runner.os }}-buildx-local-lambda-
65 |
66 | - name: Docker build local-lambda
67 | id: docker_build_all_local_lambda
68 | uses: docker/build-push-action@v2
69 | with:
70 | context: .
71 | file: ./docker/local-lambda.Dockerfile
72 | builder: ${{ steps.buildx.outputs.name }}
73 | load: true
74 | tags: frocket/local-lambda:latest
75 | cache-from: type=local,src=/tmp/.buildx-cache-local-lambda
76 | cache-to: type=local,dest=/tmp/.buildx-cache-local-lambda,mode=max
77 |
78 | - name: Launch docker-compose
79 | run: |
80 | docker-compose up -d
81 | sleep 2
82 |
83 | - name: Test with pytest
84 | run: |
85 | source venv/bin/activate
86 | export SKIP_SLOW_TESTS=true
87 | pytest --cov=frocket --cov-report=html
88 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | **/.DS_Store
2 | **/.ipynb_checkpoints/
3 | **/__pycache__
4 | **/*.pyc
5 | **/*.zip
6 | **/*.so
7 | **/*.parquet
8 | *.egg-info
9 | .eggs
10 | venv
11 | .idea
12 | *.iml
13 | .awsenv
14 | scratch
15 | map
16 | reduce
17 | build
18 | dist
19 | # Coverage report
20 | htmlcov
21 | .coverage
22 | .vscode
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.8.0
2 |
--------------------------------------------------------------------------------
/build-lambda.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 | RED='\033[0;31m'
4 | GREEN='\033[0;32m'
5 | YELLOW='\033[1;33m'
6 | NC='\033[0m' # No Color
7 | GITHASH=`git rev-parse HEAD | cut -c1-8``[[ -z $(git status -s) ]] || echo dirty`
8 | [[ $1 == '--layer' ]] && LAYER=true || LAYER=false
9 |
10 | echo "${YELLOW}==> Building layer: ${LAYER}${NC}"
11 | echo "${YELLOW}==> Git commit hash: ${GITHASH}${NC}"
12 | echo "${YELLOW}==> Running docker build to install packages in Lambda-like image...${NC}"
13 | docker build -f docker/local-lambda.Dockerfile . -t frocket/local-lambda:latest
14 | docker run -d --name lambda-builder frocket/local-lambda:latest
15 |
16 | BUILD_DIR=$(mktemp -d -t build-lambda)
17 | echo "${YELLOW}==> Copying files from container to build directory: ${BUILD_DIR}...${NC}"
18 | mkdir -p $BUILD_DIR/function
19 | docker cp lambda-builder:/var/task/frocket $BUILD_DIR/function/frocket
20 | if [ "$LAYER" = true ]; then
21 | mkdir -p $BUILD_DIR/layer
22 | docker cp lambda-builder:/opt/python $BUILD_DIR/layer/python
23 | fi
24 |
25 | echo "${YELLOW}==> Stopping & removing container...${NC}"
26 | docker stop lambda-builder
27 | docker rm lambda-builder
28 |
29 | pushd $BUILD_DIR
30 | echo "${YELLOW}==> Cleaning-up a bit and zipping...${NC}"
31 | FUNCTION_ZIPFILE=lambda-function-${GITHASH}.zip
32 | [ "$LAYER" = true ] && LAYER_ZIPFILE=lambda-layer-${GITHASH}.zip || LAYER_ZIPFILE=
33 |
34 | if [ "$LAYER" = true ]; then
35 | find ./layer/python -type d -name tests | xargs rm -rf
36 | find ./layer/python -type d -name include | xargs rm -rf
37 | (cd layer && zip -qr ../$LAYER_ZIPFILE ./python)
38 | echo "${YELLOW}NOTE: Lambda size limit is 50mb compressed/250mb uncompressed for the function PLUS any layers it uses (unless using containers)${NC}"
39 | echo "${YELLOW}Lambda layer size, uncompressed:${NC}"
40 | du -sh ./layer
41 | echo "${YELLOW}Lambda layer size, zipped:${NC}"
42 | du -h $LAYER_ZIPFILE
43 | fi
44 |
45 | (cd function && zip -qr ../$FUNCTION_ZIPFILE ./frocket)
46 | echo "${YELLOW}Lambda function, zipped:${NC}"
47 | du -h $FUNCTION_ZIPFILE
48 |
49 | popd
50 | # Don't fail if previous files don't exist
51 | rm lambda-function-*.zip || true
52 | cp $BUILD_DIR/$FUNCTION_ZIPFILE .
53 | if [ "$LAYER" = true ]; then
54 | rm lambda-layer-*.zip || true
55 | cp $BUILD_DIR/$LAYER_ZIPFILE ./
56 | fi
57 | rm -rf $BUILD_DIR
58 | echo "${YELLOW}DONE! copied to current dir:${NC}\n${FUNCTION_ZIPFILE} ${LAYER_ZIPFILE}"
59 |
--------------------------------------------------------------------------------
/data/.gitignore:
--------------------------------------------------------------------------------
1 | *
--------------------------------------------------------------------------------
/dataprep_example/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DynamicYieldProjects/funnel-rocket/70963fddc0881cebdc6da1af2654d412f95d660c/dataprep_example/__init__.py
--------------------------------------------------------------------------------
/dataprep_example/ingest_retailrocket_dataset.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import argparse
4 | from pathlib import Path
5 | from contextlib import contextmanager
6 | import pandas as pd
7 | from pandas import DataFrame
8 |
9 | EVENTS_FILE = 'events.csv'
10 | PROPS_FILE_1 = 'item_properties_part1.csv'
11 | PROPS_FILE_2 = 'item_properties_part2.csv'
12 | INPUT_FILENAMES = {EVENTS_FILE, PROPS_FILE_1, PROPS_FILE_2}
13 | ITEM_PROPERTY_COLUMNS = {'categoryid', 'available', '790', '888'}
14 | EXPECTED_EVENT_COUNT = 2_500_516
15 |
16 |
17 | def progress_msg(msg: str):
18 | print(f"\033[33m{msg}\033[0m") # Yellow, just yellow
19 |
20 |
21 | @contextmanager
22 | def timed(caption: str):
23 | start = time.time()
24 | yield
25 | total = time.time() - start
26 | print(f"Time to {caption}: {total:.3f} seconds")
27 |
28 |
29 | # Read an item properties file, filter for relevant columns and 'pivot' its structure from rows to columns
30 | def read_item_props(filepath: Path) -> DataFrame:
31 | df = pd.read_csv(filepath)
32 | df = df[df['property'].isin(ITEM_PROPERTY_COLUMNS)]
33 | first_value_per_item = df.groupby(["itemid", "property"])["value"].first()
34 | df = first_value_per_item.to_frame()
35 | df = df.unstack(level=-1)
36 | df.columns = df.columns.droplevel(0)
37 | return df
38 |
39 |
40 | def ingest(path: Path):
41 | with timed("read & transform item properties of all products"):
42 | item_props_tempfile = path / "item_props.parquet"
43 | if item_props_tempfile.exists():
44 | progress_msg(f"Reading item properties from cached file {item_props_tempfile}")
45 | item_props_df = pd.read_parquet(item_props_tempfile)
46 | else:
47 | progress_msg("Reading item properties... (this takes a bit)")
48 | item_props_df1 = read_item_props(path / PROPS_FILE_1)
49 | item_props_df2 = read_item_props(path / PROPS_FILE_2)
50 | item_props_df = item_props_df1.combine_first(item_props_df2)
51 | progress_msg(f"Storing item properties to {item_props_tempfile} for faster re-runs...")
52 | item_props_df.to_parquet(item_props_tempfile)
53 |
54 | with timed("read & transform user events"):
55 | progress_msg("Reading user events...")
56 | events = pd.read_csv(path / EVENTS_FILE)
57 | progress_msg("Joining events with item properties...")
58 | events = pd.merge(events, item_props_df, how='inner', on='itemid')
59 |
60 | progress_msg("Making columns more queryable...")
61 | events['price'] = events['790'].str[1:].astype(float) / 1000
62 | events.drop(columns=['790'], inplace=True)
63 | events['available'] = events['available'].astype(int).astype(bool)
64 | events['categoryid'] = events['categoryid'].astype('category')
65 | events['event'] = events['event'].astype('category')
66 | events.rename(columns={'888': 'cryptic_attrs'}, inplace=True)
67 | progress_msg("Storing 'cryptic_attrs' also as categorical column 'cryptic_attrs_cat'...")
68 | events['cryptic_attrs_cat'] = events['cryptic_attrs'].astype('category')
69 | events.reset_index(drop=True, inplace=True)
70 |
71 | progress_msg("Excerpt from final DataFrame:")
72 | print(events)
73 | progress_msg("Columns types (a.k.a. dtypes):")
74 | print(events.dtypes)
75 | progress_msg("Breakdown of event types:")
76 | print(events['event'].value_counts())
77 |
78 | if len(events) != EXPECTED_EVENT_COUNT:
79 | progress_msg(f"WARNING: Expected {EXPECTED_EVENT_COUNT} events, but final DataFrame has {len(events)}")
80 |
81 | output_file = path / 'retailrocket.parquet'
82 | events.to_parquet(output_file)
83 | col_memory_sizes = (events.memory_usage(deep=True) / 1024 ** 2).round(decimals=2)
84 | progress_msg(f'Size of DataFrame columns in memory (in MB):')
85 | print(col_memory_sizes)
86 | progress_msg(f"==> Saved output file to: {output_file}, size: {output_file.stat().st_size / 1024 ** 2:.1f}MB")
87 |
88 | with timed("load file - all columns"):
89 | pd.read_parquet(output_file)
90 |
91 | with timed("load file - just the 'cryptic_attrs' column"):
92 | pd.read_parquet(output_file, columns=['cryptic_attrs'])
93 |
94 | with timed("load file - just the 'cryptic_attrs_cat' column"):
95 | pd.read_parquet(output_file, columns=['cryptic_attrs_cat'])
96 |
97 | with timed("load file - all columns *except* these two"):
98 | cols = [col for col in events.dtypes.index
99 | if col not in ['cryptic_attrs', 'cryptic_attrs_cat']]
100 | pd.read_parquet(output_file, columns=cols)
101 |
102 |
103 | if __name__ == '__main__':
104 | parser = argparse.ArgumentParser(
105 | description='Ingest RetailRocket dataset (to download: https://www.kaggle.com/retailrocket/ecommerce-dataset/)')
106 | parser.add_argument(
107 | 'path', type=str,
108 | help='Directory where downloaded dataset files are found and output file will be written')
109 | args = parser.parse_args()
110 |
111 | path = Path(args.path)
112 | if not path.exists() or not path.is_dir():
113 | sys.exit(f'No such directory: {path}')
114 | files_in_path = {f.name for f in path.iterdir()}
115 | if not files_in_path >= INPUT_FILENAMES:
116 | sys.exit(f'Missing one or more input files: {INPUT_FILENAMES}')
117 | ingest(path)
118 |
--------------------------------------------------------------------------------
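A minimal sketch (not part of the repository) of the row-to-column pivot that read_item_props() performs above, shown on a tiny made-up item-properties table:

    import pandas as pd

    # Toy stand-in for item_properties_part*.csv: one row per (itemid, property) pair
    props = pd.DataFrame({
        'itemid':   [1, 1, 2, 2],
        'property': ['categoryid', 'available', 'categoryid', '790'],
        'value':    ['55', '1', '77', 'n1200.000'],
    })

    # Keep the first value per (itemid, property), then unstack properties into columns
    first_value_per_item = props.groupby(['itemid', 'property'])['value'].first()
    pivoted = first_value_per_item.to_frame().unstack(level=-1)
    pivoted.columns = pivoted.columns.droplevel(0)
    print(pivoted)  # one row per itemid; columns: '790', 'available', 'categoryid'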
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.8'
2 | services:
3 | redis:
4 | image: redis:6
5 | ports:
6 | - ${FROCKET_REDIS_PORT:-6379}:${FROCKET_REDIS_PORT:-6379}
7 | entrypoint: [ "redis-server", "--port", "${FROCKET_REDIS_PORT:-6379}" ]
8 |
9 | mock-s3:
10 | image: minio/minio:latest
11 | container_name: mock-s3
12 | ports:
13 | - 9000:9000
14 | environment:
15 | - MINIO_ROOT_USER=testonly
16 | - MINIO_ROOT_PASSWORD=testonly
17 | command: server /data
18 |
19 | frocket-queue-worker:
20 | build:
21 | dockerfile: docker/all-in-one.Dockerfile
22 | context: .
23 | image: frocket/all-in-one:latest
24 | volumes:
25 | - ./data:/app/data:ro,cached
26 | environment:
27 | - FROCKET_REDIS_HOST=redis
28 | - FROCKET_REDIS_PORT=${FROCKET_REDIS_PORT:-6379}
29 | - FROCKET_S3_AWS_ENDPOINT_URL=http://mock-s3:9000
30 | - FROCKET_S3_AWS_ACCESS_KEY_ID=testonly
31 | - FROCKET_S3_AWS_SECRET_ACCESS_KEY=testonly
32 | depends_on:
33 | - redis
34 | - mock-s3
35 | command: worker
36 |
37 | frocket-lambda-worker:
38 | build:
39 | dockerfile: docker/local-lambda.Dockerfile
40 | context: .
41 | image: frocket/local-lambda:latest
42 | container_name: mock-lambda
43 | volumes:
44 | - ./data:/data:ro,cached
45 | environment:
46 | - FROCKET_REDIS_HOST=redis
47 | - FROCKET_REDIS_PORT=${FROCKET_REDIS_PORT:-6379}
48 | - FROCKET_S3_AWS_ENDPOINT_URL=http://mock-s3:9000
49 | - FROCKET_S3_AWS_ACCESS_KEY_ID=testonly
50 | - FROCKET_S3_AWS_SECRET_ACCESS_KEY=testonly
51 | - AWS_REGION=us-east-1
52 | depends_on:
53 | - redis
54 | - mock-s3
55 | ports:
56 | - 9001:9001
57 | command: frocket.worker.impl.aws_lambda_worker.lambda_handler
58 |
59 | frocket-apiserver:
60 | image: frocket/all-in-one:latest
61 | container_name: frocket-apiserver
62 | ports:
63 | - 5000:5000
64 | volumes:
65 | - ./data:/app/data:ro,cached
66 | environment:
67 | - APISERVER_NUM_WORKERS=2
68 | - FROCKET_REDIS_HOST=redis
69 | - FROCKET_REDIS_PORT=${FROCKET_REDIS_PORT:-6379}
70 | - FROCKET_S3_AWS_ENDPOINT_URL=http://mock-s3:9000
71 | - FROCKET_S3_AWS_ACCESS_KEY_ID=testonly
72 | - FROCKET_S3_AWS_SECRET_ACCESS_KEY=testonly
73 | - FROCKET_LAMBDA_AWS_NO_SIGNATURE=true
74 | - FROCKET_LAMBDA_AWS_ENDPOINT_URL=http://mock-lambda:9001
75 | - FROCKET_LAMBDA_AWS_REGION=us-east-1
76 | - FROCKET_INVOKER_LAMBDA_LEGACY_ASYNC=false
77 | - FROCKET_INVOKER_RETRY_FAILED_INTERVAL=0.05
78 | # - FROCKET_INVOKER=aws_lambda
79 | depends_on:
80 | - redis
81 | command: apiserver
82 |
--------------------------------------------------------------------------------
/docker/all-in-one.Dockerfile:
--------------------------------------------------------------------------------
1 | # Base Python image with up-to-date OS packages & pip
2 | FROM python:3.8-slim as base
3 | RUN apt-get update && apt-get clean && \
4 | python -m pip install --upgrade pip
5 |
6 | # Builder image: install packages and then cleanup some un-needed large files and directories
7 | FROM base as package-install
8 | WORKDIR /app
9 | COPY ./requirements.txt .
10 | RUN pip install --no-cache-dir --no-compile -r requirements.txt -t ./packages
11 | # Delete un-needed big files in pyarrow, tests & include dirs,
12 | # and all directories in botocore/data except for services actually used by frocket
13 | RUN rm ./packages/pyarrow/*flight*.so* \
14 | ./packages/pyarrow/*plasma*.so* \
15 | ./packages/pyarrow/plasma-store-server && \
16 | find ./packages -type d -name tests | xargs rm -rf && \
17 | find ./packages -type d -name include | xargs rm -rf && \
18 | find ./packages/botocore/data -type d -mindepth 1 -maxdepth 1 | grep -vE 's3|lambda' | xargs rm -rf
19 |
20 | # This image is based on 'base' again, so it doesn't carry over intermediate fat layers from package-install image.
21 | # It copies over only the pruned packages to the final image.
22 | FROM base
23 | WORKDIR /app
24 | COPY ./docker/entrypoint.sh .
25 | RUN chmod +x ./entrypoint.sh
26 | RUN useradd -ms /bin/bash frocket
27 | COPY --from=package-install /app/packages packages
28 | # The most frequently-changing file set - the source code itself, is copied last so previous layers are unaffected
29 | COPY ./requirements.txt .
30 | COPY ./test-requirements.txt .
31 | COPY ./setup.py .
32 | COPY ./frocket frocket
33 | COPY ./tests tests
34 | RUN pip install --no-cache-dir --no-compile --no-deps . -t ./packages
35 | USER frocket
36 | ENV PYTHONPATH=/app/packages
37 | ENTRYPOINT ["./entrypoint.sh"]
38 |
--------------------------------------------------------------------------------
/docker/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | case "$1" in
3 | worker)
4 | echo "Starting Funnel Rocket queue-based worker"
5 | python -m frocket.worker.impl.queue_worker
6 | ;;
7 | apiserver)
8 | PORT=${APISERVER_PORT:-5000}
9 | NUM_WORKERS=${APISERVER_NUM_WORKERS:-8}
10 | echo "Starting Funnel Rocket API server with $NUM_WORKERS workers on port $PORT"
11 | python -m gunicorn.app.wsgiapp frocket.apiserver:app --bind=0.0.0.0:$PORT --workers=$NUM_WORKERS
12 | ;;
13 | *)
14 | echo "Invalid command supplied"
15 | exit 1
16 | ;;
17 | esac
18 |
--------------------------------------------------------------------------------
/docker/local-lambda.Dockerfile:
--------------------------------------------------------------------------------
1 | ARG PYTHON_VERSION=3.8
2 | # Note: not using multi-stage build here, in contrary to all-in-one image.
3 | # This has the pro of very fast incremental builds locally, and the con of large image size - ok for tests.
4 | # Since we're switching to root during build,
5 | # need to return to default Lambda user afterwards (as defined in base image)
6 | ARG RUN_USER=sbx_user1051
7 | FROM lambci/lambda:python3.8
8 | # Lambda function code should be in /var/task
9 | WORKDIR /var/task
10 | COPY ./setup.py .
11 | COPY ./requirements.txt .
12 | # Lambda layer(s) (useful for holding all big & infrequently changing dependencies)
13 | # should be located under /opt, which is only writable by root.
14 | # Don't install boto3/botocore, which is vendored by AWS in its most appropriate version
15 | USER root
16 | RUN grep -v boto requirements.txt > lambda_requirements.txt
17 | RUN mkdir /opt/python && pip install --no-compile --no-cache-dir -r lambda_requirements.txt -t /opt/python
18 | # Clean-up some big files
19 | RUN rm /opt/python/pyarrow/*flight*.so* \
20 | /opt/python/pyarrow/*plasma*.so* \
21 | /opt/python/pyarrow/plasma-store-server \
22 | setup.py requirements.txt lambda_requirements.txt
23 | # Go back to user & workdir of base image
24 | USER ${RUN_USER}
25 | # Copy package source code, which is frequently changing, only at end of Dockerfile
26 | COPY ./frocket /var/task/frocket
27 | WORKDIR /var/task
28 | # These values are for running tests, not production usage
29 | ENV DOCKER_LAMBDA_STAY_OPEN=1 \
30 | AWS_LAMBDA_FUNCTION_NAME=frocket \
31 | AWS_LAMBDA_FUNCTION_TIMEOUT=15 \
32 | AWS_LAMBDA_FUNCTION_MEMORY_SIZE=256
33 |
--------------------------------------------------------------------------------
/docs/logo-blue.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
66 |
--------------------------------------------------------------------------------
/docs/logo-icon-dark-blue.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
32 |
--------------------------------------------------------------------------------
/docs/logo-icon-light-blue.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
32 |
--------------------------------------------------------------------------------
/docs/logo-small-blue.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
41 |
--------------------------------------------------------------------------------
/frocket/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/frocket/cli.py:
--------------------------------------------------------------------------------
1 | """
2 | Simple CLI for Funnel Rocket.
3 |
4 | This is currently a direct wrapper over invoker_api (meaning the CLI process itself acts as the invoker), rather than
5 | a client of a running API server. As a result it does not depend on a server, but it needs the same permissions (listing
6 | files in remote storage, access to Redis as the datastore, and optionally the ability to invoke Lambdas).
7 |
8 | This makes the CLI more suitable for onboarding and evaluation, but in production it's preferable to use the API
9 | (for a better permissions model and centralized monitoring/logging, if nothing else).
10 |
11 | The CLI does provide a few optional flags which make it also suitable for automating jobs:
12 | * --nopretty returns JSON object/s without any captions
13 | * --notrim and --nocolor prevent data from being shortened or surrounded by ANSI color codes
14 | * The log level is controllable, and all log lines have a prefix making them easy to ignore.
15 | """
16 | # Copyright 2021 The Funnel Rocket Maintainers
17 | #
18 | # Licensed under the Apache License, Version 2.0 (the "License");
19 | # you may not use this file except in compliance with the License.
20 | # You may obtain a copy of the License at
21 | #
22 | # http://www.apache.org/licenses/LICENSE-2.0
23 | #
24 | # Unless required by applicable law or agreed to in writing, software
25 | # distributed under the License is distributed on an "AS IS" BASIS,
26 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
27 | # See the License for the specific language governing permissions and
28 | # limitations under the License.
29 |
30 | import argparse
31 | # TODO backlog don't import any frocket modules but a carefully selected set which does not then import heavy packages
32 | # or initialize mechanisms. This is only partially done now (see import at end of file).
33 | from frocket.common.config import config
34 | from frocket.common.tasks.registration import DatasetValidationMode, REGISTER_DEFAULT_VALIDATION_MODE, \
35 | REGISTER_DEFAULT_FILENAME_PATTERN, REGISTER_DEFAULT_VALIDATE_UNIQUES
36 |
37 | REGISTER_VALIDATION_MODE_CHOICES = [e.value.lower() for e in DatasetValidationMode]
38 | LOG_LEVEL_CHOICES = ['debug', 'info', 'warning', 'error', 'critical']
39 | LOG_LINE_PREFIX = '[Log '
40 | LOG_FORMAT = LOG_LINE_PREFIX + '%(levelname)s %(name)s] %(message)s'
41 |
42 |
43 | def build_parser() -> argparse.ArgumentParser:
44 | parser = argparse.ArgumentParser(description='Simple CLI for Funnel Rocket',
45 | formatter_class=argparse.ArgumentDefaultsHelpFormatter)
46 | parser.add_argument('--notrim', action='store_true', help='Don\'t trim any text')
47 | parser.add_argument('--nocolor', action='store_true', help='Don\'t colorize any text')
48 | parser.add_argument('--nopretty', action='store_true', help='Don\'t pretty-print the response')
49 | parser.add_argument('--loglevel', type=str.lower, choices=LOG_LEVEL_CHOICES,
50 | help=f'Set log level {LOG_LEVEL_CHOICES}')
51 | subparsers = parser.add_subparsers(dest='command', title='commands')
52 | subparsers.required = True
53 |
54 | register_parser = subparsers.add_parser('register', help='Register a dataset',
55 | formatter_class=argparse.ArgumentDefaultsHelpFormatter)
56 | register_parser.add_argument('name', type=str, help='Dataset name')
57 | register_parser.add_argument('basepath', type=str,
58 | help='The path all files are directly under. Local and s3://... paths supported.')
59 | register_parser.add_argument('group_id_column', type=str,
60 | help='The column to group rows by, e.g. "userId", "userHash". '
61 | 'This column is required and no values can be missing. Each part (file) in the '
62 | 'dataset should have a distinct set of values for this column.')
63 | register_parser.add_argument(
64 | 'timestamp_column', type=str,
65 | help='The column holding the timestamp of each row, e.g. "timestamp", "ts". '
66 | 'Must be a numeric column with no values missing. Using a unix timestamp is advised - '
67 | 'with or without sub-second resoluton based on your needs, either as int or float.')
68 | register_parser.add_argument('--pattern', type=str, default=REGISTER_DEFAULT_FILENAME_PATTERN,
69 | help='Filename pattern. Sub-directories are currently not supported.')
70 | register_parser.add_argument('--validation', type=str.lower,
71 | choices=REGISTER_VALIDATION_MODE_CHOICES,
72 | default=REGISTER_DEFAULT_VALIDATION_MODE.value.lower(),
73 | help=f"Validation mode to use {REGISTER_VALIDATION_MODE_CHOICES}",
74 | metavar='MODE')
75 | register_parser.add_argument('--skip-uniques', action='store_true',
76 | default=not REGISTER_DEFAULT_VALIDATE_UNIQUES,
77 | help='Skip validation of group_id_column values uniqueness across files '
78 | '(the set of files to test is determined by --validation argument)')
79 |
80 | list_parser = subparsers.add_parser('list', help='List datasets')
81 |
82 | run_query_parser = subparsers.add_parser('run', help='Run query')
83 | run_query_parser.add_argument('dataset')
84 | query_sources_group = run_query_parser.add_mutually_exclusive_group(required=True)
85 | query_sources_group.add_argument('--file', '-f', type=str, help='Run query stored in file', dest='filename')
86 | query_sources_group.add_argument('--empty', '-e', action='store_true',
87 | help='Run an empty query with no conditions')
88 | query_sources_group.add_argument('--string', '-s', type=str,
89 | help='Run the following query string', dest='query_string')
90 |
91 | info_parser = subparsers.add_parser('info', help='Show dataset information')
92 | info_parser.add_argument('dataset', type=str)
93 | info_parser.add_argument('--full', action='store_true', help='Show full schema')
94 |
95 | unreg_parser = subparsers.add_parser('unregister', help='Unregister a dataset')
96 | unreg_parser.add_argument('dataset', type=str)
97 | unreg_parser.add_argument('--force', action='store_true',
98 | help='Unregister a dataset even if it\'s currently in use')
99 |
100 | config_parser = subparsers.add_parser('config', help='Show configuration')
101 | return parser
102 |
103 |
104 | def run_from_args(args: argparse.Namespace):
105 | config['log.format'] = LOG_FORMAT if args.nocolor else f"\033[33m{LOG_FORMAT}\033[0m"
106 | if args.loglevel:
107 | config['log.level'] = args.loglevel
108 | config.init_logging(force_console_output=True)
109 |
110 | # invoker_api isn't loaded (or logging implicitly initialized) till arguments are validated and log level is set
111 | from frocket.cli_commands import run_command
112 | run_command(args.command, args)
113 |
114 |
115 | if __name__ == '__main__':
116 | parser = build_parser()
117 | args = parser.parse_args()
118 | run_from_args(args)
119 |
--------------------------------------------------------------------------------
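A small sketch (not part of the repository) of driving the parser defined in cli.py programmatically; the dataset name and path below are made up, and actually running this requires the same access as the CLI itself (Redis datastore, storage, and optionally Lambda invocation):

    from frocket.cli import build_parser, run_from_args

    parser = build_parser()

    # Register a (hypothetical) dataset: name, base path, group-id column, timestamp column
    args = parser.parse_args(['--nopretty', 'register', 'mydataset',
                              's3://some-bucket/datasets/mydataset', 'userId', 'timestamp'])
    run_from_args(args)

    # Run an empty query (no conditions) over the same dataset
    args = parser.parse_args(['run', 'mydataset', '--empty'])
    run_from_args(args)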
/frocket/cli_commands.py:
--------------------------------------------------------------------------------
1 | """
2 | Implementation of CLI commands.
3 | """
4 | # Copyright 2021 The Funnel Rocket Maintainers
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | import argparse
19 | import json
20 | import sys
21 | from json.decoder import JSONDecodeError
22 | from pathlib import Path
23 | from typing import Any
24 | from tabulate import tabulate
25 | from frocket.common.config import config
26 | from frocket.common.serializable import SerializableDataClass
27 | from frocket.common.tasks.base import BaseApiResult
28 | from frocket.common.tasks.registration import DatasetValidationMode, RegisterArgs
29 | from frocket.invoker import invoker_api
30 |
31 | DATE_FORMAT = '%Y-%m-%d %H:%M:%S %Z'
32 |
33 |
34 | def run_command(cmd: str, args: argparse.Namespace):
35 | mapping = {
36 | 'register': register_dataset_cmd,
37 | 'unregister': unregister_dataset_cmd,
38 | 'list': list_datasets_cmd,
39 | 'run': run_query_cmd,
40 | 'info': dataset_info_cmd,
41 | 'config': show_config_cmd
42 | }
43 | mapping[cmd](args)
44 |
45 |
46 | def fail_missing_dataset(name: str):
47 | sys.exit(f"Dataset '{name}' not found!")
48 |
49 |
50 | def trim_column(s: str, args: argparse.Namespace, maxwidth: int) -> str:
51 | if args.notrim or args.nopretty or len(s) <= maxwidth:
52 | return s
53 | else:
54 | return s[:maxwidth - 3] + '...'
55 |
56 |
57 | def print_json(name: str, o: Any, pretty_print: bool):
58 | def to_json(o: Any, indent: int = None) -> str:
59 | return o.to_json(indent=indent) if isinstance(o, SerializableDataClass) else json.dumps(o, indent=indent)
60 |
61 | if pretty_print:
62 | print(name + ':', to_json(o, indent=2))
63 | else:
64 | print(to_json(o))
65 |
66 |
67 | def handle_api_result(res: BaseApiResult, pretty_print: bool):
68 | print_json('API Result', res, pretty_print)
69 | if not res.success:
70 | sys.exit('FAILED' if pretty_print else 1)
71 |
72 |
73 | def register_dataset_cmd(args):
74 | validation_mode = DatasetValidationMode[args.validation.upper()]
75 | register_args = RegisterArgs(name=args.name,
76 | basepath=args.basepath,
77 | group_id_column=args.group_id_column,
78 | timestamp_column=args.timestamp_column,
79 | pattern=args.pattern,
80 | validation_mode=validation_mode,
81 | validate_uniques=not args.skip_uniques)
82 | res = invoker_api.register_dataset(register_args)
83 | handle_api_result(res, pretty_print=not args.nopretty)
84 |
85 |
86 | def unregister_dataset_cmd(args):
87 | res = invoker_api.unregister_dataset(args.dataset, force=args.force)
88 | handle_api_result(res, pretty_print=not args.nopretty)
89 |
90 |
91 | def list_datasets_cmd(args):
92 | datasets = sorted(invoker_api.list_datasets(), key=lambda ds: ds.id.registered_at, reverse=True)
93 | display_datasets = [{'name': trim_column(ds.id.name, args, maxwidth=30),
94 | 'registered at': ds.id.registered_at.strftime(DATE_FORMAT),
95 | 'parts': ds.total_parts,
96 | 'group id': ds.group_id_column,
97 | 'timestamp': ds.timestamp_column,
98 | 'path': trim_column(ds.basepath, args, maxwidth=50)}
99 | for ds in datasets]
100 | if args.nopretty:
101 | print(json.dumps(display_datasets))
102 | else:
103 | if len(datasets) == 0:
104 | print('No datasets registered yet')
105 | else:
106 | print(tabulate(display_datasets, headers='keys'))
107 |
108 |
109 | def json_parse(s: str) -> dict:
110 | try:
111 | return json.loads(s)
112 | except JSONDecodeError as e:
113 | sys.exit(f'JSON Error: {e}')
114 |
115 |
116 | def run_query_cmd(args):
117 | ds_info = invoker_api.get_dataset(args.dataset)
118 | if not ds_info:
119 | fail_missing_dataset(args.dataset)
120 | query = None
121 | if args.empty:
122 | query = {}
123 | elif args.query_string:
124 | query = json_parse(args.query_string)
125 | elif args.filename:
126 | filepath = Path(args.filename)
127 | if not filepath.exists():
128 | sys.exit(f'File not found: {args.filename}')
129 | else:
130 | query_str = filepath.read_text(encoding='utf-8')
131 | query = json_parse(query_str)
132 | else:
133 | sys.exit('Unknown mode')
134 |
135 | try:
136 | res = invoker_api.run_query(ds_info, query)
137 | handle_api_result(res, pretty_print=not args.nopretty)
138 | except Exception as e:
139 | sys.exit(f'Error: {e}')
140 |
141 |
142 | def dataset_info_cmd(args):
143 | show_full = args.full
144 | ds_info = invoker_api.get_dataset(args.dataset)
145 | if not ds_info:
146 | fail_missing_dataset(args.dataset)
147 | parts_info = invoker_api.get_dataset_parts(ds_info)
148 | schema_info = invoker_api.get_dataset_schema(ds_info, full=show_full)
149 | print_json('Basic information', ds_info, pretty_print=not args.nopretty)
150 | print_json('Parts', parts_info, pretty_print=not args.nopretty)
151 | print_json(f'Schema (full: {show_full})', schema_info, pretty_print=not args.nopretty)
152 |
153 |
154 | def show_config_cmd(args):
155 | print_json(f'Configuration', config, pretty_print=not args.nopretty)
156 |
--------------------------------------------------------------------------------
/frocket/common/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/frocket/common/dataset.py:
--------------------------------------------------------------------------------
1 | """Base classes for registered datasets and their metadata."""
2 | # Copyright 2021 The Funnel Rocket Maintainers
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import logging
17 | from enum import auto
18 | from datetime import datetime, timezone
19 | from typing import Optional, List, Dict
20 | from dataclasses import dataclass, field
21 | from frocket.common.serializable import SerializableDataClass, AutoNamedEnum
22 |
23 | logger = logging.getLogger(__name__)
24 |
25 |
26 | class PartNamingMethod(AutoNamedEnum):
27 | """
28 | For future use: currently the full list of dataset filenames is stored as metadata; however, if a consistent
29 | numbering pattern can be identified, it may be useful both for more compact metadata and for a more predictable
30 | part index -> filename mapping.
31 | """
32 | RUNNING_NUMBER = auto()
33 | LIST = auto()
34 |
35 |
36 | @dataclass(frozen=True)
37 | class DatasetId(SerializableDataClass):
38 | """
39 | The main reason why this class exists: datasets can be re-registered multiple times with the same name, but any
40 | caching behavior should be sensitive to the registered date and become invalid on re-registration.
41 | In concrete terms, caching should be based on DatasetId keys (which are immutable) rather than a dataset name.
42 |
43 | Re-registering a dataset is useful, in cases such as:
44 | 1. When you don't need to manage revisions yourself (via specifying a new dataset name and un-registering old ones).
45 | 2. As an alias to the current version (datasets are only metadata, you can register the same physical files N times)
46 | 3. If the datafiles were found to be incomplete/invalid, and after fixing the issue you want to invalidate caching.
47 | """
48 | name: str
49 | registered_at: datetime
50 |
51 | @classmethod
52 | def now(cls, name: str):
53 | return DatasetId(name, registered_at=datetime.now(tz=timezone.utc))
54 |
55 |
56 | @dataclass(frozen=True)
57 | class DatasetPartId(SerializableDataClass):
58 | """Specifies a single part (file) in a dataset version (see documetation for DatasetId above!)."""
59 | dataset_id: DatasetId
60 | path: str
61 | part_idx: int
62 |
63 |
64 | @dataclass(frozen=True)
65 | class DatasetInfo(SerializableDataClass):
66 | """
67 | Basic metadata for a dataset.
68 |
69 | This class should be kept pretty small, as it's passed along in task requests.
70 | More detailed metadata is found in the data schema object, which is stored separately and read when needed
71 | (and also exists in both short and full versions)
72 | """
73 | basepath: str
74 | total_parts: int
75 | id: DatasetId
76 | group_id_column: str # The column by which the dataset is partitioned, and grouping is done.
77 | timestamp_column: str # The column by which timeframe conditions and funnels are run.
78 |
79 |
80 | @dataclass(frozen=True)
81 | class DatasetPartsInfo(SerializableDataClass):
82 | """Holds the list of files in the dataset. Separate from DatasetInfo only due to size (this data is usually not
83 | needed to be sent in network calls)."""
84 | naming_method: PartNamingMethod
85 | total_parts: int
86 | total_size: int
87 | running_number_pattern: Optional[str] = field(default=None)
88 | filenames: Optional[List[str]] = field(default=None)
89 |
90 | def fullpaths(self, parent: DatasetInfo) -> List[str]:
91 | parentpath = parent.basepath if parent.basepath.endswith('/') else parent.basepath + '/'
92 |
93 | if self.naming_method == PartNamingMethod.LIST:
94 | assert (self.filenames and len(self.filenames) == parent.total_parts)
95 | return [parentpath + filename for filename in self.filenames]
96 | else:
97 | assert self.running_number_pattern
98 | return [parentpath + self.running_number_pattern.format(idx)
99 | for idx in range(parent.total_parts)]
100 |
101 |
102 | class DatasetColumnType(AutoNamedEnum):
103 | INT = auto()
104 | FLOAT = auto()
105 | BOOL = auto()
106 | # Categorical columns are not a separate type to the query engine. That designation exists and is used separately.
107 | STRING = auto()
108 |
109 |
110 | @dataclass(frozen=True)
111 | class DatasetColumnAttributes(SerializableDataClass):
112 | """
113 | The 'full' information on each column. TODO backlog use polymorphism? (needs support in de-serialization)
114 |
115 | For columns which were either saved by Pandas as categoricals, or are identified during registration to be such,
116 | store a mapping of top N values (configurable) to their normalized share in the dataset. Since registration
117 | does not read all files but only a sample, that ratio cannot be an absolute number or the exact ratio - but still
118 | useful for clients.
119 |
120 | cat_unique_ratio is the ratio of unique value count to all values (or: series.nunique()/len(series)), and may be
121 | a useful rough indicator of how much RAM is saved (and str.match() operations sped-up!) by the categorical
122 | representation. Columns are determined to be loaded as categorical if this value is lower than configured.
123 | Loading of columns as categoricals is also usually much faster, but that greatly depends on whether a dictionary
124 | was saved for that column in the Parquet file or not - so it depends on the tool used to create these files.
125 | """
126 | numeric_min: Optional[float] = None
127 | numeric_max: Optional[float] = None
128 | categorical: bool = False
129 | cat_top_values: Optional[Dict[str, float]] = None
130 | cat_unique_ratio: Optional[float] = None
131 |
132 |
133 | @dataclass(frozen=True)
134 | class DatasetColumn(SerializableDataClass):
135 | name: str
136 | dtype_name: str
137 | coltype: DatasetColumnType
138 | colattrs: DatasetColumnAttributes
139 |
140 |
141 | @dataclass(frozen=True)
142 | class DatasetShortSchema(SerializableDataClass):
143 | """Schema, the short version - typically all you may need."""
144 | columns: Dict[str, DatasetColumnType]
145 | min_timestamp: float
146 | max_timestamp: float
147 | # In files created by Pandas with its metadata intact in the Parquet file, columns marked as categoricals.
148 | source_categoricals: List[str] = field(default=None)
149 | # Columns detected during registration to be good candidates for explicitly loading as categoricals (by PyArrow).
150 | potential_categoricals: List[str] = field(default=None)
151 |
152 |
153 | @dataclass(frozen=True)
154 | class DatasetSchema(SerializableDataClass):
155 | group_id_column: str
156 | timestamp_column: str
157 | columns: Dict[str, DatasetColumn]
158 | # Just the names->dtypes of all columns not (currently) supported.
159 | unsupported_columns: Dict[str, str]
160 |
161 | def short(self) -> DatasetShortSchema:
162 | """Make short from full."""
163 | cols = {name: col.coltype for name, col in self.columns.items()}
164 | source_categoricals = []
165 | potential_categoricals = []
166 | for name, col in self.columns.items():
167 | if col.colattrs.categorical:
168 | if col.dtype_name == 'category':
169 | source_categoricals.append(name)
170 | else:
171 | potential_categoricals.append(name)
172 | ts_attrs = self.columns[self.timestamp_column].colattrs
173 | min_ts = ts_attrs.numeric_min
174 | max_ts = ts_attrs.numeric_max
175 |
176 | return DatasetShortSchema(columns=cols,
177 | source_categoricals=source_categoricals,
178 | potential_categoricals=potential_categoricals,
179 | min_timestamp=min_ts, max_timestamp=max_ts)
180 |
--------------------------------------------------------------------------------
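A short sketch (not part of the repository) of the dataset metadata classes above; names, paths and sizes are illustrative only:

    from frocket.common.dataset import DatasetId, DatasetInfo, DatasetPartsInfo, PartNamingMethod

    ds_id = DatasetId.now(name='mydataset')  # registered_at is set to the current UTC time
    ds_info = DatasetInfo(basepath='s3://some-bucket/mydataset', total_parts=2, id=ds_id,
                          group_id_column='userId', timestamp_column='timestamp')
    parts = DatasetPartsInfo(naming_method=PartNamingMethod.LIST, total_parts=2, total_size=123456,
                             filenames=['part-0000.parquet', 'part-0001.parquet'])

    # Resolve each part name to its full path under the dataset's basepath
    print(parts.fullpaths(parent=ds_info))
    # ['s3://some-bucket/mydataset/part-0000.parquet', 's3://some-bucket/mydataset/part-0001.parquet']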
/frocket/common/helpers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/frocket/common/helpers/pandas.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import Iterable
16 | import pandas as pd
17 | import numpy as np
18 |
19 |
20 | def filter_by_isin(df: pd.DataFrame, column: str, values: Iterable) -> pd.DataFrame:
21 | """
22 | For the given DataFrame, return only rows where df[column] is in the given values.
23 | This is a surprisingly fast alternative to the built-in Pandas/NumPy approach: df[np.isin(df[column], values)]
24 | A value can appear in multiple rows (e.g. the same user ID appearing in multiple rows)
25 |
26 | TODO Merge a [Numba-based isin()](https://stackoverflow.com/questions/53046473/numpy-isin-performance-improvement)
27 | function, compiled AOT for relevant array dtypes. This would be arch-dependent and optional (with fallback)
28 | """
29 | # First, create a "map" series from all possible values in the column => whether they should pass the filter
30 | all_ids = df[column].unique()
31 | is_id_relevant = pd.Series(np.zeros(len(all_ids)), index=all_ids).astype('bool') # Default false
32 | is_id_relevant.loc[values] = True
33 |
34 | # Create a boolean mask for column, based on the mapping above. Grab the raw array.
35 | mask = is_id_relevant[df[column]].values
36 | # Apply mask
37 | return df[mask]
38 |
39 |
40 | def add_column_by_value_map(df: pd.DataFrame, keys_column: str, values_map_series: pd.Series, new_column: str) -> None:
41 | """
42 | Add a new column to the given df. For each row, df[new_column] will be set to an appropriate value from
43 | values_map_series: the value whose index is df[keys_column] in that row.
44 |
45 | e.g. given a DF of user activities having a userId column (with potentially multiple rows per user), and a
46 | values_map_series whose unique index is a User ID, and its values are the age of that user, the function will add
47 | a new column to the given DF with the age of that row's user ID
48 |
49 | If a value in keys_column does not have a matching index in values_map_series, the cell value would be NaN.
50 | This function is optimized for performance.
51 |
52 | The given DF is modified inplace.
53 | """
54 | # Create a new mapping between ALL unique values of IDs of df[keys_column] and their matching value (or NaN)
55 | unique_keys = df[keys_column].unique()
56 | key_to_value = pd.Series(data=np.nan, index=unique_keys)
57 | key_to_value.loc[values_map_series.index] = values_map_series
58 |
59 | # Now we can create the new column, using the mapping
60 | df[new_column] = key_to_value[df[keys_column]].values
61 |
--------------------------------------------------------------------------------
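A tiny sketch (not part of the repository) of the two helpers above, on made-up data:

    import pandas as pd
    from frocket.common.helpers.pandas import filter_by_isin, add_column_by_value_map

    events = pd.DataFrame({'userId': [1, 1, 2, 3], 'event': ['view', 'buy', 'view', 'view']})

    # Keep only rows belonging to users 1 and 3
    subset = filter_by_isin(events, column='userId', values=[1, 3])

    # Map a per-user attribute (indexed by userId) onto every row of that user; missing keys become NaN
    user_age = pd.Series({1: 34, 3: 29})
    add_column_by_value_map(subset, keys_column='userId', values_map_series=user_age, new_column='age')
    print(subset)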
/frocket/common/helpers/storage.py:
--------------------------------------------------------------------------------
1 | """
2 | Simple abstraction of local & remote filesystems.
3 |
4 | Currently supports either a local filesystem (for non-distributed usage, or potentially a fast network share)
5 | or S3 (and S3-compatible object stores such as MinIO, which is used for running tests).
6 | Additional protocols are welcome.
7 |
8 | TODO backlog: support pagination for S3 listing (so more than 1,000 files per dataset)
9 | TODO backlog: support auto-identification of numbering pattern in dataset files, so the full list of filenames
10 | would not have to reside in the datastore
11 | """
12 | # Copyright 2021 The Funnel Rocket Maintainers
13 | #
14 | # Licensed under the Apache License, Version 2.0 (the "License");
15 | # you may not use this file except in compliance with the License.
16 | # You may obtain a copy of the License at
17 | #
18 | # http://www.apache.org/licenses/LICENSE-2.0
19 | #
20 | # Unless required by applicable law or agreed to in writing, software
21 | # distributed under the License is distributed on an "AS IS" BASIS,
22 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
23 | # See the License for the specific language governing permissions and
24 | # limitations under the License.
25 |
26 | import logging
27 | import re
28 | import tempfile
29 | import uuid
30 | from abc import abstractmethod
31 | from enum import Enum, auto
32 | from fnmatch import fnmatch
33 | from pathlib import Path
34 | from typing import NamedTuple, Optional, List
35 | import boto3
36 | from frocket.common.config import config
37 | from frocket.common.dataset import DatasetPartsInfo, PartNamingMethod
38 |
39 | logger = logging.getLogger(__name__)
40 |
41 |
42 | class StorageHandler:
43 | """Simple abstraction of a storage protocol."""
44 | class FileBaseInfo(NamedTuple):
45 | relpath: str
46 | size: int
47 |
48 | def __init__(self, path: str):
49 | assert self.valid(path)
50 | self._path = path
51 |
52 | @classmethod
53 | def valid(cls, path: str) -> bool:
54 | """For validation of a path prior to instantiating the handler - a nicety instead of exceptions later,
55 | to be overridden where appropriate."""
56 | return True
57 |
58 | @property
59 | @abstractmethod
60 | def remote(self) -> bool:
61 | """This affects the caching behavior used by workers (see part_loader.py)."""
62 | pass
63 |
64 | @abstractmethod
65 | def _list_files(self, pattern: str) -> List[FileBaseInfo]:
66 | """Override in subclasses"""
67 | pass
68 |
69 | def discover_files(self, pattern: str) -> DatasetPartsInfo:
70 | files = self._list_files(pattern)
71 | files.sort(key=lambda fi: fi.relpath)
72 | # TODO backlog implement PartNamingMethod.RUNNING_NUMBER for compact metadata in large datasets
73 | parts_info = DatasetPartsInfo(naming_method=PartNamingMethod.LIST,
74 | total_parts=len(files),
75 | total_size=sum([fi.size for fi in files]),
76 | filenames=[fi.relpath for fi in files],
77 | running_number_pattern=None)
78 | return parts_info
79 |
80 | @abstractmethod
81 | def _local_path(self, fullpath: str) -> str:
82 | """
83 | If the filesystem is remote, download and return a local copy.
84 | Files should be cleaned-up by the caller which controls the caching behavior.
85 | """
86 | pass
87 |
88 | def get_local_path(self, fullpath: str) -> str:
89 | if not fullpath.startswith(self._path):
90 | raise Exception(f"Given full path {fullpath} is not under handler's path {self._path}")
91 |
92 | return self._local_path(fullpath)
93 |
94 |
95 | class FileStorageHanler(StorageHandler):
96 | """Super-simple local filesystem handler"""
97 | @property
98 | def remote(self):
99 | return False
100 |
101 | def _list_files(self, pattern):
102 | paths = Path(self._path).iterdir()
103 | files = [StorageHandler.FileBaseInfo(path.name, path.stat().st_size)
104 | for path in paths
105 | if fnmatch(path.name, pattern)]
106 | return files
107 |
108 | def _local_path(self, fullpath):
109 | if not Path(fullpath).is_file():
110 | raise Exception(f"Path is missing/not a file: {fullpath}")
111 | return fullpath
112 |
113 |
114 | class S3StorageHanler(StorageHandler):
115 | """S3 filesystem handler, supports datasets directly under the bucket or within a sub-directory."""
116 | S3_PATH_REGEX = re.compile(r"^s3://([a-zA-Z0-9_\-.]+)/([a-zA-Z0-9_\-./]*)$")
117 |
118 | def __init__(self, path: str):
119 | super().__init__(path)
120 | path_parts = self.S3_PATH_REGEX.match(path)
121 | self._bucket = path_parts.group(1)
122 | self._path_in_bucket = path_parts.group(2)
123 | no_trailing_slash = self._path_in_bucket and self._path_in_bucket[-1:] != '/'
124 | self._path_in_bucket_normalized = self._path_in_bucket + ('/' if no_trailing_slash else '')
125 |
126 | @classmethod
127 | def valid(cls, path):
128 | return True if cls.S3_PATH_REGEX.match(path) else False
129 |
130 | @property
131 | def remote(self):
132 | return True
133 |
134 | def _list_files(self, pattern):
135 | path_in_bucket = self._path_in_bucket_normalized
136 | logger.info(f"Listing files in S3 with bucket {self._bucket} and prefix {path_in_bucket}...")
137 | # TODO backlog support pagination
138 | s3response = self._client().list_objects_v2(Bucket=self._bucket, Prefix=path_in_bucket)
139 |
140 | filename_start_idx = len(path_in_bucket)
141 | path_to_size = {obj['Key'][filename_start_idx:]: obj['Size'] for obj in s3response['Contents']}
142 | files = [StorageHandler.FileBaseInfo(path, size)
143 | for path, size in path_to_size.items()
144 | if fnmatch(path, pattern)]
145 | return files
146 |
147 | def _local_path(self, fullpath):
148 | localpath = str(Path(tempfile.gettempdir()) / str(uuid.uuid4()))
149 | logger.info(f"Downloading {fullpath} to {localpath}...")
150 | self._client().download_file(self._bucket, self._path_in_bucket, localpath)
151 | return localpath
152 |
153 | @classmethod
154 | def _client(cls):
155 | if not hasattr(cls, '_s3client'):
156 | cls._s3client = boto3.client('s3', **config.aws_client_settings(service='s3'))
157 | return cls._s3client
158 |
159 |
160 | class StorageProtocol(Enum):
161 | FILE = auto()
162 | S3 = auto()
163 |
164 | @classmethod
165 | def get(cls, name: str):
166 | return cls.__members__.get(name.upper())
167 |
168 | @classmethod
169 | def names(cls) -> List[str]:
170 | return list(cls.__members__.keys())
171 |
172 |
173 | PATH_WITH_PROTOCOL_RE = r'(\w+)://(.+)$'
174 | PROTOCOL_TO_HANDLER = {
175 | StorageProtocol.FILE: FileStorageHanler,
176 | StorageProtocol.S3: S3StorageHanler
177 | }
178 |
179 |
180 | def storage_handler_for(path: str, throw_if_missing: bool = True) -> Optional[StorageHandler]:
181 | """
182 | Instantiate the appropriate handler for the given path.
183 | Paths without explicit protocol are considered local.
184 | """
185 | path_and_protocol = re.match(PATH_WITH_PROTOCOL_RE, path)
186 | if path_and_protocol:
187 | method_name = path_and_protocol.groups()[0]
188 | method = StorageProtocol.get(method_name)
189 | if not method:
190 | if throw_if_missing:
191 | raise Exception(f"Storage protocol '{method_name}' is not in supported list: {StorageProtocol.names()}")
192 | else:
193 | return None
194 | elif method == StorageProtocol.FILE:
195 | path = path_and_protocol.groups()[1]
196 | else:
197 | method = StorageProtocol.FILE
198 |
199 | handler_cls = PROTOCOL_TO_HANDLER[method]
200 | if not handler_cls.valid(path):
201 | raise Exception(f"Invalid path: {path} (protocol: {method.name})")
202 | return handler_cls(path)
203 |
--------------------------------------------------------------------------------
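A minimal usage sketch of the path-to-handler dispatch above. The module path is assumed to be frocket.common.helpers.storage, the directory and bucket names are hypothetical, and the local path must pass the base class' valid() check (defined earlier in the file); no network call happens at construction time.

from frocket.common.helpers.storage import storage_handler_for

# Paths without an explicit protocol fall back to the local FileStorageHandler
local_handler = storage_handler_for('/tmp/mydataset')   # hypothetical local directory
print(local_handler.remote)   # -> False

# s3:// paths are matched by S3StorageHandler's regex and split into bucket + prefix
s3_handler = storage_handler_for('s3://my-bucket/datasets/retail/')   # hypothetical bucket
print(s3_handler.remote)      # -> True

# Unknown protocols either raise or return None, depending on throw_if_missing
assert storage_handler_for('ftp://host/path', throw_if_missing=False) is None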
/frocket/common/helpers/utils.py:
--------------------------------------------------------------------------------
1 | """For everything but the kitchen sink."""
2 | # Copyright 2021 The Funnel Rocket Maintainers
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import functools
17 | import math
18 | import random
19 | import uuid
20 | import time
21 | from io import BytesIO
22 | from typing import Optional, List
23 | import pandas as pd
24 | import pyarrow.feather as feather
25 | import numpy as np
26 |
27 |
28 | def terminal_red(message: str) -> str:
29 | return f"\033[31m{message}\033[0m"
30 |
31 |
32 | def terminal_green(message: str) -> str:
33 | return f"\033[32m{message}\033[0m"
34 |
35 |
36 | def memoize(obj):
37 | """Standard issue memoization decorator for caching function results (which don't need invalidation)."""
38 | cache = obj._cache = {}
39 |
40 | @functools.wraps(obj)
41 | def memoizer(*args, **kwargs):
42 | key = str(args) + str(kwargs)
43 | if key not in cache:
44 | cache[key] = obj(*args, **kwargs)
45 | return cache[key]
46 |
47 | return memoizer
48 |
49 |
50 | def sample_from_range(range_max: int,
51 | sample_ratio: float,
52 | max_samples: int,
53 | preselected: Optional[List[int]]) -> List[int]:
54 | """
55 |     Given the range of indexes 0..range_max (exclusive), return a random sample of them.
56 |     The sample size is range_max * sample_ratio, capped at max_samples.
57 |     If preselected is passed, these indexes are included first and count towards the sample size.
58 | """
59 | available_indexes = list(range(range_max))
60 | sample_count = min(math.floor(range_max * sample_ratio), max_samples)
61 |
62 | if preselected:
63 | chosen = list(preselected)
64 | for i in preselected:
65 | available_indexes.remove(i)
66 | sample_count = max(sample_count - len(preselected), 0)
67 | else:
68 | chosen = []
69 |
70 | if sample_count > 0:
71 |         chosen += random.sample(available_indexes, k=sample_count)  # Sample without replacement, so no duplicate indexes
72 | return chosen
73 |
74 |
75 | def timestamped_uuid(prefix: str = None) -> str:
76 | return f"{prefix or ''}{math.floor(time.time())}-{str(uuid.uuid4())[:8]}"
77 |
78 |
79 | def ndarray_to_bytes(arr: np.ndarray) -> bytes:
80 | """Use PyArrow's feather format as a compute- and space-efficient format for serializing NumPy arrays."""
81 | df = pd.DataFrame(data={'arr': arr})
82 | buf = BytesIO()
83 | # noinspection PyTypeChecker
84 | feather.write_feather(df, buf)
85 | buf.seek(0)
86 | return buf.read()
87 |
88 |
89 | def bytes_to_ndarray(data: bytes) -> np.ndarray:
90 | df = feather.read_feather(BytesIO(data))
91 |     return df['arr'].to_numpy()
92 |
--------------------------------------------------------------------------------
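A quick sketch exercising the helpers above (requires numpy, pandas and pyarrow, which the module already imports; the function and its arguments are hypothetical):

import numpy as np
from frocket.common.helpers.utils import memoize, ndarray_to_bytes, bytes_to_ndarray, timestamped_uuid

@memoize
def slow_square(x: int) -> int:
    print(f"computing {x}...")   # Printed only on the first call per distinct argument
    return x * x

slow_square(4)   # Computes
slow_square(4)   # Served from the decorator's cache

# NumPy arrays round-trip through the Feather-based serialization helpers:
arr = np.arange(5, dtype=np.int64)
restored = bytes_to_ndarray(ndarray_to_bytes(arr))
assert list(restored) == list(arr)

print(timestamped_uuid(prefix='job-'))   # e.g. 'job-1612345678-1a2b3c4d'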
/frocket/common/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/frocket/common/tasks/async_tracker.py:
--------------------------------------------------------------------------------
1 | """
2 | An AsyncJobTracker object is handed by the invoker_api to clients that launch a job in a non-blocking fashion.
3 | It enables either periodic polling or blocking on updates. Updates are guaranteed to be atomic - that is,
4 | there may be further updates, but the status you have in hand is consistent.
5 | """
6 | # Copyright 2021 The Funnel Rocket Maintainers
7 | #
8 | # Licensed under the Apache License, Version 2.0 (the "License");
9 | # you may not use this file except in compliance with the License.
10 | # You may obtain a copy of the License at
11 | #
12 | # http://www.apache.org/licenses/LICENSE-2.0
13 | #
14 | # Unless required by applicable law or agreed to in writing, software
15 | # distributed under the License is distributed on an "AS IS" BASIS,
16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | # See the License for the specific language governing permissions and
18 | # limitations under the License.
19 |
20 | import logging
21 | import time
22 | from abc import ABCMeta, abstractmethod
23 | from dataclasses import dataclass
24 | from enum import auto
25 | from queue import Queue, Empty
26 | from typing import Optional, Dict, Generator
27 | from frocket.common.serializable import AutoNamedEnum
28 | from frocket.common.tasks.base import BaseJobResult, TaskStatus
29 |
30 | logger = logging.getLogger(__name__)
31 |
32 |
33 | class AsyncJobStage(AutoNamedEnum):
34 | STARTING = auto()
35 | RUNNING = auto()
36 | FINISHING = auto()
37 | DONE = auto()
38 |
39 |
40 | @dataclass(frozen=True)
41 | class AsyncJobStatus:
42 | stage: AsyncJobStage
43 | message: Optional[str] = None # The job may set descriptive text for what it's doing
44 | result: Optional[BaseJobResult] = None # Only available on stage=AsyncJobStage.DONE
45 | task_counters: Optional[Dict[TaskStatus, int]] = None
46 |
47 |
48 | class JobTimeoutError(Exception):
49 | pass
50 |
51 |
52 | class AsyncJobTracker(metaclass=ABCMeta):
53 | """The interface as known to clients"""
54 |
55 | @property
56 | @abstractmethod
57 | def status(self) -> AsyncJobStatus:
58 | """Get the latest status - as a consistent object which will not be mutated while using it"""
59 | pass
60 |
61 | @property
62 | @abstractmethod
63 | def elapsed_time(self) -> float:
64 | pass
65 |
66 | @property
67 | @abstractmethod
68 | def wait_time_remaining(self) -> Optional[float]:
69 | """
70 | If a tracker object was initialized with a timeout value by its creator (the invoker_api,
71 | based on configuration), then time remaining till timeout is known and can be returned.
72 | """
73 | pass
74 |
75 | @abstractmethod
76 | def wait(self, timeout: float = None) -> bool:
77 | """
78 |         Blocking wait for updates with the given timeout, in seconds - always capped to the max wait time, if set.
79 |         By default, timeout is None - meaning wait up to the max wait time (or indefinitely, if it wasn't set).
80 |         Assuming a max wait time is set, this is a good choice since no busy or semi-busy loop is needed.
81 | """
82 | pass
83 |
84 | def generator(self) -> Generator[AsyncJobStatus, None, None]:
85 | """
86 |         Yields updates as they arrive, for easier consumption when blocking behavior is acceptable.
87 | This generator does not rely on any private attributes.
88 | """
89 | while True:
90 | update_available = self.wait()
91 | if not self.wait_time_remaining:
92 | raise JobTimeoutError()
93 |
94 | status_snapshot = self.status
95 | if status_snapshot.result:
96 | break
97 |
98 | if update_available:
99 | yield status_snapshot
100 |
101 | yield status_snapshot
102 |
103 |
104 | class AsyncJobStatusUpdater(AsyncJobTracker):
105 | """
106 | Implementation of AsyncJobTracker, which is only created within invoker_api and updated by invoker/job code.
107 |
108 |     The one curiosity here is the blocking wait() mechanism, which is based on a Queue instance.
109 |     How it works: the client's wait() call blocks on waiting for a queue item. If there's already one,
110 |     it's immediately returned. Once consumed, the queue is empty again and a subsequent wait() will repeat
111 |     the process. Typically, the queue should hold either zero items or a single one - see _signal_update() below.
112 | """
113 | def __init__(self, max_wait: float = None):
114 | self._status: AsyncJobStatus = AsyncJobStatus(stage=AsyncJobStage.STARTING)
115 | self._update_queue = Queue()
116 | self._max_wait = max_wait
117 | self._start_time = time.time()
118 |
119 | @property
120 | def elapsed_time(self) -> float:
121 | return time.time() - self._start_time
122 |
123 | @property
124 | def wait_time_remaining(self) -> Optional[float]:
125 | assert self._max_wait
126 | remaining = self._max_wait - self.elapsed_time
127 | return remaining if remaining > 0 else 0
128 |
129 | @property
130 | def status(self) -> AsyncJobStatus:
131 | return self._status
132 |
133 |     def _update_status(self, new_status: AsyncJobStatus) -> None:
134 |         """Only signal an update if there was actually any change."""
135 |         modified = self._status != new_status
136 |         if modified and logger.isEnabledFor(logging.DEBUG):
137 |             # Log before overwriting self._status, so both the old and new states are visible
138 |             logger.debug(f"Updated async status from:\n{self._status} to:\n{new_status}")
139 |         self._status = new_status
140 |         if modified:
141 |             self._signal_update()
142 |
143 | def update(self, stage: AsyncJobStage = None, message: str = None, task_counters: Dict[TaskStatus, int] = None):
144 | # Asserts are used here as the invoker/job classes are internal to the invoker_api, and are expected to conform
145 | # to this class' requirements. If not, it's probably a bug.
146 | assert stage != AsyncJobStage.DONE # To move to DONE stage, done() should be explicitly called
147 | assert self._status.stage != AsyncJobStage.DONE # No more updates after DONE was called once
148 | stage = stage or self._status.stage
149 | task_counters = task_counters or self._status.task_counters
150 |         # Automatically clean up the message when moving between stages
151 | message = message or (self._status.message if (stage == self._status.stage) else None)
152 |
153 | self._update_status(AsyncJobStatus(stage=stage, message=message,
154 | task_counters=task_counters))
155 |
156 | def done(self, result: BaseJobResult):
157 | self._update_status(AsyncJobStatus(stage=AsyncJobStage.DONE, result=result,
158 | task_counters=self._status.task_counters))
159 |
160 | def _signal_update(self):
161 | if self._update_queue.empty():
162 | # If the client *already* has an update waiting for it, no need to do anything - it will read the latest
163 | # state anyway when it gets to consume it (the queue item itself doesn't hold any information).
164 |             # In case of more than a single updater thread, there might momentarily be more than a single item.
165 |             # However, the class is not currently used that way, and having multiple items would not seem to
166 |             # have any detrimental effect (i.e. break correctness) if it does occur in some edge case.
167 |             # TODO backlog: to ensure at most a single item, consider a lock here and re-check empty() within it.
168 | self._update_queue.put(object())
169 |
170 | def wait(self, timeout=None):
171 | assert timeout is None or timeout > 0
172 | try:
173 | should_block = True
174 | if self._max_wait:
175 | remaining = self.wait_time_remaining
176 | if remaining == 0:
177 | # No more blocking wait - immediately return what's in the queue (or None)
178 | should_block = False
179 | timeout = None
180 | elif timeout:
181 | timeout = min(timeout, remaining)
182 | else:
183 | timeout = remaining
184 |
185 | self._update_queue.get(block=should_block, timeout=timeout)
186 | return True
187 | except Empty:
188 | return False
189 |
--------------------------------------------------------------------------------
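A sketch of the client-side consumption pattern described above. The tracker instance is assumed to be obtained from invoker_api when launching a job asynchronously (it is not constructed here), and follow_job is a hypothetical helper name:

from frocket.common.tasks.async_tracker import AsyncJobTracker, JobTimeoutError

def follow_job(tracker: AsyncJobTracker):
    try:
        # generator() blocks between updates via wait(), yielding only consistent status snapshots
        for status in tracker.generator():
            print(f"[{status.stage.name}] {status.message or ''} task counters: {status.task_counters}")
    except JobTimeoutError:
        print(f"Job did not finish within the allowed time ({tracker.elapsed_time:.1f}s elapsed)")
        raise
    return tracker.status.result   # Set once stage=DONE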
/frocket/common/tasks/query.py:
--------------------------------------------------------------------------------
1 | """
2 | Query job's task classes
3 | """
4 | # Copyright 2021 The Funnel Rocket Maintainers
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from dataclasses import dataclass
19 | from enum import auto
20 | from typing import Optional, List, Dict, Union, cast
21 | import inflection
22 | from frocket.common.dataset import DatasetInfo, DatasetPartId
23 | from frocket.common.serializable import AutoNamedEnum, enveloped, SerializableDataClass, reducable
24 | from frocket.common.tasks.base import BaseTaskRequest, BaseTaskResult, BaseJobResult
25 |
26 |
27 | class PartSelectionMode(AutoNamedEnum):
28 | """Whether the invoker sets the task_index or the worker selects it from available tasks in the datastore."""
29 | SET_BY_INVOKER = auto()
30 | SELECTED_BY_WORKER = auto()
31 |
32 |
33 | @enveloped
34 | @dataclass(frozen=True)
35 | class QueryTaskRequest(BaseTaskRequest):
36 | dataset: DatasetInfo
37 |     # String columns to load as Pandas categoricals, as a performance optimization. These columns are detected during
38 | # dataset registration. Not needed for columns already of categorical type in files saved by Pandas.
39 | load_as_categoricals: Optional[List[str]]
40 | mode: PartSelectionMode
41 | # If (and only if) mode=SET_BY_INVOKER, the invoker also sets the dataset part index to query
42 |     # Note that task_index does not necessarily equal the part ID
43 | invoker_set_part: Optional[DatasetPartId]
44 | used_columns: List[str] # Which columns to actually load (as optimization), as analyzed by QueryValidator.
45 | query: dict
46 |
47 |
48 | class AggregationType(AutoNamedEnum):
49 | # noinspection PyUnusedLocal
50 | def __init__(self, *args):
51 | if not hasattr(self.__class__, '_camels'):
52 | self.__class__._camels = {}
53 |
54 | self.camelized = inflection.camelize(self.name.lower(), uppercase_first_letter=False)
55 | self.__class__._camels[self.camelized] = self
56 | self.value_is_dict = self.name.endswith("_PER_VALUE")
57 |
58 | COUNT = auto()
59 | COUNT_PER_VALUE = auto()
60 | GROUPS_PER_VALUE = auto()
61 | SUM_PER_VALUE = auto()
62 | MEAN_PER_VALUE = auto()
63 |
64 | @classmethod
65 | def from_camelcase(cls, camelcase_name: str) -> AutoNamedEnum:
66 | return cls._camels[camelcase_name]
67 |
68 |
69 | AggrValue = Union[int, float]
70 | AggrValueMap = Dict[str, AggrValue]
71 |
72 |
73 | @reducable
74 | @dataclass(frozen=True)
75 | class AggregationResult(SerializableDataClass):
76 | column: str
77 | type: str
78 | # For some aggregation types ('count') the value is a single number. In others (the 'perValue' ones), value is
79 | # a dict of column value->aggregated number
80 | value: Optional[Union[AggrValue, AggrValueMap]]
81 | top: Optional[int] # Relevant for values of type dict
82 | name: Optional[str] # Only set if the user has set a custom name for this aggregation
83 |
84 | @classmethod
85 | def _reduce_fields(cls, serializables):
86 | """See: SerializableDataClass."""
87 | all_values = [e.value for e in cast(List[AggregationResult], serializables)]
88 |         # Reduce either primitive values or dicts of counters
89 | if isinstance(all_values[0], dict):
90 | reduced_value = cls.reduce_counter_dicts(all_values, top_count=cast(cls, serializables[0]).top)
91 | else:
92 | reduced_value = sum(all_values)
93 | return {'value': reduced_value}
94 |
95 |
96 | @reducable
97 | @dataclass(frozen=True)
98 | class QueryConditionsResult(SerializableDataClass):
99 |     matching_groups: int  # Number of matching groups (a group being e.g. a user ID)
100 | matching_group_rows: int # All rows of the matching groups, whether that row matches a condition or not
101 | aggregations: Optional[List[AggregationResult]]
102 |
103 | @classmethod
104 | def _reduce_fields(cls, serializables):
105 | results = cast(List[cls], serializables)
106 | return {'matching_groups': sum([e.matching_groups for e in results]),
107 | 'matching_group_rows': sum([e.matching_group_rows for e in results]),
108 | 'aggregations': cls.reduce_lists([e.aggregations for e in results])}
109 |
110 |
111 | @reducable
112 | @dataclass(frozen=True)
113 | class FunnelResult(SerializableDataClass):
114 | sequence: List[QueryConditionsResult]
115 | end_aggregations: Optional[List[AggregationResult]]
116 |
117 | @classmethod
118 | def _reduce_fields(cls, serializables):
119 | funnel_results = cast(List[cls], serializables)
120 | return {'sequence': cls.reduce_lists([e.sequence for e in funnel_results]),
121 | 'end_aggregations': cls.reduce_lists([e.end_aggregations for e in funnel_results])}
122 |
123 |
124 | @reducable
125 | @dataclass(frozen=True)
126 | class QueryResult(SerializableDataClass):
127 | query: QueryConditionsResult
128 | funnel: Optional[FunnelResult]
129 |
130 | @classmethod
131 | def _reduce_fields(cls, serializables):
132 | query_results = cast(List[cls], serializables)
133 | return {'query': QueryConditionsResult.reduce([e.query for e in query_results]),
134 | 'funnel': FunnelResult.reduce([e.funnel for e in query_results])}
135 |
136 |
137 | @enveloped
138 | @dataclass(frozen=True)
139 | class QueryTaskResult(BaseTaskResult):
140 | query_result: Optional[QueryResult] # Not set if query failed (when success=False)
141 |
142 |
143 | @dataclass(frozen=True)
144 | class QueryJobResult(BaseJobResult):
145 | query: Optional[QueryConditionsResult]
146 | funnel: Optional[FunnelResult]
147 |
--------------------------------------------------------------------------------
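As a rough illustration of what the _reduce_fields() implementations above do for 'perValue' aggregations - merging per-task counter dicts into a single result - here is an equivalent standalone sketch. The real merge is done by reduce_counter_dicts() on SerializableDataClass (not shown here); this hypothetical stand-in only mimics its assumed behavior:

from collections import Counter
from typing import Dict, List, Optional

def merge_counter_dicts(dicts: List[Dict[str, float]], top_count: Optional[int] = None) -> Dict[str, float]:
    # Sum the values per key across all per-task dicts, optionally keeping only the top_count largest entries
    merged = Counter()
    for d in dicts:
        merged.update(d)
    entries = merged.most_common(top_count) if top_count else merged.items()
    return dict(entries)

# Two workers return countPerValue results for the same column; the invoker folds them into one:
print(merge_counter_dicts([{'us': 10, 'de': 5}, {'us': 3, 'fr': 2}]))
# -> {'us': 13, 'de': 5, 'fr': 2}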
/frocket/common/tasks/registration.py:
--------------------------------------------------------------------------------
1 | """
2 | Task request/response classes for the registration job (discovering, validating and storing metadata for a dataset)
3 | """
4 | # Copyright 2021 The Funnel Rocket Maintainers
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from dataclasses import dataclass
19 | from enum import auto
20 | from typing import Optional
21 | from frocket.common.dataset import DatasetInfo, DatasetPartId, DatasetSchema
22 | from frocket.common.serializable import SerializableDataClass, AutoNamedEnum, enveloped
23 | from frocket.common.tasks.base import BaseTaskRequest, BaseTaskResult, BlobId, BaseJobResult, BaseApiResult
24 |
25 |
26 | class DatasetValidationMode(AutoNamedEnum):
27 | SINGLE = auto() # Only validate a single file in the dataset (meaning no cross-file consistency checks are done!)
28 | FIRST_LAST = auto() # Validate only first and last files (by lexicographic sorting) and cross-check them
29 |     SAMPLE = auto()  # Take a sample of files, proportional to the total number of files and up to a configured maximum.
30 |
31 |
32 | REGISTER_DEFAULT_FILENAME_PATTERN = '*.parquet' # Ignore files such as '_SUCCESS' and the like in discovery
33 | REGISTER_DEFAULT_VALIDATION_MODE = DatasetValidationMode.SAMPLE
34 | REGISTER_DEFAULT_VALIDATE_UNIQUES = True
35 |
36 |
37 | @dataclass(frozen=True)
38 | class RegisterArgs(SerializableDataClass):
39 | """Parameters collected by the CLI / API server for the registration job"""
40 | name: str
41 | basepath: str
42 | group_id_column: str
43 | timestamp_column: str
44 | pattern: str = REGISTER_DEFAULT_FILENAME_PATTERN
45 | validation_mode: DatasetValidationMode = REGISTER_DEFAULT_VALIDATION_MODE
46 | validate_uniques: bool = REGISTER_DEFAULT_VALIDATE_UNIQUES
47 |
48 |
49 | @enveloped
50 | @dataclass(frozen=True)
51 | class RegistrationTaskRequest(BaseTaskRequest):
52 | dataset: DatasetInfo
53 | part_id: DatasetPartId
54 |     # If RegisterArgs.validate_uniques=true, the task should return all group IDs found in the file
55 | return_group_ids: bool
56 |
57 |
58 | @enveloped
59 | @dataclass(frozen=True)
60 | class RegistrationTaskResult(BaseTaskResult):
61 | dataset_schema: Optional[DatasetSchema] # None on failures
62 | part_id: DatasetPartId
63 | # If RegistrationTaskRequest.return_group_ids=true, a reference to the blob with the group IDs
64 | group_ids_blob_id: Optional[BlobId]
65 |
66 |
67 | @dataclass(frozen=True)
68 | class RegistrationJobResult(BaseJobResult):
69 | dataset: DatasetInfo
70 |
71 |
72 | @dataclass(frozen=True)
73 | class UnregisterApiResult(BaseApiResult):
74 | dataset_found: bool
75 | dataset_last_used: Optional[float]
76 |
--------------------------------------------------------------------------------
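A small sketch of constructing the RegisterArgs defined above, as the CLI or API server would before launching a registration job (the dataset name, path and column names are hypothetical):

from frocket.common.tasks.registration import RegisterArgs, DatasetValidationMode

args = RegisterArgs(
    name='retail_events',
    basepath='s3://my-bucket/datasets/retail/',        # hypothetical bucket
    group_id_column='user_id',
    timestamp_column='timestamp',
    pattern='*.parquet',                               # the default: skip '_SUCCESS' and similar files
    validation_mode=DatasetValidationMode.FIRST_LAST,  # cross-check only the first and last files
    validate_uniques=True)                             # also verify group IDs don't repeat across files
print(args)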
/frocket/common/validation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/frocket/common/validation/consts.py:
--------------------------------------------------------------------------------
1 | """
2 | Consts and types for the query validation package
3 | TODO backlog create a nice enum for all query keywords
4 | """
5 | # Copyright 2021 The Funnel Rocket Maintainers
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | import json
20 | import os
21 | import re
22 | from pathlib import Path
23 | from typing import Dict, NamedTuple
24 | from frocket.common.dataset import DatasetColumnType
25 | from frocket.common.validation.path_visitor import PathVisitor
26 |
27 | # JSON Schema file
28 | QUERY_SCHEMA_LOCATION = Path(os.path.dirname(__file__)) / '../../resources/query_schema.json'
29 | QUERY_SCHEMA = json.load(open(QUERY_SCHEMA_LOCATION, 'r'))
30 |
31 | TARGET_TYPES_WITH_INCLUDE_ZERO = ['count']
32 | TARGET_OPS_SUPPORTING_INCLUDE_ZERO = ['<', '<=', '==', '!=', '>=']
33 | TARGET_TYPES_WITH_OTHER_COLUMN = ['sum']
34 | AGGR_TYPES_WITH_OTHER_COLUMN = ['sumPerValue', 'meanPerValue']
35 | DEFAULT_TARGET = {'type': 'count', 'op': '>=', 'value': 1}
36 | DEFAULT_AGGREGATIONS = ['count', 'countPerValue', 'groupsPerValue']
37 | AGGREGATIONS_PATHS = ['query.aggregations',
38 | 'funnel.stepAggregations',
39 | 'funnel.endAggregations']
40 | SINGLE_FILTER_PATHS = ['query.conditions.filter',
41 | 'query.conditions.sequence.filter',
42 | 'funnel.sequence.filter']
43 | FILTER_ARRAY_PATHS = ['query.conditions.filters',
44 | 'query.conditions.sequence.filters',
45 | 'funnel.sequence.filters']
46 |
47 | VALID_IDENTIFIER_PATTERN = re.compile(r'[A-Z][A-Z_0-9]*$', re.IGNORECASE)
48 | UNIQUE_IDENTIFIER_SCOPES = ['query.conditions.name'] + \
49 | [f"{path}.name" for path in AGGREGATIONS_PATHS]
50 |
51 | EQUALITY_OPERATORS = ['==', '!=']
52 | NUMERIC_OPERATORS = [*EQUALITY_OPERATORS, '>', '>=', '<', '<=']
53 | STRING_OPERATORS = [*EQUALITY_OPERATORS, 'contains', 'regex']
54 | OPERATORS_BY_COLTYPE = {
55 | DatasetColumnType.INT: NUMERIC_OPERATORS,
56 | DatasetColumnType.FLOAT: NUMERIC_OPERATORS,
57 | DatasetColumnType.BOOL: EQUALITY_OPERATORS,
58 | DatasetColumnType.STRING: STRING_OPERATORS
59 | }
60 | VALUE_TYPES_BY_COLTYPE = {
61 | DatasetColumnType.INT: [int],
62 | DatasetColumnType.FLOAT: [int, float],
63 | DatasetColumnType.BOOL: [bool],
64 | DatasetColumnType.STRING: [str]
65 | }
66 | NUMERIC_COLTYPES = [DatasetColumnType.INT, DatasetColumnType.FLOAT]
67 |
68 | RELATION_OPS = ['and', 'or', '||', '&&']
69 | DEFAULT_RELATION_OP = 'and'
70 | CONDITION_COLUMN_PREFIX = "__cond_"
71 |
72 |
73 | class QueryConditionsMap(NamedTuple):
74 | count: int
75 | names: Dict[str, int]
76 |
77 |
78 | def map_condition_names(query: dict) -> QueryConditionsMap:
79 | """Map named conditions (which is optional) to the condition ID (index in conditions list)."""
80 | conditions = PathVisitor(query, 'query.conditions').list()
81 | names = {cond['name'].strip().lower(): i
82 | for i, cond in enumerate(conditions) if 'name' in cond}
83 | return QueryConditionsMap(count=len(conditions), names=names)
84 |
--------------------------------------------------------------------------------
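A short example of map_condition_names() above - names are optional, lower-cased, and mapped to the condition's index (the query snippet is hypothetical and uses shorthand filter notation):

from frocket.common.validation.consts import map_condition_names

query = {'query': {'conditions': [
    {'name': 'Purchased', 'filter': ['event_type', '==', 'purchase']},
    {'filter': ['price', '>', 100]}]}}

mapping = map_condition_names(query)
print(mapping.count)   # -> 2
print(mapping.names)   # -> {'purchased': 0}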
/frocket/common/validation/error.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from enum import auto
16 | from frocket.common.serializable import AutoNamedEnum
17 |
18 |
19 | class ValidationErrorKind(AutoNamedEnum):
20 | """Distinguish between types of validation issues in query"""
21 | INVALID_ARGUMENTS = auto() # Validator given wrong arguments
22 | SCHEMA = auto() # Failure at JSON Schema level
23 | TYPE_MISMATCH = auto() # Operator or value type don't match each other, or the context
24 | DATASET_MISMATCH = auto() # Column names, types, etc. do not match the schema of the given dataset
25 | RELATION = auto() # query.relation expression found invalid by relation_parser.py
26 | # Note for unexpected errors: unlike other kinds, the message associated with this kind may leak sensitive data
27 | # if it was returned to the caller - so it is not returned by the API server in PUBLIC mode.
28 | UNEXPECTED = auto()
29 |
30 |
31 | class QueryValidationError(Exception):
32 | def __init__(self, message: str, kind: ValidationErrorKind = None):
33 | self.message = message
34 | self.kind = kind or ValidationErrorKind.UNEXPECTED # Default, but should be rare.
35 |
36 | @staticmethod
37 | def wrap(e: Exception, kind: ValidationErrorKind = None):
38 | return QueryValidationError(str(e), kind)
39 |
40 | def __str__(self):
41 | return f"ValidationError({self.kind.value}: {self.message})"
42 |
--------------------------------------------------------------------------------
/frocket/common/validation/path_visitor.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import Callable, Any, Optional
16 |
17 | PathVisitorCallback = Callable[[Any], Optional[Any]]
18 |
19 |
20 | class PathVisitor:
21 | """
22 | A helper class for safely fetching nested attributes in a dictionary.
23 | It is used extensively by the QueryValidator to extract and transform nested attributes.
24 |
25 | The class is instantiated with a root dict and a dot-delimited string path (e.g. 'attr.sub_attr.sub_sub').
26 | Then, visit() can be called once (or more) to run code over the matching value/s, if any. If the key is not found,
27 | no error is thrown. list() is a convenience method which visits the elements and returns them as a list,
28 | returning an empty list on no matches.
29 |
30 | By default, if the leaf key is a list, the visitor function is called for each element.
31 | However, if the list itself is what you need, pass list_to_items=False on init.
32 |
33 |     Modifying attributes *below* the visited value is safe (be it a dict, a list, an object); however, sometimes you
34 |     may want to replace the visited value itself. For example, the QueryValidator replaces shorthand-
35 |     notation objects, which are lists, with full-notation dicts.
36 | To support that, init the object with modifiable=true and return the replacement value from the visitor function,
37 | or None to keep the value.
38 |
39 | For usage examples, see test_path_visitor.py.
40 | """
41 | _KEY_NOT_FOUND = object()
42 |
43 | def __init__(self, root: dict, path: str, modifiable: bool = False, list_to_items: bool = True):
44 | assert (isinstance(root, dict))
45 | self._root = root
46 | self._paths = path.strip().split(".")
47 | self._modifiable = modifiable
48 | self._list_to_items = list_to_items
49 |
50 | def visit(self, func: PathVisitorCallback):
51 | if len(self._paths) > 0:
52 | self._visit_dict(self._root, 0, func)
53 |
54 | def list(self) -> list:
55 | result = []
56 | self.visit(lambda v: result.append(v))
57 | return result
58 |
59 | def _visit_dict(self, d: dict, depth: int, func: PathVisitorCallback):
60 |         v = d.get(self._paths[depth], self._KEY_NOT_FOUND)  # Differentiate a None value from a nonexistent key
61 |         if v is self._KEY_NOT_FOUND:
62 | return # Bumped into a wall
63 |
64 | if isinstance(v, list) and self._list_to_items:
65 | self._visit_list(v, depth + 1, func)
66 | return
67 |
68 | if depth == len(self._paths) - 1:
69 | replacement = func(v) # Includes None
70 | if self._modifiable and replacement:
71 | d[self._paths[depth]] = replacement
72 | else:
73 | if not v:
74 | return
75 | elif isinstance(v, dict):
76 | self._visit_dict(v, depth + 1, func)
77 | elif isinstance(v, list):
78 | self._visit_list(v, depth + 1, func)
79 | else:
80 | return # Can't go further
81 |
82 | def _visit_list(self, lst: list, depth: int, func: PathVisitorCallback):
83 | if depth == len(self._paths):
84 | assert self._list_to_items
85 | for i, elem in enumerate(lst):
86 | replacement = func(elem)
87 | if self._modifiable and replacement:
88 | lst[i] = replacement
89 | else:
90 | for i, elem in enumerate(lst):
91 | # Note: depth is not incremented in this case, since elements are at the same 'path depth' as the list
92 | if isinstance(elem, dict):
93 | self._visit_dict(elem, depth, func)
94 | elif isinstance(elem, list):
95 | self._visit_list(elem, depth, func)
96 |
--------------------------------------------------------------------------------
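A few usage examples of PathVisitor in the spirit of test_path_visitor.py (the query snippet below is hypothetical):

from frocket.common.validation.path_visitor import PathVisitor

doc = {'query': {'conditions': [{'filter': ['price', '>', 100]},
                                {'filter': ['country', '==', 'us']}]}}

# Missing paths are simply a no-op: no exception, empty result
assert PathVisitor(doc, 'query.no.such.path').list() == []

# Fetch each condition's filter; list_to_items=False keeps each leaf list whole
# (the default would unpack the filter lists into their individual elements)
filters = PathVisitor(doc, 'query.conditions.filter', list_to_items=False).list()
print(filters)   # -> [['price', '>', 100], ['country', '==', 'us']]

# In 'modifiable' mode the callback's return value replaces the visited value -
# this is how the QueryValidator expands shorthand filters into verbose dicts
PathVisitor(doc, 'query.conditions.filter', modifiable=True, list_to_items=False).visit(
    lambda fltr: {'column': fltr[0], 'op': fltr[1], 'value': fltr[2]})
print(doc['query']['conditions'][0]['filter'])
# -> {'column': 'price', 'op': '>', 'value': 100}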
/frocket/common/validation/relation_parser.py:
--------------------------------------------------------------------------------
1 | """
2 | While the query schema is generally JSON-based (good for machines) rather than textual (like SQL,
3 | supposedly human-friendly or at least more concise), there's one exception: an optional 'relation' expression that
4 | allows specifying arbitrarily complex and/or relations between conditions, rather than just and/or over all of them.
5 | 
6 | The RelationParser class validates and breaks down the expression into a list of elements. However, it does not transform
7 | them back into a Pandas query or similar - that is the query engine's responsibility and may change independently.
8 |
9 | Note that conditions may be represented either by index ($0, $3, etc.) or by name - for named conditions.
10 | """
11 | # Copyright 2021 The Funnel Rocket Maintainers
12 | #
13 | # Licensed under the Apache License, Version 2.0 (the "License");
14 | # you may not use this file except in compliance with the License.
15 | # You may obtain a copy of the License at
16 | #
17 | # http://www.apache.org/licenses/LICENSE-2.0
18 | #
19 | # Unless required by applicable law or agreed to in writing, software
20 | # distributed under the License is distributed on an "AS IS" BASIS,
21 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 | # See the License for the specific language governing permissions and
23 | # limitations under the License.
24 |
25 | import logging
26 | from typing import Type, List, Optional
27 | from parsimonious.grammar import Grammar, NodeVisitor
28 | from parsimonious.nodes import Node
29 | from dataclasses import dataclass
30 | from parsimonious.exceptions import ParseError, VisitationError
31 | from abc import ABCMeta
32 | from frocket.common.validation.consts import RELATION_OPS, map_condition_names, CONDITION_COLUMN_PREFIX
33 | from frocket.common.validation.path_visitor import PathVisitor
34 | from frocket.common.validation.error import ValidationErrorKind, QueryValidationError
35 | from frocket.common.tasks.base import ErrorMessage
36 |
37 | logger = logging.getLogger(__name__)
38 |
39 | # TODO backlog fix the grammar to require whitespace between conditions and wordy-operators (and,or),
40 | # but not around symbol ops (&&, ||)
41 | # TODO backlog fix "DeprecationWarning: invalid escape sequence \$"
42 | RELATION_EXPRESSION_GRAMMAR = Grammar(
43 | """
44 | expression = (identifier / (open_paren ws? expression ws? close_paren)) (ws? op ws? expression)*
45 | identifier = condition_name / condition_id
46 | condition_name = ~r"\$[A-Z][A-Z_0-9]*"i
47 | condition_id = ~r"\$[0-9]+"
48 | op = "and" / "or" / "&&" / "||"
49 | ws = ~r"\s*"
50 | open_paren = "("
51 | close_paren = ")"
52 | """)
53 |
54 |
55 | @dataclass(frozen=True)
56 | class RelationParserContext:
57 | condition_count: int
58 | named_conditions: dict
59 | column_prefix: str
60 |
61 |
62 | @dataclass
63 | class RBaseElement(metaclass=ABCMeta):
64 | text: str
65 | ctx: RelationParserContext
66 | condition_id: Optional[int] = None
67 |
68 | def validate(self) -> Optional[ErrorMessage]:
69 | pass
70 |
71 | def __str__(self):
72 | return f"{self.__class__.__name__}('{self.text}')"
73 |
74 |
75 | @dataclass
76 | class RTextElement(RBaseElement):
77 | pass
78 |
79 |
80 | @dataclass
81 | class RConditionBaseElement(RBaseElement):
82 | pass
83 |
84 |
85 | @dataclass
86 | class RConditionId(RConditionBaseElement):
87 | def validate(self):
88 | cid = int(self.text[1:])
89 | if cid >= self.ctx.condition_count:
90 | return f"Condition no. {cid} does not exist"
91 | self.condition_id = cid
92 |
93 |
94 | @dataclass
95 | class RConditionName(RConditionBaseElement):
96 | def validate(self):
97 | cname = self.text[1:]
98 | cid = self.ctx.named_conditions.get(cname, None)
99 | if cid is not None: # Can be zero
100 | self.condition_id = cid
101 | else:
102 | return f"Condition named {self.text[1:]} does not exist"
103 |
104 |
105 | @dataclass
106 | class ROperator(RBaseElement):
107 | def validate(self):
108 | if self.text not in RELATION_OPS:
109 | return f"Operator {self.text} not in {RELATION_OPS}"
110 |
111 |
112 | # noinspection PyMethodMayBeStatic,PyUnusedLocal
113 | @dataclass
114 | class RelationExpressionVisitor(NodeVisitor):
115 | """
116 | Used by the RelationParser to build the element list.
117 | Note that while the grammar is hierarchical, the resulting list isn't (no need, currently).
118 | """
119 | ctx: RelationParserContext
120 |
121 | def _build_element(self, node: Node, cls: Type[RBaseElement]):
122 | # noinspection PyArgumentList
123 | return cls(node.text, self.ctx)
124 |
125 | def visit_ws(self, node: Node, visited_children):
126 | return None # Ignore whitespaces
127 |
128 | def visit_op(self, node: Node, visited_children):
129 | return self._build_element(node, ROperator)
130 |
131 | def visit_open_paren(self, node: Node, visited_children):
132 | return self._build_element(node, RTextElement)
133 |
134 | def visit_close_paren(self, node: Node, visited_children):
135 | return self._build_element(node, RTextElement)
136 |
137 | def visit_identifier(self, node: Node, visited_children):
138 | """Return the actual condition name / ID element (see grammar: identifier wraps conditions)."""
139 | return visited_children[0]
140 |
141 | def visit_condition_name(self, node: Node, visited_children):
142 | return self._build_element(node, RConditionName)
143 |
144 | def visit_condition_id(self, node: Node, visited_children):
145 | return self._build_element(node, RConditionId)
146 |
147 | def generic_visit(self, node: Node, visited_children):
148 | """Ignore current node, but return children (if any) as a flat list."""
149 | flat_result = []
150 | for child in visited_children:
151 | if type(child) is list:
152 | flat_result += child # Unpack child array
153 | elif child:
154 | flat_result.append(child)
155 | return flat_result if len(flat_result) > 0 else None
156 |
157 |
158 | class RelationParser:
159 | def __init__(self, query: dict):
160 | self._query = query
161 | self._condition_mapping = map_condition_names(query)
162 | self._used_conditions = None
163 |
164 | found_relations = PathVisitor(self._query, 'query.relation').list()
165 | assert len(found_relations) in [0, 1]
166 | self._relation = found_relations[0].strip().lower() if found_relations else None
167 |
168 | def parse(self) -> List[RBaseElement]:
169 | if not self._relation:
170 | return []
171 |
172 | ctx = RelationParserContext(condition_count=self._condition_mapping.count,
173 | named_conditions=self._condition_mapping.names,
174 | column_prefix=CONDITION_COLUMN_PREFIX)
175 | try:
176 | tree = RELATION_EXPRESSION_GRAMMAR.parse(self._relation)
177 | except ParseError as pe:
178 |             # Adapted from the ParseError class itself, but without its sometimes-confusing position details
179 | excerpt = pe.text[pe.pos:pe.pos + 20] if (pe.text and pe.pos is not None) else None
180 | if excerpt:
181 | message = f"Query relation is invalid around '{excerpt}' "
182 | else:
183 | message = f"Query relation '{self._relation}' is invalid"
184 | raise QueryValidationError(message, kind=ValidationErrorKind.RELATION)
185 |
186 | try:
187 | elements = RelationExpressionVisitor(ctx).visit(tree)
188 | except VisitationError as ve:
189 | logger.exception('Unexpected error while visiting parse tree')
190 | raise QueryValidationError(message=str(ve), kind=ValidationErrorKind.UNEXPECTED)
191 |
192 | for e in elements:
193 | error_message = e.validate()
194 | if error_message:
195 | raise QueryValidationError(message=error_message, kind=ValidationErrorKind.RELATION)
196 |
197 | self._used_conditions = [e.condition_id for e in elements if e.condition_id is not None]
198 | return elements
199 |
200 | @property
201 |     def used_conditions(self) -> List[int]:
202 | return self._used_conditions
203 |
--------------------------------------------------------------------------------
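A brief sketch of the parser above on a hypothetical query - conditions are referenced both by index and by name, and the relation expression is lower-cased before parsing:

from frocket.common.validation.relation_parser import RelationParser

query = {'query': {
    'relation': '($0 and $bigSpender) or $2',
    'conditions': [
        {'filter': ['country', '==', 'us']},
        {'name': 'bigSpender', 'filter': ['total_spent', '>', 1000]},
        {'filter': ['visits', '>=', 3]}]}}

parser = RelationParser(query)
elements = parser.parse()
print([str(e) for e in elements])
# -> ["RTextElement('(')", "RConditionId('$0')", "ROperator('and')", "RConditionName('$bigspender')",
#     "RTextElement(')')", "ROperator('or')", "RConditionId('$2')"]
print(parser.used_conditions)   # -> [0, 1, 2]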
/frocket/common/validation/result.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from dataclasses import dataclass
16 | from typing import Optional, List, cast, Dict
17 | from frocket.common.serializable import SerializableDataClass
18 | from frocket.common.validation.error import ValidationErrorKind, QueryValidationError
19 | from frocket.common.validation.relation_parser import RBaseElement
20 |
21 |
22 | @dataclass(frozen=True)
23 | class QueryValidationResult(SerializableDataClass):
24 | success: bool
25 | source_query: dict
26 | error_message: Optional[str] = None
27 | error_kind: Optional[ValidationErrorKind] = None
28 | expanded_query: Optional[dict] = None
29 | # TODO backlog support non-critical warning/hints to user (e.g. conditions unused by relation expression)
30 | warnings: Optional[List[str]] = None
31 | used_columns: Optional[List[str]] = None
32 | used_conditions: Optional[List[str]] = None
33 | named_conditions: Optional[Dict[str, int]] = None
34 | relation_elements: Optional[List[RBaseElement]] = None
35 |
36 | @staticmethod
37 | def from_exception(e: Exception, source_query: dict):
38 | if type(e) is QueryValidationError:
39 | error_kind = cast(QueryValidationError, e).kind
40 | else:
41 | error_kind = ValidationErrorKind.UNEXPECTED
42 | return QueryValidationResult(success=False, error_message=str(e), error_kind=error_kind,
43 | source_query=source_query)
44 |
--------------------------------------------------------------------------------
/frocket/common/validation/visitor_functions.py:
--------------------------------------------------------------------------------
1 | """
2 | A collection of callback functions which the QueryValidator uses to extract, validate and transform query elements,
3 | with the help of the PathVisitor class.
4 |
5 | Functions which return a value are used to replace the given object with a different one,
6 | which is handled by PathVisitor in its 'modifiable' mode.
7 |
8 | Since callbacks are regular functions (not methods), and there's a bunch of them, they're in a separate file from
9 | the QueryValidator class.
10 |
11 | asserts are used when processing elements which should already be validated (so any failure indicates a bug).
12 | """
13 | # Copyright 2021 The Funnel Rocket Maintainers
14 | #
15 | # Licensed under the Apache License, Version 2.0 (the "License");
16 | # you may not use this file except in compliance with the License.
17 | # You may obtain a copy of the License at
18 | #
19 | # http://www.apache.org/licenses/LICENSE-2.0
20 | #
21 | # Unless required by applicable law or agreed to in writing, software
22 | # distributed under the License is distributed on an "AS IS" BASIS,
23 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24 | # See the License for the specific language governing permissions and
25 | # limitations under the License.
26 |
27 | from typing import Optional
28 | from frocket.common.validation.consts import DEFAULT_TARGET, AGGR_TYPES_WITH_OTHER_COLUMN, \
29 | DEFAULT_AGGREGATIONS, TARGET_TYPES_WITH_INCLUDE_ZERO, TARGET_OPS_SUPPORTING_INCLUDE_ZERO
30 | from frocket.common.validation.error import ValidationErrorKind, QueryValidationError
31 |
32 |
33 | def _to_verbose_filter(fltr) -> Optional[dict]:
34 | """If a condition filter is in short-hand notation (list), convert to verbose notation."""
35 | assert type(fltr) in [list, dict]
36 | if type(fltr) is list:
37 | assert len(fltr) == 3
38 | return {'column': fltr[0], 'op': fltr[1], 'value': fltr[2]}
39 |
40 |
41 | def _to_verbose_target(target) -> Optional[dict]:
42 | """If a condition target is in short-hand notation (list), convert to verbose notation."""
43 | assert type(target) in [list, dict]
44 | if type(target) is list:
45 | assert len(target) in [3, 4]
46 | if len(target) == 3:
47 | return {'type': target[0], 'op': target[1], 'value': target[2]}
48 | elif len(target) == 4:
49 | return {'type': target[0], 'column': target[1], 'op': target[2], 'value': target[3]}
50 |
51 |
52 | def _add_default_target(cond: dict) -> None:
53 | assert type(cond) is dict
54 | # (Modification is done on a key under the given object, so no need to return a modified dict)
55 | if ('filter' in cond or 'filters' in cond) and 'target' not in cond: # Don't touch sequence conditions
56 | cond['target'] = DEFAULT_TARGET
57 |
58 |
59 | def _validate_aggregation(aggr: dict) -> None:
60 | assert type(aggr) is dict
61 | aggr_type = aggr.get('type', None)
62 | other_column_required = aggr_type in AGGR_TYPES_WITH_OTHER_COLUMN
63 | other_column_found = 'otherColumn' in aggr
64 |
65 | if other_column_required != other_column_found:
66 | message = f"For aggregation {aggr} with type '{aggr_type}', other column name is "
67 | if other_column_required:
68 | message += 'required but was not found'
69 | else:
70 | message += 'not relevant but was given'
71 | raise QueryValidationError(message, kind=ValidationErrorKind.SCHEMA)
72 |
73 |
74 | def _expand_aggregations(col_aggregations: list) -> Optional[list]:
75 | assert type(col_aggregations) is list
76 | result = []
77 | for aggr in col_aggregations:
78 | if aggr.get('type', None):
79 | result.append(aggr)
80 | else:
81 | if 'name' in aggr:
82 | message = f"Aggregation {aggr} expands into multiple default aggregations, " \
83 |                           f"and thus a name attribute is not supported"
84 | raise QueryValidationError(message, kind=ValidationErrorKind.SCHEMA)
85 | for added_type in DEFAULT_AGGREGATIONS:
86 | result.append({**aggr, 'type': added_type})
87 |
88 | return result
89 |
90 |
91 | def _validate_or_set_include_zero(cond: dict) -> None:
92 | """
93 | 'includeZero' attribute of conditions may be tricky to get right.
94 |     This function validates that its usage makes sense, and sets the correct default where it's omitted.
95 | """
96 | assert type(cond) is dict
97 | if not ('filter' in cond or 'filters' in cond):
98 | return # Skip sequence condition (and possibly other future types without a target)
99 |
100 |     # This should run after _to_verbose_target() and _add_default_target() have already run, ensuring the target exists
101 | target_type = cond['target']['type']
102 | target_op = cond['target']['op']
103 | target_value = cond['target']['value']
104 | include_zero_value = cond.get('includeZero', None)
105 | target_as_string = f"{target_type} {target_op} {target_value}"
106 |
107 | if target_type not in TARGET_TYPES_WITH_INCLUDE_ZERO:
108 | if include_zero_value: # Exists and set to True
109 | raise QueryValidationError(
110 | message=f"'includeZero' is not applicable for target type '{target_type}'. In condition: {cond}",
111 | kind=ValidationErrorKind.TYPE_MISMATCH)
112 | else:
113 | assert type(target_value) is int
114 | assert target_value >= 0
115 |
116 | if include_zero_value: # Exists and set to True
117 | # Operator never relevant for includeZero=True
118 | if target_op not in TARGET_OPS_SUPPORTING_INCLUDE_ZERO:
119 | raise QueryValidationError(
120 | message=f"For target operator '{target_op}', 'includeZero' cannot be true. In condition: {cond}",
121 | kind=ValidationErrorKind.TYPE_MISMATCH)
122 |
123 | # Additional check when an operator is *potentially* relevant for includeZero=True
124 | if target_op == '<' and target_value == 0:
125 | raise QueryValidationError(
126 | message=f"Target implies a negative value. In condition: {cond}",
127 | kind=ValidationErrorKind.TYPE_MISMATCH)
128 |
129 | if (target_op == '!=' and target_value == 0) or \
130 | (target_op in ['==', '>='] and target_value != 0):
131 | message = f"Target {target_as_string} explicitly precludes zero, and thus 'includeZero' " \
132 | f"cannot be true. In condition: {cond}"
133 | raise QueryValidationError(message, kind=ValidationErrorKind.TYPE_MISMATCH)
134 | else:
135 | if target_op == '==' and target_value == 0:
136 | if include_zero_value is None:
137 | # Explicitly set includeZero when target is count == 0
138 | # Note: modifying a key under the given object, so no need to return a modified dict
139 | cond['includeZero'] = True
140 | elif not include_zero_value:
141 | message = f"When using a target of {target_as_string}, 'includeZero' cannot be false. " \
142 | f"Condition: {cond}"
143 | raise QueryValidationError(message, kind=ValidationErrorKind.TYPE_MISMATCH)
144 |
--------------------------------------------------------------------------------
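To make the shorthand-vs-verbose distinction above concrete, these are the two equivalent ways of writing the same condition that the callbacks normalize (column names and values are hypothetical):

# Shorthand notation, as a user may submit it:
shorthand_condition = {'filter': ['price', '>', 100],
                       'target': ['count', '>=', 3]}

# What the condition looks like after _to_verbose_filter / _to_verbose_target have run
# (via PathVisitor in modifiable mode), before further validation:
verbose_condition = {'filter': {'column': 'price', 'op': '>', 'value': 100},
                     'target': {'type': 'count', 'op': '>=', 'value': 3}}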
/frocket/datastore/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/frocket/datastore/blobstore.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import abstractmethod
16 | from typing import Optional
17 | from frocket.common.config import config
18 | from frocket.common.tasks.base import BlobId
19 |
20 | BLOB_DEFAULT_TTL = config.int('blobstore.default.ttl')
21 | BLOB_MAX_TTL = config.int('blobstore.max.ttl')
22 |
23 |
24 | class Blobstore:
25 | """Simple interface for storing and fetching arbitrary binary data, for ephemeral transport over the network.
26 | The data is assumed to always have a default TTL - it's not a permanent or big data store."""
27 | @abstractmethod
28 | def write_blob(self, data: bytes, ttl: int = None, tag: str = None) -> BlobId:
29 | pass
30 |
31 | @abstractmethod
32 | def read_blob(self, blobid: BlobId) -> Optional[bytes]:
33 | pass
34 |
35 | @abstractmethod
36 | def delete_blob(self, blobid: BlobId) -> bool:
37 | pass
38 |
--------------------------------------------------------------------------------
/frocket/datastore/datastore.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import abstractmethod, ABCMeta
16 | from dataclasses import dataclass
17 | from typing import List, Dict, Set, Optional, Union
18 | from frocket.common.tasks.base import TaskStatus, BaseTaskResult, TaskAttemptId, TaskStatusUpdate, BaseTaskRequest
19 | from frocket.common.dataset import DatasetInfo, DatasetPartsInfo, DatasetPartId, DatasetShortSchema, DatasetSchema
20 | from frocket.common.serializable import SerializableDataClass
21 |
22 | DEFAULT_QUEUE = 'default'
23 | DEFAULT_DEQUEUE_WAIT_TIME = 60
24 |
25 |
26 | # Used in PartSelectionMode.SELECTED_BY_WORKER
27 | @dataclass(frozen=True)
28 | class WorkerSelectedPart(SerializableDataClass):
29 | part_id: DatasetPartId
30 | random: bool
31 | task_attempt_no: int
32 |
33 |
34 | class Datastore(metaclass=ABCMeta):
35 | """
36 | Interface to the data store, which holds:
37 |
38 | * The list, metadata and schema of all registered datasets
39 | * For running jobs:
40 |       * Task statuses and results
41 |       * Atomic attempt counter for retried tasks
42 |       * For jobs running in mode PartSelectionMode.SELECTED_BY_WORKER, the manifest of available tasks to select from
43 |     * When the system is configured to use the 'work_queue' invoker (rather than 'aws_lambda'), the datastore also
44 |       provides the queue through which tasks are enqueued by the invoker and picked up by the workers, like a very
45 |       simplistic queue management system.
46 |
47 | The datastore is not for storing the actual dataset or other persistent large data.
48 | """
49 | @abstractmethod
50 | def write_dataset_info(self, dataset: DatasetInfo, parts: DatasetPartsInfo, schema: DatasetSchema) -> None:
51 | pass
52 |
53 | @abstractmethod
54 | def remove_dataset_info(self, name: str) -> bool:
55 | pass
56 |
57 | @abstractmethod
58 | def dataset_info(self, name: str) -> DatasetInfo:
59 | pass
60 |
61 | @abstractmethod
62 | def dataset_parts_info(self, ds: DatasetInfo) -> DatasetPartsInfo:
63 | pass
64 |
65 | @abstractmethod
66 | def schema(self, ds: DatasetInfo) -> DatasetSchema:
67 | pass
68 |
69 | @abstractmethod
70 | def short_schema(self, ds: DatasetInfo) -> DatasetShortSchema:
71 | pass
72 |
73 | @abstractmethod
74 | def last_used(self, ds: DatasetInfo) -> int:
75 | pass
76 |
77 | @abstractmethod
78 | def mark_used(self, ds: DatasetInfo):
79 | pass
80 |
81 | @abstractmethod
82 | def datasets(self) -> List[DatasetInfo]:
83 | pass
84 |
85 | @abstractmethod
86 | def enqueue(self, requests: List[BaseTaskRequest], queue: str = DEFAULT_QUEUE) -> None:
87 | pass
88 |
89 | @abstractmethod
90 | def dequeue(self, queue: str = DEFAULT_QUEUE, timeout: int = DEFAULT_DEQUEUE_WAIT_TIME) -> BaseTaskRequest:
91 | pass
92 |
93 | @abstractmethod
94 | def update_task_status(self, reqid: str,
95 | tasks: Union[TaskAttemptId, List[TaskAttemptId]], status: TaskStatus) -> None:
96 | pass
97 |
98 | @abstractmethod
99 | def tasks_status(self, reqid: str) -> Dict[TaskAttemptId, TaskStatusUpdate]:
100 | pass
101 |
102 | @abstractmethod
103 | def write_task_result(self, reqid: str, taskid: TaskAttemptId, result: BaseTaskResult) -> None:
104 | pass
105 |
106 | @abstractmethod
107 | def task_results(self, reqid: str) -> Dict[TaskAttemptId, BaseTaskResult]:
108 | pass
109 |
110 | @abstractmethod
111 | def increment_attempt(self, reqid: str, part_idx: int) -> int:
112 | pass
113 |
114 | @abstractmethod
115 | def publish_for_worker_selection(self, reqid: str, attempt_round: int, parts: Set[DatasetPartId]) -> None:
116 | pass
117 |
118 | @abstractmethod
119 | def self_select_part(self, reqid: str, attempt_round: int,
120 | candidates: Set[DatasetPartId] = None) -> Optional[WorkerSelectedPart]:
121 | pass
122 |
123 | @abstractmethod
124 | def cleanup_request_data(self, reqid: str) -> None:
125 | pass
126 |
--------------------------------------------------------------------------------
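To make the 'work_queue' flow described in the Datastore docstring above concrete, here is a minimal, hypothetical sketch (not actual Funnel Rocket code) of how an invoker and a worker might interact through a Datastore instance; the handle() function and the assumption that dequeue() returns None on timeout are for illustration only:

from typing import List
from frocket.common.tasks.base import BaseTaskRequest
from frocket.datastore.datastore import Datastore, DEFAULT_DEQUEUE_WAIT_TIME


def invoker_side(store: Datastore, requests: List[BaseTaskRequest]) -> None:
    store.enqueue(requests)  # Push all task requests to the default queue


def worker_loop(store: Datastore) -> None:
    while True:
        # Block up to the configured wait time; assumed (for this sketch) to return None if nothing arrived
        req = store.dequeue(timeout=DEFAULT_DEQUEUE_WAIT_TIME)
        if req is None:
            continue
        handle(req)  # Hypothetical task handler


def handle(req: BaseTaskRequest) -> None:
    print(f"Got task request: {req}")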
/frocket/datastore/registered_datastores.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 | from frocket.common.config import config
17 | from frocket.common.helpers.utils import memoize
18 | from frocket.datastore.datastore import Datastore
19 | from frocket.datastore.blobstore import Blobstore
20 | from frocket.datastore.redis_store import RedisStore
21 |
22 | logger = logging.getLogger(__name__)
23 |
24 | DATASTORE_CLASSES = {
25 | "redis": RedisStore,
26 | }
27 |
28 | BLOBSTORE_CLASSES = {
29 | "redis": RedisStore,
30 | }
31 |
32 |
33 | # TODO backlog consider thread-safety here: while RedisStore is thread-safe and having more than one is ok, future
34 | # implementations may not be (or should they be required to be?)
35 | def _get_store(store_kind: str, store_mapping: dict):
36 | store_class = store_mapping[config.get(store_kind).lower()]
37 | store = store_class(role=store_kind)
38 | logger.info(f"Initialized {store}")
39 | return store
40 |
41 |
42 | @memoize
43 | def get_datastore() -> Datastore:
44 | return _get_store("datastore", DATASTORE_CLASSES)
45 |
46 |
47 | @memoize
48 | def get_blobstore() -> Blobstore:
49 | return _get_store("blobstore", BLOBSTORE_CLASSES)
50 |
--------------------------------------------------------------------------------
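A brief usage sketch of the memoized factories above, assuming the 'datastore' and 'blobstore' config keys are set to a supported value (currently only "redis") and the store is reachable:

from frocket.datastore.registered_datastores import get_datastore, get_blobstore

store = get_datastore()            # First call initializes the store; the result is memoized
assert store is get_datastore()    # Subsequent calls return the exact same instance
blobs = get_blobstore()            # Same mechanism, initialized with role="blobstore"
for ds in store.datasets():        # List all registered datasets (see the Datastore interface)
    print(ds)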
/frocket/engine/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/frocket/engine/relation_to_pandas.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import Dict, Type, Callable, List, cast
16 | from frocket.common.validation.relation_parser import RBaseElement, RTextElement, RConditionBaseElement, ROperator
17 |
18 |
19 | def relation_to_pandas_query(elements: List[RBaseElement], column_prefix: str) -> str:
20 | """Convert the generic pasred representation of query.relation expression (as returned by QueryValidator or its
21 | helper class RelationParser) into a Pandas query string."""
22 |
23 | # Mapping of generic element type to a lambda function constructing the Pandas equivalent. Note below that not
24 |     # every concrete element type needs an entry here, as the code also looks up its superclasses
25 | etype_to_handler: Dict[Type[RBaseElement], Callable[[RBaseElement], str]] = {
26 | RTextElement: lambda v: v.text,
27 | RConditionBaseElement: lambda v: f"{column_prefix}{v.condition_id}",
28 | ROperator: lambda v: " & " if v.text in ["and", "&&"] else " | "
29 | }
30 |
31 | transformed = []
32 | for e in elements:
33 | func = None
34 | # Either there's a handler above for this element type, or go up the superclass chain to find one.
35 | class_and_supers = cast(List[Type[RBaseElement]], type(e).mro())
36 | for cls in class_and_supers:
37 | func = etype_to_handler.get(cls, None)
38 | if func:
39 | break
40 | if not func:
41 | raise Exception(f"{e} has no handler for any of its superclasses: {class_and_supers}")
42 | transformed.append(func(e))
43 | return "".join(transformed)
44 |
--------------------------------------------------------------------------------
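For intuition: a relation such as "(0 and 1) or 2" over three numbered conditions would be rendered into a Pandas query string similar to the one below, and can then be evaluated with DataFrame.query(). The column prefix "__cond_" is an illustrative assumption; the actual prefix is whatever the query engine passes in.

import pandas as pd

# Hypothetical output of relation_to_pandas_query() for "(0 and 1) or 2" with column_prefix="__cond_":
query_string = "(__cond_0 & __cond_1) | __cond_2"

df = pd.DataFrame({"__cond_0": [True, False, True],
                   "__cond_1": [True, True, False],
                   "__cond_2": [False, False, True]})
matching = df.query(query_string)  # Rows where the boolean relation holds
print(matching.index.tolist())     # -> [0, 2]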
/frocket/invoker/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/frocket/invoker/impl/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/frocket/invoker/impl/aws_lambda_invoker.py:
--------------------------------------------------------------------------------
1 | """
2 | Invoke tasks by invoking an AWS Lambda function asynchronously.
3 |
4 | This is a great feature of Lambda, which implicitly manages a queue of invocations for you with configurable retention
5 | (probably based on SQS). As long as the concurrent invocations limit in your account/burst limit of the AWS region are
6 | not reached, AWS will launch Lambdas for queued invocations immediately, with no meaningful delay. This also prevents
7 | getting rate-limited on momentary invocation spikes.
8 |
9 | A few important notes:
10 |
11 | 1. As noted in the setup guide, the retry count for the Lambda function *should be set to zero*, as it's the invoker's
12 | job to launch retries with slightly different arguments, based on its own configuration, with logic that is agnostic
13 | to whether the actual invoker is using Lambdas or anything else (which may not have Lambda's optional retry feature).
14 |
15 | 2. Unfortunately, there's no API for batch Lambda invocation, so we're invoking one by one using multiple threads -
16 | and still, the time to invoke all tasks can add up to 1-2 seconds or more.
17 | TODO backlog optimize! This also hurts caching, as not all tasks get a fair chance to pick a locally cached part.
18 |
19 | 3. The InvokeAsync() Lambda API is considered deprecated and replaced by the 'InvocationType' parameter in Invoke().
20 | However, the InvokeAsync API currently seems to take about half the time to return! Which one to use is configurable.
21 |
22 | TODO backlog stress-test queue limits till reaching rate limiting (status 429).
23 | TODO backlog for each invocation, add its actual invoke time as parameter
24 | (now we only measure time since invocation of all tasks started)
25 | """
26 | # Copyright 2021 The Funnel Rocket Maintainers
27 | #
28 | # Licensed under the Apache License, Version 2.0 (the "License");
29 | # you may not use this file except in compliance with the License.
30 | # You may obtain a copy of the License at
31 | #
32 | # http://www.apache.org/licenses/LICENSE-2.0
33 | #
34 | # Unless required by applicable law or agreed to in writing, software
35 | # distributed under the License is distributed on an "AS IS" BASIS,
36 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
37 | # See the License for the specific language governing permissions and
38 | # limitations under the License.
39 |
40 | import logging
41 | import time
42 | import concurrent.futures
43 | from typing import cast
44 | import boto3
45 | from botocore.client import BaseClient
46 | from botocore.config import Config
47 | from frocket.common.serializable import Envelope
48 | from frocket.common.tasks.base import BaseTaskRequest, BaseApiResult
49 | from frocket.invoker.impl.async_invoker import AsyncInvoker
50 | from frocket.common.config import config
51 |
52 | logger = logging.getLogger(__name__)
53 |
54 | DEBUG_PRINT_PAYLOADS = config.bool("invoker.lambda.debug.payload")
55 | LAMBDA_ASYNC_OK_STATUS = 202
56 |
57 |
58 | def _worker_task(req: BaseTaskRequest, client: BaseClient, lambda_name: str) -> BaseApiResult:
59 | """Run by the thread pool below."""
60 | # noinspection PyBroadException
61 | try:
62 | result = None
63 | json_payload = Envelope.seal_to_json(req) # Encodes the actual object and its type, for correct decoding later.
64 | if DEBUG_PRINT_PAYLOADS:
65 | logger.debug(json_payload)
66 |
67 | legacy_invoke_async = config.bool("invoker.lambda.legacy.async")
68 | status_field = 'Status' if legacy_invoke_async else 'StatusCode'
69 |
70 | if legacy_invoke_async:
71 | response = client.invoke_async(FunctionName=lambda_name, InvokeArgs=json_payload)
72 | else:
73 | response = client.invoke(FunctionName=lambda_name, InvocationType='Event', Payload=json_payload)
74 |
75 | if response[status_field] == LAMBDA_ASYNC_OK_STATUS:
76 | result = BaseApiResult(success=True, error_message=None)
77 | else:
78 | message = f"Response status differs from expected ({LAMBDA_ASYNC_OK_STATUS}): {response}"
79 | result = BaseApiResult(success=False, error_message=message)
80 | except Exception as e:
81 | result = BaseApiResult(success=False, error_message=f"Failed to invoke lambda function '{lambda_name}': {e}")
82 | return result
83 |
84 |
85 | class AwsLambdaInvoker(AsyncInvoker):
86 | def _enqueue(self, requests) -> None:
87 | lambda_name = config.get('invoker.lambda.name')
88 | num_threads = config.int('invoker.lambda.threads')
89 | boto_config = Config(**config.aws_config_dict(service='lambda'))
90 | client = boto3.client('lambda',
91 | **config.aws_client_settings(service='lambda'),
92 | config=boto_config)
93 | logger.debug(f"Invoking lambdas, name: {lambda_name}, no. of invocations: {len(requests)}"
94 | f", no. of invoker threads: {num_threads}")
95 | futures = []
96 | start_invoke_time = time.time()
97 | # TODO backlog consider lifecycle of the thread pool
98 | with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
99 | for req in requests:
100 | futures.append(executor.submit(_worker_task, req, client, lambda_name))
101 | futures = concurrent.futures.as_completed(futures) # Wait till all complete!
102 | executor.shutdown()
103 |
104 | error_message = None
105 | for f in futures:
106 | assert f.done()
107 | if f.cancelled():
108 | error_message = "Lambda invocation interrupted"
109 | elif f.exception():
110 | error_message = f"Invocation failed with error: {f.exception()}"
111 | else:
112 | result = f.result()
113 | if not result or type(result) is not BaseApiResult:
114 | error_message = f"Invocation returned with response: {result}"
115 | result = cast(BaseApiResult, result)
116 | if not result.success:
117 | error_message = result.error_message
118 | if error_message:
119 | break
120 |
121 | if error_message:
122 | logger.error(error_message)
123 | raise Exception(error_message)
124 | else:
125 | logger.info(f"Async invocation done in {time.time() - start_invoke_time:.3f}")
126 |
--------------------------------------------------------------------------------
/frocket/invoker/impl/registered_invokers.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 | from enum import Enum, auto
17 | from frocket.common.config import config
18 | from frocket.invoker.base_invoker import BaseInvoker
19 | from frocket.invoker.jobs.job import Job
20 | from frocket.invoker.impl.aws_lambda_invoker import AwsLambdaInvoker
21 | from frocket.invoker.impl.work_queue_invoker import WorkQueueInvoker
22 |
23 | logger = logging.getLogger(__name__)
24 |
25 |
26 | class InvocationType(Enum):
27 | WORK_QUEUE = auto()
28 | AWS_LAMBDA = auto()
29 |
30 |
31 | INVOKER_CLASSES = {
32 | InvocationType.WORK_QUEUE: WorkQueueInvoker,
33 | InvocationType.AWS_LAMBDA: AwsLambdaInvoker
34 | }
35 |
36 |
37 | def new_invoker(request_builder: Job) -> BaseInvoker:
38 | invoker_type = InvocationType[config.get("invoker").upper()]
39 | invoker_class = INVOKER_CLASSES[invoker_type]
40 | logger.info(f"Creating invoker type: {invoker_class.__name__}, for request builder type: {type(request_builder)}")
41 | return invoker_class(request_builder)
42 |
--------------------------------------------------------------------------------
/frocket/invoker/impl/work_queue_invoker.py:
--------------------------------------------------------------------------------
1 | """
2 | Invoke tasks by enqueueing them in the datastore. Not much to do here :-)
3 | """
4 | # Copyright 2021 The Funnel Rocket Maintainers
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from frocket.invoker.impl.async_invoker import AsyncInvoker
19 |
20 |
21 | class WorkQueueInvoker(AsyncInvoker):
22 | def _enqueue(self, requests) -> None:
23 | self._datastore.enqueue(requests)
24 |
--------------------------------------------------------------------------------
/frocket/invoker/invoker_api.py:
--------------------------------------------------------------------------------
1 | """
2 | This is the "Funnel Rocket" frontend API - wrappedby the CLI & API server, and may be embeddable in other apps.
3 | Clients are not expected to bypass this API (call the datastore directly, initialize an invoker, etc.)
4 | """
5 | # Copyright 2021 The Funnel Rocket Maintainers
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | import concurrent.futures
20 | import logging
21 | import time
22 | from typing import List, Optional, cast, Union
23 | from frocket.common.config import config
24 | from frocket.common.dataset import DatasetInfo, DatasetShortSchema, DatasetSchema, DatasetPartsInfo
25 | from frocket.common.tasks.registration import RegistrationJobResult, RegisterArgs, UnregisterApiResult
26 | from frocket.common.tasks.query import QueryJobResult
27 | from frocket.common.tasks.async_tracker import AsyncJobTracker, AsyncJobStatusUpdater
28 | from frocket.common.validation.query_validator import QueryValidator
29 | from frocket.common.validation.result import QueryValidationResult
30 | from frocket.datastore.registered_datastores import get_datastore
31 | from frocket.invoker.jobs.query_job import QueryJob
32 | from frocket.invoker.jobs.registration_job import RegistrationJob
33 | from frocket.invoker.impl.registered_invokers import new_invoker
34 |
35 | logger = logging.getLogger(__name__)
36 | executor = concurrent.futures.ThreadPoolExecutor()
37 |
38 | # TODO backlog allow configurable timeout per job type (async or not)
39 | ASYNC_MAX_WAIT = config.int("invoker.run.timeout") * 1.1 # Adding a bit of grace around the invoker
40 |
41 |
42 | def _unregister_safety_interval() -> int:
43 | """How long after a dataset was last used to block unregister (can be set to zero, or overidden with force=True)."""
44 | interval = config.get('unregister.last.used.interval', None)
45 | if not interval: # Not defined, or empty string (explicit '0' is truthy)
46 | interval = config.int('invoker.run.timeout') * 2
47 | else:
48 | interval = int(interval)
49 | return interval
50 |
51 |
52 | def register_dataset(args: RegisterArgs) -> RegistrationJobResult:
53 | request_builder = RegistrationJob(args)
54 | invoker = new_invoker(request_builder)
55 | result = cast(RegistrationJobResult, invoker.run())
56 | logger.info(f"Registration {'successful' if result.success else f'failed! {result.error_message}'}")
57 | return result
58 |
59 |
60 | def register_dataset_async(args: RegisterArgs, set_max_wait: bool = True) -> AsyncJobTracker:
61 | """The async version starts the invoker in a separate thread and then returns, handing back
62 | an AsyncJobTracker to poll for progress/completion."""
63 | def worker(register_args, async_status):
64 | invoker = new_invoker(RegistrationJob(register_args))
65 | return invoker.run(async_status)
66 |
67 | async_status = AsyncJobStatusUpdater(max_wait=(ASYNC_MAX_WAIT if set_max_wait else None))
68 | executor.submit(worker, args, async_status)
69 | logger.info(f"Submitted async registration for dataset named {args.name} in basepath {args.basepath}")
70 | return async_status
71 |
72 |
73 | def get_dataset(name: str, throw_if_missing: bool = False) -> Optional[DatasetInfo]:
74 | dataset = get_datastore().dataset_info(name)
75 | if not dataset and throw_if_missing:
76 | raise Exception(f"Dataset '{name}' not found")
77 | return dataset
78 |
79 |
80 | def get_dataset_schema(dataset: DatasetInfo, full: bool = False) -> Union[DatasetSchema, DatasetShortSchema]:
81 | return get_datastore().schema(dataset) if full else get_datastore().short_schema(dataset)
82 |
83 |
84 | def get_dataset_parts(dataset: DatasetInfo) -> DatasetPartsInfo:
85 | return get_datastore().dataset_parts_info(dataset)
86 |
87 |
88 | def unregister_dataset(name: str, force: bool = False) -> UnregisterApiResult:
89 | dataset = get_dataset(name=name)
90 | if not dataset:
91 | return UnregisterApiResult(success=True, error_message=None,
92 | dataset_found=False, dataset_last_used=None)
93 |
94 | datastore = get_datastore()
95 | last_used = datastore.last_used(dataset)
96 | if last_used:
97 | time_since_used = int(time.time() - last_used)
98 | safety_interval = _unregister_safety_interval()
99 | message = f"Dataset was last used {time_since_used} seconds ago, which is less than safety interval " \
100 | f"{safety_interval}. Use the 'force' parameter to unregister anyway."
101 | if safety_interval > time_since_used and not force:
102 | return UnregisterApiResult(success=False, error_message=message,
103 | dataset_found=True, dataset_last_used=last_used)
104 |
105 | get_datastore().remove_dataset_info(name)
106 | return UnregisterApiResult(success=True, error_message=None,
107 | dataset_found=True, dataset_last_used=last_used)
108 |
109 |
110 | def expand_and_validate_query(dataset: DatasetInfo, query: dict) -> QueryValidationResult:
111 | short_schema = get_dataset_schema(dataset)
112 | return QueryValidator(query, dataset, short_schema).expand_and_validate()
113 |
114 |
115 | def _build_query_job(dataset: DatasetInfo,
116 | query: dict,
117 | validation_result: QueryValidationResult) -> QueryJob:
118 | """If the query was already validated, skip re-validating."""
119 | if validation_result:
120 | assert validation_result.success
121 | assert query in [validation_result.source_query, validation_result.expanded_query]
122 | else:
123 | validation_result = expand_and_validate_query(dataset, query)
124 | if not validation_result.success:
125 | raise Exception(f"Query validation failed: {validation_result.error_message}")
126 |
127 | get_datastore().mark_used(dataset)
128 | dataset_parts = get_datastore().dataset_parts_info(dataset)
129 | short_schema = get_datastore().short_schema(dataset)
130 | return QueryJob(dataset, dataset_parts, short_schema,
131 | validation_result.expanded_query, validation_result.used_columns)
132 |
133 |
134 | def run_query(dataset: DatasetInfo,
135 | query: dict,
136 | validation_result: QueryValidationResult = None) -> QueryJobResult:
137 | job_builder = _build_query_job(dataset, query, validation_result)
138 | invoker = new_invoker(job_builder)
139 | result = cast(QueryJobResult, invoker.run())
140 | if result.success:
141 | logger.info("Query completed successfully")
142 | else:
143 | logger.error(f"Query failed with message: {result.error_message}")
144 | return result
145 |
146 |
147 | def run_query_async(dataset: DatasetInfo,
148 | query: dict,
149 | set_max_wait: bool = True,
150 | validation_result: QueryValidationResult = None) -> AsyncJobTracker:
151 | """The async version starts the invoker in a separate thread and then returns, handing back
152 | an AsyncJobTracker to poll for progress/completion."""
153 | def worker(job_builder, async_status):
154 | invoker = new_invoker(job_builder)
155 | return invoker.run(async_status)
156 |
157 | job_builder = _build_query_job(dataset, query, validation_result)
158 | async_status = AsyncJobStatusUpdater(max_wait=(ASYNC_MAX_WAIT if set_max_wait else None))
159 | executor.submit(worker, job_builder, async_status)
160 | logger.info(f"Submitted async query for dataset '{dataset.id.name}'")
161 | return async_status
162 |
163 |
164 | def list_datasets() -> List[DatasetInfo]:
165 | datasets = get_datastore().datasets()
166 | return datasets
167 |
--------------------------------------------------------------------------------
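A hypothetical end-to-end usage sketch of this API. The exact fields of RegisterArgs beyond name and basepath, and the query contents, are assumptions; a real query must conform to frocket/resources/query_schema.json.

from frocket.invoker import invoker_api
from frocket.common.tasks.registration import RegisterArgs

args = RegisterArgs(name="mydataset", basepath="s3://my-bucket/mydataset/")  # Other fields may be required
reg_result = invoker_api.register_dataset(args)
if reg_result.success:
    dataset = invoker_api.get_dataset("mydataset", throw_if_missing=True)
    query = {}  # Placeholder only; see the query schema for real contents
    validation = invoker_api.expand_and_validate_query(dataset, query)
    if validation.success:
        job_result = invoker_api.run_query(dataset, query, validation)
        print(job_result.success, job_result.error_message)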
/frocket/invoker/jobs/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/frocket/invoker/jobs/job.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import abstractmethod, ABCMeta
16 | from typing import List, Optional, Set
17 | from frocket.common.dataset import DatasetPartId, DatasetPartsInfo
18 | from frocket.common.metrics import LabelsDict
19 | from frocket.common.tasks.base import BaseTaskRequest, BaseTaskResult, BaseJobResult, JobStatus, ErrorMessage
20 | from frocket.common.tasks.async_tracker import AsyncJobStatusUpdater
21 |
22 |
23 | class Job(metaclass=ABCMeta):
24 | """
25 |     For each job type (registration, query, and future ones) there is a concrete subclass.
26 |     That concrete class is handed to the invoker object, which is agnostic to the job details but calls the
27 |     job's methods in a set order.
28 |
29 | The flow, at high level:
30 |     1. On prerun(), the job validates its arguments (and can fail by returning an error message) and can prepare data
31 | for building tasks.
32 |
33 | 2. When build_tasks() is called by the invoker - return a list of concrete task request objects,
34 | all with attempt no. 0.
35 |
36 | 3. If the job supports task self-selection by workers, it should override dataset_parts_to_publish() and
37 |     return the set of parts to be consumed by workers (workers would try to select parts they have cached locally).
38 |     This set is published to the datastore before tasks are invoked.
39 |
40 |     4. In case the invoker decides to retry a task, it calls build_retry_task() to create a specific retry task.
41 |
42 | 5. After all tasks have completed, either successfully or not, complete() is called to run any validations on the
43 | final results of all tasks, and perform any needed aggregations. The job may fail at this stage if the results of
44 | tasks, taken together, are invalid.
45 |
46 | 6. Lastly, build_result() is called to construct the final job result.
47 | At this stage, the final success status of the job should not change.
48 | """
49 | _request_id = None
50 | _labels = {}
51 |
52 | @property
53 | def request_id(self) -> Optional[str]:
54 | return self._request_id
55 |
56 | @request_id.setter
57 | def request_id(self, request_id: str):
58 | self._request_id = request_id
59 |
60 | def prerun(self, async_updater: AsyncJobStatusUpdater = None) -> Optional[ErrorMessage]:
61 | pass
62 |
63 | @abstractmethod
64 | def build_tasks(self) -> List[BaseTaskRequest]:
65 | pass
66 |
67 | def dataset_parts_to_publish(self) -> Optional[Set[DatasetPartId]]:
68 | return None
69 |
70 | @abstractmethod
71 | def total_tasks(self) -> int:
72 | pass
73 |
74 | @abstractmethod
75 | def build_retry_task(self, attempt_no: int, task_index: int) -> BaseTaskRequest:
76 | pass
77 |
78 | def complete(self,
79 | tasks_final_status: JobStatus,
80 | latest_task_results: List[BaseTaskResult],
81 | async_updater: AsyncJobStatusUpdater = None) -> JobStatus:
82 | return tasks_final_status
83 |
84 | @abstractmethod
85 | def build_result(self,
86 | base_attributes: dict,
87 | final_status: JobStatus,
88 | latest_task_results: List[BaseTaskResult]) -> BaseJobResult:
89 | pass
90 |
91 | @property
92 | def metric_labels(self) -> LabelsDict:
93 | return self._labels
94 |
95 | @abstractmethod
96 | def parts_info(self) -> Optional[DatasetPartsInfo]:
97 | pass
98 |
--------------------------------------------------------------------------------
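To illustrate the lifecycle above, here is a minimal hypothetical skeleton of a concrete Job subclass. It is not taken from the codebase; a real job constructs concrete request/result types, as QueryJob and RegistrationJob do.

from typing import List, Optional
from frocket.common.tasks.base import BaseTaskRequest, BaseTaskResult, BaseJobResult, JobStatus, ErrorMessage
from frocket.invoker.jobs.job import Job


class MinimalJob(Job):
    def prerun(self, async_updater=None) -> Optional[ErrorMessage]:
        return None  # No validation error; a job may also prepare data for build_tasks() here

    def total_tasks(self) -> int:
        return 1

    def build_tasks(self) -> List[BaseTaskRequest]:
        return [self._make_request(attempt_no=0)]  # All initial tasks start at attempt no. 0

    def build_retry_task(self, attempt_no: int, task_index: int) -> BaseTaskRequest:
        return self._make_request(attempt_no=attempt_no)

    def _make_request(self, attempt_no: int) -> BaseTaskRequest:
        raise NotImplementedError  # A real job constructs its concrete request type here

    def build_result(self, base_attributes: dict, final_status: JobStatus,
                     latest_task_results: List[BaseTaskResult]) -> BaseJobResult:
        return BaseJobResult(**base_attributes)  # Assumes base_attributes covers all required fields

    def parts_info(self):
        return None  # This illustrative job is not based on dataset parts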
/frocket/invoker/jobs/query_job.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import time
16 | from typing import List, cast
17 | from frocket.common.config import config
18 | from frocket.common.dataset import DatasetInfo, DatasetPartId, DatasetPartsInfo, DatasetShortSchema
19 | from frocket.common.metrics import JobTypeLabel, DATASET_LABEL
20 | from frocket.common.tasks.query import PartSelectionMode, QueryTaskRequest, QueryTaskResult, QueryJobResult, QueryResult
21 | from frocket.invoker.jobs.job import Job
22 |
23 |
24 | class QueryJob(Job):
25 | def __init__(self, dataset: DatasetInfo, parts: DatasetPartsInfo,
26 | short_schema: DatasetShortSchema, query: dict, used_columns: List[str],
27 | worker_can_select_part: bool = None):
28 | self._dataset = dataset
29 | self._parts = parts
30 | self._query = query
31 | self._used_columns = used_columns
32 | self._paths = parts.fullpaths(parent=dataset)
33 | self._worker_can_select_part = worker_can_select_part \
34 | if worker_can_select_part is not None else config.bool('worker.self.select.enabled')
35 | if config.bool('dataset.categorical.potential.use'):
36 | self._load_as_categoricals = short_schema.potential_categoricals
37 | else:
38 | self._load_as_categoricals = None
39 | self._labels = {
40 | JobTypeLabel.QUERY.label_name: JobTypeLabel.QUERY.label_value,
41 | DATASET_LABEL: self._dataset.id.name
42 | }
43 |
44 | def parts_info(self):
45 | return self._parts
46 |
47 | def total_tasks(self):
48 | return len(self._paths)
49 |
50 | def build_tasks(self):
51 | if self._worker_can_select_part:
52 | mode = PartSelectionMode.SELECTED_BY_WORKER
53 | else:
54 | mode = PartSelectionMode.SET_BY_INVOKER
55 |
56 | requests = [self._build_task(mode, i) for i in range(self.total_tasks())]
57 | return requests
58 |
59 | def dataset_parts_to_publish(self):
60 | if self._worker_can_select_part:
61 | parts_to_publish = {DatasetPartId(self._dataset.id, path, part_index)
62 | for part_index, path in enumerate(self._paths)}
63 | return parts_to_publish
64 | else:
65 | return None
66 |
67 | def build_retry_task(self, attempt_no, task_index):
68 | return self._build_task(PartSelectionMode.SET_BY_INVOKER,
69 | part_index=task_index,
70 | attempt_no=attempt_no)
71 |
72 | def _build_task(self, mode: PartSelectionMode, part_index: int, attempt_no: int = 0) -> QueryTaskRequest:
73 | if mode == PartSelectionMode.SET_BY_INVOKER:
74 | invoker_set_part = DatasetPartId(dataset_id=self._dataset.id,
75 | path=self._paths[part_index],
76 | part_idx=part_index)
77 | task_index = part_index
78 | elif mode == PartSelectionMode.SELECTED_BY_WORKER:
79 | assert attempt_no == 0
80 | invoker_set_part = None
81 | task_index = None
82 | else:
83 | raise Exception("Unknown mode {mode}")
84 |
85 | request = QueryTaskRequest(
86 | request_id=self._request_id,
87 | invoke_time=time.time(),
88 | dataset=self._dataset,
89 | load_as_categoricals=self._load_as_categoricals,
90 | query=self._query,
91 | invoker_set_task_index=task_index,
92 | attempt_no=attempt_no,
93 | mode=mode,
94 | invoker_set_part=invoker_set_part,
95 | used_columns=self._used_columns)
96 | return request
97 |
98 | def build_result(self, base_attributes, final_status, latest_task_results):
99 | aggregated_query_result = None
100 | # Only if query was successful, aggregate query results (for each task - from a single successful attempt)
101 | if final_status.success:
102 | latest_task_results = cast(List[QueryTaskResult], latest_task_results)
103 | query_results = [task_result.query_result for task_result in latest_task_results]
104 | aggregated_query_result = cast(QueryResult,
105 | QueryResult.reduce(query_results))
106 |
107 | result = QueryJobResult(
108 | **base_attributes,
109 | query=aggregated_query_result.query if aggregated_query_result else None,
110 | funnel=aggregated_query_result.funnel if aggregated_query_result else None
111 | )
112 | return result
113 |
--------------------------------------------------------------------------------
/frocket/invoker/metrics_frame.py:
--------------------------------------------------------------------------------
1 | """
2 | Transform a given list of metrics from multiple sources (invoker, workers) into one DataFrame, for easy analysis.
3 | Export the data into file and/or Prometheus, by configuration.
4 | """
5 | # Copyright 2021 The Funnel Rocket Maintainers
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | import logging
20 | from typing import List, Dict, Union
21 | import pandas as pd
22 | from pandas import DataFrame
23 | from frocket.common.config import config
24 | from frocket.common.metrics import SourceAndMetricTuple, ALL_LABEL_NAMES
25 | from frocket.invoker import prom_adapter
26 |
27 | logger = logging.getLogger(__name__)
28 |
29 | METRIC_SOURCE_COLUMN = 'source'
30 | METRIC_NAME_COLUMN = 'metric'
31 | METRIC_VALUE_COLUMN = 'value'
32 |
33 | PANDAS_FLOAT_FORMAT = '{:.5f}' # No pesky scientific notation ;-)
34 | pd.options.display.float_format = PANDAS_FLOAT_FORMAT.format
35 |
36 | # 'Last run' file, if defined, stores the most recent job's metrics as a file in CSV or Parquet format (by extension)
37 | EXPORT_LASTRUN_FILE = config.get('metrics.export.lastrun', None)
38 | EXPORT_TO_PROMETHEUS = config.bool('metrics.export.prometheus')
39 |
40 | if EXPORT_TO_PROMETHEUS:
41 | prom_adapter.init_prom_metrics()
42 |
43 |
44 | class MetricsFrame:
45 | def __init__(self, source_and_metrics: List[SourceAndMetricTuple]):
46 | self._sources = [ms.source for ms in source_and_metrics]
47 | self._metrics = [ms.metric for ms in source_and_metrics]
48 | self._build_df()
49 |
50 | def _build_df(self):
51 | """
52 |         Build the DataFrame: each row is one reported metric, but a DataFrame is constructed from columns.
53 |         Hence, we're building columns here rather than rows.
54 | """
55 | metric_source_column = self._sources
56 | metric_name_column = [m.name.name for m in self._metrics] # Metric names column
57 | metric_value_column = [m.value for m in self._metrics] # Metric values column
58 |
59 | # Init empty columns for all possible label names.
60 |         # Cells not filled in (see below) will remain empty (and possibly even entire columns)
61 | label_columns: Dict[str, List[Union[str, None]]] = {}
62 | for label_name in ALL_LABEL_NAMES:
63 | label_columns[label_name] = [None] * len(self._metrics)
64 |
65 | # Fill labels columns with what labels are actually set per metric
66 | for i, metric in enumerate(self._metrics):
67 | for label_name, label_value in metric.labels.items():
68 | label_columns[label_name][i] = label_value
69 |
70 | df_columns = {METRIC_SOURCE_COLUMN: metric_source_column,
71 | METRIC_NAME_COLUMN: metric_name_column,
72 | METRIC_VALUE_COLUMN: metric_value_column,
73 | **label_columns}
74 | self._df = pd.DataFrame(data=df_columns)
75 | # logger.debug(f"Types: {self._df.dtypes.index.tolist()}, data:\n{self._df}") # If needed
76 |
77 | def export(self) -> None:
78 | if EXPORT_LASTRUN_FILE:
79 | self._to_lastrun_file(EXPORT_LASTRUN_FILE)
80 | if EXPORT_TO_PROMETHEUS:
81 | self._to_prometheus()
82 |
83 | def _to_prometheus(self) -> None:
84 | prom_adapter.update(self._metrics)
85 |
86 | def _to_lastrun_file(self, filename: str) -> None:
87 | if filename.lower().endswith('.parquet'):
88 | self._df.to_parquet(filename, index=False)
89 | else:
90 | self._df.to_csv(filename, float_format=PANDAS_FLOAT_FORMAT, index=False)
91 |
92 | @property
93 | def dataframe(self) -> DataFrame:
94 | return self._df
95 |
--------------------------------------------------------------------------------
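For intuition, the resulting DataFrame is in 'long' format: one row per reported metric, with the fixed source/metric/value columns plus one column per possible label, left empty where a label doesn't apply. A hypothetical miniature (the metric names and the label column shown are illustrative):

import pandas as pd

df = pd.DataFrame({
    "source": ["invoker", "task-0-0", "task-0-0"],
    "metric": ["INVOKER_TOTAL_SECONDS", "TASK_TOTAL_RUN_SECONDS", "SCANNED_ROWS"],
    "value":  [2.31, 0.84, 120000.0],
    "dataset": [None, "mydataset", "mydataset"],  # An example label column; None where not set
})
print(df)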
/frocket/invoker/prom_adapter.py:
--------------------------------------------------------------------------------
1 | """
2 | While metrics support in Funnel Rocket is built with Prometheus (or more generally OpenMetrics) in mind,
3 | all Prometheus-specific code is in this module.
4 |
5 | TODO backlog support help string (documentation) per each member in MetricName enum
6 | """
7 | # Copyright 2021 The Funnel Rocket Maintainers
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License");
10 | # you may not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | # http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 |
21 | from typing import List, Dict, Type
22 | from prometheus_client import Counter, Histogram
23 | from prometheus_client.metrics import MetricWrapperBase
24 | from frocket.common.config import config
25 | from frocket.common.helpers.utils import memoize
26 | from frocket.common.metrics import MetricName, MeasuredUnit, supported_label_names, MetricData, empty_label_names
27 |
28 | prom_counters: Dict[MetricName, Counter] = {}
29 | prom_histograms: Dict[MetricName, Histogram] = {}
30 |
31 |
32 | @memoize
33 | def buckets_by_unit(unit: MeasuredUnit) -> List[float]:
34 | """Each unit (seconds, bytes, dollars) may have its own buckets configured, or fallback to the default."""
35 | assert unit is not MeasuredUnit.COUNT # COUNT should not use a histogram
36 | buckets_string = config.get_with_fallbacks(f'metrics.buckets.{unit.name.lower()}', 'metrics.buckets.default')
37 | buckets = [float(b) for b in buckets_string.split(',')]
38 | return buckets
39 |
40 |
41 | def unit_to_metric_type(unit: MeasuredUnit) -> Type[MetricWrapperBase]:
42 | """The type of Prometheus metric is automatically derived from the type of measured unit."""
43 | if unit is MeasuredUnit.COUNT:
44 | return Counter
45 | else:
46 | return Histogram
47 |
48 |
49 | def init_prom_metrics():
50 | """In Prometheus clients, all metrics should be defined only once before use, along with their possible labels.
51 | This is not a technical limitation of Prometheus itself, but rather enforced by official clients."""
52 | for e in MetricName:
53 | base_args = {'name': e.name.lower(),
54 | 'documentation': e.name,
55 | 'labelnames': supported_label_names(e)}
56 |         metric_type = unit_to_metric_type(e.unit)
57 | if metric_type == Counter:
58 | prom_counters[e] = Counter(**base_args)
59 | elif metric_type == Histogram:
60 | prom_histograms[e] = Histogram(**base_args, buckets=buckets_by_unit(e.unit))
61 |
62 |
63 | def update(metrics: List[MetricData]):
64 | """Update (increment/observe) new values after a job completes, etc."""
65 | for md in metrics:
66 | empty_labels = empty_label_names(md.name)
67 | all_labels = {**empty_labels, **md.labels}
68 | metric_type = unit_to_metric_type(md.name.unit)
69 | if metric_type == Counter:
70 | prom_counters[md.name].labels(**all_labels).inc(md.value)
71 | elif metric_type == Histogram:
72 | prom_histograms[md.name].labels(**all_labels).observe(md.value)
73 |
--------------------------------------------------------------------------------
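The 'define once, update many times' pattern enforced here can be illustrated directly with prometheus_client. The metric names below are illustrative, not the actual metrics registered by init_prom_metrics():

from prometheus_client import Counter, Histogram

# Define each metric once, with all label names it may ever use:
rows_total = Counter("example_scanned_rows", "Example counter", labelnames=["dataset", "success"])
run_seconds = Histogram("example_run_seconds", "Example histogram", buckets=[0.1, 0.5, 1, 5])

# Then update freely as jobs complete:
rows_total.labels(dataset="mydataset", success="True").inc(120000)
run_seconds.observe(0.84)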
/frocket/invoker/stats_builder.py:
--------------------------------------------------------------------------------
1 | """
2 | Build JobStats (returned to the client after job completion) - based mostly on the DataFrame of collected metrics from
3 | the invoker and all workers.
4 | """
5 | # Copyright 2021 The Funnel Rocket Maintainers
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | import logging
20 | import sys
21 | from typing import Optional, Union, List, Dict
22 | import pandas
23 | import numpy as np
24 | from pandas import DataFrame
25 | from frocket.common.config import config
26 | from frocket.common.dataset import DatasetPartsInfo, PartNamingMethod
27 | from frocket.common.tasks.base import JobStats, JobDatasetStats, JobInvokerStats, TimingStats, JobWorkerStats
28 | from frocket.invoker.metrics_frame import MetricsFrame, METRIC_NAME_COLUMN, METRIC_VALUE_COLUMN, METRIC_SOURCE_COLUMN
29 | from frocket.common.metrics import MetricName, ComponentLabel, SUCCESS_LABEL, MetricLabelEnum, \
30 | WorkerStartupLabel, LoadFromLabel
31 |
32 | logger = logging.getLogger(__name__)
33 |
34 | TASK_COMPLETION_GRANULARITY_SECONDS = 0.25 # Data series of task success over time is measured in this resolution
35 | TIMING_PERCENTILES = [float(pct) for pct in config.get('stats.timing.percentiles').split(',')]
36 | MIN_METRICS_FOR_PERCENTILES = 20 # Below this sample count, don't return percentiles
37 | MIN_METRICS_FOR_99_PERCENTILE = 100 # Below this count, don't return 99th percentile
38 | # List of keys to pull from Pandas' describe()
39 | TIMING_DESCRIBE_KEYS = ['min', 'mean', 'max'] + [f"{int(pct*100)}%" for pct in TIMING_PERCENTILES]
40 |
41 |
42 | def build_stats(frame: MetricsFrame, parts_info: DatasetPartsInfo = None) -> JobStats:
43 | df = frame.dataframe
44 | if df is None: # In job failure cases
45 | return JobStats()
46 |
47 | if parts_info:
48 | ds_stats = JobDatasetStats(total_size=parts_info.total_size, parts=parts_info.total_parts)
49 | else:
50 | ds_stats = None
51 |
52 | # Invoker stats
53 | all_task_rows_df = _filter_by_label(df, ComponentLabel.WORKER)
54 | successful_task_rows_df = _filter_by_success(all_task_rows_df)
55 | total_tasks = _count_tasks(all_task_rows_df)
56 | failed_tasks = total_tasks - _count_tasks(successful_task_rows_df)
57 |
58 | invoker_stats = JobInvokerStats(
59 | enqueue_time=_sum_value(df, MetricName.ASYNC_ENQUEUE_SECONDS, single_value=True),
60 | poll_time=_sum_value(df, MetricName.ASYNC_POLL_SECONDS, single_value=True),
61 | total_tasks=total_tasks,
62 | failed_tasks=failed_tasks,
63 | task_success_over_time=_task_success_over_time(successful_task_rows_df)
64 | # TODO backlog add: lost_task_retries as counted by the invoker; support sync. invokers?
65 | )
66 |
67 | # Worker stats
68 | worker_stats = JobWorkerStats(
69 | cold_tasks=_count_tasks(_filter_by_label(successful_task_rows_df, WorkerStartupLabel.COLD)),
70 | warm_tasks=_count_tasks(_filter_by_label(successful_task_rows_df, WorkerStartupLabel.WARM)),
71 | scanned_rows=_sum_value(successful_task_rows_df, MetricName.SCANNED_ROWS, as_int=True),
72 | scanned_groups=_sum_value(successful_task_rows_df, MetricName.SCANNED_GROUPS, as_int=True),
73 | cache=_cache_performance(successful_task_rows_df),
74 | invoke_latency=_timing_stats(successful_task_rows_df, MetricName.INVOKE_TO_RUN_SECONDS),
75 | load_time=_timing_stats(successful_task_rows_df, MetricName.TASK_TOTAL_LOAD_SECONDS),
76 | total_time=_timing_stats(successful_task_rows_df, MetricName.TASK_TOTAL_RUN_SECONDS)
77 | # TODO backlog add: loaded_column_types - mapping of column type to count, which affects load time
78 | )
79 |
80 | job_stats = JobStats(
81 | total_time=_sum_value(df, MetricName.INVOKER_TOTAL_SECONDS, single_value=True),
82 | cost=_total_cost(df),
83 | dataset=ds_stats,
84 | invoker=invoker_stats,
85 | worker=worker_stats)
86 | return job_stats
87 |
88 |
89 | def _task_success_over_time(task_rows_df: DataFrame) -> Dict[float, int]:
90 | """Return a sparse series of data points - for each time slot (e.g. 0.25 secs) since the job started, return how
91 | many tasks completed successfully in that slot. Non-cumulative, does not include zeros."""
92 | task_duration_rows = _filter_by_metrics(
93 | task_rows_df, metrics=[MetricName.INVOKE_TO_RUN_SECONDS, MetricName.TASK_TOTAL_RUN_SECONDS])
94 | task_durations = task_duration_rows.groupby(METRIC_SOURCE_COLUMN)[METRIC_VALUE_COLUMN].sum()
95 | quantized_task_durations = \
96 | np.ceil(task_durations / TASK_COMPLETION_GRANULARITY_SECONDS) * TASK_COMPLETION_GRANULARITY_SECONDS
97 | return quantized_task_durations.value_counts().sort_index().to_dict()
98 |
99 |
100 | def _cache_performance(task_rows_df: DataFrame) -> Dict[str, int]:
101 | return {
102 | # Note the 'source' is always the case for locally-loaded files, in which case caching is N/A.
103 | 'source': _count_tasks(_filter_by_label(task_rows_df, LoadFromLabel.SOURCE)),
104 | 'diskCache': _count_tasks(_filter_by_label(task_rows_df, LoadFromLabel.DISK_CACHE))
105 | }
106 |
107 |
108 | def _sum_value(df: DataFrame, metric: MetricName,
109 | single_value: bool = False,
110 | as_int: bool = False) -> Union[float, int, None]:
111 | df = _filter_by_metrics(df, metric)
112 | if single_value:
113 | assert len(df) <= 1
114 | if df.empty:
115 | return None
116 | else:
117 | values_sum = df[METRIC_VALUE_COLUMN].sum()
118 | return int(values_sum) if as_int else float(values_sum)
119 |
120 |
121 | def _count(df: DataFrame, metric: MetricName) -> int:
122 | return _filter_by_metrics(df, metric)[METRIC_VALUE_COLUMN].count()
123 |
124 |
125 | def _timing_stats(task_rows_df: DataFrame, metric: MetricName) -> TimingStats:
126 | values_df = _filter_by_metrics(task_rows_df, metric)[METRIC_VALUE_COLUMN]
127 | if len(values_df) < MIN_METRICS_FOR_PERCENTILES:
128 | percentiles = [0.5]
129 | else:
130 | percentiles = TIMING_PERCENTILES
131 | if len(values_df) < MIN_METRICS_FOR_99_PERCENTILE:
132 | percentiles = [pct for pct in percentiles if pct < 0.99]
133 |
134 | raw_stats = values_df.describe(percentiles=percentiles).to_dict()
135 | return {k: v for k, v in raw_stats.items()
136 | if k in TIMING_DESCRIBE_KEYS and not np.isnan(v)}
137 |
138 |
139 | def _filter_by_metrics(df: DataFrame, metrics: Union[MetricName, List[MetricName]]) -> DataFrame:
140 | if type(metrics) is MetricName:
141 | return df[df[METRIC_NAME_COLUMN] == metrics.name]
142 | else:
143 | return df[df[METRIC_NAME_COLUMN].isin([m.name for m in metrics])]
144 |
145 |
146 | def _filter_by_label(df: DataFrame, label: MetricLabelEnum) -> DataFrame:
147 | return df[df[label.label_name] == label.label_value.lower()]
148 |
149 |
150 | def _filter_by_success(df: DataFrame, value: bool = True) -> DataFrame:
151 | return df[df[SUCCESS_LABEL] == str(value)]
152 |
153 |
154 | def _count_tasks(task_rows_df: DataFrame) -> int:
155 | """Each task attempt (e.g. task index 117, attempt 2) has a unique name in the source column, which ofc appears in
156 | multiple rows. This count the unique count of task attempt IDs in the given DF."""
157 | return task_rows_df[METRIC_SOURCE_COLUMN].nunique()
158 |
159 |
160 | def _total_cost(df: DataFrame) -> Optional[float]:
161 | cost_reports_df = _filter_by_metrics(df, MetricName.COST_DOLLARS)
162 | num_reports = len(cost_reports_df)
163 | if num_reports == 0:
164 | logger.debug(f"Total cost: no metrics found")
165 | return None
166 | else:
167 | total_cost = float(cost_reports_df[METRIC_VALUE_COLUMN].sum())
168 | logger.debug(f"Total cost: ${total_cost:.6f} (sum of {num_reports} metric reports)")
169 | return total_cost
170 |
171 |
172 | # Stand-alone testing
173 | if __name__ == "__main__":
174 | config.init_logging(force_level=logging.DEBUG, force_console_output=True)
175 | filename = config.get('metrics.export.lastrun', None)
176 | if not filename:
177 | sys.exit('No lastrun file defined')
178 |
179 | df = pandas.read_parquet(filename)
180 | dummy_frame = MetricsFrame([])
181 | dummy_frame._df = df
182 | dummy_parts_info = DatasetPartsInfo(naming_method=PartNamingMethod.LIST, total_parts=4, total_size=1024)
183 | build_stats(dummy_frame, dummy_parts_info)
184 |
--------------------------------------------------------------------------------
/frocket/worker/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/frocket/worker/impl/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/frocket/worker/impl/aws_lambda_metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | Calculate physical memory & cost for AWS Lambda-based workers.
3 |
4 | Important note re. Lambda billing: although this is not explicitly stated and is subject to change, you are not charged
5 | for the duration in which a cold-started Lambda loads up till the point when the actual handler is called -
6 | meaning, all imports are "free"! This means that cold-started Lambdas mainly impact clock-time latency but typically
7 | won't inflate cost to a similar degree. This is in line with how task duration is measured, i.e. excluding cold-start imports.
8 | """
9 | # Copyright 2021 The Funnel Rocket Maintainers
10 | #
11 | # Licensed under the Apache License, Version 2.0 (the "License");
12 | # you may not use this file except in compliance with the License.
13 | # You may obtain a copy of the License at
14 | #
15 | # http://www.apache.org/licenses/LICENSE-2.0
16 | #
17 | # Unless required by applicable law or agreed to in writing, software
18 | # distributed under the License is distributed on an "AS IS" BASIS,
19 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | # See the License for the specific language governing permissions and
21 | # limitations under the License.
22 |
23 | import logging
24 | import math
25 | import re
26 | from frocket.common.metrics import MetricName, EnvironmentMetricsProvider, MetricData
27 |
28 | logger = logging.getLogger(__name__)
29 |
30 | # TODO backlog setup a recurring task to check for pricing changes, so this can be updated.
31 | DEFAULT_PRICE_GB_SEC = 0.0000166667
32 | REGION_PRICING = {
33 | "eu-south-1": 0.0000195172, # Milan
34 | "me-south-1": 0.0000206667, # Bahrain
35 | "ap-northeast-3": 0.00002153, # Osaka
36 | "af-south-1": 0.0000221, # Capetown
37 | "ap-east-1": 0.00002292 # Hong-kong
38 | }
39 | # Assume the actual run takes this many seconds more than what's been measured,
40 | # e.g. time spent decoding the task request, and time still to be spent writing results (incl. these metrics...)
41 | # to the datastore.
42 | LAMBDA_TIME_OVERHEAD = 0.008 # 8ms, a conservative value based on a few observations
43 |
44 |
45 | class AwsLambdaMetricsProvider(EnvironmentMetricsProvider):
46 | def __init__(self, lambda_context):
47 | # See https://docs.aws.amazon.com/lambda/latest/dg/python-context.html
48 | assert lambda_context.__class__.__name__ == 'LambdaContext'
49 | self._lambda_context = lambda_context
50 |
51 |         # What region are we in? Figure it out from the full ARN in the context
52 | # (ARN example: arn:aws:lambda:us-west-2:123456789012:function:my-function)
53 | arn_parts = lambda_context.invoked_function_arn.split(':')
54 | region = arn_parts[3]
55 | if re.match(r'\w+-\w+-\d+', region):
56 | self._region = region
57 | else:
58 | self._region = None
59 | logger.warning(f"Seems like an invalid region: '{region}' in ARN: {lambda_context.invoked_function_arn}, "
60 | f"not calculating cost")
61 |
62 | def _memory_bytes(self):
63 | mem_bytes = int(self._lambda_context.memory_limit_in_mb) * (1024 ** 2)
64 | return MetricData(MetricName.MACHINE_MEMORY_BYTES, mem_bytes)
65 |
66 | def _cost_dollars(self, duration=None):
67 | if not duration or not self._region:
68 | return None
69 |
70 | # noinspection PyBroadException
71 | try:
72 | memory_gb = self._memory_bytes().value / (1024 ** 3)
73 | # Lambdas are currently billed in 1ms granularity, so rounding up
74 | rounded_duration = duration + LAMBDA_TIME_OVERHEAD
75 | rounded_duration = math.ceil(rounded_duration * 1000) / 1000
76 |
77 | gb_second_units = rounded_duration * memory_gb
78 | cost_per_unit = REGION_PRICING.get(self._region, DEFAULT_PRICE_GB_SEC)
79 | cost = gb_second_units * cost_per_unit
80 | message = \
81 | f"Cost: original duration: {duration: .4f} sec, rounded duration: {rounded_duration:.3f}, memory: " \
82 | f"{memory_gb}GB, GB/second units: {gb_second_units}, unit cost for region {self._region}: " \
83 | f"${cost_per_unit:.10f} => total run cost is ${cost:.10f}"
84 | logger.debug(message)
85 | return MetricData(MetricName.COST_DOLLARS, cost)
86 | except Exception:
87 | logger.exception("Failed calculating cost")
88 | return None
89 |
--------------------------------------------------------------------------------
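A worked example of the cost arithmetic above, using the module's own constants: a task measured at 0.5 seconds on a 2048MB Lambda in a default-priced region gives (0.5 + 0.008) seconds rounded up to the millisecond, times 2GB, times the per-GB-second price:

import math

DEFAULT_PRICE_GB_SEC = 0.0000166667
LAMBDA_TIME_OVERHEAD = 0.008

duration, memory_gb = 0.5, 2048 / 1024
rounded = math.ceil((duration + LAMBDA_TIME_OVERHEAD) * 1000) / 1000   # 0.508 seconds
gb_seconds = rounded * memory_gb                                       # 1.016 GB-second units
cost = gb_seconds * DEFAULT_PRICE_GB_SEC
print(f"${cost:.10f}")                                                 # ~ $0.0000169334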
/frocket/worker/impl/aws_lambda_worker.py:
--------------------------------------------------------------------------------
1 | """
2 | lambda_handler() in this module is the entrypoint defined for the AWS Lambda function.
3 | There's minimal code here that's Lambda-specific (== a good thing).
4 | """
5 | # Copyright 2021 The Funnel Rocket Maintainers
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | import logging
20 | from typing import cast
21 | from frocket.common.serializable import Envelope
22 | from frocket.common.tasks.base import BaseTaskRequest
23 | from frocket.common.metrics import MetricsBag, WorkerStartupLabel, ComponentLabel
24 | from frocket.worker.impl.aws_lambda_metrics import AwsLambdaMetricsProvider
25 | from frocket.worker.runners.base_task_runner import BaseTaskRunner, TaskRunnerContext
26 | from frocket.common.config import config
27 | from frocket.worker.runners.registered_runners import REGISTERED_RUNNERS
28 |
29 | config.init_lambda_logging() # Adapted to the logger being already-inited by the Lambda runtime
30 | logger = logging.getLogger(__name__)
31 |
32 | # This flag is only set when a new Lambda instance is cold-started. Warm lambdas go straight to the handler function.
33 | cold_start_flag = True
34 |
35 |
36 | def is_cold_start():
37 | global cold_start_flag
38 | if cold_start_flag:
39 | cold_start_flag = False # For next invocation
40 | return True
41 | else:
42 | return False
43 |
44 |
45 | def init_task_metrics(lambda_context) -> MetricsBag:
46 | metrics = MetricsBag(component=ComponentLabel.WORKER,
47 | env_metrics_provider=AwsLambdaMetricsProvider(lambda_context))
48 | if is_cold_start():
49 | metrics.set_label_enum(WorkerStartupLabel.COLD)
50 | else:
51 | metrics.set_label_enum(WorkerStartupLabel.WARM)
52 | return metrics
53 |
54 |
55 | def lambda_handler(event, context):
56 | metrics = init_task_metrics(context)
57 | # The event JSON was already parsed to dict by the Lambda runtime -
58 |     # now read the actual task request object from that dict
59 | envelope = Envelope.from_dict(event)
60 | req = cast(BaseTaskRequest, envelope.open(expected_superclass=BaseTaskRequest))
61 | logger.info(f"Got request: {req}")
62 |
63 | result = None
64 | should_run, reject_reason = BaseTaskRunner.should_run(req)
65 | if should_run:
66 | runner_class = REGISTERED_RUNNERS[type(req)]
67 | runner = runner_class(req, TaskRunnerContext(metrics))
68 | result = runner.run()
69 |
70 | """
71 | A note about the Lambda response: unlike most request/response Lambdas, Funnel Rocket's invoker does not rely on the
72 | function's result coming back from the Lambda directly (as it's invoked asynchronously), but rather always reads it
73 | through the datastore. The retry mechanism is likewise based on polling the tasks' status and result payload in the
74 | datastore, hence the Lambda itself should not normally return a non-200 status (unless it crashed unexpectedly), and
75 | the Lambda should be configured with no retries at the AWS level.
76 | """
77 |
78 | lambda_response = {
79 | 'statusCode': 200,
80 | }
81 |
82 |     # Including the result object in the Lambda response is still useful for manual testing
83 | if logger.isEnabledFor(logging.DEBUG):
84 | if result:
85 | lambda_response['result'] = result.to_json()
86 | else:
87 | lambda_response['reject_reason'] = reject_reason
88 | return lambda_response
89 |
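The cold/warm distinction above relies on module-level state surviving between invocations of a warm Lambda container. A minimal, self-contained sketch of that pattern (the simulated handler below is illustrative, not the project's lambda_handler):

# Module globals persist across invocations served by the same (warm) Lambda container,
# so only the very first invocation in a container observes the flag as True.
cold_start_flag = True

def is_cold_start() -> bool:
    global cold_start_flag
    if cold_start_flag:
        cold_start_flag = False  # For the next invocation in this container
        return True
    return False

def handler(event, context=None):
    return {"statusCode": 200, "coldStart": is_cold_start()}

# Simulating two invocations handled by the same container:
print(handler({}))  # {'statusCode': 200, 'coldStart': True}
print(handler({}))  # {'statusCode': 200, 'coldStart': False}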
--------------------------------------------------------------------------------
/frocket/worker/impl/generic_env_metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | This is the most generic implementation for getting runtime-environment metrics:
3 | it does not assume we know the cost of the host machine for the request duration,
4 | and getting physical memory size should generally work on Linux variants and OS X versions.
5 | """
6 | # Copyright 2021 The Funnel Rocket Maintainers
7 | #
8 | # Licensed under the Apache License, Version 2.0 (the "License");
9 | # you may not use this file except in compliance with the License.
10 | # You may obtain a copy of the License at
11 | #
12 | # http://www.apache.org/licenses/LICENSE-2.0
13 | #
14 | # Unless required by applicable law or agreed to in writing, software
15 | # distributed under the License is distributed on an "AS IS" BASIS,
16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | # See the License for the specific language governing permissions and
18 | # limitations under the License.
19 |
20 | import logging
21 | import os
22 | from frocket.common.metrics import EnvironmentMetricsProvider, MetricData, MetricName
23 |
24 | logger = logging.getLogger(__name__)
25 |
26 |
27 | class GenericEnvMetricsProvider(EnvironmentMetricsProvider):
28 | def _memory_bytes(self):
29 | # Tested on Linux and OS X
30 | try:
31 | mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
32 | except ValueError:
33 |             # Fallback to sysctl in case os.sysconf('SC_PHYS_PAGES') fails on OS X (seems version-specific)
34 | # noinspection PyBroadException
35 | try:
36 | stream = os.popen('sysctl hw.memsize')
37 | mem_bytes = int(stream.read().split(' ')[1])
38 | except Exception as e:
39 | logger.warning(f"Can't detect machine memory: {e}")
40 | return None
41 |
42 | return MetricData(MetricName.MACHINE_MEMORY_BYTES, mem_bytes)
43 |
44 | def _cost_dollars(self, duration=None):
45 | return None
46 |
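For reference, a standalone sketch of the same physical-memory probe (Linux and macOS; the fallback assumes 'sysctl hw.memsize' prints a line like 'hw.memsize: <bytes>'):

import os

def physical_memory_bytes() -> int:
    try:
        # Works on Linux and most macOS versions
        return os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
    except (ValueError, OSError):
        # macOS fallback: parse the second field of "hw.memsize: <bytes>"
        with os.popen('sysctl hw.memsize') as stream:
            return int(stream.read().split(' ')[1])

print(physical_memory_bytes())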
--------------------------------------------------------------------------------
/frocket/worker/impl/queue_worker.py:
--------------------------------------------------------------------------------
1 | """
2 | A worker that gets its tasks by a blocking dequeue from the datastore. It doesn't get any simpler -
3 | yet it's easily scalable, and requires no load balancer or orchestrator (only the queue's atomic guarantees).
4 |
5 | TODO backlog having a cache-friendly task assignment would require more work, if it makes sense to do.
6 | """
7 | # Copyright 2021 The Funnel Rocket Maintainers
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License");
10 | # you may not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | # http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 |
21 | import logging
22 | from frocket.common.metrics import MetricsBag, WorkerStartupLabel, ComponentLabel
23 | from frocket.common.tasks.base import BaseTaskRequest
24 | from frocket.datastore.registered_datastores import get_datastore
25 | from frocket.worker.impl.generic_env_metrics import GenericEnvMetricsProvider
26 | from frocket.worker.runners.base_task_runner import BaseTaskRunner, TaskRunnerContext
27 | from frocket.common.config import config
28 | from frocket.worker.runners.registered_runners import REGISTERED_RUNNERS
29 |
30 | config.init_logging()
31 | logger = logging.getLogger(__name__)
32 | datastore = get_datastore()
33 |
34 |
35 | def handle(req: BaseTaskRequest) -> None:
36 | metrics = MetricsBag(component=ComponentLabel.WORKER,
37 | env_metrics_provider=GenericEnvMetricsProvider())
38 | metrics.set_label_enum(WorkerStartupLabel.WARM) # Always warm this worker is, uhmmhmmhmmhmm
39 |
40 | runner_class = REGISTERED_RUNNERS[type(req)]
41 | runner = runner_class(req, TaskRunnerContext(metrics))
42 | result = runner.run()
43 | if logger.isEnabledFor(logging.DEBUG):
44 | logger.debug(result.to_json())
45 |
46 |
47 | def main_loop():
48 | # TODO backlog currently workers that encounter an unexpected data format will crash rather than continuing to
49 | # consume and (probably) fail. This has a pro (outdated worker versions fail fast), but of course also cons -
50 | # consider the desired/configurable behavior (e.g. crash after N unexpected errors?)
51 | try:
52 | while True:
53 | logger.info('Waiting for work...')
54 | req: BaseTaskRequest = datastore.dequeue()
55 | if req:
56 | logger.info(f"Got request: {req}")
57 |
58 | should_run, reject_reason = BaseTaskRunner.should_run(req)
59 | if should_run:
60 | handle(req)
61 | else:
62 | logger.warning(f"Request rejected: {reject_reason}")
63 | except KeyboardInterrupt:
64 | logger.info('Bye')
65 |
66 |
67 | main_loop()
68 |
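The 'blocking dequeue' mentioned in the module docstring is hidden behind datastore.dequeue(). As a rough illustration of the underlying pattern only (the queue name, timeout and payload handling below are made up; the project's Redis-backed datastore may differ):

import redis

r = redis.Redis(host='localhost', port=6379, db=0)

def dequeue(timeout_seconds: int = 5):
    # BLPOP blocks until an item arrives or the timeout elapses; it returns a
    # (key, value) tuple, or None if nothing was dequeued in time.
    item = r.blpop('frocket:work-queue', timeout=timeout_seconds)
    return item[1] if item else None

while True:
    payload = dequeue()
    if payload:
        print(f"Got raw task payload: {payload!r}")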
--------------------------------------------------------------------------------
/frocket/worker/runners/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/frocket/worker/runners/base_task_runner.py:
--------------------------------------------------------------------------------
1 | """
2 | Base class for running a task in a worker - to be subclassed for concrete task runners.
3 | """
4 | # Copyright 2021 The Funnel Rocket Maintainers
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | import logging
19 | import time
20 | from abc import abstractmethod
21 | from typing import Optional, Tuple
22 | from frocket.common.config import config
23 | from frocket.common.metrics import MetricName, MetricsBag
24 | from frocket.common.tasks.base import TaskStatus, BaseTaskRequest, BaseTaskResult, TaskAttemptId
25 | from frocket.datastore.datastore import Datastore
26 | from frocket.datastore.blobstore import Blobstore
27 | from frocket.datastore.registered_datastores import get_datastore, get_blobstore
28 | from frocket.worker.runners.part_loader import PartLoader, shared_part_loader
29 |
30 | logger = logging.getLogger(__name__)
31 | REQUEST_MAX_AGE = int(config.get("worker.reject.age"))
32 | DEFAULT_PREFLIGHT_DURATION_MS = config.int("part.selection.preflight.ms")
33 |
34 |
35 | class TaskRunnerContext:
36 | """simple dependency provider... (for easier testing)."""
37 | def __init__(self,
38 | metrics: MetricsBag,
39 | private_part_loader: PartLoader = None,
40 | preflight_duration_ms: int = None):
41 | self._metrics = metrics
42 |         # By default, files are loaded and cached by a re-usable loader.
43 | # Having a 'private' one allows testing in isolation
44 | self._part_loader = private_part_loader or shared_part_loader()
45 | if preflight_duration_ms is None:
46 | preflight_duration_ms = DEFAULT_PREFLIGHT_DURATION_MS
47 | self._preflight_duration_seconds = preflight_duration_ms / 1000
48 |
49 | @property
50 | def metrics(self) -> MetricsBag:
51 | return self._metrics
52 |
53 | # The underlying get_datastore and get_blobstore are memoized - initialized on demand
54 | @property
55 | def datastore(self) -> Datastore:
56 | return get_datastore()
57 |
58 | @property
59 | def blobstore(self) -> Blobstore:
60 | return get_blobstore()
61 |
62 | @property
63 | def part_loader(self) -> PartLoader:
64 | return self._part_loader
65 |
66 | @property
67 | def preflight_duration_seconds(self) -> float:
68 | return self._preflight_duration_seconds
69 |
70 |
71 | class BaseTaskRunner:
72 | # Returns (should_run, reject_reason)
73 | @classmethod
74 |     def should_run(cls, req: BaseTaskRequest) -> Tuple[bool, str]:
75 | if cls.time_since_invocation(req) > REQUEST_MAX_AGE:
76 | return False, f"request is more than {REQUEST_MAX_AGE} seconds old"
77 | else:
78 | return True, None
79 |
80 | @staticmethod
81 | def time_since_invocation(req: BaseTaskRequest):
82 | return time.time() - req.invoke_time
83 |
84 | def __init__(self, req: BaseTaskRequest,
85 | ctx: TaskRunnerContext):
86 | self._req = req
87 | self._ctx = ctx
88 | # TODO backlog initialize the attempt_id on init, if available (n/a here in self-select part mode)
89 | self._task_attempt_id: Optional[TaskAttemptId] = None
90 |
91 | def run(self) -> BaseTaskResult:
92 | error_message, engine_result = None, None
93 | with self._ctx.metrics.measure(MetricName.TASK_TOTAL_RUN_SECONDS):
94 | try:
95 | self._ctx.metrics.set_metric(MetricName.INVOKE_TO_RUN_SECONDS,
96 | self.time_since_invocation(self._req))
97 |
98 | self._do_run() # Call concrete class to do the actual work
99 | final_status = TaskStatus.ENDED_SUCCESS
100 | except Exception as e:
101 | final_status = TaskStatus.ENDED_FAILED
102 | error_message = str(e)
103 | logger.exception('Task FAILED!')
104 |
105 | # Post-run: extracting the task metrics, building the concrete result object
106 | final_metrics = self._ctx.metrics.finalize(success=(final_status == TaskStatus.ENDED_SUCCESS))
107 | # First, set the base attributes in a dict as kind of a 'skeleton' response - then pass it to the concrete
108 | # task runner to pass as **args to the concrete result class
109 | base_attributes = BaseTaskResult(
110 | task_index=self._task_attempt_id.task_index,
111 | status=final_status,
112 | error_message=error_message,
113 | metrics=final_metrics).shallowdict(include_none=True)
114 | result = self._build_result(base_attributes) # Call concrete class
115 |
116 | # If the job failed to get a task attempt ID assigned to it (self-select failed),
117 | # or if the datastore is not available - task status and result cannot be written
118 | # TODO backlog consider having an optional secondary channel to report such failures
119 | # (aside from centralized logging?)
120 | if self._task_attempt_id:
121 | self._ctx.datastore.write_task_result(self._req.request_id, self._task_attempt_id, result)
122 | else:
123 | logger.error("Can't report result: no part was selected for loading")
124 |
125 | if logger.isEnabledFor(logging.DEBUG):
126 | logger.debug(result)
127 | return result
128 |
129 | def _update_status(self, status: TaskStatus):
130 | self._ctx.datastore.update_task_status(self._req.request_id, self._task_attempt_id, status)
131 |
132 | @abstractmethod
133 | def _do_run(self):
134 | pass
135 |
136 | @abstractmethod
137 | def _build_result(self, base_attributes: dict):
138 |         """This method is still called by run() above even if _do_run() has raised an exception - having a sane
139 | result object is important even for a failed task."""
140 | pass
141 |
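A concrete runner only needs to implement _do_run() and _build_result(). A hypothetical minimal subclass, just to show the contract (NoopTaskRunner is not part of the project; the real implementations are QueryTaskRunner and RegistrationTaskRunner):

from frocket.common.tasks.base import BaseTaskResult
from frocket.worker.runners.base_task_runner import BaseTaskRunner

class NoopTaskRunner(BaseTaskRunner):
    def _do_run(self):
        pass  # The actual work; any exception raised here is caught by run() and recorded

    def _build_result(self, base_attributes: dict) -> BaseTaskResult:
        # Real runners unpack **base_attributes into their own result dataclass and add
        # task-specific fields (see QueryTaskRunner._build_result further below)
        return BaseTaskResult(**base_attributes)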
--------------------------------------------------------------------------------
/frocket/worker/runners/part_loader.py:
--------------------------------------------------------------------------------
1 | """
2 | Load and cache parts (data files).
3 | """
4 | # Copyright 2021 The Funnel Rocket Maintainers
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | import logging
19 | import time
20 | import os
21 | from pathlib import Path
22 | from typing import List, Dict, Optional, Set, NamedTuple, Union
23 | from pandas import DataFrame
24 | import pyarrow.parquet
25 | from frocket.common.config import config
26 | from frocket.common.helpers.storage import storage_handler_for
27 | from frocket.common.helpers.utils import memoize
28 | from frocket.common.metrics import MetricName, LoadFromLabel, MetricsBag
29 | from frocket.common.dataset import DatasetPartId, DatasetId
30 |
31 | logger = logging.getLogger(__name__)
32 |
33 |
34 | # Just a little typed nicety over tuples which PyArrow accepts as predicate pushdown filters
35 | class FilterPredicate(NamedTuple):
36 | column: str
37 | op: str
38 | value: Union[str, int, float, bool]
39 |
40 |
41 | class CacheEntry:
42 | local_path: str
43 | size_mb: float
44 | last_used: float
45 |
46 |
47 | class PartLoader:
48 | _cache: Dict[DatasetPartId, CacheEntry] = None # DatasetPartId is a dataclass with proper hash & equality
49 | _disk_cache_max_size: float = None
50 |
51 | def __init__(self):
52 | self._setup()
53 |
54 | # Support re-initialization and overriding the configured size, for testing
55 | def _setup(self, disk_cache_max_size: float = None):
56 | if self._cache:
57 | for entry in self._cache.values():
58 | os.remove(entry.local_path)
59 | self._cache = {}
60 | self._disk_cache_max_size = disk_cache_max_size if disk_cache_max_size is not None \
61 | else config.float('worker.disk.cache.size.mb')
62 |
63 | @property
64 | def cache_current_size_mb(self) -> float:
65 | return sum(entry.size_mb for entry in self._cache.values())
66 |
67 | @property
68 | def cache_len(self) -> int:
69 | return len(self._cache)
70 |
71 | def _prune_cache(self) -> None:
72 | curr_size_mb = self.cache_current_size_mb
73 | while curr_size_mb > 0 and curr_size_mb > self._disk_cache_max_size:
74 | logger.info(f"Current cache size is {curr_size_mb}mb, more than the configured "
75 | f"{self._disk_cache_max_size}mb")
76 | lru_key = min(self._cache, key=lambda k: self._cache[k].last_used)
77 | lru_entry = self._cache[lru_key]
78 | logger.info(f"Deleting LRU entry of dataset: {lru_key.dataset_id.name} "
79 | f"source path: {lru_key.path}, "
80 | f"last used {time.time() - lru_entry.last_used:.1f} seconds ago")
81 | try:
82 | os.remove(lru_entry.local_path)
83 | except OSError:
84 | logger.exception('Failed to delete file!') # TODO backlog consider disabling any further caching
85 | del self._cache[lru_key]
86 | curr_size_mb = self.cache_current_size_mb
87 |
88 | def load_dataframe(self,
89 | file_id: DatasetPartId,
90 | metrics: MetricsBag,
91 | needed_columns: List[str] = None,
92 | filters: List[FilterPredicate] = None,
93 | load_as_categoricals: List[str] = None) -> DataFrame:
94 | self._prune_cache()
95 | loaded_from: Optional[LoadFromLabel] = LoadFromLabel.SOURCE
96 | handler = storage_handler_for(file_id.path)
97 | is_source_remote = handler.remote
98 |
99 | local_path = None
100 | if not is_source_remote:
101 | local_path = file_id.path # No caching for local files
102 | else:
103 | if file_id in self._cache:
104 | local_path = self._cache[file_id].local_path
105 | loaded_from = LoadFromLabel.DISK_CACHE
106 | self._cache[file_id].last_used = time.time()
107 | logger.info("File is locally cached, yay")
108 |
109 | if not local_path:
110 | with metrics.measure(MetricName.TASK_DOWNLOAD_SECONDS):
111 | local_path = str(handler.get_local_path(file_id.path)) # Download to a local temp file
112 |
113 | entry = CacheEntry()
114 | entry.local_path = local_path
115 | entry.size_mb = Path(local_path).stat().st_size / 1024 ** 2
116 | entry.last_used = time.time()
117 | self._cache[file_id] = entry
118 |
119 | with metrics.measure(MetricName.TASK_LOAD_FILE_SECONDS):
120 | # Using PyArrow directly (rather than wrapped through Pandas) allows specifying column names to explicitly
121 | # load as 'dictionary' type, which then translates to categoricals in Pandas.
122 | # If the file was created with Pandas, categorical columns are loaded back as such - but we go beyond
123 | # that to detect 'potential categorical' string columns and load them as such.
124 |             # Beyond the memory usage saving, there is also a performance gain here if the Parquet file already has a
125 | # dictionary for the column. Otherwise, PyArrow will create one - but without a performance gain.
126 | df = pyarrow.parquet.read_table(local_path,
127 | columns=needed_columns,
128 | filters=filters,
129 | read_dictionary=load_as_categoricals).to_pandas()
130 |
131 | metrics.set_label_enum(loaded_from)
132 | return df
133 |
134 | def get_cached_candidates(self, dataset_id: DatasetId) -> Optional[Set[DatasetPartId]]:
135 | """Do we have cached parts for this DatasetId, that can be used to self-select parts?"""
136 | logger.debug(f"Looking for cached candidates matching: {dataset_id}")
137 | candidates = None
138 | if self._cache:
139 | candidates = {part_id for part_id in self._cache.keys() if part_id.dataset_id == dataset_id}
140 |
141 | logger.debug(f"Found candidates: {candidates}")
142 | return candidates if (candidates and len(candidates) > 0) else None
143 |
144 |
145 | @memoize
146 | def shared_part_loader() -> PartLoader:
147 |     """This is used by default, but can be overridden in tests."""
148 | return PartLoader()
149 |
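A standalone illustration of the PyArrow call made in load_dataframe() above: column pruning, row-group predicate pushdown, and loading selected string columns as dictionary-encoded (which arrive in Pandas as categoricals). The file and column names are made up:

import pyarrow.parquet

df = pyarrow.parquet.read_table(
    'part-00000.parquet',  # a hypothetical local part file
    columns=['userId', 'timestamp', 'eventType'],
    filters=[('timestamp', '>=', 1590918400516), ('timestamp', '<', 1618918400516)],
    read_dictionary=['eventType']  # load as dictionary -> categorical in Pandas
).to_pandas()

print(df.dtypes)  # 'eventType' should show up as a categorical column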
--------------------------------------------------------------------------------
/frocket/worker/runners/query_task_runner.py:
--------------------------------------------------------------------------------
1 | """
2 | Execute a single query task.
3 | """
4 | # Copyright 2021 The Funnel Rocket Maintainers
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | import logging
19 | import time
20 | from typing import List, cast, Optional
21 | from pandas import DataFrame
22 | from frocket.common.dataset import DatasetPartId
23 | from frocket.common.metrics import MetricName, PartSelectMethodLabel
24 | from frocket.common.tasks.base import TaskStatus, TaskAttemptId, BaseTaskRequest
25 | from frocket.common.tasks.query import PartSelectionMode, QueryTaskRequest, QueryResult, QueryTaskResult
26 | from frocket.engine.query_engine import QueryEngine
27 | from frocket.worker.runners.base_task_runner import BaseTaskRunner, TaskRunnerContext
28 | from frocket.worker.runners.part_loader import FilterPredicate
29 |
30 | logger = logging.getLogger(__name__)
31 |
32 |
33 | class QueryTaskRunner(BaseTaskRunner):
34 | def __init__(self, req: BaseTaskRequest, ctx: TaskRunnerContext):
35 | super().__init__(req, ctx)
36 | self._req = cast(QueryTaskRequest, req) # Avoid type warnings
37 | self._dataset_part_id: Optional[DatasetPartId] = None
38 | self._query_result: Optional[QueryResult] = None
39 |
40 | def _do_run(self):
41 | self._set_part_to_load()
42 | self._update_status(TaskStatus.LOADING_DATA)
43 | with self._ctx.metrics.measure(MetricName.TASK_TOTAL_LOAD_SECONDS):
44 | df = self._load(needed_columns=self._req.used_columns,
45 | load_as_categoricals=self._req.load_as_categoricals)
46 |
47 | self._update_status(TaskStatus.RUNNING_QUERY)
48 | with self._ctx.metrics.measure(MetricName.TASK_RUN_QUERY_SECONDS):
49 | engine = QueryEngine(self._req.dataset.group_id_column, self._req.dataset.timestamp_column)
50 | engine_result = engine.run(df, self._req.query)
51 | self._query_result = engine_result
52 |
53 | def _set_part_to_load(self) -> None:
54 | task_attempt_no = self._req.attempt_no
55 | if self._req.mode == PartSelectionMode.SET_BY_INVOKER:
56 | part_id = self._req.invoker_set_part
57 | actual_select_method = PartSelectMethodLabel.SET_BY_INVOKER
58 | elif self._req.mode == PartSelectionMode.SELECTED_BY_WORKER:
59 | actual_select_method, part_id = self._select_part_myself()
60 | logger.info(f"Worker selected part: method: {actual_select_method}, file ID: {part_id}, "
61 | f"task attempt no.: {task_attempt_no}")
62 | else:
63 | raise Exception(f"Don't know how to handle request mode: {self._req.mode}")
64 |
65 | if not part_id:
66 | raise Exception("No part to load")
67 |
68 | self._ctx.metrics.set_label_enum(actual_select_method)
69 | self._dataset_part_id = part_id
70 | self._task_attempt_id = TaskAttemptId(part_id.part_idx, task_attempt_no)
71 |
72 | def _select_part_myself(self):
73 |         """See the configuration guide for the 'preflight' concept. In general, that's a configurable time period in
74 |     self-select part mode, during which 'warm' workers can select the candidates they prefer, without contention."""
75 | time_left_in_preflight = self._ctx.preflight_duration_seconds - BaseTaskRunner.time_since_invocation(self._req)
76 | candidates = self._ctx.part_loader.get_cached_candidates(self._req.dataset.id)
77 | sleep_time = 0
78 | if not candidates and time_left_in_preflight > 0:
79 |             logger.info("Got no candidates but we're still within the preflight period"
80 | f", so sleeping for {time_left_in_preflight} seconds")
81 | sleep_time = time_left_in_preflight
82 |
83 | if sleep_time:
84 | time.sleep(time_left_in_preflight)
85 | self._ctx.metrics.set_metric(MetricName.TASK_PREFLIGHT_SLEEP_SECONDS, sleep_time)
86 |
87 |         # If a worker got some candidates, we'll still try to grab them even if preflight time has ended
88 | selected_part = self._ctx.datastore.self_select_part(self._req.request_id, self._req.attempt_no, candidates)
89 | if not selected_part.part_id:
90 | # Not supposed to happen, unless there's a retry mechanism gone awry
91 | raise Exception("Got no part for me!")
92 |
93 | if candidates:
94 | if not selected_part.random:
95 | actual_select_method = PartSelectMethodLabel.SPECIFIC_CANDIDATE
96 | else:
97 | actual_select_method = PartSelectMethodLabel.RANDOM_CANDIDATES_TAKEN
98 | else:
99 | actual_select_method = PartSelectMethodLabel.RANDOM_NO_CANDIDATES
100 |
101 | return actual_select_method, selected_part.part_id
102 |
103 | def _load(self, needed_columns: List[str] = None, load_as_categoricals: List[str] = None) -> DataFrame:
104 | filters = self._predicate_pushdown_filters()
105 | if logger.isEnabledFor(logging.DEBUG):
106 | logger.debug(f"Filters used when loading: {filters}")
107 | logger.debug(f"Columns to explicitly load as categorical: {load_as_categoricals}")
108 |
109 | df = self._ctx.part_loader.load_dataframe(file_id=self._dataset_part_id, metrics=self._ctx.metrics,
110 | needed_columns=needed_columns, filters=filters,
111 | load_as_categoricals=load_as_categoricals)
112 | self._ctx.metrics.set_metric(MetricName.SCANNED_ROWS, len(df))
113 | self._ctx.metrics.set_metric(MetricName.SCANNED_GROUPS, df[self._req.dataset.group_id_column].nunique())
114 | return df
115 |
116 | def _predicate_pushdown_filters(self):
117 | """
118 |         Build PyArrow-compatible pushdown predicates to pass to the part loader.
119 | An important reminder here is that any filter applied would affect not just conditions/sequences, but also
120 | any defined aggregations - meaning it's suitable for limiting scope to the (optional) query timeframe,
121 | but should be evaluated carefully for any other optimizations.
122 | """
123 | filters = []
124 | timeframe = self._req.query.get('timeframe', None)
125 | if timeframe:
126 | fromtime = timeframe.get('from', None)
127 | if fromtime is not None:
128 | filters.append(FilterPredicate(column=self._req.dataset.timestamp_column, op='>=', value=fromtime))
129 | totime = timeframe.get('to', None)
130 | if totime is not None:
131 | filters.append(FilterPredicate(column=self._req.dataset.timestamp_column, op='<', value=totime))
132 |
133 | return filters if len(filters) > 0 else None
134 |
135 | def _build_result(self, base_attributes):
136 | return QueryTaskResult(
137 | **base_attributes,
138 | query_result=self._query_result)
139 |
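To tie _predicate_pushdown_filters() above to the query format: a query carrying a 'timeframe' yields at most two FilterPredicate tuples over the dataset's timestamp column. A small sketch, assuming the column is named 'timestamp':

from frocket.worker.runners.part_loader import FilterPredicate

query = {"timeframe": {"from": 1590918400516, "to": 1618918400516}}
timestamp_column = "timestamp"  # in the real flow, taken from the registered dataset's schema

filters = []
timeframe = query.get("timeframe", {})
if timeframe.get("from") is not None:
    filters.append(FilterPredicate(column=timestamp_column, op=">=", value=timeframe["from"]))
if timeframe.get("to") is not None:
    filters.append(FilterPredicate(column=timestamp_column, op="<", value=timeframe["to"]))

print(filters)
# [FilterPredicate(column='timestamp', op='>=', value=1590918400516),
#  FilterPredicate(column='timestamp', op='<', value=1618918400516)]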
--------------------------------------------------------------------------------
/frocket/worker/runners/registered_runners.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import Dict, Type
16 | from frocket.common.tasks.base import BaseTaskRequest
17 | from frocket.common.tasks.registration import RegistrationTaskRequest
18 | from frocket.common.tasks.query import QueryTaskRequest
19 | from frocket.worker.runners.base_task_runner import BaseTaskRunner
20 | from frocket.worker.runners.query_task_runner import QueryTaskRunner
21 | from frocket.worker.runners.registration_task_runner import RegistrationTaskRunner
22 |
23 | REGISTERED_RUNNERS: Dict[Type[BaseTaskRequest], Type[BaseTaskRunner]] = {
24 | QueryTaskRequest: QueryTaskRunner,
25 | RegistrationTaskRequest: RegistrationTaskRunner
26 | }
27 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pyarrow>=2.0.0
2 | pandas>=1.2.0
3 | boto3>=1.16.0
4 | redis>=3.5.0
5 | tabulate>=0.8.0
6 | prometheus_client>=0.9.0
7 | flask>=1.1.0
8 | jsonschema>=3.2.0
9 | dataclasses-json>=0.5.2
10 | inflection>=0.5.0
11 | parsimonious>=0.8.0
12 | gunicorn>=20.0.0
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | import setuptools
3 |
4 | this_dir = pathlib.Path(__file__).parent
5 | requirements_file = this_dir / "requirements.txt"
6 | readme_file = this_dir / "README.md"
7 |
8 | install_requires = requirements_file.read_text().splitlines()
9 | long_description = readme_file.read_text() if readme_file.exists() else ''
10 |
11 | setuptools.setup(
12 | name="funnel-rocket",
13 | version="0.5.3",
14 | author="Elad Rosenheim, Avshalom Manevich",
15 | author_email="elad@dynamicyield.com",
16 | description="Cloud native distributed funnel queries",
17 | long_description=long_description,
18 | long_description_content_type="text/markdown",
19 | url="https://github.com/DynamicYieldProjects/funnel-rocket-oss",
20 | packages=setuptools.find_packages(),
21 | package_data={
22 | "frocket": ["resources/*.*"],
23 | },
24 | classifiers=[
25 | "Programming Language :: Python :: 3.8",
26 | "Programming Language :: Python :: 3.9",
27 | "License :: OSI Approved :: Apache Software License",
28 | "Operating System :: OS Independent",
29 | ],
30 | python_requires='>=3.8',
31 | install_requires=install_requires
32 | )
33 |
--------------------------------------------------------------------------------
/test-requirements.txt:
--------------------------------------------------------------------------------
1 | pytest>=6.2.0
2 | pytest-cov>=2.11.0
3 | icdiff>=0.5.0
4 | requests>=2.25.0
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/tests/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/tests/utils/base_query_example.json:
--------------------------------------------------------------------------------
1 | {
2 | "timeframe": {
3 | "from": 1590918400516,
4 | "to": 1618918400516
5 | },
6 | "query": {
7 | "relation": "( $1 and $2) || $seq || (( $made_multiple_purchases ) ) ",
8 | "conditions": [
9 | {
10 | "name": "made_multiple_purchases",
11 | "filter": [
12 | "eventId",
13 | "==",
14 | 18765
15 | ],
16 | "target": [
17 | "count",
18 | ">=",
19 | 0
20 | ],
21 | "includeZero": true
22 | },
23 | {
24 | "name": "made_multiple_purchases2",
25 | "filter": [
26 | "eventId",
27 | "==",
28 | 18766
29 | ],
30 | "includeZero": false
31 | },
32 | {
33 | "filter": {
34 | "column": "eventId",
35 | "op": "==",
36 | "value": 18767
37 | },
38 | "target": {
39 | "type": "sum",
40 | "column": "eventValue",
41 | "op": "<",
42 | "value": 350
43 | }
44 | },
45 | {
46 | "filter": {
47 | "column": "eventId",
48 | "op": "==",
49 | "value": 18768
50 | },
51 | "target": [
52 | "sum",
53 | "eventValue",
54 | "<",
55 | 350
56 | ]
57 | },
58 | {
59 | "filters": [
60 | {
61 | "column": "eventType",
62 | "op": "==",
63 | "value": "purchase"
64 | },
65 | {
66 | "column": "goalValue",
67 | "op": ">=",
68 | "value": 3
69 | }
70 | ],
71 | "target": [
72 | "sum",
73 | "eventValue",
74 | "<",
75 | 350
76 | ],
77 | "includeZero": false
78 | },
79 | {
80 | "name": "seq",
81 | "sequence": [
82 | {
83 | "filter": [
84 | "eventType",
85 | "==",
86 | "addToCart"
87 | ]
88 | },
89 | {
90 | "filters": [
91 | {
92 | "column": "eventType",
93 | "op": "==",
94 | "value": "purchase"
95 | },
96 | {
97 | "column": "goalValue",
98 | "op": ">=",
99 | "value": 3
100 | }
101 | ]
102 | },
103 | {
104 | "rowFound": false,
105 | "filter": {
106 | "column": "eventType",
107 | "op": "==",
108 | "value": "signToClub"
109 | }
110 | }
111 | ],
112 | "maxDuration": 23443
113 | }
114 | ],
115 | "aggregations": [
116 | {
117 | "column": "device"
118 | },
119 | {
120 | "column": "transactionId",
121 | "type": "count",
122 | "name": "purchase_count"
123 | },
124 | {
125 | "column": "goalId"
126 | },
127 | {
128 | "column": "goalId",
129 | "type": "sumPerValue",
130 | "otherColumn": "goalValue",
131 | "name": "hoola"
132 | }
133 | ]
134 | },
135 | "funnel": {
136 | "sequence": [
137 | {
138 | "filter": [
139 | "eventType",
140 | "==",
141 | "addToCart"
142 | ]
143 | },
144 | {
145 | "filter": {
146 | "column": "eventId",
147 | "op": "==",
148 | "value": 18765
149 | }
150 | }
151 | ],
152 | "maxDuration": 23443,
153 | "stepAggregations": [
154 | {
155 | "column": "goalId",
156 | "type": "count",
157 | "name": "mosh"
158 | },
159 | {
160 | "column": "eventId",
161 | "type": "groupsPerValue",
162 | "name": "mosh2"
163 | }
164 | ],
165 | "endAggregations": [
166 | {
167 | "column": "goalId"
168 | }
169 | ]
170 | }
171 | }
--------------------------------------------------------------------------------
/tests/utils/base_test_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | import tempfile
17 | from typing import List, Type
18 | from frocket.common.metrics import MetricName, MetricData, MetricLabelEnum
19 |
20 | SKIP_SLOW_TESTS = os.environ.get('SKIP_SLOW_TESTS', "False").lower() == 'true'
21 | SKIP_LAMBDA_TESTS = os.environ.get('SKIP_LAMBDA_TESTS', "False").lower() == 'true'
22 | # noinspection PyProtectedMember,PyUnresolvedReferences
23 | TEMP_DIR = tempfile._get_default_tempdir()
24 |
25 |
26 | # noinspection PyProtectedMember,PyUnresolvedReferences
27 | def temp_filename(suffix='', with_dir: bool = True):
28 | fname = next(tempfile._get_candidate_names()) + suffix
29 | return f"{TEMP_DIR}/{fname}" if with_dir else fname
30 |
31 |
32 | # A mixin to allow defining utility classes named "Test" without pytest trying to collect test cases in them,
33 | # which results in warnings (and without needing a pytest.ini entry). See https://stackoverflow.com/a/46199666
34 | class DisablePyTestCollectionMixin(object):
35 | __test__ = False
36 |
37 |
38 | def get_metric_value(metrics: List[MetricData], name: MetricName) -> float:
39 | assert metrics
40 | metric = next(filter(lambda metric: metric.name == name, metrics), None)
41 | assert metric is not None
42 | return metric.value
43 |
44 |
45 | def assert_metric_value(metrics: List[MetricData], name: MetricName, value: float):
46 | assert get_metric_value(metrics, name) == value
47 |
48 |
49 | def find_first_label_value(metrics: List[MetricData], label_type: Type[MetricLabelEnum]) -> str:
50 | assert metrics
51 | found_metric = next(filter(lambda metric: label_type.label_name in metric.labels, metrics), None)
52 | return found_metric.labels[label_type.label_name]
53 |
54 |
55 | def assert_label_value_exists(metrics: List[MetricData], label: MetricLabelEnum):
56 | assert find_first_label_value(metrics, label.__class__) == label.label_value
57 |
--------------------------------------------------------------------------------
/tests/utils/lambda_fixture.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 | from frocket.common.config import config
17 |
18 |
19 | @pytest.fixture(scope="session", autouse=True)
20 | def init_mock_lambda_settings():
21 | config['lambda.aws.endpoint.url'] = config.get('lambda.aws.endpoint.url', 'http://localhost:9001')
22 | config['lambda.aws.region'] = config.get('lambda.aws.region', 'us-east-1')
23 | config['lambda.aws.no.signature'] = 'true'
24 | config['invoker.lambda.legacy.async'] = 'false'
25 |
--------------------------------------------------------------------------------
/tests/utils/mock_s3_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | import boto3
17 | from frocket.common.config import config, ConfigDict
18 | from frocket.common.helpers.utils import timestamped_uuid, memoize
19 |
20 | SKIP_S3_TESTS = os.environ.get('SKIP_S3_TESTS', "False").lower() == 'true'
21 |
22 |
23 | @memoize
24 | def _init_mock_s3_config():
25 | if SKIP_S3_TESTS:
26 | print(f"Skipping mock S3 config")
27 | config['s3.aws.endpoint.url'] = \
28 | os.environ.get('MOCK_S3_URL', config.get('s3.aws.endpoint.url', 'http://localhost:9000'))
29 | config['s3.aws.access.key.id'] = \
30 | os.environ.get('MOCK_S3_USER', config.get('s3.aws.access.key.id', 'testonly'))
31 | config['s3.aws.secret.access.key'] = \
32 |         os.environ.get('MOCK_S3_SECRET', config.get('s3.aws.secret.access.key', 'testonly'))
33 |
34 |
35 | def mock_s3_env_variables():
36 | _init_mock_s3_config()
37 | return {
38 | ConfigDict.to_env_variable(key): config.get(key)
39 | for key in ['s3.aws.endpoint.url', 's3.aws.access.key.id', 's3.aws.secret.access.key']
40 | }
41 |
42 |
43 | def new_mock_s3_bucket():
44 | if SKIP_S3_TESTS:
45 | return None
46 | _init_mock_s3_config()
47 |
48 | bucket_name = timestamped_uuid('testbucket-')
49 | s3 = boto3.resource('s3', **config.aws_client_settings(service='s3'))
50 | bucket = s3.Bucket(bucket_name)
51 | bucket.create()
52 | print(f"Bucket '{bucket_name}' created")
53 | return bucket
54 |
--------------------------------------------------------------------------------
/tests/utils/redis_fixture.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The Funnel Rocket Maintainers
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | import pytest
17 | from frocket.common.config import config, ConfigDict
18 | from frocket.datastore.registered_datastores import get_datastore, get_blobstore
19 |
20 |
21 | @pytest.fixture(scope="session", autouse=True)
22 | def init_test_redis_settings():
23 | config['redis.host'] = os.environ.get('TEST_REDIS_HOST', config['redis.host'])
24 | config['redis.port'] = os.environ.get('TEST_REDIS_PORT', config['redis.port'])
25 | config['redis.db'] = os.environ.get('TEST_REDIS_DB', config['redis.db'])
26 | print(get_datastore(), get_blobstore()) # Fail on no connection, print connection details
27 |
28 |
29 | def get_test_redis_env_variables():
30 | return {
31 | ConfigDict.to_env_variable(key): config.get(key)
32 | for key in ['redis.host', 'redis.port', 'redis.db', 'datastore.redis.prefix']
33 | }
34 |
--------------------------------------------------------------------------------