├── .dockerignore ├── .github ├── CODEOWNERS ├── dependabot.yml └── workflows │ └── main.yml ├── .gitignore ├── .python-version ├── LICENSE ├── README.md ├── build-lambda.sh ├── data └── .gitignore ├── dataprep_example ├── __init__.py ├── ingest_retailrocket_dataset.py └── repartition.py ├── docker-compose.yml ├── docker ├── all-in-one.Dockerfile ├── entrypoint.sh └── local-lambda.Dockerfile ├── docs ├── api.md ├── example-dataset.md ├── logo-blue.svg ├── logo-icon-dark-blue.svg ├── logo-icon-light-blue.svg ├── logo-small-blue.svg └── operating.md ├── frocket ├── __init__.py ├── apiserver.py ├── cli.py ├── cli_commands.py ├── common │ ├── __init__.py │ ├── config.py │ ├── dataset.py │ ├── helpers │ │ ├── __init__.py │ │ ├── pandas.py │ │ ├── storage.py │ │ └── utils.py │ ├── metrics.py │ ├── serializable.py │ ├── tasks │ │ ├── __init__.py │ │ ├── async_tracker.py │ │ ├── base.py │ │ ├── query.py │ │ └── registration.py │ └── validation │ │ ├── __init__.py │ │ ├── consts.py │ │ ├── error.py │ │ ├── path_visitor.py │ │ ├── query_validator.py │ │ ├── relation_parser.py │ │ ├── result.py │ │ └── visitor_functions.py ├── datastore │ ├── __init__.py │ ├── blobstore.py │ ├── datastore.py │ ├── redis_store.py │ └── registered_datastores.py ├── engine │ ├── __init__.py │ ├── query_engine.py │ └── relation_to_pandas.py ├── invoker │ ├── __init__.py │ ├── base_invoker.py │ ├── impl │ │ ├── __init__.py │ │ ├── async_invoker.py │ │ ├── aws_lambda_invoker.py │ │ ├── registered_invokers.py │ │ └── work_queue_invoker.py │ ├── invoker_api.py │ ├── jobs │ │ ├── __init__.py │ │ ├── job.py │ │ ├── query_job.py │ │ └── registration_job.py │ ├── metrics_frame.py │ ├── prom_adapter.py │ └── stats_builder.py ├── resources │ └── query_schema.json └── worker │ ├── __init__.py │ ├── impl │ ├── __init__.py │ ├── aws_lambda_metrics.py │ ├── aws_lambda_worker.py │ ├── generic_env_metrics.py │ └── queue_worker.py │ └── runners │ ├── __init__.py │ ├── base_task_runner.py │ ├── part_loader.py │ ├── query_task_runner.py │ ├── registered_runners.py │ └── registration_task_runner.py ├── requirements.txt ├── setup.py ├── test-requirements.txt └── tests ├── __init__.py ├── test_apiserver.py ├── test_cli.py ├── test_invoker_api.py ├── test_part_loader.py ├── test_path_visitor.py ├── test_query_engine.py ├── test_query_job.py ├── test_query_task.py ├── test_query_validator.py ├── test_registration_job.py ├── test_registration_task.py └── utils ├── __init__.py ├── base_query_example.json ├── base_test_utils.py ├── dataset_utils.py ├── lambda_fixture.py ├── mock_s3_utils.py ├── redis_fixture.py └── task_and_job_utils.py /.dockerignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/ 2 | **/*.py[cod] 3 | **/.* 4 | docker/*.Dockerfile 5 | *.so 6 | *.parquet 7 | *.zip 8 | data/ 9 | layers/ 10 | scratch/ 11 | build/ 12 | dist/ 13 | sdist/ 14 | *.egg-info/ 15 | *.egg 16 | venv/ 17 | map/ 18 | reduce/ -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | /frocket @dynamicyield/eladroz 2 | /docker @dynamicyield/omrisk -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: / 5 | schedule: 6 | interval: daily 7 | commit-message: 8 | prefix: 
fix(deps) 9 | - package-ecosystem: pip 10 | directory: / 11 | schedule: 12 | interval: daily 13 | commit-message: 14 | prefix: fix(deps) 15 | - package-ecosystem: docker 16 | directory: /docker 17 | schedule: 18 | interval: daily 19 | commit-message: 20 | prefix: fix(deps) 21 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: [ push ] 3 | jobs: 4 | ci: 5 | runs-on: ubuntu-18.04 6 | timeout-minutes: 10 7 | 8 | steps: 9 | - name: Checkout repo 10 | uses: actions/checkout@v2.3.4 11 | 12 | - name: Setup Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.8' 16 | architecture: 'x64' 17 | 18 | - name: Cache dependencies 19 | uses: actions/cache@v2.1.4 20 | id: cache-venv 21 | with: 22 | path: ./venv/ 23 | key: ${{ runner.os }}-venv-cache-${{ hashFiles('./requirements.txt','./test-requirements.txt','./setup.py') }} 24 | 25 | - name: Build virtual environment and install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | python -m venv venv 29 | source venv/bin/activate 30 | pip install -e . 31 | pip install -r test-requirements.txt 32 | if: steps.cache-venv.outputs.cache-hit != 'true' 33 | 34 | - name: Set up Docker Buildx 35 | id: buildx 36 | uses: docker/setup-buildx-action@master 37 | 38 | - name: Cache Docker layers for all-in-one 39 | uses: actions/cache@v2.1.4 40 | with: 41 | path: /tmp/.buildx-cache-all-in-one 42 | key: ${{ runner.os }}-buildx-all-in-one-${{ github.sha }} 43 | restore-keys: | 44 | ${{ runner.os }}-buildx-all-in-one- 45 | 46 | - name: Docker build all-in-one 47 | id: docker_build_all_in_one 48 | uses: docker/build-push-action@v2 49 | with: 50 | context: . 51 | file: ./docker/all-in-one.Dockerfile 52 | builder: ${{ steps.buildx.outputs.name }} 53 | load: true 54 | tags: frocket/all-in-one:latest 55 | cache-from: type=local,src=/tmp/.buildx-cache-all-in-one 56 | cache-to: type=local,dest=/tmp/.buildx-cache-all-in-one,mode=max 57 | 58 | - name: Cache Docker layers for local-lambda 59 | uses: actions/cache@v2.1.4 60 | with: 61 | path: /tmp/.buildx-cache-local-lambda 62 | key: ${{ runner.os }}-buildx-local-lambda-${{ github.sha }} 63 | restore-keys: | 64 | ${{ runner.os }}-buildx-local-lambda- 65 | 66 | - name: Docker build local-lambda 67 | id: docker_build_all_local_lambda 68 | uses: docker/build-push-action@v2 69 | with: 70 | context: . 
71 | file: ./docker/local-lambda.Dockerfile 72 | builder: ${{ steps.buildx.outputs.name }} 73 | load: true 74 | tags: frocket/local-lambda:latest 75 | cache-from: type=local,src=/tmp/.buildx-cache-local-lambda 76 | cache-to: type=local,dest=/tmp/.buildx-cache-local-lambda,mode=max 77 | 78 | - name: Launch docker-compose 79 | run: | 80 | docker-compose up -d 81 | sleep 2 82 | 83 | - name: Test with pytest 84 | run: | 85 | source venv/bin/activate 86 | export SKIP_SLOW_TESTS=true 87 | pytest --cov=frocket --cov-report=html 88 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/.DS_Store 2 | **/.ipynb_checkpoints/ 3 | **/__pycache__ 4 | **/*.pyc 5 | **/*.zip 6 | **/*.so 7 | **/*.parquet 8 | *.egg-info 9 | .eggs 10 | venv 11 | .idea 12 | *.iml 13 | .awsenv 14 | scratch 15 | map 16 | reduce 17 | build 18 | dist 19 | # Coverage report 20 | htmlcov 21 | .coverage 22 | .vscode -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.8.0 2 | -------------------------------------------------------------------------------- /build-lambda.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | set -e 3 | RED='\033[0;31m' 4 | GREEN='\033[0;32m' 5 | YELLOW='\033[1;33m' 6 | NC='\033[0m' # No Color 7 | GITHASH=`git rev-parse HEAD | cut -c1-8``[[ -z $(git status -s) ]] || echo dirty` 8 | [[ $1 == '--layer' ]] && LAYER=true || LAYER=false 9 | 10 | echo "${YELLOW}==> Building layer: ${LAYER}${NC}" 11 | echo "${YELLOW}==> Git commit hash: ${GITHASH}${NC}" 12 | echo "${YELLOW}==> Running docker build to install packages in Lambda-like image...${NC}" 13 | docker build -f docker/local-lambda.Dockerfile . 
-t frocket/local-lambda:latest 14 | docker run -d --name lambda-builder frocket/local-lambda:latest 15 | 16 | BUILD_DIR=$(mktemp -d -t build-lambda) 17 | echo "${YELLOW}==> Copying files from container to build directory: ${BUILD_DIR}...${NC}" 18 | mkdir -p $BUILD_DIR/function 19 | docker cp lambda-builder:/var/task/frocket $BUILD_DIR/function/frocket 20 | if [ "$LAYER" = true ]; then 21 | mkdir -p $BUILD_DIR/layer 22 | docker cp lambda-builder:/opt/python $BUILD_DIR/layer/python 23 | fi 24 | 25 | echo "${YELLOW}==> Stopping & removing container...${NC}" 26 | docker stop lambda-builder 27 | docker rm lambda-builder 28 | 29 | pushd $BUILD_DIR 30 | echo "${YELLOW}==> Cleaning-up a bit and zipping...${NC}" 31 | FUNCTION_ZIPFILE=lambda-function-${GITHASH}.zip 32 | [ "$LAYER" = true ] && LAYER_ZIPFILE=lambda-layer-${GITHASH}.zip || LAYER_ZIPFILE= 33 | 34 | if [ "$LAYER" = true ]; then 35 | find ./layer/python -type d -name tests | xargs rm -rf 36 | find ./layer/python -type d -name include | xargs rm -rf 37 | (cd layer && zip -qr ../$LAYER_ZIPFILE ./python) 38 | echo "${YELLOW}NOTE: Lambda size limit is 50mb compressed/250mb uncompressed for the function PLUS any layers it uses (unless using containers)${NC}" 39 | echo "${YELLOW}Lambda layer size, uncompressed:${NC}" 40 | du -sh ./layer 41 | echo "${YELLOW}Lambda layer size, zipped:${NC}" 42 | du -h $LAYER_ZIPFILE 43 | fi 44 | 45 | (cd function && zip -qr ../$FUNCTION_ZIPFILE ./frocket) 46 | echo "${YELLOW}Lambda function, zipped:${NC}" 47 | du -h $FUNCTION_ZIPFILE 48 | 49 | popd 50 | # Don't fail if previous files don't exist 51 | rm lambda-function-*.zip || true 52 | cp $BUILD_DIR/$FUNCTION_ZIPFILE . 53 | if [ "$LAYER" = true ]; then 54 | rm lambda-layer-*.zip || true 55 | cp $BUILD_DIR/$LAYER_ZIPFILE ./ 56 | fi 57 | rm -rf $BUILD_DIR 58 | echo "${YELLOW}DONE! 
copied to current dir:${NC}\n${FUNCTION_ZIPFILE} ${LAYER_ZIPFILE}" 59 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | * -------------------------------------------------------------------------------- /dataprep_example/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DynamicYieldProjects/funnel-rocket/70963fddc0881cebdc6da1af2654d412f95d660c/dataprep_example/__init__.py -------------------------------------------------------------------------------- /dataprep_example/ingest_retailrocket_dataset.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import argparse 4 | from pathlib import Path 5 | from contextlib import contextmanager 6 | import pandas as pd 7 | from pandas import DataFrame 8 | 9 | EVENTS_FILE = 'events.csv' 10 | PROPS_FILE_1 = 'item_properties_part1.csv' 11 | PROPS_FILE_2 = 'item_properties_part2.csv' 12 | INPUT_FILENAMES = {EVENTS_FILE, PROPS_FILE_1, PROPS_FILE_2} 13 | ITEM_PROPERTY_COLUMNS = {'categoryid', 'available', '790', '888'} 14 | EXPECTED_EVENT_COUNT = 2_500_516 15 | 16 | 17 | def progress_msg(msg: str): 18 | print(f"\033[33m{msg}\033[0m") # Yellow, just yellow 19 | 20 | 21 | @contextmanager 22 | def timed(caption: str): 23 | start = time.time() 24 | yield 25 | total = time.time() - start 26 | print(f"Time to {caption}: {total:.3f} seconds") 27 | 28 | 29 | # Read item properties files, filter for relevant columns and 'pivot' its structure from rows to columns 30 | def read_item_props(filepath: Path) -> DataFrame: 31 | df = pd.read_csv(filepath) 32 | df = df[df['property'].isin(ITEM_PROPERTY_COLUMNS)] 33 | first_value_per_item = df.groupby(["itemid", "property"])["value"].first() 34 | df = first_value_per_item.to_frame() 35 | df = df.unstack(level=-1) 36 | df.columns = df.columns.droplevel(0) 37 | return df 38 | 39 | 40 | def ingest(path: Path): 41 | with timed("read & transform item properties of all products"): 42 | item_props_tempfile = path / "item_props.parquet" 43 | if item_props_tempfile.exists(): 44 | progress_msg(f"Reading item properties from cached file {item_props_tempfile}") 45 | item_props_df = pd.read_parquet(item_props_tempfile) 46 | else: 47 | progress_msg("Reading item properties... 
(this takes a bit)") 48 | item_props_df1 = read_item_props(path / PROPS_FILE_1) 49 | item_props_df2 = read_item_props(path / PROPS_FILE_2) 50 | item_props_df = item_props_df1.combine_first(item_props_df2) 51 | progress_msg(f"Storing item properties to {item_props_tempfile} for faster re-runs...") 52 | item_props_df.to_parquet(item_props_tempfile) 53 | 54 | with timed("read & transform user events"): 55 | progress_msg("Reading user events...") 56 | events = pd.read_csv(path / EVENTS_FILE) 57 | progress_msg("Joining events with item properties...") 58 | events = pd.merge(events, item_props_df, how='inner', on='itemid') 59 | 60 | progress_msg("Making columns more queryable...") 61 | events['price'] = events['790'].str[1:].astype(float) / 1000 62 | events.drop(columns=['790'], inplace=True) 63 | events['available'] = events['available'].astype(int).astype(bool) 64 | events['categoryid'] = events['categoryid'].astype('category') 65 | events['event'] = events['event'].astype('category') 66 | events.rename(columns={'888': 'cryptic_attrs'}, inplace=True) 67 | progress_msg("Storing 'cryptic_attrs' also as categorical column 'cryptic_attrs_cat'...") 68 | events['cryptic_attrs_cat'] = events['cryptic_attrs'].astype('category') 69 | events.reset_index(drop=True) 70 | 71 | progress_msg("Excerpt from final DataFrame:") 72 | print(events) 73 | progress_msg("Columns types (a.k.a. dtypes):") 74 | print(events.dtypes) 75 | progress_msg("Breakdown of event types:") 76 | print(events['event'].value_counts()) 77 | 78 | if len(events) != EXPECTED_EVENT_COUNT: 79 | progress_msg(f"WARNING: Expected {EXPECTED_EVENT_COUNT} events, but final DataFrame has {len(events)}") 80 | 81 | output_file = path / 'retailrocket.parquet' 82 | events.to_parquet(output_file) 83 | col_memory_sizes = (events.memory_usage(deep=True) / 1024 ** 2).round(decimals=2) 84 | progress_msg(f'Size of DataFrame columns in memory (in MB):') 85 | print(col_memory_sizes) 86 | progress_msg(f"==> Saved output file to: {output_file}, size: {output_file.stat().st_size / 1024 ** 2:.1f}MB") 87 | 88 | with timed("load file - all columns"): 89 | pd.read_parquet(output_file) 90 | 91 | with timed("load file - just the 'cryptic_attrs' column"): 92 | pd.read_parquet(output_file, columns=['cryptic_attrs']) 93 | 94 | with timed("load file - just the 'cryptic_attrs_cat' column"): 95 | pd.read_parquet(output_file, columns=['cryptic_attrs_cat']) 96 | 97 | with timed("load file - all columns *except* these two"): 98 | cols = [col for col in events.dtypes.index 99 | if col not in ['cryptic_attrs', 'cryptic_attrs_cat']] 100 | pd.read_parquet(output_file, columns=cols) 101 | 102 | 103 | if __name__ == '__main__': 104 | parser = argparse.ArgumentParser( 105 | description='Ingest RetailRocket dataset (to download: https://www.kaggle.com/retailrocket/ecommerce-dataset/)') 106 | parser.add_argument( 107 | 'path', type=str, 108 | help='Directory where downloaded dataset files are found and output file will be written') 109 | args = parser.parse_args() 110 | 111 | path = Path(args.path) 112 | if not path.exists() or not path.is_dir(): 113 | sys.exit(f'No such directory: {path}') 114 | files_in_path = {f.name for f in path.iterdir()} 115 | if not files_in_path >= INPUT_FILENAMES: 116 | sys.exit(f'Missing one or more input files: {INPUT_FILENAMES}') 117 | ingest(path) 118 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 
services: 3 | redis: 4 | image: redis:6 5 | ports: 6 | - ${FROCKET_REDIS_PORT:-6379}:${FROCKET_REDIS_PORT:-6379} 7 | entrypoint: [ "redis-server", "--port", "${FROCKET_REDIS_PORT:-6379}" ] 8 | 9 | mock-s3: 10 | image: minio/minio:latest 11 | container_name: mock-s3 12 | ports: 13 | - 9000:9000 14 | environment: 15 | - MINIO_ROOT_USER=testonly 16 | - MINIO_ROOT_PASSWORD=testonly 17 | command: server /data 18 | 19 | frocket-queue-worker: 20 | build: 21 | dockerfile: docker/all-in-one.Dockerfile 22 | context: . 23 | image: frocket/all-in-one:latest 24 | volumes: 25 | - ./data:/app/data:ro,cached 26 | environment: 27 | - FROCKET_REDIS_HOST=redis 28 | - FROCKET_REDIS_PORT=${FROCKET_REDIS_PORT:-6379} 29 | - FROCKET_S3_AWS_ENDPOINT_URL=http://mock-s3:9000 30 | - FROCKET_S3_AWS_ACCESS_KEY_ID=testonly 31 | - FROCKET_S3_AWS_SECRET_ACCESS_KEY=testonly 32 | depends_on: 33 | - redis 34 | - mock-s3 35 | command: worker 36 | 37 | frocket-lambda-worker: 38 | build: 39 | dockerfile: docker/local-lambda.Dockerfile 40 | context: . 41 | image: frocket/local-lambda:latest 42 | container_name: mock-lambda 43 | volumes: 44 | - ./data:/data:ro,cached 45 | environment: 46 | - FROCKET_REDIS_HOST=redis 47 | - FROCKET_REDIS_PORT=${FROCKET_REDIS_PORT:-6379} 48 | - FROCKET_S3_AWS_ENDPOINT_URL=http://mock-s3:9000 49 | - FROCKET_S3_AWS_ACCESS_KEY_ID=testonly 50 | - FROCKET_S3_AWS_SECRET_ACCESS_KEY=testonly 51 | - AWS_REGION=us-east-1 52 | depends_on: 53 | - redis 54 | - mock-s3 55 | ports: 56 | - 9001:9001 57 | command: frocket.worker.impl.aws_lambda_worker.lambda_handler 58 | 59 | frocket-apiserver: 60 | image: frocket/all-in-one:latest 61 | container_name: frocket-apiserver 62 | ports: 63 | - 5000:5000 64 | volumes: 65 | - ./data:/app/data:ro,cached 66 | environment: 67 | - APISERVER_NUM_WORKERS=2 68 | - FROCKET_REDIS_HOST=redis 69 | - FROCKET_REDIS_PORT=${FROCKET_REDIS_PORT:-6379} 70 | - FROCKET_S3_AWS_ENDPOINT_URL=http://mock-s3:9000 71 | - FROCKET_S3_AWS_ACCESS_KEY_ID=testonly 72 | - FROCKET_S3_AWS_SECRET_ACCESS_KEY=testonly 73 | - FROCKET_LAMBDA_AWS_NO_SIGNATURE=true 74 | - FROCKET_LAMBDA_AWS_ENDPOINT_URL=http://mock-lambda:9001 75 | - FROCKET_LAMBDA_AWS_REGION=us-east-1 76 | - FROCKET_INVOKER_LAMBDA_LEGACY_ASYNC=false 77 | - FROCKET_INVOKER_RETRY_FAILED_INTERVAL=0.05 78 | # - FROCKET_INVOKER=aws_lambda 79 | depends_on: 80 | - redis 81 | command: apiserver 82 | -------------------------------------------------------------------------------- /docker/all-in-one.Dockerfile: -------------------------------------------------------------------------------- 1 | # Base Python image with up-to-date OS packages & pip 2 | FROM python:3.8-slim as base 3 | RUN apt-get update && apt-get clean && \ 4 | python -m pip install --upgrade pip 5 | 6 | # Builder image: install packages and then cleanup some un-needed large files and directories 7 | FROM base as package-install 8 | WORKDIR /app 9 | COPY ./requirements.txt . 
10 | RUN pip install --no-cache-dir --no-compile -r requirements.txt -t ./packages 11 | # Delete un-needed big files in pyarrow, tests & include dirs, 12 | # and all directories in botocore/data except for services actually used by frocket 13 | RUN rm ./packages/pyarrow/*flight*.so* \ 14 | ./packages/pyarrow/*plasma*.so* \ 15 | ./packages/pyarrow/plasma-store-server && \ 16 | find ./packages -type d -name tests | xargs rm -rf && \ 17 | find ./packages -type d -name include | xargs rm -rf && \ 18 | find ./packages/botocore/data -type d -mindepth 1 -maxdepth 1 | grep -vE 's3|lambda' | xargs rm -rf 19 | 20 | # This image is based on 'base' again, so it doesn't carry over intermediate fat layers from package-install image. 21 | # It copies over only the pruned packages to the final image. 22 | FROM base 23 | WORKDIR /app 24 | COPY ./docker/entrypoint.sh . 25 | RUN chmod +x ./entrypoint.sh 26 | RUN useradd -ms /bin/bash frocket 27 | COPY --from=package-install /app/packages packages 28 | # The most frequently-changing file set - the source code itself, is copied last so previous layers are unaffected 29 | COPY ./requirements.txt . 30 | COPY ./test-requirements.txt . 31 | COPY ./setup.py . 32 | COPY ./frocket frocket 33 | COPY ./tests tests 34 | RUN pip install --no-cache-dir --no-compile --no-deps . -t ./packages 35 | USER frocket 36 | ENV PYTHONPATH=/app/packages 37 | ENTRYPOINT ["./entrypoint.sh"] 38 | -------------------------------------------------------------------------------- /docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | case "$1" in 3 | worker) 4 | echo "Starting Funnel Rocket queue-based worker" 5 | python -m frocket.worker.impl.queue_worker 6 | ;; 7 | apiserver) 8 | PORT=${APISERVER_PORT:-5000} 9 | NUM_WORKERS=${APISERVER_NUM_WORKERS:-8} 10 | echo "Starting Funnel Rocket API server with $NUM_WORKERS workers on port $PORT" 11 | python -m gunicorn.app.wsgiapp frocket.apiserver:app --bind=0.0.0.0:$PORT --workers=$NUM_WORKERS 12 | ;; 13 | *) 14 | echo "Invalid command supplied" 15 | exit 1 16 | ;; 17 | esac 18 | -------------------------------------------------------------------------------- /docker/local-lambda.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTHON_VERSION=3.8 2 | # Note: not using multi-stage build here, in contrary to all-in-one image. 3 | # This has the pro of very fast incremental builds locally, and the con of large image size - ok for tests. 4 | # Since we're switching to root during build, 5 | # need to return to default Lambda user afterwards (as defined in base image) 6 | ARG RUN_USER=sbx_user1051 7 | FROM lambci/lambda:python3.8 8 | # Lambda function code should be in /var/task 9 | WORKDIR /var/task 10 | COPY ./setup.py . 11 | COPY ./requirements.txt . 12 | # Lambda layer(s) (useful for holding all big & infrequently changing dependencies) 13 | # should be located under /opt, which is only writable by root. 
14 | # Don't install boto3/botocore, which is vendored by AWS in its most appropriate version 15 | USER root 16 | RUN grep -v boto requirements.txt > lambda_requirements.txt 17 | RUN mkdir /opt/python && pip install --no-compile --no-cache-dir -r lambda_requirements.txt -t /opt/python 18 | # Clean-up some big files 19 | RUN rm /opt/python/pyarrow/*flight*.so* \ 20 | /opt/python/pyarrow/*plasma*.so* \ 21 | /opt/python/pyarrow/plasma-store-server \ 22 | setup.py requirements.txt lambda_requirements.txt 23 | # Go back to user & workdir of base image 24 | USER ${RUN_USER} 25 | # Copy package source code, which is frequently changing, only at end of Dockerfile 26 | COPY ./frocket /var/task/frocket 27 | WORKDIR /var/task 28 | # These values are for running tests, not production usage 29 | ENV DOCKER_LAMBDA_STAY_OPEN=1 \ 30 | AWS_LAMBDA_FUNCTION_NAME=frocket \ 31 | AWS_LAMBDA_FUNCTION_TIMEOUT=15 \ 32 | AWS_LAMBDA_FUNCTION_MEMORY_SIZE=256 33 | -------------------------------------------------------------------------------- /docs/logo-blue.svg: -------------------------------------------------------------------------------- (SVG markup not preserved in this dump) -------------------------------------------------------------------------------- /docs/logo-icon-dark-blue.svg: -------------------------------------------------------------------------------- (SVG markup not preserved in this dump) -------------------------------------------------------------------------------- /docs/logo-icon-light-blue.svg: -------------------------------------------------------------------------------- (SVG markup not preserved in this dump) -------------------------------------------------------------------------------- /docs/logo-small-blue.svg: -------------------------------------------------------------------------------- (SVG markup not preserved in this dump) -------------------------------------------------------------------------------- /frocket/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple CLI for Funnel Rocket. 
3 | 4 | This is currently a wrapper over invoker_api directly (meaning that the CLI process is the invoker), rather than 5 | calling an API server - so it does not rely on a running server, but needs the same permissions (listing files 6 | in remote storage, access to Redis as datastore, optionally being able to invoke Lambdas). 7 | 8 | This makes the CLI more suitable for onboarding and evaluation, but in production it's preferable to use the API 9 | (for a better permissions model and centralized monitoring/logging, if nothing else). 10 | 11 | The CLI does provide a few optional flags which make it also suitable for automating jobs: 12 | * --nopretty returns JSON object/s without any captions 13 | * --notrim and --nocolor prevent data from being shortened or surrounded by ANSI color codes 14 | * The log level is controllable, and all log lines have a prefix making them easy to ignore. 15 | """ 16 | # Copyright 2021 The Funnel Rocket Maintainers 17 | # 18 | # Licensed under the Apache License, Version 2.0 (the "License"); 19 | # you may not use this file except in compliance with the License. 20 | # You may obtain a copy of the License at 21 | # 22 | # http://www.apache.org/licenses/LICENSE-2.0 23 | # 24 | # Unless required by applicable law or agreed to in writing, software 25 | # distributed under the License is distributed on an "AS IS" BASIS, 26 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 27 | # See the License for the specific language governing permissions and 28 | # limitations under the License. 29 | 30 | import argparse 31 | # TODO backlog don't import any frocket modules but a carefully selected set which does not then import heavy packages 32 | # or initialize mechanisms. This is only partially done now (see import at end of file). 33 | from frocket.common.config import config 34 | from frocket.common.tasks.registration import DatasetValidationMode, REGISTER_DEFAULT_VALIDATION_MODE, \ 35 | REGISTER_DEFAULT_FILENAME_PATTERN, REGISTER_DEFAULT_VALIDATE_UNIQUES 36 | 37 | REGISTER_VALIDATION_MODE_CHOICES = [e.value.lower() for e in DatasetValidationMode] 38 | LOG_LEVEL_CHOICES = ['debug', 'info', 'warning', 'error', 'critical'] 39 | LOG_LINE_PREFIX = '[Log ' 40 | LOG_FORMAT = LOG_LINE_PREFIX + '%(levelname)s %(name)s] %(message)s' 41 | 42 | 43 | def build_parser() -> argparse.ArgumentParser: 44 | parser = argparse.ArgumentParser(description='Simple CLI for Funnel Rocket', 45 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 46 | parser.add_argument('--notrim', action='store_true', help='Don\'t trim any text') 47 | parser.add_argument('--nocolor', action='store_true', help='Don\'t colorize any text') 48 | parser.add_argument('--nopretty', action='store_true', help='Don\'t pretty-print the response') 49 | parser.add_argument('--loglevel', type=str.lower, choices=LOG_LEVEL_CHOICES, 50 | help=f'Set log level {LOG_LEVEL_CHOICES}') 51 | subparsers = parser.add_subparsers(dest='command', title='commands') 52 | subparsers.required = True 53 | 54 | register_parser = subparsers.add_parser('register', help='Register a dataset', 55 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 56 | register_parser.add_argument('name', type=str, help='Dataset name') 57 | register_parser.add_argument('basepath', type=str, 58 | help='The path all files are directly under. Local and s3://... paths supported.') 59 | register_parser.add_argument('group_id_column', type=str, 60 | help='The column to group rows by, e.g. "userId", "userHash". 
' 61 | 'This column is required and no values can be missing. Each part (file) in the ' 62 | 'dataset should have a distinct set of values for this column.') 63 | register_parser.add_argument( 64 | 'timestamp_column', type=str, 65 | help='The column holding the timestamp of each row, e.g. "timestamp", "ts". ' 66 | 'Must be a numeric column with no values missing. Using a unix timestamp is advised - ' 67 | 'with or without sub-second resolution based on your needs, either as int or float.') 68 | register_parser.add_argument('--pattern', type=str, default=REGISTER_DEFAULT_FILENAME_PATTERN, 69 | help='Filename pattern. Sub-directories are currently not supported.') 70 | register_parser.add_argument('--validation', type=str.lower, 71 | choices=REGISTER_VALIDATION_MODE_CHOICES, 72 | default=REGISTER_DEFAULT_VALIDATION_MODE.value.lower(), 73 | help=f"Validation mode to use {REGISTER_VALIDATION_MODE_CHOICES}", 74 | metavar='MODE') 75 | register_parser.add_argument('--skip-uniques', action='store_true', 76 | default=not REGISTER_DEFAULT_VALIDATE_UNIQUES, 77 | help='Skip validation of group_id_column values uniqueness across files ' 78 | '(the set of files to test is determined by --validation argument)') 79 | 80 | list_parser = subparsers.add_parser('list', help='List datasets') 81 | 82 | run_query_parser = subparsers.add_parser('run', help='Run query') 83 | run_query_parser.add_argument('dataset') 84 | query_sources_group = run_query_parser.add_mutually_exclusive_group(required=True) 85 | query_sources_group.add_argument('--file', '-f', type=str, help='Run query stored in file', dest='filename') 86 | query_sources_group.add_argument('--empty', '-e', action='store_true', 87 | help='Run an empty query with no conditions') 88 | query_sources_group.add_argument('--string', '-s', type=str, 89 | help='Run the following query string', dest='query_string') 90 | 91 | info_parser = subparsers.add_parser('info', help='Show dataset information') 92 | info_parser.add_argument('dataset', type=str) 93 | info_parser.add_argument('--full', action='store_true', help='Show full schema') 94 | 95 | unreg_parser = subparsers.add_parser('unregister', help='Unregister a dataset') 96 | unreg_parser.add_argument('dataset', type=str) 97 | unreg_parser.add_argument('--force', action='store_true', 98 | help='Unregister a dataset even if it\'s currently in use') 99 | 100 | config_parser = subparsers.add_parser('config', help='Show configuration') 101 | return parser 102 | 103 | 104 | def run_from_args(args: argparse.Namespace): 105 | config['log.format'] = LOG_FORMAT if args.nocolor else f"\033[33m{LOG_FORMAT}\033[0m" 106 | if args.loglevel: 107 | config['log.level'] = args.loglevel 108 | config.init_logging(force_console_output=True) 109 | 110 | # invoker_api isn't loaded (or logging implicitly initialized) till arguments are validated and log level is set 111 | from frocket.cli_commands import run_command 112 | run_command(args.command, args) 113 | 114 | 115 | if __name__ == '__main__': 116 | parser = build_parser() 117 | args = parser.parse_args() 118 | run_from_args(args) 119 | -------------------------------------------------------------------------------- /frocket/cli_commands.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of CLI commands. 3 | """ 4 | # Copyright 2021 The Funnel Rocket Maintainers 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import argparse 19 | import json 20 | import sys 21 | from json.decoder import JSONDecodeError 22 | from pathlib import Path 23 | from typing import Any 24 | from tabulate import tabulate 25 | from frocket.common.config import config 26 | from frocket.common.serializable import SerializableDataClass 27 | from frocket.common.tasks.base import BaseApiResult 28 | from frocket.common.tasks.registration import DatasetValidationMode, RegisterArgs 29 | from frocket.invoker import invoker_api 30 | 31 | DATE_FORMAT = '%Y-%m-%d %H:%M:%S %Z' 32 | 33 | 34 | def run_command(cmd: str, args: argparse.Namespace): 35 | mapping = { 36 | 'register': register_dataset_cmd, 37 | 'unregister': unregister_dataset_cmd, 38 | 'list': list_datasets_cmd, 39 | 'run': run_query_cmd, 40 | 'info': dataset_info_cmd, 41 | 'config': show_config_cmd 42 | } 43 | mapping[cmd](args) 44 | 45 | 46 | def fail_missing_dataset(name: str): 47 | sys.exit(f"Dataset '{name}' not found!") 48 | 49 | 50 | def trim_column(s: str, args: argparse.Namespace, maxwidth: int) -> str: 51 | if args.notrim or args.nopretty or len(s) <= maxwidth: 52 | return s 53 | else: 54 | return s[:maxwidth - 3] + '...' 55 | 56 | 57 | def print_json(name: str, o: Any, pretty_print: bool): 58 | def to_json(o: Any, indent: int = None) -> str: 59 | return o.to_json(indent=indent) if isinstance(o, SerializableDataClass) else json.dumps(o, indent=indent) 60 | 61 | if pretty_print: 62 | print(name + ':', to_json(o, indent=2)) 63 | else: 64 | print(to_json(o)) 65 | 66 | 67 | def handle_api_result(res: BaseApiResult, pretty_print: bool): 68 | print_json('API Result', res, pretty_print) 69 | if not res.success: 70 | sys.exit('FAILED' if pretty_print else 1) 71 | 72 | 73 | def register_dataset_cmd(args): 74 | validation_mode = DatasetValidationMode[args.validation.upper()] 75 | register_args = RegisterArgs(name=args.name, 76 | basepath=args.basepath, 77 | group_id_column=args.group_id_column, 78 | timestamp_column=args.timestamp_column, 79 | pattern=args.pattern, 80 | validation_mode=validation_mode, 81 | validate_uniques=not args.skip_uniques) 82 | res = invoker_api.register_dataset(register_args) 83 | handle_api_result(res, pretty_print=not args.nopretty) 84 | 85 | 86 | def unregister_dataset_cmd(args): 87 | res = invoker_api.unregister_dataset(args.dataset, force=args.force) 88 | handle_api_result(res, pretty_print=not args.nopretty) 89 | 90 | 91 | def list_datasets_cmd(args): 92 | datasets = sorted(invoker_api.list_datasets(), key=lambda ds: ds.id.registered_at, reverse=True) 93 | display_datasets = [{'name': trim_column(ds.id.name, args, maxwidth=30), 94 | 'registered at': ds.id.registered_at.strftime(DATE_FORMAT), 95 | 'parts': ds.total_parts, 96 | 'group id': ds.group_id_column, 97 | 'timestamp': ds.timestamp_column, 98 | 'path': trim_column(ds.basepath, args, maxwidth=50)} 99 | for ds in datasets] 100 | if args.nopretty: 101 | print(json.dumps(display_datasets)) 102 | else: 103 | if len(datasets) == 0: 104 | print('No datasets registered yet') 105 | else: 106 | print(tabulate(display_datasets, headers='keys')) 107 
| 108 | 109 | def json_parse(s: str) -> dict: 110 | try: 111 | return json.loads(s) 112 | except JSONDecodeError as e: 113 | sys.exit(f'JSON Error: {e}') 114 | 115 | 116 | def run_query_cmd(args): 117 | ds_info = invoker_api.get_dataset(args.dataset) 118 | if not ds_info: 119 | fail_missing_dataset(args.dataset) 120 | query = None 121 | if args.empty: 122 | query = {} 123 | elif args.query_string: 124 | query = json_parse(args.query_string) 125 | elif args.filename: 126 | filepath = Path(args.filename) 127 | if not filepath.exists(): 128 | sys.exit(f'File not found: {args.filename}') 129 | else: 130 | query_str = filepath.read_text(encoding='utf-8') 131 | query = json_parse(query_str) 132 | else: 133 | sys.exit('Unknown mode') 134 | 135 | try: 136 | res = invoker_api.run_query(ds_info, query) 137 | handle_api_result(res, pretty_print=not args.nopretty) 138 | except Exception as e: 139 | sys.exit(f'Error: {e}') 140 | 141 | 142 | def dataset_info_cmd(args): 143 | show_full = args.full 144 | ds_info = invoker_api.get_dataset(args.dataset) 145 | if not ds_info: 146 | fail_missing_dataset(args.dataset) 147 | parts_info = invoker_api.get_dataset_parts(ds_info) 148 | schema_info = invoker_api.get_dataset_schema(ds_info, full=show_full) 149 | print_json('Basic information', ds_info, pretty_print=not args.nopretty) 150 | print_json('Parts', parts_info, pretty_print=not args.nopretty) 151 | print_json(f'Schema (full: {show_full})', schema_info, pretty_print=not args.nopretty) 152 | 153 | 154 | def show_config_cmd(args): 155 | print_json(f'Configuration', config, pretty_print=not args.nopretty) 156 | -------------------------------------------------------------------------------- /frocket/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/common/dataset.py: -------------------------------------------------------------------------------- 1 | """Base classes for registered datasets and their metadata.""" 2 | # Copyright 2021 The Funnel Rocket Maintainers 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
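# Illustrative sketch: how the metadata classes defined in this module compose. All dataset, file and
# column names below are made-up examples, not values taken from any real dataset:
#
#   parts = DatasetPartsInfo(naming_method=PartNamingMethod.LIST, total_parts=2, total_size=4096,
#                            filenames=['part-00000.parquet', 'part-00001.parquet'])
#   ds = DatasetInfo(basepath='s3://mybucket/mydataset', total_parts=2, id=DatasetId.now('mydataset'),
#                    group_id_column='userId', timestamp_column='timestamp')
#   parts.fullpaths(ds)  # -> ['s3://mybucket/mydataset/part-00000.parquet',
#                        #     's3://mybucket/mydataset/part-00001.parquet']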
15 | 16 | import logging 17 | from enum import auto 18 | from datetime import datetime, timezone 19 | from typing import Optional, List, Dict 20 | from dataclasses import dataclass, field 21 | from frocket.common.serializable import SerializableDataClass, AutoNamedEnum 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class PartNamingMethod(AutoNamedEnum): 27 | """ 28 | For future use: currently the full list of dataset filenames is stored as metadata, however, if a consistent 29 | numbering pattern can be identified, it may be useful both for more compact metadata and for a more predictable 30 | part index -> filename mapping. 31 | """ 32 | RUNNING_NUMBER = auto() 33 | LIST = auto() 34 | 35 | 36 | @dataclass(frozen=True) 37 | class DatasetId(SerializableDataClass): 38 | """ 39 | The main reason why this class exists: datasets can be re-registered multiple times with the same name, but any 40 | caching behavior should be sensitive to the registered date and become invalid on re-registration. 41 | In concrete terms, caching should be based on DatasetId keys (which are immutable) rather than a dataset name. 42 | 43 | Re-registering a dataset is useful in cases such as: 44 | 1. When you don't need to manage revisions yourself (via specifying a new dataset name and un-registering old ones). 45 | 2. As an alias to the current version (datasets are only metadata, you can register the same physical files N times) 46 | 3. If the datafiles were found to be incomplete/invalid, and after fixing the issue you want to invalidate caching. 47 | """ 48 | name: str 49 | registered_at: datetime 50 | 51 | @classmethod 52 | def now(cls, name: str): 53 | return DatasetId(name, registered_at=datetime.now(tz=timezone.utc)) 54 | 55 | 56 | @dataclass(frozen=True) 57 | class DatasetPartId(SerializableDataClass): 58 | """Specifies a single part (file) in a dataset version (see documentation for DatasetId above!).""" 59 | dataset_id: DatasetId 60 | path: str 61 | part_idx: int 62 | 63 | 64 | @dataclass(frozen=True) 65 | class DatasetInfo(SerializableDataClass): 66 | """ 67 | Basic metadata for a dataset. 68 | 69 | This class should be kept pretty small, as it's passed along in task requests. 70 | More detailed metadata is found in the data schema object, which is stored separately and read when needed 71 | (and also exists in both short and full versions) 72 | """ 73 | basepath: str 74 | total_parts: int 75 | id: DatasetId 76 | group_id_column: str  # The column by which the dataset is partitioned, and grouping is done. 77 | timestamp_column: str  # The column by which timeframe conditions and funnels are run. 78 | 79 | 80 | @dataclass(frozen=True) 81 | class DatasetPartsInfo(SerializableDataClass): 82 | """Holds the list of files in the dataset. 
Separate from DatasetInfo only due to size (this data usually does not 83 | need to be sent in network calls).""" 84 | naming_method: PartNamingMethod 85 | total_parts: int 86 | total_size: int 87 | running_number_pattern: Optional[str] = field(default=None) 88 | filenames: Optional[List[str]] = field(default=None) 89 | 90 | def fullpaths(self, parent: DatasetInfo) -> List[str]: 91 | parentpath = parent.basepath if parent.basepath.endswith('/') else parent.basepath + '/' 92 | 93 | if self.naming_method == PartNamingMethod.LIST: 94 | assert (self.filenames and len(self.filenames) == parent.total_parts) 95 | return [parentpath + filename for filename in self.filenames] 96 | else: 97 | assert self.running_number_pattern 98 | return [parentpath + self.running_number_pattern.format(idx) 99 | for idx in range(parent.total_parts)] 100 | 101 | 102 | class DatasetColumnType(AutoNamedEnum): 103 | INT = auto() 104 | FLOAT = auto() 105 | BOOL = auto() 106 | # Categorical columns are not a separate type to the query engine. That designation exists and is used separately. 107 | STRING = auto() 108 | 109 | 110 | @dataclass(frozen=True) 111 | class DatasetColumnAttributes(SerializableDataClass): 112 | """ 113 | The 'full' information on each column. TODO backlog use polymorphism? (needs support in de-serialization) 114 | 115 | For columns which were either saved by Pandas as categoricals, or are identified during registration to be such, 116 | store a mapping of top N values (configurable) to their normalized share in the dataset. Since registration 117 | does not read all files but only a sample, that ratio cannot be an absolute number or the exact ratio - but still 118 | useful for clients. 119 | 120 | cat_unique_ratio is the ratio of unique value count to all values (or: series.nunique()/len(series)), and may be 121 | a useful rough indicator of how much RAM is saved (and str.match() operations sped-up!) by the categorical 122 | representation. Columns are determined to be loaded as categorical if this value is lower than configured. 123 | Loading of columns as categoricals is also usually much faster, but that greatly depends on whether a dictionary 124 | was saved for that column in the Parquet file or not - so it depends on the tool used to create these files. 125 | """ 126 | numeric_min: Optional[float] = None 127 | numeric_max: Optional[float] = None 128 | categorical: bool = False 129 | cat_top_values: Optional[Dict[str, float]] = None 130 | cat_unique_ratio: Optional[float] = None 131 | 132 | 133 | @dataclass(frozen=True) 134 | class DatasetColumn(SerializableDataClass): 135 | name: str 136 | dtype_name: str 137 | coltype: DatasetColumnType 138 | colattrs: DatasetColumnAttributes 139 | 140 | 141 | @dataclass(frozen=True) 142 | class DatasetShortSchema(SerializableDataClass): 143 | """Schema, the short version - typically all you may need.""" 144 | columns: Dict[str, DatasetColumnType] 145 | min_timestamp: float 146 | max_timestamp: float 147 | # In files created by Pandas with its metadata intact in the Parquet file, columns marked as categoricals. 148 | source_categoricals: List[str] = field(default=None) 149 | # Columns detected during registration to be good candidates for explicitly loading as categoricals (by PyArrow). 
150 | potential_categoricals: List[str] = field(default=None) 151 | 152 | 153 | @dataclass(frozen=True) 154 | class DatasetSchema(SerializableDataClass): 155 | group_id_column: str 156 | timestamp_column: str 157 | columns: Dict[str, DatasetColumn] 158 | # Just the names->dtypes of all columns not (currently) supported. 159 | unsupported_columns: Dict[str, str] 160 | 161 | def short(self) -> DatasetShortSchema: 162 | """Make short from full.""" 163 | cols = {name: col.coltype for name, col in self.columns.items()} 164 | source_categoricals = [] 165 | potential_categoricals = [] 166 | for name, col in self.columns.items(): 167 | if col.colattrs.categorical: 168 | if col.dtype_name == 'category': 169 | source_categoricals.append(name) 170 | else: 171 | potential_categoricals.append(name) 172 | ts_attrs = self.columns[self.timestamp_column].colattrs 173 | min_ts = ts_attrs.numeric_min 174 | max_ts = ts_attrs.numeric_max 175 | 176 | return DatasetShortSchema(columns=cols, 177 | source_categoricals=source_categoricals, 178 | potential_categoricals=potential_categoricals, 179 | min_timestamp=min_ts, max_timestamp=max_ts) 180 | -------------------------------------------------------------------------------- /frocket/common/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/common/helpers/pandas.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Iterable 16 | import pandas as pd 17 | import numpy as np 18 | 19 | 20 | def filter_by_isin(df: pd.DataFrame, column: str, values: Iterable) -> pd.DataFrame: 21 | """ 22 | For the given DataFrame, return only rows where df[column] is in the given values. 23 | This is a surprisingly faster alternative to built-in Pandas/NumPy functions: df[np.isin(df[column], values)] 24 | A value can appear in multiple rows (e.g. the same user ID appearing multiple rows) 25 | 26 | TODO Merge a [Numba-based isin()](https://stackoverflow.com/questions/53046473/numpy-isin-performance-improvement) 27 | function, compiled AOT for relevant array dtypes. 
This would be arch-dependent and optional (with fallback) 28 | """ 29 | # First, create a "map" series from all possible values in the column => whether they should pass the filter 30 | all_ids = df[column].unique() 31 | is_id_relevant = pd.Series(np.zeros(len(all_ids)), index=all_ids).astype('bool') # Default false 32 | is_id_relevant.loc[values] = True 33 | 34 | # Create a boolean mask for column, based on the mapping above. Grab the raw array. 35 | mask = is_id_relevant[df[column]].values 36 | # Apply mask 37 | return df[mask] 38 | 39 | 40 | def add_column_by_value_map(df: pd.DataFrame, keys_column: str, values_map_series: pd.Series, new_column: str) -> None: 41 | """ 42 | Add a new column to the given df. For each row, df[new_column] will be set to an appropriate value from 43 | values_map_series: the value whose index is df[keys_column] in that row. 44 | 45 | e.g. given a DF of user activities having a userId column (with potentially multiple rows per user), and a 46 | values_map_series whose unique index is a User ID, and its values are the age of that user, the function will add 47 | a new column to the given DF with the age of that row's user ID 48 | 49 | If a value in keys_column does not have a matching index in values_map_series, the cell value would be NaN. 50 | This function is optimized for performance. 51 | 52 | The given DF is modified inplace. 53 | """ 54 | # Create a new mapping between ALL unique values of IDs of df[keys_column] and their matching value (or NaN) 55 | unique_keys = df[keys_column].unique() 56 | key_to_value = pd.Series(data=np.nan, index=unique_keys) 57 | key_to_value.loc[values_map_series.index] = values_map_series 58 | 59 | # Now we can create the new column, using the mapping 60 | df[new_column] = key_to_value[df[keys_column]].values 61 | -------------------------------------------------------------------------------- /frocket/common/helpers/storage.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple abstraction of local & remote filesystems. 3 | 4 | Currently supports either a local filesystem (for non-distributed usage, or potentially a fast network share) 5 | and S3 (or S3-compatible object stores such as MinIO, which is used for running tests). 6 | Additional protocols are welcome. 7 | 8 | TODO backlog: support pagination for S3 listing (so more than 1,000 files per dataset) 9 | TODO backlog: support auto-identification of numbering pattern in dataset files, so the full list of filenames 10 | would not have to reside in the datastore 11 | """ 12 | # Copyright 2021 The Funnel Rocket Maintainers 13 | # 14 | # Licensed under the Apache License, Version 2.0 (the "License"); 15 | # you may not use this file except in compliance with the License. 16 | # You may obtain a copy of the License at 17 | # 18 | # http://www.apache.org/licenses/LICENSE-2.0 19 | # 20 | # Unless required by applicable law or agreed to in writing, software 21 | # distributed under the License is distributed on an "AS IS" BASIS, 22 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 23 | # See the License for the specific language governing permissions and 24 | # limitations under the License. 
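# Illustrative usage sketch of this module, as used when registering a dataset and loading its parts.
# The bucket, path and pattern below are made-up examples:
#
#   handler = storage_handler_for('s3://mybucket/mydataset')  # paths without a protocol map to the local filesystem
#   parts_info = handler.discover_files(pattern='*.parquet')  # DatasetPartsInfo with matching filenames and total size
#   local_copy = handler.get_local_path('s3://mybucket/mydataset/part-00000.parquet')  # remote files are fetched to a temp path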
25 | 26 | import logging 27 | import re 28 | import tempfile 29 | import uuid 30 | from abc import abstractmethod 31 | from enum import Enum, auto 32 | from fnmatch import fnmatch 33 | from pathlib import Path 34 | from typing import NamedTuple, Optional, List 35 | import boto3 36 | from frocket.common.config import config 37 | from frocket.common.dataset import DatasetPartsInfo, PartNamingMethod 38 | 39 | logger = logging.getLogger(__name__) 40 | 41 | 42 | class StorageHandler: 43 | """Simple abstraction of a storage protocol.""" 44 | class FileBaseInfo(NamedTuple): 45 | relpath: str 46 | size: int 47 | 48 | def __init__(self, path: str): 49 | assert self.valid(path) 50 | self._path = path 51 | 52 | @classmethod 53 | def valid(cls, path: str) -> bool: 54 | """For validation of a path prior to instantiating the handler - a nicety instead of exceptions later, 55 | to be overriden where appropriate.""" 56 | return True 57 | 58 | @property 59 | @abstractmethod 60 | def remote(self) -> bool: 61 | """This affects the caching behavior used by workers (see part_loader.py).""" 62 | pass 63 | 64 | @abstractmethod 65 | def _list_files(self, pattern: str) -> List[FileBaseInfo]: 66 | """Override in subclasses""" 67 | pass 68 | 69 | def discover_files(self, pattern: str) -> DatasetPartsInfo: 70 | files = self._list_files(pattern) 71 | files.sort(key=lambda fi: fi.relpath) 72 | # TODO backlog implement PartNamingMethod.RUNNING_NUMBER for compact metadata in large datasets 73 | parts_info = DatasetPartsInfo(naming_method=PartNamingMethod.LIST, 74 | total_parts=len(files), 75 | total_size=sum([fi.size for fi in files]), 76 | filenames=[fi.relpath for fi in files], 77 | running_number_pattern=None) 78 | return parts_info 79 | 80 | @abstractmethod 81 | def _local_path(self, fullpath: str) -> str: 82 | """ 83 | If the filesystem is remote, download and return a local copy. 84 | Files should be cleaned-up by the caller which controls the caching behavior. 
85 | """ 86 | pass 87 | 88 | def get_local_path(self, fullpath: str) -> str: 89 | if not fullpath.startswith(self._path): 90 | raise Exception(f"Given full path {fullpath} is not under handler's path {self._path}") 91 | 92 | return self._local_path(fullpath) 93 | 94 | 95 | class FileStorageHanler(StorageHandler): 96 | """Super-simple local filesystem handler""" 97 | @property 98 | def remote(self): 99 | return False 100 | 101 | def _list_files(self, pattern): 102 | paths = Path(self._path).iterdir() 103 | files = [StorageHandler.FileBaseInfo(path.name, path.stat().st_size) 104 | for path in paths 105 | if fnmatch(path.name, pattern)] 106 | return files 107 | 108 | def _local_path(self, fullpath): 109 | if not Path(fullpath).is_file(): 110 | raise Exception(f"Path is missing/not a file: {fullpath}") 111 | return fullpath 112 | 113 | 114 | class S3StorageHanler(StorageHandler): 115 | """S3 filesystem handler, supports datasets directly under the bucket or within a sub-directory.""" 116 | S3_PATH_REGEX = re.compile(r"^s3://([a-zA-Z0-9_\-.]+)/([a-zA-Z0-9_\-./]*)$") 117 | 118 | def __init__(self, path: str): 119 | super().__init__(path) 120 | path_parts = self.S3_PATH_REGEX.match(path) 121 | self._bucket = path_parts.group(1) 122 | self._path_in_bucket = path_parts.group(2) 123 | no_trailing_slash = self._path_in_bucket and self._path_in_bucket[-1:] != '/' 124 | self._path_in_bucket_normalized = self._path_in_bucket + ('/' if no_trailing_slash else '') 125 | 126 | @classmethod 127 | def valid(cls, path): 128 | return True if cls.S3_PATH_REGEX.match(path) else False 129 | 130 | @property 131 | def remote(self): 132 | return True 133 | 134 | def _list_files(self, pattern): 135 | path_in_bucket = self._path_in_bucket_normalized 136 | logger.info(f"Listing files in S3 with bucket {self._bucket} and prefix {path_in_bucket}...") 137 | # TODO backlog support pagination 138 | s3response = self._client().list_objects_v2(Bucket=self._bucket, Prefix=path_in_bucket) 139 | 140 | filename_start_idx = len(path_in_bucket) 141 | path_to_size = {obj['Key'][filename_start_idx:]: obj['Size'] for obj in s3response['Contents']} 142 | files = [StorageHandler.FileBaseInfo(path, size) 143 | for path, size in path_to_size.items() 144 | if fnmatch(path, pattern)] 145 | return files 146 | 147 | def _local_path(self, fullpath): 148 | localpath = str(Path(tempfile.gettempdir()) / str(uuid.uuid4())) 149 | logger.info(f"Downloading {fullpath} to {localpath}...") 150 | self._client().download_file(self._bucket, self._path_in_bucket, localpath) 151 | return localpath 152 | 153 | @classmethod 154 | def _client(cls): 155 | if not hasattr(cls, '_s3client'): 156 | cls._s3client = boto3.client('s3', **config.aws_client_settings(service='s3')) 157 | return cls._s3client 158 | 159 | 160 | class StorageProtocol(Enum): 161 | FILE = auto() 162 | S3 = auto() 163 | 164 | @classmethod 165 | def get(cls, name: str): 166 | return cls.__members__.get(name.upper()) 167 | 168 | @classmethod 169 | def names(cls) -> List[str]: 170 | return list(cls.__members__.keys()) 171 | 172 | 173 | PATH_WITH_PROTOCOL_RE = r'(\w+)://(.+)$' 174 | PROTOCOL_TO_HANDLER = { 175 | StorageProtocol.FILE: FileStorageHanler, 176 | StorageProtocol.S3: S3StorageHanler 177 | } 178 | 179 | 180 | def storage_handler_for(path: str, throw_if_missing: bool = True) -> Optional[StorageHandler]: 181 | """ 182 | Instantiate the appropriate handler for the given path. 183 | Paths without explicit protocol are considered local. 
184 | """ 185 | path_and_protocol = re.match(PATH_WITH_PROTOCOL_RE, path) 186 | if path_and_protocol: 187 | method_name = path_and_protocol.groups()[0] 188 | method = StorageProtocol.get(method_name) 189 | if not method: 190 | if throw_if_missing: 191 | raise Exception(f"Storage protocol '{method_name}' is not in supported list: {StorageProtocol.names()}") 192 | else: 193 | return None 194 | elif method == StorageProtocol.FILE: 195 | path = path_and_protocol.groups()[1] 196 | else: 197 | method = StorageProtocol.FILE 198 | 199 | handler_cls = PROTOCOL_TO_HANDLER[method] 200 | if not handler_cls.valid(path): 201 | raise Exception(f"Invalid path: {path} (protocol: {method.name})") 202 | return handler_cls(path) 203 | -------------------------------------------------------------------------------- /frocket/common/helpers/utils.py: -------------------------------------------------------------------------------- 1 | """For everything but the kitchen sink.""" 2 | # Copyright 2021 The Funnel Rocket Maintainers 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import functools 17 | import math 18 | import random 19 | import uuid 20 | import time 21 | from io import BytesIO 22 | from typing import Optional, List 23 | import pandas as pd 24 | import pyarrow.feather as feather 25 | import numpy as np 26 | 27 | 28 | def terminal_red(message: str) -> str: 29 | return f"\033[31m{message}\033[0m" 30 | 31 | 32 | def terminal_green(message: str) -> str: 33 | return f"\033[32m{message}\033[0m" 34 | 35 | 36 | def memoize(obj): 37 | """Standard issue memoization decorator for caching function results (which don't need invalidation).""" 38 | cache = obj._cache = {} 39 | 40 | @functools.wraps(obj) 41 | def memoizer(*args, **kwargs): 42 | key = str(args) + str(kwargs) 43 | if key not in cache: 44 | cache[key] = obj(*args, **kwargs) 45 | return cache[key] 46 | 47 | return memoizer 48 | 49 | 50 | def sample_from_range(range_max: int, 51 | sample_ratio: float, 52 | max_samples: int, 53 | preselected: Optional[List[int]]) -> List[int]: 54 | """ 55 | Given a range of numbers in 0..range_max, return random samples. 56 | Count of samples is set by sample_ratio, up to max_samples. 57 | If preselected is passed, include these indexes first. 
58 | """ 59 | available_indexes = list(range(range_max)) 60 | sample_count = min(math.floor(range_max * sample_ratio), max_samples) 61 | 62 | if preselected: 63 | chosen = list(preselected) 64 | for i in preselected: 65 | available_indexes.remove(i) 66 | sample_count = max(sample_count - len(preselected), 0) 67 | else: 68 | chosen = [] 69 | 70 | if sample_count > 0: 71 | chosen += random.choices(available_indexes, k=sample_count) 72 | return chosen 73 | 74 | 75 | def timestamped_uuid(prefix: str = None) -> str: 76 | return f"{prefix or ''}{math.floor(time.time())}-{str(uuid.uuid4())[:8]}" 77 | 78 | 79 | def ndarray_to_bytes(arr: np.ndarray) -> bytes: 80 | """Use PyArrow's feather format as a compute- and space-efficient format for serializing NumPy arrays.""" 81 | df = pd.DataFrame(data={'arr': arr}) 82 | buf = BytesIO() 83 | # noinspection PyTypeChecker 84 | feather.write_feather(df, buf) 85 | buf.seek(0) 86 | return buf.read() 87 | 88 | 89 | def bytes_to_ndarray(data: bytes) -> np.ndarray: 90 | df = feather.read_feather(BytesIO(data)) 91 | return df['arr'] 92 | -------------------------------------------------------------------------------- /frocket/common/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/common/tasks/async_tracker.py: -------------------------------------------------------------------------------- 1 | """ 2 | AsyncJobTracker object is handed by invoker_api to clients that launch a job in a non-blocking fashion. 3 | It enables either periodic polling or blocking on updates. Updates are guaranteed to be atomic - that is, 4 | there may be further updates, but the status you have in hand is consistent. 5 | """ 6 | # Copyright 2021 The Funnel Rocket Maintainers 7 | # 8 | # Licensed under the Apache License, Version 2.0 (the "License"); 9 | # you may not use this file except in compliance with the License. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 
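# Editor's note: a minimal polling sketch of how a client might consume the tracker
# defined below (hypothetical usage; `run_job_async` stands in for whichever
# invoker_api call hands back an AsyncJobTracker):
#
#   tracker = run_job_async(...)
#   for status in tracker.generator():               # blocks until each update arrives
#       print(status.stage.name, status.message, status.task_counters)
#   final_result = tracker.status.result             # set once stage == AsyncJobStage.DONE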
19 | 20 | import logging 21 | import time 22 | from abc import ABCMeta, abstractmethod 23 | from dataclasses import dataclass 24 | from enum import auto 25 | from queue import Queue, Empty 26 | from typing import Optional, Dict, Generator 27 | from frocket.common.serializable import AutoNamedEnum 28 | from frocket.common.tasks.base import BaseJobResult, TaskStatus 29 | 30 | logger = logging.getLogger(__name__) 31 | 32 | 33 | class AsyncJobStage(AutoNamedEnum): 34 | STARTING = auto() 35 | RUNNING = auto() 36 | FINISHING = auto() 37 | DONE = auto() 38 | 39 | 40 | @dataclass(frozen=True) 41 | class AsyncJobStatus: 42 | stage: AsyncJobStage 43 | message: Optional[str] = None # The job may set descriptive text for what it's doing 44 | result: Optional[BaseJobResult] = None # Only available on stage=AsyncJobStage.DONE 45 | task_counters: Optional[Dict[TaskStatus, int]] = None 46 | 47 | 48 | class JobTimeoutError(Exception): 49 | pass 50 | 51 | 52 | class AsyncJobTracker(metaclass=ABCMeta): 53 | """The interface as known to clients""" 54 | 55 | @property 56 | @abstractmethod 57 | def status(self) -> AsyncJobStatus: 58 | """Get the latest status - as a consistent object which will not be mutated while using it""" 59 | pass 60 | 61 | @property 62 | @abstractmethod 63 | def elapsed_time(self) -> float: 64 | pass 65 | 66 | @property 67 | @abstractmethod 68 | def wait_time_remaining(self) -> Optional[float]: 69 | """ 70 | If a tracker object was initialized with a timeout value by its creator (the invoker_api, 71 | based on configuration), then time remaining till timeout is known and can be returned. 72 | """ 73 | pass 74 | 75 | @abstractmethod 76 | def wait(self, timeout: float = None) -> bool: 77 | """ 78 | Blocking wait for updates with the given timeout, in seconds - but always capped to max wait time if set. 79 | By default, timeout is None - meaning wait up till max wait time (or indefinitely, in case it wasn't set). 80 | Assuming wait time is set, this is a good choice since no busy loop or semi-busy loop is needed. 81 | """ 82 | pass 83 | 84 | def generator(self) -> Generator[AsyncJobStatus, None, None]: 85 | """ 86 | Returns updates as they come for easier consumption, if blocking behavior is ok. 87 | This generator does not rely on any private attributes. 88 | """ 89 | while True: 90 | update_available = self.wait() 91 | if not self.wait_time_remaining: 92 | raise JobTimeoutError() 93 | 94 | status_snapshot = self.status 95 | if status_snapshot.result: 96 | break 97 | 98 | if update_available: 99 | yield status_snapshot 100 | 101 | yield status_snapshot 102 | 103 | 104 | class AsyncJobStatusUpdater(AsyncJobTracker): 105 | """ 106 | Implementation of AsyncJobTracker, which is only created within invoker_api and updated by invoker/job code. 107 | 108 | The one curiousity here is the blocking wait() mechanism which is based on a Queue instance. 109 | How it works: the client's wait() call blocks on waiting for a queue item. If there's already one, 110 | it's immediately returned. Once consumed, the queue is empty again and a subsequent wait() will repeat 111 | the process. Typically, the queue should have either zero or a only single item - see _signal_update() below. 
112 | """ 113 | def __init__(self, max_wait: float = None): 114 | self._status: AsyncJobStatus = AsyncJobStatus(stage=AsyncJobStage.STARTING) 115 | self._update_queue = Queue() 116 | self._max_wait = max_wait 117 | self._start_time = time.time() 118 | 119 | @property 120 | def elapsed_time(self) -> float: 121 | return time.time() - self._start_time 122 | 123 | @property 124 | def wait_time_remaining(self) -> Optional[float]: 125 | assert self._max_wait 126 | remaining = self._max_wait - self.elapsed_time 127 | return remaining if remaining > 0 else 0 128 | 129 | @property 130 | def status(self) -> AsyncJobStatus: 131 | return self._status 132 | 133 | def _update_status(self, new_status: AsyncJobStatus) -> None: 134 | """Only signal an update if there was actually any change.""" 135 | modified = self._status != new_status 136 | self._status = new_status 137 | if modified: 138 | if logger.isEnabledFor(logging.DEBUG): 139 | logger.debug(f"Updated async status from\n:{self._status} to:\n{new_status}") 140 | self._signal_update() 141 | pass 142 | 143 | def update(self, stage: AsyncJobStage = None, message: str = None, task_counters: Dict[TaskStatus, int] = None): 144 | # Asserts are used here as the invoker/job classes are internal to the invoker_api, and are expected to conform 145 | # to this class' requirements. If not, it's probably a bug. 146 | assert stage != AsyncJobStage.DONE # To move to DONE stage, done() should be explicitly called 147 | assert self._status.stage != AsyncJobStage.DONE # No more updates after DONE was called once 148 | stage = stage or self._status.stage 149 | task_counters = task_counters or self._status.task_counters 150 | # Automatically cleanup message when moving in stages 151 | message = message or (self._status.message if (stage == self._status.stage) else None) 152 | 153 | self._update_status(AsyncJobStatus(stage=stage, message=message, 154 | task_counters=task_counters)) 155 | 156 | def done(self, result: BaseJobResult): 157 | self._update_status(AsyncJobStatus(stage=AsyncJobStage.DONE, result=result, 158 | task_counters=self._status.task_counters)) 159 | 160 | def _signal_update(self): 161 | if self._update_queue.empty(): 162 | # If the client *already* has an update waiting for it, no need to do anything - it will read the latest 163 | # state anyway when it gets to consume it (the queue item itself doesn't hold any information). 164 | # In case of more than single updater thread, there might momentarily be more than a single item. 165 | # However, this is not currently used in this way, and it seems that having multiple items would not 166 | # have any detrimental effect (i.e. break correctness) if it actually occurs in other edge cases. 167 | # TODO backlog to ensure a single item always, consider a lock here and re-test empty() within that lock. 
168 | self._update_queue.put(object()) 169 | 170 | def wait(self, timeout=None): 171 | assert timeout is None or timeout > 0 172 | try: 173 | should_block = True 174 | if self._max_wait: 175 | remaining = self.wait_time_remaining 176 | if remaining == 0: 177 | # No more blocking wait - immediately return what's in the queue (or None) 178 | should_block = False 179 | timeout = None 180 | elif timeout: 181 | timeout = min(timeout, remaining) 182 | else: 183 | timeout = remaining 184 | 185 | self._update_queue.get(block=should_block, timeout=timeout) 186 | return True 187 | except Empty: 188 | return False 189 | -------------------------------------------------------------------------------- /frocket/common/tasks/query.py: -------------------------------------------------------------------------------- 1 | """ 2 | Query job's task classes 3 | """ 4 | # Copyright 2021 The Funnel Rocket Maintainers 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from dataclasses import dataclass 19 | from enum import auto 20 | from typing import Optional, List, Dict, Union, cast 21 | import inflection 22 | from frocket.common.dataset import DatasetInfo, DatasetPartId 23 | from frocket.common.serializable import AutoNamedEnum, enveloped, SerializableDataClass, reducable 24 | from frocket.common.tasks.base import BaseTaskRequest, BaseTaskResult, BaseJobResult 25 | 26 | 27 | class PartSelectionMode(AutoNamedEnum): 28 | """Whether the invoker sets the task_index or the worker selects it from available tasks in the datastore.""" 29 | SET_BY_INVOKER = auto() 30 | SELECTED_BY_WORKER = auto() 31 | 32 | 33 | @enveloped 34 | @dataclass(frozen=True) 35 | class QueryTaskRequest(BaseTaskRequest): 36 | dataset: DatasetInfo 37 | # String columns to load as Pandas categoricals, as performance optimization. These columns are detected during 38 | # dataset registration. Not needed for columns already of categorical type in files saved by Pandas. 39 | load_as_categoricals: Optional[List[str]] 40 | mode: PartSelectionMode 41 | # If (and only if) mode=SET_BY_INVOKER, the invoker also sets the dataset part index to query 42 | # Note that task_index not necessarily equals part ID 43 | invoker_set_part: Optional[DatasetPartId] 44 | used_columns: List[str] # Which columns to actually load (as optimization), as analyzed by QueryValidator. 
45 | query: dict 46 | 47 | 48 | class AggregationType(AutoNamedEnum): 49 | # noinspection PyUnusedLocal 50 | def __init__(self, *args): 51 | if not hasattr(self.__class__, '_camels'): 52 | self.__class__._camels = {} 53 | 54 | self.camelized = inflection.camelize(self.name.lower(), uppercase_first_letter=False) 55 | self.__class__._camels[self.camelized] = self 56 | self.value_is_dict = self.name.endswith("_PER_VALUE") 57 | 58 | COUNT = auto() 59 | COUNT_PER_VALUE = auto() 60 | GROUPS_PER_VALUE = auto() 61 | SUM_PER_VALUE = auto() 62 | MEAN_PER_VALUE = auto() 63 | 64 | @classmethod 65 | def from_camelcase(cls, camelcase_name: str) -> AutoNamedEnum: 66 | return cls._camels[camelcase_name] 67 | 68 | 69 | AggrValue = Union[int, float] 70 | AggrValueMap = Dict[str, AggrValue] 71 | 72 | 73 | @reducable 74 | @dataclass(frozen=True) 75 | class AggregationResult(SerializableDataClass): 76 | column: str 77 | type: str 78 | # For some aggregation types ('count') the value is a single number. In others (the 'perValue' ones), value is 79 | # a dict of column value->aggregated number 80 | value: Optional[Union[AggrValue, AggrValueMap]] 81 | top: Optional[int] # Relevant for values of type dict 82 | name: Optional[str] # Only set if the user has set a custom name for this aggregation 83 | 84 | @classmethod 85 | def _reduce_fields(cls, serializables): 86 | """See: SerializableDataClass.""" 87 | all_values = [e.value for e in cast(List[AggregationResult], serializables)] 88 | # Reduce either a primitive values or a dicts of counters 89 | if isinstance(all_values[0], dict): 90 | reduced_value = cls.reduce_counter_dicts(all_values, top_count=cast(cls, serializables[0]).top) 91 | else: 92 | reduced_value = sum(all_values) 93 | return {'value': reduced_value} 94 | 95 | 96 | @reducable 97 | @dataclass(frozen=True) 98 | class QueryConditionsResult(SerializableDataClass): 99 | matching_groups: int # e.g. 
user ID 100 | matching_group_rows: int # All rows of the matching groups, whether that row matches a condition or not 101 | aggregations: Optional[List[AggregationResult]] 102 | 103 | @classmethod 104 | def _reduce_fields(cls, serializables): 105 | results = cast(List[cls], serializables) 106 | return {'matching_groups': sum([e.matching_groups for e in results]), 107 | 'matching_group_rows': sum([e.matching_group_rows for e in results]), 108 | 'aggregations': cls.reduce_lists([e.aggregations for e in results])} 109 | 110 | 111 | @reducable 112 | @dataclass(frozen=True) 113 | class FunnelResult(SerializableDataClass): 114 | sequence: List[QueryConditionsResult] 115 | end_aggregations: Optional[List[AggregationResult]] 116 | 117 | @classmethod 118 | def _reduce_fields(cls, serializables): 119 | funnel_results = cast(List[cls], serializables) 120 | return {'sequence': cls.reduce_lists([e.sequence for e in funnel_results]), 121 | 'end_aggregations': cls.reduce_lists([e.end_aggregations for e in funnel_results])} 122 | 123 | 124 | @reducable 125 | @dataclass(frozen=True) 126 | class QueryResult(SerializableDataClass): 127 | query: QueryConditionsResult 128 | funnel: Optional[FunnelResult] 129 | 130 | @classmethod 131 | def _reduce_fields(cls, serializables): 132 | query_results = cast(List[cls], serializables) 133 | return {'query': QueryConditionsResult.reduce([e.query for e in query_results]), 134 | 'funnel': FunnelResult.reduce([e.funnel for e in query_results])} 135 | 136 | 137 | @enveloped 138 | @dataclass(frozen=True) 139 | class QueryTaskResult(BaseTaskResult): 140 | query_result: Optional[QueryResult] # Not set if query failed (when success=False) 141 | 142 | 143 | @dataclass(frozen=True) 144 | class QueryJobResult(BaseJobResult): 145 | query: Optional[QueryConditionsResult] 146 | funnel: Optional[FunnelResult] 147 | -------------------------------------------------------------------------------- /frocket/common/tasks/registration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Task request/response classes for the registration job (discovering, validating and storing metadata for a dataset) 3 | """ 4 | # Copyright 2021 The Funnel Rocket Maintainers 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from dataclasses import dataclass 19 | from enum import auto 20 | from typing import Optional 21 | from frocket.common.dataset import DatasetInfo, DatasetPartId, DatasetSchema 22 | from frocket.common.serializable import SerializableDataClass, AutoNamedEnum, enveloped 23 | from frocket.common.tasks.base import BaseTaskRequest, BaseTaskResult, BlobId, BaseJobResult, BaseApiResult 24 | 25 | 26 | class DatasetValidationMode(AutoNamedEnum): 27 | SINGLE = auto() # Only validate a single file in the dataset (meaning no cross-file consistency checks are done!) 
28 | FIRST_LAST = auto() # Validate only first and last files (by lexicographic sorting) and cross-check them 29 | SAMPLE = auto() # Takes a sample of files, proportional to the no.o of files and up to a configured maximum. 30 | 31 | 32 | REGISTER_DEFAULT_FILENAME_PATTERN = '*.parquet' # Ignore files such as '_SUCCESS' and the like in discovery 33 | REGISTER_DEFAULT_VALIDATION_MODE = DatasetValidationMode.SAMPLE 34 | REGISTER_DEFAULT_VALIDATE_UNIQUES = True 35 | 36 | 37 | @dataclass(frozen=True) 38 | class RegisterArgs(SerializableDataClass): 39 | """Parameters collected by the CLI / API server for the registration job""" 40 | name: str 41 | basepath: str 42 | group_id_column: str 43 | timestamp_column: str 44 | pattern: str = REGISTER_DEFAULT_FILENAME_PATTERN 45 | validation_mode: DatasetValidationMode = REGISTER_DEFAULT_VALIDATION_MODE 46 | validate_uniques: bool = REGISTER_DEFAULT_VALIDATE_UNIQUES 47 | 48 | 49 | @enveloped 50 | @dataclass(frozen=True) 51 | class RegistrationTaskRequest(BaseTaskRequest): 52 | dataset: DatasetInfo 53 | part_id: DatasetPartId 54 | # If RegisterArgs.validate_uniques=true, task should return all group IDs in file 55 | return_group_ids: bool 56 | 57 | 58 | @enveloped 59 | @dataclass(frozen=True) 60 | class RegistrationTaskResult(BaseTaskResult): 61 | dataset_schema: Optional[DatasetSchema] # None on failures 62 | part_id: DatasetPartId 63 | # If RegistrationTaskRequest.return_group_ids=true, a reference to the blob with the group IDs 64 | group_ids_blob_id: Optional[BlobId] 65 | 66 | 67 | @dataclass(frozen=True) 68 | class RegistrationJobResult(BaseJobResult): 69 | dataset: DatasetInfo 70 | 71 | 72 | @dataclass(frozen=True) 73 | class UnregisterApiResult(BaseApiResult): 74 | dataset_found: bool 75 | dataset_last_used: Optional[float] 76 | -------------------------------------------------------------------------------- /frocket/common/validation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/common/validation/consts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Consts and types for the query validation package 3 | TODO backlog create a nice enum for all query keywords 4 | """ 5 | # Copyright 2021 The Funnel Rocket Maintainers 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | import json 20 | import os 21 | import re 22 | from pathlib import Path 23 | from typing import Dict, NamedTuple 24 | from frocket.common.dataset import DatasetColumnType 25 | from frocket.common.validation.path_visitor import PathVisitor 26 | 27 | # JSON Schema file 28 | QUERY_SCHEMA_LOCATION = Path(os.path.dirname(__file__)) / '../../resources/query_schema.json' 29 | QUERY_SCHEMA = json.load(open(QUERY_SCHEMA_LOCATION, 'r')) 30 | 31 | TARGET_TYPES_WITH_INCLUDE_ZERO = ['count'] 32 | TARGET_OPS_SUPPORTING_INCLUDE_ZERO = ['<', '<=', '==', '!=', '>='] 33 | TARGET_TYPES_WITH_OTHER_COLUMN = ['sum'] 34 | AGGR_TYPES_WITH_OTHER_COLUMN = ['sumPerValue', 'meanPerValue'] 35 | DEFAULT_TARGET = {'type': 'count', 'op': '>=', 'value': 1} 36 | DEFAULT_AGGREGATIONS = ['count', 'countPerValue', 'groupsPerValue'] 37 | AGGREGATIONS_PATHS = ['query.aggregations', 38 | 'funnel.stepAggregations', 39 | 'funnel.endAggregations'] 40 | SINGLE_FILTER_PATHS = ['query.conditions.filter', 41 | 'query.conditions.sequence.filter', 42 | 'funnel.sequence.filter'] 43 | FILTER_ARRAY_PATHS = ['query.conditions.filters', 44 | 'query.conditions.sequence.filters', 45 | 'funnel.sequence.filters'] 46 | 47 | VALID_IDENTIFIER_PATTERN = re.compile(r'[A-Z][A-Z_0-9]*$', re.IGNORECASE) 48 | UNIQUE_IDENTIFIER_SCOPES = ['query.conditions.name'] + \ 49 | [f"{path}.name" for path in AGGREGATIONS_PATHS] 50 | 51 | EQUALITY_OPERATORS = ['==', '!='] 52 | NUMERIC_OPERATORS = [*EQUALITY_OPERATORS, '>', '>=', '<', '<='] 53 | STRING_OPERATORS = [*EQUALITY_OPERATORS, 'contains', 'regex'] 54 | OPERATORS_BY_COLTYPE = { 55 | DatasetColumnType.INT: NUMERIC_OPERATORS, 56 | DatasetColumnType.FLOAT: NUMERIC_OPERATORS, 57 | DatasetColumnType.BOOL: EQUALITY_OPERATORS, 58 | DatasetColumnType.STRING: STRING_OPERATORS 59 | } 60 | VALUE_TYPES_BY_COLTYPE = { 61 | DatasetColumnType.INT: [int], 62 | DatasetColumnType.FLOAT: [int, float], 63 | DatasetColumnType.BOOL: [bool], 64 | DatasetColumnType.STRING: [str] 65 | } 66 | NUMERIC_COLTYPES = [DatasetColumnType.INT, DatasetColumnType.FLOAT] 67 | 68 | RELATION_OPS = ['and', 'or', '||', '&&'] 69 | DEFAULT_RELATION_OP = 'and' 70 | CONDITION_COLUMN_PREFIX = "__cond_" 71 | 72 | 73 | class QueryConditionsMap(NamedTuple): 74 | count: int 75 | names: Dict[str, int] 76 | 77 | 78 | def map_condition_names(query: dict) -> QueryConditionsMap: 79 | """Map named conditions (which is optional) to the condition ID (index in conditions list).""" 80 | conditions = PathVisitor(query, 'query.conditions').list() 81 | names = {cond['name'].strip().lower(): i 82 | for i, cond in enumerate(conditions) if 'name' in cond} 83 | return QueryConditionsMap(count=len(conditions), names=names) 84 | -------------------------------------------------------------------------------- /frocket/common/validation/error.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from enum import auto 16 | from frocket.common.serializable import AutoNamedEnum 17 | 18 | 19 | class ValidationErrorKind(AutoNamedEnum): 20 | """Distinguish between types of validation issues in query""" 21 | INVALID_ARGUMENTS = auto() # Validator given wrong arguments 22 | SCHEMA = auto() # Failure at JSON Schema level 23 | TYPE_MISMATCH = auto() # Operator or value type don't match each other, or the context 24 | DATASET_MISMATCH = auto() # Column names, types, etc. do not match the schema of the given dataset 25 | RELATION = auto() # query.relation expression found invalid by relation_parser.py 26 | # Note for unexpected errors: unlike other kinds, the message associated with this kind may leak sensitive data 27 | # if it was returned to the caller - so it is not returned by the API server in PUBLIC mode. 28 | UNEXPECTED = auto() 29 | 30 | 31 | class QueryValidationError(Exception): 32 | def __init__(self, message: str, kind: ValidationErrorKind = None): 33 | self.message = message 34 | self.kind = kind or ValidationErrorKind.UNEXPECTED # Default, but should be rare. 35 | 36 | @staticmethod 37 | def wrap(e: Exception, kind: ValidationErrorKind = None): 38 | return QueryValidationError(str(e), kind) 39 | 40 | def __str__(self): 41 | return f"ValidationError({self.kind.value}: {self.message})" 42 | -------------------------------------------------------------------------------- /frocket/common/validation/path_visitor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Callable, Any, Optional 16 | 17 | PathVisitorCallback = Callable[[Any], Optional[Any]] 18 | 19 | 20 | class PathVisitor: 21 | """ 22 | A helper class for safely fetching nested attributes in a dictionary. 23 | It is used extensively by the QueryValidator to extract and transform nested attributes. 24 | 25 | The class is instantiated with a root dict and a dot-delimited string path (e.g. 'attr.sub_attr.sub_sub'). 26 | Then, visit() can be called once (or more) to run code over the matching value/s, if any. If the key is not found, 27 | no error is thrown. list() is a convenience method which visits the elements and returns them as a list, 28 | returning an empty list on no matches. 29 | 30 | By default, if the leaf key is a list, the visitor function is called for each element. 31 | However, if the list itself is what you need, pass list_to_items=False on init. 32 | 33 | Modifying attributes *below* the visited value is safe (be it a dict, a list, an object), however sometimes you 34 | may want to replace the whole value itself being itereated. For example, the QueryValidator replaces shorthand- 35 | notation objects, which are lists, into full-notation dicts. 
36 | To support that, init the object with modifiable=true and return the replacement value from the visitor function, 37 | or None to keep the value. 38 | 39 | For usage examples, see test_path_visitor.py. 40 | """ 41 | _KEY_NOT_FOUND = object() 42 | 43 | def __init__(self, root: dict, path: str, modifiable: bool = False, list_to_items: bool = True): 44 | assert (isinstance(root, dict)) 45 | self._root = root 46 | self._paths = path.strip().split(".") 47 | self._modifiable = modifiable 48 | self._list_to_items = list_to_items 49 | 50 | def visit(self, func: PathVisitorCallback): 51 | if len(self._paths) > 0: 52 | self._visit_dict(self._root, 0, func) 53 | 54 | def list(self) -> list: 55 | result = [] 56 | self.visit(lambda v: result.append(v)) 57 | return result 58 | 59 | def _visit_dict(self, d: dict, depth: int, func: PathVisitorCallback): 60 | v = d.get(self._paths[depth], self._KEY_NOT_FOUND) # Differentiate a None value from an inexisting key 61 | if v == self._KEY_NOT_FOUND: 62 | return # Bumped into a wall 63 | 64 | if isinstance(v, list) and self._list_to_items: 65 | self._visit_list(v, depth + 1, func) 66 | return 67 | 68 | if depth == len(self._paths) - 1: 69 | replacement = func(v) # Includes None 70 | if self._modifiable and replacement: 71 | d[self._paths[depth]] = replacement 72 | else: 73 | if not v: 74 | return 75 | elif isinstance(v, dict): 76 | self._visit_dict(v, depth + 1, func) 77 | elif isinstance(v, list): 78 | self._visit_list(v, depth + 1, func) 79 | else: 80 | return # Can't go further 81 | 82 | def _visit_list(self, lst: list, depth: int, func: PathVisitorCallback): 83 | if depth == len(self._paths): 84 | assert self._list_to_items 85 | for i, elem in enumerate(lst): 86 | replacement = func(elem) 87 | if self._modifiable and replacement: 88 | lst[i] = replacement 89 | else: 90 | for i, elem in enumerate(lst): 91 | # Note: depth is not incremented in this case, since elements are at the same 'path depth' as the list 92 | if isinstance(elem, dict): 93 | self._visit_dict(elem, depth, func) 94 | elif isinstance(elem, list): 95 | self._visit_list(elem, depth, func) 96 | -------------------------------------------------------------------------------- /frocket/common/validation/relation_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | While the query schema is generally JSON-based (good for machines) rather then textual (like SQL, 3 | supposedly human-friendly or at least more concise), there's one exception: an optional 'relation' expression allowing 4 | to specify arbitrarily complex and/or relations between conditions, rather than just and/or over all. 5 | 6 | The RelationParser class validates and breaks down the expression to a list of elements. However, it does not transform 7 | them back into a Pandas query or similar - that is the query engine's responsibility and may change independently. 8 | 9 | Note that conditions may be represented either by index ($0, $3, etc.) or by name - for named conditions. 10 | """ 11 | # Copyright 2021 The Funnel Rocket Maintainers 12 | # 13 | # Licensed under the Apache License, Version 2.0 (the "License"); 14 | # you may not use this file except in compliance with the License. 
15 | # You may obtain a copy of the License at 16 | # 17 | # http://www.apache.org/licenses/LICENSE-2.0 18 | # 19 | # Unless required by applicable law or agreed to in writing, software 20 | # distributed under the License is distributed on an "AS IS" BASIS, 21 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 | # See the License for the specific language governing permissions and 23 | # limitations under the License. 24 | 25 | import logging 26 | from typing import Type, List, Optional 27 | from parsimonious.grammar import Grammar, NodeVisitor 28 | from parsimonious.nodes import Node 29 | from dataclasses import dataclass 30 | from parsimonious.exceptions import ParseError, VisitationError 31 | from abc import ABCMeta 32 | from frocket.common.validation.consts import RELATION_OPS, map_condition_names, CONDITION_COLUMN_PREFIX 33 | from frocket.common.validation.path_visitor import PathVisitor 34 | from frocket.common.validation.error import ValidationErrorKind, QueryValidationError 35 | from frocket.common.tasks.base import ErrorMessage 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | # TODO backlog fix the grammar to require whitespace between conditions and wordy-operators (and,or), 40 | # but not around symbol ops (&&, ||) 41 | # TODO backlog fix "DeprecationWarning: invalid escape sequence \$" 42 | RELATION_EXPRESSION_GRAMMAR = Grammar( 43 | """ 44 | expression = (identifier / (open_paren ws? expression ws? close_paren)) (ws? op ws? expression)* 45 | identifier = condition_name / condition_id 46 | condition_name = ~r"\$[A-Z][A-Z_0-9]*"i 47 | condition_id = ~r"\$[0-9]+" 48 | op = "and" / "or" / "&&" / "||" 49 | ws = ~r"\s*" 50 | open_paren = "(" 51 | close_paren = ")" 52 | """) 53 | 54 | 55 | @dataclass(frozen=True) 56 | class RelationParserContext: 57 | condition_count: int 58 | named_conditions: dict 59 | column_prefix: str 60 | 61 | 62 | @dataclass 63 | class RBaseElement(metaclass=ABCMeta): 64 | text: str 65 | ctx: RelationParserContext 66 | condition_id: Optional[int] = None 67 | 68 | def validate(self) -> Optional[ErrorMessage]: 69 | pass 70 | 71 | def __str__(self): 72 | return f"{self.__class__.__name__}('{self.text}')" 73 | 74 | 75 | @dataclass 76 | class RTextElement(RBaseElement): 77 | pass 78 | 79 | 80 | @dataclass 81 | class RConditionBaseElement(RBaseElement): 82 | pass 83 | 84 | 85 | @dataclass 86 | class RConditionId(RConditionBaseElement): 87 | def validate(self): 88 | cid = int(self.text[1:]) 89 | if cid >= self.ctx.condition_count: 90 | return f"Condition no. {cid} does not exist" 91 | self.condition_id = cid 92 | 93 | 94 | @dataclass 95 | class RConditionName(RConditionBaseElement): 96 | def validate(self): 97 | cname = self.text[1:] 98 | cid = self.ctx.named_conditions.get(cname, None) 99 | if cid is not None: # Can be zero 100 | self.condition_id = cid 101 | else: 102 | return f"Condition named {self.text[1:]} does not exist" 103 | 104 | 105 | @dataclass 106 | class ROperator(RBaseElement): 107 | def validate(self): 108 | if self.text not in RELATION_OPS: 109 | return f"Operator {self.text} not in {RELATION_OPS}" 110 | 111 | 112 | # noinspection PyMethodMayBeStatic,PyUnusedLocal 113 | @dataclass 114 | class RelationExpressionVisitor(NodeVisitor): 115 | """ 116 | Used by the RelationParser to build the element list. 117 | Note that while the grammar is hierarchical, the resulting list isn't (no need, currently). 
118 | """ 119 | ctx: RelationParserContext 120 | 121 | def _build_element(self, node: Node, cls: Type[RBaseElement]): 122 | # noinspection PyArgumentList 123 | return cls(node.text, self.ctx) 124 | 125 | def visit_ws(self, node: Node, visited_children): 126 | return None # Ignore whitespaces 127 | 128 | def visit_op(self, node: Node, visited_children): 129 | return self._build_element(node, ROperator) 130 | 131 | def visit_open_paren(self, node: Node, visited_children): 132 | return self._build_element(node, RTextElement) 133 | 134 | def visit_close_paren(self, node: Node, visited_children): 135 | return self._build_element(node, RTextElement) 136 | 137 | def visit_identifier(self, node: Node, visited_children): 138 | """Return the actual condition name / ID element (see grammar: identifier wraps conditions).""" 139 | return visited_children[0] 140 | 141 | def visit_condition_name(self, node: Node, visited_children): 142 | return self._build_element(node, RConditionName) 143 | 144 | def visit_condition_id(self, node: Node, visited_children): 145 | return self._build_element(node, RConditionId) 146 | 147 | def generic_visit(self, node: Node, visited_children): 148 | """Ignore current node, but return children (if any) as a flat list.""" 149 | flat_result = [] 150 | for child in visited_children: 151 | if type(child) is list: 152 | flat_result += child # Unpack child array 153 | elif child: 154 | flat_result.append(child) 155 | return flat_result if len(flat_result) > 0 else None 156 | 157 | 158 | class RelationParser: 159 | def __init__(self, query: dict): 160 | self._query = query 161 | self._condition_mapping = map_condition_names(query) 162 | self._used_conditions = None 163 | 164 | found_relations = PathVisitor(self._query, 'query.relation').list() 165 | assert len(found_relations) in [0, 1] 166 | self._relation = found_relations[0].strip().lower() if found_relations else None 167 | 168 | def parse(self) -> List[RBaseElement]: 169 | if not self._relation: 170 | return [] 171 | 172 | ctx = RelationParserContext(condition_count=self._condition_mapping.count, 173 | named_conditions=self._condition_mapping.names, 174 | column_prefix=CONDITION_COLUMN_PREFIX) 175 | try: 176 | tree = RELATION_EXPRESSION_GRAMMAR.parse(self._relation) 177 | except ParseError as pe: 178 | # Adopted from within the ParseError class, but without the sometimes-confusing issue 179 | excerpt = pe.text[pe.pos:pe.pos + 20] if (pe.text and pe.pos is not None) else None 180 | if excerpt: 181 | message = f"Query relation is invalid around '{excerpt}' " 182 | else: 183 | message = f"Query relation '{self._relation}' is invalid" 184 | raise QueryValidationError(message, kind=ValidationErrorKind.RELATION) 185 | 186 | try: 187 | elements = RelationExpressionVisitor(ctx).visit(tree) 188 | except VisitationError as ve: 189 | logger.exception('Unexpected error while visiting parse tree') 190 | raise QueryValidationError(message=str(ve), kind=ValidationErrorKind.UNEXPECTED) 191 | 192 | for e in elements: 193 | error_message = e.validate() 194 | if error_message: 195 | raise QueryValidationError(message=error_message, kind=ValidationErrorKind.RELATION) 196 | 197 | self._used_conditions = [e.condition_id for e in elements if e.condition_id is not None] 198 | return elements 199 | 200 | @property 201 | def used_conditions(self) -> List[str]: 202 | return self._used_conditions 203 | -------------------------------------------------------------------------------- /frocket/common/validation/result.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | from typing import Optional, List, cast, Dict 17 | from frocket.common.serializable import SerializableDataClass 18 | from frocket.common.validation.error import ValidationErrorKind, QueryValidationError 19 | from frocket.common.validation.relation_parser import RBaseElement 20 | 21 | 22 | @dataclass(frozen=True) 23 | class QueryValidationResult(SerializableDataClass): 24 | success: bool 25 | source_query: dict 26 | error_message: Optional[str] = None 27 | error_kind: Optional[ValidationErrorKind] = None 28 | expanded_query: Optional[dict] = None 29 | # TODO backlog support non-critical warning/hints to user (e.g. conditions unused by relation expression) 30 | warnings: Optional[List[str]] = None 31 | used_columns: Optional[List[str]] = None 32 | used_conditions: Optional[List[str]] = None 33 | named_conditions: Optional[Dict[str, int]] = None 34 | relation_elements: Optional[List[RBaseElement]] = None 35 | 36 | @staticmethod 37 | def from_exception(e: Exception, source_query: dict): 38 | if type(e) is QueryValidationError: 39 | error_kind = cast(QueryValidationError, e).kind 40 | else: 41 | error_kind = ValidationErrorKind.UNEXPECTED 42 | return QueryValidationResult(success=False, error_message=str(e), error_kind=error_kind, 43 | source_query=source_query) 44 | -------------------------------------------------------------------------------- /frocket/common/validation/visitor_functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | A collection of callback functions which the QueryValidator uses to extarct, validate and transform query elements, 3 | with the kind help of PathVisitor class. 4 | 5 | Functions which return a value are used to replace the given object with a different one, 6 | which is handled by PathVisitor in its 'modifiable' mode. 7 | 8 | Since callbacks are regular functions (not methods), and there's a bunch of them, they're in a separate file from 9 | the QueryValidator class. 10 | 11 | asserts are used where processing elements which should be already validated (so failures should be bugs). 12 | """ 13 | # Copyright 2021 The Funnel Rocket Maintainers 14 | # 15 | # Licensed under the Apache License, Version 2.0 (the "License"); 16 | # you may not use this file except in compliance with the License. 17 | # You may obtain a copy of the License at 18 | # 19 | # http://www.apache.org/licenses/LICENSE-2.0 20 | # 21 | # Unless required by applicable law or agreed to in writing, software 22 | # distributed under the License is distributed on an "AS IS" BASIS, 23 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 24 | # See the License for the specific language governing permissions and 25 | # limitations under the License. 
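# Editor's note: a small illustration (made-up column names and values) of the
# shorthand-to-verbose conversions performed by _to_verbose_filter() and
# _to_verbose_target() below:
#
#   filter  ["price", ">=", 100]          ->  {"column": "price", "op": ">=", "value": 100}
#   target  ["count", ">=", 2]            ->  {"type": "count", "op": ">=", "value": 2}
#   target  ["sum", "price", ">", 500]    ->  {"type": "sum", "column": "price", "op": ">", "value": 500}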
26 | 27 | from typing import Optional 28 | from frocket.common.validation.consts import DEFAULT_TARGET, AGGR_TYPES_WITH_OTHER_COLUMN, \ 29 | DEFAULT_AGGREGATIONS, TARGET_TYPES_WITH_INCLUDE_ZERO, TARGET_OPS_SUPPORTING_INCLUDE_ZERO 30 | from frocket.common.validation.error import ValidationErrorKind, QueryValidationError 31 | 32 | 33 | def _to_verbose_filter(fltr) -> Optional[dict]: 34 | """If a condition filter is in short-hand notation (list), convert to verbose notation.""" 35 | assert type(fltr) in [list, dict] 36 | if type(fltr) is list: 37 | assert len(fltr) == 3 38 | return {'column': fltr[0], 'op': fltr[1], 'value': fltr[2]} 39 | 40 | 41 | def _to_verbose_target(target) -> Optional[dict]: 42 | """If a condition target is in short-hand notation (list), convert to verbose notation.""" 43 | assert type(target) in [list, dict] 44 | if type(target) is list: 45 | assert len(target) in [3, 4] 46 | if len(target) == 3: 47 | return {'type': target[0], 'op': target[1], 'value': target[2]} 48 | elif len(target) == 4: 49 | return {'type': target[0], 'column': target[1], 'op': target[2], 'value': target[3]} 50 | 51 | 52 | def _add_default_target(cond: dict) -> None: 53 | assert type(cond) is dict 54 | # (Modification is done on a key under the given object, so no need to return a modified dict) 55 | if ('filter' in cond or 'filters' in cond) and 'target' not in cond: # Don't touch sequence conditions 56 | cond['target'] = DEFAULT_TARGET 57 | 58 | 59 | def _validate_aggregation(aggr: dict) -> None: 60 | assert type(aggr) is dict 61 | aggr_type = aggr.get('type', None) 62 | other_column_required = aggr_type in AGGR_TYPES_WITH_OTHER_COLUMN 63 | other_column_found = 'otherColumn' in aggr 64 | 65 | if other_column_required != other_column_found: 66 | message = f"For aggregation {aggr} with type '{aggr_type}', other column name is " 67 | if other_column_required: 68 | message += 'required but was not found' 69 | else: 70 | message += 'not relevant but was given' 71 | raise QueryValidationError(message, kind=ValidationErrorKind.SCHEMA) 72 | 73 | 74 | def _expand_aggregations(col_aggregations: list) -> Optional[list]: 75 | assert type(col_aggregations) is list 76 | result = [] 77 | for aggr in col_aggregations: 78 | if aggr.get('type', None): 79 | result.append(aggr) 80 | else: 81 | if 'name' in aggr: 82 | message = f"Aggregation {aggr} expands into multiple default aggregations, " \ 83 | f"and thus a name attributeis not supported" 84 | raise QueryValidationError(message, kind=ValidationErrorKind.SCHEMA) 85 | for added_type in DEFAULT_AGGREGATIONS: 86 | result.append({**aggr, 'type': added_type}) 87 | 88 | return result 89 | 90 | 91 | def _validate_or_set_include_zero(cond: dict) -> None: 92 | """ 93 | 'includeZero' attribute of conditions may be tricky to get right. 94 | This function validates that its usage makes sense, and sets the correct default where it's ommitted. 
95 | """ 96 | assert type(cond) is dict 97 | if not ('filter' in cond or 'filters' in cond): 98 | return # Skip sequence condition (and possibly other future types without a target) 99 | 100 | # This should run after _to_verbose_target() and _add_default_target() have already ran, ensuring target exists 101 | target_type = cond['target']['type'] 102 | target_op = cond['target']['op'] 103 | target_value = cond['target']['value'] 104 | include_zero_value = cond.get('includeZero', None) 105 | target_as_string = f"{target_type} {target_op} {target_value}" 106 | 107 | if target_type not in TARGET_TYPES_WITH_INCLUDE_ZERO: 108 | if include_zero_value: # Exists and set to True 109 | raise QueryValidationError( 110 | message=f"'includeZero' is not applicable for target type '{target_type}'. In condition: {cond}", 111 | kind=ValidationErrorKind.TYPE_MISMATCH) 112 | else: 113 | assert type(target_value) is int 114 | assert target_value >= 0 115 | 116 | if include_zero_value: # Exists and set to True 117 | # Operator never relevant for includeZero=True 118 | if target_op not in TARGET_OPS_SUPPORTING_INCLUDE_ZERO: 119 | raise QueryValidationError( 120 | message=f"For target operator '{target_op}', 'includeZero' cannot be true. In condition: {cond}", 121 | kind=ValidationErrorKind.TYPE_MISMATCH) 122 | 123 | # Additional check when an operator is *potentially* relevant for includeZero=True 124 | if target_op == '<' and target_value == 0: 125 | raise QueryValidationError( 126 | message=f"Target implies a negative value. In condition: {cond}", 127 | kind=ValidationErrorKind.TYPE_MISMATCH) 128 | 129 | if (target_op == '!=' and target_value == 0) or \ 130 | (target_op in ['==', '>='] and target_value != 0): 131 | message = f"Target {target_as_string} explicitly precludes zero, and thus 'includeZero' " \ 132 | f"cannot be true. In condition: {cond}" 133 | raise QueryValidationError(message, kind=ValidationErrorKind.TYPE_MISMATCH) 134 | else: 135 | if target_op == '==' and target_value == 0: 136 | if include_zero_value is None: 137 | # Explicitly set includeZero when target is count == 0 138 | # Note: modifying a key under the given object, so no need to return a modified dict 139 | cond['includeZero'] = True 140 | elif not include_zero_value: 141 | message = f"When using a target of {target_as_string}, 'includeZero' cannot be false. " \ 142 | f"Condition: {cond}" 143 | raise QueryValidationError(message, kind=ValidationErrorKind.TYPE_MISMATCH) 144 | -------------------------------------------------------------------------------- /frocket/datastore/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | -------------------------------------------------------------------------------- /frocket/datastore/blobstore.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import abstractmethod 16 | from typing import Optional 17 | from frocket.common.config import config 18 | from frocket.common.tasks.base import BlobId 19 | 20 | BLOB_DEFAULT_TTL = config.int('blobstore.default.ttl') 21 | BLOB_MAX_TTL = config.int('blobstore.max.ttl') 22 | 23 | 24 | class Blobstore: 25 | """Simple interface for storing and fetching arbitrary binary data, for ephemeral transport over the network. 26 | The data is assumed to always have a default TTL - it's not a permanent or big data store.""" 27 | @abstractmethod 28 | def write_blob(self, data: bytes, ttl: int = None, tag: str = None) -> BlobId: 29 | pass 30 | 31 | @abstractmethod 32 | def read_blob(self, blobid: BlobId) -> Optional[bytes]: 33 | pass 34 | 35 | @abstractmethod 36 | def delete_blob(self, blobid: BlobId) -> bool: 37 | pass 38 | -------------------------------------------------------------------------------- /frocket/datastore/datastore.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
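# Editor's note: a rough sketch of an invoker-side flow over the Datastore interface
# defined below (illustration only; `store` would typically come from
# registered_datastores.get_datastore(), and `reqid`/`requests` are assumed inputs):
#
#   store.write_dataset_info(dataset, parts, schema)   # on dataset registration
#   store.enqueue(requests)                            # work_queue invoker: push task requests
#   statuses = store.tasks_status(reqid)               # poll per-attempt status updates
#   results = store.task_results(reqid)                # collect BaseTaskResult objects when done
#   store.cleanup_request_data(reqid)                  # drop the job's transient keys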
14 | 15 | from abc import abstractmethod, ABCMeta 16 | from dataclasses import dataclass 17 | from typing import List, Dict, Set, Optional, Union 18 | from frocket.common.tasks.base import TaskStatus, BaseTaskResult, TaskAttemptId, TaskStatusUpdate, BaseTaskRequest 19 | from frocket.common.dataset import DatasetInfo, DatasetPartsInfo, DatasetPartId, DatasetShortSchema, DatasetSchema 20 | from frocket.common.serializable import SerializableDataClass 21 | 22 | DEFAULT_QUEUE = 'default' 23 | DEFAULT_DEQUEUE_WAIT_TIME = 60 24 | 25 | 26 | # Used in PartSelectionMode.SELECTED_BY_WORKER 27 | @dataclass(frozen=True) 28 | class WorkerSelectedPart(SerializableDataClass): 29 | part_id: DatasetPartId 30 | random: bool 31 | task_attempt_no: int 32 | 33 | 34 | class Datastore(metaclass=ABCMeta): 35 | """ 36 | Interface to the data store, which holds: 37 | 38 | * The list, metadata and schema of all registered datasets 39 | * For running jobs: 40 | * Task statuses and results 41 | * Atomic attempt counter for retried tasks 42 | * For jobs running in mode PartSelectionMode.SELECTED_BY_WORKER, the manifest of available tasks to select from. 43 | * When the system is configured to use the 'work_queue' invoker (rather than 'aws_lambda'), the datastore also 44 | provides the queue through which tasks are enqueued by the invoker and picked up by the workers, like a very 45 | simplistic queue management system. 46 | 47 | The datastore is not for storing the actual dataset or other persistent large data. 48 | """ 49 | @abstractmethod 50 | def write_dataset_info(self, dataset: DatasetInfo, parts: DatasetPartsInfo, schema: DatasetSchema) -> None: 51 | pass 52 | 53 | @abstractmethod 54 | def remove_dataset_info(self, name: str) -> bool: 55 | pass 56 | 57 | @abstractmethod 58 | def dataset_info(self, name: str) -> DatasetInfo: 59 | pass 60 | 61 | @abstractmethod 62 | def dataset_parts_info(self, ds: DatasetInfo) -> DatasetPartsInfo: 63 | pass 64 | 65 | @abstractmethod 66 | def schema(self, ds: DatasetInfo) -> DatasetSchema: 67 | pass 68 | 69 | @abstractmethod 70 | def short_schema(self, ds: DatasetInfo) -> DatasetShortSchema: 71 | pass 72 | 73 | @abstractmethod 74 | def last_used(self, ds: DatasetInfo) -> int: 75 | pass 76 | 77 | @abstractmethod 78 | def mark_used(self, ds: DatasetInfo): 79 | pass 80 | 81 | @abstractmethod 82 | def datasets(self) -> List[DatasetInfo]: 83 | pass 84 | 85 | @abstractmethod 86 | def enqueue(self, requests: List[BaseTaskRequest], queue: str = DEFAULT_QUEUE) -> None: 87 | pass 88 | 89 | @abstractmethod 90 | def dequeue(self, queue: str = DEFAULT_QUEUE, timeout: int = DEFAULT_DEQUEUE_WAIT_TIME) -> BaseTaskRequest: 91 | pass 92 | 93 | @abstractmethod 94 | def update_task_status(self, reqid: str, 95 | tasks: Union[TaskAttemptId, List[TaskAttemptId]], status: TaskStatus) -> None: 96 | pass 97 | 98 | @abstractmethod 99 | def tasks_status(self, reqid: str) -> Dict[TaskAttemptId, TaskStatusUpdate]: 100 | pass 101 | 102 | @abstractmethod 103 | def write_task_result(self, reqid: str, taskid: TaskAttemptId, result: BaseTaskResult) -> None: 104 | pass 105 | 106 | @abstractmethod 107 | def task_results(self, reqid: str) -> Dict[TaskAttemptId, BaseTaskResult]: 108 | pass 109 | 110 | @abstractmethod 111 | def increment_attempt(self, reqid: str, part_idx: int) -> int: 112 | pass 113 | 114 | @abstractmethod 115 | def publish_for_worker_selection(self, reqid: str, attempt_round: int, parts: Set[DatasetPartId]) -> None: 116 | pass 117 | 118 | @abstractmethod 119 | def self_select_part(self, reqid: str, 
attempt_round: int, 120 | candidates: Set[DatasetPartId] = None) -> Optional[WorkerSelectedPart]: 121 | pass 122 | 123 | @abstractmethod 124 | def cleanup_request_data(self, reqid: str) -> None: 125 | pass 126 | -------------------------------------------------------------------------------- /frocket/datastore/registered_datastores.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | from frocket.common.config import config 17 | from frocket.common.helpers.utils import memoize 18 | from frocket.datastore.datastore import Datastore 19 | from frocket.datastore.blobstore import Blobstore 20 | from frocket.datastore.redis_store import RedisStore 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | DATASTORE_CLASSES = { 25 | "redis": RedisStore, 26 | } 27 | 28 | BLOBSTORE_CLASSES = { 29 | "redis": RedisStore, 30 | } 31 | 32 | 33 | # TODO backlog consider thread-safety here: while RedisStore is thread-safe and having more than one is ok, future 34 | # implementations may not be? (or should be required to) 35 | def _get_store(store_kind: str, store_mapping: dict): 36 | store_class = store_mapping[config.get(store_kind).lower()] 37 | store = store_class(role=store_kind) 38 | logger.info(f"Initialized {store}") 39 | return store 40 | 41 | 42 | @memoize 43 | def get_datastore() -> Datastore: 44 | return _get_store("datastore", DATASTORE_CLASSES) 45 | 46 | 47 | @memoize 48 | def get_blobstore() -> Blobstore: 49 | return _get_store("blobstore", BLOBSTORE_CLASSES) 50 | -------------------------------------------------------------------------------- /frocket/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/engine/relation_to_pandas.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Dict, Type, Callable, List, cast 16 | from frocket.common.validation.relation_parser import RBaseElement, RTextElement, RConditionBaseElement, ROperator 17 | 18 | 19 | def relation_to_pandas_query(elements: List[RBaseElement], column_prefix: str) -> str: 20 | """Convert the generic parsed representation of a query.relation expression (as returned by QueryValidator or its 21 | helper class RelationParser) into a Pandas query string.""" 22 | 23 | # Mapping of generic element type to a lambda function constructing the Pandas equivalent. Note below that not 24 | # every concrete element type needs an entry here, as the code also looks up its superclasses 25 | etype_to_handler: Dict[Type[RBaseElement], Callable[[RBaseElement], str]] = { 26 | RTextElement: lambda v: v.text, 27 | RConditionBaseElement: lambda v: f"{column_prefix}{v.condition_id}", 28 | ROperator: lambda v: " & " if v.text in ["and", "&&"] else " | " 29 | } 30 | 31 | transformed = [] 32 | for e in elements: 33 | func = None 34 | # Either there's a handler above for this element type, or go up the superclass chain to find one. 35 | class_and_supers = cast(List[Type[RBaseElement]], type(e).mro()) 36 | for cls in class_and_supers: 37 | func = etype_to_handler.get(cls, None) 38 | if func: 39 | break 40 | if not func: 41 | raise Exception(f"{e} has no handler for any of its superclasses: {class_and_supers}") 42 | transformed.append(func(e)) 43 | return "".join(transformed) 44 | -------------------------------------------------------------------------------- /frocket/invoker/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/invoker/impl/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/invoker/impl/aws_lambda_invoker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Invoke tasks by invoking an AWS Lambda function asynchronously. 3 | 4 | This is a great feature of Lambda, which implicitly manages a queue of invocations for you with configurable retention 5 | (probably based on SQS). As long as the concurrent invocations limit of your account/burst limit of the AWS region are 6 | not reached, AWS will launch Lambdas for queued invocations immediately, with no meaningful delay. This also prevents 7 | getting rate-limited on momentary invocation spikes. 8 | 9 | A few important notes: 10 | 11 | 1. As noted in the setup guide, the retry count for the Lambda function *should be set to zero*, as it's the invoker's 12 | job to launch retries with slightly different arguments, based on its own configuration, with logic that is agnostic 13 | to whether the actual invoker is using Lambdas or anything else (which may not have Lambda's optional retry feature). 14 | 15 | 2. Unfortunately, there's no API for batch Lambda invocation, so we're invoking one by one with multiple threads - 16 | and still the time to invoke all tasks can add up to 1-2 seconds or more. 17 | TODO backlog optimize! This also hurts caching, as not all tasks get their fair chance to pick a locally cached part. 18 | 19 | 3. The InvokeAsync() Lambda API is considered deprecated and replaced by the 'InvocationType' parameter in Invoke(). 20 | However, the InvokeAsync API currently seems to take about half the time to return! Which one to use is configurable. 21 | 22 | TODO backlog stress-test queue limits till reaching rate limiting (status 429). 23 | TODO backlog for each invocation, add its actual invoke time as parameter 24 | (now we only measure time since invocation of all tasks started) 25 | """ 26 | # Copyright 2021 The Funnel Rocket Maintainers 27 | # 28 | # Licensed under the Apache License, Version 2.0 (the "License"); 29 | # you may not use this file except in compliance with the License. 30 | # You may obtain a copy of the License at 31 | # 32 | # http://www.apache.org/licenses/LICENSE-2.0 33 | # 34 | # Unless required by applicable law or agreed to in writing, software 35 | # distributed under the License is distributed on an "AS IS" BASIS, 36 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 37 | # See the License for the specific language governing permissions and 38 | # limitations under the License.
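# --- Illustrative sketch (added for clarity, not part of the original module) ---
# Note 1 in the docstring above says the Lambda function's own async-invocation retries should be
# set to zero, since the invoker performs its own retries. A minimal, hedged example of doing that
# with boto3; the function name is a placeholder and this helper does not exist in this repo:
def _example_disable_lambda_retries(function_name: str = "my-frocket-worker") -> None:
    import boto3  # local import so the sketch stays self-contained
    client = boto3.client("lambda")
    client.put_function_event_invoke_config(
        FunctionName=function_name,
        MaximumRetryAttempts=0,          # let the invoker's own retry logic take over
        MaximumEventAgeInSeconds=60)     # drop stale queued invocations quickly
# --- End of sketch ---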
39 | 40 | import logging 41 | import time 42 | import concurrent.futures 43 | from typing import cast 44 | import boto3 45 | from botocore.client import BaseClient 46 | from botocore.config import Config 47 | from frocket.common.serializable import Envelope 48 | from frocket.common.tasks.base import BaseTaskRequest, BaseApiResult 49 | from frocket.invoker.impl.async_invoker import AsyncInvoker 50 | from frocket.common.config import config 51 | 52 | logger = logging.getLogger(__name__) 53 | 54 | DEBUG_PRINT_PAYLOADS = config.bool("invoker.lambda.debug.payload") 55 | LAMBDA_ASYNC_OK_STATUS = 202 56 | 57 | 58 | def _worker_task(req: BaseTaskRequest, client: BaseClient, lambda_name: str) -> BaseApiResult: 59 | """Run by the thread pool below.""" 60 | # noinspection PyBroadException 61 | try: 62 | result = None 63 | json_payload = Envelope.seal_to_json(req) # Encodes the actual object and its type, for correct decoding later. 64 | if DEBUG_PRINT_PAYLOADS: 65 | logger.debug(json_payload) 66 | 67 | legacy_invoke_async = config.bool("invoker.lambda.legacy.async") 68 | status_field = 'Status' if legacy_invoke_async else 'StatusCode' 69 | 70 | if legacy_invoke_async: 71 | response = client.invoke_async(FunctionName=lambda_name, InvokeArgs=json_payload) 72 | else: 73 | response = client.invoke(FunctionName=lambda_name, InvocationType='Event', Payload=json_payload) 74 | 75 | if response[status_field] == LAMBDA_ASYNC_OK_STATUS: 76 | result = BaseApiResult(success=True, error_message=None) 77 | else: 78 | message = f"Response status differs from expected ({LAMBDA_ASYNC_OK_STATUS}): {response}" 79 | result = BaseApiResult(success=False, error_message=message) 80 | except Exception as e: 81 | result = BaseApiResult(success=False, error_message=f"Failed to invoke lambda function '{lambda_name}': {e}") 82 | return result 83 | 84 | 85 | class AwsLambdaInvoker(AsyncInvoker): 86 | def _enqueue(self, requests) -> None: 87 | lambda_name = config.get('invoker.lambda.name') 88 | num_threads = config.int('invoker.lambda.threads') 89 | boto_config = Config(**config.aws_config_dict(service='lambda')) 90 | client = boto3.client('lambda', 91 | **config.aws_client_settings(service='lambda'), 92 | config=boto_config) 93 | logger.debug(f"Invoking lambdas, name: {lambda_name}, no. of invocations: {len(requests)}" 94 | f", no. of invoker threads: {num_threads}") 95 | futures = [] 96 | start_invoke_time = time.time() 97 | # TODO backlog consider lifecycle of the thread pool 98 | with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor: 99 | for req in requests: 100 | futures.append(executor.submit(_worker_task, req, client, lambda_name)) 101 | futures = concurrent.futures.as_completed(futures) # Wait till all complete! 
102 | executor.shutdown() 103 | 104 | error_message = None 105 | for f in futures: 106 | assert f.done() 107 | if f.cancelled(): 108 | error_message = "Lambda invocation interrupted" 109 | elif f.exception(): 110 | error_message = f"Invocation failed with error: {f.exception()}" 111 | else: 112 | result = f.result() 113 | if not result or type(result) is not BaseApiResult: 114 | error_message = f"Invocation returned with response: {result}" 115 | result = cast(BaseApiResult, result) 116 | if not result.success: 117 | error_message = result.error_message 118 | if error_message: 119 | break 120 | 121 | if error_message: 122 | logger.error(error_message) 123 | raise Exception(error_message) 124 | else: 125 | logger.info(f"Async invocation done in {time.time() - start_invoke_time:.3f}") 126 | -------------------------------------------------------------------------------- /frocket/invoker/impl/registered_invokers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | from enum import Enum, auto 17 | from frocket.common.config import config 18 | from frocket.invoker.base_invoker import BaseInvoker 19 | from frocket.invoker.jobs.job import Job 20 | from frocket.invoker.impl.aws_lambda_invoker import AwsLambdaInvoker 21 | from frocket.invoker.impl.work_queue_invoker import WorkQueueInvoker 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class InvocationType(Enum): 27 | WORK_QUEUE = auto() 28 | AWS_LAMBDA = auto() 29 | 30 | 31 | INVOKER_CLASSES = { 32 | InvocationType.WORK_QUEUE: WorkQueueInvoker, 33 | InvocationType.AWS_LAMBDA: AwsLambdaInvoker 34 | } 35 | 36 | 37 | def new_invoker(request_builder: Job) -> BaseInvoker: 38 | invoker_type = InvocationType[config.get("invoker").upper()] 39 | invoker_class = INVOKER_CLASSES[invoker_type] 40 | logger.info(f"Creating invoker type: {invoker_class.__name__}, for request builder type: {type(request_builder)}") 41 | return invoker_class(request_builder) 42 | -------------------------------------------------------------------------------- /frocket/invoker/impl/work_queue_invoker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Invoke tasks by enqueing them in the datastore. Not much to do here :-) 3 | """ 4 | # Copyright 2021 The Funnel Rocket Maintainers 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from frocket.invoker.impl.async_invoker import AsyncInvoker 19 | 20 | 21 | class WorkQueueInvoker(AsyncInvoker): 22 | def _enqueue(self, requests) -> None: 23 | self._datastore.enqueue(requests) 24 | -------------------------------------------------------------------------------- /frocket/invoker/invoker_api.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the "Funnel Rocket" frontend API - wrapped by the CLI & API server, and may be embedded in other apps. 3 | Clients are not expected to bypass this API (call the datastore directly, initialize an invoker, etc.) 4 | """ 5 | # Copyright 2021 The Funnel Rocket Maintainers 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | import concurrent.futures 20 | import logging 21 | import time 22 | from typing import List, Optional, cast, Union 23 | from frocket.common.config import config 24 | from frocket.common.dataset import DatasetInfo, DatasetShortSchema, DatasetSchema, DatasetPartsInfo 25 | from frocket.common.tasks.registration import RegistrationJobResult, RegisterArgs, UnregisterApiResult 26 | from frocket.common.tasks.query import QueryJobResult 27 | from frocket.common.tasks.async_tracker import AsyncJobTracker, AsyncJobStatusUpdater 28 | from frocket.common.validation.query_validator import QueryValidator 29 | from frocket.common.validation.result import QueryValidationResult 30 | from frocket.datastore.registered_datastores import get_datastore 31 | from frocket.invoker.jobs.query_job import QueryJob 32 | from frocket.invoker.jobs.registration_job import RegistrationJob 33 | from frocket.invoker.impl.registered_invokers import new_invoker 34 | 35 | logger = logging.getLogger(__name__) 36 | executor = concurrent.futures.ThreadPoolExecutor() 37 | 38 | # TODO backlog allow configurable timeout per job type (async or not) 39 | ASYNC_MAX_WAIT = config.int("invoker.run.timeout") * 1.1 # Adding a bit of grace around the invoker 40 | 41 | 42 | def _unregister_safety_interval() -> int: 43 | """How long after a dataset was last used unregistering is blocked (can be set to zero, or overridden with force=True).""" 44 | interval = config.get('unregister.last.used.interval', None) 45 | if not interval: # Not defined, or empty string (explicit '0' is truthy) 46 | interval = config.int('invoker.run.timeout') * 2 47 | else: 48 | interval = int(interval) 49 | return interval 50 | 51 | 52 | def register_dataset(args: RegisterArgs) -> RegistrationJobResult: 53 | request_builder = RegistrationJob(args) 54 | invoker = new_invoker(request_builder) 55 | result = cast(RegistrationJobResult, invoker.run()) 56 | logger.info(f"Registration {'successful' if result.success else f'failed! 
{result.error_message}'}") 57 | return result 58 | 59 | 60 | def register_dataset_async(args: RegisterArgs, set_max_wait: bool = True) -> AsyncJobTracker: 61 | """The async version starts the invoker in a separate thread and then returns, handing back 62 | an AsyncJobTracker to poll for progress/completion.""" 63 | def worker(register_args, async_status): 64 | invoker = new_invoker(RegistrationJob(register_args)) 65 | return invoker.run(async_status) 66 | 67 | async_status = AsyncJobStatusUpdater(max_wait=(ASYNC_MAX_WAIT if set_max_wait else None)) 68 | executor.submit(worker, args, async_status) 69 | logger.info(f"Submitted async registration for dataset named {args.name} in basepath {args.basepath}") 70 | return async_status 71 | 72 | 73 | def get_dataset(name: str, throw_if_missing: bool = False) -> Optional[DatasetInfo]: 74 | dataset = get_datastore().dataset_info(name) 75 | if not dataset and throw_if_missing: 76 | raise Exception(f"Dataset '{name}' not found") 77 | return dataset 78 | 79 | 80 | def get_dataset_schema(dataset: DatasetInfo, full: bool = False) -> Union[DatasetSchema, DatasetShortSchema]: 81 | return get_datastore().schema(dataset) if full else get_datastore().short_schema(dataset) 82 | 83 | 84 | def get_dataset_parts(dataset: DatasetInfo) -> DatasetPartsInfo: 85 | return get_datastore().dataset_parts_info(dataset) 86 | 87 | 88 | def unregister_dataset(name: str, force: bool = False) -> UnregisterApiResult: 89 | dataset = get_dataset(name=name) 90 | if not dataset: 91 | return UnregisterApiResult(success=True, error_message=None, 92 | dataset_found=False, dataset_last_used=None) 93 | 94 | datastore = get_datastore() 95 | last_used = datastore.last_used(dataset) 96 | if last_used: 97 | time_since_used = int(time.time() - last_used) 98 | safety_interval = _unregister_safety_interval() 99 | message = f"Dataset was last used {time_since_used} seconds ago, which is less than safety interval " \ 100 | f"{safety_interval}. Use the 'force' parameter to unregister anyway." 
101 | if safety_interval > time_since_used and not force: 102 | return UnregisterApiResult(success=False, error_message=message, 103 | dataset_found=True, dataset_last_used=last_used) 104 | 105 | get_datastore().remove_dataset_info(name) 106 | return UnregisterApiResult(success=True, error_message=None, 107 | dataset_found=True, dataset_last_used=last_used) 108 | 109 | 110 | def expand_and_validate_query(dataset: DatasetInfo, query: dict) -> QueryValidationResult: 111 | short_schema = get_dataset_schema(dataset) 112 | return QueryValidator(query, dataset, short_schema).expand_and_validate() 113 | 114 | 115 | def _build_query_job(dataset: DatasetInfo, 116 | query: dict, 117 | validation_result: QueryValidationResult) -> QueryJob: 118 | """If the query was already validated, skip re-validating.""" 119 | if validation_result: 120 | assert validation_result.success 121 | assert query in [validation_result.source_query, validation_result.expanded_query] 122 | else: 123 | validation_result = expand_and_validate_query(dataset, query) 124 | if not validation_result.success: 125 | raise Exception(f"Query validation failed: {validation_result.error_message}") 126 | 127 | get_datastore().mark_used(dataset) 128 | dataset_parts = get_datastore().dataset_parts_info(dataset) 129 | short_schema = get_datastore().short_schema(dataset) 130 | return QueryJob(dataset, dataset_parts, short_schema, 131 | validation_result.expanded_query, validation_result.used_columns) 132 | 133 | 134 | def run_query(dataset: DatasetInfo, 135 | query: dict, 136 | validation_result: QueryValidationResult = None) -> QueryJobResult: 137 | job_builder = _build_query_job(dataset, query, validation_result) 138 | invoker = new_invoker(job_builder) 139 | result = cast(QueryJobResult, invoker.run()) 140 | if result.success: 141 | logger.info("Query completed successfully") 142 | else: 143 | logger.error(f"Query failed with message: {result.error_message}") 144 | return result 145 | 146 | 147 | def run_query_async(dataset: DatasetInfo, 148 | query: dict, 149 | set_max_wait: bool = True, 150 | validation_result: QueryValidationResult = None) -> AsyncJobTracker: 151 | """The async version starts the invoker in a separate thread and then returns, handing back 152 | an AsyncJobTracker to poll for progress/completion.""" 153 | def worker(job_builder, async_status): 154 | invoker = new_invoker(job_builder) 155 | return invoker.run(async_status) 156 | 157 | job_builder = _build_query_job(dataset, query, validation_result) 158 | async_status = AsyncJobStatusUpdater(max_wait=(ASYNC_MAX_WAIT if set_max_wait else None)) 159 | executor.submit(worker, job_builder, async_status) 160 | logger.info(f"Submitted async query for dataset '{dataset.id.name}'") 161 | return async_status 162 | 163 | 164 | def list_datasets() -> List[DatasetInfo]: 165 | datasets = get_datastore().datasets() 166 | return datasets 167 | -------------------------------------------------------------------------------- /frocket/invoker/jobs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/invoker/jobs/job.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import abstractmethod, ABCMeta 16 | from typing import List, Optional, Set 17 | from frocket.common.dataset import DatasetPartId, DatasetPartsInfo 18 | from frocket.common.metrics import LabelsDict 19 | from frocket.common.tasks.base import BaseTaskRequest, BaseTaskResult, BaseJobResult, JobStatus, ErrorMessage 20 | from frocket.common.tasks.async_tracker import AsyncJobStatusUpdater 21 | 22 | 23 | class Job(metaclass=ABCMeta): 24 | """ 25 | For each job type (registration, query, and future ones) there is a concrete subclass. 26 | That concrete class is handed to the invoker object, which is agnostic to the job details but calls the 27 | job's methods in a set order. 28 | 29 | The flow, at a high level: 30 | 1. On prerun(), the job validates its arguments (and can fail by returning an error message) and can prepare data 31 | for building tasks. 32 | 33 | 2. When build_tasks() is called by the invoker - return a list of concrete task request objects, 34 | all with attempt no. 0. 35 | 36 | 3. If the job supports task self-selection by workers, it should override dataset_parts_to_publish() and 37 | return a list of parts to be consumed by workers (workers would try to select parts they have cached locally). 38 | This list is published via the datastore before tasks are invoked. 39 | 40 | 4. In case the invoker decides to retry a task, it calls build_retry_task() to create a specific retry task. 41 | 42 | 5. After all tasks have completed, either successfully or not, complete() is called to run any validations on the 43 | final results of all tasks, and perform any needed aggregations. The job may fail at this stage if the results of 44 | tasks, taken together, are invalid. 45 | 46 | 6. Lastly, build_result() is called to construct the final job result. 47 | At this stage, the final success status of the job should not change.
48 | """ 49 | _request_id = None 50 | _labels = {} 51 | 52 | @property 53 | def request_id(self) -> Optional[str]: 54 | return self._request_id 55 | 56 | @request_id.setter 57 | def request_id(self, request_id: str): 58 | self._request_id = request_id 59 | 60 | def prerun(self, async_updater: AsyncJobStatusUpdater = None) -> Optional[ErrorMessage]: 61 | pass 62 | 63 | @abstractmethod 64 | def build_tasks(self) -> List[BaseTaskRequest]: 65 | pass 66 | 67 | def dataset_parts_to_publish(self) -> Optional[Set[DatasetPartId]]: 68 | return None 69 | 70 | @abstractmethod 71 | def total_tasks(self) -> int: 72 | pass 73 | 74 | @abstractmethod 75 | def build_retry_task(self, attempt_no: int, task_index: int) -> BaseTaskRequest: 76 | pass 77 | 78 | def complete(self, 79 | tasks_final_status: JobStatus, 80 | latest_task_results: List[BaseTaskResult], 81 | async_updater: AsyncJobStatusUpdater = None) -> JobStatus: 82 | return tasks_final_status 83 | 84 | @abstractmethod 85 | def build_result(self, 86 | base_attributes: dict, 87 | final_status: JobStatus, 88 | latest_task_results: List[BaseTaskResult]) -> BaseJobResult: 89 | pass 90 | 91 | @property 92 | def metric_labels(self) -> LabelsDict: 93 | return self._labels 94 | 95 | @abstractmethod 96 | def parts_info(self) -> Optional[DatasetPartsInfo]: 97 | pass 98 | -------------------------------------------------------------------------------- /frocket/invoker/jobs/query_job.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import time 16 | from typing import List, cast 17 | from frocket.common.config import config 18 | from frocket.common.dataset import DatasetInfo, DatasetPartId, DatasetPartsInfo, DatasetShortSchema 19 | from frocket.common.metrics import JobTypeLabel, DATASET_LABEL 20 | from frocket.common.tasks.query import PartSelectionMode, QueryTaskRequest, QueryTaskResult, QueryJobResult, QueryResult 21 | from frocket.invoker.jobs.job import Job 22 | 23 | 24 | class QueryJob(Job): 25 | def __init__(self, dataset: DatasetInfo, parts: DatasetPartsInfo, 26 | short_schema: DatasetShortSchema, query: dict, used_columns: List[str], 27 | worker_can_select_part: bool = None): 28 | self._dataset = dataset 29 | self._parts = parts 30 | self._query = query 31 | self._used_columns = used_columns 32 | self._paths = parts.fullpaths(parent=dataset) 33 | self._worker_can_select_part = worker_can_select_part \ 34 | if worker_can_select_part is not None else config.bool('worker.self.select.enabled') 35 | if config.bool('dataset.categorical.potential.use'): 36 | self._load_as_categoricals = short_schema.potential_categoricals 37 | else: 38 | self._load_as_categoricals = None 39 | self._labels = { 40 | JobTypeLabel.QUERY.label_name: JobTypeLabel.QUERY.label_value, 41 | DATASET_LABEL: self._dataset.id.name 42 | } 43 | 44 | def parts_info(self): 45 | return self._parts 46 | 47 | def total_tasks(self): 48 | return len(self._paths) 49 | 50 | def build_tasks(self): 51 | if self._worker_can_select_part: 52 | mode = PartSelectionMode.SELECTED_BY_WORKER 53 | else: 54 | mode = PartSelectionMode.SET_BY_INVOKER 55 | 56 | requests = [self._build_task(mode, i) for i in range(self.total_tasks())] 57 | return requests 58 | 59 | def dataset_parts_to_publish(self): 60 | if self._worker_can_select_part: 61 | parts_to_publish = {DatasetPartId(self._dataset.id, path, part_index) 62 | for part_index, path in enumerate(self._paths)} 63 | return parts_to_publish 64 | else: 65 | return None 66 | 67 | def build_retry_task(self, attempt_no, task_index): 68 | return self._build_task(PartSelectionMode.SET_BY_INVOKER, 69 | part_index=task_index, 70 | attempt_no=attempt_no) 71 | 72 | def _build_task(self, mode: PartSelectionMode, part_index: int, attempt_no: int = 0) -> QueryTaskRequest: 73 | if mode == PartSelectionMode.SET_BY_INVOKER: 74 | invoker_set_part = DatasetPartId(dataset_id=self._dataset.id, 75 | path=self._paths[part_index], 76 | part_idx=part_index) 77 | task_index = part_index 78 | elif mode == PartSelectionMode.SELECTED_BY_WORKER: 79 | assert attempt_no == 0 80 | invoker_set_part = None 81 | task_index = None 82 | else: 83 | raise Exception("Unknown mode {mode}") 84 | 85 | request = QueryTaskRequest( 86 | request_id=self._request_id, 87 | invoke_time=time.time(), 88 | dataset=self._dataset, 89 | load_as_categoricals=self._load_as_categoricals, 90 | query=self._query, 91 | invoker_set_task_index=task_index, 92 | attempt_no=attempt_no, 93 | mode=mode, 94 | invoker_set_part=invoker_set_part, 95 | used_columns=self._used_columns) 96 | return request 97 | 98 | def build_result(self, base_attributes, final_status, latest_task_results): 99 | aggregated_query_result = None 100 | # Only if query was successful, aggregate query results (for each task - from a single successful attempt) 101 | if final_status.success: 102 | latest_task_results = cast(List[QueryTaskResult], latest_task_results) 103 | query_results = [task_result.query_result for task_result in latest_task_results] 104 | aggregated_query_result = cast(QueryResult, 105 | 
QueryResult.reduce(query_results)) 106 | 107 | result = QueryJobResult( 108 | **base_attributes, 109 | query=aggregated_query_result.query if aggregated_query_result else None, 110 | funnel=aggregated_query_result.funnel if aggregated_query_result else None 111 | ) 112 | return result 113 | -------------------------------------------------------------------------------- /frocket/invoker/metrics_frame.py: -------------------------------------------------------------------------------- 1 | """ 2 | Transform a given list of metrics from multiple sources (invoker, workers) into one DataFrame, for easy analysis. 3 | Export the data to a file and/or Prometheus, depending on configuration. 4 | """ 5 | # Copyright 2021 The Funnel Rocket Maintainers 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | import logging 20 | from typing import List, Dict, Union 21 | import pandas as pd 22 | from pandas import DataFrame 23 | from frocket.common.config import config 24 | from frocket.common.metrics import SourceAndMetricTuple, ALL_LABEL_NAMES 25 | from frocket.invoker import prom_adapter 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | METRIC_SOURCE_COLUMN = 'source' 30 | METRIC_NAME_COLUMN = 'metric' 31 | METRIC_VALUE_COLUMN = 'value' 32 | 33 | PANDAS_FLOAT_FORMAT = '{:.5f}' # No pesky scientific notation ;-) 34 | pd.options.display.float_format = PANDAS_FLOAT_FORMAT.format 35 | 36 | # The 'last run' file, if defined, stores the most recent job's metrics as a file in CSV or Parquet format (by extension) 37 | EXPORT_LASTRUN_FILE = config.get('metrics.export.lastrun', None) 38 | EXPORT_TO_PROMETHEUS = config.bool('metrics.export.prometheus') 39 | 40 | if EXPORT_TO_PROMETHEUS: 41 | prom_adapter.init_prom_metrics() 42 | 43 | 44 | class MetricsFrame: 45 | def __init__(self, source_and_metrics: List[SourceAndMetricTuple]): 46 | self._sources = [ms.source for ms in source_and_metrics] 47 | self._metrics = [ms.metric for ms in source_and_metrics] 48 | self._build_df() 49 | 50 | def _build_df(self): 51 | """ 52 | Build the DataFrame: each row is one reported metric, but the DF is created with columns. Hence, we're creating 53 | columns here rather than rows. 54 | """ 55 | metric_source_column = self._sources 56 | metric_name_column = [m.name.name for m in self._metrics] # Metric names column 57 | metric_value_column = [m.value for m in self._metrics] # Metric values column 58 | 59 | # Init empty columns for all possible label names. 
60 | # Cells not filled (see below) will remain empty (possibly even entire columns) 61 | label_columns: Dict[str, List[Union[str, None]]] = {} 62 | for label_name in ALL_LABEL_NAMES: 63 | label_columns[label_name] = [None] * len(self._metrics) 64 | 65 | # Fill label columns with whichever labels are actually set per metric 66 | for i, metric in enumerate(self._metrics): 67 | for label_name, label_value in metric.labels.items(): 68 | label_columns[label_name][i] = label_value 69 | 70 | df_columns = {METRIC_SOURCE_COLUMN: metric_source_column, 71 | METRIC_NAME_COLUMN: metric_name_column, 72 | METRIC_VALUE_COLUMN: metric_value_column, 73 | **label_columns} 74 | self._df = pd.DataFrame(data=df_columns) 75 | # logger.debug(f"Types: {self._df.dtypes.index.tolist()}, data:\n{self._df}") # If needed 76 | 77 | def export(self) -> None: 78 | if EXPORT_LASTRUN_FILE: 79 | self._to_lastrun_file(EXPORT_LASTRUN_FILE) 80 | if EXPORT_TO_PROMETHEUS: 81 | self._to_prometheus() 82 | 83 | def _to_prometheus(self) -> None: 84 | prom_adapter.update(self._metrics) 85 | 86 | def _to_lastrun_file(self, filename: str) -> None: 87 | if filename.lower().endswith('.parquet'): 88 | self._df.to_parquet(filename, index=False) 89 | else: 90 | self._df.to_csv(filename, float_format=PANDAS_FLOAT_FORMAT, index=False) 91 | 92 | @property 93 | def dataframe(self) -> DataFrame: 94 | return self._df 95 | -------------------------------------------------------------------------------- /frocket/invoker/prom_adapter.py: -------------------------------------------------------------------------------- 1 | """ 2 | While metrics support in Funnel Rocket is built with Prometheus (or more generally OpenMetrics) in mind, 3 | all Prometheus-specific code is in this module. 4 | 5 | TODO backlog support a help string (documentation) for each member in the MetricName enum 6 | """ 7 | # Copyright 2021 The Funnel Rocket Maintainers 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License.
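# --- Illustrative sketch (added for clarity, not part of the original module) ---
# MetricsFrame above yields one DataFrame row per reported metric: 'source', 'metric' and 'value'
# columns plus one column per possible label name (labels not set for a metric stay None).
# A hypothetical example of what such a frame might look like; the rows, source names and label
# columns below are made up for illustration only:
#
#        source            metric                  value      ...label columns...
#   0    invoker           ASYNC_ENQUEUE_SECONDS   0.42000     ...
#   1    task-0-attempt-0  TASK_TOTAL_RUN_SECONDS  1.25000     ...
#   2    task-1-attempt-0  SCANNED_ROWS            80000.00000 ...
# --- End of sketch ---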
20 | 21 | from typing import List, Dict, Type 22 | from prometheus_client import Counter, Histogram 23 | from prometheus_client.metrics import MetricWrapperBase 24 | from frocket.common.config import config 25 | from frocket.common.helpers.utils import memoize 26 | from frocket.common.metrics import MetricName, MeasuredUnit, supported_label_names, MetricData, empty_label_names 27 | 28 | prom_counters: Dict[MetricName, Counter] = {} 29 | prom_histograms: Dict[MetricName, Histogram] = {} 30 | 31 | 32 | @memoize 33 | def buckets_by_unit(unit: MeasuredUnit) -> List[float]: 34 | """Each unit (seconds, bytes, dollars) may have its own buckets configured, or fallback to the default.""" 35 | assert unit is not MeasuredUnit.COUNT # COUNT should not use a histogram 36 | buckets_string = config.get_with_fallbacks(f'metrics.buckets.{unit.name.lower()}', 'metrics.buckets.default') 37 | buckets = [float(b) for b in buckets_string.split(',')] 38 | return buckets 39 | 40 | 41 | def unit_to_metric_type(unit: MeasuredUnit) -> Type[MetricWrapperBase]: 42 | """The type of Prometheus metric is automatically derived from the type of measured unit.""" 43 | if unit is MeasuredUnit.COUNT: 44 | return Counter 45 | else: 46 | return Histogram 47 | 48 | 49 | def init_prom_metrics(): 50 | """In Prometheus clients, all metrics should be defined only once before use, along with their possible labels. 51 | This is not a technical limitation of Prometheus itself, but rather enforced by official clients.""" 52 | for e in MetricName: 53 | base_args = {'name': e.name.lower(), 54 | 'documentation': e.name, 55 | 'labelnames': supported_label_names(e)} 56 | metric_type = unit_to_metric_type(e) 57 | if metric_type == Counter: 58 | prom_counters[e] = Counter(**base_args) 59 | elif metric_type == Histogram: 60 | prom_histograms[e] = Histogram(**base_args, buckets=buckets_by_unit(e.unit)) 61 | 62 | 63 | def update(metrics: List[MetricData]): 64 | """Update (increment/observe) new values after a job completes, etc.""" 65 | for md in metrics: 66 | empty_labels = empty_label_names(md.name) 67 | all_labels = {**empty_labels, **md.labels} 68 | metric_type = unit_to_metric_type(md.name.unit) 69 | if metric_type == Counter: 70 | prom_counters[md.name].labels(**all_labels).inc(md.value) 71 | elif metric_type == Histogram: 72 | prom_histograms[md.name].labels(**all_labels).observe(md.value) 73 | -------------------------------------------------------------------------------- /frocket/invoker/stats_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Build JobStats (returned to the client after job completion) - based mostly on the DataFrame of collected metrics from 3 | the invoker and all workers. 4 | """ 5 | # Copyright 2021 The Funnel Rocket Maintainers 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | import logging 20 | import sys 21 | from typing import Optional, Union, List, Dict 22 | import pandas 23 | import numpy as np 24 | from pandas import DataFrame 25 | from frocket.common.config import config 26 | from frocket.common.dataset import DatasetPartsInfo, PartNamingMethod 27 | from frocket.common.tasks.base import JobStats, JobDatasetStats, JobInvokerStats, TimingStats, JobWorkerStats 28 | from frocket.invoker.metrics_frame import MetricsFrame, METRIC_NAME_COLUMN, METRIC_VALUE_COLUMN, METRIC_SOURCE_COLUMN 29 | from frocket.common.metrics import MetricName, ComponentLabel, SUCCESS_LABEL, MetricLabelEnum, \ 30 | WorkerStartupLabel, LoadFromLabel 31 | 32 | logger = logging.getLogger(__name__) 33 | 34 | TASK_COMPLETION_GRANULARITY_SECONDS = 0.25 # Data series of task success over time is measured in this resolution 35 | TIMING_PERCENTILES = [float(pct) for pct in config.get('stats.timing.percentiles').split(',')] 36 | MIN_METRICS_FOR_PERCENTILES = 20 # Below this sample count, don't return percentiles 37 | MIN_METRICS_FOR_99_PERCENTILE = 100 # Below this count, don't return 99th percentile 38 | # List of keys to pull from Pandas' describe() 39 | TIMING_DESCRIBE_KEYS = ['min', 'mean', 'max'] + [f"{int(pct*100)}%" for pct in TIMING_PERCENTILES] 40 | 41 | 42 | def build_stats(frame: MetricsFrame, parts_info: DatasetPartsInfo = None) -> JobStats: 43 | df = frame.dataframe 44 | if df is None: # In job failure cases 45 | return JobStats() 46 | 47 | if parts_info: 48 | ds_stats = JobDatasetStats(total_size=parts_info.total_size, parts=parts_info.total_parts) 49 | else: 50 | ds_stats = None 51 | 52 | # Invoker stats 53 | all_task_rows_df = _filter_by_label(df, ComponentLabel.WORKER) 54 | successful_task_rows_df = _filter_by_success(all_task_rows_df) 55 | total_tasks = _count_tasks(all_task_rows_df) 56 | failed_tasks = total_tasks - _count_tasks(successful_task_rows_df) 57 | 58 | invoker_stats = JobInvokerStats( 59 | enqueue_time=_sum_value(df, MetricName.ASYNC_ENQUEUE_SECONDS, single_value=True), 60 | poll_time=_sum_value(df, MetricName.ASYNC_POLL_SECONDS, single_value=True), 61 | total_tasks=total_tasks, 62 | failed_tasks=failed_tasks, 63 | task_success_over_time=_task_success_over_time(successful_task_rows_df) 64 | # TODO backlog add: lost_task_retries as counted by the invoker; support sync. invokers? 
65 | ) 66 | 67 | # Worker stats 68 | worker_stats = JobWorkerStats( 69 | cold_tasks=_count_tasks(_filter_by_label(successful_task_rows_df, WorkerStartupLabel.COLD)), 70 | warm_tasks=_count_tasks(_filter_by_label(successful_task_rows_df, WorkerStartupLabel.WARM)), 71 | scanned_rows=_sum_value(successful_task_rows_df, MetricName.SCANNED_ROWS, as_int=True), 72 | scanned_groups=_sum_value(successful_task_rows_df, MetricName.SCANNED_GROUPS, as_int=True), 73 | cache=_cache_performance(successful_task_rows_df), 74 | invoke_latency=_timing_stats(successful_task_rows_df, MetricName.INVOKE_TO_RUN_SECONDS), 75 | load_time=_timing_stats(successful_task_rows_df, MetricName.TASK_TOTAL_LOAD_SECONDS), 76 | total_time=_timing_stats(successful_task_rows_df, MetricName.TASK_TOTAL_RUN_SECONDS) 77 | # TODO backlog add: loaded_column_types - mapping of column type to count, which affects load time 78 | ) 79 | 80 | job_stats = JobStats( 81 | total_time=_sum_value(df, MetricName.INVOKER_TOTAL_SECONDS, single_value=True), 82 | cost=_total_cost(df), 83 | dataset=ds_stats, 84 | invoker=invoker_stats, 85 | worker=worker_stats) 86 | return job_stats 87 | 88 | 89 | def _task_success_over_time(task_rows_df: DataFrame) -> Dict[float, int]: 90 | """Return a sparse series of data points - for each time slot (e.g. 0.25 secs) since the job started, return how 91 | many tasks completed successfully in that slot. Non-cumulative, does not include zeros.""" 92 | task_duration_rows = _filter_by_metrics( 93 | task_rows_df, metrics=[MetricName.INVOKE_TO_RUN_SECONDS, MetricName.TASK_TOTAL_RUN_SECONDS]) 94 | task_durations = task_duration_rows.groupby(METRIC_SOURCE_COLUMN)[METRIC_VALUE_COLUMN].sum() 95 | quantized_task_durations = \ 96 | np.ceil(task_durations / TASK_COMPLETION_GRANULARITY_SECONDS) * TASK_COMPLETION_GRANULARITY_SECONDS 97 | return quantized_task_durations.value_counts().sort_index().to_dict() 98 | 99 | 100 | def _cache_performance(task_rows_df: DataFrame) -> Dict[str, int]: 101 | return { 102 | # Note the 'source' is always the case for locally-loaded files, in which case caching is N/A. 
103 | 'source': _count_tasks(_filter_by_label(task_rows_df, LoadFromLabel.SOURCE)), 104 | 'diskCache': _count_tasks(_filter_by_label(task_rows_df, LoadFromLabel.DISK_CACHE)) 105 | } 106 | 107 | 108 | def _sum_value(df: DataFrame, metric: MetricName, 109 | single_value: bool = False, 110 | as_int: bool = False) -> Union[float, int, None]: 111 | df = _filter_by_metrics(df, metric) 112 | if single_value: 113 | assert len(df) <= 1 114 | if df.empty: 115 | return None 116 | else: 117 | values_sum = df[METRIC_VALUE_COLUMN].sum() 118 | return int(values_sum) if as_int else float(values_sum) 119 | 120 | 121 | def _count(df: DataFrame, metric: MetricName) -> int: 122 | return _filter_by_metrics(df, metric)[METRIC_VALUE_COLUMN].count() 123 | 124 | 125 | def _timing_stats(task_rows_df: DataFrame, metric: MetricName) -> TimingStats: 126 | values_df = _filter_by_metrics(task_rows_df, metric)[METRIC_VALUE_COLUMN] 127 | if len(values_df) < MIN_METRICS_FOR_PERCENTILES: 128 | percentiles = [0.5] 129 | else: 130 | percentiles = TIMING_PERCENTILES 131 | if len(values_df) < MIN_METRICS_FOR_99_PERCENTILE: 132 | percentiles = [pct for pct in percentiles if pct < 0.99] 133 | 134 | raw_stats = values_df.describe(percentiles=percentiles).to_dict() 135 | return {k: v for k, v in raw_stats.items() 136 | if k in TIMING_DESCRIBE_KEYS and not np.isnan(v)} 137 | 138 | 139 | def _filter_by_metrics(df: DataFrame, metrics: Union[MetricName, List[MetricName]]) -> DataFrame: 140 | if type(metrics) is MetricName: 141 | return df[df[METRIC_NAME_COLUMN] == metrics.name] 142 | else: 143 | return df[df[METRIC_NAME_COLUMN].isin([m.name for m in metrics])] 144 | 145 | 146 | def _filter_by_label(df: DataFrame, label: MetricLabelEnum) -> DataFrame: 147 | return df[df[label.label_name] == label.label_value.lower()] 148 | 149 | 150 | def _filter_by_success(df: DataFrame, value: bool = True) -> DataFrame: 151 | return df[df[SUCCESS_LABEL] == str(value)] 152 | 153 | 154 | def _count_tasks(task_rows_df: DataFrame) -> int: 155 | """Each task attempt (e.g. task index 117, attempt 2) has a unique name in the source column, which ofc appears in 156 | multiple rows. 
This counts the number of unique task attempt IDs in the given DF.""" 157 | return task_rows_df[METRIC_SOURCE_COLUMN].nunique() 158 | 159 | 160 | def _total_cost(df: DataFrame) -> Optional[float]: 161 | cost_reports_df = _filter_by_metrics(df, MetricName.COST_DOLLARS) 162 | num_reports = len(cost_reports_df) 163 | if num_reports == 0: 164 | logger.debug(f"Total cost: no metrics found") 165 | return None 166 | else: 167 | total_cost = float(cost_reports_df[METRIC_VALUE_COLUMN].sum()) 168 | logger.debug(f"Total cost: ${total_cost:.6f} (sum of {num_reports} metric reports)") 169 | return total_cost 170 | 171 | 172 | # Stand-alone testing 173 | if __name__ == "__main__": 174 | config.init_logging(force_level=logging.DEBUG, force_console_output=True) 175 | filename = config.get('metrics.export.lastrun', None) 176 | if not filename: 177 | sys.exit('No lastrun file defined') 178 | 179 | df = pandas.read_parquet(filename) 180 | dummy_frame = MetricsFrame([]) 181 | dummy_frame._df = df 182 | dummy_parts_info = DatasetPartsInfo(naming_method=PartNamingMethod.LIST, total_parts=4, total_size=1024) 183 | build_stats(dummy_frame, dummy_parts_info) 184 | -------------------------------------------------------------------------------- /frocket/worker/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/worker/impl/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/worker/impl/aws_lambda_metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Calculate physical memory & cost for AWS Lambda-based workers. 3 | 4 | Important note re. Lambda billing: although this is not explicitly stated and is subject to change, you are not charged for 5 | the duration in which a cold-started Lambda loads up till the point when the actual handler is called - 6 | meaning, all imports are "free"! This means that cold-started Lambdas mainly impact clock-time latency but typically 7 | won't inflate cost to a similar degree.
This is in line with how the task duration is measured w/o cold-start imports. 8 | """ 9 | # Copyright 2021 The Funnel Rocket Maintainers 10 | # 11 | # Licensed under the Apache License, Version 2.0 (the "License"); 12 | # you may not use this file except in compliance with the License. 13 | # You may obtain a copy of the License at 14 | # 15 | # http://www.apache.org/licenses/LICENSE-2.0 16 | # 17 | # Unless required by applicable law or agreed to in writing, software 18 | # distributed under the License is distributed on an "AS IS" BASIS, 19 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 20 | # See the License for the specific language governing permissions and 21 | # limitations under the License. 22 | 23 | import logging 24 | import math 25 | import re 26 | from frocket.common.metrics import MetricName, EnvironmentMetricsProvider, MetricData 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | # TODO backlog set up a recurring task to check for pricing changes, so this can be updated. 31 | DEFAULT_PRICE_GB_SEC = 0.0000166667 32 | REGION_PRICING = { 33 | "eu-south-1": 0.0000195172, # Milan 34 | "me-south-1": 0.0000206667, # Bahrain 35 | "ap-northeast-3": 0.00002153, # Osaka 36 | "af-south-1": 0.0000221, # Cape Town 37 | "ap-east-1": 0.00002292 # Hong Kong 38 | } 39 | # Assume the actual run takes this many seconds more than what's been measured, 40 | # e.g. time spent decoding the task request, and time still to be spent on writing results (incl. these metrics...) 41 | # to the datastore. 42 | LAMBDA_TIME_OVERHEAD = 0.008 # 8ms, a conservative value based on a few observations 43 | 44 | 45 | class AwsLambdaMetricsProvider(EnvironmentMetricsProvider): 46 | def __init__(self, lambda_context): 47 | # See https://docs.aws.amazon.com/lambda/latest/dg/python-context.html 48 | assert lambda_context.__class__.__name__ == 'LambdaContext' 49 | self._lambda_context = lambda_context 50 | 51 | # What region are we in? 
figure out by the full ARN in the context 52 | # (ARN example: arn:aws:lambda:us-west-2:123456789012:function:my-function) 53 | arn_parts = lambda_context.invoked_function_arn.split(':') 54 | region = arn_parts[3] 55 | if re.match(r'\w+-\w+-\d+', region): 56 | self._region = region 57 | else: 58 | self._region = None 59 | logger.warning(f"Seems like an invalid region: '{region}' in ARN: {lambda_context.invoked_function_arn}, " 60 | f"not calculating cost") 61 | 62 | def _memory_bytes(self): 63 | mem_bytes = int(self._lambda_context.memory_limit_in_mb) * (1024 ** 2) 64 | return MetricData(MetricName.MACHINE_MEMORY_BYTES, mem_bytes) 65 | 66 | def _cost_dollars(self, duration=None): 67 | if not duration or not self._region: 68 | return None 69 | 70 | # noinspection PyBroadException 71 | try: 72 | memory_gb = self._memory_bytes().value / (1024 ** 3) 73 | # Lambdas are currently billed in 1ms granularity, so rounding up 74 | rounded_duration = duration + LAMBDA_TIME_OVERHEAD 75 | rounded_duration = math.ceil(rounded_duration * 1000) / 1000 76 | 77 | gb_second_units = rounded_duration * memory_gb 78 | cost_per_unit = REGION_PRICING.get(self._region, DEFAULT_PRICE_GB_SEC) 79 | cost = gb_second_units * cost_per_unit 80 | message = \ 81 | f"Cost: original duration: {duration: .4f} sec, rounded duration: {rounded_duration:.3f}, memory: " \ 82 | f"{memory_gb}GB, GB/second units: {gb_second_units}, unit cost for region {self._region}: " \ 83 | f"${cost_per_unit:.10f} => total run cost is ${cost:.10f}" 84 | logger.debug(message) 85 | return MetricData(MetricName.COST_DOLLARS, cost) 86 | except Exception: 87 | logger.exception("Failed calculating cost") 88 | return None 89 | -------------------------------------------------------------------------------- /frocket/worker/impl/aws_lambda_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | lambda_handler() in this module is the AWS Lambda's defined entrypoint. 3 | There's minimal code here that's Lambda-specific (== a good thing). 4 | """ 5 | # Copyright 2021 The Funnel Rocket Maintainers 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | import logging 20 | from typing import cast 21 | from frocket.common.serializable import Envelope 22 | from frocket.common.tasks.base import BaseTaskRequest 23 | from frocket.common.metrics import MetricsBag, WorkerStartupLabel, ComponentLabel 24 | from frocket.worker.impl.aws_lambda_metrics import AwsLambdaMetricsProvider 25 | from frocket.worker.runners.base_task_runner import BaseTaskRunner, TaskRunnerContext 26 | from frocket.common.config import config 27 | from frocket.worker.runners.registered_runners import REGISTERED_RUNNERS 28 | 29 | config.init_lambda_logging() # Adapted to the logger being already-inited by the Lambda runtime 30 | logger = logging.getLogger(__name__) 31 | 32 | # This flag only set when a new Lambda instance is cold-started. Warm lambdas would go straight to the handler function. 
33 | cold_start_flag = True 34 | 35 | 36 | def is_cold_start(): 37 | global cold_start_flag 38 | if cold_start_flag: 39 | cold_start_flag = False # For next invocation 40 | return True 41 | else: 42 | return False 43 | 44 | 45 | def init_task_metrics(lambda_context) -> MetricsBag: 46 | metrics = MetricsBag(component=ComponentLabel.WORKER, 47 | env_metrics_provider=AwsLambdaMetricsProvider(lambda_context)) 48 | if is_cold_start(): 49 | metrics.set_label_enum(WorkerStartupLabel.COLD) 50 | else: 51 | metrics.set_label_enum(WorkerStartupLabel.WARM) 52 | return metrics 53 | 54 | 55 | def lambda_handler(event, context): 56 | metrics = init_task_metrics(context) 57 | # The event JSON was already parsed to dict by the Lambda runtime - 58 | # now read from that dict that actual task request object 59 | envelope = Envelope.from_dict(event) 60 | req = cast(BaseTaskRequest, envelope.open(expected_superclass=BaseTaskRequest)) 61 | logger.info(f"Got request: {req}") 62 | 63 | result = None 64 | should_run, reject_reason = BaseTaskRunner.should_run(req) 65 | if should_run: 66 | runner_class = REGISTERED_RUNNERS[type(req)] 67 | runner = runner_class(req, TaskRunnerContext(metrics)) 68 | result = runner.run() 69 | 70 | """ 71 | A note about the Lambda response: unlike most request/response Lambdas, Funnel Rocket's invoker does not rely on the 72 | function's result coming from the Lambda directly (as it's invoked async.) but rather always through the datastore. 73 | The retry mechanism is also based on polling the tasks' status and result payload in the datastore, hence the 74 | Lambda itself should not normally return a non-200 status (unless it crashed unexpectedly), and the Lambda should 75 | be configured to have no retries at the AWS level. 76 | """ 77 | 78 | lambda_response = { 79 | 'statusCode': 200, 80 | } 81 | 82 | # Getting the result object in the Lambda response is still useful for manual testing 83 | if logger.isEnabledFor(logging.DEBUG): 84 | if result: 85 | lambda_response['result'] = result.to_json() 86 | else: 87 | lambda_response['reject_reason'] = reject_reason 88 | return lambda_response 89 | -------------------------------------------------------------------------------- /frocket/worker/impl/generic_env_metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the most generic implementation for getting runtime-environment metrics: 3 | it does not assume we know the cost of the host machine for the request duration, 4 | and getting physical memory size should generally work on Linux variants and OS X versions. 5 | """ 6 | # Copyright 2021 The Funnel Rocket Maintainers 7 | # 8 | # Licensed under the Apache License, Version 2.0 (the "License"); 9 | # you may not use this file except in compliance with the License. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 
19 | 20 | import logging 21 | import os 22 | from frocket.common.metrics import EnvironmentMetricsProvider, MetricData, MetricName 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | class GenericEnvMetricsProvider(EnvironmentMetricsProvider): 28 | def _memory_bytes(self): 29 | # Tested on Linux and OS X 30 | try: 31 | mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') 32 | except ValueError: 33 | # Fallback to sysctl in case that os.sysconf('SC_PHYS_PAGES') fails on OS X (seems version specific) 34 | # noinspection PyBroadException 35 | try: 36 | stream = os.popen('sysctl hw.memsize') 37 | mem_bytes = int(stream.read().split(' ')[1]) 38 | except Exception as e: 39 | logger.warning(f"Can't detect machine memory: {e}") 40 | return None 41 | 42 | return MetricData(MetricName.MACHINE_MEMORY_BYTES, mem_bytes) 43 | 44 | def _cost_dollars(self, duration=None): 45 | return None 46 | -------------------------------------------------------------------------------- /frocket/worker/impl/queue_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | A worker that gets its tasks by a blocking dequeue from the datastore. Doesn't get any simpler - 3 | but is easily scalable, and requires no load balancer or orchestrator (except for the queue's atomic guarantees). 4 | 5 | TODO backlog having a cache-friendly task assignment would require more work, if it makes sense to do. 6 | """ 7 | # Copyright 2021 The Funnel Rocket Maintainers 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | 21 | import logging 22 | from frocket.common.metrics import MetricsBag, WorkerStartupLabel, ComponentLabel 23 | from frocket.common.tasks.base import BaseTaskRequest 24 | from frocket.datastore.registered_datastores import get_datastore 25 | from frocket.worker.impl.generic_env_metrics import GenericEnvMetricsProvider 26 | from frocket.worker.runners.base_task_runner import BaseTaskRunner, TaskRunnerContext 27 | from frocket.common.config import config 28 | from frocket.worker.runners.registered_runners import REGISTERED_RUNNERS 29 | 30 | config.init_logging() 31 | logger = logging.getLogger(__name__) 32 | datastore = get_datastore() 33 | 34 | 35 | def handle(req: BaseTaskRequest) -> None: 36 | metrics = MetricsBag(component=ComponentLabel.WORKER, 37 | env_metrics_provider=GenericEnvMetricsProvider()) 38 | metrics.set_label_enum(WorkerStartupLabel.WARM) # Always warm this worker is, uhmmhmmhmmhmm 39 | 40 | runner_class = REGISTERED_RUNNERS[type(req)] 41 | runner = runner_class(req, TaskRunnerContext(metrics)) 42 | result = runner.run() 43 | if logger.isEnabledFor(logging.DEBUG): 44 | logger.debug(result.to_json()) 45 | 46 | 47 | def main_loop(): 48 | # TODO backlog currently workers that encounter an unexpected data format will crash rather than continuing to 49 | # consume and (probably) fail. 
This has a pro (outdated worker versions fail fast), but of course also cons - 50 | # consider the desired/configurable behavior (e.g. crash after N unexpected errors?) 51 | try: 52 | while True: 53 | logger.info('Waiting for work...') 54 | req: BaseTaskRequest = datastore.dequeue() 55 | if req: 56 | logger.info(f"Got request: {req}") 57 | 58 | should_run, reject_reason = BaseTaskRunner.should_run(req) 59 | if should_run: 60 | handle(req) 61 | else: 62 | logger.warning(f"Request rejected: {reject_reason}") 63 | except KeyboardInterrupt: 64 | logger.info('Bye') 65 | 66 | 67 | main_loop() 68 | -------------------------------------------------------------------------------- /frocket/worker/runners/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /frocket/worker/runners/base_task_runner.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for running a task in a worker - to be subclassed for concrete task runners. 3 | """ 4 | # Copyright 2021 The Funnel Rocket Maintainers 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import logging 19 | import time 20 | from abc import abstractmethod 21 | from typing import Optional 22 | from frocket.common.config import config 23 | from frocket.common.metrics import MetricName, MetricsBag 24 | from frocket.common.tasks.base import TaskStatus, BaseTaskRequest, BaseTaskResult, TaskAttemptId 25 | from frocket.datastore.datastore import Datastore 26 | from frocket.datastore.blobstore import Blobstore 27 | from frocket.datastore.registered_datastores import get_datastore, get_blobstore 28 | from frocket.worker.runners.part_loader import PartLoader, shared_part_loader 29 | 30 | logger = logging.getLogger(__name__) 31 | REQUEST_MAX_AGE = int(config.get("worker.reject.age")) 32 | DEFAULT_PREFLIGHT_DURATION_MS = config.int("part.selection.preflight.ms") 33 | 34 | 35 | class TaskRunnerContext: 36 | """simple dependency provider... (for easier testing).""" 37 | def __init__(self, 38 | metrics: MetricsBag, 39 | private_part_loader: PartLoader = None, 40 | preflight_duration_ms: int = None): 41 | self._metrics = metrics 42 | # By default, files are loaded and cached by a re-usable loader. 
43 | # Having a 'private' one allows testing in isolation 44 | self._part_loader = private_part_loader or shared_part_loader() 45 | if preflight_duration_ms is None: 46 | preflight_duration_ms = DEFAULT_PREFLIGHT_DURATION_MS 47 | self._preflight_duration_seconds = preflight_duration_ms / 1000 48 | 49 | @property 50 | def metrics(self) -> MetricsBag: 51 | return self._metrics 52 | 53 | # The underlying get_datastore and get_blobstore are memoized - initialized on demand 54 | @property 55 | def datastore(self) -> Datastore: 56 | return get_datastore() 57 | 58 | @property 59 | def blobstore(self) -> Blobstore: 60 | return get_blobstore() 61 | 62 | @property 63 | def part_loader(self) -> PartLoader: 64 | return self._part_loader 65 | 66 | @property 67 | def preflight_duration_seconds(self) -> float: 68 | return self._preflight_duration_seconds 69 | 70 | 71 | class BaseTaskRunner: 72 | # Returns (should_run, reject_reason) 73 | @classmethod 74 | def should_run(cls, req: BaseTaskRequest) -> (bool, str): 75 | if cls.time_since_invocation(req) > REQUEST_MAX_AGE: 76 | return False, f"request is more than {REQUEST_MAX_AGE} seconds old" 77 | else: 78 | return True, None 79 | 80 | @staticmethod 81 | def time_since_invocation(req: BaseTaskRequest): 82 | return time.time() - req.invoke_time 83 | 84 | def __init__(self, req: BaseTaskRequest, 85 | ctx: TaskRunnerContext): 86 | self._req = req 87 | self._ctx = ctx 88 | # TODO backlog initialize the attempt_id on init, if available (n/a here in self-select part mode) 89 | self._task_attempt_id: Optional[TaskAttemptId] = None 90 | 91 | def run(self) -> BaseTaskResult: 92 | error_message, engine_result = None, None 93 | with self._ctx.metrics.measure(MetricName.TASK_TOTAL_RUN_SECONDS): 94 | try: 95 | self._ctx.metrics.set_metric(MetricName.INVOKE_TO_RUN_SECONDS, 96 | self.time_since_invocation(self._req)) 97 | 98 | self._do_run() # Call concrete class to do the actual work 99 | final_status = TaskStatus.ENDED_SUCCESS 100 | except Exception as e: 101 | final_status = TaskStatus.ENDED_FAILED 102 | error_message = str(e) 103 | logger.exception('Task FAILED!') 104 | 105 | # Post-run: extracting the task metrics, building the concrete result object 106 | final_metrics = self._ctx.metrics.finalize(success=(final_status == TaskStatus.ENDED_SUCCESS)) 107 | # First, set the base attributes in a dict as kind of a 'skeleton' response - then pass it to the concrete 108 | # task runner to pass as **args to the concrete result class 109 | base_attributes = BaseTaskResult( 110 | task_index=self._task_attempt_id.task_index, 111 | status=final_status, 112 | error_message=error_message, 113 | metrics=final_metrics).shallowdict(include_none=True) 114 | result = self._build_result(base_attributes) # Call concrete class 115 | 116 | # If the job failed to get a task attempt ID assigned to it (self-select failed), 117 | # or if the datastore is not available - task status and result cannot be written 118 | # TODO backlog consider having an optional secondary channel to report such failures 119 | # (aside from centralized logging?) 
120 | if self._task_attempt_id: 121 | self._ctx.datastore.write_task_result(self._req.request_id, self._task_attempt_id, result) 122 | else: 123 | logger.error("Can't report result: no part was selected for loading") 124 | 125 | if logger.isEnabledFor(logging.DEBUG): 126 | logger.debug(result) 127 | return result 128 | 129 | def _update_status(self, status: TaskStatus): 130 | self._ctx.datastore.update_task_status(self._req.request_id, self._task_attempt_id, status) 131 | 132 | @abstractmethod 133 | def _do_run(self): 134 | pass 135 | 136 | @abstractmethod 137 | def _build_result(self, base_attributes: dict): 138 | """This method is still called by run() above even if _do_run() has raised an exception - having a sane 139 | result object is important even if a failed one.""" 140 | pass 141 | -------------------------------------------------------------------------------- /frocket/worker/runners/part_loader.py: -------------------------------------------------------------------------------- 1 | """ 2 | Load and cache parts (data files). 3 | """ 4 | # Copyright 2021 The Funnel Rocket Maintainers 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import logging 19 | import time 20 | import os 21 | from pathlib import Path 22 | from typing import List, Dict, Optional, Set, NamedTuple, Union 23 | from pandas import DataFrame 24 | import pyarrow.parquet 25 | from frocket.common.config import config 26 | from frocket.common.helpers.storage import storage_handler_for 27 | from frocket.common.helpers.utils import memoize 28 | from frocket.common.metrics import MetricName, LoadFromLabel, MetricsBag 29 | from frocket.common.dataset import DatasetPartId, DatasetId 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | 34 | # Just a little typed nicety over tuples which PyArrow accepts as predicate pushdown filters 35 | class FilterPredicate(NamedTuple): 36 | column: str 37 | op: str 38 | value: Union[str, int, float, bool] 39 | 40 | 41 | class CacheEntry: 42 | local_path: str 43 | size_mb: float 44 | last_used: float 45 | 46 | 47 | class PartLoader: 48 | _cache: Dict[DatasetPartId, CacheEntry] = None # DatasetPartId is a dataclass with proper hash & equality 49 | _disk_cache_max_size: float = None 50 | 51 | def __init__(self): 52 | self._setup() 53 | 54 | # Support re-initialization and overriding the configured size, for testing 55 | def _setup(self, disk_cache_max_size: float = None): 56 | if self._cache: 57 | for entry in self._cache.values(): 58 | os.remove(entry.local_path) 59 | self._cache = {} 60 | self._disk_cache_max_size = disk_cache_max_size if disk_cache_max_size is not None \ 61 | else config.float('worker.disk.cache.size.mb') 62 | 63 | @property 64 | def cache_current_size_mb(self) -> float: 65 | return sum(entry.size_mb for entry in self._cache.values()) 66 | 67 | @property 68 | def cache_len(self) -> int: 69 | return len(self._cache) 70 | 71 | def _prune_cache(self) -> None: 72 | curr_size_mb = self.cache_current_size_mb 73 | while 
curr_size_mb > 0 and curr_size_mb > self._disk_cache_max_size: 74 | logger.info(f"Current cache size is {curr_size_mb}mb, more than the configured " 75 | f"{self._disk_cache_max_size}mb") 76 | lru_key = min(self._cache, key=lambda k: self._cache[k].last_used) 77 | lru_entry = self._cache[lru_key] 78 | logger.info(f"Deleting LRU entry of dataset: {lru_key.dataset_id.name} " 79 | f"source path: {lru_key.path}, " 80 | f"last used {time.time() - lru_entry.last_used:.1f} seconds ago") 81 | try: 82 | os.remove(lru_entry.local_path) 83 | except OSError: 84 | logger.exception('Failed to delete file!') # TODO backlog consider disabling any further caching 85 | del self._cache[lru_key] 86 | curr_size_mb = self.cache_current_size_mb 87 | 88 | def load_dataframe(self, 89 | file_id: DatasetPartId, 90 | metrics: MetricsBag, 91 | needed_columns: List[str] = None, 92 | filters: List[FilterPredicate] = None, 93 | load_as_categoricals: List[str] = None) -> DataFrame: 94 | self._prune_cache() 95 | loaded_from: Optional[LoadFromLabel] = LoadFromLabel.SOURCE 96 | handler = storage_handler_for(file_id.path) 97 | is_source_remote = handler.remote 98 | 99 | local_path = None 100 | if not is_source_remote: 101 | local_path = file_id.path # No caching for local files 102 | else: 103 | if file_id in self._cache: 104 | local_path = self._cache[file_id].local_path 105 | loaded_from = LoadFromLabel.DISK_CACHE 106 | self._cache[file_id].last_used = time.time() 107 | logger.info("File is locally cached, yay") 108 | 109 | if not local_path: 110 | with metrics.measure(MetricName.TASK_DOWNLOAD_SECONDS): 111 | local_path = str(handler.get_local_path(file_id.path)) # Download to a local temp file 112 | 113 | entry = CacheEntry() 114 | entry.local_path = local_path 115 | entry.size_mb = Path(local_path).stat().st_size / 1024 ** 2 116 | entry.last_used = time.time() 117 | self._cache[file_id] = entry 118 | 119 | with metrics.measure(MetricName.TASK_LOAD_FILE_SECONDS): 120 | # Using PyArrow directly (rather than wrapped through Pandas) allows specifying column names to explicitly 121 | # load as 'dictionary' type, which then translates to categoricals in Pandas. 122 | # If the file was created with Pandas, categorical columns are loaded back as such - but we go beyond 123 | # that to detect 'potential categorical' string columns and load them as such. 124 | # Except for the memory usage saving, there is a performance gain here if the Parquet file already has a 125 | # dictionary for the column. Otherwise, PyArrow will create one - but without a performance gain. 
126 | df = pyarrow.parquet.read_table(local_path, 127 | columns=needed_columns, 128 | filters=filters, 129 | read_dictionary=load_as_categoricals).to_pandas() 130 | 131 | metrics.set_label_enum(loaded_from) 132 | return df 133 | 134 | def get_cached_candidates(self, dataset_id: DatasetId) -> Optional[Set[DatasetPartId]]: 135 | """Do we have cached parts for this DatasetId, that can be used to self-select parts?""" 136 | logger.debug(f"Looking for cached candidates matching: {dataset_id}") 137 | candidates = None 138 | if self._cache: 139 | candidates = {part_id for part_id in self._cache.keys() if part_id.dataset_id == dataset_id} 140 | 141 | logger.debug(f"Found candidates: {candidates}") 142 | return candidates if (candidates and len(candidates) > 0) else None 143 | 144 | 145 | @memoize 146 | def shared_part_loader() -> PartLoader: 147 | """This is used by default, but can be overridden in tests.""" 148 | return PartLoader() 149 | -------------------------------------------------------------------------------- /frocket/worker/runners/query_task_runner.py: -------------------------------------------------------------------------------- 1 | """ 2 | Execute a single query task. 3 | """ 4 | # Copyright 2021 The Funnel Rocket Maintainers 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | import logging 19 | import time 20 | from typing import List, cast, Optional 21 | from pandas import DataFrame 22 | from frocket.common.dataset import DatasetPartId 23 | from frocket.common.metrics import MetricName, PartSelectMethodLabel 24 | from frocket.common.tasks.base import TaskStatus, TaskAttemptId, BaseTaskRequest 25 | from frocket.common.tasks.query import PartSelectionMode, QueryTaskRequest, QueryResult, QueryTaskResult 26 | from frocket.engine.query_engine import QueryEngine 27 | from frocket.worker.runners.base_task_runner import BaseTaskRunner, TaskRunnerContext 28 | from frocket.worker.runners.part_loader import FilterPredicate 29 | 30 | logger = logging.getLogger(__name__) 31 | 32 | 33 | class QueryTaskRunner(BaseTaskRunner): 34 | def __init__(self, req: BaseTaskRequest, ctx: TaskRunnerContext): 35 | super().__init__(req, ctx) 36 | self._req = cast(QueryTaskRequest, req) # Avoid type warnings 37 | self._dataset_part_id: Optional[DatasetPartId] = None 38 | self._query_result: Optional[QueryResult] = None 39 | 40 | def _do_run(self): 41 | self._set_part_to_load() 42 | self._update_status(TaskStatus.LOADING_DATA) 43 | with self._ctx.metrics.measure(MetricName.TASK_TOTAL_LOAD_SECONDS): 44 | df = self._load(needed_columns=self._req.used_columns, 45 | load_as_categoricals=self._req.load_as_categoricals) 46 | 47 | self._update_status(TaskStatus.RUNNING_QUERY) 48 | with self._ctx.metrics.measure(MetricName.TASK_RUN_QUERY_SECONDS): 49 | engine = QueryEngine(self._req.dataset.group_id_column, self._req.dataset.timestamp_column) 50 | engine_result = engine.run(df, self._req.query) 51 | self._query_result = engine_result 52 | 53 | def _set_part_to_load(self) -> None: 54 | task_attempt_no = self._req.attempt_no 55 | if self._req.mode == PartSelectionMode.SET_BY_INVOKER: 56 | part_id = self._req.invoker_set_part 57 | actual_select_method = PartSelectMethodLabel.SET_BY_INVOKER 58 | elif self._req.mode == PartSelectionMode.SELECTED_BY_WORKER: 59 | actual_select_method, part_id = self._select_part_myself() 60 | logger.info(f"Worker selected part: method: {actual_select_method}, file ID: {part_id}, " 61 | f"task attempt no.: {task_attempt_no}") 62 | else: 63 | raise Exception(f"Don't know how to handle request mode: {self._req.mode}") 64 | 65 | if not part_id: 66 | raise Exception("No part to load") 67 | 68 | self._ctx.metrics.set_label_enum(actual_select_method) 69 | self._dataset_part_id = part_id 70 | self._task_attempt_id = TaskAttemptId(part_id.part_idx, task_attempt_no) 71 | 72 | def _select_part_myself(self): 73 | """See configuration guide for 'preflight' concept. 
In general, that's a configurable time period in self-select 74 | part mode, where 'warm' workers can select the candidates they wish without interruption.""" 75 | time_left_in_preflight = self._ctx.preflight_duration_seconds - BaseTaskRunner.time_since_invocation(self._req) 76 | candidates = self._ctx.part_loader.get_cached_candidates(self._req.dataset.id) 77 | sleep_time = 0 78 | if not candidates and time_left_in_preflight > 0: 79 | logger.info("Got no candidates but we're still in preflight" 80 | f", so sleeping for {time_left_in_preflight} seconds") 81 | sleep_time = time_left_in_preflight 82 | 83 | if sleep_time: 84 | time.sleep(time_left_in_preflight) 85 | self._ctx.metrics.set_metric(MetricName.TASK_PREFLIGHT_SLEEP_SECONDS, sleep_time) 86 | 87 | # If a worker got some candidates, we're still gonna try to grab them even if preflight time has ended 88 | selected_part = self._ctx.datastore.self_select_part(self._req.request_id, self._req.attempt_no, candidates) 89 | if not selected_part.part_id: 90 | # Not supposed to happen, unless there's a retry mechanism gone awry 91 | raise Exception("Got no part for me!") 92 | 93 | if candidates: 94 | if not selected_part.random: 95 | actual_select_method = PartSelectMethodLabel.SPECIFIC_CANDIDATE 96 | else: 97 | actual_select_method = PartSelectMethodLabel.RANDOM_CANDIDATES_TAKEN 98 | else: 99 | actual_select_method = PartSelectMethodLabel.RANDOM_NO_CANDIDATES 100 | 101 | return actual_select_method, selected_part.part_id 102 | 103 | def _load(self, needed_columns: List[str] = None, load_as_categoricals: List[str] = None) -> DataFrame: 104 | filters = self._predicate_pushdown_filters() 105 | if logger.isEnabledFor(logging.DEBUG): 106 | logger.debug(f"Filters used when loading: {filters}") 107 | logger.debug(f"Columns to explicitly load as categorical: {load_as_categoricals}") 108 | 109 | df = self._ctx.part_loader.load_dataframe(file_id=self._dataset_part_id, metrics=self._ctx.metrics, 110 | needed_columns=needed_columns, filters=filters, 111 | load_as_categoricals=load_as_categoricals) 112 | self._ctx.metrics.set_metric(MetricName.SCANNED_ROWS, len(df)) 113 | self._ctx.metrics.set_metric(MetricName.SCANNED_GROUPS, df[self._req.dataset.group_id_column].nunique()) 114 | return df 115 | 116 | def _predicate_pushdown_filters(self): 117 | """ 118 | Build PyArrow-compatible pushdown predicates to pass to the part loader. 119 | An important reminder here is that any filter applied would affect not just conditions/sequences, but also 120 | any defined aggregations - meaning it's suitable for limiting scope to the (optional) query timeframe, 121 | but should be evaluated carefully for any other optimizations. 
122 | """ 123 | filters = [] 124 | timeframe = self._req.query.get('timeframe', None) 125 | if timeframe: 126 | fromtime = timeframe.get('from', None) 127 | if fromtime is not None: 128 | filters.append(FilterPredicate(column=self._req.dataset.timestamp_column, op='>=', value=fromtime)) 129 | totime = timeframe.get('to', None) 130 | if totime is not None: 131 | filters.append(FilterPredicate(column=self._req.dataset.timestamp_column, op='<', value=totime)) 132 | 133 | return filters if len(filters) > 0 else None 134 | 135 | def _build_result(self, base_attributes): 136 | return QueryTaskResult( 137 | **base_attributes, 138 | query_result=self._query_result) 139 | -------------------------------------------------------------------------------- /frocket/worker/runners/registered_runners.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Dict, Type 16 | from frocket.common.tasks.base import BaseTaskRequest 17 | from frocket.common.tasks.registration import RegistrationTaskRequest 18 | from frocket.common.tasks.query import QueryTaskRequest 19 | from frocket.worker.runners.base_task_runner import BaseTaskRunner 20 | from frocket.worker.runners.query_task_runner import QueryTaskRunner 21 | from frocket.worker.runners.registration_task_runner import RegistrationTaskRunner 22 | 23 | REGISTERED_RUNNERS: Dict[Type[BaseTaskRequest], Type[BaseTaskRunner]] = { 24 | QueryTaskRequest: QueryTaskRunner, 25 | RegistrationTaskRequest: RegistrationTaskRunner 26 | } 27 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyarrow>=2.0.0 2 | pandas>=1.2.0 3 | boto3>=1.16.0 4 | redis>=3.5.0 5 | tabulate>=0.8.0 6 | prometheus_client>=0.9.0 7 | flask>=1.1.0 8 | jsonschema>=3.2.0 9 | dataclasses-json>=0.5.2 10 | inflection>=0.5.0 11 | parsimonious>=0.8.0 12 | gunicorn>=20.0.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import setuptools 3 | 4 | this_dir = pathlib.Path(__file__).parent 5 | requirements_file = this_dir / "requirements.txt" 6 | readme_file = this_dir / "README.md" 7 | 8 | install_requires = requirements_file.read_text().splitlines() 9 | long_description = readme_file.read_text() if readme_file.exists() else '' 10 | 11 | setuptools.setup( 12 | name="funnel-rocket", 13 | version="0.5.3", 14 | author="Elad Rosenheim, Avshalom Manevich", 15 | author_email="elad@dynamicyield.com", 16 | description="Cloud native distributed funnel queries", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | url="https://github.com/DynamicYieldProjects/funnel-rocket-oss", 20 | 
packages=setuptools.find_packages(), 21 | package_data={ 22 | "frocket": ["resources/*.*"], 23 | }, 24 | classifiers=[ 25 | "Programming Language :: Python :: 3.8", 26 | "Programming Language :: Python :: 3.9", 27 | "License :: OSI Approved :: Apache Software License", 28 | "Operating System :: OS Independent", 29 | ], 30 | python_requires='>=3.8', 31 | install_requires=install_requires 32 | ) 33 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest>=6.2.0 2 | pytest-cov>=2.11.0 3 | icdiff>=0.5.0 4 | requests>=2.25.0 -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | -------------------------------------------------------------------------------- /tests/utils/base_query_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "timeframe": { 3 | "from": 1590918400516, 4 | "to": 1618918400516 5 | }, 6 | "query": { 7 | "relation": "( $1 and $2) || $seq || (( $made_multiple_purchases ) ) ", 8 | "conditions": [ 9 | { 10 | "name": "made_multiple_purchases", 11 | "filter": [ 12 | "eventId", 13 | "==", 14 | 18765 15 | ], 16 | "target": [ 17 | "count", 18 | ">=", 19 | 0 20 | ], 21 | "includeZero": true 22 | }, 23 | { 24 | "name": "made_multiple_purchases2", 25 | "filter": [ 26 | "eventId", 27 | "==", 28 | 18766 29 | ], 30 | "includeZero": false 31 | }, 32 | { 33 | "filter": { 34 | "column": "eventId", 35 | "op": "==", 36 | "value": 18767 37 | }, 38 | "target": { 39 | "type": "sum", 40 | "column": "eventValue", 41 | "op": "<", 42 | "value": 350 43 | } 44 | }, 45 | { 46 | "filter": { 47 | "column": "eventId", 48 | "op": "==", 49 | "value": 18768 50 | }, 51 | "target": [ 52 | "sum", 53 | "eventValue", 54 | "<", 55 | 350 56 | ] 57 | }, 58 | { 59 | "filters": [ 60 | { 61 | "column": "eventType", 62 | "op": "==", 63 | "value": "purchase" 64 | }, 65 | { 66 | "column": "goalValue", 67 | "op": ">=", 68 | "value": 3 69 | } 70 | ], 71 | "target": [ 72 | "sum", 73 | "eventValue", 74 | "<", 75 | 350 76 | ], 77 | "includeZero": false 78 | }, 79 | { 80 | "name": "seq", 81 | "sequence": [ 82 | { 83 | "filter": [ 84 | "eventType", 85 | "==", 86 | "addToCart" 87 | ] 88 | }, 89 | { 90 | "filters": [ 91 | { 92 | "column": "eventType", 93 | "op": "==", 94 | "value": "purchase" 95 | }, 96 | { 97 | "column": "goalValue", 98 | "op": ">=", 99 | "value": 3 100 | } 101 | ] 102 | }, 103 | { 104 | "rowFound": false, 105 | "filter": { 106 | "column": "eventType", 107 | "op": "==", 108 | "value": "signToClub" 109 | } 110 | } 111 | ], 112 | "maxDuration": 23443 113 | } 114 | ], 115 | "aggregations": [ 116 | { 117 | "column": "device" 118 | }, 119 | { 120 | "column": "transactionId", 121 | "type": "count", 122 | "name": "purchase_count" 123 | }, 124 | { 125 | "column": "goalId" 126 | }, 127 | { 128 | "column": "goalId", 129 | "type": "sumPerValue", 130 | "otherColumn": "goalValue", 131 | "name": "hoola" 132 | } 133 | ] 134 | }, 135 | "funnel": { 136 | "sequence": [ 137 | { 138 | "filter": [ 139 | "eventType", 140 | "==", 141 | "addToCart" 142 | ] 143 | }, 144 | { 145 | "filter": { 146 | "column": "eventId", 147 | "op": "==", 148 | "value": 18765 149 | } 150 | } 151 | ], 152 | "maxDuration": 23443, 153 | "stepAggregations": [ 154 | { 155 | "column": "goalId", 156 | "type": "count", 157 | "name": "mosh" 158 | }, 159 | { 160 | "column": "eventId", 161 | "type": "groupsPerValue", 162 | "name": "mosh2" 163 | } 164 | ], 165 | "endAggregations": [ 166 | { 167 | "column": "goalId" 168 | } 169 | ] 170 | } 171 | } -------------------------------------------------------------------------------- /tests/utils/base_test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import tempfile 17 | from typing import List, Type 18 | from frocket.common.metrics import MetricName, MetricData, MetricLabelEnum 19 | 20 | SKIP_SLOW_TESTS = os.environ.get('SKIP_SLOW_TESTS', "False").lower() == 'true' 21 | SKIP_LAMBDA_TESTS = os.environ.get('SKIP_LAMBDA_TESTS', "False").lower() == 'true' 22 | # noinspection PyProtectedMember,PyUnresolvedReferences 23 | TEMP_DIR = tempfile._get_default_tempdir() 24 | 25 | 26 | # noinspection PyProtectedMember,PyUnresolvedReferences 27 | def temp_filename(suffix='', with_dir: bool = True): 28 | fname = next(tempfile._get_candidate_names()) + suffix 29 | return f"{TEMP_DIR}/{fname}" if with_dir else fname 30 | 31 | 32 | # A mixin to allow defining utility classes named "Test" without pytest trying to collect test cases in them, 33 | # which results in warnings (and without needing a pytest.ini entry). See https://stackoverflow.com/a/46199666 34 | class DisablePyTestCollectionMixin(object): 35 | __test__ = False 36 | 37 | 38 | def get_metric_value(metrics: List[MetricData], name: MetricName) -> float: 39 | assert metrics 40 | metric = next(filter(lambda metric: metric.name == name, metrics), None) 41 | assert metric is not None 42 | return metric.value 43 | 44 | 45 | def assert_metric_value(metrics: List[MetricData], name: MetricName, value: float): 46 | assert get_metric_value(metrics, name) == value 47 | 48 | 49 | def find_first_label_value(metrics: List[MetricData], label_type: Type[MetricLabelEnum]) -> str: 50 | assert metrics 51 | found_metric = next(filter(lambda metric: label_type.label_name in metric.labels, metrics), None) 52 | return found_metric.labels[label_type.label_name] 53 | 54 | 55 | def assert_label_value_exists(metrics: List[MetricData], label: MetricLabelEnum): 56 | assert find_first_label_value(metrics, label.__class__) == label.label_value 57 | -------------------------------------------------------------------------------- /tests/utils/lambda_fixture.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | from frocket.common.config import config 17 | 18 | 19 | @pytest.fixture(scope="session", autouse=True) 20 | def init_mock_lambda_settings(): 21 | config['lambda.aws.endpoint.url'] = config.get('lambda.aws.endpoint.url', 'http://localhost:9001') 22 | config['lambda.aws.region'] = config.get('lambda.aws.region', 'us-east-1') 23 | config['lambda.aws.no.signature'] = 'true' 24 | config['invoker.lambda.legacy.async'] = 'false' 25 | -------------------------------------------------------------------------------- /tests/utils/mock_s3_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import boto3 17 | from frocket.common.config import config, ConfigDict 18 | from frocket.common.helpers.utils import timestamped_uuid, memoize 19 | 20 | SKIP_S3_TESTS = os.environ.get('SKIP_S3_TESTS', "False").lower() == 'true' 21 | 22 | 23 | @memoize 24 | def _init_mock_s3_config(): 25 | if SKIP_S3_TESTS: 26 | print(f"Skipping mock S3 config") 27 | config['s3.aws.endpoint.url'] = \ 28 | os.environ.get('MOCK_S3_URL', config.get('s3.aws.endpoint.url', 'http://localhost:9000')) 29 | config['s3.aws.access.key.id'] = \ 30 | os.environ.get('MOCK_S3_USER', config.get('s3.aws.access.key.id', 'testonly')) 31 | config['s3.aws.secret.access.key'] = \ 32 | os.environ.get('MOCK_S3_SECRET', config.get('s3.aws.secret.access.key', 'testonly')) 33 | 34 | 35 | def mock_s3_env_variables(): 36 | _init_mock_s3_config() 37 | return { 38 | ConfigDict.to_env_variable(key): config.get(key) 39 | for key in ['s3.aws.endpoint.url', 's3.aws.access.key.id', 's3.aws.secret.access.key'] 40 | } 41 | 42 | 43 | def new_mock_s3_bucket(): 44 | if SKIP_S3_TESTS: 45 | return None 46 | _init_mock_s3_config() 47 | 48 | bucket_name = timestamped_uuid('testbucket-') 49 | s3 = boto3.resource('s3', **config.aws_client_settings(service='s3')) 50 | bucket = s3.Bucket(bucket_name) 51 | bucket.create() 52 | print(f"Bucket '{bucket_name}' created") 53 | return bucket 54 | -------------------------------------------------------------------------------- /tests/utils/redis_fixture.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Funnel Rocket Maintainers 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import pytest 17 | from frocket.common.config import config, ConfigDict 18 | from frocket.datastore.registered_datastores import get_datastore, get_blobstore 19 | 20 | 21 | @pytest.fixture(scope="session", autouse=True) 22 | def init_test_redis_settings(): 23 | config['redis.host'] = os.environ.get('TEST_REDIS_HOST', config['redis.host']) 24 | config['redis.port'] = os.environ.get('TEST_REDIS_PORT', config['redis.port']) 25 | config['redis.db'] = os.environ.get('TEST_REDIS_DB', config['redis.db']) 26 | print(get_datastore(), get_blobstore()) # Fail on no connection, print connection details 27 | 28 | 29 | def get_test_redis_env_variables(): 30 | return { 31 | ConfigDict.to_env_variable(key): config.get(key) 32 | for key in ['redis.host', 'redis.port', 'redis.db', 'datastore.redis.prefix'] 33 | } 34 | --------------------------------------------------------------------------------