├── .github ├── pull_request_template.md └── workflows │ ├── master_only.yml │ ├── mirror.yml │ ├── pr.yml │ ├── pr_full_access.yml │ └── release.yml ├── .gitignore ├── .prow.yaml ├── .prow ├── config.yaml └── plugins.yaml ├── .readthedocs.yml ├── .scalafmt.conf ├── CHANGELOG.md ├── Makefile ├── OWNERS ├── README.md ├── infra ├── charts │ └── feast-spark │ │ ├── .helmignore │ │ ├── Chart.yaml │ │ ├── README.md │ │ ├── README.md.gotmpl │ │ ├── charts │ │ ├── feast-jobservice │ │ │ ├── Chart.yaml │ │ │ ├── README.md │ │ │ ├── templates │ │ │ │ ├── _helpers.tpl │ │ │ │ ├── configmap.yaml │ │ │ │ ├── deployment.yaml │ │ │ │ └── service.yaml │ │ │ └── values.yaml │ │ └── prometheus-statsd-exporter │ │ │ ├── .helmignore │ │ │ ├── Chart.yaml │ │ │ ├── README.md │ │ │ ├── templates │ │ │ ├── NOTES.txt │ │ │ ├── _helpers.tpl │ │ │ ├── config.yaml │ │ │ ├── deployment.yaml │ │ │ ├── pvc.yaml │ │ │ ├── service.yaml │ │ │ └── serviceaccount.yaml │ │ │ └── values.yaml │ │ ├── requirements.lock │ │ ├── requirements.yaml │ │ └── values.yaml ├── codebuild_runner.py ├── docker │ ├── jobservice │ │ └── Dockerfile │ ├── spark │ │ ├── Dockerfile │ │ └── dev.Dockerfile │ └── tests │ │ └── Dockerfile └── scripts │ ├── aws-runner.sh │ ├── build-ingestion-py-dependencies.sh │ ├── codebuild-entrypoint.sh │ ├── codebuild_runner.py │ ├── download-maven-cache.sh │ ├── helm │ ├── k8s-jobservice.tpl.yaml │ ├── kafka-values.tpl.yaml │ └── redis-cluster-values.tpl.yaml │ ├── install-google-cloud-sdk.sh │ ├── install-helm.sh │ ├── k8s-common-functions.sh │ ├── publish-docker-image.sh │ ├── publish-java-sdk.sh │ ├── publish-python-sdk.sh │ ├── push-helm-charts.sh │ ├── run-minikube-test.sh │ ├── setup-common-functions.sh │ ├── setup-e2e-env-aws.sh │ ├── setup-e2e-env-gcp.sh │ ├── setup-e2e-env-sparkop.sh │ ├── setup-e2e-local.sh │ ├── test-core-ingestion.sh │ ├── test-docker-compose.sh │ ├── test-end-to-end-local.sh │ ├── test-end-to-end-sparkop.sh │ ├── test-golang-sdk.sh │ ├── test-integration.sh │ ├── test-java-sdk.sh │ ├── test-load.sh │ ├── test-python-sdk.sh │ ├── test-serving.sh │ ├── test_job.yaml │ ├── validate-helm-chart-versions.sh │ ├── validate-version-consistency.sh │ └── wait-for-it.sh ├── pom.xml ├── protos ├── feast │ ├── core │ │ ├── CoreService.proto │ │ ├── DataFormat.proto │ │ ├── DataSource.proto │ │ ├── Entity.proto │ │ ├── Feature.proto │ │ ├── FeatureTable.proto │ │ ├── JobService.proto │ │ └── Store.proto │ ├── serving │ │ └── ServingService.proto │ ├── storage │ │ └── Redis.proto │ ├── third_party │ │ └── grpc │ │ │ └── health │ │ │ └── v1 │ │ │ └── HealthService.proto │ └── types │ │ ├── Field.proto │ │ └── Value.proto └── feast_spark │ ├── api │ └── JobService.proto │ └── third_party │ └── grpc │ └── health │ └── v1 │ └── HealthService.proto ├── python ├── docs │ ├── Makefile │ └── source │ │ ├── conf.py │ │ ├── feast_spark.api.rst │ │ ├── feast_spark.contrib.rst │ │ ├── feast_spark.contrib.validation.rst │ │ ├── feast_spark.pyspark.launchers.aws.rst │ │ ├── feast_spark.pyspark.launchers.gcloud.rst │ │ ├── feast_spark.pyspark.launchers.k8s.rst │ │ ├── feast_spark.pyspark.launchers.rst │ │ ├── feast_spark.pyspark.launchers.standalone.rst │ │ ├── feast_spark.pyspark.rst │ │ ├── feast_spark.rst │ │ ├── feast_spark.third_party.grpc.health.rst │ │ ├── feast_spark.third_party.grpc.health.v1.rst │ │ ├── feast_spark.third_party.grpc.rst │ │ ├── feast_spark.third_party.rst │ │ ├── index.rst │ │ └── modules.rst ├── feast_spark │ ├── __init__.py │ ├── api │ │ └── __init__.py │ ├── cli.py │ ├── client.py │ 
├── constants.py │ ├── contrib │ │ ├── __init__.py │ │ └── validation │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── ge.py │ ├── job_service.py │ ├── lock_manager.py │ ├── metrics.py │ ├── pyspark │ │ ├── __init__.py │ │ ├── abc.py │ │ ├── historical_feature_retrieval_job.py │ │ ├── launcher.py │ │ └── launchers │ │ │ ├── __init__.py │ │ │ ├── aws │ │ │ ├── __init__.py │ │ │ ├── emr.py │ │ │ └── emr_utils.py │ │ │ ├── gcloud │ │ │ ├── __init__.py │ │ │ └── dataproc.py │ │ │ ├── k8s │ │ │ ├── __init__.py │ │ │ ├── k8s.py │ │ │ └── k8s_utils.py │ │ │ └── standalone │ │ │ ├── __init__.py │ │ │ └── local.py │ ├── remote_job.py │ └── third_party │ │ ├── __init__.py │ │ └── grpc │ │ ├── __init__.py │ │ └── health │ │ ├── __init__.py │ │ └── v1 │ │ └── __init__.py ├── pyproject.toml ├── requirements-ci.txt ├── setup.cfg ├── setup.py └── tests │ ├── __init__.py │ ├── data │ ├── bookings.csv │ ├── column_mapping_test_entity.csv │ ├── column_mapping_test_feature.csv │ ├── customer_driver_pairs.csv │ ├── customers.csv │ ├── single_customer.csv │ └── transactions.csv │ ├── test_historical_feature_retrieval.py │ ├── test_launcher_abc.py │ ├── test_lock_manager.py │ └── test_streaming_job_scheduling.py ├── spark └── ingestion │ ├── pom.xml │ └── src │ ├── main │ ├── resources │ │ └── log4j.properties │ └── scala │ │ ├── feast │ │ └── ingestion │ │ │ ├── BasePipeline.scala │ │ │ ├── BatchPipeline.scala │ │ │ ├── IngestionJob.scala │ │ │ ├── IngestionJobConfig.scala │ │ │ ├── StreamingPipeline.scala │ │ │ ├── metrics │ │ │ ├── IngestionPipelineMetrics.scala │ │ │ ├── StatsdReporterWithTags.scala │ │ │ └── StreamingMetrics.scala │ │ │ ├── registry │ │ │ └── proto │ │ │ │ ├── LocalProtoRegistry.scala │ │ │ │ ├── ProtoRegistry.scala │ │ │ │ ├── ProtoRegistryFactory.scala │ │ │ │ └── StencilProtoRegistry.scala │ │ │ ├── sources │ │ │ ├── bq │ │ │ │ └── BigQueryReader.scala │ │ │ └── file │ │ │ │ └── FileReader.scala │ │ │ ├── stores │ │ │ ├── bigtable │ │ │ │ ├── BigTableSinkRelation.scala │ │ │ │ ├── DefaultSource.scala │ │ │ │ └── SparkBigtableConfig.scala │ │ │ ├── cassandra │ │ │ │ ├── CassandraSinkRelation.scala │ │ │ │ ├── DefaultSource.scala │ │ │ │ └── SparkCassandraConfig.scala │ │ │ ├── redis │ │ │ │ ├── ClusterPipelineProvider.scala │ │ │ │ ├── DefaultSource.scala │ │ │ │ ├── HashTypePersistence.scala │ │ │ │ ├── Persistence.scala │ │ │ │ ├── PipelineProvider.scala │ │ │ │ ├── PipelineProviderFactory.scala │ │ │ │ ├── RedisEndpoint.scala │ │ │ │ ├── RedisSinkRelation.scala │ │ │ │ ├── SingleNodePipelineProvider.scala │ │ │ │ └── SparkRedisConfig.scala │ │ │ └── serialization │ │ │ │ ├── AvroSerializer.scala │ │ │ │ └── Serializer.scala │ │ │ ├── utils │ │ │ ├── JsonUtils.scala │ │ │ ├── ProtoReflection.scala │ │ │ ├── StringUtils.scala │ │ │ ├── TypeConversion.scala │ │ │ └── testing │ │ │ │ └── MemoryStreamingSource.scala │ │ │ └── validation │ │ │ ├── Expectation.scala │ │ │ ├── RowValidator.scala │ │ │ └── TypeCheck.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ ├── api │ │ └── python │ │ │ └── DynamicPythonFunction.scala │ │ └── metrics │ │ ├── AtomicGauge.scala │ │ ├── sink │ │ └── StatsdSinkWithTags.scala │ │ └── source │ │ ├── BaseMetricSource.scala │ │ ├── BigTableSinkMetricSource.scala │ │ ├── IngestionPipelineMetricSource.scala │ │ ├── RedisSinkMetricSource.scala │ │ └── StreamingMetricSource.scala │ └── test │ ├── proto │ └── com │ │ └── example │ │ └── source.proto │ ├── resources │ ├── python │ │ ├── setup.sh │ │ └── udf.py │ └── stencil │ │ └── __files │ │ └── source.desc │ └── 
scala │ ├── com │ └── example │ │ └── protos │ │ ├── AllTypesMessage.java │ │ ├── AllTypesMessageOrBuilder.java │ │ ├── InnerMessage.java │ │ ├── InnerMessageOrBuilder.java │ │ ├── Source.java │ │ ├── TestMessage.java │ │ ├── TestMessageOrBuilder.java │ │ ├── VehicleType.java │ │ └── VehicleTypeOrBuilder.java │ └── feast │ └── ingestion │ ├── BatchPipelineIT.scala │ ├── BigTableIngestionSpec.scala │ ├── CassandraIngestionSpec.scala │ ├── PandasUDF.scala │ ├── RowValidatorTest.scala │ ├── SparkSpec.scala │ ├── StreamingPipelineIT.scala │ ├── UnitSpec.scala │ ├── helpers │ ├── DataHelper.scala │ └── RedisStorageHelper.scala │ ├── metrics │ ├── StatsDStub.scala │ └── StatsReporterSpec.scala │ └── registry │ └── StencilSpec.scala └── tests ├── README.md ├── __init__.py ├── e2e ├── __init__.py ├── conftest.py ├── fixtures │ ├── __init__.py │ ├── base.py │ ├── client.py │ ├── data.py │ ├── external_services.py │ ├── feast_services.py │ ├── services.py │ └── statsd_stub.py ├── test_historical_features.py ├── test_job_scheduling.py ├── test_online_features.py ├── test_register.py ├── test_validation.py └── utils │ ├── __init__.py │ ├── common.py │ └── kafka.py ├── requirements.txt └── setup.cfg /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 10 | 11 | **What this PR does / why we need it**: 12 | 13 | **Which issue(s) this PR fixes**: 14 | 18 | Fixes # 19 | 20 | **Does this PR introduce a user-facing change?**: 21 | 29 | ```release-note 30 | 31 | ``` 32 | -------------------------------------------------------------------------------- /.github/workflows/mirror.yml: -------------------------------------------------------------------------------- 1 | name: mirror 2 | 3 | on: 4 | push: 5 | branches: master 6 | tags: 7 | - 'v*.*.*' 8 | 9 | jobs: 10 | mirror: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | with: 15 | fetch-depth: 0 16 | - uses: webfactory/ssh-agent@v0.4.1 17 | with: 18 | ssh-private-key: ${{ secrets.MIRROR_SSH_KEY }} 19 | - name: Mirror all origin branches and tags to internal repo 20 | run: | 21 | export GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" 22 | git remote add internal ${{ secrets.INTERNAL_REPO }} 23 | git push internal --all -f 24 | git push internal --tags -f -------------------------------------------------------------------------------- /.github/workflows/pr.yml: -------------------------------------------------------------------------------- 1 | name: pull request 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | lint-java: 7 | container: gcr.io/kf-feast/feast-ci:latest 8 | runs-on: [ubuntu-latest] 9 | steps: 10 | - uses: actions/checkout@v2 11 | - name: Lint java 12 | run: make lint-java 13 | 14 | test-java: 15 | runs-on: ubuntu-latest 16 | needs: lint-java 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up JDK 11 20 | uses: actions/setup-java@v1 21 | with: 22 | java-version: '11' 23 | java-package: jdk 24 | architecture: x64 25 | - uses: actions/setup-python@v2 26 | with: 27 | python-version: '3.8' 28 | architecture: 'x64' 29 | - uses: actions/cache@v2 30 | with: 31 | path: ~/.m2/repository 32 | key: ${{ runner.os }}-ut-maven-${{ hashFiles('**/pom.xml') }} 33 | restore-keys: | 34 | ${{ runner.os }}-ut-maven- 35 | - name: Test java 36 | run: make test-java 37 | 38 | lint-python: 39 | container: python:3.8 40 | runs-on: [ubuntu-latest] 41 | steps: 42 | - uses: actions/checkout@v2 43 | - name: Install dependencies 44 | run: make 
install-python-ci-dependencies 45 | - name: Lint python 46 | run: make lint-python 47 | 48 | unit-test-python: 49 | runs-on: ubuntu-latest 50 | needs: lint-python 51 | env: 52 | PYSPARK_PYTHON: python3.8 53 | steps: 54 | - uses: actions/checkout@v2 55 | - name: Set up JDK 11 56 | uses: actions/setup-java@v1 57 | with: 58 | java-version: '11' 59 | java-package: jdk 60 | architecture: x64 61 | - uses: actions/setup-python@v2 62 | with: 63 | python-version: '3.8' 64 | architecture: 'x64' 65 | - name: Install python 66 | run: make install-python 67 | - name: Test python 68 | run: make test-python 69 | -------------------------------------------------------------------------------- /.github/workflows/pr_full_access.yml: -------------------------------------------------------------------------------- 1 | # contains additional jobs to run for 'complete' workflow that involve secrets 2 | name: pull request (full access) 3 | 4 | on: 5 | # 'pull_request_target' required to make secrets available for jobs 6 | pull_request_target: 7 | types: 8 | - opened 9 | - synchronize 10 | - labeled 11 | 12 | jobs: 13 | # all jobs should have a if check for 'ok-to-test' label in order to be gated by the label. 14 | # otherwise secrets might be unintentionally exposed to malicious forks. 15 | build-push-docker-images-for-e2e-tests: 16 | if: contains(github.event.pull_request.labels.*.name, 'ok-to-test') || contains(github.event.pull_request.labels.*.name, 'approved') 17 | runs-on: [ubuntu-latest] 18 | strategy: 19 | matrix: 20 | component: [jobservice, spark] 21 | env: 22 | GITHUB_PR_SHA: ${{ github.event.pull_request.head.sha }} 23 | REGISTRY: gcr.io/kf-feast 24 | MAVEN_CACHE: gs://feast-templocation-kf-feast/.m2.2020-08-19.tar 25 | steps: 26 | - uses: actions/checkout@v2 27 | with: 28 | # pull_request_target runs the workflow in the context of the base repo 29 | # as such actions/checkout needs to be explicit configured to retrieve 30 | # code from the PR. 
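        # Note: merge_commit_sha points at GitHub's test merge of the PR into the base
        # branch, so the image is built from the post-merge state rather than the raw PR head.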
31 | ref: ${{ github.event.pull_request.merge_commit_sha }} 32 | - uses: docker/setup-qemu-action@v1 33 | - name: Set up Docker Buildx 34 | uses: docker/setup-buildx-action@v1 35 | - uses: google-github-actions/setup-gcloud@v0 36 | with: 37 | version: '290.0.1' 38 | export_default_credentials: true 39 | project_id: ${{ secrets.GCP_PROJECT_ID }} 40 | service_account_key: ${{ secrets.GCP_SA_KEY }} 41 | - run: gcloud auth configure-docker --quiet 42 | - name: Get m2 cache 43 | run: | 44 | infra/scripts/download-maven-cache.sh \ 45 | --archive-uri ${MAVEN_CACHE} \ 46 | --output-dir $HOME 47 | - name: Build image 48 | run: make build-${{ matrix.component }}-docker REGISTRY=${REGISTRY} VERSION=${GITHUB_PR_SHA} 49 | - name: Push image 50 | run: | 51 | docker push ${REGISTRY}/feast-${{ matrix.component }}:${GITHUB_PR_SHA} 52 | -------------------------------------------------------------------------------- /.prow.yaml: -------------------------------------------------------------------------------- 1 | presubmits: 2 | - name: test-end-to-end-sparkop 3 | decorate: true 4 | always_run: true 5 | max_concurrency: 1 6 | spec: 7 | metadata: 8 | namespace: sparkop-e2e 9 | containers: 10 | - image: gcr.io/kf-feast/feast-ci:latest 11 | command: [ "infra/scripts/test-end-to-end-sparkop.sh"] 12 | resources: 13 | requests: 14 | cpu: "2" 15 | memory: "2Gi" 16 | env: 17 | - name: GOOGLE_APPLICATION_CREDENTIALS 18 | value: /etc/gcloud/service-account.json 19 | - name: DOCKER_REPOSITORY 20 | value: gcr.io/kf-feast 21 | volumeMounts: 22 | - mountPath: /etc/gcloud/service-account.json 23 | name: service-account 24 | readOnly: true 25 | subPath: service-account.json 26 | volumes: 27 | - name: service-account 28 | secret: 29 | secretName: feast-service-account 30 | 31 | postsubmits: 32 | - name: test-end-to-end-sparkop 33 | decorate: true 34 | always_run: true 35 | max_concurrency: 1 36 | branches: 37 | - ^master$ 38 | spec: 39 | metadata: 40 | namespace: sparkop-e2e 41 | containers: 42 | - image: gcr.io/kf-feast/feast-ci:latest 43 | command: [ "infra/scripts/test-end-to-end-sparkop.sh"] 44 | resources: 45 | requests: 46 | cpu: "2" 47 | memory: "2048Mi" 48 | env: 49 | - name: GOOGLE_APPLICATION_CREDENTIALS 50 | value: /etc/gcloud/service-account.json 51 | - name: DOCKER_REPOSITORY 52 | value: gcr.io/kf-feast 53 | volumeMounts: 54 | - mountPath: /etc/gcloud/service-account.json 55 | name: service-account 56 | readOnly: true 57 | subPath: service-account.json 58 | volumes: 59 | - name: service-account 60 | secret: 61 | secretName: feast-service-account 62 | -------------------------------------------------------------------------------- /.prow/config.yaml: -------------------------------------------------------------------------------- 1 | prowjob_namespace: prow 2 | pod_namespace: test-pods 3 | 4 | in_repo_config: 5 | enabled: 6 | "*": true 7 | allowed_clusters: 8 | "*": ["default"] 9 | 10 | plank: 11 | job_url_prefix_config: 12 | "*": https://prow.feast.dev/view/gcs 13 | pod_pending_timeout: 60m 14 | report_templates: 15 | '*': >- 16 | [Full PR test history](https://prow.feast.dev/pr-history?org={{.Spec.Refs.Org}}&repo={{.Spec.Refs.Repo}}&pr={{with index .Spec.Refs.Pulls 0}}{{.Number}}{{end}}). 17 | [Your PR dashboard](https://prow.feast.dev/pr?query=is:pr+state:open+author:{{with 18 | index .Spec.Refs.Pulls 0}}{{.Author}}{{end}}). 
19 | default_decoration_configs: 20 | "*": 21 | timeout: 1h 22 | grace_period: 15s 23 | gcs_configuration: 24 | bucket: gs://feast-prow-artifacts 25 | path_strategy: explicit 26 | gcs_credentials_secret: gcs-credentials 27 | utility_images: 28 | clonerefs: gcr.io/k8s-prow/clonerefs:v20201112-00537d1bb4 29 | entrypoint: gcr.io/k8s-prow/entrypoint:v20201112-00537d1bb4 30 | initupload: gcr.io/k8s-prow/initupload:v20201112-00537d1bb4 31 | sidecar: gcr.io/k8s-prow/sidecar:v20201112-00537d1bb4 32 | 33 | deck: 34 | tide_update_period: 1s 35 | spyglass: 36 | size_limit: 10e+6 # 10MB 37 | lenses: 38 | - lens: 39 | name: metadata 40 | required_files: 41 | - started.json|finished.json 42 | - lens: 43 | name: buildlog 44 | required_files: 45 | - build-log.txt 46 | - lens: 47 | name: junit 48 | required_files: 49 | - artifacts/.*\.xml 50 | 51 | tide: 52 | queries: 53 | - repos: 54 | - feast-dev/feast 55 | - feast-dev/feast-spark 56 | labels: 57 | - lgtm 58 | - approved 59 | missingLabels: 60 | - do-not-merge 61 | - do-not-merge/hold 62 | - do-not-merge/invalid-owners-file 63 | - do-not-merge/work-in-progress 64 | - needs-rebase 65 | - needs-kind 66 | merge_method: 67 | feast-dev/feast: squash 68 | feast-dev/feast-spark: squash 69 | blocker_label: merge-blocker 70 | squash_label: tide/squash 71 | 72 | # presubmits and postsubmits configure ProwJobs: 73 | # https://github.com/kubernetes/test-infra/blob/6571843b1aa7bd6cf577a7a8b9e9971241f424d5/prow/jobs.md 74 | -------------------------------------------------------------------------------- /.prow/plugins.yaml: -------------------------------------------------------------------------------- 1 | plugins: 2 | feast-dev/feast-spark: 3 | - approve 4 | - assign 5 | - help 6 | - hold 7 | - label 8 | - lgtm 9 | - lifecycle 10 | - size 11 | - verify-owners 12 | - wip 13 | - trigger 14 | - config-updater 15 | - require-matching-label 16 | - release-note 17 | 18 | config_updater: 19 | maps: 20 | .prow/config.yaml: 21 | name: config 22 | 23 | external_plugins: 24 | feast-dev/feast-spark: 25 | - name: needs-rebase 26 | events: 27 | - pull_request 28 | 29 | require_matching_label: 30 | - missing_label: needs-kind 31 | org: feast-dev 32 | repo: feast 33 | prs: true 34 | regexp: ^kind/ 35 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | configuration: python/docs/source/conf.py 5 | 6 | formats: 7 | - pdf 8 | 9 | python: 10 | version: 3.7 11 | install: 12 | - requirements: python/requirements-ci.txt 13 | - path: python/ 14 | method: setuptools -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | align.preset = more 2 | maxColumn = 100 -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | MVN := mvn ${MAVEN_EXTRA_OPTS} 2 | ROOT_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 3 | 4 | PROTO_TYPE_SUBDIRS = api 5 | PROTO_SERVICE_SUBDIRS = api 6 | 7 | # Make sure env vars are available to submakes 8 | export 9 | 10 | # Java 11 | 12 | format-java: 13 | cd spark/ingestion && ${MVN} spotless:apply 14 | 15 | lint-java: 16 | cd spark/ingestion && ${MVN} --no-transfer-progress spotless:check 17 | 18 | test-java: 19 | ${MVN} --no-transfer-progress 
clean verify 20 | 21 | # Python 22 | 23 | format-python: 24 | # Sort 25 | cd ${ROOT_DIR}/python ; isort feast_spark/ 26 | #cd ${ROOT_DIR}/tests/e2e; isort . 27 | 28 | # Format 29 | cd ${ROOT_DIR}/python; black --target-version py37 feast_spark 30 | #cd ${ROOT_DIR}/tests/e2e; black --target-version py37 . 31 | 32 | install-python-ci-dependencies: 33 | pip install -U --no-cache-dir -r python/requirements-ci.txt 34 | 35 | # Supports feast-dev repo master branch 36 | install-python: install-python-ci-dependencies 37 | pip install -e python 38 | 39 | lint-python: 40 | cd ${ROOT_DIR}/python ; mypy feast_spark/ tests/ 41 | cd ${ROOT_DIR}/python ; isort feast_spark/ tests/ --check-only 42 | cd ${ROOT_DIR}/python ; flake8 feast_spark/ tests/ 43 | cd ${ROOT_DIR}/python ; black --check feast_spark tests 44 | cd ${ROOT_DIR}/tests; mypy e2e 45 | cd ${ROOT_DIR}/tests; isort e2e --check-only 46 | cd ${ROOT_DIR}/tests; flake8 e2e 47 | cd ${ROOT_DIR}/tests; black --check e2e 48 | 49 | test-python: 50 | pytest --verbose --color=yes python/tests 51 | 52 | build-local-test-docker: 53 | docker build -t feast:local -f infra/docker/tests/Dockerfile . 54 | 55 | build-ingestion-jar-no-tests: 56 | cd spark/ingestion && ${MVN} --no-transfer-progress -Dmaven.javadoc.skip=true -Dgpg.skip -DskipUTs=true -DskipITs=true -Drevision=${REVISION} clean package 57 | 58 | build-jobservice-docker: 59 | docker build -t $(REGISTRY)/feast-jobservice:$(VERSION) -f infra/docker/jobservice/Dockerfile . 60 | 61 | push-jobservice-docker: 62 | docker push $(REGISTRY)/feast-jobservice:$(VERSION) 63 | 64 | build-spark-docker: 65 | docker build -t $(REGISTRY)/feast-spark:$(VERSION) --build-arg VERSION=$(VERSION) -f infra/docker/spark/Dockerfile . 66 | 67 | build-spark-docker-dev: 68 | docker build -t $(REGISTRY)/feast-spark:$(VERSION) --build-arg VERSION=$(VERSION) -f infra/docker/spark/dev.Dockerfile . 
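# Example invocations for the docker image targets above; the registry and version
# values are placeholders for illustration, not project defaults:
#   make build-spark-docker REGISTRY=gcr.io/my-registry VERSION=dev
#   make push-spark-docker REGISTRY=gcr.io/my-registry VERSION=dev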
69 | 
70 | push-spark-docker:
71 | 	docker push $(REGISTRY)/feast-spark:$(VERSION)
72 | 
73 | install-ci-dependencies: install-python-ci-dependencies
74 | 
--------------------------------------------------------------------------------
/OWNERS:
--------------------------------------------------------------------------------
1 | approvers:
2 | - khorshuheng
3 | - pyalex
4 | - woop
5 | reviewers:
6 | - khorshuheng
7 | - pyalex
8 | - woop
9 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Feast Spark
2 | 
3 | Contains
4 | * Spark ingestion jobs for [Feast](https://github.com/feast-dev/feast) versions 0.9 and below
5 | * Feast Job Service
6 | * Feast Python SDK Spark extensions
7 | 
8 | Usage:
9 | 
10 | ```python
11 | 
12 | import feast_spark
13 | import feast
14 | from feast import Feature, ValueType
15 | from feast.data_format import ParquetFormat, ProtoFormat
16 | 
17 | client = feast.Client()
18 | 
19 | client.set_project("project1")
20 | entity = feast.Entity(
21 |     name="driver_car_id",
22 |     description="Car driver id",
23 |     value_type=ValueType.STRING,
24 |     labels={"team": "matchmaking"},
25 | )
26 | 
27 | # Create Feature Tables using Feast SDK
28 | batch_source = feast.FileSource(
29 |     file_format=ParquetFormat(),
30 |     file_url="file://feast/*",
31 |     event_timestamp_column="ts_col",
32 |     created_timestamp_column="timestamp",
33 |     date_partition_column="date_partition_col",
34 | )
35 | 
36 | stream_source = feast.KafkaSource(
37 |     bootstrap_servers="localhost:9094",
38 |     message_format=ProtoFormat("class.path"),
39 |     topic="test_topic",
40 |     event_timestamp_column="ts_col",
41 | )
42 | 
43 | ft = feast.FeatureTable(
44 |     name="my-feature-table-1",
45 |     features=[
46 |         Feature(name="fs1-my-feature-1", dtype=ValueType.INT64),
47 |         Feature(name="fs1-my-feature-2", dtype=ValueType.STRING),
48 |         Feature(name="fs1-my-feature-3", dtype=ValueType.STRING_LIST),
49 |         Feature(name="fs1-my-feature-4", dtype=ValueType.BYTES_LIST),
50 |     ],
51 |     entities=["driver_car_id"],
52 |     labels={"team": "matchmaking"},
53 |     batch_source=batch_source,
54 |     stream_source=stream_source,
55 | )
56 | 
57 | # Register objects in Feast
58 | client.apply(entity, ft)
59 | 
60 | # Start a Spark streaming ingestion job that reads from Kafka and writes to the online store
61 | feast_spark.Client(client).start_stream_to_online_ingestion(ft)
62 | ```
63 | 
--------------------------------------------------------------------------------
/infra/charts/feast-spark/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | .vscode/ 23 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | description: Feast Extension for running Ingestion on Spark 3 | name: feast-spark 4 | version: 0.2.29 5 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/README.md: -------------------------------------------------------------------------------- 1 | # feast-spark 2 | 3 | Feast Extension for running Ingestion on Spark 0.2.24 4 | 5 | ## Installation 6 | 7 | https://docs.feast.dev/v/master/getting-started/deploying-feast/kubernetes 8 | 9 | ## Requirements 10 | 11 | | Repository | Name | Version | 12 | |------------|------|---------| 13 | | | feast-jobservice | 0.2.24 | 14 | | | prometheus-statsd-exporter | 0.1.2 | 15 | 16 | ## Values 17 | 18 | | Key | Type | Default | Description | 19 | |-----|------|---------|-------------| 20 | | feast-jobservice.enabled | bool | `true` | Flag to install Feast Job Service | 21 | 22 | ### Documentation development 23 | 24 | This `README.md` is generated using [helm-docs](https://github.com/norwoodj/helm-docs/). 25 | Please run `helm-docs` to regenerate the `README.md` every time `README.md.gotmpl` 26 | or `values.yaml` are updated. 27 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/README.md.gotmpl: -------------------------------------------------------------------------------- 1 | {{ template "chart.header" . }} 2 | 3 | {{ template "chart.description" . }} {{ template "chart.version" . }} 4 | 5 | ## Installation 6 | 7 | https://docs.feast.dev/v/master/getting-started/deploying-feast/kubernetes 8 | 9 | {{ template "chart.requirementsSection" . }} 10 | 11 | {{ template "chart.valuesSection" . }} 12 | 13 | 14 | ### Documentation development 15 | 16 | This `README.md` is generated using [helm-docs](https://github.com/norwoodj/helm-docs/). 17 | Please run `helm-docs` to regenerate the `README.md` every time `README.md.gotmpl` 18 | or `values.yaml` are updated. 19 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/feast-jobservice/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | description: Feast Job Service manage ingestion jobs. 3 | name: feast-jobservice 4 | version: 0.2.24 5 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/feast-jobservice/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "feast-jobservice.name" -}} 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create a default fully qualified app name. 11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 12 | If release name contains chart name it will be used as a full name. 
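Example (hypothetical values): release "feast" with chart "feast-jobservice" renders
"feast-feast-jobservice", while a release already named "feast-jobservice-prod" contains
the chart name and is used as-is (truncated at 63 characters).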
13 | */}} 14 | {{- define "feast-jobservice.fullname" -}} 15 | {{- if .Values.fullnameOverride -}} 16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 17 | {{- else -}} 18 | {{- $name := default .Chart.Name .Values.nameOverride -}} 19 | {{- if contains $name .Release.Name -}} 20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 21 | {{- else -}} 22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 23 | {{- end -}} 24 | {{- end -}} 25 | {{- end -}} 26 | 27 | {{/* 28 | Create chart name and version as used by the chart label. 29 | */}} 30 | {{- define "feast-jobservice.chart" -}} 31 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 32 | {{- end -}} 33 | 34 | {{/* 35 | Common labels 36 | */}} 37 | {{- define "feast-jobservice.labels" -}} 38 | app.kubernetes.io/name: {{ include "feast-jobservice.name" . }} 39 | helm.sh/chart: {{ include "feast-jobservice.chart" . }} 40 | app.kubernetes.io/instance: {{ .Release.Name }} 41 | {{- if .Chart.AppVersion }} 42 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 43 | {{- end }} 44 | app.kubernetes.io/managed-by: {{ .Release.Service }} 45 | {{- end -}} 46 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/feast-jobservice/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.sparkOperator.enabled }} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ template "feast-jobservice.fullname" . }}-spark-template 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | app: {{ template "feast-jobservice.name" . }} 9 | component: jobservice 10 | chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }} 11 | release: {{ .Release.Name }} 12 | heritage: {{ .Release.Service }} 13 | data: 14 | jobTemplate.yaml: | 15 | {{- toYaml .Values.sparkOperator.jobTemplate | nindent 4 }} 16 | batchJobTemplate.yaml: | 17 | {{- toYaml .Values.sparkOperator.batchJobTemplate | nindent 4 }} 18 | streamJobTemplate.yaml: | 19 | {{- toYaml .Values.sparkOperator.streamJobTemplate | nindent 4 }} 20 | historicalJobTemplate.yaml: | 21 | {{- toYaml .Values.sparkOperator.historicalJobTemplate | nindent 4 }} 22 | {{- end }} -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/feast-jobservice/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ template "feast-jobservice.fullname" . }} 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | app: {{ template "feast-jobservice.name" . }} 8 | chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }} 9 | release: {{ .Release.Name }} 10 | heritage: {{ .Release.Service }} 11 | {{- with .Values.service.annotations }} 12 | annotations: 13 | {{ toYaml . 
| nindent 4 }} 14 | {{- end }} 15 | spec: 16 | type: {{ .Values.service.type }} 17 | {{- if .Values.service.loadBalancerIP }} 18 | loadBalancerIP: {{ .Values.service.loadBalancerIP }} 19 | {{- end }} 20 | {{- if .Values.service.loadBalancerSourceRanges }} 21 | loadBalancerSourceRanges: 22 | {{ toYaml .Values.service.loadBalancerSourceRanges | nindent 2 }} 23 | {{- end }} 24 | ports: 25 | - name: http 26 | port: {{ .Values.service.http.port }} 27 | targetPort: {{ .Values.service.http.targetPort }} 28 | {{- if .Values.service.http.nodePort }} 29 | nodePort: {{ .Values.service.http.nodePort }} 30 | {{- end }} 31 | - name: grpc 32 | port: {{ .Values.service.grpc.port }} 33 | targetPort: {{ .Values.service.grpc.targetPort }} 34 | {{- if .Values.service.grpc.nodePort }} 35 | nodePort: {{ .Values.service.grpc.nodePort }} 36 | {{- end }} 37 | selector: 38 | app: {{ template "feast-jobservice.name" . }} 39 | component: jobservice 40 | release: {{ .Release.Name }} 41 | 42 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: 0.8.0 3 | description: A Helm chart for prometheus statsd-exporter Scrape metrics stored statsd 4 | home: https://github.com/prometheus/statsd_exporter 5 | keywords: 6 | - prometheus 7 | - statsd 8 | maintainers: 9 | - name: enflo 10 | email: toniflorithomar@gmail.com 11 | name: prometheus-statsd-exporter 12 | version: 0.1.2 -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/README.md: -------------------------------------------------------------------------------- 1 | # prometheus-statsd-exporter 2 | 3 | ![Version: 0.1.2](https://img.shields.io/badge/Version-0.1.2-informational?style=flat-square) ![AppVersion: 0.8.0](https://img.shields.io/badge/AppVersion-0.8.0-informational?style=flat-square) 4 | 5 | A Helm chart for prometheus statsd-exporter Scrape metrics stored statsd 6 | 7 | **Homepage:** 8 | 9 | ## Maintainers 10 | 11 | | Name | Email | Url | 12 | | ---- | ------ | --- | 13 | | enflo | toniflorithomar@gmail.com | | 14 | 15 | ## Values 16 | 17 | | Key | Type | Default | Description | 18 | |-----|------|---------|-------------| 19 | | image.pullPolicy | string | `"IfNotPresent"` | | 20 | | image.repository | string | `"prom/statsd-exporter"` | | 21 | | image.tag | string | `"v0.12.1"` | | 22 | | persistentVolume.accessModes[0] | string | `"ReadWriteOnce"` | | 23 | | persistentVolume.annotations | object | `{}` | | 24 | | persistentVolume.claimName | string | `"prometheus-statsd-exporter"` | | 25 | | persistentVolume.enabled | bool | `true` | | 26 | | persistentVolume.existingClaim | string | `""` | | 27 | | 
persistentVolume.mountPath | string | `"/data"` | | 28 | | persistentVolume.name | string | `"storage-volume"` | | 29 | | persistentVolume.size | string | `"20Gi"` | | 30 | | persistentVolume.storageClass | object | `{}` | | 31 | | persistentVolume.subPath | string | `""` | | 32 | | service.annotations | object | `{}` | | 33 | | service.clusterIP | string | `""` | | 34 | | service.externalIPs | list | `[]` | | 35 | | service.labels | object | `{}` | | 36 | | service.loadBalancerIP | string | `""` | | 37 | | service.loadBalancerSourceRanges | list | `[]` | | 38 | | service.metricsPort | int | `9102` | | 39 | | service.servicePort | int | `80` | | 40 | | service.statsdPort | int | `9125` | | 41 | | service.type | string | `"ClusterIP"` | | 42 | | serviceAccount.componentName | string | `"prometheus-statsd-exporter"` | | 43 | | serviceAccount.enable | bool | `false` | | 44 | | statsdexporter.affinity | object | `{}` | | 45 | | statsdexporter.extraArgs | object | `{}` | | 46 | | statsdexporter.ingress.enabled | bool | `false` | | 47 | | statsdexporter.nodeSelector | object | `{}` | | 48 | | statsdexporter.podAnnotations."prometheus.io/path" | string | `"/metrics"` | | 49 | | statsdexporter.podAnnotations."prometheus.io/port" | string | `"9102"` | | 50 | | statsdexporter.podAnnotations."prometheus.io/scrape" | string | `"true"` | | 51 | | statsdexporter.replicaCount | int | `1` | | 52 | | statsdexporter.resources | object | `{}` | | 53 | | statsdexporter.tolerations | object | `{}` | | 54 | 55 | ---------------------------------------------- 56 | Autogenerated from chart metadata using [helm-docs v1.5.0](https://github.com/norwoodj/helm-docs/releases/v1.5.0) 57 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | 2 | To verify that prometheus-statsd-exporter has started, run: 3 | 4 | {{- if contains "NodePort" .Values.service.type }} 5 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ template "prometheus-statsd-exporter.fullname" . }}) 6 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") 7 | echo http://$NODE_IP:$NODE_PORT 8 | {{- else if contains "LoadBalancer" .Values.service.type }} 9 | NOTE: It may take a few minutes for the LoadBalancer IP to be available. 10 | You can watch the status of by running 'kubectl get svc --namespace {{ .Release.Namespace }} -w {{ template "prometheus-statsd-exporter.fullname" . }}' 11 | 12 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "prometheus-statsd-exporter.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}') 13 | echo http://$SERVICE_IP:{{ .Values.service.servicePort }} 14 | {{- else if contains "ClusterIP" .Values.service.type }} 15 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app={{ template "prometheus-statsd-exporter.name" . 
}},component={{ .Chart.Name }}" -o jsonpath="{.items[0].metadata.name}") 16 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 9090 17 | {{- end }} -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "prometheus-statsd-exporter.name" -}} 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create a default fully qualified app name. 11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 12 | If release name contains chart name it will be used as a full name. 13 | */}} 14 | {{- define "prometheus-statsd-exporter.fullname" -}} 15 | {{- if .Values.fullnameOverride -}} 16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 17 | {{- else -}} 18 | {{- $name := default .Chart.Name .Values.nameOverride -}} 19 | {{- if contains $name .Release.Name -}} 20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 21 | {{- else -}} 22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 23 | {{- end -}} 24 | {{- end -}} 25 | {{- end -}} 26 | 27 | {{/* 28 | Create chart name and version as used by the chart label. 29 | */}} 30 | {{- define "prometheus-statsd-exporter.chart" -}} 31 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 32 | {{- end -}} 33 | 34 | 35 | {{/* 36 | Create the name of the service account to use 37 | */}} 38 | {{- define "prometheus-statsd-exporter.serviceAccountName" -}} 39 | {{- if .Values.serviceAccount.enable -}} 40 | {{ default (include "prometheus-statsd-expoter.fullname" .) .Values.serviceAccount.name }} 41 | {{- else -}} 42 | {{ default "default" .Values.serviceAccount.name }} 43 | {{- end -}} 44 | {{- end -}} -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/templates/config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ template "prometheus-statsd-exporter.fullname" . }}-config 5 | labels: 6 | app: {{ template "prometheus-statsd-exporter.name" . }} 7 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 8 | release: {{ .Release.Name }} 9 | heritage: {{ .Release.Service }} 10 | data: 11 | statsd_mappings.yaml: | 12 | # 13 | # defaults: 14 | # ttl: "45s" -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ template "prometheus-statsd-exporter.fullname" . }} 5 | labels: 6 | app: {{ template "prometheus-statsd-exporter.name" . }} 7 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 8 | release: {{ .Release.Name }} 9 | heritage: {{ .Release.Service }} 10 | spec: 11 | replicas: {{ .Values.statsdexporter.replicaCount }} 12 | selector: 13 | matchLabels: 14 | app: {{ template "prometheus-statsd-exporter.name" . 
}} 15 | release: {{ .Release.Name }} 16 | template: 17 | metadata: 18 | annotations: 19 | {{ toYaml .Values.statsdexporter.podAnnotations | indent 8 }} 20 | labels: 21 | app: {{ template "prometheus-statsd-exporter.name" . }} 22 | release: {{ .Release.Name }} 23 | spec: 24 | serviceAccountName: {{ template "prometheus-statsd-exporter.serviceAccountName" . }} 25 | containers: 26 | - name: {{ .Chart.Name }} 27 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" 28 | imagePullPolicy: {{ .Values.image.pullPolicy }} 29 | args: 30 | - --statsd.mapping-config=/etc/statsd_conf/statsd_mappings.yaml 31 | {{- range $key, $value := .Values.statsdexporter.extraArgs }} 32 | - --{{ $key }}={{ $value }} 33 | {{- end }} 34 | volumeMounts: 35 | - mountPath: /data 36 | name: {{ .Values.persistentVolume.name }} 37 | - name: statsd-config 38 | mountPath: /etc/statsd_conf 39 | env: 40 | - name: HOME 41 | value: /data 42 | ports: 43 | - name: metrics 44 | containerPort: 9102 45 | protocol: TCP 46 | - name: statsd-tcp 47 | containerPort: 9125 48 | protocol: TCP 49 | - name: statsd-udp 50 | containerPort: 9125 51 | protocol: UDP 52 | livenessProbe: 53 | httpGet: 54 | path: /#/status 55 | port: 9102 56 | initialDelaySeconds: 10 57 | timeoutSeconds: 10 58 | readinessProbe: 59 | httpGet: 60 | path: /#/status 61 | port: 9102 62 | initialDelaySeconds: 10 63 | timeoutSeconds: 10 64 | resources: 65 | {{ toYaml .Values.statsdexporter.resources | indent 12 }} 66 | {{- if .Values.statsdexporter.nodeSelector }} 67 | nodeSelector: 68 | {{ toYaml .Values.statsdexporter.nodeSelector | indent 8 }} 69 | {{- end }} 70 | volumes: 71 | - name: statsd-config 72 | configMap: 73 | name: {{ template "prometheus-statsd-exporter.fullname" . }}-config 74 | - name: {{ .Values.persistentVolume.name }} 75 | {{- if .Values.persistentVolume.enabled }} 76 | persistentVolumeClaim: 77 | claimName: {{ if .Values.persistentVolume.claimName }}{{- else }}{{ template "prometheus-statsd-exporter.fullname" . }}{{- end }} 78 | {{- else }} 79 | emptyDir: {} 80 | {{- end -}} 81 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/templates/pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | labels: 5 | app: {{ template "prometheus-statsd-exporter.fullname" . }} 6 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 7 | component: "{{ .Chart.Name }}" 8 | heritage: {{ .Release.Service }} 9 | release: {{ .Release.Name }} 10 | name: {{ template "prometheus-statsd-exporter.fullname" . 
}} 11 | spec: 12 | accessModes: 13 | {{ toYaml .Values.persistentVolume.accessModes | indent 4 }} 14 | {{- if .Values.persistentVolume.storageClass }} 15 | {{- if (eq "-" .Values.persistentVolume.storageClass) }} 16 | storageClassName: "" 17 | {{- else }} 18 | storageClassName: "{{ .Values.persistentVolume.storageClass }}" 19 | {{- end }} 20 | {{- end }} 21 | resources: 22 | requests: 23 | storage: "{{ .Values.persistentVolume.size }}" -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | {{- if .Values.service.annotations }} 5 | annotations: 6 | {{ toYaml .Values.service.annotations | indent 4 }} 7 | {{- end }} 8 | labels: 9 | app: {{ template "prometheus-statsd-exporter.fullname" . }} 10 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 11 | component: "{{ .Chart.Name }}" 12 | heritage: {{ .Release.Service }} 13 | release: {{ .Release.Name }} 14 | {{- if .Values.service.labels }} 15 | {{ toYaml .Values.service.labels | indent 4 }} 16 | {{- end }} 17 | name: {{ template "prometheus-statsd-exporter.fullname" . }} 18 | spec: 19 | ports: 20 | - name: metrics 21 | port: {{ .Values.service.metricsPort }} 22 | protocol: TCP 23 | targetPort: 9102 24 | - name: statsd-tcp 25 | port: {{ .Values.service.statsdPort }} 26 | protocol: TCP 27 | targetPort: 9125 28 | selector: 29 | app: {{ template "prometheus-statsd-exporter.name" . }} 30 | release: {{ .Release.Name }} 31 | type: ClusterIP 32 | --- 33 | apiVersion: v1 34 | kind: Service 35 | metadata: 36 | {{- if .Values.service.annotations }} 37 | annotations: 38 | {{ toYaml .Values.service.annotations | indent 4 }} 39 | {{- end }} 40 | labels: 41 | app: {{ template "prometheus-statsd-exporter.fullname" . }} 42 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 43 | component: "{{ .Chart.Name }}" 44 | heritage: {{ .Release.Service }} 45 | release: {{ .Release.Name }} 46 | {{- if .Values.service.labels }} 47 | {{ toYaml .Values.service.labels | indent 4 }} 48 | {{- end }} 49 | name: {{ template "prometheus-statsd-exporter.fullname" . }}-udp 50 | spec: 51 | {{- if .Values.service.clusterIP }} 52 | clusterIP: {{ .Values.service.clusterIP }} 53 | {{- end }} 54 | {{- if .Values.service.externalIPs }} 55 | externalIPs: 56 | {{ toYaml .Values.service.externalIPs | indent 4 }} 57 | {{- end }} 58 | {{- if .Values.service.loadBalancerIP }} 59 | loadBalancerIP: {{ .Values.service.loadBalancerIP }} 60 | {{- end }} 61 | {{- if .Values.service.loadBalancerSourceRanges }} 62 | loadBalancerSourceRanges: 63 | {{- range $cidr := .Values.service.loadBalancerSourceRanges }} 64 | - {{ $cidr }} 65 | {{- end }} 66 | {{- end }} 67 | ports: 68 | - name: statsd-udp 69 | port: {{ .Values.service.statsdPort }} 70 | protocol: UDP 71 | targetPort: 9125 72 | selector: 73 | app: {{ template "prometheus-statsd-exporter.name" . }} 74 | release: {{ .Release.Name }} 75 | type: "{{ .Values.service.type }}" -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.enable -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | labels: 6 | app: {{ template "prometheus-statsd-exporter.fullname" . 
}} 7 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 8 | component: "{{ .Values.serviceaccount.componentName }}" 9 | heritage: {{ .Release.Service }} 10 | release: {{ .Release.Name }} 11 | name: {{ template "prometheus-statsd-exporter.fullname" . }} 12 | {{- end -}} -------------------------------------------------------------------------------- /infra/charts/feast-spark/requirements.lock: -------------------------------------------------------------------------------- 1 | dependencies: 2 | - name: feast-jobservice 3 | repository: "" 4 | version: 0.2.24 5 | - name: prometheus-statsd-exporter 6 | repository: "" 7 | version: 0.1.2 8 | digest: sha256:4b52339a644ff2785f8a89e6d3aa30261f091645e88c36ab00e147ac64d15297 9 | generated: "2022-03-30T10:27:54.642418517+08:00" 10 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/requirements.yaml: -------------------------------------------------------------------------------- 1 | dependencies: 2 | - name: feast-jobservice 3 | version: 0.2.24 4 | condition: feast-jobservice.enabled 5 | - name: prometheus-statsd-exporter 6 | version: 0.1.2 7 | condition: prometheus-statsd-exporter.enabled -------------------------------------------------------------------------------- /infra/charts/feast-spark/values.yaml: -------------------------------------------------------------------------------- 1 | feast-jobservice: 2 | # feast-jobservice.enabled -- Flag to install Feast Job Service 3 | enabled: true -------------------------------------------------------------------------------- /infra/docker/jobservice/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8 2 | 3 | USER root 4 | WORKDIR /app 5 | 6 | COPY python python 7 | COPY protos protos 8 | COPY Makefile Makefile 9 | 10 | # Install necessary tools for later steps 11 | RUN apt-get update && apt-get -y install make git wget 12 | 13 | # Install Feast SDK 14 | RUN git init . 15 | COPY README.md README.md 16 | RUN make install-python 17 | 18 | # 19 | # Download grpc_health_probe to run health checks 20 | # https://kubernetes.io/blog/2018/10/01/health-checking-grpc-servers-on-kubernetes/ 21 | # 22 | RUN wget -q https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/v0.3.1/grpc_health_probe-linux-amd64 \ 23 | -O /usr/bin/grpc-health-probe && \ 24 | chmod +x /usr/bin/grpc-health-probe 25 | 26 | ENV FEAST_TELEMETRY=false 27 | 28 | CMD ["python", "-m", "feast_spark.cli", "server"] 29 | -------------------------------------------------------------------------------- /infra/docker/spark/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM maven:3.6-jdk-11 as builder 2 | 3 | RUN apt-get update && apt-get install -y build-essential 4 | WORKDIR /build 5 | 6 | COPY . . 
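# The builder stage copies the full repo so the Maven build below can produce the
# ingestion jar; the runtime stage further down only receives that jar plus the
# extra connector jars.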
7 | ARG VERSION=dev 8 | 9 | RUN REVISION=$VERSION make build-ingestion-jar-no-tests 10 | 11 | FROM gcr.io/kf-feast/feast-spark-base:v3.1.3 as runtime 12 | 13 | ARG VERSION=dev 14 | 15 | ARG TFRECORD_VERSION=0.3.0 16 | ARG GCS_CONNECTOR_VERSION=2.2.5 17 | ARG BQ_CONNECTOR_VERSION=0.18.1 18 | 19 | COPY --from=builder /build/spark/ingestion/target/feast-ingestion-spark-${VERSION}.jar /opt/spark/jars 20 | 21 | USER root 22 | ADD https://repo1.maven.org/maven2/com/linkedin/sparktfrecord/spark-tfrecord_2.12/${TFRECORD_VERSION}/spark-tfrecord_2.12-${TFRECORD_VERSION}.jar /opt/spark/jars 23 | ADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-${GCS_CONNECTOR_VERSION}.jar /opt/spark/jars 24 | ADD https://repo1.maven.org/maven2/com/google/cloud/spark/spark-bigquery-with-dependencies_2.12/${BQ_CONNECTOR_VERSION}/spark-bigquery-with-dependencies_2.12-${BQ_CONNECTOR_VERSION}.jar /opt/spark/jars 25 | 26 | # Fix arrow issue for jdk-11 27 | RUN mkdir -p /opt/spark/conf 28 | RUN echo 'spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf 29 | RUN echo 'spark.driver.extraJavaOptions="-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf 30 | RUN echo 'spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf 31 | RUN echo 'spark.executor.extraJavaOptions="-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf 32 | 33 | # python dependencies 34 | RUN pip3 install pandas==1.3.5 great-expectations==0.13.2 pyarrow==2.0.0 Jinja2==3.0.3 datadog==0.44.0 'numpy<1.20.0' 35 | 36 | # For logging to /dev/termination-log 37 | RUN mkdir -p /dev 38 | 39 | 40 | ENTRYPOINT [ "/opt/entrypoint.sh" ] -------------------------------------------------------------------------------- /infra/docker/spark/dev.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/kf-feast/feast-spark-base:v3.1.3 as runtime 2 | 3 | ARG VERSION=dev 4 | 5 | ARG TFRECORD_VERSION=0.3.0 6 | ARG GCS_CONNECTOR_VERSION=2.0.1 7 | ARG BQ_CONNECTOR_VERSION=0.18.1 8 | 9 | USER root 10 | ADD https://repo1.maven.org/maven2/com/linkedin/sparktfrecord/spark-tfrecord_2.12/${TFRECORD_VERSION}/spark-tfrecord_2.12-${TFRECORD_VERSION}.jar /opt/spark/jars 11 | ADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-${GCS_CONNECTOR_VERSION}.jar /opt/spark/jars 12 | ADD https://repo1.maven.org/maven2/com/google/cloud/spark/spark-bigquery-with-dependencies_2.12/${BQ_CONNECTOR_VERSION}/spark-bigquery-with-dependencies_2.12-${BQ_CONNECTOR_VERSION}.jar /opt/spark/jars 13 | 14 | # Fix arrow issue for jdk-11 15 | RUN mkdir -p /opt/spark/conf 16 | RUN echo 'spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf 17 | RUN echo 'spark.driver.extraJavaOptions="-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf 18 | RUN echo 'spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf 19 | RUN echo 'spark.executor.extraJavaOptions="-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf \ 20 | 21 | # python dependencies 22 | RUN pip3 install pandas==1.3.5 
great-expectations==0.13.2 pyarrow==2.0.0 Jinja2==3.0.3 datadog==0.44.0 'numpy<1.20.0' 23 | 24 | # For logging to /dev/termination-log 25 | RUN mkdir -p /dev 26 | 27 | COPY spark/ingestion/target/feast-ingestion-spark-${VERSION}.jar /opt/spark/jars 28 | 29 | 30 | ENTRYPOINT [ "/opt/entrypoint.sh" ] -------------------------------------------------------------------------------- /infra/docker/tests/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=gcr.io/kf-feast/feast-ci:latest 2 | 3 | FROM ${BASE_IMAGE} 4 | 5 | RUN mkdir -p /src/ /src/spark/ingestion 6 | 7 | COPY python /src/python 8 | 9 | COPY README.md /src/README.md 10 | 11 | WORKDIR /src 12 | 13 | RUN pip install -r python/requirements-ci.txt 14 | 15 | RUN git init . 16 | RUN pip install -e python -U 17 | RUN pip install "s3fs" "boto3" "urllib3>=1.25.4" 18 | 19 | COPY tests /src/tests 20 | 21 | RUN pip install -r tests/requirements.txt 22 | 23 | COPY infra/scripts /src/infra/scripts 24 | COPY spark/ingestion/target /src/spark/ingestion/target 25 | -------------------------------------------------------------------------------- /infra/scripts/aws-runner.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | GIT_TAG=${PULL_PULL_SHA:-${PULL_BASE_SHA}} 6 | 7 | source infra/scripts/k8s-common-functions.sh 8 | wait_for_image "${DOCKER_REPOSITORY}" feast-jobservice "${GIT_TAG}" 9 | 10 | infra/scripts/codebuild_runner.py "$@" -------------------------------------------------------------------------------- /infra/scripts/build-ingestion-py-dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | PLATFORM=$1 4 | DESTINATION=$2 5 | PACKAGES=${PACKAGES:-"great-expectations==0.13.2 pyarrow==2.0.0 datadog==0.39.0"} 6 | 7 | tmp_dir=$(mktemp -d) 8 | 9 | pip3 install -t ${tmp_dir}/libs $PACKAGES 10 | 11 | cd $tmp_dir 12 | tar -czf pylibs-ge-$PLATFORM.tar.gz libs/ 13 | if [[ $DESTINATION == gs* ]]; then 14 | gsutil cp pylibs-ge-$PLATFORM.tar.gz $DESTINATION 15 | else 16 | mv pylibs-ge-$PLATFORM.tar.gz $DESTINATION 17 | fi 18 | -------------------------------------------------------------------------------- /infra/scripts/download-maven-cache.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # This script downloads previous maven packages that have been downloaded 5 | # from Google Cloud Storage to local path for faster build 6 | 7 | usage() 8 | { 9 | echo "usage: prepare_maven_cache.sh 10 | --archive-uri gcs uri to retrieve maven .m2 archive 11 | --output-dir output directory for .m2 directory" 12 | } 13 | 14 | while [ "$1" != "" ]; do 15 | case "$1" in 16 | --archive-uri ) ARCHIVE_URI="$2"; shift;; 17 | --output-dir ) OUTPUT_DIR="$2"; shift;; 18 | * ) usage; exit 1 19 | esac 20 | shift 21 | done 22 | 23 | if [[ ! ${ARCHIVE_URI} ]]; then usage; exit 1; fi 24 | if [[ ! ${OUTPUT_DIR} ]]; then usage; exit 1; fi 25 | 26 | # Install Google Cloud SDK if gsutil command not exists 27 | if [[ ! $(command -v gsutil) ]]; then 28 | CURRENT_DIR=$(dirname "$BASH_SOURCE") 29 | . 
"${CURRENT_DIR}"/install-google-cloud-sdk.sh 30 | fi 31 | 32 | gsutil -q cp ${ARCHIVE_URI} /tmp/.m2.tar 33 | tar xf /tmp/.m2.tar -C ${OUTPUT_DIR} 34 | -------------------------------------------------------------------------------- /infra/scripts/helm/k8s-jobservice.tpl.yaml: -------------------------------------------------------------------------------- 1 | feast-jobservice: 2 | image: 3 | tag: ${IMAGE_TAG} 4 | envOverrides: 5 | FEAST_CORE_URL: feast-release-feast-core:6565 6 | FEAST_SPARK_LAUNCHER: k8s 7 | FEAST_SPARK_K8S_NAMESPACE: sparkop-e2e 8 | FEAST_SPARK_K8S_USE_INCLUSTER_CONFIG: True 9 | FEAST_TELEMETRY: False 10 | FEAST_SPARK_STAGING_LOCATION: gs://feast-templocation-kf-feast 11 | FEAST_REDIS_HOST: feast-release-redis-master 12 | FEAST_REDIS_PORT: 6379 13 | FEAST_JOB_SERVICE_ENABLE_CONTROL_LOOP: False 14 | FEAST_SPARK_INGESTION_JAR: local:///opt/spark/jars/feast-ingestion-spark-${IMAGE_TAG}.jar 15 | 16 | sparkOperator: 17 | enabled: true 18 | jobTemplate: 19 | apiVersion: "sparkoperator.k8s.io/v1beta2" 20 | kind: SparkApplication 21 | spec: 22 | type: Scala 23 | mode: cluster 24 | image: "gcr.io/kf-feast/feast-spark:${IMAGE_TAG}" 25 | hadoopConf: 26 | "fs.gs.project.id": "kf-feast" 27 | "google.cloud.auth.service.account.enable": "true" 28 | "google.cloud.auth.service.account.json.keyfile": "/mnt/secrets/credentials.json" 29 | sparkVersion: "3.1.3" 30 | timeToLiveSeconds: 3600 31 | pythonVersion: "3" 32 | restartPolicy: 33 | type: Never 34 | driver: 35 | cores: 1 36 | coreLimit: "1200m" 37 | memory: "600m" 38 | labels: 39 | version: 3.0.2 40 | javaOptions: "-Dio.netty.tryReflectionSetAccessible=true -Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true" 41 | secrets: 42 | - name: feast-gcp-service-account 43 | path: /mnt/secrets 44 | secretType: GCPServiceAccount 45 | executor: 46 | cores: 1 47 | instances: 1 48 | memory: "800m" 49 | labels: 50 | version: 3.0.2 51 | javaOptions: "-Dio.netty.tryReflectionSetAccessible=true -Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true" 52 | secrets: 53 | - name: feast-gcp-service-account 54 | path: /mnt/secrets 55 | secretType: GCPServiceAccount 56 | 57 | -------------------------------------------------------------------------------- /infra/scripts/helm/kafka-values.tpl.yaml: -------------------------------------------------------------------------------- 1 | externalAccess: 2 | enabled: true 3 | service: 4 | loadBalancerIPs: 5 | - $feast_kafka_ip 6 | annotations: 7 | cloud.google.com/load-balancer-type: Internal 8 | loadBalancerSourceRanges: 9 | - 10.0.0.0/8 10 | - 172.16.0.0/12 11 | - 192.168.0.0/16 12 | 13 | persistence: 14 | enabled: false 15 | 16 | zookeeper: 17 | persistence: 18 | enabled: false -------------------------------------------------------------------------------- /infra/scripts/helm/redis-cluster-values.tpl.yaml: -------------------------------------------------------------------------------- 1 | cluster: 2 | nodes: 3 3 | replicas: 0 4 | externalAccess: 5 | enabled: true 6 | service: 7 | annotations: 8 | cloud.google.com/load-balancer-type: Internal 9 | loadBalancerIP: 10 | - $feast_redis_1_ip 11 | - $feast_redis_2_ip 12 | - $feast_redis_3_ip 13 | 14 | persistence: 15 | enabled: false 16 | 17 | usePassword: false -------------------------------------------------------------------------------- /infra/scripts/install-google-cloud-sdk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | 
usage() 5 | { 6 | echo "usage: . install-google-cloud-sdk.sh 7 | [--with-key-file local file path to service account json] 8 | 9 | NOTE: requires 'dot' before install-google-cloud-sdk.sh 10 | so that the PATH variable is exported successfully to 11 | the calling process, i.e. you don't need to provide 12 | full path to gcloud command after installation 13 | 14 | --with-key-file is optional, 15 | if no authentication is required" 16 | } 17 | 18 | while [ "$1" != "" ]; do 19 | case "$1" in 20 | --with-key-file ) KEY_FILE="$2"; shift;; 21 | * ) usage; exit 1 22 | esac 23 | shift 24 | done 25 | 26 | GOOGLE_CLOUD_SDK_ARCHIVE_URL=https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-266.0.0-linux-x86_64.tar.gz 27 | GOOGLE_PROJECT_ID=kf-feast 28 | KUBE_CLUSTER_NAME=primary-test-cluster 29 | KUBE_CLUSTER_ZONE=us-central1-a 30 | 31 | curl -s ${GOOGLE_CLOUD_SDK_ARCHIVE_URL} | tar xz -C / 32 | export PATH=/google-cloud-sdk/bin:${PATH} 33 | gcloud -q components install kubectl &> /var/log/kubectl.install.log 34 | 35 | if [[ ${KEY_FILE} ]]; then 36 | gcloud -q auth activate-service-account --key-file=${KEY_FILE} 37 | gcloud -q auth configure-docker 38 | gcloud -q config set project ${GOOGLE_PROJECT_ID} 39 | gcloud -q container clusters get-credentials ${KUBE_CLUSTER_NAME} --zone ${KUBE_CLUSTER_ZONE} --project ${GOOGLE_PROJECT_ID} 40 | export GOOGLE_APPLICATION_CREDENTIALS=${KEY_FILE} 41 | fi 42 | 43 | # Restore bash option 44 | set +e -------------------------------------------------------------------------------- /infra/scripts/install-helm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | readonly HELM_URL=https://storage.googleapis.com/kubernetes-helm 4 | readonly HELM_TARBALL="helm-${HELM_VERSION}-linux-amd64.tar.gz" 5 | readonly STABLE_REPO_URL=https://charts.helm.sh/stable 6 | readonly INCUBATOR_REPO_URL=https://charts.helm.sh/incubator 7 | curl -s "https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz" | tar -C /tmp -xz 8 | sudo mv /tmp/linux-amd64/helm /usr/bin/helm 9 | helm init --client-only 10 | helm repo add incubator "$INCUBATOR_REPO_URL" 11 | -------------------------------------------------------------------------------- /infra/scripts/publish-docker-image.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | set -o pipefail 5 | 6 | usage() 7 | { 8 | echo "usage: publish-docker-image.sh 9 | 10 | --repository the target repository to upload the Docker image, example: 11 | gcr.io/kf-feast/feast-core 12 | 13 | --tag the tag for the Docker image, example: 1.0.4 14 | 15 | --file path to the Dockerfile 16 | 17 | [--google-service-account-file 18 | path to Google Cloud service account JSON key file] 19 | " 20 | } 21 | 22 | while [ "$1" != "" ]; do 23 | case "$1" in 24 | --repository ) REPOSITORY="$2"; shift;; 25 | --tag ) TAG="$2"; shift;; 26 | --file ) FILE="$2"; shift;; 27 | --google-service-account-file ) GOOGLE_SERVICE_ACCOUNT_FILE="$2"; shift;; 28 | -h | --help ) usage; exit;; 29 | * ) usage; exit 1 30 | esac 31 | shift 32 | done 33 | 34 | if [ -z $REPOSITORY ]; then usage; exit 1; fi 35 | if [ -z $TAG ]; then usage; exit 1; fi 36 | if [ -z $FILE ]; then usage; exit 1; fi 37 | 38 | if [ $GOOGLE_SERVICE_ACCOUNT_FILE ]; then 39 | gcloud -q auth activate-service-account --key-file $GOOGLE_SERVICE_ACCOUNT_FILE 40 | gcloud -q auth configure-docker 41 | fi 42 | 43 | echo
"============================================================" 44 | echo "Building Docker image $REPOSITORY:$TAG" 45 | echo "============================================================" 46 | docker build -t $REPOSITORY:$TAG --build-arg REVISION=$TAG -f $FILE . 47 | 48 | echo "============================================================" 49 | echo "Pushing Docker image $REPOSITORY:$TAG" 50 | echo "============================================================" 51 | docker push $REPOSITORY:$TAG 52 | -------------------------------------------------------------------------------- /infra/scripts/publish-java-sdk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | set -o pipefail 5 | 6 | GPG_KEY_IMPORT_DIR=/etc/gpg 7 | 8 | usage() 9 | { 10 | echo "usage: publish-java-sdk.sh 11 | 12 | --revision Value for the revision e.g. '0.2.3' 13 | --gpg-key-import-dir Directory containing existing GPG keys to import. 14 | The directory should contain these 2 files: 15 | - public-key 16 | - private-key 17 | The default value is '/etc/gpg' 18 | 19 | This script assumes the GPG private key is protected by a passphrase. 20 | The passphrase can be specified in \$HOME/.m2/settings.xml. In the same xml 21 | file, credentials to upload releases to Sonatype must also be provided. 22 | 23 | # Example settings: ~/.m2/settings.xml 24 | 25 | 26 | 27 | ossrh 28 | SONATYPE_USER 29 | SONATYPE_PASSWORD 30 | 31 | 32 | 33 | 34 | ossrh 35 | 36 | GPG_PASSPHRASE 37 | 38 | 39 | 40 | 41 | " 42 | } 43 | 44 | while [ "$1" != "" ]; do 45 | case "$1" in 46 | --revision ) REVISION="$2"; shift;; 47 | --gpg-key-import-dir ) GPG_KEY_IMPORT_DIR="$2"; shift;; 48 | -h | --help ) usage; exit;; 49 | * ) usage; exit 1 50 | esac 51 | shift 52 | done 53 | 54 | if [ -z $REVISION ]; then usage; exit 1; fi 55 | 56 | echo "============================================================" 57 | echo "Checking Maven and GPG versions" 58 | echo "============================================================" 59 | mvn --version 60 | echo "" 61 | gpg --version 62 | 63 | echo "============================================================" 64 | echo "Importing GPG keys" 65 | echo "============================================================" 66 | gpg --import --batch --yes $GPG_KEY_IMPORT_DIR/public-key 67 | gpg --import --batch --yes $GPG_KEY_IMPORT_DIR/private-key 68 | 69 | echo "============================================================" 70 | echo "Deploying Java SDK with revision: $REVISION" 71 | echo "============================================================" 72 | mvn --projects datatypes/java,sdk/java -Drevision=$REVISION --batch-mode clean deploy 73 | -------------------------------------------------------------------------------- /infra/scripts/publish-python-sdk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | set -o pipefail 5 | 6 | usage() 7 | { 8 | echo "usage: publish-python-sdk.sh 9 | 10 | --directory-path absolute path to the python package, this directory 11 | should contain 'setup.py' file 12 | 13 | --repository the repository name where the package will be uploaded, 14 | check your .pypirc configuration file for the list of 15 | valid repositories, usually it's 'pypi' or 'testpypi' 16 | " 17 | } 18 | 19 | while [ "$1" != "" ]; do 20 | case "$1" in 21 | --directory-path ) DIRECTORY_PATH="$2"; shift;; 22 | --repository ) REPOSITORY="$2"; shift;; 23 | -h | --help ) usage; exit;; 24 | * ) usage; 
exit 1 25 | esac 26 | shift 27 | done 28 | 29 | if [ -z $DIRECTORY_PATH ]; then usage; exit 1; fi 30 | if [ -z $REPOSITORY ]; then usage; exit 1; fi 31 | 32 | ORIGINAL_DIR=$PWD 33 | cd $DIRECTORY_PATH 34 | 35 | echo "============================================================" 36 | echo "Generating distribution archives" 37 | echo "============================================================" 38 | python3 -m pip install --user --upgrade setuptools wheel 39 | python3 setup.py sdist bdist_wheel 40 | 41 | echo "============================================================" 42 | echo "Uploading distribution archives" 43 | echo "============================================================" 44 | python3 -m pip install --user --upgrade twine 45 | python3 -m twine upload --repository $REPOSITORY dist/* 46 | 47 | cd $ORIGINAL_DIR 48 | -------------------------------------------------------------------------------- /infra/scripts/push-helm-charts.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [ $# -ne 1 ]; then 6 | echo "Please provide a single semver version (without a \"v\" prefix) to test the repository against, e.g 0.99.0" 7 | exit 1 8 | fi 9 | 10 | bucket=gs://feast-helm-charts 11 | repo_url=https://feast-helm-charts.storage.googleapis.com/ 12 | 13 | helm plugin install https://github.com/hayorov/helm-gcs.git --version 0.2.2 || true 14 | 15 | helm repo add feast-helm-chart-repo $bucket 16 | 17 | helm package infra/charts/feast-spark --version ${1} 18 | 19 | helm gcs push --public --force feast-spark-${1}.tgz feast-helm-chart-repo -------------------------------------------------------------------------------- /infra/scripts/run-minikube-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail 4 | 5 | NAMESPACE=sparkop 6 | JOB_NAME=test-runner 7 | 8 | # Delete all sparkapplication resources that may be left over from the previous test runs. 9 | kubectl delete sparkapplication --all -n sparkop || true 10 | 11 | JOB_SPEC=$(dirname $0)/test_job.yaml 12 | 13 | # Delete previous instance of the job if it exists 14 | kubectl delete -n ${NAMESPACE} "job/$JOB_NAME" 2>/dev/null || true 15 | 16 | # Create the job 17 | kubectl apply -n ${NAMESPACE} -f "$JOB_SPEC" 18 | 19 | # Wait for job to have a pod. 20 | for i in {1..10} 21 | do 22 | POD=$(kubectl get pods -n ${NAMESPACE} --selector=job-name=$JOB_NAME --output=jsonpath='{.items[0].metadata.name}') 23 | if [ ! 
-z "$POD" ]; then 24 | break 25 | else 26 | sleep 1 27 | fi 28 | done 29 | 30 | echo "Waiting for pod to be ready:" 31 | kubectl wait -n ${NAMESPACE} --for=condition=ContainersReady "pod/$POD" --timeout=60s || true 32 | 33 | echo "Job output:" 34 | kubectl logs -n ${NAMESPACE} -f "job/$JOB_NAME" 35 | 36 | # Can't wait for both conditions at once, so wait for complete first then wait for failure 37 | kubectl wait -n ${NAMESPACE} --for=condition=complete "job/$JOB_NAME" --timeout=60s && exit 0 38 | kubectl wait -n ${NAMESPACE} --for=condition=failure "job/$JOB_NAME" --timeout=60s && exit 1 39 | -------------------------------------------------------------------------------- /infra/scripts/setup-common-functions.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Get Feast project repository root and scripts directory 4 | export PROJECT_ROOT_DIR=$(git rev-parse --show-toplevel) 5 | export SCRIPTS_DIR=${PROJECT_ROOT_DIR}/infra/scripts 6 | 7 | install_test_tools() { 8 | apt-get -qq update 9 | apt-get -y install wget netcat kafkacat build-essential 10 | } 11 | 12 | print_banner() { 13 | echo " 14 | ============================================================ 15 | $1 16 | ============================================================ 17 | " 18 | } 19 | 20 | wait_for_docker_image(){ 21 | # This script will block until a docker image is ready 22 | 23 | [[ -z "$1" ]] && { echo "Please pass the docker image URI as the first parameter" ; exit 1; } 24 | oldopt=$- 25 | set +e 26 | 27 | DOCKER_IMAGE=$1 28 | poll_count=0 29 | maximum_poll_count=150 30 | 31 | # Wait for Feast Core to be available on GCR 32 | until docker pull "$DOCKER_IMAGE" 33 | do 34 | # Exit when we have tried enough times 35 | if [[ "$poll_count" -gt "$maximum_poll_count" ]]; then 36 | set -$oldopt 37 | exit 1 38 | fi 39 | # Sleep and increment counter on failure 40 | echo "${DOCKER_IMAGE} could not be found"; 41 | sleep 5; 42 | ((poll_count++)) 43 | done 44 | 45 | set -$oldopt 46 | } 47 | 48 | # Usage: TAG=$(get_tag_release [-ms]) 49 | # Parses the last release from git tags. 50 | # Options: 51 | # -m - Use only tags that are tagged on the current branch 52 | # -s - Use only stable version tags. (ie no prerelease tags). 53 | get_tag_release() { 54 | local GIT_TAG_CMD="git tag -l" 55 | # Match only Semver tags 56 | # Regular expression should match MAJOR.MINOR.PATCH[-PRERELEASE[.IDENTIFIER]] 57 | # eg. v0.7.1 v0.7.2-alpha v0.7.2-rc.1 58 | local TAG_REGEX='^v[0-9]+\.[0-9]+\.[0-9]+(-([0-9A-Za-z-]+(\.[0-9A-Za-z-]+)*))?$' 59 | local OPTIND opt 60 | while getopts "ms" opt; do 61 | case "${opt}" in 62 | m) 63 | GIT_TAG_CMD="$GIT_TAG_CMD --merged" 64 | ;; 65 | s) 66 | # Match only stable version tags. 67 | TAG_REGEX="^v[0-9]+\.[0-9]+\.[0-9]+$" 68 | ;; 69 | *) 70 | echo "get_tag_release(): Error: Bad arguments: $@" 71 | return 1 72 | ;; 73 | esac 74 | done 75 | shift $((OPTIND-1)) 76 | 77 | # Retrieve tags from git and filter as per regex. 78 | local FILTERED_TAGS=$(bash -c "$GIT_TAG_CMD" | grep -P "$TAG_REGEX") 79 | 80 | # Sort version tags in highest semver version first. 81 | # To make sure that prerelease versions (ie vMAJOR.MINOR.PATCH-PRERELEASE versions) 82 | # are sorted after stable versions (ie vMAJOR.MINOR.PATCH), we append '_' after 83 | # each stable version as '_' is after '-' found in prerelease version 84 | # alphanumerically and remove after sorting.
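# (Example of the trick above, assuming '_' sorts after '-' as stated: a stable tag v0.9.0 is compared as "v0.9.0_", so it ranks above v0.9.0-rc.1 once sort -rV reverses the order and the head -n 1 below picks the stable release.)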
85 | local SEMVER_SORTED_TAGS=$(echo "$FILTERED_TAGS" | sed -e '/-/!{s/$/_/}' | sort -rV \ 86 | | sed -e 's/_$//') 87 | echo $(echo "$SEMVER_SORTED_TAGS" | head -n 1) 88 | } 89 | -------------------------------------------------------------------------------- /infra/scripts/setup-e2e-env-aws.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m pip install --upgrade pip==20.2 setuptools wheel 4 | 5 | make install-python 6 | 7 | python -m pip install -qr tests/requirements.txt 8 | 9 | # Using mvn -q to make it less verbose. This step happens after docker containers were 10 | # successfully built so it should be unlikely to fail, therefore we likely won't need detailed logs. 11 | echo "########## Building ingestion jar" 12 | TIMEFORMAT='########## took %R seconds' 13 | 14 | time make build-ingestion-jar-no-tests REVISION=develop MAVEN_EXTRA_OPTS="-q --no-transfer-progress" 15 | -------------------------------------------------------------------------------- /infra/scripts/setup-e2e-env-gcp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # GCloud, kubectl, helm should be already installed 4 | # And kubernetes cluster already configured 5 | 6 | test -z ${GCLOUD_REGION} && GCLOUD_REGION="us-central1" 7 | test -z ${GCLOUD_NETWORK} && GCLOUD_NETWORK="default" 8 | test -z ${GCLOUD_SUBNET} && GCLOUD_SUBNET="default" 9 | 10 | 11 | feast_kafka_ip_name="feast-kafka" 12 | feast_redis_1_ip_name="feast-redis-1" 13 | feast_redis_2_ip_name="feast-redis-2" 14 | feast_redis_3_ip_name="feast-redis-3" 15 | 16 | helm repo add bitnami https://charts.bitnami.com/bitnami 17 | 18 | gcloud compute addresses create \ 19 | $feast_kafka_ip_name $feast_redis_1_ip_name $feast_redis_2_ip_name $feast_redis_3_ip_name \ 20 | --region ${GCLOUD_REGION} --subnet ${GCLOUD_SUBNET} 21 | 22 | export feast_kafka_ip=$(gcloud compute addresses describe $feast_kafka_ip_name --region=${GCLOUD_REGION} --format "value(address)") 23 | export feast_redis_1_ip=$(gcloud compute addresses describe $feast_redis_1_ip_name --region=${GCLOUD_REGION} --format "value(address)") 24 | export feast_redis_2_ip=$(gcloud compute addresses describe $feast_redis_2_ip_name --region=${GCLOUD_REGION} --format "value(address)") 25 | export feast_redis_3_ip=$(gcloud compute addresses describe $feast_redis_3_ip_name --region=${GCLOUD_REGION} --format "value(address)") 26 | 27 | 28 | envsubst '$feast_kafka_ip' < helm/kafka-values.tpl.yaml > helm/kafka-values.yaml 29 | envsubst '$feast_redis_1_ip,$feast_redis_2_ip,$feast_redis_3_ip' < helm/redis-cluster-values.tpl.yaml > helm/redis-cluster-values.yaml 30 | 31 | helm install e2e-kafka bitnami/kafka \ 32 | --values helm/kafka-values.yaml --namespace infra --create-namespace 33 | 34 | helm install e2e-redis-cluster bitnami/redis-cluster \ 35 | --values helm/redis-cluster-values.yaml --namespace infra \ 36 | --create-namespace -------------------------------------------------------------------------------- /infra/scripts/setup-e2e-env-sparkop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m pip install --upgrade pip==20.2 setuptools wheel 4 | 5 | make install-python 6 | 7 | python -m pip install -qr tests/requirements.txt 8 | 9 | # Using mvn -q to make it less verbose. This step happens after docker containers were 10 | # successfully built so it should be unlikely to fail, therefore we likely won't need detailed logs.
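# (The -q and --no-transfer-progress flags are forwarded to Maven through MAVEN_EXTRA_OPTS on the make invocation below.)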
11 | echo "########## Building ingestion jar" 12 | TIMEFORMAT='########## took %R seconds' 13 | 14 | time make build-ingestion-jar-no-tests REVISION=develop MAVEN_EXTRA_OPTS="-q --no-transfer-progress" 15 | -------------------------------------------------------------------------------- /infra/scripts/setup-e2e-local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | STEP_BREADCRUMB='~~~~~~~~' 5 | 6 | pushd "$(dirname $0)" 7 | source k8s-common-functions.sh 8 | 9 | # spark k8s test - runs in sparkop namespace (so it doesn't interfere with a concurrently 10 | # running EMR test). 11 | NAMESPACE=sparkop 12 | RELEASE=sparkop 13 | 14 | # Clean up old release 15 | k8s_cleanup "$RELEASE" "$NAMESPACE" 16 | 17 | # Helm install everything in a namespace 18 | helm_install "$RELEASE" "${DOCKER_REPOSITORY}" "${GIT_TAG}" "$NAMESPACE" --create-namespace 19 | 20 | # Delete all sparkapplication resources that may be left over from the previous test runs. 21 | kubectl delete sparkapplication --all -n "$NAMESPACE" || true 22 | 23 | # Make sure the test pod has permissions to create sparkapplication resources 24 | setup_sparkop_role 25 | 26 | echo "DONE" -------------------------------------------------------------------------------- /infra/scripts/test-core-ingestion.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt-get -qq update 4 | apt-get -y install build-essential 5 | 6 | make lint-java 7 | 8 | infra/scripts/download-maven-cache.sh \ 9 | --archive-uri gs://feast-templocation-kf-feast/.m2.2019-10-24.tar \ 10 | --output-dir /root/ 11 | 12 | # Core depends on Ingestion so they are tested together 13 | # Skip Maven enforcer: https://stackoverflow.com/questions/50647223/maven-enforcer-issue-when-running-from-reactor-level 14 | mvn --projects core,ingestion --batch-mode --define skipTests=true \ 15 | --define enforcer.skip=true clean install 16 | mvn --projects core,ingestion --define enforcer.skip=true test 17 | TEST_EXIT_CODE=$? 
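# (This script does not use set -e, so a failing mvn test does not abort here; the saved exit code is re-raised at the end, after the surefire reports below are archived.)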
18 | 19 | # Default artifact location setting in Prow jobs 20 | LOGS_ARTIFACT_PATH=/logs/artifacts 21 | mkdir -p ${LOGS_ARTIFACT_PATH}/surefire-reports 22 | cp core/target/surefire-reports/* ${LOGS_ARTIFACT_PATH}/surefire-reports/ 23 | cp ingestion/target/surefire-reports/* ${LOGS_ARTIFACT_PATH}/surefire-reports/ 24 | 25 | exit ${TEST_EXIT_CODE} -------------------------------------------------------------------------------- /infra/scripts/test-end-to-end-local.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | export DISABLE_FEAST_SERVICE_FIXTURES=1 6 | export DISABLE_SERVICE_FIXTURES=1 7 | 8 | export FEAST_SPARK_K8S_NAMESPACE=sparkop 9 | export FEAST_S3_ENDPOINT_URL=http://minio.minio.svc.cluster.local:9000 10 | 11 | # Used by tests 12 | export AWS_S3_ENDPOINT_URL=http://minio.minio.svc.cluster.local:9000 13 | 14 | cat << SPARK_CONF_END >/tmp/spark_conf.yml 15 | apiVersion: "sparkoperator.k8s.io/v1beta2" 16 | kind: SparkApplication 17 | metadata: 18 | namespace: default 19 | spec: 20 | type: Scala 21 | mode: cluster 22 | image: "gcr.io/kf-feast/spark-py:v3.0.1" 23 | imagePullPolicy: Always 24 | sparkVersion: "3.0.1" 25 | timeToLiveSeconds: 3600 26 | pythonVersion: "3" 27 | sparkConf: 28 | "spark.hadoop.fs.s3a.endpoint": http://minio.minio.svc.cluster.local:9000 29 | "spark.hadoop.fs.s3a.path.style.access": "true" 30 | "spark.hadoop.fs.s3a.access.key": ${AWS_ACCESS_KEY_ID} 31 | "spark.hadoop.fs.s3a.secret.key": ${AWS_SECRET_ACCESS_KEY} 32 | restartPolicy: 33 | type: Never 34 | volumes: 35 | - name: "test-volume" 36 | hostPath: 37 | path: "/tmp" 38 | type: Directory 39 | driver: 40 | cores: 1 41 | coreLimit: "1200m" 42 | memory: "512m" 43 | labels: 44 | version: 3.0.1 45 | serviceAccount: spark 46 | volumeMounts: 47 | - name: "test-volume" 48 | mountPath: "/tmp" 49 | executor: 50 | cores: 1 51 | instances: 1 52 | memory: "512m" 53 | labels: 54 | version: 3.0.1 55 | volumeMounts: 56 | - name: "test-volume" 57 | mountPath: "/tmp" 58 | SPARK_CONF_END 59 | export FEAST_SPARK_K8S_JOB_TEMPLATE_PATH=/tmp/spark_conf.yml 60 | 61 | PYTHONPATH=sdk/python pytest tests/e2e/ \ 62 | --feast-version develop \ 63 | --core-url sparkop-feast-core:6565 \ 64 | --serving-url sparkop-feast-online-serving:6566 \ 65 | --env k8s \ 66 | --staging-path s3a://feast-staging \ 67 | --redis-url sparkop-redis-master.sparkop.svc.cluster.local:6379 \ 68 | --kafka-brokers sparkop-kafka.sparkop.svc.cluster.local:9092 \ 69 | -m "not bq and not k8s" -------------------------------------------------------------------------------- /infra/scripts/test-golang-sdk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o pipefail 4 | 5 | make lint-go 6 | 7 | cd sdk/go 8 | go test -v 2>&1 | tee /tmp/test_output 9 | TEST_EXIT_CODE=$? 
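# (Because of the set -o pipefail above, $? reflects the go test result rather than tee, so TEST_EXIT_CODE records the real test status.)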
10 | 11 | # Default artifact location setting in Prow jobs 12 | LOGS_ARTIFACT_PATH=/logs/artifacts 13 | 14 | go get -u github.com/jstemmer/go-junit-report 15 | cat /tmp/test_output | ${GOPATH}/bin/go-junit-report > ${LOGS_ARTIFACT_PATH}/golang-sdk-test-report.xml 16 | 17 | exit ${TEST_EXIT_CODE} -------------------------------------------------------------------------------- /infra/scripts/test-integration.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python -m pip install --upgrade pip setuptools wheel 4 | make install-python 5 | python -m pip install -qr tests/requirements.txt 6 | 7 | export FEAST_TELEMETRY="False" 8 | pytest tests/integration --dataproc-cluster-name feast-e2e --dataproc-project kf-feast --dataproc-region us-central1 --dataproc-staging-location gs://feast-templocation-kf-feast 9 | -------------------------------------------------------------------------------- /infra/scripts/test-java-sdk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Skip Maven enforcer: https://stackoverflow.com/questions/50647223/maven-enforcer-issue-when-running-from-reactor-level 4 | mvn --projects sdk/java --batch-mode --define skipTests=true \ 5 | --define enforcer.skip=true clean install 6 | mvn --projects sdk/java --define enforcer.skip=true test 7 | TEST_EXIT_CODE=$? 8 | 9 | # Default artifact location setting in Prow jobs 10 | LOGS_ARTIFACT_PATH=/logs/artifacts 11 | cp -r sdk/java/target/surefire-reports ${LOGS_ARTIFACT_PATH}/surefire-reports 12 | 13 | exit ${TEST_EXIT_CODE} -------------------------------------------------------------------------------- /infra/scripts/test-python-sdk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | # Default artifact location setting in Prow jobs 6 | LOGS_ARTIFACT_PATH=/logs/artifacts 7 | 8 | pip install -r sdk/python/requirements-ci.txt 9 | make compile-protos-python 10 | make lint-python 11 | 12 | cd sdk/python/ 13 | pip install -e . 14 | pytest --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml 15 | -------------------------------------------------------------------------------- /infra/scripts/test-serving.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infra/scripts/download-maven-cache.sh \ 4 | --archive-uri gs://feast-templocation-kf-feast/.m2.2019-10-24.tar \ 5 | --output-dir /root/ 6 | 7 | mvn --batch-mode --also-make --projects serving test 8 | TEST_EXIT_CODE=$? 
9 | 10 | # Default artifact location setting in Prow jobs 11 | LOGS_ARTIFACT_PATH=/logs/artifacts 12 | cp -r serving/target/surefire-reports ${LOGS_ARTIFACT_PATH}/surefire-reports 13 | 14 | exit ${TEST_EXIT_CODE} 15 | -------------------------------------------------------------------------------- /infra/scripts/test_job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: test-runner 5 | namespace: sparkop 6 | spec: 7 | backoffLimit: 1 8 | template: 9 | spec: 10 | containers: 11 | - name: ubuntu 12 | image: feast:local 13 | command: ["bash", "-c", "./infra/scripts/test-end-to-end-local.sh"] 14 | imagePullPolicy: Never 15 | args: 16 | - bash 17 | stdin: true 18 | stdinOnce: true 19 | tty: true 20 | env: 21 | - name: AWS_ACCESS_KEY_ID 22 | valueFrom: 23 | secretKeyRef: 24 | name: minio 25 | key: accesskey 26 | - name: AWS_SECRET_ACCESS_KEY 27 | valueFrom: 28 | secretKeyRef: 29 | name: minio 30 | key: secretkey 31 | - name: AWS_DEFAULT_REGION 32 | value: us-east-1 33 | - name: AWS_S3_SIGNATURE_VERSION 34 | value: s3v4 35 | restartPolicy: Never 36 | -------------------------------------------------------------------------------- /infra/scripts/validate-helm-chart-versions.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | function finish { 4 | echo "Please ensure the Chart.yaml have the version ${1}" 5 | exit 6 | } 7 | 8 | trap "finish $1" ERR 9 | 10 | set -e 11 | 12 | if [ $# -ne 1 ]; then 13 | echo "Please provide a single semver version (without a \"v\" prefix) to test the repository against, e.g 0.99.0" 14 | exit 1 15 | fi 16 | 17 | # Get project root 18 | PROJECT_ROOT_DIR=$(git rev-parse --show-toplevel) 19 | 20 | echo "Trying to find version ${1} in the feast-spark Chart.yaml. Exiting if not found." 21 | grep "version: ${1}" "${PROJECT_ROOT_DIR}/infra/charts/feast-spark/Chart.yaml" 22 | 23 | 24 | echo "Trying to find version ${1} in the feast-jobservice Chart.yaml. Exiting if not found." 25 | grep "version: ${1}" "${PROJECT_ROOT_DIR}/infra/charts/feast-spark/charts/feast-jobservice/Chart.yaml" 26 | 27 | echo "Success! All versions found!" -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | 4.0.0 5 | Feast Spark 6 | 7 | dev.feast 8 | feast-spark-parent 9 | ${revision} 10 | pom 11 | 12 | 13 | spark/ingestion 14 | 15 | 16 | 17 | 0.2.24 18 | 1.8 19 | 1.8 20 | 2.12 21 | ${scala.version}.10 22 | 3.1.3 23 | 4.4.0 24 | 3.3.0 25 | 3.12.2 26 | 3.10 27 | 2.4.11 28 | 29 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /protos/feast/core/DataFormat.proto: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2020 The Feast Authors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // https://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | 17 | 18 | syntax = "proto3"; 19 | package feast.core; 20 | 21 | option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; 22 | option java_outer_classname = "DataFormatProto"; 23 | option java_package = "feast.proto.core"; 24 | 25 | // Defines the file format encoding the features/entity data in files 26 | message FileFormat { 27 | // Defines options for the Parquet data format 28 | message ParquetFormat {} 29 | 30 | oneof format { 31 | ParquetFormat parquet_format = 1; 32 | } 33 | } 34 | 35 | // Defines the data format encoding features/entity data in data streams 36 | message StreamFormat { 37 | // Defines options for the protobuf data format 38 | message ProtoFormat { 39 | // Classpath to the generated Java Protobuf class that can be used to decode 40 | // Feature data from the obtained stream message 41 | string class_path = 1; 42 | } 43 | 44 | // Defines options for the avro data format 45 | message AvroFormat { 46 | // Optional if used in a File DataSource as schema is embedded in avro file. 47 | // Specifies the schema of the Avro message as JSON string. 48 | string schema_json = 1; 49 | } 50 | 51 | // Specifies the data format and format specific options 52 | oneof format { 53 | AvroFormat avro_format = 1; 54 | ProtoFormat proto_format = 2; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /protos/feast/core/Entity.proto: -------------------------------------------------------------------------------- 1 | // 2 | // * Copyright 2020 The Feast Authors 3 | // * 4 | // * Licensed under the Apache License, Version 2.0 (the "License"); 5 | // * you may not use this file except in compliance with the License. 6 | // * You may obtain a copy of the License at 7 | // * 8 | // * https://www.apache.org/licenses/LICENSE-2.0 9 | // * 10 | // * Unless required by applicable law or agreed to in writing, software 11 | // * distributed under the License is distributed on an "AS IS" BASIS, 12 | // * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // * See the License for the specific language governing permissions and 14 | // * limitations under the License. 15 | // 16 | 17 | syntax = "proto3"; 18 | 19 | package feast.core; 20 | option java_package = "feast.proto.core"; 21 | option java_outer_classname = "EntityProto"; 22 | option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; 23 | 24 | import "feast/types/Value.proto"; 25 | import "google/protobuf/timestamp.proto"; 26 | 27 | message Entity { 28 | // User-specified specifications of this entity. 29 | EntitySpecV2 spec = 1; 30 | // System-populated metadata for this entity. 31 | EntityMeta meta = 2; 32 | } 33 | 34 | message EntitySpecV2 { 35 | // Name of the entity. 36 | string name = 1; 37 | 38 | // Type of the entity. 39 | feast.types.ValueType.Enum value_type = 2; 40 | 41 | // Description of the entity. 
42 | string description = 3; 43 | 44 | // User defined metadata 45 | map<string,string> labels = 8; 46 | } 47 | 48 | message EntityMeta { 49 | google.protobuf.Timestamp created_timestamp = 1; 50 | google.protobuf.Timestamp last_updated_timestamp = 2; 51 | } 52 | -------------------------------------------------------------------------------- /protos/feast/core/Feature.proto: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2020 The Feast Authors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // https://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | 17 | syntax = "proto3"; 18 | package feast.core; 19 | 20 | 21 | option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; 22 | option java_outer_classname = "FeatureProto"; 23 | option java_package = "feast.proto.core"; 24 | 25 | import "feast/types/Value.proto"; 26 | 27 | message FeatureSpecV2 { 28 | // Name of the feature. Not updatable. 29 | string name = 1; 30 | 31 | // Value type of the feature. Not updatable. 32 | feast.types.ValueType.Enum value_type = 2; 33 | 34 | // Labels for user defined metadata on a feature 35 | map<string,string> labels = 3; 36 | } 37 | -------------------------------------------------------------------------------- /protos/feast/serving/ServingService.proto: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/protos/feast/serving/ServingService.proto -------------------------------------------------------------------------------- /protos/feast/storage/Redis.proto: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 The Feast Authors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | syntax = "proto3"; 18 | 19 | import "feast/types/Field.proto"; 20 | import "feast/types/Value.proto"; 21 | 22 | package feast.storage; 23 | 24 | option java_outer_classname = "RedisProto"; 25 | option java_package = "feast.proto.storage"; 26 | option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/storage"; 27 | 28 | message RedisKeyV2 { 29 | string project = 1; 30 | 31 | repeated string entity_names = 2; 32 | 33 | repeated feast.types.Value entity_values = 3; 34 | } 35 | -------------------------------------------------------------------------------- /protos/feast/third_party/grpc/health/v1/HealthService.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package grpc.health.v1; 4 | 5 | option java_package = "io.grpc.health.v1"; 6 | option java_outer_classname = "HealthProto"; 7 | 8 | message HealthCheckRequest { 9 | string service = 1; 10 | } 11 | 12 | enum ServingStatus { 13 | UNKNOWN = 0; 14 | SERVING = 1; 15 | NOT_SERVING = 2; 16 | } 17 | 18 | message HealthCheckResponse { 19 | ServingStatus status = 1; 20 | } 21 | 22 | service Health { 23 | rpc Check(HealthCheckRequest) returns (HealthCheckResponse); 24 | } -------------------------------------------------------------------------------- /protos/feast/types/Field.proto: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 The Feast Authors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | syntax = "proto3"; 18 | 19 | import "feast/types/Value.proto"; 20 | 21 | package feast.types; 22 | 23 | option java_package = "feast.proto.types"; 24 | option java_outer_classname = "FieldProto"; 25 | option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/types"; 26 | 27 | message Field { 28 | string name = 1; 29 | feast.types.Value value = 2; 30 | } 31 | -------------------------------------------------------------------------------- /protos/feast/types/Value.proto: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 The Feast Authors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | syntax = "proto3"; 18 | 19 | package feast.types; 20 | 21 | option java_package = "feast.proto.types"; 22 | option java_outer_classname = "ValueProto"; 23 | option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/types"; 24 | 25 | message ValueType { 26 | enum Enum { 27 | INVALID = 0; 28 | BYTES = 1; 29 | STRING = 2; 30 | INT32 = 3; 31 | INT64 = 4; 32 | DOUBLE = 5; 33 | FLOAT = 6; 34 | BOOL = 7; 35 | BYTES_LIST = 11; 36 | STRING_LIST = 12; 37 | INT32_LIST = 13; 38 | INT64_LIST = 14; 39 | DOUBLE_LIST = 15; 40 | FLOAT_LIST = 16; 41 | BOOL_LIST = 17; 42 | } 43 | } 44 | 45 | message Value { 46 | // ValueType is referenced by the metadata types, FeatureInfo and EntityInfo. 47 | // The enum values do not have to match the oneof val field ids, but they should. 48 | oneof val { 49 | bytes bytes_val = 1; 50 | string string_val = 2; 51 | int32 int32_val = 3; 52 | int64 int64_val = 4; 53 | double double_val = 5; 54 | float float_val = 6; 55 | bool bool_val = 7; 56 | BytesList bytes_list_val = 11; 57 | StringList string_list_val = 12; 58 | Int32List int32_list_val = 13; 59 | Int64List int64_list_val = 14; 60 | DoubleList double_list_val = 15; 61 | FloatList float_list_val = 16; 62 | BoolList bool_list_val = 17; 63 | } 64 | } 65 | 66 | message BytesList { 67 | repeated bytes val = 1; 68 | } 69 | 70 | message StringList { 71 | repeated string val = 1; 72 | } 73 | 74 | message Int32List { 75 | repeated int32 val = 1; 76 | } 77 | 78 | message Int64List { 79 | repeated int64 val = 1; 80 | } 81 | 82 | message DoubleList { 83 | repeated double val = 1; 84 | } 85 | 86 | message FloatList { 87 | repeated float val = 1; 88 | } 89 | 90 | message BoolList { 91 | repeated bool val = 1; 92 | } 93 | -------------------------------------------------------------------------------- /protos/feast_spark/third_party/grpc/health/v1/HealthService.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package grpc.health.v1; 4 | 5 | option java_package = "io.grpc.health.v1"; 6 | option java_outer_classname = "HealthProto"; 7 | 8 | message HealthCheckRequest { 9 | string service = 1; 10 | } 11 | 12 | enum ServingStatus { 13 | UNKNOWN = 0; 14 | SERVING = 1; 15 | NOT_SERVING = 2; 16 | } 17 | 18 | message HealthCheckResponse { 19 | ServingStatus status = 1; 20 | } 21 | 22 | service Health { 23 | rpc Check(HealthCheckRequest) returns (HealthCheckResponse); 24 | } -------------------------------------------------------------------------------- /python/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
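# For example, "make html O=-W" should expand to: sphinx-build -M html "source" "build" -W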
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /python/docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath("../../feast_spark")) 17 | sys.path.insert(0, os.path.abspath("../..")) 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = 'Feast Spark SDK' 22 | copyright = '2021, Feast Authors' 23 | author = 'Feast Authors' 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | "sphinx.ext.doctest", 32 | "sphinx.ext.intersphinx", 33 | "sphinx.ext.todo", 34 | "sphinx.ext.coverage", 35 | "sphinx.ext.mathjax", 36 | "sphinx.ext.ifconfig", 37 | "sphinx.ext.viewcode", 38 | "sphinx.ext.githubpages", 39 | "sphinx.ext.napoleon", 40 | "sphinx.ext.autodoc", 41 | "sphinx_rtd_theme", 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # List of patterns, relative to source directory, that match files and 48 | # directories to ignore when looking for source files. 49 | # This pattern also affects html_static_path and html_extra_path. 50 | exclude_patterns = [] 51 | 52 | # -- Options for HTML output ------------------------------------------------- 53 | 54 | # The theme to use for HTML and HTML Help pages. See the documentation for 55 | # a list of builtin themes. 56 | # 57 | html_theme = "sphinx_rtd_theme" 58 | 59 | # Add any paths that contain custom static files (such as style sheets) here, 60 | # relative to this directory. They are copied after the builtin static files, 61 | # so a file named "default.css" will overwrite the builtin "default.css". 62 | html_static_path = ['_static'] 63 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.api.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.api package 2 | ======================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | feast\_spark.api.JobService\_pb2 module 8 | --------------------------------------- 9 | 10 | .. automodule:: feast_spark.api.JobService_pb2 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | feast\_spark.api.JobService\_pb2\_grpc module 16 | --------------------------------------------- 17 | 18 | .. automodule:: feast_spark.api.JobService_pb2_grpc 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. 
automodule:: feast_spark.api 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.contrib.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.contrib package 2 | ============================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | feast_spark.contrib.validation 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. automodule:: feast_spark.contrib 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.contrib.validation.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.contrib.validation package 2 | ======================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | feast\_spark.contrib.validation.base module 8 | ------------------------------------------- 9 | 10 | .. automodule:: feast_spark.contrib.validation.base 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | feast\_spark.contrib.validation.ge module 16 | ----------------------------------------- 17 | 18 | .. automodule:: feast_spark.contrib.validation.ge 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: feast_spark.contrib.validation 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.pyspark.launchers.aws.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.pyspark.launchers.aws package 2 | ========================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | feast\_spark.pyspark.launchers.aws.emr module 8 | --------------------------------------------- 9 | 10 | .. automodule:: feast_spark.pyspark.launchers.aws.emr 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | feast\_spark.pyspark.launchers.aws.emr\_utils module 16 | ---------------------------------------------------- 17 | 18 | .. automodule:: feast_spark.pyspark.launchers.aws.emr_utils 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: feast_spark.pyspark.launchers.aws 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.pyspark.launchers.gcloud.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.pyspark.launchers.gcloud package 2 | ============================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | feast\_spark.pyspark.launchers.gcloud.dataproc module 8 | ----------------------------------------------------- 9 | 10 | .. automodule:: feast_spark.pyspark.launchers.gcloud.dataproc 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. 
automodule:: feast_spark.pyspark.launchers.gcloud 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.pyspark.launchers.k8s.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.pyspark.launchers.k8s package 2 | ========================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | feast\_spark.pyspark.launchers.k8s.k8s module 8 | --------------------------------------------- 9 | 10 | .. automodule:: feast_spark.pyspark.launchers.k8s.k8s 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | feast\_spark.pyspark.launchers.k8s.k8s\_utils module 16 | ---------------------------------------------------- 17 | 18 | .. automodule:: feast_spark.pyspark.launchers.k8s.k8s_utils 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: feast_spark.pyspark.launchers.k8s 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.pyspark.launchers.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.pyspark.launchers package 2 | ====================================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | feast_spark.pyspark.launchers.aws 11 | feast_spark.pyspark.launchers.gcloud 12 | feast_spark.pyspark.launchers.k8s 13 | feast_spark.pyspark.launchers.standalone 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: feast_spark.pyspark.launchers 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.pyspark.launchers.standalone.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.pyspark.launchers.standalone package 2 | ================================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | feast\_spark.pyspark.launchers.standalone.local module 8 | ------------------------------------------------------ 9 | 10 | .. automodule:: feast_spark.pyspark.launchers.standalone.local 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: feast_spark.pyspark.launchers.standalone 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.pyspark.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.pyspark package 2 | ============================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | feast_spark.pyspark.launchers 11 | 12 | Submodules 13 | ---------- 14 | 15 | feast\_spark.pyspark.abc module 16 | ------------------------------- 17 | 18 | .. automodule:: feast_spark.pyspark.abc 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | feast\_spark.pyspark.historical\_feature\_retrieval\_job module 24 | --------------------------------------------------------------- 25 | 26 | .. 
automodule:: feast_spark.pyspark.historical_feature_retrieval_job 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | feast\_spark.pyspark.launcher module 32 | ------------------------------------ 33 | 34 | .. automodule:: feast_spark.pyspark.launcher 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. automodule:: feast_spark.pyspark 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.rst: -------------------------------------------------------------------------------- 1 | feast\_spark package 2 | ==================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | feast_spark.api 11 | feast_spark.contrib 12 | feast_spark.pyspark 13 | feast_spark.third_party 14 | 15 | Submodules 16 | ---------- 17 | 18 | feast\_spark.cli module 19 | ----------------------- 20 | 21 | .. automodule:: feast_spark.cli 22 | :members: 23 | :undoc-members: 24 | :show-inheritance: 25 | 26 | feast\_spark.client module 27 | -------------------------- 28 | 29 | .. automodule:: feast_spark.client 30 | :members: 31 | :undoc-members: 32 | :show-inheritance: 33 | 34 | feast\_spark.constants module 35 | ----------------------------- 36 | 37 | .. automodule:: feast_spark.constants 38 | :members: 39 | :undoc-members: 40 | :show-inheritance: 41 | 42 | feast\_spark.job\_service module 43 | -------------------------------- 44 | 45 | .. automodule:: feast_spark.job_service 46 | :members: 47 | :undoc-members: 48 | :show-inheritance: 49 | 50 | feast\_spark.remote\_job module 51 | ------------------------------- 52 | 53 | .. automodule:: feast_spark.remote_job 54 | :members: 55 | :undoc-members: 56 | :show-inheritance: 57 | 58 | Module contents 59 | --------------- 60 | 61 | .. automodule:: feast_spark 62 | :members: 63 | :undoc-members: 64 | :show-inheritance: 65 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.third_party.grpc.health.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.third\_party.grpc.health package 2 | ============================================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | feast_spark.third_party.grpc.health.v1 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. automodule:: feast_spark.third_party.grpc.health 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.third_party.grpc.health.v1.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.third\_party.grpc.health.v1 package 2 | ================================================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | feast\_spark.third\_party.grpc.health.v1.HealthService\_pb2 module 8 | ------------------------------------------------------------------ 9 | 10 | .. automodule:: feast_spark.third_party.grpc.health.v1.HealthService_pb2 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | feast\_spark.third\_party.grpc.health.v1.HealthService\_pb2\_grpc module 16 | ------------------------------------------------------------------------ 17 | 18 | .. 
automodule:: feast_spark.third_party.grpc.health.v1.HealthService_pb2_grpc 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: feast_spark.third_party.grpc.health.v1 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.third_party.grpc.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.third\_party.grpc package 2 | ====================================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | feast_spark.third_party.grpc.health 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. automodule:: feast_spark.third_party.grpc 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.third_party.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.third\_party package 2 | ================================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | feast_spark.third_party.grpc 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. automodule:: feast_spark.third_party 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /python/docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Feast Spark SDK documentation master file, created by 2 | sphinx-quickstart on Sun Mar 21 17:00:24 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Feast Spark SDK's documentation! 7 | =========================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | Client 15 | ================== 16 | 17 | .. automodule:: feast_spark.client 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: -------------------------------------------------------------------------------- /python/docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | feast_spark 2 | =========== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | feast_spark 8 | -------------------------------------------------------------------------------- /python/feast_spark/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import Client 2 | 3 | __all__ = [ 4 | "Client", 5 | ] 6 | -------------------------------------------------------------------------------- /python/feast_spark/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/api/__init__.py -------------------------------------------------------------------------------- /python/feast_spark/cli.py: -------------------------------------------------------------------------------- 1 | import logging.config 2 | 3 | import click 4 | 5 | from feast_spark.job_service import start_job_service 6 | 7 | logging.config.dictConfig( 8 | { 9 | "version": 1, 10 | "disable_existing_loggers": True, 11 | "formatters": { 12 | "standard": {"format": "%(asctime)s [%(levelname)s] %(name)s: %(message)s"}, 13 | }, 14 | "handlers": { 15 | "debug": { 16 | "level": "INFO", 17 | "formatter": "standard", 18 | "class": "logging.StreamHandler", 19 | "stream": "ext://sys.stdout", 20 | }, 21 | "standard": { 22 | "level": "WARNING", 23 | "formatter": "standard", 24 | "class": "logging.StreamHandler", 25 | "stream": "ext://sys.stderr", 26 | }, 27 | }, 28 | "loggers": { 29 | "": {"handlers": ["standard"], "level": "WARNING", "propagate": False}, 30 | "feast_spark": { 31 | "handlers": ["debug", "standard"], 32 | "level": "INFO", 33 | "propagate": False, 34 | }, 35 | "feast": { 36 | "handlers": ["debug", "standard"], 37 | "level": "INFO", 38 | "propagate": False, 39 | }, 40 | }, 41 | } 42 | ) 43 | 44 | 45 | @click.group() 46 | def cli(): 47 | pass 48 | 49 | 50 | @cli.command(name="server") 51 | def server(): 52 | """ 53 | Start Feast Job Service 54 | """ 55 | start_job_service() 56 | 57 | 58 | if __name__ == "__main__": 59 | cli() 60 | -------------------------------------------------------------------------------- /python/feast_spark/contrib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/contrib/__init__.py -------------------------------------------------------------------------------- /python/feast_spark/contrib/validation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/contrib/validation/__init__.py -------------------------------------------------------------------------------- /python/feast_spark/contrib/validation/base.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | try: 4 | from pyspark import cloudpickle 5 | except ImportError: 6 | raise ImportError("pyspark must be installed to enable validation functionality") 7 | 8 | 9 | def serialize_udf(fun, return_type) -> bytes: 10 | buffer = io.BytesIO() 11 | command = (fun, return_type) 12 | cloudpickle.dump(command, buffer) 13 | return buffer.getvalue() 14 | -------------------------------------------------------------------------------- /python/feast_spark/metrics.py: 
-------------------------------------------------------------------------------- 1 | from prometheus_client import Counter 2 | 3 | job_whitelist_failure_count = Counter( 4 | "feast_job_whitelist_failure_count", 5 | "request failures due to feature table not being whitelisted", 6 | ["project", "table"], 7 | ) 8 | job_submission_count = Counter( 9 | "feast_job_submission_count", 10 | "request to submit feast job", 11 | ["job_type", "project", "table"], 12 | ) 13 | job_schedule_count = Counter( 14 | "feast_job_schedule_count", "request to schedule feast job", ["project", "table"] 15 | ) 16 | -------------------------------------------------------------------------------- /python/feast_spark/pyspark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/pyspark/__init__.py -------------------------------------------------------------------------------- /python/feast_spark/pyspark/launchers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/pyspark/launchers/__init__.py -------------------------------------------------------------------------------- /python/feast_spark/pyspark/launchers/aws/__init__.py: -------------------------------------------------------------------------------- 1 | from .emr import ( 2 | EmrBatchIngestionJob, 3 | EmrClusterLauncher, 4 | EmrRetrievalJob, 5 | EmrStreamIngestionJob, 6 | ) 7 | 8 | __all__ = [ 9 | "EmrRetrievalJob", 10 | "EmrBatchIngestionJob", 11 | "EmrStreamIngestionJob", 12 | "EmrClusterLauncher", 13 | ] 14 | -------------------------------------------------------------------------------- /python/feast_spark/pyspark/launchers/gcloud/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataproc import DataprocClusterLauncher, DataprocRetrievalJob 2 | 3 | __all__ = ["DataprocRetrievalJob", "DataprocClusterLauncher"] 4 | -------------------------------------------------------------------------------- /python/feast_spark/pyspark/launchers/k8s/__init__.py: -------------------------------------------------------------------------------- 1 | from .k8s import ( 2 | KubernetesBatchIngestionJob, 3 | KubernetesJobLauncher, 4 | KubernetesRetrievalJob, 5 | KubernetesStreamIngestionJob, 6 | ) 7 | 8 | __all__ = [ 9 | "KubernetesRetrievalJob", 10 | "KubernetesBatchIngestionJob", 11 | "KubernetesStreamIngestionJob", 12 | "KubernetesJobLauncher", 13 | ] 14 | -------------------------------------------------------------------------------- /python/feast_spark/pyspark/launchers/standalone/__init__.py: -------------------------------------------------------------------------------- 1 | from .local import ( 2 | StandaloneClusterLauncher, 3 | StandaloneClusterRetrievalJob, 4 | reset_job_cache, 5 | ) 6 | 7 | __all__ = [ 8 | "StandaloneClusterRetrievalJob", 9 | "StandaloneClusterLauncher", 10 | "reset_job_cache", 11 | ] 12 | -------------------------------------------------------------------------------- /python/feast_spark/third_party/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/third_party/__init__.py 
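A quick usage note on the Prometheus counters defined in python/feast_spark/metrics.py above: each counter carries labels, so callers bind label values before incrementing. The snippet below is only an illustration (the project and table names are made up); the .labels(...).inc() pattern is standard prometheus_client usage rather than anything specific to this repo.

from feast_spark.metrics import job_submission_count, job_whitelist_failure_count

# Illustrative only: record a submission for a hypothetical project/table pair.
job_submission_count.labels(
    job_type="BATCH_INGESTION", project="default", table="driver_statistics"
).inc()

# And record a whitelist rejection for the same hypothetical feature table.
job_whitelist_failure_count.labels(project="default", table="driver_statistics").inc()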
-------------------------------------------------------------------------------- /python/feast_spark/third_party/grpc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/third_party/grpc/__init__.py -------------------------------------------------------------------------------- /python/feast_spark/third_party/grpc/health/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/third_party/grpc/health/__init__.py -------------------------------------------------------------------------------- /python/feast_spark/third_party/grpc/health/v1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/third_party/grpc/health/v1/__init__.py -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 88 3 | target-version = ['py37'] 4 | include = '\.pyi?$' 5 | exclude = ''' 6 | ( 7 | /( 8 | \.eggs # exclude a few common directories in the 9 | | \.git # root of the project 10 | | \.hg 11 | | \.mypy_cache 12 | | \.tox 13 | | \.venv 14 | | _build 15 | | api 16 | | buck-out 17 | | build 18 | | dist 19 | | pb2.py 20 | | \.pyi 21 | | storage 22 | | types 23 | | third_party 24 | )/ 25 | ) 26 | ''' 27 | -------------------------------------------------------------------------------- /python/requirements-ci.txt: -------------------------------------------------------------------------------- 1 | feast>=0.9.8,<0.10.0 2 | cryptography==3.1 3 | flake8 4 | black==19.10b0 5 | isort>=5 6 | grpcio-tools==1.31.0 7 | pyspark==3.1.3 8 | pandas~=1.0.0 9 | mock==2.0.0 10 | pandavro==1.5.* 11 | moto 12 | mypy==0.790 13 | mypy-protobuf 14 | avro==1.10.0 15 | gcsfs 16 | urllib3>=1.25.4 17 | pytest==6.0.0 18 | pytest-lazy-fixture==0.6.3 19 | pytest-timeout==1.4.2 20 | pytest-ordering==0.6.* 21 | pytest-mock==1.10.4 22 | PyYAML>=5.4.* 23 | great-expectations==0.13.2 24 | adlfs==0.5.9 25 | redis==4.1.* 26 | Jinja2==3.0.3 27 | croniter==1.* -------------------------------------------------------------------------------- /python/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | multi_line_output=3 3 | include_trailing_comma=True 4 | force_grid_wrap=0 5 | use_parentheses=True 6 | line_length=88 7 | skip=feast_spark/api,feast_spark/third_party 8 | known_first_party=feast 9 | default_section=THIRDPARTY 10 | 11 | [flake8] 12 | ignore = E203, E266, E501, W503 13 | max-line-length = 88 14 | max-complexity = 20 15 | select = B,C,E,F,W,T4 16 | exclude = .git,__pycache__,docs/conf.py,dist,feast_spark/api,feast_spark/third_party 17 | 18 | [mypy] 19 | files=feast_spark,test 20 | ignore_missing_imports=true 21 | -------------------------------------------------------------------------------- /python/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/tests/__init__.py 
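For context on the serialize_udf helper in python/feast_spark/contrib/validation/base.py above: it cloudpickles a (function, return_type) pair into bytes so that a validation UDF can be shipped to a Spark job. A minimal sketch of calling it, assuming pyspark is installed and using a made-up UDF with a BooleanType return type:

from pyspark.sql.types import BooleanType

from feast_spark.contrib.validation.base import serialize_udf


def non_negative(value) -> bool:
    # Hypothetical validation UDF: keep rows whose value is present and >= 0.
    return value is not None and value >= 0


# Pickle the (function, return_type) pair into bytes for the Spark job.
payload = serialize_udf(non_negative, BooleanType())
assert isinstance(payload, bytes)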
-------------------------------------------------------------------------------- /python/tests/data/bookings.csv: -------------------------------------------------------------------------------- 1 | driver_id,event_timestamp,created_timestamp,completed_bookings 2 | 8001,2020-08-31T00:00:00.000,2020-08-31T00:00:00.000,200 3 | 8001,2020-09-01T00:00:00.000,2020-09-01T00:00:00.000,300 4 | 8002,2020-09-01T00:00:00.000,2020-09-01T00:00:00.000,600 5 | 8002,2020-09-01T00:00:00.000,2020-09-02T00:00:00.000,500 6 | 8003,2020-09-01T00:00:00.000,2020-09-02T00:00:00.000,700 7 | -------------------------------------------------------------------------------- /python/tests/data/column_mapping_test_entity.csv: -------------------------------------------------------------------------------- 1 | id,event_timestamp 2 | 1001,2020-09-02T00:00:00.000 3 | 1001,2020-09-03T00:00:00.000 4 | 2001,2020-09-04T00:00:00.000 5 | 2001,2020-09-04T00:00:00.000 6 | 3001,2020-09-04T00:00:00.000 7 | -------------------------------------------------------------------------------- /python/tests/data/column_mapping_test_feature.csv: -------------------------------------------------------------------------------- 1 | customer_id,total_bookings,datetime,created_datetime 2 | 1001,200,2020-09-02T00:00:00.000,2020-09-02T00:00:00.000 3 | 1001,400,2020-09-04T00:00:00.000,2020-09-02T00:00:00.000 4 | 2001,500,2020-09-03T00:00:00.000,2020-09-01T00:00:00.000 5 | 2001,600,2020-09-03T00:00:00.000,2020-09-02T00:00:00.000 6 | 3001,700,2020-09-03T00:00:00.000,2020-09-03T00:00:00.000 7 | -------------------------------------------------------------------------------- /python/tests/data/customer_driver_pairs.csv: -------------------------------------------------------------------------------- 1 | customer_id,driver_id,event_timestamp 2 | 1001,8001,2020-09-02T00:00:00.000 3 | 1001,8002,2020-09-02T00:00:00.000 4 | 1001,8002,2020-09-03T00:00:00.000 5 | 2001,8002,2020-09-03T00:00:00.000 6 | 2001,8002,2020-09-04T00:00:00.000 -------------------------------------------------------------------------------- /python/tests/data/customers.csv: -------------------------------------------------------------------------------- 1 | customer_id,event_timestamp 2 | 1001,2020-09-02T00:00:00.000 3 | 1002,2020-09-02T00:00:00.000 4 | 1003,2020-09-03T00:00:00.000 5 | 1004,2020-09-03T00:00:00.000 6 | 1005,2020-09-04T00:00:00.000 7 | -------------------------------------------------------------------------------- /python/tests/data/single_customer.csv: -------------------------------------------------------------------------------- 1 | customer_id,event_timestamp 2 | 1001,2020-09-02T00:00:00.000 3 | -------------------------------------------------------------------------------- /python/tests/data/transactions.csv: -------------------------------------------------------------------------------- 1 | customer_id,event_timestamp,created_timestamp,daily_transactions 2 | 1001,2020-08-31T00:00:00.000,2020-09-01T00:00:00.000,50.0 3 | 1001,2020-09-01T00:00:00.000,2020-09-01T00:00:00.000,100.0 4 | 2001,2020-09-01T00:00:00.000,2020-08-31T00:00:00.000,80.0 5 | 2001,2020-09-01T00:00:00.000,2020-09-01T00:00:00.000,200.0 6 | 3001,2020-09-01T00:00:00.000,2020-09-01T00:00:00.000,300.0 -------------------------------------------------------------------------------- /python/tests/test_launcher_abc.py: -------------------------------------------------------------------------------- 1 | from feast_spark.pyspark.abc import StreamIngestionJobParameters 2 | 3 | 4 | def 
test_stream_ingestion_job_hash(): 5 | streaming_source = { 6 | "kafka": { 7 | "event_timestamp_column": "event_timestamp", 8 | "bootstrap_servers": "localhost:9092", 9 | "topic": "test", 10 | "format": { 11 | "class_path": "com.test.someprotos", 12 | "json_class": "ProtoFormat", 13 | }, 14 | } 15 | } 16 | feature_table = { 17 | "features": [ 18 | {"name": "feature_1", "type": "STRING"}, 19 | {"name": "feature_2", "type": "STRING"}, 20 | ], 21 | "entities": [ 22 | {"name": "entity_1", "type": "STRING"}, 23 | {"name": "entity_2", "type": "STRING"}, 24 | ], 25 | "project": "someproject", 26 | } 27 | feature_table_with_different_order = { 28 | "features": [ 29 | {"name": "feature_2", "type": "STRING"}, 30 | {"name": "feature_1", "type": "STRING"}, 31 | ], 32 | "entities": [ 33 | {"name": "entity_2", "type": "STRING"}, 34 | {"name": "entity_1", "type": "STRING"}, 35 | ], 36 | "project": "someproject", 37 | } 38 | param = StreamIngestionJobParameters( 39 | source=streaming_source, feature_table=feature_table, jar="" 40 | ) 41 | param_different_order = StreamIngestionJobParameters( 42 | source=streaming_source, 43 | feature_table=feature_table_with_different_order, 44 | jar="", 45 | ) 46 | assert param.get_job_hash() == param_different_order.get_job_hash() 47 | -------------------------------------------------------------------------------- /python/tests/test_lock_manager.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | 5 | from feast_spark.lock_manager import JobOperation, JobOperationLock 6 | 7 | job_hash = "dummy_hash" 8 | 9 | 10 | class MockRedis: 11 | def __init__(self, cache=dict()): 12 | self.cache = cache 13 | 14 | def get(self, name): 15 | if name in self.cache: 16 | return self.cache[name] 17 | return None 18 | 19 | def set(self, name, value, *args, **kwargs): 20 | if name not in self.cache: 21 | self.cache[name] = value.encode("utf-8") 22 | return "OK" 23 | 24 | def delete(self, name): 25 | if name in self.cache: 26 | self.cache.pop(name) 27 | return None 28 | 29 | 30 | @pytest.fixture 31 | def lock_config(): 32 | return {"redis_host": "localhost", "redis_port": 0, "lock_expiry": 5} 33 | 34 | 35 | @patch("redis.Redis") 36 | def test_lock_manager_context(mock_redis, lock_config): 37 | mock_redis_connection = MockRedis() 38 | mock_redis.return_value = mock_redis_connection 39 | with JobOperationLock( 40 | job_hash=job_hash, operation=JobOperation.START, **lock_config 41 | ) as lock: 42 | # test lock acquired 43 | assert lock 44 | # verify lock key in cache 45 | assert ( 46 | f"lock_{JobOperation.START.value}_{job_hash}" in mock_redis_connection.cache 47 | ) 48 | # verify release 49 | assert ( 50 | f"lock_{JobOperation.START.value}_{job_hash}" not in mock_redis_connection.cache 51 | ) 52 | 53 | 54 | @patch("redis.Redis") 55 | def test_lock_manager_lock_not_available(mock_redis, lock_config): 56 | cache = {"lock_st_dummy_hash": b"127a32aaf729dc87"} 57 | mock_redis_connection = MockRedis(cache) 58 | mock_redis.return_value = mock_redis_connection 59 | with JobOperationLock( 60 | job_hash=job_hash, operation=JobOperation.START, **lock_config 61 | ) as lock: 62 | # test lock not acquired 63 | assert not lock 64 | 65 | 66 | def test_lock_manager_connection_error(lock_config): 67 | with JobOperationLock( 68 | job_hash=job_hash, operation=JobOperation.START, **lock_config 69 | ) as lock: 70 | # test lock not acquired 71 | assert not lock 72 | 
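The tests above exercise feast_spark.lock_manager.JobOperationLock, whose implementation is not shown here. Based only on what the tests assert (a lock_<operation>_<job_hash> key, a lock_expiry TTL, release on context exit, and connection errors treated as "lock not acquired"), a minimal sketch of such a Redis SET NX EX lock might look like the following; the real class may differ, and the operation argument here is just the short operation code string (e.g. "st"):

import secrets

import redis


class SimpleJobLock:
    """Minimal sketch of a Redis SET-NX-EX lock mirroring the behaviour the
    tests above expect; not the actual JobOperationLock implementation."""

    def __init__(self, job_hash, operation, redis_host, redis_port, lock_expiry):
        self._key = f"lock_{operation}_{job_hash}"
        self._token = secrets.token_hex(8)
        self._expiry = lock_expiry
        self._client = redis.Redis(host=redis_host, port=redis_port)

    def __enter__(self):
        try:
            # SET NX EX: acquire only if the key is absent, with a TTL as a safety net.
            if self._client.set(self._key, self._token, nx=True, ex=self._expiry):
                return self._token
        except redis.exceptions.RedisError:
            pass  # treat connection problems as "lock not acquired"
        return None

    def __exit__(self, exc_type, exc, tb):
        try:
            # Only the holder should release: compare the stored token before deleting.
            if self._client.get(self._key) == self._token.encode():
                self._client.delete(self._key)
        except redis.exceptions.RedisError:
            pass

The TTL is the safety net that keeps a crashed job-service replica from holding the lock forever, which is presumably why the tests configure lock_expiry explicitly.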
-------------------------------------------------------------------------------- /spark/ingestion/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Standard spark configuration # 2 | 3 | log4j.rootCategory=INFO, console 4 | log4j.appender.console=org.apache.log4j.ConsoleAppender 5 | log4j.appender.console.target=System.out 6 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 8 | 9 | # Settings to quiet third party logs that are too verbose 10 | log4j.logger.org.sparkproject.jetty=WARN 11 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR 12 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 13 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 14 | log4j.logger.org.apache.parquet=ERROR 15 | log4j.logger.parquet=ERROR 16 | 17 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 18 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 19 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 20 | 21 | 22 | # Feast # 23 | log4j.appender.termination=org.apache.log4j.FileAppender 24 | log4j.appender.termination.File=/dev/termination-log 25 | log4j.appender.termination.Append=true 26 | log4j.appender.termination.ImmediateFlush=true 27 | log4j.appender.termination.layout=org.apache.log4j.PatternLayout 28 | log4j.appender.termination.layout.ConversionPattern=%c{1}: %m%n 29 | 30 | log4j.logger.feast=FATAL, termination -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/metrics/IngestionPipelineMetrics.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | package feast.ingestion.metrics 18 | 19 | import org.apache.spark.SparkEnv 20 | import org.apache.spark.metrics.source.IngestionPipelineMetricSource 21 | import org.apache.spark.sql.Row 22 | 23 | class IngestionPipelineMetrics extends Serializable { 24 | 25 | def incrementDeadLetters(row: Row): Row = { 26 | metricSource.foreach(_.METRIC_DEADLETTER_ROWS_INSERTED.inc()) 27 | row 28 | } 29 | 30 | def incrementRead(row: Row): Row = { 31 | metricSource.foreach(_.METRIC_ROWS_READ_FROM_SOURCE.inc()) 32 | row 33 | } 34 | 35 | private lazy val metricSource: Option[IngestionPipelineMetricSource] = { 36 | val metricsSystem = SparkEnv.get.metricsSystem 37 | IngestionPipelineMetricsLock.synchronized { 38 | if (metricsSystem.getSourcesByName(IngestionPipelineMetricSource.sourceName).isEmpty) { 39 | metricsSystem.registerSource(new IngestionPipelineMetricSource) 40 | } 41 | } 42 | 43 | metricsSystem.getSourcesByName(IngestionPipelineMetricSource.sourceName) match { 44 | case Seq(head) => Some(head.asInstanceOf[IngestionPipelineMetricSource]) 45 | case _ => None 46 | } 47 | } 48 | } 49 | 50 | private object IngestionPipelineMetricsLock 51 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/metrics/StreamingMetrics.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.metrics 18 | 19 | import org.apache.spark.SparkEnv 20 | import org.apache.spark.metrics.source.StreamingMetricSource 21 | import org.apache.spark.sql.streaming.StreamingQueryProgress 22 | 23 | class StreamingMetrics extends Serializable { 24 | 25 | private val metricSource: Option[StreamingMetricSource] = { 26 | val metricsSystem = SparkEnv.get.metricsSystem 27 | 28 | metricsSystem.getSourcesByName(StreamingMetricSource.sourceName) match { 29 | case Seq(head) => Some(head.asInstanceOf[StreamingMetricSource]) 30 | case _ => None 31 | } 32 | } 33 | 34 | def updateStreamingProgress( 35 | progress: StreamingQueryProgress 36 | ): Unit = { 37 | metricSource.foreach(_.updateStreamingProgress(progress)) 38 | } 39 | 40 | def updateKafkaTimestamp(timestamp: Long): Unit = { 41 | metricSource.foreach(_.updateKafkaTimestamp(timestamp)) 42 | } 43 | } 44 | 45 | private object StreamingMetricsLock 46 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/registry/proto/LocalProtoRegistry.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.registry.proto 18 | import java.io.{IOException, ObjectInputStream} 19 | 20 | import com.google.protobuf.Descriptors.Descriptor 21 | 22 | import collection.mutable 23 | import scala.util.control.NonFatal 24 | 25 | class LocalProtoRegistry extends ProtoRegistry { 26 | @transient 27 | private var cache: mutable.Map[String, Descriptor] = mutable.Map.empty 28 | 29 | @throws(classOf[IOException]) 30 | private def readObject(ois: ObjectInputStream): Unit = { 31 | try { 32 | ois.defaultReadObject() 33 | cache = mutable.Map.empty 34 | } catch { 35 | case NonFatal(e) => 36 | throw new IOException(e) 37 | } 38 | } 39 | 40 | override def getProtoDescriptor(className: String): Descriptor = { 41 | if (!cache.contains(className)) { 42 | cache(className) = Class 43 | .forName(className, true, getClass.getClassLoader) 44 | .getMethod("getDescriptor") 45 | .invoke(null) 46 | .asInstanceOf[Descriptor] 47 | } 48 | 49 | cache(className) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/registry/proto/ProtoRegistry.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.registry.proto 18 | 19 | import com.google.protobuf.Descriptors.Descriptor 20 | 21 | trait ProtoRegistry extends Serializable { 22 | def getProtoDescriptor(className: String): Descriptor 23 | } 24 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/registry/proto/ProtoRegistryFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.registry.proto 18 | 19 | import org.apache.spark.sql.SparkSession 20 | 21 | object ProtoRegistryFactory { 22 | val CONFIG_PREFIX = "feast.ingestion.registry.proto." 23 | val PROTO_REGISTRY_KIND = s"${CONFIG_PREFIX}kind" 24 | val DEFAULT_KIND = "local" 25 | 26 | def resolveProtoRegistry(sparkSession: SparkSession): ProtoRegistry = { 27 | val config = sparkSession.sparkContext.getConf 28 | val kind = config.get(PROTO_REGISTRY_KIND, DEFAULT_KIND) 29 | val properties = config.getAllWithPrefix(CONFIG_PREFIX).toMap 30 | protoRegistry(kind, properties) 31 | } 32 | 33 | private def protoRegistry(name: String, properties: Map[String, String]): ProtoRegistry = 34 | name match { 35 | case "local" => new LocalProtoRegistry 36 | case "stencil" => new StencilProtoRegistry(properties("url"), properties.get("token")) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/registry/proto/StencilProtoRegistry.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.registry.proto 18 | import com.google.protobuf.Descriptors 19 | import io.odpf.stencil.StencilClientFactory 20 | import io.odpf.stencil.client.StencilClient 21 | import io.odpf.stencil.config.StencilConfig 22 | import org.apache.http.{Header, HttpHeaders} 23 | import org.apache.http.message.BasicHeader 24 | 25 | import scala.collection.JavaConverters._ 26 | 27 | class StencilProtoRegistry(url: String, token: Option[String]) extends ProtoRegistry { 28 | import StencilProtoRegistry.stencilClient 29 | 30 | override def getProtoDescriptor(className: String): Descriptors.Descriptor = { 31 | stencilClient(url, token).get(className) 32 | } 33 | } 34 | 35 | object StencilProtoRegistry { 36 | @transient 37 | private var _stencilClient: StencilClient = _ 38 | 39 | def stencilClient(url: String, token: Option[String]): StencilClient = { 40 | if (_stencilClient == null) { 41 | val stencilConfigBuilder = StencilConfig.builder 42 | for (t <- token) { 43 | val authHeader = new BasicHeader(HttpHeaders.AUTHORIZATION, "Bearer " + t) 44 | val headers = List[Header](authHeader) 45 | stencilConfigBuilder.fetchHeaders(headers.asJava) 46 | } 47 | val stencilConfig = stencilConfigBuilder.build() 48 | _stencilClient = StencilClientFactory.getClient(url, stencilConfig) 49 | } 50 | _stencilClient 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/sources/bq/BigQueryReader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.sources.bq 18 | 19 | import java.sql.Timestamp 20 | 21 | import feast.ingestion.BQSource 22 | import org.joda.time.DateTime 23 | import org.apache.spark.sql.{DataFrame, SQLContext} 24 | import org.apache.spark.sql.functions.col 25 | 26 | object BigQueryReader { 27 | def createBatchSource( 28 | sqlContext: SQLContext, 29 | source: BQSource, 30 | start: DateTime, 31 | end: DateTime 32 | ): DataFrame = { 33 | val reader = sqlContext.read 34 | .format("bigquery") 35 | .option("viewsEnabled", "true") 36 | 37 | source.materialization foreach { materializationConfig => 38 | reader 39 | .option("materializationProject", materializationConfig.project) 40 | .option("materializationDataset", materializationConfig.dataset) 41 | } 42 | 43 | reader 44 | .load(s"${source.project}.${source.dataset}.${source.table}") 45 | .filter(col(source.eventTimestampColumn) >= new Timestamp(start.getMillis)) 46 | .filter(col(source.eventTimestampColumn) < new Timestamp(end.getMillis)) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/sources/file/FileReader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.sources.file 18 | 19 | import java.sql.{Timestamp, Date} 20 | 21 | import feast.ingestion.FileSource 22 | import org.apache.spark.sql.functions.col 23 | import org.apache.spark.sql.{DataFrame, SQLContext} 24 | import org.joda.time.DateTime 25 | 26 | object FileReader { 27 | def createBatchSource( 28 | sqlContext: SQLContext, 29 | source: FileSource, 30 | start: DateTime, 31 | end: DateTime 32 | ): DataFrame = { 33 | val reader = sqlContext.read 34 | .parquet(source.path) 35 | .filter(col(source.eventTimestampColumn) >= new Timestamp(start.getMillis)) 36 | .filter(col(source.eventTimestampColumn) < new Timestamp(end.getMillis)) 37 | 38 | source.datePartitionColumn match { 39 | case Some(partitionColumn) if partitionColumn.nonEmpty => 40 | reader 41 | .filter(col(partitionColumn) >= new Date(start.getMillis)) 42 | .filter(col(partitionColumn) <= new Date(end.getMillis)) 43 | case _ => reader 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/bigtable/SparkBigtableConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.bigtable 18 | 19 | case class SparkBigtableConfig( 20 | namespace: String, 21 | projectName: String, 22 | entityColumns: Array[String], 23 | timestampColumn: String, 24 | maxAge: Long 25 | ) 26 | object SparkBigtableConfig { 27 | val NAMESPACE = "namespace" 28 | val ENTITY_COLUMNS = "entity_columns" 29 | val TS_COLUMN = "timestamp_column" 30 | val PROJECT_NAME = "project_name" 31 | val MAX_AGE = "max_age" 32 | 33 | def parse(parameters: Map[String, String]): SparkBigtableConfig = 34 | SparkBigtableConfig( 35 | namespace = parameters.getOrElse(NAMESPACE, ""), 36 | projectName = parameters.getOrElse(PROJECT_NAME, "default"), 37 | entityColumns = parameters.getOrElse(ENTITY_COLUMNS, "").split(","), 38 | timestampColumn = parameters.getOrElse(TS_COLUMN, "event_timestamp"), 39 | maxAge = parameters.get(MAX_AGE).map(_.toLong).getOrElse(0) 40 | ) 41 | } 42 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/cassandra/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.stores.cassandra 18 | 19 | import feast.ingestion.stores.serialization.AvroSerializer 20 | import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider} 21 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} 22 | 23 | class DefaultSource extends CreatableRelationProvider { 24 | override def createRelation( 25 | sqlContext: SQLContext, 26 | mode: SaveMode, 27 | parameters: Map[String, String], 28 | data: DataFrame 29 | ): BaseRelation = { 30 | 31 | val rel = 32 | new CassandraSinkRelation( 33 | sqlContext, 34 | new AvroSerializer, 35 | SparkCassandraConfig.parse(parameters) 36 | ) 37 | rel.createTable() 38 | rel.saveWriteSchema(data) 39 | rel.insert(data, overwrite = false) 40 | rel 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/cassandra/SparkCassandraConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.cassandra 18 | 19 | case class SparkCassandraConfig( 20 | namespace: String, 21 | projectName: String, 22 | entityColumns: Array[String], 23 | timestampColumn: String, 24 | maxAge: Long 25 | ) 26 | 27 | object SparkCassandraConfig { 28 | val NAMESPACE = "namespace" 29 | val ENTITY_COLUMNS = "entity_columns" 30 | val TS_COLUMN = "timestamp_column" 31 | val PROJECT_NAME = "project_name" 32 | val MAX_AGE = "max_age" 33 | 34 | def parse(parameters: Map[String, String]): SparkCassandraConfig = 35 | SparkCassandraConfig( 36 | namespace = parameters.getOrElse(NAMESPACE, ""), 37 | projectName = parameters.getOrElse(PROJECT_NAME, "default"), 38 | entityColumns = parameters.getOrElse(ENTITY_COLUMNS, "").split(","), 39 | timestampColumn = parameters.getOrElse(TS_COLUMN, "event_timestamp"), 40 | maxAge = parameters.get(MAX_AGE).map(_.toLong).getOrElse(0) 41 | ) 42 | } 43 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/ClusterPipelineProvider.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2022 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | import redis.clients.jedis.commands.PipelineBinaryCommands 20 | import redis.clients.jedis.{ClusterPipeline, DefaultJedisClientConfig, HostAndPort, Response} 21 | import redis.clients.jedis.providers.ClusterConnectionProvider 22 | 23 | import scala.collection.JavaConverters._ 24 | 25 | /** 26 | * Provide pipeline for Redis cluster. 27 | */ 28 | case class ClusterPipelineProvider(endpoint: RedisEndpoint) extends PipelineProvider { 29 | 30 | val nodes = Set(new HostAndPort(endpoint.host, endpoint.port)).asJava 31 | val DEFAULT_CLIENT_CONFIG = DefaultJedisClientConfig 32 | .builder() 33 | .password(endpoint.password) 34 | .build() 35 | val provider = new ClusterConnectionProvider(nodes, DEFAULT_CLIENT_CONFIG) 36 | 37 | /** 38 | * @return execute commands within a pipeline and return the result 39 | */ 40 | override def withPipeline[T](ops: PipelineBinaryCommands => T): T = { 41 | val pipeline = new ClusterPipeline(provider) 42 | val response = ops(pipeline) 43 | pipeline.close() 44 | response 45 | } 46 | 47 | /** 48 | * Close client connection 49 | */ 50 | override def close(): Unit = { 51 | provider.close() 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} 20 | import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider} 21 | 22 | /** 23 | * Entrypoint to Redis Storage. Implements only `CreatableRelationProvider` since it's only possible write to Redis. 
24 | * Here we parse configuration from spark parameters & provide SparkRedisConfig to `RedisSinkRelation` 25 | */ 26 | class RedisRelationProvider extends CreatableRelationProvider { 27 | override def createRelation( 28 | sqlContext: SQLContext, 29 | mode: SaveMode, 30 | parameters: Map[String, String], 31 | data: DataFrame 32 | ): BaseRelation = { 33 | val config = SparkRedisConfig.parse(parameters) 34 | val relation = new RedisSinkRelation(sqlContext, config) 35 | 36 | relation.insert(data, overwrite = false) 37 | 38 | relation 39 | } 40 | } 41 | 42 | class DefaultSource extends RedisRelationProvider 43 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/Persistence.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | import java.sql.Timestamp 20 | import java.util 21 | import org.apache.spark.sql.Row 22 | import redis.clients.jedis.commands.PipelineBinaryCommands 23 | import redis.clients.jedis.Response 24 | 25 | /** 26 | * Determine how a Spark row should be serialized and stored on Redis. 27 | */ 28 | trait Persistence { 29 | 30 | /** 31 | * Persist a Spark row to Redis 32 | * 33 | * @param pipeline Redis pipeline 34 | * @param key Redis key in serialized bytes format 35 | * @param row Row representing the value to be persist 36 | * @param expiryTimestamp Expiry timestamp for the row 37 | * @param maxExpiryTimestamp No ttl should be set if the expiry timestamp 38 | * is equal to the maxExpiryTimestamp 39 | */ 40 | def save( 41 | pipeline: PipelineBinaryCommands, 42 | key: Array[Byte], 43 | row: Row, 44 | expiryTimestamp: Option[Timestamp] 45 | ): Unit 46 | 47 | /** 48 | * Returns a Redis response, which can be used by `storedTimestamp` and `newExpiryTimestamp` to 49 | * derive the currently stored event timestamp, and the updated expiry timestamp. This method will 50 | * be called prior to persisting the row to Redis, so that `RedisSinkRelation` can decide whether 51 | * the currently stored value should be updated. 52 | * 53 | * @param pipeline Redis pipeline 54 | * @param key Redis key in serialized bytes format 55 | * @return Redis response representing the row value 56 | */ 57 | def get( 58 | pipeline: PipelineBinaryCommands, 59 | key: Array[Byte] 60 | ): Response[util.Map[Array[Byte], Array[Byte]]] 61 | 62 | /** 63 | * Returns the currently stored event timestamp for the key and the feature table associated with the ingestion job. 64 | * 65 | * @param value Response returned from `get` 66 | * @return Stored event timestamp associated with the key. Returns `None` if 67 | * the key is not present in Redis, or if timestamp information is 68 | * unavailable on the stored value. 
69 | */ 70 | def storedTimestamp(value: util.Map[Array[Byte], Array[Byte]]): Option[Timestamp] 71 | 72 | } 73 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/PipelineProvider.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2022 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | import redis.clients.jedis.Response 20 | import redis.clients.jedis.commands.PipelineBinaryCommands 21 | 22 | import java.io.Closeable 23 | 24 | /** 25 | * Provide either a pipeline or cluster pipeline to read and write data into Redis. 26 | */ 27 | trait PipelineProvider { 28 | 29 | def withPipeline[T](ops: PipelineBinaryCommands => T): T 30 | 31 | /** 32 | * Close client connection 33 | */ 34 | def close(): Unit 35 | } 36 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/PipelineProviderFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2022 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | import redis.clients.jedis.Jedis 20 | 21 | import scala.collection.mutable 22 | import scala.util.Try 23 | 24 | object PipelineProviderFactory { 25 | 26 | private lazy val providers: mutable.Map[RedisEndpoint, PipelineProvider] = mutable.Map.empty 27 | 28 | private def newJedisClient(endpoint: RedisEndpoint): Jedis = { 29 | val jedis = new Jedis(endpoint.host, endpoint.port) 30 | if (endpoint.password.nonEmpty) { 31 | jedis.auth(endpoint.password) 32 | } 33 | jedis 34 | } 35 | 36 | private def checkIfInClusterMode(endpoint: RedisEndpoint): Boolean = { 37 | val jedis = newJedisClient(endpoint) 38 | val isCluster = Try(jedis.clusterInfo()).isSuccess 39 | jedis.close() 40 | isCluster 41 | } 42 | 43 | private def clusterPipelineProvider(endpoint: RedisEndpoint): PipelineProvider = { 44 | ClusterPipelineProvider(endpoint) 45 | } 46 | 47 | private def singleNodePipelineProvider(endpoint: RedisEndpoint): PipelineProvider = { 48 | SingleNodePipelineProvider(endpoint) 49 | } 50 | 51 | def newProvider(endpoint: RedisEndpoint): PipelineProvider = { 52 | if (checkIfInClusterMode(endpoint)) { 53 | clusterPipelineProvider(endpoint) 54 | } else { 55 | singleNodePipelineProvider(endpoint) 56 | } 57 | } 58 | 59 | def provider(endpoint: RedisEndpoint): PipelineProvider = { 60 | providers.getOrElseUpdate(endpoint, newProvider(endpoint)) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/RedisEndpoint.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2022 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | case class RedisEndpoint(host: String, port: Int, password: String) 20 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/SingleNodePipelineProvider.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2022 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | import redis.clients.jedis.commands.PipelineBinaryCommands 20 | import redis.clients.jedis.{JedisPool, Response} 21 | 22 | /** 23 | * Provide pipeline for single node Redis. 24 | */ 25 | case class SingleNodePipelineProvider(endpoint: RedisEndpoint) extends PipelineProvider { 26 | 27 | val jedisPool = new JedisPool(endpoint.host, endpoint.port) 28 | 29 | /** 30 | * @return execute command within a pipeline and return the result 31 | */ 32 | override def withPipeline[T](ops: PipelineBinaryCommands => T): T = { 33 | val jedis = jedisPool.getResource 34 | if (endpoint.password.nonEmpty) { 35 | jedis.auth(endpoint.password) 36 | } 37 | val response = ops(jedis.pipelined()) 38 | jedis.close() 39 | response 40 | } 41 | 42 | /** 43 | * Close client connection 44 | */ 45 | override def close(): Unit = jedisPool.close() 46 | 47 | } 48 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/SparkRedisConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | case class SparkRedisConfig( 20 | namespace: String, 21 | projectName: String, 22 | entityColumns: Array[String], 23 | timestampColumn: String, 24 | timestampPrefix: String = "_ts", 25 | repartitionByEntity: Boolean = true, 26 | maxAge: Long = 0, 27 | expiryPrefix: String = "_ex" 28 | ) 29 | 30 | object SparkRedisConfig { 31 | val NAMESPACE = "namespace" 32 | val ENTITY_COLUMNS = "entity_columns" 33 | val TS_COLUMN = "timestamp_column" 34 | val ENTITY_REPARTITION = "entity_repartition" 35 | val PROJECT_NAME = "project_name" 36 | val MAX_AGE = "max_age" 37 | 38 | def parse(parameters: Map[String, String]): SparkRedisConfig = 39 | SparkRedisConfig( 40 | namespace = parameters.getOrElse(NAMESPACE, ""), 41 | projectName = parameters.getOrElse(PROJECT_NAME, "default"), 42 | entityColumns = parameters.getOrElse(ENTITY_COLUMNS, "").split(","), 43 | timestampColumn = parameters.getOrElse(TS_COLUMN, "event_timestamp"), 44 | repartitionByEntity = parameters.getOrElse(ENTITY_REPARTITION, "true") == "true", 45 | maxAge = parameters.get(MAX_AGE).map(_.toLong).getOrElse(0) 46 | ) 47 | } 48 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/serialization/AvroSerializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.serialization 18 | 19 | import com.google.common.hash.Hashing 20 | import org.apache.spark.sql.Column 21 | import org.apache.spark.sql.avro.SchemaConverters 22 | import org.apache.spark.sql.avro.functions.to_avro 23 | import org.apache.spark.sql.types.StructType 24 | 25 | class AvroSerializer extends Serializer { 26 | override type SchemaType = String 27 | 28 | def convertSchema(schema: StructType): String = { 29 | val avroSchema = SchemaConverters.toAvroType(schema) 30 | avroSchema.toString 31 | } 32 | 33 | def schemaReference(schema: String): Array[Byte] = { 34 | Hashing.murmur3_32().hashBytes(schema.getBytes).asBytes() 35 | } 36 | 37 | def serializeData(schema: String): Column => Column = to_avro(_, schema) 38 | } 39 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/serialization/Serializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.serialization 18 | 19 | import org.apache.spark.sql.Column 20 | import org.apache.spark.sql.types.StructType 21 | 22 | trait Serializer { 23 | type SchemaType 24 | 25 | def convertSchema(schema: StructType): SchemaType 26 | 27 | def schemaReference(schema: SchemaType): Array[Byte] 28 | 29 | def serializeData(schema: SchemaType): Column => Column 30 | } 31 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/utils/JsonUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.utils 18 | 19 | import java.util.Locale.ENGLISH 20 | 21 | import org.json4s.{JArray, JField, JObject, JValue} 22 | 23 | object JsonUtils { 24 | def mapFieldWithParent(jv: JValue)(f: (String, JField) => JField): JValue = { 25 | def rec(v: JValue, parent: String = ""): JValue = v match { 26 | case JObject(l) => JObject(l.map { case (key, va) => f(parent, key -> rec(va, key)) }) 27 | case JArray(l) => JArray(l.map(rec(_, parent))) 28 | case x => x 29 | } 30 | rec(jv) 31 | } 32 | 33 | def camelize(word: String): String = { 34 | if (word.nonEmpty) { 35 | val w = pascalize(word) 36 | w.substring(0, 1).toLowerCase(ENGLISH) + w.substring(1) 37 | } else { 38 | word 39 | } 40 | } 41 | 42 | def pascalize(word: String): String = { 43 | val lst = word.split("_").toList 44 | (lst.headOption.map(s => s.substring(0, 1).toUpperCase(ENGLISH) + s.substring(1)).get :: 45 | lst.tail.map(s => s.substring(0, 1).toUpperCase + s.substring(1))).mkString("") 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/utils/StringUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.utils 18 | 19 | import com.google.common.hash.Hashing 20 | 21 | object StringUtils { 22 | private def suffixHash(expr: String): String = { 23 | Hashing.murmur3_32().hashBytes(expr.getBytes).toString 24 | } 25 | 26 | def trimAndHash(expr: String, maxLength: Int): String = { 27 | // Length 8 suffix as derived from murmurhash_32 implementation 28 | val maxPrefixLength = maxLength - 8 29 | if (expr.length > maxLength) 30 | expr 31 | .take(maxPrefixLength) 32 | .concat(suffixHash(expr.substring(maxPrefixLength))) 33 | else 34 | expr 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/utils/testing/MemoryStreamingSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.utils.testing 18 | 19 | import feast.ingestion.{DataFormat, StreamingSource} 20 | import org.apache.spark.sql.DataFrame 21 | import org.apache.spark.sql.execution.streaming.MemoryStream 22 | 23 | // For test purposes 24 | case class MemoryStreamingSource( 25 | stream: MemoryStream[_], 26 | override val fieldMapping: Map[String, String] = Map.empty, 27 | override val eventTimestampColumn: String = "timestamp", 28 | override val createdTimestampColumn: Option[String] = None, 29 | override val datePartitionColumn: Option[String] = None 30 | ) extends StreamingSource { 31 | def read: DataFrame = stream.toDF() 32 | 33 | override def format: DataFormat = null 34 | } 35 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/validation/Expectation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2022 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.validation 18 | 19 | import org.apache.spark.sql.Column 20 | import org.apache.spark.sql.functions.{col, lit} 21 | import org.json4s.{CustomSerializer, DefaultFormats, Extraction, Formats, JObject, JValue} 22 | 23 | trait Expectation { 24 | 25 | def validate: Column 26 | } 27 | 28 | case class ExpectColumnValuesToNotBeNull(columnName: String) extends Expectation { 29 | override def validate: Column = col(columnName).isNotNull 30 | } 31 | 32 | case class ExpectColumnValuesToBeBetween( 33 | columnName: String, 34 | minValue: Option[Int], 35 | maxValue: Option[Int] 36 | ) extends Expectation { 37 | override def validate: Column = { 38 | (minValue, maxValue) match { 39 | case (Some(min), Some(max)) => col(columnName).between(min, max) 40 | case (Some(min), None) => col(columnName).>=(min) 41 | case (None, Some(max)) => col(columnName).<=(max) 42 | case _ => lit(true) 43 | } 44 | } 45 | } 46 | 47 | object Expectation { 48 | implicit val format: Formats = DefaultFormats 49 | 50 | def extractColumn(kwargs: JValue): String = { 51 | (kwargs \ "column").extract[String] 52 | } 53 | 54 | def apply(expectationType: String, kwargs: JValue): Expectation = { 55 | expectationType match { 56 | case "expect_column_values_to_not_be_null" => 57 | ExpectColumnValuesToNotBeNull(extractColumn(kwargs)) 58 | case "expect_column_values_to_be_between" => 59 | val column = extractColumn(kwargs) 60 | val minValue = (kwargs \ "minValue").toSome.map(_.extract[Int]) 61 | val maxValue = (kwargs \ "maxValue").toSome.map(_.extract[Int]) 62 | ExpectColumnValuesToBeBetween(column, minValue, maxValue) 63 | } 64 | } 65 | } 66 | 67 | object ExpectationCodec 68 | extends CustomSerializer[Expectation](implicit format => 69 | ( 70 | { case x: JObject => 71 | val eType: String = (x \ "expectationType").extract[String] 72 | val kwargs: JValue = (x \ "kwargs") 73 | Expectation(eType, 
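          // descriptive note: "kwargs" supplies the column name and optional bounds, while the
          // "expectationType" discriminator (extracted above) selects the concrete Expectation
          // implementation via Expectation.apply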
kwargs) 74 | }, 75 | { case x: Expectation => 76 | Extraction.decompose(x) 77 | } 78 | ) 79 | ) 80 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/validation/RowValidator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.validation 18 | 19 | import feast.ingestion.{FeatureTable, ExpectationSpec} 20 | import org.apache.spark.sql.Column 21 | import org.apache.spark.sql.functions.{col, lit} 22 | 23 | class RowValidator( 24 | featureTable: FeatureTable, 25 | timestampColumn: String, 26 | expectationSpec: Option[ExpectationSpec] 27 | ) extends Serializable { 28 | 29 | def allEntitiesPresent: Column = 30 | featureTable.entities.map(e => col(e.name).isNotNull).reduce(_.&&(_)) 31 | 32 | def atLeastOneFeatureNotNull: Column = 33 | featureTable.features.map(f => col(f.name).isNotNull).reduce(_.||(_)) 34 | 35 | def timestampPresent: Column = 36 | col(timestampColumn).isNotNull 37 | 38 | def validationChecks: Column = { 39 | 40 | expectationSpec match { 41 | case Some(value) if value.expectations.isEmpty => lit(true) 42 | case Some(value) => 43 | value.expectations.map(_.validate).reduce(_.&&(_)) 44 | case None => lit(true) 45 | } 46 | } 47 | 48 | def allChecks: Column = 49 | allEntitiesPresent && timestampPresent && validationChecks 50 | } 51 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/org/apache/spark/metrics/AtomicGauge.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.metrics 18 | 19 | import com.codahale.metrics.Gauge 20 | 21 | import java.util.concurrent.atomic.{AtomicInteger, AtomicLong} 22 | 23 | class AtomicLongGauge(initialValue: Long = 0L) extends Gauge[Long] { 24 | val value = new AtomicLong(initialValue) 25 | override def getValue: Long = value.get() 26 | } 27 | 28 | class AtomicIntegerGauge(initialValue: Int = 0) extends Gauge[Int] { 29 | val value = new AtomicInteger(initialValue) 30 | override def getValue: Int = value.get() 31 | } 32 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/org/apache/spark/metrics/sink/StatsdSinkWithTags.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.metrics.sink 18 | 19 | import java.util.Properties 20 | import java.util.concurrent.TimeUnit 21 | 22 | import com.codahale.metrics.MetricRegistry 23 | import feast.ingestion.metrics.StatsdReporterWithTags 24 | import org.apache.spark.SecurityManager 25 | import org.apache.spark.internal.Logging 26 | import org.apache.spark.metrics.MetricsSystem 27 | 28 | class StatsdSinkWithTags( 29 | val property: Properties, 30 | val registry: MetricRegistry, 31 | securityMgr: SecurityManager 32 | ) extends Sink 33 | with Logging { 34 | import StatsdSink._ 35 | 36 | val host = property.getProperty(STATSD_KEY_HOST, STATSD_DEFAULT_HOST) 37 | val port = property.getProperty(STATSD_KEY_PORT, STATSD_DEFAULT_PORT).toInt 38 | 39 | val pollPeriod = property.getProperty(STATSD_KEY_PERIOD, STATSD_DEFAULT_PERIOD).toInt 40 | val pollUnit = 41 | TimeUnit.valueOf(property.getProperty(STATSD_KEY_UNIT, STATSD_DEFAULT_UNIT).toUpperCase) 42 | 43 | val prefix = property.getProperty(STATSD_KEY_PREFIX, STATSD_DEFAULT_PREFIX) 44 | 45 | MetricsSystem.checkMinimalPollingPeriod(pollUnit, pollPeriod) 46 | 47 | val reporter = new StatsdReporterWithTags(registry, host, port, prefix) 48 | 49 | override def start(): Unit = { 50 | reporter.start(pollPeriod, pollUnit) 51 | logInfo(s"StatsdSink started with prefix: '$prefix'") 52 | } 53 | 54 | override def stop(): Unit = { 55 | reporter.stop() 56 | logInfo("StatsdSink stopped.") 57 | } 58 | 59 | override def report(): Unit = reporter.report() 60 | } 61 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/org/apache/spark/metrics/source/BaseMetricSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.metrics.source 18 | 19 | import com.codahale.metrics.MetricRegistry 20 | import org.apache.spark.SparkEnv 21 | 22 | class BaseMetricSource extends Source { 23 | override val sourceName: String = "" 24 | 25 | override val metricRegistry: MetricRegistry = new MetricRegistry 26 | 27 | private val sparkConfig = SparkEnv.get.conf 28 | 29 | private val metricLabels = sparkConfig.get("spark.metrics.labels", "") 30 | 31 | private val appId = sparkConfig.get("spark.app.id", "") 32 | 33 | private val executorId = sparkConfig.get("spark.executor.id", "") 34 | 35 | protected def metricWithLabels(name: String) = { 36 | if (metricLabels.isEmpty) { 37 | name 38 | } else { 39 | s"$name#$metricLabels,job_id=$appId-$executorId" 40 | } 41 | } 42 | 43 | protected def counterWithLabels(name: String) = { 44 | if (metricLabels.isEmpty) { 45 | name 46 | } else { 47 | s"$name#$metricLabels" 48 | } 49 | } 50 | 51 | protected def gaugeWithLabels(name: String) = { 52 | if (metricLabels.isEmpty) { 53 | name 54 | } else { 55 | s"$name#$metricLabels" 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/org/apache/spark/metrics/source/BigTableSinkMetricSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.metrics.source 18 | 19 | class BigTableSinkMetricSource extends BaseMetricSource { 20 | override val sourceName: String = BigTableSinkMetricSource.sourceName 21 | 22 | val METRIC_TOTAL_ROWS_INSERTED = 23 | metricRegistry.counter(counterWithLabels("feature_row_ingested_count")) 24 | 25 | val METRIC_ROWS_LAG = 26 | metricRegistry.histogram(metricWithLabels("feature_row_lag_ms")) 27 | } 28 | 29 | object BigTableSinkMetricSource { 30 | val sourceName = "bigtable_sink" 31 | } 32 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/org/apache/spark/metrics/source/IngestionPipelineMetricSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.metrics.source 18 | 19 | class IngestionPipelineMetricSource extends BaseMetricSource { 20 | override val sourceName: String = IngestionPipelineMetricSource.sourceName 21 | 22 | val METRIC_DEADLETTER_ROWS_INSERTED = 23 | metricRegistry.counter(counterWithLabels("deadletter_count")) 24 | 25 | val METRIC_ROWS_READ_FROM_SOURCE = 26 | metricRegistry.counter(counterWithLabels("read_from_source_count")) 27 | } 28 | 29 | object IngestionPipelineMetricSource { 30 | val sourceName = "ingestion_pipeline" 31 | } 32 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/org/apache/spark/metrics/source/RedisSinkMetricSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.metrics.source 18 | 19 | class RedisSinkMetricSource extends BaseMetricSource { 20 | override val sourceName: String = RedisSinkMetricSource.sourceName 21 | 22 | val METRIC_TOTAL_ROWS_INSERTED = 23 | metricRegistry.counter(counterWithLabels("feature_row_ingested_count")) 24 | 25 | val METRIC_ROWS_LAG = 26 | metricRegistry.histogram(metricWithLabels("feature_row_lag_ms")) 27 | } 28 | 29 | object RedisSinkMetricSource { 30 | val sourceName = "redis_sink" 31 | } 32 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/org/apache/spark/metrics/source/StreamingMetricSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.metrics.source 18 | 19 | import org.apache.spark.metrics.AtomicLongGauge 20 | import org.apache.spark.sql.streaming.StreamingQueryProgress 21 | 22 | import java.time.Instant 23 | 24 | class StreamingMetricSource extends BaseMetricSource { 25 | override val sourceName: String = StreamingMetricSource.sourceName 26 | 27 | private val BATCH_DURATION_GAUGE = 28 | metricRegistry.register(gaugeWithLabels("batch_duration_ms"), new AtomicLongGauge()) 29 | private val PROCESSED_ROWS_PER_SECOND_GAUGE = 30 | metricRegistry.register(gaugeWithLabels("input_rows_per_second"), new AtomicLongGauge()) 31 | private val INPUT_ROWS_PER_SECOND_GAUGE = 32 | metricRegistry.register(gaugeWithLabels("processed_rows_per_second"), new AtomicLongGauge()) 33 | private val LAST_CONSUMED_KAFKA_TIMESTAMP_GAUGE = 34 | metricRegistry.register(gaugeWithLabels("last_consumed_kafka_timestamp"), new AtomicLongGauge()) 35 | private val LAST_PROCESSED_EVENT_TIMESTAMP_GAUGE = 36 | metricRegistry.register( 37 | gaugeWithLabels("last_processed_event_timestamp"), 38 | new AtomicLongGauge() 39 | ) 40 | 41 | def updateStreamingProgress(progress: StreamingQueryProgress): Unit = { 42 | BATCH_DURATION_GAUGE.value.set(progress.batchDuration) 43 | INPUT_ROWS_PER_SECOND_GAUGE.value.set(progress.inputRowsPerSecond.toLong) 44 | PROCESSED_ROWS_PER_SECOND_GAUGE.value.set(progress.processedRowsPerSecond.toLong) 45 | 46 | val epochTimestamp = Instant.parse(progress.timestamp).getEpochSecond 47 | LAST_PROCESSED_EVENT_TIMESTAMP_GAUGE.value.set(epochTimestamp) 48 | } 49 | 50 | def updateKafkaTimestamp(timestamp: Long): Unit = { 51 | LAST_CONSUMED_KAFKA_TIMESTAMP_GAUGE.value.set(timestamp) 52 | } 53 | } 54 | 55 | object StreamingMetricSource { 56 | val sourceName = "streaming" 57 | } 58 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/proto/com/example/source.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package com.example; 4 | 5 | option java_multiple_files = true; 6 | option java_package = "com.example.protos"; 7 | 8 | import "google/protobuf/timestamp.proto"; 9 | 10 | message TestMessage { 11 | int64 s2_id = 1; 12 | VehicleType.Enum vehicle_type = 2; 13 | int64 unique_drivers = 3; 14 | google.protobuf.Timestamp event_timestamp = 4; 15 | } 16 | 17 | message VehicleType { 18 | enum Enum { 19 | UNKNOWN = 0; 20 | CAR = 1; 21 | BIKE = 2; 22 | } 23 | } 24 | 25 | message InnerMessage { 26 | repeated double double = 1; 27 | repeated float float = 2; 28 | repeated int32 integer = 3; 29 | repeated int64 long = 4; 30 | enum Enum { 31 | zero = 0; 32 | one = 1; 33 | } 34 | Enum enum = 5; 35 | } 36 | 37 | message AllTypesMessage { 38 | double double = 1; 39 | float float = 2; 40 | int32 integer = 3; 41 | int64 long = 4; 42 | uint32 uinteger = 5; 43 | uint64 ulong = 6; 44 | sint32 sinteger = 7; 45 | sint64 slong = 8; 46 | fixed32 finteger = 9; 47 | fixed64 flong = 10; 48 | sfixed32 sfinteger = 11; 49 | sfixed64 sflong = 13; 50 | bool bool = 14; 51 | string string = 15; 52 | bytes bytes = 16; 53 | map map = 17; 54 | InnerMessage inner = 18; 55 | 56 | google.protobuf.Timestamp event_timestamp = 19; 57 | } -------------------------------------------------------------------------------- /spark/ingestion/src/test/resources/python/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CURRENT_PATH=$PWD 4 | 
DESTINATION=${DESTINATION:-$CURRENT_PATH} 5 | 6 | # 1. Create libraries (dependencies) package 7 | if [[ -f "$DESTINATION/libs.tar.gz" ]]; then 8 | echo "$DESTINATION/libs.tar.gz exists." 9 | else 10 | tmp_dir=$(mktemp -d) 11 | pip3 install -t ${tmp_dir}/libs great-expectations==0.13.2 pyarrow==2.0.0 Jinja2==3.0.3 12 | cd $tmp_dir && tar -czf libs.tar.gz libs/ && mv libs.tar.gz $DESTINATION/libs.tar.gz 13 | fi 14 | 15 | # 2. Pickle python udf 16 | cd $CURRENT_PATH 17 | pip3 install great-expectations==0.13.2 setuptools pyspark==3.1.3 Jinja2==3.0.3 pyarrow==2.0.0 18 | python3 udf.py $DESTINATION/udf.pickle 19 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/resources/python/udf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from pyspark import cloudpickle 4 | from pyspark.sql.types import BooleanType 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from great_expectations.dataset import PandasDataset 10 | 11 | 12 | def create_suite(): 13 | df = pd.DataFrame() 14 | df['num'] = np.random.randint(0, 10, 100) 15 | df['num2'] = np.random.randint(0, 20, 100) 16 | ds = PandasDataset.from_dataset(df) 17 | 18 | ds.expect_column_values_to_be_between('num', 0, 10) 19 | ds.expect_column_values_to_be_between('num2', 0, 20) 20 | 21 | return ds.get_expectation_suite() 22 | 23 | 24 | def create_validator(suite): 25 | def validate(df) -> pd.DataFrame: 26 | ds = PandasDataset.from_dataset(df) 27 | # print(ds, ds.shape) 28 | result = ds.validate(suite, result_format='COMPLETE') 29 | valid_rows = pd.Series([True] * ds.shape[0]) 30 | # print(result) 31 | for check in result.results: 32 | if check.success: 33 | continue 34 | 35 | valid_rows.iloc[check.result['unexpected_index_list']] = False 36 | return valid_rows 37 | 38 | return validate 39 | 40 | 41 | def main(dest_path): 42 | with open(dest_path, 'wb') as f: 43 | fun = create_validator(create_suite()) 44 | command = (fun, BooleanType()) 45 | cloudpickle.dump(command, f) 46 | 47 | 48 | if __name__ == '__main__': 49 | main(sys.argv[1]) 50 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/resources/stencil/__files/source.desc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/spark/ingestion/src/test/resources/stencil/__files/source.desc -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/com/example/protos/InnerMessageOrBuilder.java: -------------------------------------------------------------------------------- 1 | // Generated by the protocol buffer compiler. DO NOT EDIT! 2 | // source: source.proto 3 | 4 | package com.example.protos; 5 | 6 | public interface InnerMessageOrBuilder extends 7 | // @@protoc_insertion_point(interface_extends:com.example.InnerMessage) 8 | com.google.protobuf.MessageOrBuilder { 9 | 10 | /** 11 | * repeated double double = 1; 12 | * @return A list containing the double. 13 | */ 14 | java.util.List getDoubleList(); 15 | /** 16 | * repeated double double = 1; 17 | * @return The count of double. 18 | */ 19 | int getDoubleCount(); 20 | /** 21 | * repeated double double = 1; 22 | * @param index The index of the element to return. 23 | * @return The double at the given index. 
24 | */ 25 | double getDouble(int index); 26 | 27 | /** 28 | * repeated float float = 2; 29 | * @return A list containing the float. 30 | */ 31 | java.util.List getFloatList(); 32 | /** 33 | * repeated float float = 2; 34 | * @return The count of float. 35 | */ 36 | int getFloatCount(); 37 | /** 38 | * repeated float float = 2; 39 | * @param index The index of the element to return. 40 | * @return The float at the given index. 41 | */ 42 | float getFloat(int index); 43 | 44 | /** 45 | * repeated int32 integer = 3; 46 | * @return A list containing the integer. 47 | */ 48 | java.util.List getIntegerList(); 49 | /** 50 | * repeated int32 integer = 3; 51 | * @return The count of integer. 52 | */ 53 | int getIntegerCount(); 54 | /** 55 | * repeated int32 integer = 3; 56 | * @param index The index of the element to return. 57 | * @return The integer at the given index. 58 | */ 59 | int getInteger(int index); 60 | 61 | /** 62 | * repeated int64 long = 4; 63 | * @return A list containing the long. 64 | */ 65 | java.util.List getLongList(); 66 | /** 67 | * repeated int64 long = 4; 68 | * @return The count of long. 69 | */ 70 | int getLongCount(); 71 | /** 72 | * repeated int64 long = 4; 73 | * @param index The index of the element to return. 74 | * @return The long at the given index. 75 | */ 76 | long getLong(int index); 77 | 78 | /** 79 | * .com.example.InnerMessage.Enum enum = 5; 80 | * @return The enum numeric value on the wire for enum. 81 | */ 82 | int getEnumValue(); 83 | /** 84 | * .com.example.InnerMessage.Enum enum = 5; 85 | * @return The enum. 86 | */ 87 | com.example.protos.InnerMessage.Enum getEnum(); 88 | } 89 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/com/example/protos/TestMessageOrBuilder.java: -------------------------------------------------------------------------------- 1 | // Generated by the protocol buffer compiler. DO NOT EDIT! 2 | // source: source.proto 3 | 4 | package com.example.protos; 5 | 6 | public interface TestMessageOrBuilder extends 7 | // @@protoc_insertion_point(interface_extends:com.example.TestMessage) 8 | com.google.protobuf.MessageOrBuilder { 9 | 10 | /** 11 | * int64 s2_id = 1; 12 | * @return The s2Id. 13 | */ 14 | long getS2Id(); 15 | 16 | /** 17 | * .com.example.VehicleType.Enum vehicle_type = 2; 18 | * @return The enum numeric value on the wire for vehicleType. 19 | */ 20 | int getVehicleTypeValue(); 21 | /** 22 | * .com.example.VehicleType.Enum vehicle_type = 2; 23 | * @return The vehicleType. 24 | */ 25 | com.example.protos.VehicleType.Enum getVehicleType(); 26 | 27 | /** 28 | * int64 unique_drivers = 3; 29 | * @return The uniqueDrivers. 30 | */ 31 | long getUniqueDrivers(); 32 | 33 | /** 34 | * .google.protobuf.Timestamp event_timestamp = 4; 35 | * @return Whether the eventTimestamp field is set. 36 | */ 37 | boolean hasEventTimestamp(); 38 | /** 39 | * .google.protobuf.Timestamp event_timestamp = 4; 40 | * @return The eventTimestamp. 41 | */ 42 | com.google.protobuf.Timestamp getEventTimestamp(); 43 | /** 44 | * .google.protobuf.Timestamp event_timestamp = 4; 45 | */ 46 | com.google.protobuf.TimestampOrBuilder getEventTimestampOrBuilder(); 47 | } 48 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/com/example/protos/VehicleTypeOrBuilder.java: -------------------------------------------------------------------------------- 1 | // Generated by the protocol buffer compiler. DO NOT EDIT! 
2 | // source: source.proto 3 | 4 | package com.example.protos; 5 | 6 | public interface VehicleTypeOrBuilder extends 7 | // @@protoc_insertion_point(interface_extends:com.example.VehicleType) 8 | com.google.protobuf.MessageOrBuilder { 9 | } 10 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/feast/ingestion/SparkSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion 18 | 19 | import org.apache.spark.SparkConf 20 | import org.apache.spark.sql.SparkSession 21 | import org.scalatest.BeforeAndAfter 22 | 23 | class SparkSpec extends UnitSpec with BeforeAndAfter { 24 | System.setProperty("io.netty.tryReflectionSetAccessible", "true") 25 | 26 | var sparkSession: SparkSession = null 27 | def withSparkConfOverrides(conf: SparkConf): SparkConf = conf 28 | 29 | before { 30 | val sparkConf = new SparkConf() 31 | .setMaster("local[4]") 32 | .setAppName("Testing") 33 | .set("spark.driver.bindAddress", "localhost") 34 | .set("spark.default.parallelism", "8") 35 | .set( 36 | "spark.metrics.conf.*.sink.statsd.class", 37 | "org.apache.spark.metrics.sink.StatsdSinkWithTags" 38 | ) 39 | .set("spark.metrics.conf.*.sink.statsd.host", "localhost") 40 | .set("spark.metrics.conf.*.sink.statsd.period", "999") // disable scheduled reporting 41 | .set("spark.metrics.conf.*.sink.statsd.unit", "minutes") 42 | .set("spark.metrics.labels", "job_id=test") 43 | .set("spark.metrics.namespace", "") 44 | .set("spark.sql.legacy.allowUntypedScalaUDF", "true") 45 | .set("spark.sql.execution.arrow.maxRecordsPerBatch", "50000") 46 | 47 | sparkSession = SparkSession 48 | .builder() 49 | .config(withSparkConfOverrides(sparkConf)) 50 | .getOrCreate() 51 | } 52 | 53 | after { 54 | sparkSession.stop() 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/feast/ingestion/UnitSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion 18 | 19 | import org.scalatest.flatspec.AnyFlatSpec 20 | import org.scalatest._ 21 | import matchers._ 22 | 23 | abstract class UnitSpec 24 | extends AnyFlatSpec 25 | with should.Matchers 26 | with OptionValues 27 | with Inside 28 | with Inspectors 29 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/feast/ingestion/helpers/DataHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.helpers 18 | 19 | import java.nio.file.{Files, Paths} 20 | 21 | import org.apache.spark.sql.SparkSession 22 | import org.apache.spark.sql.functions.to_date 23 | import org.joda.time.{DateTime, Seconds} 24 | import org.scalacheck.Gen 25 | 26 | import scala.reflect.runtime.universe.TypeTag 27 | 28 | case class TestRow( 29 | customer: String, 30 | feature1: Int, 31 | feature2: Float, 32 | eventTimestamp: java.sql.Timestamp 33 | ) 34 | 35 | object DataHelper { 36 | def generateRows[A](gen: Gen[A], N: Int): Seq[A] = 37 | Gen.listOfN(N, gen).sample.get 38 | 39 | def generateDistinctRows[A](gen: Gen[A], N: Int, entityFun: A => String): Seq[A] = 40 | generateRows(gen, N).groupBy(entityFun).map(_._2.head).toSeq 41 | 42 | def generateTempPath(last: String): String = 43 | Paths.get(Files.createTempDirectory("test-dir").toString, last).toString 44 | 45 | def storeAsParquet[T <: Product: TypeTag](sparkSession: SparkSession, rows: Seq[T]): String = { 46 | import sparkSession.implicits._ 47 | 48 | val tempPath = generateTempPath("rows") 49 | 50 | sparkSession 51 | .createDataset(rows) 52 | .withColumn("date", to_date($"eventTimestamp")) 53 | .write 54 | .partitionBy("date") 55 | .save(tempPath) 56 | 57 | tempPath 58 | } 59 | 60 | def rowGenerator( 61 | start: DateTime, 62 | end: DateTime, 63 | customerGen: Option[Gen[String]] = None 64 | ): Gen[TestRow] = 65 | for { 66 | customer <- customerGen.getOrElse(Gen.asciiPrintableStr) 67 | feature1 <- Gen.choose(0, 100) 68 | feature2 <- Gen.choose[Float](0, 1) 69 | eventTimestamp <- Gen 70 | .choose(0, Seconds.secondsBetween(start, end).getSeconds - 1) 71 | .map(start.withMillisOfSecond(0).plusSeconds) 72 | } yield TestRow( 73 | customer, 74 | feature1, 75 | feature2, 76 | new java.sql.Timestamp(eventTimestamp.getMillis) 77 | ) 78 | } 79 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/feast/ingestion/helpers/RedisStorageHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.helpers 18 | 19 | import java.nio.charset.StandardCharsets 20 | import java.nio.{ByteBuffer, ByteOrder} 21 | import com.google.protobuf.Timestamp 22 | import feast.ingestion.FeatureTable 23 | import feast.proto.types.ValueProto 24 | import feast.ingestion.utils.TypeConversion._ 25 | import org.scalatest.matchers.Matcher 26 | import org.scalatest.matchers.must.Matchers.contain 27 | import com.google.common.hash.Hashing 28 | 29 | import scala.util.Try 30 | 31 | object RedisStorageHelper { 32 | def encodeFeatureKey(featureTable: FeatureTable)(feature: String): String = { 33 | val fullReference = s"${featureTable.name}:$feature" 34 | murmurHashHexString(fullReference) 35 | } 36 | 37 | def murmurHashHexString(s: String): String = { 38 | Hashing.murmur3_32().hashString(s, StandardCharsets.UTF_8).asInt().toHexString 39 | } 40 | 41 | def beStoredRow(mappedRow: Map[String, Any]): Matcher[Map[Array[Byte], Array[Byte]]] = { 42 | val m: Matcher[Map[String, Any]] = contain.allElementsOf(mappedRow).matcher 43 | 44 | m compose { 45 | (_: Map[Array[Byte], Array[Byte]]) 46 | .map { 47 | case (k, v) if k.sameElements("_ex".getBytes()) => 48 | (new String(k), Timestamp.parseFrom(v).asScala) 49 | 50 | case (k, v) if k.length == 4 => 51 | ( 52 | ByteBuffer.wrap(k).order(ByteOrder.LITTLE_ENDIAN).getInt.toHexString, 53 | Try(ValueProto.Value.parseFrom(v).asScala).getOrElse(Timestamp.parseFrom(v).asScala) 54 | ) 55 | } 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/feast/ingestion/metrics/StatsDStub.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.metrics 18 | 19 | import java.net.{DatagramPacket, DatagramSocket, SocketTimeoutException} 20 | 21 | import scala.collection.mutable.ArrayBuffer 22 | 23 | class StatsDStub { 24 | val socket = new DatagramSocket() 25 | socket.setSoTimeout(100) 26 | 27 | def port: Int = socket.getLocalPort 28 | 29 | def receive: Array[String] = { 30 | val messages: ArrayBuffer[String] = ArrayBuffer() 31 | var finished = false 32 | 33 | do { 34 | val buf = new Array[Byte](65535) 35 | val p = new DatagramPacket(buf, buf.length) 36 | try { 37 | socket.receive(p) 38 | } catch { 39 | case _: SocketTimeoutException => 40 | finished = true 41 | } 42 | messages += new String(p.getData, 0, p.getLength) 43 | } while (!finished) 44 | 45 | messages.toArray 46 | } 47 | 48 | private val metricLine = """(.+):(.+)\|(.+)#(.+)""".r 49 | 50 | def receivedMetrics: Map[String, Float] = { 51 | receive 52 | .flatMap { 53 | case metricLine(name, value, type_, tags) => 54 | Seq(name -> value.toFloat) 55 | case s: String => 56 | Seq() 57 | } 58 | .groupBy(_._1) 59 | .mapValues(_.map(_._2).sum) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/feast/ingestion/metrics/StatsReporterSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.metrics 18 | 19 | import java.util 20 | import java.util.Collections 21 | 22 | import com.codahale.metrics.{Gauge, Histogram, MetricRegistry, UniformReservoir} 23 | import feast.ingestion.UnitSpec 24 | 25 | import scala.jdk.CollectionConverters._ 26 | 27 | class StatsReporterSpec extends UnitSpec { 28 | trait Scope { 29 | val server = new StatsDStub 30 | val reporter = new StatsdReporterWithTags( 31 | new MetricRegistry, 32 | "127.0.0.1", 33 | server.port 34 | ) 35 | 36 | def gauge[A](v: A): Gauge[A] = new Gauge[A] { 37 | override def getValue: A = v 38 | } 39 | 40 | def histogram(values: Seq[Int]): Histogram = { 41 | val hist = new Histogram(new UniformReservoir) 42 | values.foreach(hist.update) 43 | hist 44 | } 45 | } 46 | 47 | "Statsd reporter" should "send simple gauge unmodified" in new Scope { 48 | reporter.report( 49 | gauges = new util.TreeMap( 50 | Map( 51 | "test" -> gauge(0) 52 | ).asJava 53 | ), 54 | counters = Collections.emptySortedMap(), 55 | histograms = Collections.emptySortedMap(), 56 | meters = Collections.emptySortedMap(), 57 | timers = Collections.emptySortedMap() 58 | ) 59 | 60 | server.receive should contain("test:0|g") 61 | } 62 | 63 | "Statsd reporter" should "keep tags part in the message's end" in new Scope { 64 | reporter.report( 65 | gauges = Collections.emptySortedMap(), 66 | counters = Collections.emptySortedMap(), 67 | histograms = new util.TreeMap( 68 | Map( 69 | "prefix.1111.test#fs=name,job=aaa" -> histogram((1 to 100)) 70 | ).asJava 71 | ), 72 | meters = Collections.emptySortedMap(), 73 | timers = Collections.emptySortedMap() 74 | ) 75 | 76 | server.receive should contain("prefix.test.p95:95.95|ms|#fs:name,job:aaa") 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/tests/__init__.py -------------------------------------------------------------------------------- /tests/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/tests/e2e/__init__.py -------------------------------------------------------------------------------- /tests/e2e/fixtures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/tests/e2e/fixtures/__init__.py -------------------------------------------------------------------------------- /tests/e2e/fixtures/base.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | 7 | @pytest.fixture(scope="session") 8 | def project_root(): 9 | # This file is %root%/tests/e2e/fixtures/base.py 10 | return Path(__file__).parent.parent.parent.parent 11 | 12 | 13 | @pytest.fixture(scope="session") 14 | def project_version(pytestconfig, project_root): 15 | if pytestconfig.getoption("feast_version"): 16 | return pytestconfig.getoption("feast_version") 17 | 18 | pom_xml = ET.parse(project_root / "pom.xml") 19 | root = pom_xml.getroot() 20 | return root.find(".properties/revision").text 21 | -------------------------------------------------------------------------------- 
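# Editorial usage sketch (hypothetical test, not a file in this repository): shows how the
# session-scoped fixtures above compose. `project_root` is a pathlib.Path to the repository
# root, and `project_version` prefers the `feast_version` pytest option before falling back
# to the Maven <revision> property parsed from pom.xml.
def test_project_version_resolves(project_root, project_version):
    assert (project_root / "pom.xml").exists()
    assert isinstance(project_version, str) and project_version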
/tests/e2e/fixtures/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from datetime import datetime 4 | 5 | import pytest 6 | from _pytest.fixtures import FixtureRequest 7 | from google.cloud import bigquery 8 | 9 | from feast import BigQuerySource, FileSource 10 | from feast.data_format import ParquetFormat 11 | 12 | __all__ = ("bq_dataset", "batch_source") 13 | 14 | 15 | @pytest.fixture(scope="session") 16 | def bq_dataset(pytestconfig): 17 | client = bigquery.Client(project=pytestconfig.getoption("bq_project")) 18 | timestamp = int(time.time()) 19 | name = f"feast_e2e_{timestamp}" 20 | client.create_dataset(name) 21 | yield name 22 | client.delete_dataset(name, delete_contents=True) 23 | 24 | 25 | @pytest.fixture 26 | def batch_source(local_staging_path: str, pytestconfig, request: FixtureRequest): 27 | if pytestconfig.getoption("env") == "gcloud": 28 | bq_project = pytestconfig.getoption("bq_project") 29 | bq_dataset = request.getfixturevalue("bq_dataset") 30 | return BigQuerySource( 31 | event_timestamp_column="event_timestamp", 32 | created_timestamp_column="created_timestamp", 33 | table_ref=f"{bq_project}:{bq_dataset}.source_{datetime.now():%Y%m%d%H%M%s}", 34 | ) 35 | else: 36 | return FileSource( 37 | event_timestamp_column="event_timestamp", 38 | created_timestamp_column="created_timestamp", 39 | file_format=ParquetFormat(), 40 | file_url=os.path.join(local_staging_path, "transactions"), 41 | ) 42 | -------------------------------------------------------------------------------- /tests/e2e/fixtures/external_services.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pytest_redis.executor import NoopRedis 3 | 4 | from tests.e2e.fixtures.statsd_stub import PrometheusStatsDServer 5 | 6 | __all__ = ( 7 | "feast_core", 8 | "feast_serving", 9 | "redis_server", 10 | "kafka_server", 11 | "enable_auth", 12 | "feast_jobservice", 13 | "statsd_server", 14 | ) 15 | 16 | 17 | @pytest.fixture(scope="session") 18 | def redis_server(pytestconfig): 19 | host, port = pytestconfig.getoption("redis_url").split(":") 20 | return NoopRedis(host, port, None) 21 | 22 | 23 | @pytest.fixture(scope="session") 24 | def feast_core(pytestconfig): 25 | host, port = pytestconfig.getoption("core_url").split(":") 26 | return host, port 27 | 28 | 29 | @pytest.fixture(scope="session") 30 | def feast_serving(pytestconfig): 31 | host, port = pytestconfig.getoption("serving_url").split(":") 32 | return host, port 33 | 34 | 35 | @pytest.fixture(scope="session") 36 | def kafka_server(pytestconfig): 37 | host, port = pytestconfig.getoption("kafka_brokers").split(":") 38 | return host, port 39 | 40 | 41 | @pytest.fixture(scope="session") 42 | def enable_auth(): 43 | return False 44 | 45 | 46 | @pytest.fixture(scope="session") 47 | def feast_jobservice(pytestconfig): 48 | host, port = pytestconfig.getoption("job_service_url").split(":") 49 | return host, port 50 | 51 | 52 | @pytest.fixture(scope="session") 53 | def statsd_server(pytestconfig): 54 | host, port = pytestconfig.getoption("statsd_url").split(":") 55 | prometheus_host, prometheus_port = pytestconfig.getoption("prometheus_url").split( 56 | ":" 57 | ) 58 | return PrometheusStatsDServer(host, port, prometheus_host, prometheus_port) 59 | -------------------------------------------------------------------------------- /tests/e2e/fixtures/services.py: -------------------------------------------------------------------------------- 1 | 
import os 2 | import pathlib 3 | import shutil 4 | 5 | import port_for 6 | import pytest 7 | import requests 8 | from pytest_kafka import make_kafka_server, make_zookeeper_process 9 | from pytest_postgresql import factories as pg_factories 10 | from pytest_redis import factories as redis_factories 11 | 12 | __all__ = ( 13 | "kafka_server", 14 | "kafka_port", 15 | "zookeeper_server", 16 | "postgres_server", 17 | "redis_server", 18 | "statsd_server", 19 | ) 20 | 21 | from tests.e2e.fixtures.statsd_stub import StatsDStub 22 | 23 | 24 | def download_kafka(version="2.12-2.6.0"): 25 | temp_dir = pathlib.Path("/tmp") 26 | local_path = temp_dir / f"kafka_{version}.tgz" 27 | 28 | if not os.path.isfile(local_path): 29 | r = requests.get( 30 | f"https://archive.apache.org/dist/kafka/2.6.0/kafka_{version}.tgz" 31 | ) 32 | 33 | r.raise_for_status() 34 | 35 | with open(local_path, "wb") as f: 36 | f.write(r.content) 37 | 38 | shutil.unpack_archive(str(local_path), str(temp_dir)) 39 | return temp_dir / f"kafka_{version}" / "bin" 40 | 41 | 42 | @pytest.fixture 43 | def kafka_server(kafka_port): 44 | _, port = kafka_port 45 | return "localhost", port 46 | 47 | 48 | @pytest.fixture 49 | def statsd_server(): 50 | port = port_for.select_random(None) 51 | server = StatsDStub(port=port) 52 | server.start() 53 | yield server 54 | server.stop() 55 | 56 | 57 | postgres_server = pg_factories.postgresql_proc(password="password") 58 | redis_server = redis_factories.redis_proc( 59 | executable=shutil.which("redis-server"), timeout=3600 60 | ) 61 | 62 | KAFKA_BIN = download_kafka() 63 | zookeeper_server = make_zookeeper_process( 64 | str(KAFKA_BIN / "zookeeper-server-start.sh"), 65 | zk_config_template=""" 66 | dataDir={zk_data_dir} 67 | clientPort={zk_port} 68 | maxClientCnxns=0 69 | admin.enableServer=false""", 70 | ) 71 | kafka_port = make_kafka_server( 72 | kafka_bin=str(KAFKA_BIN / "kafka-server-start.sh"), 73 | zookeeper_fixture_name="zookeeper_server", 74 | ) 75 | -------------------------------------------------------------------------------- /tests/e2e/test_job_scheduling.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import uuid 3 | 4 | import pytest as pytest 5 | from kubernetes import client, config 6 | 7 | from feast import Client, Entity, Feature, FeatureTable, FileSource, ValueType 8 | from feast.data_format import ParquetFormat 9 | from feast_spark import Client as SparkClient 10 | 11 | 12 | @pytest.mark.env("k8s") 13 | def test_schedule_batch_ingestion_jobs( 14 | pytestconfig, feast_client: Client, feast_spark_client: SparkClient 15 | ): 16 | entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64,) 17 | batch_source = FileSource( 18 | file_format=ParquetFormat(), 19 | file_url="gs://example/feast/*", 20 | event_timestamp_column="datetime_col", 21 | created_timestamp_column="timestamp", 22 | date_partition_column="datetime", 23 | ) 24 | feature_table = FeatureTable( 25 | name=f"schedule_{str(uuid.uuid4())}".replace("-", "_"), 26 | entities=["s2id"], 27 | features=[Feature("unique_drivers", ValueType.INT64)], 28 | batch_source=batch_source, 29 | ) 30 | feast_client.apply(entity) 31 | feast_client.apply(feature_table) 32 | 33 | feast_spark_client.schedule_offline_to_online_ingestion( 34 | feature_table, 1, "0 0 * * *" 35 | ) 36 | config.load_incluster_config() 37 | k8s_api = client.CustomObjectsApi() 38 | 39 | def get_scheduled_spark_application(): 40 | job_hash = hashlib.md5( 41 | 
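            # the test assumes the job service names the ScheduledSparkApplication
            # "feast-<md5 of '<project>-<feature table name>'>" (see resource_name below)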
f"{feast_client.project}-{feature_table.name}".encode() 42 | ).hexdigest() 43 | resource_name = f"feast-{job_hash}" 44 | 45 | return k8s_api.get_namespaced_custom_object( 46 | group="sparkoperator.k8s.io", 47 | version="v1beta2", 48 | namespace=pytestconfig.getoption("k8s_namespace"), 49 | plural="scheduledsparkapplications", 50 | name=resource_name, 51 | ) 52 | 53 | response = get_scheduled_spark_application() 54 | assert response["spec"]["schedule"] == "0 0 * * *" 55 | feast_spark_client.schedule_offline_to_online_ingestion( 56 | feature_table, 1, "1 0 * * *" 57 | ) 58 | response = get_scheduled_spark_application() 59 | assert response["spec"]["schedule"] == "1 0 * * *" 60 | 61 | feast_spark_client.unschedule_offline_to_online_ingestion(feature_table) 62 | -------------------------------------------------------------------------------- /tests/e2e/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/tests/e2e/utils/__init__.py -------------------------------------------------------------------------------- /tests/e2e/utils/common.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from feast import Entity, Feature, FeatureTable, FileSource, KafkaSource, ValueType 4 | from feast.data_format import AvroFormat, ParquetFormat 5 | from feast.wait import wait_retry_backoff 6 | from feast_spark import Client as SparkClient 7 | from feast_spark.pyspark.abc import SparkJobStatus 8 | 9 | 10 | def create_schema(kafka_broker, topic_name, feature_table_name): 11 | entity = Entity(name="key", description="Key", value_type=ValueType.INT64) 12 | feature_table = FeatureTable( 13 | name=feature_table_name, 14 | entities=["key"], 15 | features=[Feature("num", ValueType.INT64), Feature("set", ValueType.STRING)], 16 | batch_source=FileSource( 17 | event_timestamp_column="event_timestamp", 18 | file_format=ParquetFormat(), 19 | file_url="/dev/null", 20 | ), 21 | stream_source=KafkaSource( 22 | event_timestamp_column="event_timestamp", 23 | bootstrap_servers=kafka_broker, 24 | message_format=AvroFormat(avro_schema()), 25 | topic=topic_name, 26 | ), 27 | ) 28 | return entity, feature_table 29 | 30 | 31 | def start_job( 32 | feast_spark_client: SparkClient, feature_table: FeatureTable, pytestconfig 33 | ): 34 | if pytestconfig.getoption("scheduled_streaming_job"): 35 | return 36 | 37 | job = feast_spark_client.start_stream_to_online_ingestion(feature_table) 38 | wait_retry_backoff( 39 | lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 180 40 | ) 41 | return job 42 | 43 | 44 | def stop_job(job, feast_spark_client: SparkClient, feature_table: FeatureTable): 45 | if job: 46 | job.cancel() 47 | else: 48 | feast_spark_client._feast.delete_feature_table(feature_table.name) 49 | 50 | 51 | def avro_schema(): 52 | return json.dumps( 53 | { 54 | "type": "record", 55 | "name": "TestMessage", 56 | "fields": [ 57 | {"name": "key", "type": "long"}, 58 | {"name": "num", "type": "long"}, 59 | {"name": "set", "type": "string"}, 60 | { 61 | "name": "event_timestamp", 62 | "type": {"type": "long", "logicalType": "timestamp-micros"}, 63 | }, 64 | ], 65 | } 66 | ) 67 | -------------------------------------------------------------------------------- /tests/e2e/utils/kafka.py: -------------------------------------------------------------------------------- 1 | import io 2 | from typing import Any, Dict, List, Optional 3 | 4 
| import avro.schema 5 | import pandas as pd 6 | import pytz 7 | from avro.io import BinaryEncoder, DatumWriter 8 | from kafka import KafkaAdminClient, KafkaProducer 9 | 10 | from feast import Client 11 | from feast.wait import wait_retry_backoff 12 | 13 | 14 | def send_avro_record_to_kafka(topic, value, bootstrap_servers, avro_schema_json): 15 | value_schema = avro.schema.parse(avro_schema_json) 16 | 17 | producer = KafkaProducer(bootstrap_servers=bootstrap_servers) 18 | 19 | writer = DatumWriter(value_schema) 20 | bytes_writer = io.BytesIO() 21 | encoder = BinaryEncoder(bytes_writer) 22 | 23 | writer.write(value, encoder) 24 | 25 | try: 26 | producer.send(topic=topic, value=bytes_writer.getvalue()) 27 | except Exception as e: 28 | print( 29 | f"Exception while producing record value - {value} to topic - {topic}: {e}" 30 | ) 31 | else: 32 | print(f"Successfully producing record value - {value} to topic - {topic}") 33 | 34 | producer.flush() 35 | 36 | 37 | def check_consumer_exist(bootstrap_servers, topic_name): 38 | admin = KafkaAdminClient(bootstrap_servers=bootstrap_servers) 39 | consumer_groups = admin.describe_consumer_groups( 40 | group_ids=[ 41 | group_id 42 | for group_id, _ in admin.list_consumer_groups() 43 | if group_id.startswith("spark-kafka-source") 44 | ] 45 | ) 46 | subscriptions = { 47 | subscription 48 | for group in consumer_groups 49 | for member in group.members 50 | if not isinstance(member.member_metadata, bytes) 51 | for subscription in member.member_metadata.subscription 52 | } 53 | return topic_name in subscriptions 54 | 55 | 56 | def ingest_and_retrieve( 57 | feast_client: Client, 58 | df: pd.DataFrame, 59 | topic_name: str, 60 | kafka_broker: str, 61 | avro_schema_json: str, 62 | entity_rows: List[Dict[str, Any]], 63 | feature_names: List[Any], 64 | expected_ingested_count: Optional[int] = None, 65 | ): 66 | expected_ingested_count = expected_ingested_count or df.shape[0] 67 | 68 | for record in df.to_dict("records"): 69 | record["event_timestamp"] = ( 70 | record["event_timestamp"].to_pydatetime().replace(tzinfo=pytz.utc) 71 | ) 72 | 73 | send_avro_record_to_kafka( 74 | topic_name, 75 | record, 76 | bootstrap_servers=kafka_broker, 77 | avro_schema_json=avro_schema_json, 78 | ) 79 | 80 | def get_online_features(): 81 | features = feast_client.get_online_features( 82 | feature_names, entity_rows=entity_rows, 83 | ).to_dict() 84 | out_df = pd.DataFrame.from_dict(features) 85 | return out_df, out_df[feature_names].count().min() >= expected_ingested_count 86 | 87 | ingested = wait_retry_backoff(get_online_features, 180) 88 | return ingested 89 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==6.0.0 2 | pytest-lazy-fixture==0.6.3 3 | pytest-timeout==1.4.2 4 | pytest-ordering==0.6.* 5 | pytest-benchmark==3.2.2 6 | pytest-mock==1.10.4 7 | pytest-ordering==0.6.* 8 | pytest-xdist==2.1.0 9 | pytest-postgresql==2.5.1 10 | pytest-redis==2.0.0 11 | pytest-kafka==0.4.0 12 | deepdiff==4.3.2 13 | kafka-python==2.0.2 14 | great-expectations==0.13.2 15 | Jinja2==3.0.3 16 | pandavro==1.5.* 17 | avro==1.10.0 18 | pyspark==3.1.3 19 | gcsfs 20 | -------------------------------------------------------------------------------- /tests/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | multi_line_output=3 3 | include_trailing_comma=True 4 | force_grid_wrap=0 5 | use_parentheses=True 
6 | line_length=88 7 | known_first_party=feast,feast_serving_server,feast_core_server,feast_spark 8 | default_section=THIRDPARTY 9 | 10 | [flake8] 11 | ignore = E203, E266, E501, W503 12 | max-line-length = 88 13 | max-complexity = 20 14 | select = B,C,E,F,W,T4 15 | 16 | [mypy] 17 | ignore_missing_imports=true --------------------------------------------------------------------------------
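# Editorial sketch (hypothetical, not a file in this repository): one way the e2e helpers
# above could be composed into a streaming round-trip check. The `feast_client`,
# `feast_spark_client`, `kafka_server` and `pytestconfig` arguments are assumed to come from
# the fixtures defined in tests/e2e/fixtures/, and the topic, table and feature names are
# illustrative only.
from datetime import datetime

import pandas as pd

from tests.e2e.utils.common import avro_schema, create_schema, start_job, stop_job
from tests.e2e.utils.kafka import ingest_and_retrieve


def run_streaming_round_trip(feast_client, feast_spark_client, kafka_server, pytestconfig):
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    entity, feature_table = create_schema(kafka_broker, "drivers_topic", "drivers_stream")
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    job = start_job(feast_spark_client, feature_table, pytestconfig)
    try:
        df = pd.DataFrame(
            {
                "key": [1, 2],
                "num": [10, 20],
                "set": ["a", "b"],
                "event_timestamp": [pd.Timestamp(datetime.utcnow())] * 2,
            }
        )
        # Publishes the rows to Kafka as Avro and polls the online store until both
        # keys are retrievable (or the helper's retry budget is exhausted).
        return ingest_and_retrieve(
            feast_client,
            df,
            topic_name="drivers_topic",
            kafka_broker=kafka_broker,
            avro_schema_json=avro_schema(),
            entity_rows=[{"key": 1}, {"key": 2}],
            feature_names=["drivers_stream:num", "drivers_stream:set"],
        )
    finally:
        stop_job(job, feast_spark_client, feature_table)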