├── .github ├── pull_request_template.md └── workflows │ ├── master_only.yml │ ├── mirror.yml │ ├── pr.yml │ ├── pr_full_access.yml │ └── release.yml ├── .gitignore ├── .prow.yaml ├── .prow ├── config.yaml └── plugins.yaml ├── .readthedocs.yml ├── .scalafmt.conf ├── CHANGELOG.md ├── Makefile ├── OWNERS ├── README.md ├── infra ├── charts │ └── feast-spark │ │ ├── .helmignore │ │ ├── Chart.yaml │ │ ├── README.md │ │ ├── README.md.gotmpl │ │ ├── charts │ │ ├── feast-jobservice │ │ │ ├── Chart.yaml │ │ │ ├── README.md │ │ │ ├── templates │ │ │ │ ├── _helpers.tpl │ │ │ │ ├── configmap.yaml │ │ │ │ ├── deployment.yaml │ │ │ │ └── service.yaml │ │ │ └── values.yaml │ │ └── prometheus-statsd-exporter │ │ │ ├── .helmignore │ │ │ ├── Chart.yaml │ │ │ ├── README.md │ │ │ ├── templates │ │ │ ├── NOTES.txt │ │ │ ├── _helpers.tpl │ │ │ ├── config.yaml │ │ │ ├── deployment.yaml │ │ │ ├── pvc.yaml │ │ │ ├── service.yaml │ │ │ └── serviceaccount.yaml │ │ │ └── values.yaml │ │ ├── requirements.lock │ │ ├── requirements.yaml │ │ └── values.yaml ├── codebuild_runner.py ├── docker │ ├── jobservice │ │ └── Dockerfile │ ├── spark │ │ ├── Dockerfile │ │ └── dev.Dockerfile │ └── tests │ │ └── Dockerfile └── scripts │ ├── aws-runner.sh │ ├── build-ingestion-py-dependencies.sh │ ├── codebuild-entrypoint.sh │ ├── codebuild_runner.py │ ├── download-maven-cache.sh │ ├── helm │ ├── k8s-jobservice.tpl.yaml │ ├── kafka-values.tpl.yaml │ └── redis-cluster-values.tpl.yaml │ ├── install-google-cloud-sdk.sh │ ├── install-helm.sh │ ├── k8s-common-functions.sh │ ├── publish-docker-image.sh │ ├── publish-java-sdk.sh │ ├── publish-python-sdk.sh │ ├── push-helm-charts.sh │ ├── run-minikube-test.sh │ ├── setup-common-functions.sh │ ├── setup-e2e-env-aws.sh │ ├── setup-e2e-env-gcp.sh │ ├── setup-e2e-env-sparkop.sh │ ├── setup-e2e-local.sh │ ├── test-core-ingestion.sh │ ├── test-docker-compose.sh │ ├── test-end-to-end-local.sh │ ├── test-end-to-end-sparkop.sh │ ├── test-golang-sdk.sh │ ├── test-integration.sh │ ├── test-java-sdk.sh │ ├── test-load.sh │ ├── test-python-sdk.sh │ ├── test-serving.sh │ ├── test_job.yaml │ ├── validate-helm-chart-versions.sh │ ├── validate-version-consistency.sh │ └── wait-for-it.sh ├── pom.xml ├── protos ├── feast │ ├── core │ │ ├── CoreService.proto │ │ ├── DataFormat.proto │ │ ├── DataSource.proto │ │ ├── Entity.proto │ │ ├── Feature.proto │ │ ├── FeatureTable.proto │ │ ├── JobService.proto │ │ └── Store.proto │ ├── serving │ │ └── ServingService.proto │ ├── storage │ │ └── Redis.proto │ ├── third_party │ │ └── grpc │ │ │ └── health │ │ │ └── v1 │ │ │ └── HealthService.proto │ └── types │ │ ├── Field.proto │ │ └── Value.proto └── feast_spark │ ├── api │ └── JobService.proto │ └── third_party │ └── grpc │ └── health │ └── v1 │ └── HealthService.proto ├── python ├── docs │ ├── Makefile │ └── source │ │ ├── conf.py │ │ ├── feast_spark.api.rst │ │ ├── feast_spark.contrib.rst │ │ ├── feast_spark.contrib.validation.rst │ │ ├── feast_spark.pyspark.launchers.aws.rst │ │ ├── feast_spark.pyspark.launchers.gcloud.rst │ │ ├── feast_spark.pyspark.launchers.k8s.rst │ │ ├── feast_spark.pyspark.launchers.rst │ │ ├── feast_spark.pyspark.launchers.standalone.rst │ │ ├── feast_spark.pyspark.rst │ │ ├── feast_spark.rst │ │ ├── feast_spark.third_party.grpc.health.rst │ │ ├── feast_spark.third_party.grpc.health.v1.rst │ │ ├── feast_spark.third_party.grpc.rst │ │ ├── feast_spark.third_party.rst │ │ ├── index.rst │ │ └── modules.rst ├── feast_spark │ ├── __init__.py │ ├── api │ │ └── __init__.py │ ├── cli.py │ ├── client.py │ 
├── constants.py │ ├── contrib │ │ ├── __init__.py │ │ └── validation │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── ge.py │ ├── job_service.py │ ├── lock_manager.py │ ├── metrics.py │ ├── pyspark │ │ ├── __init__.py │ │ ├── abc.py │ │ ├── historical_feature_retrieval_job.py │ │ ├── launcher.py │ │ └── launchers │ │ │ ├── __init__.py │ │ │ ├── aws │ │ │ ├── __init__.py │ │ │ ├── emr.py │ │ │ └── emr_utils.py │ │ │ ├── gcloud │ │ │ ├── __init__.py │ │ │ └── dataproc.py │ │ │ ├── k8s │ │ │ ├── __init__.py │ │ │ ├── k8s.py │ │ │ └── k8s_utils.py │ │ │ └── standalone │ │ │ ├── __init__.py │ │ │ └── local.py │ ├── remote_job.py │ └── third_party │ │ ├── __init__.py │ │ └── grpc │ │ ├── __init__.py │ │ └── health │ │ ├── __init__.py │ │ └── v1 │ │ └── __init__.py ├── pyproject.toml ├── requirements-ci.txt ├── setup.cfg ├── setup.py └── tests │ ├── __init__.py │ ├── data │ ├── bookings.csv │ ├── column_mapping_test_entity.csv │ ├── column_mapping_test_feature.csv │ ├── customer_driver_pairs.csv │ ├── customers.csv │ ├── single_customer.csv │ └── transactions.csv │ ├── test_historical_feature_retrieval.py │ ├── test_launcher_abc.py │ ├── test_lock_manager.py │ └── test_streaming_job_scheduling.py ├── spark └── ingestion │ ├── pom.xml │ └── src │ ├── main │ ├── resources │ │ └── log4j.properties │ └── scala │ │ ├── feast │ │ └── ingestion │ │ │ ├── BasePipeline.scala │ │ │ ├── BatchPipeline.scala │ │ │ ├── IngestionJob.scala │ │ │ ├── IngestionJobConfig.scala │ │ │ ├── StreamingPipeline.scala │ │ │ ├── metrics │ │ │ ├── IngestionPipelineMetrics.scala │ │ │ ├── StatsdReporterWithTags.scala │ │ │ └── StreamingMetrics.scala │ │ │ ├── registry │ │ │ └── proto │ │ │ │ ├── LocalProtoRegistry.scala │ │ │ │ ├── ProtoRegistry.scala │ │ │ │ ├── ProtoRegistryFactory.scala │ │ │ │ └── StencilProtoRegistry.scala │ │ │ ├── sources │ │ │ ├── bq │ │ │ │ └── BigQueryReader.scala │ │ │ └── file │ │ │ │ └── FileReader.scala │ │ │ ├── stores │ │ │ ├── bigtable │ │ │ │ ├── BigTableSinkRelation.scala │ │ │ │ ├── DefaultSource.scala │ │ │ │ └── SparkBigtableConfig.scala │ │ │ ├── cassandra │ │ │ │ ├── CassandraSinkRelation.scala │ │ │ │ ├── DefaultSource.scala │ │ │ │ └── SparkCassandraConfig.scala │ │ │ ├── redis │ │ │ │ ├── ClusterPipelineProvider.scala │ │ │ │ ├── DefaultSource.scala │ │ │ │ ├── HashTypePersistence.scala │ │ │ │ ├── Persistence.scala │ │ │ │ ├── PipelineProvider.scala │ │ │ │ ├── PipelineProviderFactory.scala │ │ │ │ ├── RedisEndpoint.scala │ │ │ │ ├── RedisSinkRelation.scala │ │ │ │ ├── SingleNodePipelineProvider.scala │ │ │ │ └── SparkRedisConfig.scala │ │ │ └── serialization │ │ │ │ ├── AvroSerializer.scala │ │ │ │ └── Serializer.scala │ │ │ ├── utils │ │ │ ├── JsonUtils.scala │ │ │ ├── ProtoReflection.scala │ │ │ ├── StringUtils.scala │ │ │ ├── TypeConversion.scala │ │ │ └── testing │ │ │ │ └── MemoryStreamingSource.scala │ │ │ └── validation │ │ │ ├── Expectation.scala │ │ │ ├── RowValidator.scala │ │ │ └── TypeCheck.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ ├── api │ │ └── python │ │ │ └── DynamicPythonFunction.scala │ │ └── metrics │ │ ├── AtomicGauge.scala │ │ ├── sink │ │ └── StatsdSinkWithTags.scala │ │ └── source │ │ ├── BaseMetricSource.scala │ │ ├── BigTableSinkMetricSource.scala │ │ ├── IngestionPipelineMetricSource.scala │ │ ├── RedisSinkMetricSource.scala │ │ └── StreamingMetricSource.scala │ └── test │ ├── proto │ └── com │ │ └── example │ │ └── source.proto │ ├── resources │ ├── python │ │ ├── setup.sh │ │ └── udf.py │ └── stencil │ │ └── __files │ │ └── source.desc │ └── 
scala │ ├── com │ └── example │ │ └── protos │ │ ├── AllTypesMessage.java │ │ ├── AllTypesMessageOrBuilder.java │ │ ├── InnerMessage.java │ │ ├── InnerMessageOrBuilder.java │ │ ├── Source.java │ │ ├── TestMessage.java │ │ ├── TestMessageOrBuilder.java │ │ ├── VehicleType.java │ │ └── VehicleTypeOrBuilder.java │ └── feast │ └── ingestion │ ├── BatchPipelineIT.scala │ ├── BigTableIngestionSpec.scala │ ├── CassandraIngestionSpec.scala │ ├── PandasUDF.scala │ ├── RowValidatorTest.scala │ ├── SparkSpec.scala │ ├── StreamingPipelineIT.scala │ ├── UnitSpec.scala │ ├── helpers │ ├── DataHelper.scala │ └── RedisStorageHelper.scala │ ├── metrics │ ├── StatsDStub.scala │ └── StatsReporterSpec.scala │ └── registry │ └── StencilSpec.scala └── tests ├── README.md ├── __init__.py ├── e2e ├── __init__.py ├── conftest.py ├── fixtures │ ├── __init__.py │ ├── base.py │ ├── client.py │ ├── data.py │ ├── external_services.py │ ├── feast_services.py │ ├── services.py │ └── statsd_stub.py ├── test_historical_features.py ├── test_job_scheduling.py ├── test_online_features.py ├── test_register.py ├── test_validation.py └── utils │ ├── __init__.py │ ├── common.py │ └── kafka.py ├── requirements.txt └── setup.cfg /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 10 | 11 | **What this PR does / why we need it**: 12 | 13 | **Which issue(s) this PR fixes**: 14 | 18 | Fixes # 19 | 20 | **Does this PR introduce a user-facing change?**: 21 | 29 | ```release-note 30 | 31 | ``` 32 | -------------------------------------------------------------------------------- /.github/workflows/mirror.yml: -------------------------------------------------------------------------------- 1 | name: mirror 2 | 3 | on: 4 | push: 5 | branches: master 6 | tags: 7 | - 'v*.*.*' 8 | 9 | jobs: 10 | mirror: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | with: 15 | fetch-depth: 0 16 | - uses: webfactory/ssh-agent@v0.4.1 17 | with: 18 | ssh-private-key: ${{ secrets.MIRROR_SSH_KEY }} 19 | - name: Mirror all origin branches and tags to internal repo 20 | run: | 21 | export GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" 22 | git remote add internal ${{ secrets.INTERNAL_REPO }} 23 | git push internal --all -f 24 | git push internal --tags -f -------------------------------------------------------------------------------- /.github/workflows/pr.yml: -------------------------------------------------------------------------------- 1 | name: pull request 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | lint-java: 7 | container: gcr.io/kf-feast/feast-ci:latest 8 | runs-on: [ubuntu-latest] 9 | steps: 10 | - uses: actions/checkout@v2 11 | - name: Lint java 12 | run: make lint-java 13 | 14 | test-java: 15 | runs-on: ubuntu-latest 16 | needs: lint-java 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up JDK 11 20 | uses: actions/setup-java@v1 21 | with: 22 | java-version: '11' 23 | java-package: jdk 24 | architecture: x64 25 | - uses: actions/setup-python@v2 26 | with: 27 | python-version: '3.8' 28 | architecture: 'x64' 29 | - uses: actions/cache@v2 30 | with: 31 | path: ~/.m2/repository 32 | key: ${{ runner.os }}-ut-maven-${{ hashFiles('**/pom.xml') }} 33 | restore-keys: | 34 | ${{ runner.os }}-ut-maven- 35 | - name: Test java 36 | run: make test-java 37 | 38 | lint-python: 39 | container: python:3.8 40 | runs-on: [ubuntu-latest] 41 | steps: 42 | - uses: actions/checkout@v2 43 | - name: Install dependencies 44 | run: make 
install-python-ci-dependencies 45 | - name: Lint python 46 | run: make lint-python 47 | 48 | unit-test-python: 49 | runs-on: ubuntu-latest 50 | needs: lint-python 51 | env: 52 | PYSPARK_PYTHON: python3.8 53 | steps: 54 | - uses: actions/checkout@v2 55 | - name: Set up JDK 11 56 | uses: actions/setup-java@v1 57 | with: 58 | java-version: '11' 59 | java-package: jdk 60 | architecture: x64 61 | - uses: actions/setup-python@v2 62 | with: 63 | python-version: '3.8' 64 | architecture: 'x64' 65 | - name: Install python 66 | run: make install-python 67 | - name: Test python 68 | run: make test-python 69 | -------------------------------------------------------------------------------- /.github/workflows/pr_full_access.yml: -------------------------------------------------------------------------------- 1 | # contains additional jobs to run for 'complete' workflow that involve secrets 2 | name: pull request (full access) 3 | 4 | on: 5 | # 'pull_request_target' required to make secrets available for jobs 6 | pull_request_target: 7 | types: 8 | - opened 9 | - synchronize 10 | - labeled 11 | 12 | jobs: 13 | # all jobs should have a if check for 'ok-to-test' label in order to be gated by the label. 14 | # otherwise secrets might be unintentionally exposed to malicious forks. 15 | build-push-docker-images-for-e2e-tests: 16 | if: contains(github.event.pull_request.labels.*.name, 'ok-to-test') || contains(github.event.pull_request.labels.*.name, 'approved') 17 | runs-on: [ubuntu-latest] 18 | strategy: 19 | matrix: 20 | component: [jobservice, spark] 21 | env: 22 | GITHUB_PR_SHA: ${{ github.event.pull_request.head.sha }} 23 | REGISTRY: gcr.io/kf-feast 24 | MAVEN_CACHE: gs://feast-templocation-kf-feast/.m2.2020-08-19.tar 25 | steps: 26 | - uses: actions/checkout@v2 27 | with: 28 | # pull_request_target runs the workflow in the context of the base repo 29 | # as such actions/checkout needs to be explicit configured to retrieve 30 | # code from the PR. 
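        # Note: merge_commit_sha points at GitHub's test merge of the PR into the base
        # branch, so the image is built from the post-merge state rather than the raw PR head.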
31 | ref: ${{ github.event.pull_request.merge_commit_sha }} 32 | - uses: docker/setup-qemu-action@v1 33 | - name: Set up Docker Buildx 34 | uses: docker/setup-buildx-action@v1 35 | - uses: google-github-actions/setup-gcloud@v0 36 | with: 37 | version: '290.0.1' 38 | export_default_credentials: true 39 | project_id: ${{ secrets.GCP_PROJECT_ID }} 40 | service_account_key: ${{ secrets.GCP_SA_KEY }} 41 | - run: gcloud auth configure-docker --quiet 42 | - name: Get m2 cache 43 | run: | 44 | infra/scripts/download-maven-cache.sh \ 45 | --archive-uri ${MAVEN_CACHE} \ 46 | --output-dir $HOME 47 | - name: Build image 48 | run: make build-${{ matrix.component }}-docker REGISTRY=${REGISTRY} VERSION=${GITHUB_PR_SHA} 49 | - name: Push image 50 | run: | 51 | docker push ${REGISTRY}/feast-${{ matrix.component }}:${GITHUB_PR_SHA} 52 | -------------------------------------------------------------------------------- /.prow.yaml: -------------------------------------------------------------------------------- 1 | presubmits: 2 | - name: test-end-to-end-sparkop 3 | decorate: true 4 | always_run: true 5 | max_concurrency: 1 6 | spec: 7 | metadata: 8 | namespace: sparkop-e2e 9 | containers: 10 | - image: gcr.io/kf-feast/feast-ci:latest 11 | command: [ "infra/scripts/test-end-to-end-sparkop.sh"] 12 | resources: 13 | requests: 14 | cpu: "2" 15 | memory: "2Gi" 16 | env: 17 | - name: GOOGLE_APPLICATION_CREDENTIALS 18 | value: /etc/gcloud/service-account.json 19 | - name: DOCKER_REPOSITORY 20 | value: gcr.io/kf-feast 21 | volumeMounts: 22 | - mountPath: /etc/gcloud/service-account.json 23 | name: service-account 24 | readOnly: true 25 | subPath: service-account.json 26 | volumes: 27 | - name: service-account 28 | secret: 29 | secretName: feast-service-account 30 | 31 | postsubmits: 32 | - name: test-end-to-end-sparkop 33 | decorate: true 34 | always_run: true 35 | max_concurrency: 1 36 | branches: 37 | - ^master$ 38 | spec: 39 | metadata: 40 | namespace: sparkop-e2e 41 | containers: 42 | - image: gcr.io/kf-feast/feast-ci:latest 43 | command: [ "infra/scripts/test-end-to-end-sparkop.sh"] 44 | resources: 45 | requests: 46 | cpu: "2" 47 | memory: "2048Mi" 48 | env: 49 | - name: GOOGLE_APPLICATION_CREDENTIALS 50 | value: /etc/gcloud/service-account.json 51 | - name: DOCKER_REPOSITORY 52 | value: gcr.io/kf-feast 53 | volumeMounts: 54 | - mountPath: /etc/gcloud/service-account.json 55 | name: service-account 56 | readOnly: true 57 | subPath: service-account.json 58 | volumes: 59 | - name: service-account 60 | secret: 61 | secretName: feast-service-account 62 | -------------------------------------------------------------------------------- /.prow/config.yaml: -------------------------------------------------------------------------------- 1 | prowjob_namespace: prow 2 | pod_namespace: test-pods 3 | 4 | in_repo_config: 5 | enabled: 6 | "*": true 7 | allowed_clusters: 8 | "*": ["default"] 9 | 10 | plank: 11 | job_url_prefix_config: 12 | "*": https://prow.feast.dev/view/gcs 13 | pod_pending_timeout: 60m 14 | report_templates: 15 | '*': >- 16 | [Full PR test history](https://prow.feast.dev/pr-history?org={{.Spec.Refs.Org}}&repo={{.Spec.Refs.Repo}}&pr={{with index .Spec.Refs.Pulls 0}}{{.Number}}{{end}}). 17 | [Your PR dashboard](https://prow.feast.dev/pr?query=is:pr+state:open+author:{{with 18 | index .Spec.Refs.Pulls 0}}{{.Author}}{{end}}). 
19 | default_decoration_configs: 20 | "*": 21 | timeout: 1h 22 | grace_period: 15s 23 | gcs_configuration: 24 | bucket: gs://feast-prow-artifacts 25 | path_strategy: explicit 26 | gcs_credentials_secret: gcs-credentials 27 | utility_images: 28 | clonerefs: gcr.io/k8s-prow/clonerefs:v20201112-00537d1bb4 29 | entrypoint: gcr.io/k8s-prow/entrypoint:v20201112-00537d1bb4 30 | initupload: gcr.io/k8s-prow/initupload:v20201112-00537d1bb4 31 | sidecar: gcr.io/k8s-prow/sidecar:v20201112-00537d1bb4 32 | 33 | deck: 34 | tide_update_period: 1s 35 | spyglass: 36 | size_limit: 10e+6 # 10MB 37 | lenses: 38 | - lens: 39 | name: metadata 40 | required_files: 41 | - started.json|finished.json 42 | - lens: 43 | name: buildlog 44 | required_files: 45 | - build-log.txt 46 | - lens: 47 | name: junit 48 | required_files: 49 | - artifacts/.*\.xml 50 | 51 | tide: 52 | queries: 53 | - repos: 54 | - feast-dev/feast 55 | - feast-dev/feast-spark 56 | labels: 57 | - lgtm 58 | - approved 59 | missingLabels: 60 | - do-not-merge 61 | - do-not-merge/hold 62 | - do-not-merge/invalid-owners-file 63 | - do-not-merge/work-in-progress 64 | - needs-rebase 65 | - needs-kind 66 | merge_method: 67 | feast-dev/feast: squash 68 | feast-dev/feast-spark: squash 69 | blocker_label: merge-blocker 70 | squash_label: tide/squash 71 | 72 | # presubmits and postsubmits configure ProwJobs: 73 | # https://github.com/kubernetes/test-infra/blob/6571843b1aa7bd6cf577a7a8b9e9971241f424d5/prow/jobs.md 74 | -------------------------------------------------------------------------------- /.prow/plugins.yaml: -------------------------------------------------------------------------------- 1 | plugins: 2 | feast-dev/feast-spark: 3 | - approve 4 | - assign 5 | - help 6 | - hold 7 | - label 8 | - lgtm 9 | - lifecycle 10 | - size 11 | - verify-owners 12 | - wip 13 | - trigger 14 | - config-updater 15 | - require-matching-label 16 | - release-note 17 | 18 | config_updater: 19 | maps: 20 | .prow/config.yaml: 21 | name: config 22 | 23 | external_plugins: 24 | feast-dev/feast-spark: 25 | - name: needs-rebase 26 | events: 27 | - pull_request 28 | 29 | require_matching_label: 30 | - missing_label: needs-kind 31 | org: feast-dev 32 | repo: feast 33 | prs: true 34 | regexp: ^kind/ 35 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | configuration: python/docs/source/conf.py 5 | 6 | formats: 7 | - pdf 8 | 9 | python: 10 | version: 3.7 11 | install: 12 | - requirements: python/requirements-ci.txt 13 | - path: python/ 14 | method: setuptools -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | align.preset = more 2 | maxColumn = 100 -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | MVN := mvn ${MAVEN_EXTRA_OPTS} 2 | ROOT_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 3 | 4 | PROTO_TYPE_SUBDIRS = api 5 | PROTO_SERVICE_SUBDIRS = api 6 | 7 | # Make sure env vars are available to submakes 8 | export 9 | 10 | # Java 11 | 12 | format-java: 13 | cd spark/ingestion && ${MVN} spotless:apply 14 | 15 | lint-java: 16 | cd spark/ingestion && ${MVN} --no-transfer-progress spotless:check 17 | 18 | test-java: 19 | ${MVN} --no-transfer-progress 
clean verify 20 | 21 | # Python 22 | 23 | format-python: 24 | # Sort 25 | cd ${ROOT_DIR}/python ; isort feast_spark/ 26 | #cd ${ROOT_DIR}/tests/e2e; isort . 27 | 28 | # Format 29 | cd ${ROOT_DIR}/python; black --target-version py37 feast_spark 30 | #cd ${ROOT_DIR}/tests/e2e; black --target-version py37 . 31 | 32 | install-python-ci-dependencies: 33 | pip install -U --no-cache-dir -r python/requirements-ci.txt 34 | 35 | # Supports feast-dev repo master branch 36 | install-python: install-python-ci-dependencies 37 | pip install -e python 38 | 39 | lint-python: 40 | cd ${ROOT_DIR}/python ; mypy feast_spark/ tests/ 41 | cd ${ROOT_DIR}/python ; isort feast_spark/ tests/ --check-only 42 | cd ${ROOT_DIR}/python ; flake8 feast_spark/ tests/ 43 | cd ${ROOT_DIR}/python ; black --check feast_spark tests 44 | cd ${ROOT_DIR}/tests; mypy e2e 45 | cd ${ROOT_DIR}/tests; isort e2e --check-only 46 | cd ${ROOT_DIR}/tests; flake8 e2e 47 | cd ${ROOT_DIR}/tests; black --check e2e 48 | 49 | test-python: 50 | pytest --verbose --color=yes python/tests 51 | 52 | build-local-test-docker: 53 | docker build -t feast:local -f infra/docker/tests/Dockerfile . 54 | 55 | build-ingestion-jar-no-tests: 56 | cd spark/ingestion && ${MVN} --no-transfer-progress -Dmaven.javadoc.skip=true -Dgpg.skip -DskipUTs=true -DskipITs=true -Drevision=${REVISION} clean package 57 | 58 | build-jobservice-docker: 59 | docker build -t $(REGISTRY)/feast-jobservice:$(VERSION) -f infra/docker/jobservice/Dockerfile . 60 | 61 | push-jobservice-docker: 62 | docker push $(REGISTRY)/feast-jobservice:$(VERSION) 63 | 64 | build-spark-docker: 65 | docker build -t $(REGISTRY)/feast-spark:$(VERSION) --build-arg VERSION=$(VERSION) -f infra/docker/spark/Dockerfile . 66 | 67 | build-spark-docker-dev: 68 | docker build -t $(REGISTRY)/feast-spark:$(VERSION) --build-arg VERSION=$(VERSION) -f infra/docker/spark/dev.Dockerfile . 
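# Example invocations for the docker image targets above; the registry and version
# values are placeholders for illustration, not project defaults:
#   make build-spark-docker REGISTRY=gcr.io/my-registry VERSION=dev
#   make push-spark-docker REGISTRY=gcr.io/my-registry VERSION=dev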
69 | 
70 | push-spark-docker:
71 | 	docker push $(REGISTRY)/feast-spark:$(VERSION)
72 | 
73 | install-ci-dependencies: install-python-ci-dependencies
74 | 
--------------------------------------------------------------------------------
/OWNERS:
--------------------------------------------------------------------------------
1 | approvers:
2 | - khorshuheng
3 | - pyalex
4 | - woop
5 | reviewers:
6 | - khorshuheng
7 | - pyalex
8 | - woop
9 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Feast Spark
2 | 
3 | Contains
4 | * Spark ingestion jobs for [Feast](https://github.com/feast-dev/feast) versions 0.9 and below
5 | * Feast Job Service
6 | * Feast Python SDK Spark extensions
7 | 
8 | Usage:
9 | 
10 | ```python
11 | 
12 | import feast_spark
13 | import feast
14 | from feast import Feature, ValueType
15 | from feast.data_format import ParquetFormat, ProtoFormat
16 | 
17 | client = feast.Client()
18 | 
19 | client.set_project("project1")
20 | entity = feast.Entity(
21 |     name="driver_car_id",
22 |     description="Car driver id",
23 |     value_type=ValueType.STRING,
24 |     labels={"team": "matchmaking"},
25 | )
26 | 
27 | # Create Feature Tables using Feast SDK
28 | batch_source = feast.FileSource(
29 |     file_format=ParquetFormat(),
30 |     file_url="file://feast/*",
31 |     event_timestamp_column="ts_col",
32 |     created_timestamp_column="timestamp",
33 |     date_partition_column="date_partition_col",
34 | )
35 | 
36 | stream_source = feast.KafkaSource(
37 |     bootstrap_servers="localhost:9094",
38 |     message_format=ProtoFormat("class.path"),
39 |     topic="test_topic",
40 |     event_timestamp_column="ts_col",
41 | )
42 | 
43 | ft = feast.FeatureTable(
44 |     name="my-feature-table-1",
45 |     features=[
46 |         Feature(name="fs1-my-feature-1", dtype=ValueType.INT64),
47 |         Feature(name="fs1-my-feature-2", dtype=ValueType.STRING),
48 |         Feature(name="fs1-my-feature-3", dtype=ValueType.STRING_LIST),
49 |         Feature(name="fs1-my-feature-4", dtype=ValueType.BYTES_LIST),
50 |     ],
51 |     entities=["driver_car_id"],
52 |     labels={"team": "matchmaking"},
53 |     batch_source=batch_source,
54 |     stream_source=stream_source,
55 | )
56 | 
57 | # Register objects in Feast
58 | client.apply(entity, ft)
59 | 
60 | # Start a Spark streaming ingestion job that reads from Kafka and writes to the online store
61 | feast_spark.Client(client).start_stream_to_online_ingestion(ft)
62 | ```
63 | 
--------------------------------------------------------------------------------
/infra/charts/feast-spark/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj 22 | .vscode/ 23 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | description: Feast Extension for running Ingestion on Spark 3 | name: feast-spark 4 | version: 0.2.29 5 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/README.md: -------------------------------------------------------------------------------- 1 | # feast-spark 2 | 3 | Feast Extension for running Ingestion on Spark 0.2.24 4 | 5 | ## Installation 6 | 7 | https://docs.feast.dev/v/master/getting-started/deploying-feast/kubernetes 8 | 9 | ## Requirements 10 | 11 | | Repository | Name | Version | 12 | |------------|------|---------| 13 | | | feast-jobservice | 0.2.24 | 14 | | | prometheus-statsd-exporter | 0.1.2 | 15 | 16 | ## Values 17 | 18 | | Key | Type | Default | Description | 19 | |-----|------|---------|-------------| 20 | | feast-jobservice.enabled | bool | `true` | Flag to install Feast Job Service | 21 | 22 | ### Documentation development 23 | 24 | This `README.md` is generated using [helm-docs](https://github.com/norwoodj/helm-docs/). 25 | Please run `helm-docs` to regenerate the `README.md` every time `README.md.gotmpl` 26 | or `values.yaml` are updated. 27 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/README.md.gotmpl: -------------------------------------------------------------------------------- 1 | {{ template "chart.header" . }} 2 | 3 | {{ template "chart.description" . }} {{ template "chart.version" . }} 4 | 5 | ## Installation 6 | 7 | https://docs.feast.dev/v/master/getting-started/deploying-feast/kubernetes 8 | 9 | {{ template "chart.requirementsSection" . }} 10 | 11 | {{ template "chart.valuesSection" . }} 12 | 13 | 14 | ### Documentation development 15 | 16 | This `README.md` is generated using [helm-docs](https://github.com/norwoodj/helm-docs/). 17 | Please run `helm-docs` to regenerate the `README.md` every time `README.md.gotmpl` 18 | or `values.yaml` are updated. 19 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/feast-jobservice/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | description: Feast Job Service manage ingestion jobs. 3 | name: feast-jobservice 4 | version: 0.2.24 5 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/feast-jobservice/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "feast-jobservice.name" -}} 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create a default fully qualified app name. 11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 12 | If release name contains chart name it will be used as a full name. 
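Example (hypothetical values): release "feast" with chart "feast-jobservice" renders
"feast-feast-jobservice", while a release already named "feast-jobservice-prod" contains
the chart name and is used as-is (truncated at 63 characters).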
13 | */}} 14 | {{- define "feast-jobservice.fullname" -}} 15 | {{- if .Values.fullnameOverride -}} 16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 17 | {{- else -}} 18 | {{- $name := default .Chart.Name .Values.nameOverride -}} 19 | {{- if contains $name .Release.Name -}} 20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 21 | {{- else -}} 22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 23 | {{- end -}} 24 | {{- end -}} 25 | {{- end -}} 26 | 27 | {{/* 28 | Create chart name and version as used by the chart label. 29 | */}} 30 | {{- define "feast-jobservice.chart" -}} 31 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 32 | {{- end -}} 33 | 34 | {{/* 35 | Common labels 36 | */}} 37 | {{- define "feast-jobservice.labels" -}} 38 | app.kubernetes.io/name: {{ include "feast-jobservice.name" . }} 39 | helm.sh/chart: {{ include "feast-jobservice.chart" . }} 40 | app.kubernetes.io/instance: {{ .Release.Name }} 41 | {{- if .Chart.AppVersion }} 42 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 43 | {{- end }} 44 | app.kubernetes.io/managed-by: {{ .Release.Service }} 45 | {{- end -}} 46 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/feast-jobservice/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.sparkOperator.enabled }} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ template "feast-jobservice.fullname" . }}-spark-template 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | app: {{ template "feast-jobservice.name" . }} 9 | component: jobservice 10 | chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }} 11 | release: {{ .Release.Name }} 12 | heritage: {{ .Release.Service }} 13 | data: 14 | jobTemplate.yaml: | 15 | {{- toYaml .Values.sparkOperator.jobTemplate | nindent 4 }} 16 | batchJobTemplate.yaml: | 17 | {{- toYaml .Values.sparkOperator.batchJobTemplate | nindent 4 }} 18 | streamJobTemplate.yaml: | 19 | {{- toYaml .Values.sparkOperator.streamJobTemplate | nindent 4 }} 20 | historicalJobTemplate.yaml: | 21 | {{- toYaml .Values.sparkOperator.historicalJobTemplate | nindent 4 }} 22 | {{- end }} -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/feast-jobservice/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ template "feast-jobservice.fullname" . }} 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | app: {{ template "feast-jobservice.name" . }} 8 | chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }} 9 | release: {{ .Release.Name }} 10 | heritage: {{ .Release.Service }} 11 | {{- with .Values.service.annotations }} 12 | annotations: 13 | {{ toYaml . 
| nindent 4 }} 14 | {{- end }} 15 | spec: 16 | type: {{ .Values.service.type }} 17 | {{- if .Values.service.loadBalancerIP }} 18 | loadBalancerIP: {{ .Values.service.loadBalancerIP }} 19 | {{- end }} 20 | {{- if .Values.service.loadBalancerSourceRanges }} 21 | loadBalancerSourceRanges: 22 | {{ toYaml .Values.service.loadBalancerSourceRanges | nindent 2 }} 23 | {{- end }} 24 | ports: 25 | - name: http 26 | port: {{ .Values.service.http.port }} 27 | targetPort: {{ .Values.service.http.targetPort }} 28 | {{- if .Values.service.http.nodePort }} 29 | nodePort: {{ .Values.service.http.nodePort }} 30 | {{- end }} 31 | - name: grpc 32 | port: {{ .Values.service.grpc.port }} 33 | targetPort: {{ .Values.service.grpc.targetPort }} 34 | {{- if .Values.service.grpc.nodePort }} 35 | nodePort: {{ .Values.service.grpc.nodePort }} 36 | {{- end }} 37 | selector: 38 | app: {{ template "feast-jobservice.name" . }} 39 | component: jobservice 40 | release: {{ .Release.Name }} 41 | 42 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *~ 18 | # Various IDEs 19 | .project 20 | .idea/ 21 | *.tmproj -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: 0.8.0 3 | description: A Helm chart for prometheus statsd-exporter Scrape metrics stored statsd 4 | home: https://github.com/prometheus/statsd_exporter 5 | keywords: 6 | - prometheus 7 | - statsd 8 | maintainers: 9 | - name: enflo 10 | email: toniflorithomar@gmail.com 11 | name: prometheus-statsd-exporter 12 | version: 0.1.2 -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/README.md: -------------------------------------------------------------------------------- 1 | # prometheus-statsd-exporter 2 | 3 | ![Version: 0.1.2](https://img.shields.io/badge/Version-0.1.2-informational?style=flat-square) ![AppVersion: 0.8.0](https://img.shields.io/badge/AppVersion-0.8.0-informational?style=flat-square) 4 | 5 | A Helm chart for prometheus statsd-exporter Scrape metrics stored statsd 6 | 7 | **Homepage:** 8 | 9 | ## Maintainers 10 | 11 | | Name | Email | Url | 12 | | ---- | ------ | --- | 13 | | enflo | toniflorithomar@gmail.com | | 14 | 15 | ## Values 16 | 17 | | Key | Type | Default | Description | 18 | |-----|------|---------|-------------| 19 | | image.pullPolicy | string | `"IfNotPresent"` | | 20 | | image.repository | string | `"prom/statsd-exporter"` | | 21 | | image.tag | string | `"v0.12.1"` | | 22 | | persistentVolume.accessModes[0] | string | `"ReadWriteOnce"` | | 23 | | persistentVolume.annotations | object | `{}` | | 24 | | persistentVolume.claimName | string | `"prometheus-statsd-exporter"` | | 25 | | persistentVolume.enabled | bool | `true` | | 26 | | persistentVolume.existingClaim | string | `""` | | 27 | | 
persistentVolume.mountPath | string | `"/data"` | | 28 | | persistentVolume.name | string | `"storage-volume"` | | 29 | | persistentVolume.size | string | `"20Gi"` | | 30 | | persistentVolume.storageClass | object | `{}` | | 31 | | persistentVolume.subPath | string | `""` | | 32 | | service.annotations | object | `{}` | | 33 | | service.clusterIP | string | `""` | | 34 | | service.externalIPs | list | `[]` | | 35 | | service.labels | object | `{}` | | 36 | | service.loadBalancerIP | string | `""` | | 37 | | service.loadBalancerSourceRanges | list | `[]` | | 38 | | service.metricsPort | int | `9102` | | 39 | | service.servicePort | int | `80` | | 40 | | service.statsdPort | int | `9125` | | 41 | | service.type | string | `"ClusterIP"` | | 42 | | serviceAccount.componentName | string | `"prometheus-statsd-exporter"` | | 43 | | serviceAccount.enable | bool | `false` | | 44 | | statsdexporter.affinity | object | `{}` | | 45 | | statsdexporter.extraArgs | object | `{}` | | 46 | | statsdexporter.ingress.enabled | bool | `false` | | 47 | | statsdexporter.nodeSelector | object | `{}` | | 48 | | statsdexporter.podAnnotations."prometheus.io/path" | string | `"/metrics"` | | 49 | | statsdexporter.podAnnotations."prometheus.io/port" | string | `"9102"` | | 50 | | statsdexporter.podAnnotations."prometheus.io/scrape" | string | `"true"` | | 51 | | statsdexporter.replicaCount | int | `1` | | 52 | | statsdexporter.resources | object | `{}` | | 53 | | statsdexporter.tolerations | object | `{}` | | 54 | 55 | ---------------------------------------------- 56 | Autogenerated from chart metadata using [helm-docs v1.5.0](https://github.com/norwoodj/helm-docs/releases/v1.5.0) 57 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | 2 | To verify that prometheus-statsd-exporter has started, run: 3 | 4 | {{- if contains "NodePort" .Values.service.type }} 5 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ template "prometheus-statsd-exporter.fullname" . }}) 6 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") 7 | echo http://$NODE_IP:$NODE_PORT 8 | {{- else if contains "LoadBalancer" .Values.service.type }} 9 | NOTE: It may take a few minutes for the LoadBalancer IP to be available. 10 | You can watch the status of by running 'kubectl get svc --namespace {{ .Release.Namespace }} -w {{ template "prometheus-statsd-exporter.fullname" . }}' 11 | 12 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "prometheus-statsd-exporter.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}') 13 | echo http://$SERVICE_IP:{{ .Values.service.servicePort }} 14 | {{- else if contains "ClusterIP" .Values.service.type }} 15 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app={{ template "prometheus-statsd-exporter.name" . 
}},component={{ .Chart.Name }}" -o jsonpath="{.items[0].metadata.name}") 16 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 9090 17 | {{- end }} -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "prometheus-statsd-exporter.name" -}} 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create a default fully qualified app name. 11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 12 | If release name contains chart name it will be used as a full name. 13 | */}} 14 | {{- define "prometheus-statsd-exporter.fullname" -}} 15 | {{- if .Values.fullnameOverride -}} 16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 17 | {{- else -}} 18 | {{- $name := default .Chart.Name .Values.nameOverride -}} 19 | {{- if contains $name .Release.Name -}} 20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 21 | {{- else -}} 22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 23 | {{- end -}} 24 | {{- end -}} 25 | {{- end -}} 26 | 27 | {{/* 28 | Create chart name and version as used by the chart label. 29 | */}} 30 | {{- define "prometheus-statsd-exporter.chart" -}} 31 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 32 | {{- end -}} 33 | 34 | 35 | {{/* 36 | Create the name of the service account to use 37 | */}} 38 | {{- define "prometheus-statsd-exporter.serviceAccountName" -}} 39 | {{- if .Values.serviceAccount.enable -}} 40 | {{ default (include "prometheus-statsd-expoter.fullname" .) .Values.serviceAccount.name }} 41 | {{- else -}} 42 | {{ default "default" .Values.serviceAccount.name }} 43 | {{- end -}} 44 | {{- end -}} -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/templates/config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ template "prometheus-statsd-exporter.fullname" . }}-config 5 | labels: 6 | app: {{ template "prometheus-statsd-exporter.name" . }} 7 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 8 | release: {{ .Release.Name }} 9 | heritage: {{ .Release.Service }} 10 | data: 11 | statsd_mappings.yaml: | 12 | # 13 | # defaults: 14 | # ttl: "45s" -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ template "prometheus-statsd-exporter.fullname" . }} 5 | labels: 6 | app: {{ template "prometheus-statsd-exporter.name" . }} 7 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 8 | release: {{ .Release.Name }} 9 | heritage: {{ .Release.Service }} 10 | spec: 11 | replicas: {{ .Values.statsdexporter.replicaCount }} 12 | selector: 13 | matchLabels: 14 | app: {{ template "prometheus-statsd-exporter.name" . 
}} 15 | release: {{ .Release.Name }} 16 | template: 17 | metadata: 18 | annotations: 19 | {{ toYaml .Values.statsdexporter.podAnnotations | indent 8 }} 20 | labels: 21 | app: {{ template "prometheus-statsd-exporter.name" . }} 22 | release: {{ .Release.Name }} 23 | spec: 24 | serviceAccountName: {{ template "prometheus-statsd-exporter.serviceAccountName" . }} 25 | containers: 26 | - name: {{ .Chart.Name }} 27 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" 28 | imagePullPolicy: {{ .Values.image.pullPolicy }} 29 | args: 30 | - --statsd.mapping-config=/etc/statsd_conf/statsd_mappings.yaml 31 | {{- range $key, $value := .Values.statsdexporter.extraArgs }} 32 | - --{{ $key }}={{ $value }} 33 | {{- end }} 34 | volumeMounts: 35 | - mountPath: /data 36 | name: {{ .Values.persistentVolume.name }} 37 | - name: statsd-config 38 | mountPath: /etc/statsd_conf 39 | env: 40 | - name: HOME 41 | value: /data 42 | ports: 43 | - name: metrics 44 | containerPort: 9102 45 | protocol: TCP 46 | - name: statsd-tcp 47 | containerPort: 9125 48 | protocol: TCP 49 | - name: statsd-udp 50 | containerPort: 9125 51 | protocol: UDP 52 | livenessProbe: 53 | httpGet: 54 | path: /#/status 55 | port: 9102 56 | initialDelaySeconds: 10 57 | timeoutSeconds: 10 58 | readinessProbe: 59 | httpGet: 60 | path: /#/status 61 | port: 9102 62 | initialDelaySeconds: 10 63 | timeoutSeconds: 10 64 | resources: 65 | {{ toYaml .Values.statsdexporter.resources | indent 12 }} 66 | {{- if .Values.statsdexporter.nodeSelector }} 67 | nodeSelector: 68 | {{ toYaml .Values.statsdexporter.nodeSelector | indent 8 }} 69 | {{- end }} 70 | volumes: 71 | - name: statsd-config 72 | configMap: 73 | name: {{ template "prometheus-statsd-exporter.fullname" . }}-config 74 | - name: {{ .Values.persistentVolume.name }} 75 | {{- if .Values.persistentVolume.enabled }} 76 | persistentVolumeClaim: 77 | claimName: {{ if .Values.persistentVolume.claimName }}{{- else }}{{ template "prometheus-statsd-exporter.fullname" . }}{{- end }} 78 | {{- else }} 79 | emptyDir: {} 80 | {{- end -}} 81 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/templates/pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | labels: 5 | app: {{ template "prometheus-statsd-exporter.fullname" . }} 6 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 7 | component: "{{ .Chart.Name }}" 8 | heritage: {{ .Release.Service }} 9 | release: {{ .Release.Name }} 10 | name: {{ template "prometheus-statsd-exporter.fullname" . 
}} 11 | spec: 12 | accessModes: 13 | {{ toYaml .Values.persistentVolume.accessModes | indent 4 }} 14 | {{- if .Values.persistentVolume.storageClass }} 15 | {{- if (eq "-" .Values.persistentVolume.storageClass) }} 16 | storageClassName: "" 17 | {{- else }} 18 | storageClassName: "{{ .Values.persistentVolume.storageClass }}" 19 | {{- end }} 20 | {{- end }} 21 | resources: 22 | requests: 23 | storage: "{{ .Values.persistentVolume.size }}" -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | {{- if .Values.service.annotations }} 5 | annotations: 6 | {{ toYaml .Values.service.annotations | indent 4 }} 7 | {{- end }} 8 | labels: 9 | app: {{ template "prometheus-statsd-exporter.fullname" . }} 10 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 11 | component: "{{ .Chart.Name }}" 12 | heritage: {{ .Release.Service }} 13 | release: {{ .Release.Name }} 14 | {{- if .Values.service.labels }} 15 | {{ toYaml .Values.service.labels | indent 4 }} 16 | {{- end }} 17 | name: {{ template "prometheus-statsd-exporter.fullname" . }} 18 | spec: 19 | ports: 20 | - name: metrics 21 | port: {{ .Values.service.metricsPort }} 22 | protocol: TCP 23 | targetPort: 9102 24 | - name: statsd-tcp 25 | port: {{ .Values.service.statsdPort }} 26 | protocol: TCP 27 | targetPort: 9125 28 | selector: 29 | app: {{ template "prometheus-statsd-exporter.name" . }} 30 | release: {{ .Release.Name }} 31 | type: ClusterIP 32 | --- 33 | apiVersion: v1 34 | kind: Service 35 | metadata: 36 | {{- if .Values.service.annotations }} 37 | annotations: 38 | {{ toYaml .Values.service.annotations | indent 4 }} 39 | {{- end }} 40 | labels: 41 | app: {{ template "prometheus-statsd-exporter.fullname" . }} 42 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 43 | component: "{{ .Chart.Name }}" 44 | heritage: {{ .Release.Service }} 45 | release: {{ .Release.Name }} 46 | {{- if .Values.service.labels }} 47 | {{ toYaml .Values.service.labels | indent 4 }} 48 | {{- end }} 49 | name: {{ template "prometheus-statsd-exporter.fullname" . }}-udp 50 | spec: 51 | {{- if .Values.service.clusterIP }} 52 | clusterIP: {{ .Values.service.clusterIP }} 53 | {{- end }} 54 | {{- if .Values.service.externalIPs }} 55 | externalIPs: 56 | {{ toYaml .Values.service.externalIPs | indent 4 }} 57 | {{- end }} 58 | {{- if .Values.service.loadBalancerIP }} 59 | loadBalancerIP: {{ .Values.service.loadBalancerIP }} 60 | {{- end }} 61 | {{- if .Values.service.loadBalancerSourceRanges }} 62 | loadBalancerSourceRanges: 63 | {{- range $cidr := .Values.service.loadBalancerSourceRanges }} 64 | - {{ $cidr }} 65 | {{- end }} 66 | {{- end }} 67 | ports: 68 | - name: statsd-udp 69 | port: {{ .Values.service.statsdPort }} 70 | protocol: UDP 71 | targetPort: 9125 72 | selector: 73 | app: {{ template "prometheus-statsd-exporter.name" . }} 74 | release: {{ .Release.Name }} 75 | type: "{{ .Values.service.type }}" -------------------------------------------------------------------------------- /infra/charts/feast-spark/charts/prometheus-statsd-exporter/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.enable -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | labels: 6 | app: {{ template "prometheus-statsd-exporter.fullname" . 
}} 7 | chart: {{ .Chart.Name }}-{{ .Chart.Version }} 8 | component: "{{ .Values.serviceaccount.componentName }}" 9 | heritage: {{ .Release.Service }} 10 | release: {{ .Release.Name }} 11 | name: {{ template "prometheus-statsd-exporter.fullname" . }} 12 | {{- end -}} -------------------------------------------------------------------------------- /infra/charts/feast-spark/requirements.lock: -------------------------------------------------------------------------------- 1 | dependencies: 2 | - name: feast-jobservice 3 | repository: "" 4 | version: 0.2.24 5 | - name: prometheus-statsd-exporter 6 | repository: "" 7 | version: 0.1.2 8 | digest: sha256:4b52339a644ff2785f8a89e6d3aa30261f091645e88c36ab00e147ac64d15297 9 | generated: "2022-03-30T10:27:54.642418517+08:00" 10 | -------------------------------------------------------------------------------- /infra/charts/feast-spark/requirements.yaml: -------------------------------------------------------------------------------- 1 | dependencies: 2 | - name: feast-jobservice 3 | version: 0.2.24 4 | condition: feast-jobservice.enabled 5 | - name: prometheus-statsd-exporter 6 | version: 0.1.2 7 | condition: prometheus-statsd-exporter.enabled -------------------------------------------------------------------------------- /infra/charts/feast-spark/values.yaml: -------------------------------------------------------------------------------- 1 | feast-jobservice: 2 | # feast-jobservice.enabled -- Flag to install Feast Job Service 3 | enabled: true -------------------------------------------------------------------------------- /infra/docker/jobservice/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8 2 | 3 | USER root 4 | WORKDIR /app 5 | 6 | COPY python python 7 | COPY protos protos 8 | COPY Makefile Makefile 9 | 10 | # Install necessary tools for later steps 11 | RUN apt-get update && apt-get -y install make git wget 12 | 13 | # Install Feast SDK 14 | RUN git init . 15 | COPY README.md README.md 16 | RUN make install-python 17 | 18 | # 19 | # Download grpc_health_probe to run health checks 20 | # https://kubernetes.io/blog/2018/10/01/health-checking-grpc-servers-on-kubernetes/ 21 | # 22 | RUN wget -q https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/v0.3.1/grpc_health_probe-linux-amd64 \ 23 | -O /usr/bin/grpc-health-probe && \ 24 | chmod +x /usr/bin/grpc-health-probe 25 | 26 | ENV FEAST_TELEMETRY=false 27 | 28 | CMD ["python", "-m", "feast_spark.cli", "server"] 29 | -------------------------------------------------------------------------------- /infra/docker/spark/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM maven:3.6-jdk-11 as builder 2 | 3 | RUN apt-get update && apt-get install -y build-essential 4 | WORKDIR /build 5 | 6 | COPY . . 
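# The builder stage copies the full repo so the Maven build below can produce the
# ingestion jar; the runtime stage further down only receives that jar plus the
# extra connector jars.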
7 | ARG VERSION=dev 8 | 9 | RUN REVISION=$VERSION make build-ingestion-jar-no-tests 10 | 11 | FROM gcr.io/kf-feast/feast-spark-base:v3.1.3 as runtime 12 | 13 | ARG VERSION=dev 14 | 15 | ARG TFRECORD_VERSION=0.3.0 16 | ARG GCS_CONNECTOR_VERSION=2.2.5 17 | ARG BQ_CONNECTOR_VERSION=0.18.1 18 | 19 | COPY --from=builder /build/spark/ingestion/target/feast-ingestion-spark-${VERSION}.jar /opt/spark/jars 20 | 21 | USER root 22 | ADD https://repo1.maven.org/maven2/com/linkedin/sparktfrecord/spark-tfrecord_2.12/${TFRECORD_VERSION}/spark-tfrecord_2.12-${TFRECORD_VERSION}.jar /opt/spark/jars 23 | ADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-${GCS_CONNECTOR_VERSION}.jar /opt/spark/jars 24 | ADD https://repo1.maven.org/maven2/com/google/cloud/spark/spark-bigquery-with-dependencies_2.12/${BQ_CONNECTOR_VERSION}/spark-bigquery-with-dependencies_2.12-${BQ_CONNECTOR_VERSION}.jar /opt/spark/jars 25 | 26 | # Fix arrow issue for jdk-11 27 | RUN mkdir -p /opt/spark/conf 28 | RUN echo 'spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf 29 | RUN echo 'spark.driver.extraJavaOptions="-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf 30 | RUN echo 'spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf 31 | RUN echo 'spark.executor.extraJavaOptions="-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf 32 | 33 | # python dependencies 34 | RUN pip3 install pandas==1.3.5 great-expectations==0.13.2 pyarrow==2.0.0 Jinja2==3.0.3 datadog==0.44.0 'numpy<1.20.0' 35 | 36 | # For logging to /dev/termination-log 37 | RUN mkdir -p /dev 38 | 39 | 40 | ENTRYPOINT [ "/opt/entrypoint.sh" ] -------------------------------------------------------------------------------- /infra/docker/spark/dev.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/kf-feast/feast-spark-base:v3.1.3 as runtime 2 | 3 | ARG VERSION=dev 4 | 5 | ARG TFRECORD_VERSION=0.3.0 6 | ARG GCS_CONNECTOR_VERSION=2.0.1 7 | ARG BQ_CONNECTOR_VERSION=0.18.1 8 | 9 | USER root 10 | ADD https://repo1.maven.org/maven2/com/linkedin/sparktfrecord/spark-tfrecord_2.12/${TFRECORD_VERSION}/spark-tfrecord_2.12-${TFRECORD_VERSION}.jar /opt/spark/jars 11 | ADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-${GCS_CONNECTOR_VERSION}.jar /opt/spark/jars 12 | ADD https://repo1.maven.org/maven2/com/google/cloud/spark/spark-bigquery-with-dependencies_2.12/${BQ_CONNECTOR_VERSION}/spark-bigquery-with-dependencies_2.12-${BQ_CONNECTOR_VERSION}.jar /opt/spark/jars 13 | 14 | # Fix arrow issue for jdk-11 15 | RUN mkdir -p /opt/spark/conf 16 | RUN echo 'spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf 17 | RUN echo 'spark.driver.extraJavaOptions="-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf 18 | RUN echo 'spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf 19 | RUN echo 'spark.executor.extraJavaOptions="-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true"' >> $SPARK_HOME/conf/spark-defaults.conf \ 20 | 21 | # python dependencies 22 | RUN pip3 install pandas==1.3.5 
great-expectations==0.13.2 pyarrow==2.0.0 Jinja2==3.0.3 datadog==0.44.0 'numpy<1.20.0' 23 | 24 | # For logging to /dev/termination-log 25 | RUN mkdir -p /dev 26 | 27 | COPY spark/ingestion/target/feast-ingestion-spark-${VERSION}.jar /opt/spark/jars 28 | 29 | 30 | ENTRYPOINT [ "/opt/entrypoint.sh" ] -------------------------------------------------------------------------------- /infra/docker/tests/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=gcr.io/kf-feast/feast-ci:latest 2 | 3 | FROM ${BASE_IMAGE} 4 | 5 | RUN mkdir -p /src/ /src/spark/ingestion 6 | 7 | COPY python /src/python 8 | 9 | COPY README.md /src/README.md 10 | 11 | WORKDIR /src 12 | 13 | RUN pip install -r python/requirements-ci.txt 14 | 15 | RUN git init . 16 | RUN pip install -e python -U 17 | RUN pip install "s3fs" "boto3" "urllib3>=1.25.4" 18 | 19 | COPY tests /src/tests 20 | 21 | RUN pip install -r tests/requirements.txt 22 | 23 | COPY infra/scripts /src/infra/scripts 24 | COPY spark/ingestion/target /src/spark/ingestion/target 25 | -------------------------------------------------------------------------------- /infra/scripts/aws-runner.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | GIT_TAG=${PULL_PULL_SHA:-${PULL_BASE_SHA}} 6 | 7 | source infra/scripts/k8s-common-functions.sh 8 | wait_for_image "${DOCKER_REPOSITORY}" feast-jobservice "${GIT_TAG}" 9 | 10 | infra/scripts/codebuild_runner.py "$@" -------------------------------------------------------------------------------- /infra/scripts/build-ingestion-py-dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | PLATFORM=$1 4 | DESTINATION=$2 5 | PACKAGES=${PACKAGES:-"great-expectations==0.13.2 pyarrow==2.0.0 datadog==0.39.0"} 6 | 7 | tmp_dir=$(mktemp -d) 8 | 9 | pip3 install -t ${tmp_dir}/libs $PACKAGES 10 | 11 | cd $tmp_dir 12 | tar -czf pylibs-ge-$PLATFORM.tar.gz libs/ 13 | if [[ $DESTINATION == gs* ]]; then 14 | gsutil cp pylibs-ge-$PLATFORM.tar.gz $DESTINATION 15 | else 16 | mv pylibs-ge-$PLATFORM.tar.gz $DESTINATION 17 | fi 18 | -------------------------------------------------------------------------------- /infra/scripts/download-maven-cache.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # This script downloads previous maven packages that have been downloaded 5 | # from Google Cloud Storage to local path for faster build 6 | 7 | usage() 8 | { 9 | echo "usage: prepare_maven_cache.sh 10 | --archive-uri gcs uri to retrieve maven .m2 archive 11 | --output-dir output directory for .m2 directory" 12 | } 13 | 14 | while [ "$1" != "" ]; do 15 | case "$1" in 16 | --archive-uri ) ARCHIVE_URI="$2"; shift;; 17 | --output-dir ) OUTPUT_DIR="$2"; shift;; 18 | * ) usage; exit 1 19 | esac 20 | shift 21 | done 22 | 23 | if [[ ! ${ARCHIVE_URI} ]]; then usage; exit 1; fi 24 | if [[ ! ${OUTPUT_DIR} ]]; then usage; exit 1; fi 25 | 26 | # Install Google Cloud SDK if gsutil command not exists 27 | if [[ ! $(command -v gsutil) ]]; then 28 | CURRENT_DIR=$(dirname "$BASH_SOURCE") 29 | . 
"${CURRENT_DIR}"/install-google-cloud-sdk.sh 30 | fi 31 | 32 | gsutil -q cp ${ARCHIVE_URI} /tmp/.m2.tar 33 | tar xf /tmp/.m2.tar -C ${OUTPUT_DIR} 34 | -------------------------------------------------------------------------------- /infra/scripts/helm/k8s-jobservice.tpl.yaml: -------------------------------------------------------------------------------- 1 | feast-jobservice: 2 | image: 3 | tag: ${IMAGE_TAG} 4 | envOverrides: 5 | FEAST_CORE_URL: feast-release-feast-core:6565 6 | FEAST_SPARK_LAUNCHER: k8s 7 | FEAST_SPARK_K8S_NAMESPACE: sparkop-e2e 8 | FEAST_SPARK_K8S_USE_INCLUSTER_CONFIG: True 9 | FEAST_TELEMETRY: False 10 | FEAST_SPARK_STAGING_LOCATION: gs://feast-templocation-kf-feast 11 | FEAST_REDIS_HOST: feast-release-redis-master 12 | FEAST_REDIS_PORT: 6379 13 | FEAST_JOB_SERVICE_ENABLE_CONTROL_LOOP: False 14 | FEAST_SPARK_INGESTION_JAR: local:///opt/spark/jars/feast-ingestion-spark-${IMAGE_TAG}.jar 15 | 16 | sparkOperator: 17 | enabled: true 18 | jobTemplate: 19 | apiVersion: "sparkoperator.k8s.io/v1beta2" 20 | kind: SparkApplication 21 | spec: 22 | type: Scala 23 | mode: cluster 24 | image: "gcr.io/kf-feast/feast-spark:${IMAGE_TAG}" 25 | hadoopConf: 26 | "fs.gs.project.id": "kf-feast" 27 | "google.cloud.auth.service.account.enable": "true" 28 | "google.cloud.auth.service.account.json.keyfile": "/mnt/secrets/credentials.json" 29 | sparkVersion: "3.1.3" 30 | timeToLiveSeconds: 3600 31 | pythonVersion: "3" 32 | restartPolicy: 33 | type: Never 34 | driver: 35 | cores: 1 36 | coreLimit: "1200m" 37 | memory: "600m" 38 | labels: 39 | version: 3.0.2 40 | javaOptions: "-Dio.netty.tryReflectionSetAccessible=true -Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true" 41 | secrets: 42 | - name: feast-gcp-service-account 43 | path: /mnt/secrets 44 | secretType: GCPServiceAccount 45 | executor: 46 | cores: 1 47 | instances: 1 48 | memory: "800m" 49 | labels: 50 | version: 3.0.2 51 | javaOptions: "-Dio.netty.tryReflectionSetAccessible=true -Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true" 52 | secrets: 53 | - name: feast-gcp-service-account 54 | path: /mnt/secrets 55 | secretType: GCPServiceAccount 56 | 57 | -------------------------------------------------------------------------------- /infra/scripts/helm/kafka-values.tpl.yaml: -------------------------------------------------------------------------------- 1 | externalAccess: 2 | enabled: true 3 | service: 4 | loadBalancerIPs: 5 | - $feast_kafka_ip 6 | annotations: 7 | cloud.google.com/load-balancer-type: Internal 8 | loadBalancerSourceRanges: 9 | - 10.0.0.0/8 10 | - 172.16.0.0/12 11 | - 192.168.0.0/16 12 | 13 | persistence: 14 | enabled: false 15 | 16 | zookeeper: 17 | persistence: 18 | enabled: false -------------------------------------------------------------------------------- /infra/scripts/helm/redis-cluster-values.tpl.yaml: -------------------------------------------------------------------------------- 1 | cluster: 2 | nodes: 3 3 | replicas: 0 4 | externalAccess: 5 | enabled: true 6 | service: 7 | annotations: 8 | cloud.google.com/load-balancer-type: Internal 9 | loadBalancerIP: 10 | - $feast_redis_1_ip 11 | - $feast_redis_2_ip 12 | - $feast_redis_3_ip 13 | 14 | persistence: 15 | enabled: false 16 | 17 | usePassword: false -------------------------------------------------------------------------------- /infra/scripts/install-google-cloud-sdk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | 
usage() 5 | { 6 | echo "usage: . install-google-cloud-sdk.sh 7 | [--with-key-file local file path to service account json] 8 | 9 | NOTE: requires 'dot' before install-google-cloud-sdk.sh 10 | so that the PATH variable is exported successfully to 11 | the calling process, i.e. you don't need to provide 12 | full path to gcloud command after installation 13 | 14 | --with-key-file is optional, 15 | if no authentication is required" 16 | } 17 | 18 | while [ "$1" != "" ]; do 19 | case "$1" in 20 | --with-key-file ) KEY_FILE="$2"; shift;; 21 | * ) usage; exit 1 22 | esac 23 | shift 24 | done 25 | 26 | GOOGLE_CLOUD_SDK_ARCHIVE_URL=https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-266.0.0-linux-x86_64.tar.gz 27 | GOOGLE_PROJECT_ID=kf-feast 28 | KUBE_CLUSTER_NAME=primary-test-cluster 29 | KUBE_CLUSTER_ZONE=us-central1-a 30 | 31 | curl -s ${GOOGLE_CLOUD_SDK_ARCHIVE_URL} | tar xz -C / 32 | export PATH=/google-cloud-sdk/bin:${PATH} 33 | gcloud -q components install kubectl &> /var/log/kubectl.install.log 34 | 35 | if [[ ${KEY_FILE} ]]; then 36 | gcloud -q auth activate-service-account --key-file=${KEY_FILE} 37 | gcloud -q auth configure-docker 38 | gcloud -q config set project ${GOOGLE_PROJECT_ID} 39 | gcloud -q container clusters get-credentials ${KUBE_CLUSTER_NAME} --zone ${KUBE_CLUSTER_ZONE} --project ${GOOGLE_PROJECT_ID} 40 | export GOOGLE_APPLICATION_CREDENTIALS=${KEY_FILE} 41 | fi 42 | 43 | # Restore bash option 44 | set +e -------------------------------------------------------------------------------- /infra/scripts/install-helm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | readonly HELM_URL=https://storage.googleapis.com/kubernetes-helm 4 | readonly HELM_TARBALL="helm-${HELM_VERSION}-linux-amd64.tar.gz" 5 | readonly STABLE_REPO_URL=https://charts.helm.sh/stable 6 | readonly INCUBATOR_REPO_URL=https://charts.helm.sh/incubator 7 | curl -s "https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz" | tar -C /tmp -xz 8 | sudo mv /tmp/linux-amd64/helm /usr/bin/helm 9 | helm init --client-only 10 | helm repo add incubator "$INCUBATOR_REPO_URL" 11 | -------------------------------------------------------------------------------- /infra/scripts/publish-docker-image.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | set -o pipefail 5 | 6 | usage() 7 | { 8 | echo "usage: publish-docker-image.sh 9 | 10 | --repository the target repository to upload the Docker image, example: 11 | gcr.io/kf-feast/feast-core 12 | 13 | --tag the tag for the Docker image, example: 1.0.4 14 | 15 | --file path to the Dockerfile 16 | 17 | [--google-service-account-file 18 | path to Google Cloud service account JSON key file] 19 | " 20 | } 21 | 22 | while [ "$1" != "" ]; do 23 | case "$1" in 24 | --repository ) REPOSITORY="$2"; shift;; 25 | --tag ) TAG="$2"; shift;; 26 | --file ) FILE="$2"; shift;; 27 | --google-service-account-file ) GOOGLE_SERVICE_ACCOUNT_FILE="$2"; shift;; 28 | -h | --help ) usage; exit;; 29 | * ) usage; exit 1 30 | esac 31 | shift 32 | done 33 | 34 | if [ -z $REPOSITORY ]; then usage; exit 1; fi 35 | if [ -z $TAG ]; then usage; exit 1; fi 36 | if [ -z $FILE ]; then usage; exit 1; fi 37 | 38 | if [ $GOOGLE_SERVICE_ACCOUNT_FILE ]; then 39 | gcloud -q auth activate-service-account --key-file $GOOGLE_SERVICE_ACCOUNT_FILE 40 | gcloud -q auth configure-docker 41 | fi 42 | 43 | echo
"============================================================" 44 | echo "Building Docker image $REPOSITORY:$TAG" 45 | echo "============================================================" 46 | docker build -t $REPOSITORY:$TAG --build-arg REVISION=$TAG -f $FILE . 47 | 48 | echo "============================================================" 49 | echo "Pushing Docker image $REPOSITORY:$TAG" 50 | echo "============================================================" 51 | docker push $REPOSITORY:$TAG 52 | -------------------------------------------------------------------------------- /infra/scripts/publish-java-sdk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | set -o pipefail 5 | 6 | GPG_KEY_IMPORT_DIR=/etc/gpg 7 | 8 | usage() 9 | { 10 | echo "usage: publish-java-sdk.sh 11 | 12 | --revision Value for the revision e.g. '0.2.3' 13 | --gpg-key-import-dir Directory containing existing GPG keys to import. 14 | The directory should contain these 2 files: 15 | - public-key 16 | - private-key 17 | The default value is '/etc/gpg' 18 | 19 | This script assumes the GPG private key is protected by a passphrase. 20 | The passphrase can be specified in \$HOME/.m2/settings.xml. In the same xml 21 | file, credentials to upload releases to Sonatype must also be provided. 22 | 23 | # Example settings: ~/.m2/settings.xml 24 | 25 | 26 | 27 | ossrh 28 | SONATYPE_USER 29 | SONATYPE_PASSWORD 30 | 31 | 32 | 33 | 34 | ossrh 35 | 36 | GPG_PASSPHRASE 37 | 38 | 39 | 40 | 41 | " 42 | } 43 | 44 | while [ "$1" != "" ]; do 45 | case "$1" in 46 | --revision ) REVISION="$2"; shift;; 47 | --gpg-key-import-dir ) GPG_KEY_IMPORT_DIR="$2"; shift;; 48 | -h | --help ) usage; exit;; 49 | * ) usage; exit 1 50 | esac 51 | shift 52 | done 53 | 54 | if [ -z $REVISION ]; then usage; exit 1; fi 55 | 56 | echo "============================================================" 57 | echo "Checking Maven and GPG versions" 58 | echo "============================================================" 59 | mvn --version 60 | echo "" 61 | gpg --version 62 | 63 | echo "============================================================" 64 | echo "Importing GPG keys" 65 | echo "============================================================" 66 | gpg --import --batch --yes $GPG_KEY_IMPORT_DIR/public-key 67 | gpg --import --batch --yes $GPG_KEY_IMPORT_DIR/private-key 68 | 69 | echo "============================================================" 70 | echo "Deploying Java SDK with revision: $REVISION" 71 | echo "============================================================" 72 | mvn --projects datatypes/java,sdk/java -Drevision=$REVISION --batch-mode clean deploy 73 | -------------------------------------------------------------------------------- /infra/scripts/publish-python-sdk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | set -o pipefail 5 | 6 | usage() 7 | { 8 | echo "usage: publish-python-sdk.sh 9 | 10 | --directory-path absolute path to the python package, this directory 11 | should contain 'setup.py' file 12 | 13 | --repository the repository name where the package will be uploaded, 14 | check your .pypirc configuration file for the list of 15 | valid repositories, usually it's 'pypi' or 'testpypi' 16 | " 17 | } 18 | 19 | while [ "$1" != "" ]; do 20 | case "$1" in 21 | --directory-path ) DIRECTORY_PATH="$2"; shift;; 22 | --repository ) REPOSITORY="$2"; shift;; 23 | -h | --help ) usage; exit;; 24 | * ) usage; 
exit 1 25 | esac 26 | shift 27 | done 28 | 29 | if [ -z $DIRECTORY_PATH ]; then usage; exit 1; fi 30 | if [ -z $REPOSITORY ]; then usage; exit 1; fi 31 | 32 | ORIGINAL_DIR=$PWD 33 | cd $DIRECTORY_PATH 34 | 35 | echo "============================================================" 36 | echo "Generating distribution archives" 37 | echo "============================================================" 38 | python3 -m pip install --user --upgrade setuptools wheel 39 | python3 setup.py sdist bdist_wheel 40 | 41 | echo "============================================================" 42 | echo "Uploading distribution archives" 43 | echo "============================================================" 44 | python3 -m pip install --user --upgrade twine 45 | python3 -m twine upload --repository $REPOSITORY dist/* 46 | 47 | cd $ORIGINAL_DIR 48 | -------------------------------------------------------------------------------- /infra/scripts/push-helm-charts.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [ $# -ne 1 ]; then 6 | echo "Please provide a single semver version (without a \"v\" prefix) to test the repository against, e.g 0.99.0" 7 | exit 1 8 | fi 9 | 10 | bucket=gs://feast-helm-charts 11 | repo_url=https://feast-helm-charts.storage.googleapis.com/ 12 | 13 | helm plugin install https://github.com/hayorov/helm-gcs.git --version 0.2.2 || true 14 | 15 | helm repo add feast-helm-chart-repo $bucket 16 | 17 | helm package infra/charts/feast-spark --version ${1} 18 | 19 | helm gcs push --public --force feast-spark-${1}.tgz feast-helm-chart-repo -------------------------------------------------------------------------------- /infra/scripts/run-minikube-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail 4 | 5 | NAMESPACE=sparkop 6 | JOB_NAME=test-runner 7 | 8 | # Delete all sparkapplication resources that may be left over from the previous test runs. 9 | kubectl delete sparkapplication --all -n sparkop || true 10 | 11 | JOB_SPEC=$(dirname $0)/test_job.yaml 12 | 13 | # Delete previous instance of the job if it exists 14 | kubectl delete -n ${NAMESPACE} "job/$JOB_NAME" 2>/dev/null || true 15 | 16 | # Create the job 17 | kubectl apply -n ${NAMESPACE} -f "$JOB_SPEC" 18 | 19 | # Wait for job to have a pod. 20 | for i in {1..10} 21 | do 22 | POD=$(kubectl get pods -n ${NAMESPACE} --selector=job-name=$JOB_NAME --output=jsonpath='{.items[0].metadata.name}') 23 | if [ ! 
-z "$POD" ]; then 24 | break 25 | else 26 | sleep 1 27 | fi 28 | done 29 | 30 | echo "Waiting for pod to be ready:" 31 | kubectl wait -n ${NAMESPACE} --for=condition=ContainersReady "pod/$POD" --timeout=60s || true 32 | 33 | echo "Job output:" 34 | kubectl logs -n ${NAMESPACE} -f "job/$JOB_NAME" 35 | 36 | # Can't wait for both conditions at once, so wait for complete first then wait for failure 37 | kubectl wait -n ${NAMESPACE} --for=condition=complete "job/$JOB_NAME" --timeout=60s && exit 0 38 | kubectl wait -n ${NAMESPACE} --for=condition=failure "job/$JOB_NAME" --timeout=60s && exit 1 39 | -------------------------------------------------------------------------------- /infra/scripts/setup-common-functions.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Get Feast project repository root and scripts directory 4 | export PROJECT_ROOT_DIR=$(git rev-parse --show-toplevel) 5 | export SCRIPTS_DIR=${PROJECT_ROOT_DIR}/infra/scripts 6 | 7 | install_test_tools() { 8 | apt-get -qq update 9 | apt-get -y install wget netcat kafkacat build-essential 10 | } 11 | 12 | print_banner() { 13 | echo " 14 | ============================================================ 15 | $1 16 | ============================================================ 17 | " 18 | } 19 | 20 | wait_for_docker_image(){ 21 | # This script will block until a docker image is ready 22 | 23 | [[ -z "$1" ]] && { echo "Please pass the docker image URI as the first parameter" ; exit 1; } 24 | oldopt=$- 25 | set +e 26 | 27 | DOCKER_IMAGE=$1 28 | poll_count=0 29 | maximum_poll_count=150 30 | 31 | # Wait for Feast Core to be available on GCR 32 | until docker pull "$DOCKER_IMAGE" 33 | do 34 | # Exit when we have tried enough times 35 | if [[ "$poll_count" -gt "$maximum_poll_count" ]]; then 36 | set -$oldopt 37 | exit 1 38 | fi 39 | # Sleep and increment counter on failure 40 | echo "${DOCKER_IMAGE} could not be found"; 41 | sleep 5; 42 | ((poll_count++)) 43 | done 44 | 45 | set -$oldopt 46 | } 47 | 48 | # Usage: TAG=$(get_tag_release [-ms]) 49 | # Parses the last release from git tags. 50 | # Options: 51 | # -m - Use only tags that are tagged on the current branch 52 | # -s - Use only stable version tags. (ie no prerelease tags). 53 | get_tag_release() { 54 | local GIT_TAG_CMD="git tag -l" 55 | # Match only Semver tags 56 | # Regular expression should match MAJOR.MINOR.PATCH[-PRERELEASE[.IDENTIFIER]] 57 | # eg. v0.7.1 v0.7.2-alpha v0.7.2-rc.1 58 | local TAG_REGEX='^v[0-9]+\.[0-9]+\.[0-9]+(-([0-9A-Za-z-]+(\.[0-9A-Za-z-]+)*))?$' 59 | local OPTIND opt 60 | while getopts "ms" opt; do 61 | case "${opt}" in 62 | m) 63 | GIT_TAG_CMD="$GIT_TAG_CMD --merged" 64 | ;; 65 | s) 66 | # Match only stable version tags. 67 | TAG_REGEX="^v[0-9]+\.[0-9]+\.[0-9]+$" 68 | ;; 69 | *) 70 | echo "get_tag_release(): Error: Bad arguments: $@" 71 | return 1 72 | ;; 73 | esac 74 | done 75 | shift $((OPTIND-1)) 76 | 77 | # Retrieve tags from git and filter as per regex. 78 | local FILTERED_TAGS=$(bash -c "$GIT_TAG_CMD" | grep -P "$TAG_REGEX") 79 | 80 | # Sort version tags in highest semver version first. 81 | # To make sure that prerelease versions (ie vMAJOR.MINOR.PATCH-PRERELEASE versions) 82 | # are sorted after stable versions (ie vMAJOR.MINOR.PATCH), we append '_' after 83 | # each stable version as '_' is after '-' found in prerelease version 84 | # alphanumerically and remove after sorting.
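# (Example of the trick above, assuming '_' sorts after '-' as stated: a stable tag v0.9.0 is compared as "v0.9.0_", so it ranks above v0.9.0-rc.1 once sort -rV reverses the order and the head -n 1 below picks the stable release.)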
85 | local SEMVER_SORTED_TAGS=$(echo "$FILTERED_TAGS" | sed -e '/-/!{s/$/_/}' | sort -rV \ 86 | | sed -e 's/_$//') 87 | echo $(echo "$SEMVER_SORTED_TAGS" | head -n 1) 88 | } 89 | -------------------------------------------------------------------------------- /infra/scripts/setup-e2e-env-aws.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m pip install --upgrade pip==20.2 setuptools wheel 4 | 5 | make install-python 6 | 7 | python -m pip install -qr tests/requirements.txt 8 | 9 | # Using mvn -q to make it less verbose. This step happens after docker containers were 10 | # successfully built so it should be unlikely to fail, therefore we likely won't need detailed logs. 11 | echo "########## Building ingestion jar" 12 | TIMEFORMAT='########## took %R seconds' 13 | 14 | time make build-ingestion-jar-no-tests REVISION=develop MAVEN_EXTRA_OPTS="-q --no-transfer-progress" 15 | -------------------------------------------------------------------------------- /infra/scripts/setup-e2e-env-gcp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # GCloud, kubectl, helm should be already installed 4 | # And kubernetes cluster already configured 5 | 6 | test -z ${GCLOUD_REGION} && GCLOUD_REGION="us-central1" 7 | test -z ${GCLOUD_NETWORK} && GCLOUD_NETWORK="default" 8 | test -z ${GCLOUD_SUBNET} && GCLOUD_SUBNET="default" 9 | 10 | 11 | feast_kafka_ip_name="feast-kafka" 12 | feast_redis_1_ip_name="feast-redis-1" 13 | feast_redis_2_ip_name="feast-redis-2" 14 | feast_redis_3_ip_name="feast-redis-3" 15 | 16 | helm repo add bitnami https://charts.bitnami.com/bitnami 17 | 18 | gcloud compute addresses create \ 19 | $feast_kafka_ip_name $feast_redis_1_ip_name $feast_redis_2_ip_name $feast_redis_3_ip_name \ 20 | --region ${GCLOUD_REGION} --subnet ${GCLOUD_SUBNET} 21 | 22 | export feast_kafka_ip=$(gcloud compute addresses describe $feast_kafka_ip_name --region=${GCLOUD_REGION} --format "value(address)") 23 | export feast_redis_1_ip=$(gcloud compute addresses describe $feast_redis_1_ip_name --region=${GCLOUD_REGION} --format "value(address)") 24 | export feast_redis_2_ip=$(gcloud compute addresses describe $feast_redis_2_ip_name --region=${GCLOUD_REGION} --format "value(address)") 25 | export feast_redis_3_ip=$(gcloud compute addresses describe $feast_redis_3_ip_name --region=${GCLOUD_REGION} --format "value(address)") 26 | 27 | 28 | envsubst '$feast_kafka_ip' < helm/kafka-values.tpl.yaml > helm/kafka-values.yaml 29 | envsubst '$feast_redis_1_ip,$feast_redis_2_ip,$feast_redis_3_ip' < helm/redis-cluster-values.tpl.yaml > helm/redis-cluster-values.yaml 30 | 31 | helm install e2e-kafka bitnami/kafka \ 32 | --values helm/kafka-values.yaml --namespace infra --create-namespace 33 | 34 | helm install e2e-redis-cluster bitnami/redis-cluster \ 35 | --values helm/redis-cluster-values.yaml --namespace infra \ 36 | --create-namespace -------------------------------------------------------------------------------- /infra/scripts/setup-e2e-env-sparkop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m pip install --upgrade pip==20.2 setuptools wheel 4 | 5 | make install-python 6 | 7 | python -m pip install -qr tests/requirements.txt 8 | 9 | # Using mvn -q to make it less verbose. This step happens after docker containers were 10 | # successfully built so it should be unlikely to fail, therefore we likely won't need detailed logs.
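# (The -q and --no-transfer-progress flags are forwarded to Maven through MAVEN_EXTRA_OPTS on the make invocation below.)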
11 | echo "########## Building ingestion jar" 12 | TIMEFORMAT='########## took %R seconds' 13 | 14 | time make build-ingestion-jar-no-tests REVISION=develop MAVEN_EXTRA_OPTS="-q --no-transfer-progress" 15 | -------------------------------------------------------------------------------- /infra/scripts/setup-e2e-local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | STEP_BREADCRUMB='~~~~~~~~' 5 | 6 | pushd "$(dirname $0)" 7 | source k8s-common-functions.sh 8 | 9 | # spark k8s test - runs in sparkop namespace (so it doesn't interfere with a concurrently 10 | # running EMR test). 11 | NAMESPACE=sparkop 12 | RELEASE=sparkop 13 | 14 | # Clean up old release 15 | k8s_cleanup "$RELEASE" "$NAMESPACE" 16 | 17 | # Helm install everything in a namespace 18 | helm_install "$RELEASE" "${DOCKER_REPOSITORY}" "${GIT_TAG}" "$NAMESPACE" --create-namespace 19 | 20 | # Delete all sparkapplication resources that may be left over from the previous test runs. 21 | kubectl delete sparkapplication --all -n "$NAMESPACE" || true 22 | 23 | # Make sure the test pod has permissions to create sparkapplication resources 24 | setup_sparkop_role 25 | 26 | echo "DONE" -------------------------------------------------------------------------------- /infra/scripts/test-core-ingestion.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt-get -qq update 4 | apt-get -y install build-essential 5 | 6 | make lint-java 7 | 8 | infra/scripts/download-maven-cache.sh \ 9 | --archive-uri gs://feast-templocation-kf-feast/.m2.2019-10-24.tar \ 10 | --output-dir /root/ 11 | 12 | # Core depends on Ingestion so they are tested together 13 | # Skip Maven enforcer: https://stackoverflow.com/questions/50647223/maven-enforcer-issue-when-running-from-reactor-level 14 | mvn --projects core,ingestion --batch-mode --define skipTests=true \ 15 | --define enforcer.skip=true clean install 16 | mvn --projects core,ingestion --define enforcer.skip=true test 17 | TEST_EXIT_CODE=$? 
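# (This script does not use set -e, so a failing mvn test does not abort here; the saved exit code is re-raised at the end, after the surefire reports below are archived.)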
18 | 19 | # Default artifact location setting in Prow jobs 20 | LOGS_ARTIFACT_PATH=/logs/artifacts 21 | mkdir -p ${LOGS_ARTIFACT_PATH}/surefire-reports 22 | cp core/target/surefire-reports/* ${LOGS_ARTIFACT_PATH}/surefire-reports/ 23 | cp ingestion/target/surefire-reports/* ${LOGS_ARTIFACT_PATH}/surefire-reports/ 24 | 25 | exit ${TEST_EXIT_CODE} -------------------------------------------------------------------------------- /infra/scripts/test-end-to-end-local.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | export DISABLE_FEAST_SERVICE_FIXTURES=1 6 | export DISABLE_SERVICE_FIXTURES=1 7 | 8 | export FEAST_SPARK_K8S_NAMESPACE=sparkop 9 | export FEAST_S3_ENDPOINT_URL=http://minio.minio.svc.cluster.local:9000 10 | 11 | # Used by tests 12 | export AWS_S3_ENDPOINT_URL=http://minio.minio.svc.cluster.local:9000 13 | 14 | cat << SPARK_CONF_END >/tmp/spark_conf.yml 15 | apiVersion: "sparkoperator.k8s.io/v1beta2" 16 | kind: SparkApplication 17 | metadata: 18 | namespace: default 19 | spec: 20 | type: Scala 21 | mode: cluster 22 | image: "gcr.io/kf-feast/spark-py:v3.0.1" 23 | imagePullPolicy: Always 24 | sparkVersion: "3.0.1" 25 | timeToLiveSeconds: 3600 26 | pythonVersion: "3" 27 | sparkConf: 28 | "spark.hadoop.fs.s3a.endpoint": http://minio.minio.svc.cluster.local:9000 29 | "spark.hadoop.fs.s3a.path.style.access": "true" 30 | "spark.hadoop.fs.s3a.access.key": ${AWS_ACCESS_KEY_ID} 31 | "spark.hadoop.fs.s3a.secret.key": ${AWS_SECRET_ACCESS_KEY} 32 | restartPolicy: 33 | type: Never 34 | volumes: 35 | - name: "test-volume" 36 | hostPath: 37 | path: "/tmp" 38 | type: Directory 39 | driver: 40 | cores: 1 41 | coreLimit: "1200m" 42 | memory: "512m" 43 | labels: 44 | version: 3.0.1 45 | serviceAccount: spark 46 | volumeMounts: 47 | - name: "test-volume" 48 | mountPath: "/tmp" 49 | executor: 50 | cores: 1 51 | instances: 1 52 | memory: "512m" 53 | labels: 54 | version: 3.0.1 55 | volumeMounts: 56 | - name: "test-volume" 57 | mountPath: "/tmp" 58 | SPARK_CONF_END 59 | export FEAST_SPARK_K8S_JOB_TEMPLATE_PATH=/tmp/spark_conf.yml 60 | 61 | PYTHONPATH=sdk/python pytest tests/e2e/ \ 62 | --feast-version develop \ 63 | --core-url sparkop-feast-core:6565 \ 64 | --serving-url sparkop-feast-online-serving:6566 \ 65 | --env k8s \ 66 | --staging-path s3a://feast-staging \ 67 | --redis-url sparkop-redis-master.sparkop.svc.cluster.local:6379 \ 68 | --kafka-brokers sparkop-kafka.sparkop.svc.cluster.local:9092 \ 69 | -m "not bq and not k8s" -------------------------------------------------------------------------------- /infra/scripts/test-golang-sdk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o pipefail 4 | 5 | make lint-go 6 | 7 | cd sdk/go 8 | go test -v 2>&1 | tee /tmp/test_output 9 | TEST_EXIT_CODE=$? 
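# (Because of the set -o pipefail above, $? reflects the go test result rather than tee, so TEST_EXIT_CODE records the real test status.)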
10 | 11 | # Default artifact location setting in Prow jobs 12 | LOGS_ARTIFACT_PATH=/logs/artifacts 13 | 14 | go get -u github.com/jstemmer/go-junit-report 15 | cat /tmp/test_output | ${GOPATH}/bin/go-junit-report > ${LOGS_ARTIFACT_PATH}/golang-sdk-test-report.xml 16 | 17 | exit ${TEST_EXIT_CODE} -------------------------------------------------------------------------------- /infra/scripts/test-integration.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python -m pip install --upgrade pip setuptools wheel 4 | make install-python 5 | python -m pip install -qr tests/requirements.txt 6 | 7 | export FEAST_TELEMETRY="False" 8 | pytest tests/integration --dataproc-cluster-name feast-e2e --dataproc-project kf-feast --dataproc-region us-central1 --dataproc-staging-location gs://feast-templocation-kf-feast 9 | -------------------------------------------------------------------------------- /infra/scripts/test-java-sdk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Skip Maven enforcer: https://stackoverflow.com/questions/50647223/maven-enforcer-issue-when-running-from-reactor-level 4 | mvn --projects sdk/java --batch-mode --define skipTests=true \ 5 | --define enforcer.skip=true clean install 6 | mvn --projects sdk/java --define enforcer.skip=true test 7 | TEST_EXIT_CODE=$? 8 | 9 | # Default artifact location setting in Prow jobs 10 | LOGS_ARTIFACT_PATH=/logs/artifacts 11 | cp -r sdk/java/target/surefire-reports ${LOGS_ARTIFACT_PATH}/surefire-reports 12 | 13 | exit ${TEST_EXIT_CODE} -------------------------------------------------------------------------------- /infra/scripts/test-python-sdk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | # Default artifact location setting in Prow jobs 6 | LOGS_ARTIFACT_PATH=/logs/artifacts 7 | 8 | pip install -r sdk/python/requirements-ci.txt 9 | make compile-protos-python 10 | make lint-python 11 | 12 | cd sdk/python/ 13 | pip install -e . 14 | pytest --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml 15 | -------------------------------------------------------------------------------- /infra/scripts/test-serving.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infra/scripts/download-maven-cache.sh \ 4 | --archive-uri gs://feast-templocation-kf-feast/.m2.2019-10-24.tar \ 5 | --output-dir /root/ 6 | 7 | mvn --batch-mode --also-make --projects serving test 8 | TEST_EXIT_CODE=$? 
9 | 10 | # Default artifact location setting in Prow jobs 11 | LOGS_ARTIFACT_PATH=/logs/artifacts 12 | cp -r serving/target/surefire-reports ${LOGS_ARTIFACT_PATH}/surefire-reports 13 | 14 | exit ${TEST_EXIT_CODE} 15 | -------------------------------------------------------------------------------- /infra/scripts/test_job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: test-runner 5 | namespace: sparkop 6 | spec: 7 | backoffLimit: 1 8 | template: 9 | spec: 10 | containers: 11 | - name: ubuntu 12 | image: feast:local 13 | command: ["bash", "-c", "./infra/scripts/test-end-to-end-local.sh"] 14 | imagePullPolicy: Never 15 | args: 16 | - bash 17 | stdin: true 18 | stdinOnce: true 19 | tty: true 20 | env: 21 | - name: AWS_ACCESS_KEY_ID 22 | valueFrom: 23 | secretKeyRef: 24 | name: minio 25 | key: accesskey 26 | - name: AWS_SECRET_ACCESS_KEY 27 | valueFrom: 28 | secretKeyRef: 29 | name: minio 30 | key: secretkey 31 | - name: AWS_DEFAULT_REGION 32 | value: us-east-1 33 | - name: AWS_S3_SIGNATURE_VERSION 34 | value: s3v4 35 | restartPolicy: Never 36 | -------------------------------------------------------------------------------- /infra/scripts/validate-helm-chart-versions.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | function finish { 4 | echo "Please ensure the Chart.yaml have the version ${1}" 5 | exit 6 | } 7 | 8 | trap "finish $1" ERR 9 | 10 | set -e 11 | 12 | if [ $# -ne 1 ]; then 13 | echo "Please provide a single semver version (without a \"v\" prefix) to test the repository against, e.g 0.99.0" 14 | exit 1 15 | fi 16 | 17 | # Get project root 18 | PROJECT_ROOT_DIR=$(git rev-parse --show-toplevel) 19 | 20 | echo "Trying to find version ${1} in the feast-spark Chart.yaml. Exiting if not found." 21 | grep "version: ${1}" "${PROJECT_ROOT_DIR}/infra/charts/feast-spark/Chart.yaml" 22 | 23 | 24 | echo "Trying to find version ${1} in the feast-jobservice Chart.yaml. Exiting if not found." 25 | grep "version: ${1}" "${PROJECT_ROOT_DIR}/infra/charts/feast-spark/charts/feast-jobservice/Chart.yaml" 26 | 27 | echo "Success! All versions found!" -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | 4.0.0 5 | Feast Spark 6 | 7 | dev.feast 8 | feast-spark-parent 9 | ${revision} 10 | pom 11 | 12 | 13 | spark/ingestion 14 | 15 | 16 | 17 | 0.2.24 18 | 1.8 19 | 1.8 20 | 2.12 21 | ${scala.version}.10 22 | 3.1.3 23 | 4.4.0 24 | 3.3.0 25 | 3.12.2 26 | 3.10 27 | 2.4.11 28 | 29 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /protos/feast/core/DataFormat.proto: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2020 The Feast Authors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // https://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | 17 | 18 | syntax = "proto3"; 19 | package feast.core; 20 | 21 | option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; 22 | option java_outer_classname = "DataFormatProto"; 23 | option java_package = "feast.proto.core"; 24 | 25 | // Defines the file format encoding the features/entity data in files 26 | message FileFormat { 27 | // Defines options for the Parquet data format 28 | message ParquetFormat {} 29 | 30 | oneof format { 31 | ParquetFormat parquet_format = 1; 32 | } 33 | } 34 | 35 | // Defines the data format encoding features/entity data in data streams 36 | message StreamFormat { 37 | // Defines options for the protobuf data format 38 | message ProtoFormat { 39 | // Classpath to the generated Java Protobuf class that can be used to decode 40 | // Feature data from the obtained stream message 41 | string class_path = 1; 42 | } 43 | 44 | // Defines options for the avro data format 45 | message AvroFormat { 46 | // Optional if used in a File DataSource as schema is embedded in avro file. 47 | // Specifies the schema of the Avro message as JSON string. 48 | string schema_json = 1; 49 | } 50 | 51 | // Specifies the data format and format specific options 52 | oneof format { 53 | AvroFormat avro_format = 1; 54 | ProtoFormat proto_format = 2; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /protos/feast/core/Entity.proto: -------------------------------------------------------------------------------- 1 | // 2 | // * Copyright 2020 The Feast Authors 3 | // * 4 | // * Licensed under the Apache License, Version 2.0 (the "License"); 5 | // * you may not use this file except in compliance with the License. 6 | // * You may obtain a copy of the License at 7 | // * 8 | // * https://www.apache.org/licenses/LICENSE-2.0 9 | // * 10 | // * Unless required by applicable law or agreed to in writing, software 11 | // * distributed under the License is distributed on an "AS IS" BASIS, 12 | // * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // * See the License for the specific language governing permissions and 14 | // * limitations under the License. 15 | // 16 | 17 | syntax = "proto3"; 18 | 19 | package feast.core; 20 | option java_package = "feast.proto.core"; 21 | option java_outer_classname = "EntityProto"; 22 | option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; 23 | 24 | import "feast/types/Value.proto"; 25 | import "google/protobuf/timestamp.proto"; 26 | 27 | message Entity { 28 | // User-specified specifications of this entity. 29 | EntitySpecV2 spec = 1; 30 | // System-populated metadata for this entity. 31 | EntityMeta meta = 2; 32 | } 33 | 34 | message EntitySpecV2 { 35 | // Name of the entity. 36 | string name = 1; 37 | 38 | // Type of the entity. 39 | feast.types.ValueType.Enum value_type = 2; 40 | 41 | // Description of the entity. 
42 | string description = 3; 43 | 44 | // User defined metadata 45 | map<string,string> labels = 8; 46 | } 47 | 48 | message EntityMeta { 49 | google.protobuf.Timestamp created_timestamp = 1; 50 | google.protobuf.Timestamp last_updated_timestamp = 2; 51 | } 52 | -------------------------------------------------------------------------------- /protos/feast/core/Feature.proto: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright 2020 The Feast Authors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // https://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | 17 | syntax = "proto3"; 18 | package feast.core; 19 | 20 | 21 | option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; 22 | option java_outer_classname = "FeatureProto"; 23 | option java_package = "feast.proto.core"; 24 | 25 | import "feast/types/Value.proto"; 26 | 27 | message FeatureSpecV2 { 28 | // Name of the feature. Not updatable. 29 | string name = 1; 30 | 31 | // Value type of the feature. Not updatable. 32 | feast.types.ValueType.Enum value_type = 2; 33 | 34 | // Labels for user defined metadata on a feature 35 | map<string,string> labels = 3; 36 | } 37 | -------------------------------------------------------------------------------- /protos/feast/serving/ServingService.proto: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/protos/feast/serving/ServingService.proto -------------------------------------------------------------------------------- /protos/feast/storage/Redis.proto: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 The Feast Authors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | syntax = "proto3"; 18 | 19 | import "feast/types/Field.proto"; 20 | import "feast/types/Value.proto"; 21 | 22 | package feast.storage; 23 | 24 | option java_outer_classname = "RedisProto"; 25 | option java_package = "feast.proto.storage"; 26 | option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/storage"; 27 | 28 | message RedisKeyV2 { 29 | string project = 1; 30 | 31 | repeated string entity_names = 2; 32 | 33 | repeated feast.types.Value entity_values = 3; 34 | } 35 | -------------------------------------------------------------------------------- /protos/feast/third_party/grpc/health/v1/HealthService.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package grpc.health.v1; 4 | 5 | option java_package = "io.grpc.health.v1"; 6 | option java_outer_classname = "HealthProto"; 7 | 8 | message HealthCheckRequest { 9 | string service = 1; 10 | } 11 | 12 | enum ServingStatus { 13 | UNKNOWN = 0; 14 | SERVING = 1; 15 | NOT_SERVING = 2; 16 | } 17 | 18 | message HealthCheckResponse { 19 | ServingStatus status = 1; 20 | } 21 | 22 | service Health { 23 | rpc Check(HealthCheckRequest) returns (HealthCheckResponse); 24 | } -------------------------------------------------------------------------------- /protos/feast/types/Field.proto: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 The Feast Authors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | syntax = "proto3"; 18 | 19 | import "feast/types/Value.proto"; 20 | 21 | package feast.types; 22 | 23 | option java_package = "feast.proto.types"; 24 | option java_outer_classname = "FieldProto"; 25 | option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/types"; 26 | 27 | message Field { 28 | string name = 1; 29 | feast.types.Value value = 2; 30 | } 31 | -------------------------------------------------------------------------------- /protos/feast/types/Value.proto: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 The Feast Authors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | syntax = "proto3"; 18 | 19 | package feast.types; 20 | 21 | option java_package = "feast.proto.types"; 22 | option java_outer_classname = "ValueProto"; 23 | option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/types"; 24 | 25 | message ValueType { 26 | enum Enum { 27 | INVALID = 0; 28 | BYTES = 1; 29 | STRING = 2; 30 | INT32 = 3; 31 | INT64 = 4; 32 | DOUBLE = 5; 33 | FLOAT = 6; 34 | BOOL = 7; 35 | BYTES_LIST = 11; 36 | STRING_LIST = 12; 37 | INT32_LIST = 13; 38 | INT64_LIST = 14; 39 | DOUBLE_LIST = 15; 40 | FLOAT_LIST = 16; 41 | BOOL_LIST = 17; 42 | } 43 | } 44 | 45 | message Value { 46 | // ValueType is referenced by the metadata types, FeatureInfo and EntityInfo. 47 | // The enum values do not have to match the oneof val field ids, but they should. 48 | oneof val { 49 | bytes bytes_val = 1; 50 | string string_val = 2; 51 | int32 int32_val = 3; 52 | int64 int64_val = 4; 53 | double double_val = 5; 54 | float float_val = 6; 55 | bool bool_val = 7; 56 | BytesList bytes_list_val = 11; 57 | StringList string_list_val = 12; 58 | Int32List int32_list_val = 13; 59 | Int64List int64_list_val = 14; 60 | DoubleList double_list_val = 15; 61 | FloatList float_list_val = 16; 62 | BoolList bool_list_val = 17; 63 | } 64 | } 65 | 66 | message BytesList { 67 | repeated bytes val = 1; 68 | } 69 | 70 | message StringList { 71 | repeated string val = 1; 72 | } 73 | 74 | message Int32List { 75 | repeated int32 val = 1; 76 | } 77 | 78 | message Int64List { 79 | repeated int64 val = 1; 80 | } 81 | 82 | message DoubleList { 83 | repeated double val = 1; 84 | } 85 | 86 | message FloatList { 87 | repeated float val = 1; 88 | } 89 | 90 | message BoolList { 91 | repeated bool val = 1; 92 | } 93 | -------------------------------------------------------------------------------- /protos/feast_spark/third_party/grpc/health/v1/HealthService.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package grpc.health.v1; 4 | 5 | option java_package = "io.grpc.health.v1"; 6 | option java_outer_classname = "HealthProto"; 7 | 8 | message HealthCheckRequest { 9 | string service = 1; 10 | } 11 | 12 | enum ServingStatus { 13 | UNKNOWN = 0; 14 | SERVING = 1; 15 | NOT_SERVING = 2; 16 | } 17 | 18 | message HealthCheckResponse { 19 | ServingStatus status = 1; 20 | } 21 | 22 | service Health { 23 | rpc Check(HealthCheckRequest) returns (HealthCheckResponse); 24 | } -------------------------------------------------------------------------------- /python/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
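# For example, "make html O=-W" should expand to: sphinx-build -M html "source" "build" -W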
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /python/docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath("../../feast_spark")) 17 | sys.path.insert(0, os.path.abspath("../..")) 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = 'Feast Spark SDK' 22 | copyright = '2021, Feast Authors' 23 | author = 'Feast Authors' 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | "sphinx.ext.doctest", 32 | "sphinx.ext.intersphinx", 33 | "sphinx.ext.todo", 34 | "sphinx.ext.coverage", 35 | "sphinx.ext.mathjax", 36 | "sphinx.ext.ifconfig", 37 | "sphinx.ext.viewcode", 38 | "sphinx.ext.githubpages", 39 | "sphinx.ext.napoleon", 40 | "sphinx.ext.autodoc", 41 | "sphinx_rtd_theme", 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # List of patterns, relative to source directory, that match files and 48 | # directories to ignore when looking for source files. 49 | # This pattern also affects html_static_path and html_extra_path. 50 | exclude_patterns = [] 51 | 52 | # -- Options for HTML output ------------------------------------------------- 53 | 54 | # The theme to use for HTML and HTML Help pages. See the documentation for 55 | # a list of builtin themes. 56 | # 57 | html_theme = "sphinx_rtd_theme" 58 | 59 | # Add any paths that contain custom static files (such as style sheets) here, 60 | # relative to this directory. They are copied after the builtin static files, 61 | # so a file named "default.css" will overwrite the builtin "default.css". 62 | html_static_path = ['_static'] 63 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.api.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.api package 2 | ======================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | feast\_spark.api.JobService\_pb2 module 8 | --------------------------------------- 9 | 10 | .. automodule:: feast_spark.api.JobService_pb2 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | feast\_spark.api.JobService\_pb2\_grpc module 16 | --------------------------------------------- 17 | 18 | .. automodule:: feast_spark.api.JobService_pb2_grpc 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. 
automodule:: feast_spark.api 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.contrib.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.contrib package 2 | ============================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | feast_spark.contrib.validation 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. automodule:: feast_spark.contrib 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.contrib.validation.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.contrib.validation package 2 | ======================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | feast\_spark.contrib.validation.base module 8 | ------------------------------------------- 9 | 10 | .. automodule:: feast_spark.contrib.validation.base 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | feast\_spark.contrib.validation.ge module 16 | ----------------------------------------- 17 | 18 | .. automodule:: feast_spark.contrib.validation.ge 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: feast_spark.contrib.validation 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.pyspark.launchers.aws.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.pyspark.launchers.aws package 2 | ========================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | feast\_spark.pyspark.launchers.aws.emr module 8 | --------------------------------------------- 9 | 10 | .. automodule:: feast_spark.pyspark.launchers.aws.emr 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | feast\_spark.pyspark.launchers.aws.emr\_utils module 16 | ---------------------------------------------------- 17 | 18 | .. automodule:: feast_spark.pyspark.launchers.aws.emr_utils 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: feast_spark.pyspark.launchers.aws 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.pyspark.launchers.gcloud.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.pyspark.launchers.gcloud package 2 | ============================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | feast\_spark.pyspark.launchers.gcloud.dataproc module 8 | ----------------------------------------------------- 9 | 10 | .. automodule:: feast_spark.pyspark.launchers.gcloud.dataproc 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. 
automodule:: feast_spark.pyspark.launchers.gcloud 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.pyspark.launchers.k8s.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.pyspark.launchers.k8s package 2 | ========================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | feast\_spark.pyspark.launchers.k8s.k8s module 8 | --------------------------------------------- 9 | 10 | .. automodule:: feast_spark.pyspark.launchers.k8s.k8s 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | feast\_spark.pyspark.launchers.k8s.k8s\_utils module 16 | ---------------------------------------------------- 17 | 18 | .. automodule:: feast_spark.pyspark.launchers.k8s.k8s_utils 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: feast_spark.pyspark.launchers.k8s 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.pyspark.launchers.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.pyspark.launchers package 2 | ====================================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | feast_spark.pyspark.launchers.aws 11 | feast_spark.pyspark.launchers.gcloud 12 | feast_spark.pyspark.launchers.k8s 13 | feast_spark.pyspark.launchers.standalone 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: feast_spark.pyspark.launchers 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.pyspark.launchers.standalone.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.pyspark.launchers.standalone package 2 | ================================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | feast\_spark.pyspark.launchers.standalone.local module 8 | ------------------------------------------------------ 9 | 10 | .. automodule:: feast_spark.pyspark.launchers.standalone.local 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: feast_spark.pyspark.launchers.standalone 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.pyspark.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.pyspark package 2 | ============================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | feast_spark.pyspark.launchers 11 | 12 | Submodules 13 | ---------- 14 | 15 | feast\_spark.pyspark.abc module 16 | ------------------------------- 17 | 18 | .. automodule:: feast_spark.pyspark.abc 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | feast\_spark.pyspark.historical\_feature\_retrieval\_job module 24 | --------------------------------------------------------------- 25 | 26 | .. 
automodule:: feast_spark.pyspark.historical_feature_retrieval_job 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | feast\_spark.pyspark.launcher module 32 | ------------------------------------ 33 | 34 | .. automodule:: feast_spark.pyspark.launcher 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. automodule:: feast_spark.pyspark 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.rst: -------------------------------------------------------------------------------- 1 | feast\_spark package 2 | ==================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | feast_spark.api 11 | feast_spark.contrib 12 | feast_spark.pyspark 13 | feast_spark.third_party 14 | 15 | Submodules 16 | ---------- 17 | 18 | feast\_spark.cli module 19 | ----------------------- 20 | 21 | .. automodule:: feast_spark.cli 22 | :members: 23 | :undoc-members: 24 | :show-inheritance: 25 | 26 | feast\_spark.client module 27 | -------------------------- 28 | 29 | .. automodule:: feast_spark.client 30 | :members: 31 | :undoc-members: 32 | :show-inheritance: 33 | 34 | feast\_spark.constants module 35 | ----------------------------- 36 | 37 | .. automodule:: feast_spark.constants 38 | :members: 39 | :undoc-members: 40 | :show-inheritance: 41 | 42 | feast\_spark.job\_service module 43 | -------------------------------- 44 | 45 | .. automodule:: feast_spark.job_service 46 | :members: 47 | :undoc-members: 48 | :show-inheritance: 49 | 50 | feast\_spark.remote\_job module 51 | ------------------------------- 52 | 53 | .. automodule:: feast_spark.remote_job 54 | :members: 55 | :undoc-members: 56 | :show-inheritance: 57 | 58 | Module contents 59 | --------------- 60 | 61 | .. automodule:: feast_spark 62 | :members: 63 | :undoc-members: 64 | :show-inheritance: 65 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.third_party.grpc.health.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.third\_party.grpc.health package 2 | ============================================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | feast_spark.third_party.grpc.health.v1 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. automodule:: feast_spark.third_party.grpc.health 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.third_party.grpc.health.v1.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.third\_party.grpc.health.v1 package 2 | ================================================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | feast\_spark.third\_party.grpc.health.v1.HealthService\_pb2 module 8 | ------------------------------------------------------------------ 9 | 10 | .. automodule:: feast_spark.third_party.grpc.health.v1.HealthService_pb2 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | feast\_spark.third\_party.grpc.health.v1.HealthService\_pb2\_grpc module 16 | ------------------------------------------------------------------------ 17 | 18 | .. 
automodule:: feast_spark.third_party.grpc.health.v1.HealthService_pb2_grpc 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: feast_spark.third_party.grpc.health.v1 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.third_party.grpc.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.third\_party.grpc package 2 | ====================================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | feast_spark.third_party.grpc.health 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. automodule:: feast_spark.third_party.grpc 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /python/docs/source/feast_spark.third_party.rst: -------------------------------------------------------------------------------- 1 | feast\_spark.third\_party package 2 | ================================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | feast_spark.third_party.grpc 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. automodule:: feast_spark.third_party 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /python/docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Feast Spark SDK documentation master file, created by 2 | sphinx-quickstart on Sun Mar 21 17:00:24 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Feast Spark SDK's documentation! 7 | =========================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | Client 15 | ================== 16 | 17 | .. automodule:: feast_spark.client 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: -------------------------------------------------------------------------------- /python/docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | feast_spark 2 | =========== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | feast_spark 8 | -------------------------------------------------------------------------------- /python/feast_spark/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import Client 2 | 3 | __all__ = [ 4 | "Client", 5 | ] 6 | -------------------------------------------------------------------------------- /python/feast_spark/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/api/__init__.py -------------------------------------------------------------------------------- /python/feast_spark/cli.py: -------------------------------------------------------------------------------- 1 | import logging.config 2 | 3 | import click 4 | 5 | from feast_spark.job_service import start_job_service 6 | 7 | logging.config.dictConfig( 8 | { 9 | "version": 1, 10 | "disable_existing_loggers": True, 11 | "formatters": { 12 | "standard": {"format": "%(asctime)s [%(levelname)s] %(name)s: %(message)s"}, 13 | }, 14 | "handlers": { 15 | "debug": { 16 | "level": "INFO", 17 | "formatter": "standard", 18 | "class": "logging.StreamHandler", 19 | "stream": "ext://sys.stdout", 20 | }, 21 | "standard": { 22 | "level": "WARNING", 23 | "formatter": "standard", 24 | "class": "logging.StreamHandler", 25 | "stream": "ext://sys.stderr", 26 | }, 27 | }, 28 | "loggers": { 29 | "": {"handlers": ["standard"], "level": "WARNING", "propagate": False}, 30 | "feast_spark": { 31 | "handlers": ["debug", "standard"], 32 | "level": "INFO", 33 | "propagate": False, 34 | }, 35 | "feast": { 36 | "handlers": ["debug", "standard"], 37 | "level": "INFO", 38 | "propagate": False, 39 | }, 40 | }, 41 | } 42 | ) 43 | 44 | 45 | @click.group() 46 | def cli(): 47 | pass 48 | 49 | 50 | @cli.command(name="server") 51 | def server(): 52 | """ 53 | Start Feast Job Service 54 | """ 55 | start_job_service() 56 | 57 | 58 | if __name__ == "__main__": 59 | cli() 60 | -------------------------------------------------------------------------------- /python/feast_spark/contrib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/contrib/__init__.py -------------------------------------------------------------------------------- /python/feast_spark/contrib/validation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/contrib/validation/__init__.py -------------------------------------------------------------------------------- /python/feast_spark/contrib/validation/base.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | try: 4 | from pyspark import cloudpickle 5 | except ImportError: 6 | raise ImportError("pyspark must be installed to enable validation functionality") 7 | 8 | 9 | def serialize_udf(fun, return_type) -> bytes: 10 | buffer = io.BytesIO() 11 | command = (fun, return_type) 12 | cloudpickle.dump(command, buffer) 13 | return buffer.getvalue() 14 | -------------------------------------------------------------------------------- /python/feast_spark/metrics.py: 
-------------------------------------------------------------------------------- 1 | from prometheus_client import Counter 2 | 3 | job_whitelist_failure_count = Counter( 4 | "feast_job_whitelist_failure_count", 5 | "request failures due to feature table not being whitelisted", 6 | ["project", "table"], 7 | ) 8 | job_submission_count = Counter( 9 | "feast_job_submission_count", 10 | "request to submit feast job", 11 | ["job_type", "project", "table"], 12 | ) 13 | job_schedule_count = Counter( 14 | "feast_job_schedule_count", "request to schedule feast job", ["project", "table"] 15 | ) 16 | -------------------------------------------------------------------------------- /python/feast_spark/pyspark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/pyspark/__init__.py -------------------------------------------------------------------------------- /python/feast_spark/pyspark/launchers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/pyspark/launchers/__init__.py -------------------------------------------------------------------------------- /python/feast_spark/pyspark/launchers/aws/__init__.py: -------------------------------------------------------------------------------- 1 | from .emr import ( 2 | EmrBatchIngestionJob, 3 | EmrClusterLauncher, 4 | EmrRetrievalJob, 5 | EmrStreamIngestionJob, 6 | ) 7 | 8 | __all__ = [ 9 | "EmrRetrievalJob", 10 | "EmrBatchIngestionJob", 11 | "EmrStreamIngestionJob", 12 | "EmrClusterLauncher", 13 | ] 14 | -------------------------------------------------------------------------------- /python/feast_spark/pyspark/launchers/gcloud/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataproc import DataprocClusterLauncher, DataprocRetrievalJob 2 | 3 | __all__ = ["DataprocRetrievalJob", "DataprocClusterLauncher"] 4 | -------------------------------------------------------------------------------- /python/feast_spark/pyspark/launchers/k8s/__init__.py: -------------------------------------------------------------------------------- 1 | from .k8s import ( 2 | KubernetesBatchIngestionJob, 3 | KubernetesJobLauncher, 4 | KubernetesRetrievalJob, 5 | KubernetesStreamIngestionJob, 6 | ) 7 | 8 | __all__ = [ 9 | "KubernetesRetrievalJob", 10 | "KubernetesBatchIngestionJob", 11 | "KubernetesStreamIngestionJob", 12 | "KubernetesJobLauncher", 13 | ] 14 | -------------------------------------------------------------------------------- /python/feast_spark/pyspark/launchers/standalone/__init__.py: -------------------------------------------------------------------------------- 1 | from .local import ( 2 | StandaloneClusterLauncher, 3 | StandaloneClusterRetrievalJob, 4 | reset_job_cache, 5 | ) 6 | 7 | __all__ = [ 8 | "StandaloneClusterRetrievalJob", 9 | "StandaloneClusterLauncher", 10 | "reset_job_cache", 11 | ] 12 | -------------------------------------------------------------------------------- /python/feast_spark/third_party/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/third_party/__init__.py 
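A quick usage note on the Prometheus counters defined in python/feast_spark/metrics.py above: each counter carries labels, so callers bind label values before incrementing. The snippet below is only an illustration (the project and table names are made up); the .labels(...).inc() pattern is standard prometheus_client usage rather than anything specific to this repo.

from feast_spark.metrics import job_submission_count, job_whitelist_failure_count

# Illustrative only: record a submission for a hypothetical project/table pair.
job_submission_count.labels(
    job_type="BATCH_INGESTION", project="default", table="driver_statistics"
).inc()

# And record a whitelist rejection for the same hypothetical feature table.
job_whitelist_failure_count.labels(project="default", table="driver_statistics").inc()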
-------------------------------------------------------------------------------- /python/feast_spark/third_party/grpc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/third_party/grpc/__init__.py -------------------------------------------------------------------------------- /python/feast_spark/third_party/grpc/health/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/third_party/grpc/health/__init__.py -------------------------------------------------------------------------------- /python/feast_spark/third_party/grpc/health/v1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/feast_spark/third_party/grpc/health/v1/__init__.py -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 88 3 | target-version = ['py37'] 4 | include = '\.pyi?$' 5 | exclude = ''' 6 | ( 7 | /( 8 | \.eggs # exclude a few common directories in the 9 | | \.git # root of the project 10 | | \.hg 11 | | \.mypy_cache 12 | | \.tox 13 | | \.venv 14 | | _build 15 | | api 16 | | buck-out 17 | | build 18 | | dist 19 | | pb2.py 20 | | \.pyi 21 | | storage 22 | | types 23 | | third_party 24 | )/ 25 | ) 26 | ''' 27 | -------------------------------------------------------------------------------- /python/requirements-ci.txt: -------------------------------------------------------------------------------- 1 | feast>=0.9.8,<0.10.0 2 | cryptography==3.1 3 | flake8 4 | black==19.10b0 5 | isort>=5 6 | grpcio-tools==1.31.0 7 | pyspark==3.1.3 8 | pandas~=1.0.0 9 | mock==2.0.0 10 | pandavro==1.5.* 11 | moto 12 | mypy==0.790 13 | mypy-protobuf 14 | avro==1.10.0 15 | gcsfs 16 | urllib3>=1.25.4 17 | pytest==6.0.0 18 | pytest-lazy-fixture==0.6.3 19 | pytest-timeout==1.4.2 20 | pytest-ordering==0.6.* 21 | pytest-mock==1.10.4 22 | PyYAML>=5.4.* 23 | great-expectations==0.13.2 24 | adlfs==0.5.9 25 | redis==4.1.* 26 | Jinja2==3.0.3 27 | croniter==1.* -------------------------------------------------------------------------------- /python/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | multi_line_output=3 3 | include_trailing_comma=True 4 | force_grid_wrap=0 5 | use_parentheses=True 6 | line_length=88 7 | skip=feast_spark/api,feast_spark/third_party 8 | known_first_party=feast 9 | default_section=THIRDPARTY 10 | 11 | [flake8] 12 | ignore = E203, E266, E501, W503 13 | max-line-length = 88 14 | max-complexity = 20 15 | select = B,C,E,F,W,T4 16 | exclude = .git,__pycache__,docs/conf.py,dist,feast_spark/api,feast_spark/third_party 17 | 18 | [mypy] 19 | files=feast_spark,test 20 | ignore_missing_imports=true 21 | -------------------------------------------------------------------------------- /python/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/python/tests/__init__.py 
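For context on the serialize_udf helper in python/feast_spark/contrib/validation/base.py above: it cloudpickles a (function, return_type) pair into bytes so that a validation UDF can be shipped to a Spark job. A minimal sketch of calling it, assuming pyspark is installed and using a made-up UDF with a BooleanType return type:

from pyspark.sql.types import BooleanType

from feast_spark.contrib.validation.base import serialize_udf


def non_negative(value) -> bool:
    # Hypothetical validation UDF: keep rows whose value is present and >= 0.
    return value is not None and value >= 0


# Pickle the (function, return_type) pair into bytes for the Spark job.
payload = serialize_udf(non_negative, BooleanType())
assert isinstance(payload, bytes)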
-------------------------------------------------------------------------------- /python/tests/data/bookings.csv: -------------------------------------------------------------------------------- 1 | driver_id,event_timestamp,created_timestamp,completed_bookings 2 | 8001,2020-08-31T00:00:00.000,2020-08-31T00:00:00.000,200 3 | 8001,2020-09-01T00:00:00.000,2020-09-01T00:00:00.000,300 4 | 8002,2020-09-01T00:00:00.000,2020-09-01T00:00:00.000,600 5 | 8002,2020-09-01T00:00:00.000,2020-09-02T00:00:00.000,500 6 | 8003,2020-09-01T00:00:00.000,2020-09-02T00:00:00.000,700 7 | -------------------------------------------------------------------------------- /python/tests/data/column_mapping_test_entity.csv: -------------------------------------------------------------------------------- 1 | id,event_timestamp 2 | 1001,2020-09-02T00:00:00.000 3 | 1001,2020-09-03T00:00:00.000 4 | 2001,2020-09-04T00:00:00.000 5 | 2001,2020-09-04T00:00:00.000 6 | 3001,2020-09-04T00:00:00.000 7 | -------------------------------------------------------------------------------- /python/tests/data/column_mapping_test_feature.csv: -------------------------------------------------------------------------------- 1 | customer_id,total_bookings,datetime,created_datetime 2 | 1001,200,2020-09-02T00:00:00.000,2020-09-02T00:00:00.000 3 | 1001,400,2020-09-04T00:00:00.000,2020-09-02T00:00:00.000 4 | 2001,500,2020-09-03T00:00:00.000,2020-09-01T00:00:00.000 5 | 2001,600,2020-09-03T00:00:00.000,2020-09-02T00:00:00.000 6 | 3001,700,2020-09-03T00:00:00.000,2020-09-03T00:00:00.000 7 | -------------------------------------------------------------------------------- /python/tests/data/customer_driver_pairs.csv: -------------------------------------------------------------------------------- 1 | customer_id,driver_id,event_timestamp 2 | 1001,8001,2020-09-02T00:00:00.000 3 | 1001,8002,2020-09-02T00:00:00.000 4 | 1001,8002,2020-09-03T00:00:00.000 5 | 2001,8002,2020-09-03T00:00:00.000 6 | 2001,8002,2020-09-04T00:00:00.000 -------------------------------------------------------------------------------- /python/tests/data/customers.csv: -------------------------------------------------------------------------------- 1 | customer_id,event_timestamp 2 | 1001,2020-09-02T00:00:00.000 3 | 1002,2020-09-02T00:00:00.000 4 | 1003,2020-09-03T00:00:00.000 5 | 1004,2020-09-03T00:00:00.000 6 | 1005,2020-09-04T00:00:00.000 7 | -------------------------------------------------------------------------------- /python/tests/data/single_customer.csv: -------------------------------------------------------------------------------- 1 | customer_id,event_timestamp 2 | 1001,2020-09-02T00:00:00.000 3 | -------------------------------------------------------------------------------- /python/tests/data/transactions.csv: -------------------------------------------------------------------------------- 1 | customer_id,event_timestamp,created_timestamp,daily_transactions 2 | 1001,2020-08-31T00:00:00.000,2020-09-01T00:00:00.000,50.0 3 | 1001,2020-09-01T00:00:00.000,2020-09-01T00:00:00.000,100.0 4 | 2001,2020-09-01T00:00:00.000,2020-08-31T00:00:00.000,80.0 5 | 2001,2020-09-01T00:00:00.000,2020-09-01T00:00:00.000,200.0 6 | 3001,2020-09-01T00:00:00.000,2020-09-01T00:00:00.000,300.0 -------------------------------------------------------------------------------- /python/tests/test_launcher_abc.py: -------------------------------------------------------------------------------- 1 | from feast_spark.pyspark.abc import StreamIngestionJobParameters 2 | 3 | 4 | def 
test_stream_ingestion_job_hash(): 5 | streaming_source = { 6 | "kafka": { 7 | "event_timestamp_column": "event_timestamp", 8 | "bootstrap_servers": "localhost:9092", 9 | "topic": "test", 10 | "format": { 11 | "class_path": "com.test.someprotos", 12 | "json_class": "ProtoFormat", 13 | }, 14 | } 15 | } 16 | feature_table = { 17 | "features": [ 18 | {"name": "feature_1", "type": "STRING"}, 19 | {"name": "feature_2", "type": "STRING"}, 20 | ], 21 | "entities": [ 22 | {"name": "entity_1", "type": "STRING"}, 23 | {"name": "entity_2", "type": "STRING"}, 24 | ], 25 | "project": "someproject", 26 | } 27 | feature_table_with_different_order = { 28 | "features": [ 29 | {"name": "feature_2", "type": "STRING"}, 30 | {"name": "feature_1", "type": "STRING"}, 31 | ], 32 | "entities": [ 33 | {"name": "entity_2", "type": "STRING"}, 34 | {"name": "entity_1", "type": "STRING"}, 35 | ], 36 | "project": "someproject", 37 | } 38 | param = StreamIngestionJobParameters( 39 | source=streaming_source, feature_table=feature_table, jar="" 40 | ) 41 | param_different_order = StreamIngestionJobParameters( 42 | source=streaming_source, 43 | feature_table=feature_table_with_different_order, 44 | jar="", 45 | ) 46 | assert param.get_job_hash() == param_different_order.get_job_hash() 47 | -------------------------------------------------------------------------------- /python/tests/test_lock_manager.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | 5 | from feast_spark.lock_manager import JobOperation, JobOperationLock 6 | 7 | job_hash = "dummy_hash" 8 | 9 | 10 | class MockRedis: 11 | def __init__(self, cache=dict()): 12 | self.cache = cache 13 | 14 | def get(self, name): 15 | if name in self.cache: 16 | return self.cache[name] 17 | return None 18 | 19 | def set(self, name, value, *args, **kwargs): 20 | if name not in self.cache: 21 | self.cache[name] = value.encode("utf-8") 22 | return "OK" 23 | 24 | def delete(self, name): 25 | if name in self.cache: 26 | self.cache.pop(name) 27 | return None 28 | 29 | 30 | @pytest.fixture 31 | def lock_config(): 32 | return {"redis_host": "localhost", "redis_port": 0, "lock_expiry": 5} 33 | 34 | 35 | @patch("redis.Redis") 36 | def test_lock_manager_context(mock_redis, lock_config): 37 | mock_redis_connection = MockRedis() 38 | mock_redis.return_value = mock_redis_connection 39 | with JobOperationLock( 40 | job_hash=job_hash, operation=JobOperation.START, **lock_config 41 | ) as lock: 42 | # test lock acquired 43 | assert lock 44 | # verify lock key in cache 45 | assert ( 46 | f"lock_{JobOperation.START.value}_{job_hash}" in mock_redis_connection.cache 47 | ) 48 | # verify release 49 | assert ( 50 | f"lock_{JobOperation.START.value}_{job_hash}" not in mock_redis_connection.cache 51 | ) 52 | 53 | 54 | @patch("redis.Redis") 55 | def test_lock_manager_lock_not_available(mock_redis, lock_config): 56 | cache = {"lock_st_dummy_hash": b"127a32aaf729dc87"} 57 | mock_redis_connection = MockRedis(cache) 58 | mock_redis.return_value = mock_redis_connection 59 | with JobOperationLock( 60 | job_hash=job_hash, operation=JobOperation.START, **lock_config 61 | ) as lock: 62 | # test lock not acquired 63 | assert not lock 64 | 65 | 66 | def test_lock_manager_connection_error(lock_config): 67 | with JobOperationLock( 68 | job_hash=job_hash, operation=JobOperation.START, **lock_config 69 | ) as lock: 70 | # test lock not acquired 71 | assert not lock 72 | 
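The tests above exercise feast_spark.lock_manager.JobOperationLock, whose implementation is not shown here. Based only on what the tests assert (a lock_<operation>_<job_hash> key, a lock_expiry TTL, release on context exit, and connection errors treated as "lock not acquired"), a minimal sketch of such a Redis SET NX EX lock might look like the following; the real class may differ, and the operation argument here is just the short operation code string (e.g. "st"):

import secrets

import redis


class SimpleJobLock:
    """Minimal sketch of a Redis SET-NX-EX lock mirroring the behaviour the
    tests above expect; not the actual JobOperationLock implementation."""

    def __init__(self, job_hash, operation, redis_host, redis_port, lock_expiry):
        self._key = f"lock_{operation}_{job_hash}"
        self._token = secrets.token_hex(8)
        self._expiry = lock_expiry
        self._client = redis.Redis(host=redis_host, port=redis_port)

    def __enter__(self):
        try:
            # SET NX EX: acquire only if the key is absent, with a TTL as a safety net.
            if self._client.set(self._key, self._token, nx=True, ex=self._expiry):
                return self._token
        except redis.exceptions.RedisError:
            pass  # treat connection problems as "lock not acquired"
        return None

    def __exit__(self, exc_type, exc, tb):
        try:
            # Only the holder should release: compare the stored token before deleting.
            if self._client.get(self._key) == self._token.encode():
                self._client.delete(self._key)
        except redis.exceptions.RedisError:
            pass

The TTL is the safety net that keeps a crashed job-service replica from holding the lock forever, which is presumably why the tests configure lock_expiry explicitly.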
-------------------------------------------------------------------------------- /spark/ingestion/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Standard spark configuration # 2 | 3 | log4j.rootCategory=INFO, console 4 | log4j.appender.console=org.apache.log4j.ConsoleAppender 5 | log4j.appender.console.target=System.out 6 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 8 | 9 | # Settings to quiet third party logs that are too verbose 10 | log4j.logger.org.sparkproject.jetty=WARN 11 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR 12 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 13 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 14 | log4j.logger.org.apache.parquet=ERROR 15 | log4j.logger.parquet=ERROR 16 | 17 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 18 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 19 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 20 | 21 | 22 | # Feast # 23 | log4j.appender.termination=org.apache.log4j.FileAppender 24 | log4j.appender.termination.File=/dev/termination-log 25 | log4j.appender.termination.Append=true 26 | log4j.appender.termination.ImmediateFlush=true 27 | log4j.appender.termination.layout=org.apache.log4j.PatternLayout 28 | log4j.appender.termination.layout.ConversionPattern=%c{1}: %m%n 29 | 30 | log4j.logger.feast=FATAL, termination -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/metrics/IngestionPipelineMetrics.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | package feast.ingestion.metrics 18 | 19 | import org.apache.spark.SparkEnv 20 | import org.apache.spark.metrics.source.IngestionPipelineMetricSource 21 | import org.apache.spark.sql.Row 22 | 23 | class IngestionPipelineMetrics extends Serializable { 24 | 25 | def incrementDeadLetters(row: Row): Row = { 26 | metricSource.foreach(_.METRIC_DEADLETTER_ROWS_INSERTED.inc()) 27 | row 28 | } 29 | 30 | def incrementRead(row: Row): Row = { 31 | metricSource.foreach(_.METRIC_ROWS_READ_FROM_SOURCE.inc()) 32 | row 33 | } 34 | 35 | private lazy val metricSource: Option[IngestionPipelineMetricSource] = { 36 | val metricsSystem = SparkEnv.get.metricsSystem 37 | IngestionPipelineMetricsLock.synchronized { 38 | if (metricsSystem.getSourcesByName(IngestionPipelineMetricSource.sourceName).isEmpty) { 39 | metricsSystem.registerSource(new IngestionPipelineMetricSource) 40 | } 41 | } 42 | 43 | metricsSystem.getSourcesByName(IngestionPipelineMetricSource.sourceName) match { 44 | case Seq(head) => Some(head.asInstanceOf[IngestionPipelineMetricSource]) 45 | case _ => None 46 | } 47 | } 48 | } 49 | 50 | private object IngestionPipelineMetricsLock 51 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/metrics/StreamingMetrics.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.metrics 18 | 19 | import org.apache.spark.SparkEnv 20 | import org.apache.spark.metrics.source.StreamingMetricSource 21 | import org.apache.spark.sql.streaming.StreamingQueryProgress 22 | 23 | class StreamingMetrics extends Serializable { 24 | 25 | private val metricSource: Option[StreamingMetricSource] = { 26 | val metricsSystem = SparkEnv.get.metricsSystem 27 | 28 | metricsSystem.getSourcesByName(StreamingMetricSource.sourceName) match { 29 | case Seq(head) => Some(head.asInstanceOf[StreamingMetricSource]) 30 | case _ => None 31 | } 32 | } 33 | 34 | def updateStreamingProgress( 35 | progress: StreamingQueryProgress 36 | ): Unit = { 37 | metricSource.foreach(_.updateStreamingProgress(progress)) 38 | } 39 | 40 | def updateKafkaTimestamp(timestamp: Long): Unit = { 41 | metricSource.foreach(_.updateKafkaTimestamp(timestamp)) 42 | } 43 | } 44 | 45 | private object StreamingMetricsLock 46 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/registry/proto/LocalProtoRegistry.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.registry.proto 18 | import java.io.{IOException, ObjectInputStream} 19 | 20 | import com.google.protobuf.Descriptors.Descriptor 21 | 22 | import collection.mutable 23 | import scala.util.control.NonFatal 24 | 25 | class LocalProtoRegistry extends ProtoRegistry { 26 | @transient 27 | private var cache: mutable.Map[String, Descriptor] = mutable.Map.empty 28 | 29 | @throws(classOf[IOException]) 30 | private def readObject(ois: ObjectInputStream): Unit = { 31 | try { 32 | ois.defaultReadObject() 33 | cache = mutable.Map.empty 34 | } catch { 35 | case NonFatal(e) => 36 | throw new IOException(e) 37 | } 38 | } 39 | 40 | override def getProtoDescriptor(className: String): Descriptor = { 41 | if (!cache.contains(className)) { 42 | cache(className) = Class 43 | .forName(className, true, getClass.getClassLoader) 44 | .getMethod("getDescriptor") 45 | .invoke(null) 46 | .asInstanceOf[Descriptor] 47 | } 48 | 49 | cache(className) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/registry/proto/ProtoRegistry.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.registry.proto 18 | 19 | import com.google.protobuf.Descriptors.Descriptor 20 | 21 | trait ProtoRegistry extends Serializable { 22 | def getProtoDescriptor(className: String): Descriptor 23 | } 24 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/registry/proto/ProtoRegistryFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.registry.proto 18 | 19 | import org.apache.spark.sql.SparkSession 20 | 21 | object ProtoRegistryFactory { 22 | val CONFIG_PREFIX = "feast.ingestion.registry.proto." 23 | val PROTO_REGISTRY_KIND = s"${CONFIG_PREFIX}kind" 24 | val DEFAULT_KIND = "local" 25 | 26 | def resolveProtoRegistry(sparkSession: SparkSession): ProtoRegistry = { 27 | val config = sparkSession.sparkContext.getConf 28 | val kind = config.get(PROTO_REGISTRY_KIND, DEFAULT_KIND) 29 | val properties = config.getAllWithPrefix(CONFIG_PREFIX).toMap 30 | protoRegistry(kind, properties) 31 | } 32 | 33 | private def protoRegistry(name: String, properties: Map[String, String]): ProtoRegistry = 34 | name match { 35 | case "local" => new LocalProtoRegistry 36 | case "stencil" => new StencilProtoRegistry(properties("url"), properties.get("token")) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/registry/proto/StencilProtoRegistry.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.registry.proto 18 | import com.google.protobuf.Descriptors 19 | import io.odpf.stencil.StencilClientFactory 20 | import io.odpf.stencil.client.StencilClient 21 | import io.odpf.stencil.config.StencilConfig 22 | import org.apache.http.{Header, HttpHeaders} 23 | import org.apache.http.message.BasicHeader 24 | 25 | import scala.collection.JavaConverters._ 26 | 27 | class StencilProtoRegistry(url: String, token: Option[String]) extends ProtoRegistry { 28 | import StencilProtoRegistry.stencilClient 29 | 30 | override def getProtoDescriptor(className: String): Descriptors.Descriptor = { 31 | stencilClient(url, token).get(className) 32 | } 33 | } 34 | 35 | object StencilProtoRegistry { 36 | @transient 37 | private var _stencilClient: StencilClient = _ 38 | 39 | def stencilClient(url: String, token: Option[String]): StencilClient = { 40 | if (_stencilClient == null) { 41 | val stencilConfigBuilder = StencilConfig.builder 42 | for (t <- token) { 43 | val authHeader = new BasicHeader(HttpHeaders.AUTHORIZATION, "Bearer " + t) 44 | val headers = List[Header](authHeader) 45 | stencilConfigBuilder.fetchHeaders(headers.asJava) 46 | } 47 | val stencilConfig = stencilConfigBuilder.build() 48 | _stencilClient = StencilClientFactory.getClient(url, stencilConfig) 49 | } 50 | _stencilClient 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/sources/bq/BigQueryReader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.sources.bq 18 | 19 | import java.sql.Timestamp 20 | 21 | import feast.ingestion.BQSource 22 | import org.joda.time.DateTime 23 | import org.apache.spark.sql.{DataFrame, SQLContext} 24 | import org.apache.spark.sql.functions.col 25 | 26 | object BigQueryReader { 27 | def createBatchSource( 28 | sqlContext: SQLContext, 29 | source: BQSource, 30 | start: DateTime, 31 | end: DateTime 32 | ): DataFrame = { 33 | val reader = sqlContext.read 34 | .format("bigquery") 35 | .option("viewsEnabled", "true") 36 | 37 | source.materialization foreach { materializationConfig => 38 | reader 39 | .option("materializationProject", materializationConfig.project) 40 | .option("materializationDataset", materializationConfig.dataset) 41 | } 42 | 43 | reader 44 | .load(s"${source.project}.${source.dataset}.${source.table}") 45 | .filter(col(source.eventTimestampColumn) >= new Timestamp(start.getMillis)) 46 | .filter(col(source.eventTimestampColumn) < new Timestamp(end.getMillis)) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/sources/file/FileReader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.sources.file 18 | 19 | import java.sql.{Timestamp, Date} 20 | 21 | import feast.ingestion.FileSource 22 | import org.apache.spark.sql.functions.col 23 | import org.apache.spark.sql.{DataFrame, SQLContext} 24 | import org.joda.time.DateTime 25 | 26 | object FileReader { 27 | def createBatchSource( 28 | sqlContext: SQLContext, 29 | source: FileSource, 30 | start: DateTime, 31 | end: DateTime 32 | ): DataFrame = { 33 | val reader = sqlContext.read 34 | .parquet(source.path) 35 | .filter(col(source.eventTimestampColumn) >= new Timestamp(start.getMillis)) 36 | .filter(col(source.eventTimestampColumn) < new Timestamp(end.getMillis)) 37 | 38 | source.datePartitionColumn match { 39 | case Some(partitionColumn) if partitionColumn.nonEmpty => 40 | reader 41 | .filter(col(partitionColumn) >= new Date(start.getMillis)) 42 | .filter(col(partitionColumn) <= new Date(end.getMillis)) 43 | case _ => reader 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/bigtable/SparkBigtableConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.bigtable 18 | 19 | case class SparkBigtableConfig( 20 | namespace: String, 21 | projectName: String, 22 | entityColumns: Array[String], 23 | timestampColumn: String, 24 | maxAge: Long 25 | ) 26 | object SparkBigtableConfig { 27 | val NAMESPACE = "namespace" 28 | val ENTITY_COLUMNS = "entity_columns" 29 | val TS_COLUMN = "timestamp_column" 30 | val PROJECT_NAME = "project_name" 31 | val MAX_AGE = "max_age" 32 | 33 | def parse(parameters: Map[String, String]): SparkBigtableConfig = 34 | SparkBigtableConfig( 35 | namespace = parameters.getOrElse(NAMESPACE, ""), 36 | projectName = parameters.getOrElse(PROJECT_NAME, "default"), 37 | entityColumns = parameters.getOrElse(ENTITY_COLUMNS, "").split(","), 38 | timestampColumn = parameters.getOrElse(TS_COLUMN, "event_timestamp"), 39 | maxAge = parameters.get(MAX_AGE).map(_.toLong).getOrElse(0) 40 | ) 41 | } 42 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/cassandra/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.stores.cassandra 18 | 19 | import feast.ingestion.stores.serialization.AvroSerializer 20 | import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider} 21 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} 22 | 23 | class DefaultSource extends CreatableRelationProvider { 24 | override def createRelation( 25 | sqlContext: SQLContext, 26 | mode: SaveMode, 27 | parameters: Map[String, String], 28 | data: DataFrame 29 | ): BaseRelation = { 30 | 31 | val rel = 32 | new CassandraSinkRelation( 33 | sqlContext, 34 | new AvroSerializer, 35 | SparkCassandraConfig.parse(parameters) 36 | ) 37 | rel.createTable() 38 | rel.saveWriteSchema(data) 39 | rel.insert(data, overwrite = false) 40 | rel 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/cassandra/SparkCassandraConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.cassandra 18 | 19 | case class SparkCassandraConfig( 20 | namespace: String, 21 | projectName: String, 22 | entityColumns: Array[String], 23 | timestampColumn: String, 24 | maxAge: Long 25 | ) 26 | 27 | object SparkCassandraConfig { 28 | val NAMESPACE = "namespace" 29 | val ENTITY_COLUMNS = "entity_columns" 30 | val TS_COLUMN = "timestamp_column" 31 | val PROJECT_NAME = "project_name" 32 | val MAX_AGE = "max_age" 33 | 34 | def parse(parameters: Map[String, String]): SparkCassandraConfig = 35 | SparkCassandraConfig( 36 | namespace = parameters.getOrElse(NAMESPACE, ""), 37 | projectName = parameters.getOrElse(PROJECT_NAME, "default"), 38 | entityColumns = parameters.getOrElse(ENTITY_COLUMNS, "").split(","), 39 | timestampColumn = parameters.getOrElse(TS_COLUMN, "event_timestamp"), 40 | maxAge = parameters.get(MAX_AGE).map(_.toLong).getOrElse(0) 41 | ) 42 | } 43 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/ClusterPipelineProvider.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2022 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | import redis.clients.jedis.commands.PipelineBinaryCommands 20 | import redis.clients.jedis.{ClusterPipeline, DefaultJedisClientConfig, HostAndPort, Response} 21 | import redis.clients.jedis.providers.ClusterConnectionProvider 22 | 23 | import scala.collection.JavaConverters._ 24 | 25 | /** 26 | * Provide pipeline for Redis cluster. 27 | */ 28 | case class ClusterPipelineProvider(endpoint: RedisEndpoint) extends PipelineProvider { 29 | 30 | val nodes = Set(new HostAndPort(endpoint.host, endpoint.port)).asJava 31 | val DEFAULT_CLIENT_CONFIG = DefaultJedisClientConfig 32 | .builder() 33 | .password(endpoint.password) 34 | .build() 35 | val provider = new ClusterConnectionProvider(nodes, DEFAULT_CLIENT_CONFIG) 36 | 37 | /** 38 | * @return execute commands within a pipeline and return the result 39 | */ 40 | override def withPipeline[T](ops: PipelineBinaryCommands => T): T = { 41 | val pipeline = new ClusterPipeline(provider) 42 | val response = ops(pipeline) 43 | pipeline.close() 44 | response 45 | } 46 | 47 | /** 48 | * Close client connection 49 | */ 50 | override def close(): Unit = { 51 | provider.close() 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} 20 | import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider} 21 | 22 | /** 23 | * Entrypoint to Redis Storage. Implements only `CreatableRelationProvider` since it's only possible write to Redis. 
24 | * Here we parse configuration from spark parameters & provide SparkRedisConfig to `RedisSinkRelation` 25 | */ 26 | class RedisRelationProvider extends CreatableRelationProvider { 27 | override def createRelation( 28 | sqlContext: SQLContext, 29 | mode: SaveMode, 30 | parameters: Map[String, String], 31 | data: DataFrame 32 | ): BaseRelation = { 33 | val config = SparkRedisConfig.parse(parameters) 34 | val relation = new RedisSinkRelation(sqlContext, config) 35 | 36 | relation.insert(data, overwrite = false) 37 | 38 | relation 39 | } 40 | } 41 | 42 | class DefaultSource extends RedisRelationProvider 43 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/Persistence.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | import java.sql.Timestamp 20 | import java.util 21 | import org.apache.spark.sql.Row 22 | import redis.clients.jedis.commands.PipelineBinaryCommands 23 | import redis.clients.jedis.Response 24 | 25 | /** 26 | * Determine how a Spark row should be serialized and stored on Redis. 27 | */ 28 | trait Persistence { 29 | 30 | /** 31 | * Persist a Spark row to Redis 32 | * 33 | * @param pipeline Redis pipeline 34 | * @param key Redis key in serialized bytes format 35 | * @param row Row representing the value to be persist 36 | * @param expiryTimestamp Expiry timestamp for the row 37 | * @param maxExpiryTimestamp No ttl should be set if the expiry timestamp 38 | * is equal to the maxExpiryTimestamp 39 | */ 40 | def save( 41 | pipeline: PipelineBinaryCommands, 42 | key: Array[Byte], 43 | row: Row, 44 | expiryTimestamp: Option[Timestamp] 45 | ): Unit 46 | 47 | /** 48 | * Returns a Redis response, which can be used by `storedTimestamp` and `newExpiryTimestamp` to 49 | * derive the currently stored event timestamp, and the updated expiry timestamp. This method will 50 | * be called prior to persisting the row to Redis, so that `RedisSinkRelation` can decide whether 51 | * the currently stored value should be updated. 52 | * 53 | * @param pipeline Redis pipeline 54 | * @param key Redis key in serialized bytes format 55 | * @return Redis response representing the row value 56 | */ 57 | def get( 58 | pipeline: PipelineBinaryCommands, 59 | key: Array[Byte] 60 | ): Response[util.Map[Array[Byte], Array[Byte]]] 61 | 62 | /** 63 | * Returns the currently stored event timestamp for the key and the feature table associated with the ingestion job. 64 | * 65 | * @param value Response returned from `get` 66 | * @return Stored event timestamp associated with the key. Returns `None` if 67 | * the key is not present in Redis, or if timestamp information is 68 | * unavailable on the stored value. 
69 | */ 70 | def storedTimestamp(value: util.Map[Array[Byte], Array[Byte]]): Option[Timestamp] 71 | 72 | } 73 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/PipelineProvider.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2022 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | import redis.clients.jedis.Response 20 | import redis.clients.jedis.commands.PipelineBinaryCommands 21 | 22 | import java.io.Closeable 23 | 24 | /** 25 | * Provide either a pipeline or cluster pipeline to read and write data into Redis. 26 | */ 27 | trait PipelineProvider { 28 | 29 | def withPipeline[T](ops: PipelineBinaryCommands => T): T 30 | 31 | /** 32 | * Close client connection 33 | */ 34 | def close(): Unit 35 | } 36 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/PipelineProviderFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2022 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | import redis.clients.jedis.Jedis 20 | 21 | import scala.collection.mutable 22 | import scala.util.Try 23 | 24 | object PipelineProviderFactory { 25 | 26 | private lazy val providers: mutable.Map[RedisEndpoint, PipelineProvider] = mutable.Map.empty 27 | 28 | private def newJedisClient(endpoint: RedisEndpoint): Jedis = { 29 | val jedis = new Jedis(endpoint.host, endpoint.port) 30 | if (endpoint.password.nonEmpty) { 31 | jedis.auth(endpoint.password) 32 | } 33 | jedis 34 | } 35 | 36 | private def checkIfInClusterMode(endpoint: RedisEndpoint): Boolean = { 37 | val jedis = newJedisClient(endpoint) 38 | val isCluster = Try(jedis.clusterInfo()).isSuccess 39 | jedis.close() 40 | isCluster 41 | } 42 | 43 | private def clusterPipelineProvider(endpoint: RedisEndpoint): PipelineProvider = { 44 | ClusterPipelineProvider(endpoint) 45 | } 46 | 47 | private def singleNodePipelineProvider(endpoint: RedisEndpoint): PipelineProvider = { 48 | SingleNodePipelineProvider(endpoint) 49 | } 50 | 51 | def newProvider(endpoint: RedisEndpoint): PipelineProvider = { 52 | if (checkIfInClusterMode(endpoint)) { 53 | clusterPipelineProvider(endpoint) 54 | } else { 55 | singleNodePipelineProvider(endpoint) 56 | } 57 | } 58 | 59 | def provider(endpoint: RedisEndpoint): PipelineProvider = { 60 | providers.getOrElseUpdate(endpoint, newProvider(endpoint)) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/RedisEndpoint.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2022 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | case class RedisEndpoint(host: String, port: Int, password: String) 20 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/SingleNodePipelineProvider.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2022 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | import redis.clients.jedis.commands.PipelineBinaryCommands 20 | import redis.clients.jedis.{JedisPool, Response} 21 | 22 | /** 23 | * Provide pipeline for single node Redis. 24 | */ 25 | case class SingleNodePipelineProvider(endpoint: RedisEndpoint) extends PipelineProvider { 26 | 27 | val jedisPool = new JedisPool(endpoint.host, endpoint.port) 28 | 29 | /** 30 | * @return execute command within a pipeline and return the result 31 | */ 32 | override def withPipeline[T](ops: PipelineBinaryCommands => T): T = { 33 | val jedis = jedisPool.getResource 34 | if (endpoint.password.nonEmpty) { 35 | jedis.auth(endpoint.password) 36 | } 37 | val response = ops(jedis.pipelined()) 38 | jedis.close() 39 | response 40 | } 41 | 42 | /** 43 | * Close client connection 44 | */ 45 | override def close(): Unit = jedisPool.close() 46 | 47 | } 48 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/redis/SparkRedisConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.redis 18 | 19 | case class SparkRedisConfig( 20 | namespace: String, 21 | projectName: String, 22 | entityColumns: Array[String], 23 | timestampColumn: String, 24 | timestampPrefix: String = "_ts", 25 | repartitionByEntity: Boolean = true, 26 | maxAge: Long = 0, 27 | expiryPrefix: String = "_ex" 28 | ) 29 | 30 | object SparkRedisConfig { 31 | val NAMESPACE = "namespace" 32 | val ENTITY_COLUMNS = "entity_columns" 33 | val TS_COLUMN = "timestamp_column" 34 | val ENTITY_REPARTITION = "entity_repartition" 35 | val PROJECT_NAME = "project_name" 36 | val MAX_AGE = "max_age" 37 | 38 | def parse(parameters: Map[String, String]): SparkRedisConfig = 39 | SparkRedisConfig( 40 | namespace = parameters.getOrElse(NAMESPACE, ""), 41 | projectName = parameters.getOrElse(PROJECT_NAME, "default"), 42 | entityColumns = parameters.getOrElse(ENTITY_COLUMNS, "").split(","), 43 | timestampColumn = parameters.getOrElse(TS_COLUMN, "event_timestamp"), 44 | repartitionByEntity = parameters.getOrElse(ENTITY_REPARTITION, "true") == "true", 45 | maxAge = parameters.get(MAX_AGE).map(_.toLong).getOrElse(0) 46 | ) 47 | } 48 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/serialization/AvroSerializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.serialization 18 | 19 | import com.google.common.hash.Hashing 20 | import org.apache.spark.sql.Column 21 | import org.apache.spark.sql.avro.SchemaConverters 22 | import org.apache.spark.sql.avro.functions.to_avro 23 | import org.apache.spark.sql.types.StructType 24 | 25 | class AvroSerializer extends Serializer { 26 | override type SchemaType = String 27 | 28 | def convertSchema(schema: StructType): String = { 29 | val avroSchema = SchemaConverters.toAvroType(schema) 30 | avroSchema.toString 31 | } 32 | 33 | def schemaReference(schema: String): Array[Byte] = { 34 | Hashing.murmur3_32().hashBytes(schema.getBytes).asBytes() 35 | } 36 | 37 | def serializeData(schema: String): Column => Column = to_avro(_, schema) 38 | } 39 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/stores/serialization/Serializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.stores.serialization 18 | 19 | import org.apache.spark.sql.Column 20 | import org.apache.spark.sql.types.StructType 21 | 22 | trait Serializer { 23 | type SchemaType 24 | 25 | def convertSchema(schema: StructType): SchemaType 26 | 27 | def schemaReference(schema: SchemaType): Array[Byte] 28 | 29 | def serializeData(schema: SchemaType): Column => Column 30 | } 31 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/utils/JsonUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.utils 18 | 19 | import java.util.Locale.ENGLISH 20 | 21 | import org.json4s.{JArray, JField, JObject, JValue} 22 | 23 | object JsonUtils { 24 | def mapFieldWithParent(jv: JValue)(f: (String, JField) => JField): JValue = { 25 | def rec(v: JValue, parent: String = ""): JValue = v match { 26 | case JObject(l) => JObject(l.map { case (key, va) => f(parent, key -> rec(va, key)) }) 27 | case JArray(l) => JArray(l.map(rec(_, parent))) 28 | case x => x 29 | } 30 | rec(jv) 31 | } 32 | 33 | def camelize(word: String): String = { 34 | if (word.nonEmpty) { 35 | val w = pascalize(word) 36 | w.substring(0, 1).toLowerCase(ENGLISH) + w.substring(1) 37 | } else { 38 | word 39 | } 40 | } 41 | 42 | def pascalize(word: String): String = { 43 | val lst = word.split("_").toList 44 | (lst.headOption.map(s => s.substring(0, 1).toUpperCase(ENGLISH) + s.substring(1)).get :: 45 | lst.tail.map(s => s.substring(0, 1).toUpperCase + s.substring(1))).mkString("") 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/utils/StringUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.utils 18 | 19 | import com.google.common.hash.Hashing 20 | 21 | object StringUtils { 22 | private def suffixHash(expr: String): String = { 23 | Hashing.murmur3_32().hashBytes(expr.getBytes).toString 24 | } 25 | 26 | def trimAndHash(expr: String, maxLength: Int): String = { 27 | // Length 8 suffix as derived from murmurhash_32 implementation 28 | val maxPrefixLength = maxLength - 8 29 | if (expr.length > maxLength) 30 | expr 31 | .take(maxPrefixLength) 32 | .concat(suffixHash(expr.substring(maxPrefixLength))) 33 | else 34 | expr 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/utils/testing/MemoryStreamingSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.utils.testing 18 | 19 | import feast.ingestion.{DataFormat, StreamingSource} 20 | import org.apache.spark.sql.DataFrame 21 | import org.apache.spark.sql.execution.streaming.MemoryStream 22 | 23 | // For test purposes 24 | case class MemoryStreamingSource( 25 | stream: MemoryStream[_], 26 | override val fieldMapping: Map[String, String] = Map.empty, 27 | override val eventTimestampColumn: String = "timestamp", 28 | override val createdTimestampColumn: Option[String] = None, 29 | override val datePartitionColumn: Option[String] = None 30 | ) extends StreamingSource { 31 | def read: DataFrame = stream.toDF() 32 | 33 | override def format: DataFormat = null 34 | } 35 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/validation/Expectation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2022 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.validation 18 | 19 | import org.apache.spark.sql.Column 20 | import org.apache.spark.sql.functions.{col, lit} 21 | import org.json4s.{CustomSerializer, DefaultFormats, Extraction, Formats, JObject, JValue} 22 | 23 | trait Expectation { 24 | 25 | def validate: Column 26 | } 27 | 28 | case class ExpectColumnValuesToNotBeNull(columnName: String) extends Expectation { 29 | override def validate: Column = col(columnName).isNotNull 30 | } 31 | 32 | case class ExpectColumnValuesToBeBetween( 33 | columnName: String, 34 | minValue: Option[Int], 35 | maxValue: Option[Int] 36 | ) extends Expectation { 37 | override def validate: Column = { 38 | (minValue, maxValue) match { 39 | case (Some(min), Some(max)) => col(columnName).between(min, max) 40 | case (Some(min), None) => col(columnName).>=(min) 41 | case (None, Some(max)) => col(columnName).<=(max) 42 | case _ => lit(true) 43 | } 44 | } 45 | } 46 | 47 | object Expectation { 48 | implicit val format: Formats = DefaultFormats 49 | 50 | def extractColumn(kwargs: JValue): String = { 51 | (kwargs \ "column").extract[String] 52 | } 53 | 54 | def apply(expectationType: String, kwargs: JValue): Expectation = { 55 | expectationType match { 56 | case "expect_column_values_to_not_be_null" => 57 | ExpectColumnValuesToNotBeNull(extractColumn(kwargs)) 58 | case "expect_column_values_to_be_between" => 59 | val column = extractColumn(kwargs) 60 | val minValue = (kwargs \ "minValue").toSome.map(_.extract[Int]) 61 | val maxValue = (kwargs \ "maxValue").toSome.map(_.extract[Int]) 62 | ExpectColumnValuesToBeBetween(column, minValue, maxValue) 63 | } 64 | } 65 | } 66 | 67 | object ExpectationCodec 68 | extends CustomSerializer[Expectation](implicit format => 69 | ( 70 | { case x: JObject => 71 | val eType: String = (x \ "expectationType").extract[String] 72 | val kwargs: JValue = (x \ "kwargs") 73 | Expectation(eType, 
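          // descriptive note: "kwargs" supplies the column name and optional bounds, while the
          // "expectationType" discriminator (extracted above) selects the concrete Expectation
          // implementation via Expectation.apply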
kwargs) 74 | }, 75 | { case x: Expectation => 76 | Extraction.decompose(x) 77 | } 78 | ) 79 | ) 80 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/feast/ingestion/validation/RowValidator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.validation 18 | 19 | import feast.ingestion.{FeatureTable, ExpectationSpec} 20 | import org.apache.spark.sql.Column 21 | import org.apache.spark.sql.functions.{col, lit} 22 | 23 | class RowValidator( 24 | featureTable: FeatureTable, 25 | timestampColumn: String, 26 | expectationSpec: Option[ExpectationSpec] 27 | ) extends Serializable { 28 | 29 | def allEntitiesPresent: Column = 30 | featureTable.entities.map(e => col(e.name).isNotNull).reduce(_.&&(_)) 31 | 32 | def atLeastOneFeatureNotNull: Column = 33 | featureTable.features.map(f => col(f.name).isNotNull).reduce(_.||(_)) 34 | 35 | def timestampPresent: Column = 36 | col(timestampColumn).isNotNull 37 | 38 | def validationChecks: Column = { 39 | 40 | expectationSpec match { 41 | case Some(value) if value.expectations.isEmpty => lit(true) 42 | case Some(value) => 43 | value.expectations.map(_.validate).reduce(_.&&(_)) 44 | case None => lit(true) 45 | } 46 | } 47 | 48 | def allChecks: Column = 49 | allEntitiesPresent && timestampPresent && validationChecks 50 | } 51 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/org/apache/spark/metrics/AtomicGauge.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2021 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.metrics 18 | 19 | import com.codahale.metrics.Gauge 20 | 21 | import java.util.concurrent.atomic.{AtomicInteger, AtomicLong} 22 | 23 | class AtomicLongGauge(initialValue: Long = 0L) extends Gauge[Long] { 24 | val value = new AtomicLong(initialValue) 25 | override def getValue: Long = value.get() 26 | } 27 | 28 | class AtomicIntegerGauge(initialValue: Int = 0) extends Gauge[Int] { 29 | val value = new AtomicInteger(initialValue) 30 | override def getValue: Int = value.get() 31 | } 32 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/org/apache/spark/metrics/sink/StatsdSinkWithTags.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.metrics.sink 18 | 19 | import java.util.Properties 20 | import java.util.concurrent.TimeUnit 21 | 22 | import com.codahale.metrics.MetricRegistry 23 | import feast.ingestion.metrics.StatsdReporterWithTags 24 | import org.apache.spark.SecurityManager 25 | import org.apache.spark.internal.Logging 26 | import org.apache.spark.metrics.MetricsSystem 27 | 28 | class StatsdSinkWithTags( 29 | val property: Properties, 30 | val registry: MetricRegistry, 31 | securityMgr: SecurityManager 32 | ) extends Sink 33 | with Logging { 34 | import StatsdSink._ 35 | 36 | val host = property.getProperty(STATSD_KEY_HOST, STATSD_DEFAULT_HOST) 37 | val port = property.getProperty(STATSD_KEY_PORT, STATSD_DEFAULT_PORT).toInt 38 | 39 | val pollPeriod = property.getProperty(STATSD_KEY_PERIOD, STATSD_DEFAULT_PERIOD).toInt 40 | val pollUnit = 41 | TimeUnit.valueOf(property.getProperty(STATSD_KEY_UNIT, STATSD_DEFAULT_UNIT).toUpperCase) 42 | 43 | val prefix = property.getProperty(STATSD_KEY_PREFIX, STATSD_DEFAULT_PREFIX) 44 | 45 | MetricsSystem.checkMinimalPollingPeriod(pollUnit, pollPeriod) 46 | 47 | val reporter = new StatsdReporterWithTags(registry, host, port, prefix) 48 | 49 | override def start(): Unit = { 50 | reporter.start(pollPeriod, pollUnit) 51 | logInfo(s"StatsdSink started with prefix: '$prefix'") 52 | } 53 | 54 | override def stop(): Unit = { 55 | reporter.stop() 56 | logInfo("StatsdSink stopped.") 57 | } 58 | 59 | override def report(): Unit = reporter.report() 60 | } 61 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/org/apache/spark/metrics/source/BaseMetricSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.metrics.source 18 | 19 | import com.codahale.metrics.MetricRegistry 20 | import org.apache.spark.SparkEnv 21 | 22 | class BaseMetricSource extends Source { 23 | override val sourceName: String = "" 24 | 25 | override val metricRegistry: MetricRegistry = new MetricRegistry 26 | 27 | private val sparkConfig = SparkEnv.get.conf 28 | 29 | private val metricLabels = sparkConfig.get("spark.metrics.labels", "") 30 | 31 | private val appId = sparkConfig.get("spark.app.id", "") 32 | 33 | private val executorId = sparkConfig.get("spark.executor.id", "") 34 | 35 | protected def metricWithLabels(name: String) = { 36 | if (metricLabels.isEmpty) { 37 | name 38 | } else { 39 | s"$name#$metricLabels,job_id=$appId-$executorId" 40 | } 41 | } 42 | 43 | protected def counterWithLabels(name: String) = { 44 | if (metricLabels.isEmpty) { 45 | name 46 | } else { 47 | s"$name#$metricLabels" 48 | } 49 | } 50 | 51 | protected def gaugeWithLabels(name: String) = { 52 | if (metricLabels.isEmpty) { 53 | name 54 | } else { 55 | s"$name#$metricLabels" 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/org/apache/spark/metrics/source/BigTableSinkMetricSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.metrics.source 18 | 19 | class BigTableSinkMetricSource extends BaseMetricSource { 20 | override val sourceName: String = BigTableSinkMetricSource.sourceName 21 | 22 | val METRIC_TOTAL_ROWS_INSERTED = 23 | metricRegistry.counter(counterWithLabels("feature_row_ingested_count")) 24 | 25 | val METRIC_ROWS_LAG = 26 | metricRegistry.histogram(metricWithLabels("feature_row_lag_ms")) 27 | } 28 | 29 | object BigTableSinkMetricSource { 30 | val sourceName = "bigtable_sink" 31 | } 32 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/org/apache/spark/metrics/source/IngestionPipelineMetricSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.metrics.source 18 | 19 | class IngestionPipelineMetricSource extends BaseMetricSource { 20 | override val sourceName: String = IngestionPipelineMetricSource.sourceName 21 | 22 | val METRIC_DEADLETTER_ROWS_INSERTED = 23 | metricRegistry.counter(counterWithLabels("deadletter_count")) 24 | 25 | val METRIC_ROWS_READ_FROM_SOURCE = 26 | metricRegistry.counter(counterWithLabels("read_from_source_count")) 27 | } 28 | 29 | object IngestionPipelineMetricSource { 30 | val sourceName = "ingestion_pipeline" 31 | } 32 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/org/apache/spark/metrics/source/RedisSinkMetricSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.metrics.source 18 | 19 | class RedisSinkMetricSource extends BaseMetricSource { 20 | override val sourceName: String = RedisSinkMetricSource.sourceName 21 | 22 | val METRIC_TOTAL_ROWS_INSERTED = 23 | metricRegistry.counter(counterWithLabels("feature_row_ingested_count")) 24 | 25 | val METRIC_ROWS_LAG = 26 | metricRegistry.histogram(metricWithLabels("feature_row_lag_ms")) 27 | } 28 | 29 | object RedisSinkMetricSource { 30 | val sourceName = "redis_sink" 31 | } 32 | -------------------------------------------------------------------------------- /spark/ingestion/src/main/scala/org/apache/spark/metrics/source/StreamingMetricSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.metrics.source 18 | 19 | import org.apache.spark.metrics.AtomicLongGauge 20 | import org.apache.spark.sql.streaming.StreamingQueryProgress 21 | 22 | import java.time.Instant 23 | 24 | class StreamingMetricSource extends BaseMetricSource { 25 | override val sourceName: String = StreamingMetricSource.sourceName 26 | 27 | private val BATCH_DURATION_GAUGE = 28 | metricRegistry.register(gaugeWithLabels("batch_duration_ms"), new AtomicLongGauge()) 29 | private val PROCESSED_ROWS_PER_SECOND_GAUGE = 30 | metricRegistry.register(gaugeWithLabels("input_rows_per_second"), new AtomicLongGauge()) 31 | private val INPUT_ROWS_PER_SECOND_GAUGE = 32 | metricRegistry.register(gaugeWithLabels("processed_rows_per_second"), new AtomicLongGauge()) 33 | private val LAST_CONSUMED_KAFKA_TIMESTAMP_GAUGE = 34 | metricRegistry.register(gaugeWithLabels("last_consumed_kafka_timestamp"), new AtomicLongGauge()) 35 | private val LAST_PROCESSED_EVENT_TIMESTAMP_GAUGE = 36 | metricRegistry.register( 37 | gaugeWithLabels("last_processed_event_timestamp"), 38 | new AtomicLongGauge() 39 | ) 40 | 41 | def updateStreamingProgress(progress: StreamingQueryProgress): Unit = { 42 | BATCH_DURATION_GAUGE.value.set(progress.batchDuration) 43 | INPUT_ROWS_PER_SECOND_GAUGE.value.set(progress.inputRowsPerSecond.toLong) 44 | PROCESSED_ROWS_PER_SECOND_GAUGE.value.set(progress.processedRowsPerSecond.toLong) 45 | 46 | val epochTimestamp = Instant.parse(progress.timestamp).getEpochSecond 47 | LAST_PROCESSED_EVENT_TIMESTAMP_GAUGE.value.set(epochTimestamp) 48 | } 49 | 50 | def updateKafkaTimestamp(timestamp: Long): Unit = { 51 | LAST_CONSUMED_KAFKA_TIMESTAMP_GAUGE.value.set(timestamp) 52 | } 53 | } 54 | 55 | object StreamingMetricSource { 56 | val sourceName = "streaming" 57 | } 58 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/proto/com/example/source.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package com.example; 4 | 5 | option java_multiple_files = true; 6 | option java_package = "com.example.protos"; 7 | 8 | import "google/protobuf/timestamp.proto"; 9 | 10 | message TestMessage { 11 | int64 s2_id = 1; 12 | VehicleType.Enum vehicle_type = 2; 13 | int64 unique_drivers = 3; 14 | google.protobuf.Timestamp event_timestamp = 4; 15 | } 16 | 17 | message VehicleType { 18 | enum Enum { 19 | UNKNOWN = 0; 20 | CAR = 1; 21 | BIKE = 2; 22 | } 23 | } 24 | 25 | message InnerMessage { 26 | repeated double double = 1; 27 | repeated float float = 2; 28 | repeated int32 integer = 3; 29 | repeated int64 long = 4; 30 | enum Enum { 31 | zero = 0; 32 | one = 1; 33 | } 34 | Enum enum = 5; 35 | } 36 | 37 | message AllTypesMessage { 38 | double double = 1; 39 | float float = 2; 40 | int32 integer = 3; 41 | int64 long = 4; 42 | uint32 uinteger = 5; 43 | uint64 ulong = 6; 44 | sint32 sinteger = 7; 45 | sint64 slong = 8; 46 | fixed32 finteger = 9; 47 | fixed64 flong = 10; 48 | sfixed32 sfinteger = 11; 49 | sfixed64 sflong = 13; 50 | bool bool = 14; 51 | string string = 15; 52 | bytes bytes = 16; 53 | map map = 17; 54 | InnerMessage inner = 18; 55 | 56 | google.protobuf.Timestamp event_timestamp = 19; 57 | } -------------------------------------------------------------------------------- /spark/ingestion/src/test/resources/python/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CURRENT_PATH=$PWD 4 | 
DESTINATION=${DESTINATION:-$CURRENT_PATH} 5 | 6 | # 1. Create libraries (dependencies) package 7 | if [[ -f "$DESTINATION/libs.tar.gz" ]]; then 8 | echo "$DESTINATION/libs.tar.gz exists." 9 | else 10 | tmp_dir=$(mktemp -d) 11 | pip3 install -t ${tmp_dir}/libs great-expectations==0.13.2 pyarrow==2.0.0 Jinja2==3.0.3 12 | cd $tmp_dir && tar -czf libs.tar.gz libs/ && mv libs.tar.gz $DESTINATION/libs.tar.gz 13 | fi 14 | 15 | # 2. Pickle python udf 16 | cd $CURRENT_PATH 17 | pip3 install great-expectations==0.13.2 setuptools pyspark==3.1.3 Jinja2==3.0.3 pyarrow==2.0.0 18 | python3 udf.py $DESTINATION/udf.pickle 19 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/resources/python/udf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from pyspark import cloudpickle 4 | from pyspark.sql.types import BooleanType 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from great_expectations.dataset import PandasDataset 10 | 11 | 12 | def create_suite(): 13 | df = pd.DataFrame() 14 | df['num'] = np.random.randint(0, 10, 100) 15 | df['num2'] = np.random.randint(0, 20, 100) 16 | ds = PandasDataset.from_dataset(df) 17 | 18 | ds.expect_column_values_to_be_between('num', 0, 10) 19 | ds.expect_column_values_to_be_between('num2', 0, 20) 20 | 21 | return ds.get_expectation_suite() 22 | 23 | 24 | def create_validator(suite): 25 | def validate(df) -> pd.DataFrame: 26 | ds = PandasDataset.from_dataset(df) 27 | # print(ds, ds.shape) 28 | result = ds.validate(suite, result_format='COMPLETE') 29 | valid_rows = pd.Series([True] * ds.shape[0]) 30 | # print(result) 31 | for check in result.results: 32 | if check.success: 33 | continue 34 | 35 | valid_rows.iloc[check.result['unexpected_index_list']] = False 36 | return valid_rows 37 | 38 | return validate 39 | 40 | 41 | def main(dest_path): 42 | with open(dest_path, 'wb') as f: 43 | fun = create_validator(create_suite()) 44 | command = (fun, BooleanType()) 45 | cloudpickle.dump(command, f) 46 | 47 | 48 | if __name__ == '__main__': 49 | main(sys.argv[1]) 50 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/resources/stencil/__files/source.desc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/spark/ingestion/src/test/resources/stencil/__files/source.desc -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/com/example/protos/InnerMessageOrBuilder.java: -------------------------------------------------------------------------------- 1 | // Generated by the protocol buffer compiler. DO NOT EDIT! 2 | // source: source.proto 3 | 4 | package com.example.protos; 5 | 6 | public interface InnerMessageOrBuilder extends 7 | // @@protoc_insertion_point(interface_extends:com.example.InnerMessage) 8 | com.google.protobuf.MessageOrBuilder { 9 | 10 | /** 11 | * repeated double double = 1; 12 | * @return A list containing the double. 13 | */ 14 | java.util.List getDoubleList(); 15 | /** 16 | * repeated double double = 1; 17 | * @return The count of double. 18 | */ 19 | int getDoubleCount(); 20 | /** 21 | * repeated double double = 1; 22 | * @param index The index of the element to return. 23 | * @return The double at the given index. 
24 | */ 25 | double getDouble(int index); 26 | 27 | /** 28 | * repeated float float = 2; 29 | * @return A list containing the float. 30 | */ 31 | java.util.List getFloatList(); 32 | /** 33 | * repeated float float = 2; 34 | * @return The count of float. 35 | */ 36 | int getFloatCount(); 37 | /** 38 | * repeated float float = 2; 39 | * @param index The index of the element to return. 40 | * @return The float at the given index. 41 | */ 42 | float getFloat(int index); 43 | 44 | /** 45 | * repeated int32 integer = 3; 46 | * @return A list containing the integer. 47 | */ 48 | java.util.List getIntegerList(); 49 | /** 50 | * repeated int32 integer = 3; 51 | * @return The count of integer. 52 | */ 53 | int getIntegerCount(); 54 | /** 55 | * repeated int32 integer = 3; 56 | * @param index The index of the element to return. 57 | * @return The integer at the given index. 58 | */ 59 | int getInteger(int index); 60 | 61 | /** 62 | * repeated int64 long = 4; 63 | * @return A list containing the long. 64 | */ 65 | java.util.List getLongList(); 66 | /** 67 | * repeated int64 long = 4; 68 | * @return The count of long. 69 | */ 70 | int getLongCount(); 71 | /** 72 | * repeated int64 long = 4; 73 | * @param index The index of the element to return. 74 | * @return The long at the given index. 75 | */ 76 | long getLong(int index); 77 | 78 | /** 79 | * .com.example.InnerMessage.Enum enum = 5; 80 | * @return The enum numeric value on the wire for enum. 81 | */ 82 | int getEnumValue(); 83 | /** 84 | * .com.example.InnerMessage.Enum enum = 5; 85 | * @return The enum. 86 | */ 87 | com.example.protos.InnerMessage.Enum getEnum(); 88 | } 89 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/com/example/protos/TestMessageOrBuilder.java: -------------------------------------------------------------------------------- 1 | // Generated by the protocol buffer compiler. DO NOT EDIT! 2 | // source: source.proto 3 | 4 | package com.example.protos; 5 | 6 | public interface TestMessageOrBuilder extends 7 | // @@protoc_insertion_point(interface_extends:com.example.TestMessage) 8 | com.google.protobuf.MessageOrBuilder { 9 | 10 | /** 11 | * int64 s2_id = 1; 12 | * @return The s2Id. 13 | */ 14 | long getS2Id(); 15 | 16 | /** 17 | * .com.example.VehicleType.Enum vehicle_type = 2; 18 | * @return The enum numeric value on the wire for vehicleType. 19 | */ 20 | int getVehicleTypeValue(); 21 | /** 22 | * .com.example.VehicleType.Enum vehicle_type = 2; 23 | * @return The vehicleType. 24 | */ 25 | com.example.protos.VehicleType.Enum getVehicleType(); 26 | 27 | /** 28 | * int64 unique_drivers = 3; 29 | * @return The uniqueDrivers. 30 | */ 31 | long getUniqueDrivers(); 32 | 33 | /** 34 | * .google.protobuf.Timestamp event_timestamp = 4; 35 | * @return Whether the eventTimestamp field is set. 36 | */ 37 | boolean hasEventTimestamp(); 38 | /** 39 | * .google.protobuf.Timestamp event_timestamp = 4; 40 | * @return The eventTimestamp. 41 | */ 42 | com.google.protobuf.Timestamp getEventTimestamp(); 43 | /** 44 | * .google.protobuf.Timestamp event_timestamp = 4; 45 | */ 46 | com.google.protobuf.TimestampOrBuilder getEventTimestampOrBuilder(); 47 | } 48 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/com/example/protos/VehicleTypeOrBuilder.java: -------------------------------------------------------------------------------- 1 | // Generated by the protocol buffer compiler. DO NOT EDIT! 
2 | // source: source.proto 3 | 4 | package com.example.protos; 5 | 6 | public interface VehicleTypeOrBuilder extends 7 | // @@protoc_insertion_point(interface_extends:com.example.VehicleType) 8 | com.google.protobuf.MessageOrBuilder { 9 | } 10 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/feast/ingestion/SparkSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion 18 | 19 | import org.apache.spark.SparkConf 20 | import org.apache.spark.sql.SparkSession 21 | import org.scalatest.BeforeAndAfter 22 | 23 | class SparkSpec extends UnitSpec with BeforeAndAfter { 24 | System.setProperty("io.netty.tryReflectionSetAccessible", "true") 25 | 26 | var sparkSession: SparkSession = null 27 | def withSparkConfOverrides(conf: SparkConf): SparkConf = conf 28 | 29 | before { 30 | val sparkConf = new SparkConf() 31 | .setMaster("local[4]") 32 | .setAppName("Testing") 33 | .set("spark.driver.bindAddress", "localhost") 34 | .set("spark.default.parallelism", "8") 35 | .set( 36 | "spark.metrics.conf.*.sink.statsd.class", 37 | "org.apache.spark.metrics.sink.StatsdSinkWithTags" 38 | ) 39 | .set("spark.metrics.conf.*.sink.statsd.host", "localhost") 40 | .set("spark.metrics.conf.*.sink.statsd.period", "999") // disable scheduled reporting 41 | .set("spark.metrics.conf.*.sink.statsd.unit", "minutes") 42 | .set("spark.metrics.labels", "job_id=test") 43 | .set("spark.metrics.namespace", "") 44 | .set("spark.sql.legacy.allowUntypedScalaUDF", "true") 45 | .set("spark.sql.execution.arrow.maxRecordsPerBatch", "50000") 46 | 47 | sparkSession = SparkSession 48 | .builder() 49 | .config(withSparkConfOverrides(sparkConf)) 50 | .getOrCreate() 51 | } 52 | 53 | after { 54 | sparkSession.stop() 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/feast/ingestion/UnitSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion 18 | 19 | import org.scalatest.flatspec.AnyFlatSpec 20 | import org.scalatest._ 21 | import matchers._ 22 | 23 | abstract class UnitSpec 24 | extends AnyFlatSpec 25 | with should.Matchers 26 | with OptionValues 27 | with Inside 28 | with Inspectors 29 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/feast/ingestion/helpers/DataHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.helpers 18 | 19 | import java.nio.file.{Files, Paths} 20 | 21 | import org.apache.spark.sql.SparkSession 22 | import org.apache.spark.sql.functions.to_date 23 | import org.joda.time.{DateTime, Seconds} 24 | import org.scalacheck.Gen 25 | 26 | import scala.reflect.runtime.universe.TypeTag 27 | 28 | case class TestRow( 29 | customer: String, 30 | feature1: Int, 31 | feature2: Float, 32 | eventTimestamp: java.sql.Timestamp 33 | ) 34 | 35 | object DataHelper { 36 | def generateRows[A](gen: Gen[A], N: Int): Seq[A] = 37 | Gen.listOfN(N, gen).sample.get 38 | 39 | def generateDistinctRows[A](gen: Gen[A], N: Int, entityFun: A => String): Seq[A] = 40 | generateRows(gen, N).groupBy(entityFun).map(_._2.head).toSeq 41 | 42 | def generateTempPath(last: String): String = 43 | Paths.get(Files.createTempDirectory("test-dir").toString, last).toString 44 | 45 | def storeAsParquet[T <: Product: TypeTag](sparkSession: SparkSession, rows: Seq[T]): String = { 46 | import sparkSession.implicits._ 47 | 48 | val tempPath = generateTempPath("rows") 49 | 50 | sparkSession 51 | .createDataset(rows) 52 | .withColumn("date", to_date($"eventTimestamp")) 53 | .write 54 | .partitionBy("date") 55 | .save(tempPath) 56 | 57 | tempPath 58 | } 59 | 60 | def rowGenerator( 61 | start: DateTime, 62 | end: DateTime, 63 | customerGen: Option[Gen[String]] = None 64 | ): Gen[TestRow] = 65 | for { 66 | customer <- customerGen.getOrElse(Gen.asciiPrintableStr) 67 | feature1 <- Gen.choose(0, 100) 68 | feature2 <- Gen.choose[Float](0, 1) 69 | eventTimestamp <- Gen 70 | .choose(0, Seconds.secondsBetween(start, end).getSeconds - 1) 71 | .map(start.withMillisOfSecond(0).plusSeconds) 72 | } yield TestRow( 73 | customer, 74 | feature1, 75 | feature2, 76 | new java.sql.Timestamp(eventTimestamp.getMillis) 77 | ) 78 | } 79 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/feast/ingestion/helpers/RedisStorageHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package feast.ingestion.helpers 18 | 19 | import java.nio.charset.StandardCharsets 20 | import java.nio.{ByteBuffer, ByteOrder} 21 | import com.google.protobuf.Timestamp 22 | import feast.ingestion.FeatureTable 23 | import feast.proto.types.ValueProto 24 | import feast.ingestion.utils.TypeConversion._ 25 | import org.scalatest.matchers.Matcher 26 | import org.scalatest.matchers.must.Matchers.contain 27 | import com.google.common.hash.Hashing 28 | 29 | import scala.util.Try 30 | 31 | object RedisStorageHelper { 32 | def encodeFeatureKey(featureTable: FeatureTable)(feature: String): String = { 33 | val fullReference = s"${featureTable.name}:$feature" 34 | murmurHashHexString(fullReference) 35 | } 36 | 37 | def murmurHashHexString(s: String): String = { 38 | Hashing.murmur3_32().hashString(s, StandardCharsets.UTF_8).asInt().toHexString 39 | } 40 | 41 | def beStoredRow(mappedRow: Map[String, Any]): Matcher[Map[Array[Byte], Array[Byte]]] = { 42 | val m: Matcher[Map[String, Any]] = contain.allElementsOf(mappedRow).matcher 43 | 44 | m compose { 45 | (_: Map[Array[Byte], Array[Byte]]) 46 | .map { 47 | case (k, v) if k.sameElements("_ex".getBytes()) => 48 | (new String(k), Timestamp.parseFrom(v).asScala) 49 | 50 | case (k, v) if k.length == 4 => 51 | ( 52 | ByteBuffer.wrap(k).order(ByteOrder.LITTLE_ENDIAN).getInt.toHexString, 53 | Try(ValueProto.Value.parseFrom(v).asScala).getOrElse(Timestamp.parseFrom(v).asScala) 54 | ) 55 | } 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/feast/ingestion/metrics/StatsDStub.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.metrics 18 | 19 | import java.net.{DatagramPacket, DatagramSocket, SocketTimeoutException} 20 | 21 | import scala.collection.mutable.ArrayBuffer 22 | 23 | class StatsDStub { 24 | val socket = new DatagramSocket() 25 | socket.setSoTimeout(100) 26 | 27 | def port: Int = socket.getLocalPort 28 | 29 | def receive: Array[String] = { 30 | val messages: ArrayBuffer[String] = ArrayBuffer() 31 | var finished = false 32 | 33 | do { 34 | val buf = new Array[Byte](65535) 35 | val p = new DatagramPacket(buf, buf.length) 36 | try { 37 | socket.receive(p) 38 | } catch { 39 | case _: SocketTimeoutException => 40 | finished = true 41 | } 42 | messages += new String(p.getData, 0, p.getLength) 43 | } while (!finished) 44 | 45 | messages.toArray 46 | } 47 | 48 | private val metricLine = """(.+):(.+)\|(.+)#(.+)""".r 49 | 50 | def receivedMetrics: Map[String, Float] = { 51 | receive 52 | .flatMap { 53 | case metricLine(name, value, type_, tags) => 54 | Seq(name -> value.toFloat) 55 | case s: String => 56 | Seq() 57 | } 58 | .groupBy(_._1) 59 | .mapValues(_.map(_._2).sum) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /spark/ingestion/src/test/scala/feast/ingestion/metrics/StatsReporterSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright 2018-2020 The Feast Authors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * https://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package feast.ingestion.metrics 18 | 19 | import java.util 20 | import java.util.Collections 21 | 22 | import com.codahale.metrics.{Gauge, Histogram, MetricRegistry, UniformReservoir} 23 | import feast.ingestion.UnitSpec 24 | 25 | import scala.jdk.CollectionConverters._ 26 | 27 | class StatsReporterSpec extends UnitSpec { 28 | trait Scope { 29 | val server = new StatsDStub 30 | val reporter = new StatsdReporterWithTags( 31 | new MetricRegistry, 32 | "127.0.0.1", 33 | server.port 34 | ) 35 | 36 | def gauge[A](v: A): Gauge[A] = new Gauge[A] { 37 | override def getValue: A = v 38 | } 39 | 40 | def histogram(values: Seq[Int]): Histogram = { 41 | val hist = new Histogram(new UniformReservoir) 42 | values.foreach(hist.update) 43 | hist 44 | } 45 | } 46 | 47 | "Statsd reporter" should "send simple gauge unmodified" in new Scope { 48 | reporter.report( 49 | gauges = new util.TreeMap( 50 | Map( 51 | "test" -> gauge(0) 52 | ).asJava 53 | ), 54 | counters = Collections.emptySortedMap(), 55 | histograms = Collections.emptySortedMap(), 56 | meters = Collections.emptySortedMap(), 57 | timers = Collections.emptySortedMap() 58 | ) 59 | 60 | server.receive should contain("test:0|g") 61 | } 62 | 63 | "Statsd reporter" should "keep tags part in the message's end" in new Scope { 64 | reporter.report( 65 | gauges = Collections.emptySortedMap(), 66 | counters = Collections.emptySortedMap(), 67 | histograms = new util.TreeMap( 68 | Map( 69 | "prefix.1111.test#fs=name,job=aaa" -> histogram((1 to 100)) 70 | ).asJava 71 | ), 72 | meters = Collections.emptySortedMap(), 73 | timers = Collections.emptySortedMap() 74 | ) 75 | 76 | server.receive should contain("prefix.test.p95:95.95|ms|#fs:name,job:aaa") 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/tests/__init__.py -------------------------------------------------------------------------------- /tests/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/tests/e2e/__init__.py -------------------------------------------------------------------------------- /tests/e2e/fixtures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/tests/e2e/fixtures/__init__.py -------------------------------------------------------------------------------- /tests/e2e/fixtures/base.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | 7 | @pytest.fixture(scope="session") 8 | def project_root(): 9 | # This file is %root%/tests/e2e/fixtures/base.py 10 | return Path(__file__).parent.parent.parent.parent 11 | 12 | 13 | @pytest.fixture(scope="session") 14 | def project_version(pytestconfig, project_root): 15 | if pytestconfig.getoption("feast_version"): 16 | return pytestconfig.getoption("feast_version") 17 | 18 | pom_xml = ET.parse(project_root / "pom.xml") 19 | root = pom_xml.getroot() 20 | return root.find(".properties/revision").text 21 | -------------------------------------------------------------------------------- 
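# Editorial usage sketch (hypothetical test, not a file in this repository): shows how the
# session-scoped fixtures above compose. `project_root` is a pathlib.Path to the repository
# root, and `project_version` prefers the `feast_version` pytest option before falling back
# to the Maven <revision> property parsed from pom.xml.
def test_project_version_resolves(project_root, project_version):
    assert (project_root / "pom.xml").exists()
    assert isinstance(project_version, str) and project_version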
/tests/e2e/fixtures/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from datetime import datetime 4 | 5 | import pytest 6 | from _pytest.fixtures import FixtureRequest 7 | from google.cloud import bigquery 8 | 9 | from feast import BigQuerySource, FileSource 10 | from feast.data_format import ParquetFormat 11 | 12 | __all__ = ("bq_dataset", "batch_source") 13 | 14 | 15 | @pytest.fixture(scope="session") 16 | def bq_dataset(pytestconfig): 17 | client = bigquery.Client(project=pytestconfig.getoption("bq_project")) 18 | timestamp = int(time.time()) 19 | name = f"feast_e2e_{timestamp}" 20 | client.create_dataset(name) 21 | yield name 22 | client.delete_dataset(name, delete_contents=True) 23 | 24 | 25 | @pytest.fixture 26 | def batch_source(local_staging_path: str, pytestconfig, request: FixtureRequest): 27 | if pytestconfig.getoption("env") == "gcloud": 28 | bq_project = pytestconfig.getoption("bq_project") 29 | bq_dataset = request.getfixturevalue("bq_dataset") 30 | return BigQuerySource( 31 | event_timestamp_column="event_timestamp", 32 | created_timestamp_column="created_timestamp", 33 | table_ref=f"{bq_project}:{bq_dataset}.source_{datetime.now():%Y%m%d%H%M%s}", 34 | ) 35 | else: 36 | return FileSource( 37 | event_timestamp_column="event_timestamp", 38 | created_timestamp_column="created_timestamp", 39 | file_format=ParquetFormat(), 40 | file_url=os.path.join(local_staging_path, "transactions"), 41 | ) 42 | -------------------------------------------------------------------------------- /tests/e2e/fixtures/external_services.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pytest_redis.executor import NoopRedis 3 | 4 | from tests.e2e.fixtures.statsd_stub import PrometheusStatsDServer 5 | 6 | __all__ = ( 7 | "feast_core", 8 | "feast_serving", 9 | "redis_server", 10 | "kafka_server", 11 | "enable_auth", 12 | "feast_jobservice", 13 | "statsd_server", 14 | ) 15 | 16 | 17 | @pytest.fixture(scope="session") 18 | def redis_server(pytestconfig): 19 | host, port = pytestconfig.getoption("redis_url").split(":") 20 | return NoopRedis(host, port, None) 21 | 22 | 23 | @pytest.fixture(scope="session") 24 | def feast_core(pytestconfig): 25 | host, port = pytestconfig.getoption("core_url").split(":") 26 | return host, port 27 | 28 | 29 | @pytest.fixture(scope="session") 30 | def feast_serving(pytestconfig): 31 | host, port = pytestconfig.getoption("serving_url").split(":") 32 | return host, port 33 | 34 | 35 | @pytest.fixture(scope="session") 36 | def kafka_server(pytestconfig): 37 | host, port = pytestconfig.getoption("kafka_brokers").split(":") 38 | return host, port 39 | 40 | 41 | @pytest.fixture(scope="session") 42 | def enable_auth(): 43 | return False 44 | 45 | 46 | @pytest.fixture(scope="session") 47 | def feast_jobservice(pytestconfig): 48 | host, port = pytestconfig.getoption("job_service_url").split(":") 49 | return host, port 50 | 51 | 52 | @pytest.fixture(scope="session") 53 | def statsd_server(pytestconfig): 54 | host, port = pytestconfig.getoption("statsd_url").split(":") 55 | prometheus_host, prometheus_port = pytestconfig.getoption("prometheus_url").split( 56 | ":" 57 | ) 58 | return PrometheusStatsDServer(host, port, prometheus_host, prometheus_port) 59 | -------------------------------------------------------------------------------- /tests/e2e/fixtures/services.py: -------------------------------------------------------------------------------- 1 | 
import os 2 | import pathlib 3 | import shutil 4 | 5 | import port_for 6 | import pytest 7 | import requests 8 | from pytest_kafka import make_kafka_server, make_zookeeper_process 9 | from pytest_postgresql import factories as pg_factories 10 | from pytest_redis import factories as redis_factories 11 | 12 | __all__ = ( 13 | "kafka_server", 14 | "kafka_port", 15 | "zookeeper_server", 16 | "postgres_server", 17 | "redis_server", 18 | "statsd_server", 19 | ) 20 | 21 | from tests.e2e.fixtures.statsd_stub import StatsDStub 22 | 23 | 24 | def download_kafka(version="2.12-2.6.0"): 25 | temp_dir = pathlib.Path("/tmp") 26 | local_path = temp_dir / f"kafka_{version}.tgz" 27 | 28 | if not os.path.isfile(local_path): 29 | r = requests.get( 30 | f"https://archive.apache.org/dist/kafka/2.6.0/kafka_{version}.tgz" 31 | ) 32 | 33 | r.raise_for_status() 34 | 35 | with open(local_path, "wb") as f: 36 | f.write(r.content) 37 | 38 | shutil.unpack_archive(str(local_path), str(temp_dir)) 39 | return temp_dir / f"kafka_{version}" / "bin" 40 | 41 | 42 | @pytest.fixture 43 | def kafka_server(kafka_port): 44 | _, port = kafka_port 45 | return "localhost", port 46 | 47 | 48 | @pytest.fixture 49 | def statsd_server(): 50 | port = port_for.select_random(None) 51 | server = StatsDStub(port=port) 52 | server.start() 53 | yield server 54 | server.stop() 55 | 56 | 57 | postgres_server = pg_factories.postgresql_proc(password="password") 58 | redis_server = redis_factories.redis_proc( 59 | executable=shutil.which("redis-server"), timeout=3600 60 | ) 61 | 62 | KAFKA_BIN = download_kafka() 63 | zookeeper_server = make_zookeeper_process( 64 | str(KAFKA_BIN / "zookeeper-server-start.sh"), 65 | zk_config_template=""" 66 | dataDir={zk_data_dir} 67 | clientPort={zk_port} 68 | maxClientCnxns=0 69 | admin.enableServer=false""", 70 | ) 71 | kafka_port = make_kafka_server( 72 | kafka_bin=str(KAFKA_BIN / "kafka-server-start.sh"), 73 | zookeeper_fixture_name="zookeeper_server", 74 | ) 75 | -------------------------------------------------------------------------------- /tests/e2e/test_job_scheduling.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import uuid 3 | 4 | import pytest as pytest 5 | from kubernetes import client, config 6 | 7 | from feast import Client, Entity, Feature, FeatureTable, FileSource, ValueType 8 | from feast.data_format import ParquetFormat 9 | from feast_spark import Client as SparkClient 10 | 11 | 12 | @pytest.mark.env("k8s") 13 | def test_schedule_batch_ingestion_jobs( 14 | pytestconfig, feast_client: Client, feast_spark_client: SparkClient 15 | ): 16 | entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64,) 17 | batch_source = FileSource( 18 | file_format=ParquetFormat(), 19 | file_url="gs://example/feast/*", 20 | event_timestamp_column="datetime_col", 21 | created_timestamp_column="timestamp", 22 | date_partition_column="datetime", 23 | ) 24 | feature_table = FeatureTable( 25 | name=f"schedule_{str(uuid.uuid4())}".replace("-", "_"), 26 | entities=["s2id"], 27 | features=[Feature("unique_drivers", ValueType.INT64)], 28 | batch_source=batch_source, 29 | ) 30 | feast_client.apply(entity) 31 | feast_client.apply(feature_table) 32 | 33 | feast_spark_client.schedule_offline_to_online_ingestion( 34 | feature_table, 1, "0 0 * * *" 35 | ) 36 | config.load_incluster_config() 37 | k8s_api = client.CustomObjectsApi() 38 | 39 | def get_scheduled_spark_application(): 40 | job_hash = hashlib.md5( 41 | 
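            # the test assumes the job service names the ScheduledSparkApplication
            # "feast-<md5 of '<project>-<feature table name>'>" (see resource_name below)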
f"{feast_client.project}-{feature_table.name}".encode() 42 | ).hexdigest() 43 | resource_name = f"feast-{job_hash}" 44 | 45 | return k8s_api.get_namespaced_custom_object( 46 | group="sparkoperator.k8s.io", 47 | version="v1beta2", 48 | namespace=pytestconfig.getoption("k8s_namespace"), 49 | plural="scheduledsparkapplications", 50 | name=resource_name, 51 | ) 52 | 53 | response = get_scheduled_spark_application() 54 | assert response["spec"]["schedule"] == "0 0 * * *" 55 | feast_spark_client.schedule_offline_to_online_ingestion( 56 | feature_table, 1, "1 0 * * *" 57 | ) 58 | response = get_scheduled_spark_application() 59 | assert response["spec"]["schedule"] == "1 0 * * *" 60 | 61 | feast_spark_client.unschedule_offline_to_online_ingestion(feature_table) 62 | -------------------------------------------------------------------------------- /tests/e2e/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feast-dev/feast-spark/5cd2a861bbb7a17e3536e34284bfb0afb1ca9959/tests/e2e/utils/__init__.py -------------------------------------------------------------------------------- /tests/e2e/utils/common.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from feast import Entity, Feature, FeatureTable, FileSource, KafkaSource, ValueType 4 | from feast.data_format import AvroFormat, ParquetFormat 5 | from feast.wait import wait_retry_backoff 6 | from feast_spark import Client as SparkClient 7 | from feast_spark.pyspark.abc import SparkJobStatus 8 | 9 | 10 | def create_schema(kafka_broker, topic_name, feature_table_name): 11 | entity = Entity(name="key", description="Key", value_type=ValueType.INT64) 12 | feature_table = FeatureTable( 13 | name=feature_table_name, 14 | entities=["key"], 15 | features=[Feature("num", ValueType.INT64), Feature("set", ValueType.STRING)], 16 | batch_source=FileSource( 17 | event_timestamp_column="event_timestamp", 18 | file_format=ParquetFormat(), 19 | file_url="/dev/null", 20 | ), 21 | stream_source=KafkaSource( 22 | event_timestamp_column="event_timestamp", 23 | bootstrap_servers=kafka_broker, 24 | message_format=AvroFormat(avro_schema()), 25 | topic=topic_name, 26 | ), 27 | ) 28 | return entity, feature_table 29 | 30 | 31 | def start_job( 32 | feast_spark_client: SparkClient, feature_table: FeatureTable, pytestconfig 33 | ): 34 | if pytestconfig.getoption("scheduled_streaming_job"): 35 | return 36 | 37 | job = feast_spark_client.start_stream_to_online_ingestion(feature_table) 38 | wait_retry_backoff( 39 | lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 180 40 | ) 41 | return job 42 | 43 | 44 | def stop_job(job, feast_spark_client: SparkClient, feature_table: FeatureTable): 45 | if job: 46 | job.cancel() 47 | else: 48 | feast_spark_client._feast.delete_feature_table(feature_table.name) 49 | 50 | 51 | def avro_schema(): 52 | return json.dumps( 53 | { 54 | "type": "record", 55 | "name": "TestMessage", 56 | "fields": [ 57 | {"name": "key", "type": "long"}, 58 | {"name": "num", "type": "long"}, 59 | {"name": "set", "type": "string"}, 60 | { 61 | "name": "event_timestamp", 62 | "type": {"type": "long", "logicalType": "timestamp-micros"}, 63 | }, 64 | ], 65 | } 66 | ) 67 | -------------------------------------------------------------------------------- /tests/e2e/utils/kafka.py: -------------------------------------------------------------------------------- 1 | import io 2 | from typing import Any, Dict, List, Optional 3 | 4 
| import avro.schema 5 | import pandas as pd 6 | import pytz 7 | from avro.io import BinaryEncoder, DatumWriter 8 | from kafka import KafkaAdminClient, KafkaProducer 9 | 10 | from feast import Client 11 | from feast.wait import wait_retry_backoff 12 | 13 | 14 | def send_avro_record_to_kafka(topic, value, bootstrap_servers, avro_schema_json): 15 | value_schema = avro.schema.parse(avro_schema_json) 16 | 17 | producer = KafkaProducer(bootstrap_servers=bootstrap_servers) 18 | 19 | writer = DatumWriter(value_schema) 20 | bytes_writer = io.BytesIO() 21 | encoder = BinaryEncoder(bytes_writer) 22 | 23 | writer.write(value, encoder) 24 | 25 | try: 26 | producer.send(topic=topic, value=bytes_writer.getvalue()) 27 | except Exception as e: 28 | print( 29 | f"Exception while producing record value - {value} to topic - {topic}: {e}" 30 | ) 31 | else: 32 | print(f"Successfully producing record value - {value} to topic - {topic}") 33 | 34 | producer.flush() 35 | 36 | 37 | def check_consumer_exist(bootstrap_servers, topic_name): 38 | admin = KafkaAdminClient(bootstrap_servers=bootstrap_servers) 39 | consumer_groups = admin.describe_consumer_groups( 40 | group_ids=[ 41 | group_id 42 | for group_id, _ in admin.list_consumer_groups() 43 | if group_id.startswith("spark-kafka-source") 44 | ] 45 | ) 46 | subscriptions = { 47 | subscription 48 | for group in consumer_groups 49 | for member in group.members 50 | if not isinstance(member.member_metadata, bytes) 51 | for subscription in member.member_metadata.subscription 52 | } 53 | return topic_name in subscriptions 54 | 55 | 56 | def ingest_and_retrieve( 57 | feast_client: Client, 58 | df: pd.DataFrame, 59 | topic_name: str, 60 | kafka_broker: str, 61 | avro_schema_json: str, 62 | entity_rows: List[Dict[str, Any]], 63 | feature_names: List[Any], 64 | expected_ingested_count: Optional[int] = None, 65 | ): 66 | expected_ingested_count = expected_ingested_count or df.shape[0] 67 | 68 | for record in df.to_dict("records"): 69 | record["event_timestamp"] = ( 70 | record["event_timestamp"].to_pydatetime().replace(tzinfo=pytz.utc) 71 | ) 72 | 73 | send_avro_record_to_kafka( 74 | topic_name, 75 | record, 76 | bootstrap_servers=kafka_broker, 77 | avro_schema_json=avro_schema_json, 78 | ) 79 | 80 | def get_online_features(): 81 | features = feast_client.get_online_features( 82 | feature_names, entity_rows=entity_rows, 83 | ).to_dict() 84 | out_df = pd.DataFrame.from_dict(features) 85 | return out_df, out_df[feature_names].count().min() >= expected_ingested_count 86 | 87 | ingested = wait_retry_backoff(get_online_features, 180) 88 | return ingested 89 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==6.0.0 2 | pytest-lazy-fixture==0.6.3 3 | pytest-timeout==1.4.2 4 | pytest-ordering==0.6.* 5 | pytest-benchmark==3.2.2 6 | pytest-mock==1.10.4 7 | pytest-ordering==0.6.* 8 | pytest-xdist==2.1.0 9 | pytest-postgresql==2.5.1 10 | pytest-redis==2.0.0 11 | pytest-kafka==0.4.0 12 | deepdiff==4.3.2 13 | kafka-python==2.0.2 14 | great-expectations==0.13.2 15 | Jinja2==3.0.3 16 | pandavro==1.5.* 17 | avro==1.10.0 18 | pyspark==3.1.3 19 | gcsfs 20 | -------------------------------------------------------------------------------- /tests/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | multi_line_output=3 3 | include_trailing_comma=True 4 | force_grid_wrap=0 5 | use_parentheses=True 
6 | line_length=88 7 | known_first_party=feast,feast_serving_server,feast_core_server,feast_spark 8 | default_section=THIRDPARTY 9 | 10 | [flake8] 11 | ignore = E203, E266, E501, W503 12 | max-line-length = 88 13 | max-complexity = 20 14 | select = B,C,E,F,W,T4 15 | 16 | [mypy] 17 | ignore_missing_imports=true --------------------------------------------------------------------------------
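# Editorial sketch (hypothetical, not a file in this repository): one way the e2e helpers
# above could be composed into a streaming round-trip check. The `feast_client`,
# `feast_spark_client`, `kafka_server` and `pytestconfig` arguments are assumed to come from
# the fixtures defined in tests/e2e/fixtures/, and the topic, table and feature names are
# illustrative only.
from datetime import datetime

import pandas as pd

from tests.e2e.utils.common import avro_schema, create_schema, start_job, stop_job
from tests.e2e.utils.kafka import ingest_and_retrieve


def run_streaming_round_trip(feast_client, feast_spark_client, kafka_server, pytestconfig):
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    entity, feature_table = create_schema(kafka_broker, "drivers_topic", "drivers_stream")
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    job = start_job(feast_spark_client, feature_table, pytestconfig)
    try:
        df = pd.DataFrame(
            {
                "key": [1, 2],
                "num": [10, 20],
                "set": ["a", "b"],
                "event_timestamp": [pd.Timestamp(datetime.utcnow())] * 2,
            }
        )
        # Publishes the rows to Kafka as Avro and polls the online store until both
        # keys are retrievable (or the helper's retry budget is exhausted).
        return ingest_and_retrieve(
            feast_client,
            df,
            topic_name="drivers_topic",
            kafka_broker=kafka_broker,
            avro_schema_json=avro_schema(),
            entity_rows=[{"key": 1}, {"key": 2}],
            feature_names=["drivers_stream:num", "drivers_stream:set"],
        )
    finally:
        stop_job(job, feast_spark_client, feature_table)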