├── conftest.py ├── .flake8 ├── kubeflow ├── components │ ├── data │ │ ├── train │ │ │ ├── __init__.py │ │ │ ├── tests │ │ │ │ ├── fixtures │ │ │ │ │ ├── model.gz │ │ │ │ │ ├── features │ │ │ │ │ │ ├── f2.json │ │ │ │ │ │ └── f1.json │ │ │ │ │ ├── msearch_call2.txt │ │ │ │ │ └── msearch_call1.txt │ │ │ │ ├── test_run.py │ │ │ │ └── conftest.py │ │ │ ├── requirements.txt │ │ │ ├── Dockerfile │ │ │ ├── train.sql │ │ │ └── run.py │ │ └── validation │ │ │ ├── requirements.txt │ │ │ ├── Dockerfile │ │ │ ├── run.py │ │ │ └── validation.sql │ ├── prepare_env │ │ ├── requirements.txt │ │ ├── Dockerfile │ │ ├── lambdamart0 │ │ │ ├── features │ │ │ │ ├── name.json │ │ │ │ ├── category.json │ │ │ │ ├── avg_customer_price.json │ │ │ │ └── channel_group.json │ │ │ ├── ga_data.sql │ │ │ └── es_mapping.json │ │ └── run.py │ ├── model │ │ ├── ranklib │ │ │ └── RankLib-2.14.jar │ │ ├── tests │ │ │ ├── fixtures │ │ │ │ ├── validation.gz │ │ │ │ └── es_query.json │ │ │ ├── test_validate.py │ │ │ ├── conftest.py │ │ │ └── test_train.py │ │ ├── requirements.txt │ │ ├── queries │ │ │ ├── unittest │ │ │ │ └── es_query.json │ │ │ └── lambdamart0 │ │ │ │ └── es_query.json │ │ ├── Dockerfile │ │ ├── post_model.py │ │ ├── test.py │ │ ├── experiment.json │ │ ├── launch_katib.py │ │ ├── train.py │ │ ├── validate.py │ │ └── model.txt │ └── common │ │ └── launch_crd.py ├── namespace.yaml ├── pipe_role_biding.yaml ├── nfs-server-service.yaml ├── build │ ├── build.sh │ ├── manage_service_account.sh │ └── cloudbuild.yaml ├── pipelines │ ├── Dockerfile │ ├── helper.py │ ├── pipeline.py │ └── pipeline2.py ├── pv-pvc.yaml ├── disk-busybox.yaml └── nfs-server.yaml ├── kubernetes ├── front │ ├── run_app.sh │ ├── requirements.txt │ ├── Dockerfile │ ├── config.py │ ├── templates │ │ ├── index.html │ │ └── documents.j2 │ ├── app.yaml │ └── app.py └── es │ ├── Dockerfile │ ├── docker-compose.yaml │ └── deploy_elasticsearch.yaml ├── requirements.txt ├── Dockerfile ├── bin ├── deploy_es.sh ├── 
get_pipe_host.sh ├── manage_service_account.sh └── create_k8s.sh ├── README.md ├── LICENSE └── .gitignore /conftest.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=90 3 | -------------------------------------------------------------------------------- /kubeflow/components/data/train/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kubernetes/front/run_app.sh: -------------------------------------------------------------------------------- 1 | gunicorn app:app --config=config.py 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery 2 | google-cloud-storage 3 | elasticsearch 4 | fire 5 | -------------------------------------------------------------------------------- /kubeflow/components/data/validation/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery 2 | google-cloud-storage 3 | -------------------------------------------------------------------------------- /kubernetes/front/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==1.0.2 2 | gunicorn==19.9.0 3 | Jinja2 4 | elasticsearch 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.7-alpine3.12 2 | 3 | ADD ./test.py / 4 | 5 | ENTRYPOINT ["python", "test.py"] 6 | 
-------------------------------------------------------------------------------- /kubeflow/components/prepare_env/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery 2 | google-cloud-storage 3 | elasticsearch 4 | fire 5 | -------------------------------------------------------------------------------- /kubeflow/components/model/ranklib/RankLib-2.14.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WillianFuks/pySearchML/master/kubeflow/components/model/ranklib/RankLib-2.14.jar -------------------------------------------------------------------------------- /kubeflow/namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: kubeflow 5 | labels: 6 | katib-metricscollector-injection: enabled 7 | -------------------------------------------------------------------------------- /kubeflow/components/data/train/tests/fixtures/model.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WillianFuks/pySearchML/master/kubeflow/components/data/train/tests/fixtures/model.gz -------------------------------------------------------------------------------- /kubeflow/components/model/tests/fixtures/validation.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WillianFuks/pySearchML/master/kubeflow/components/model/tests/fixtures/validation.gz -------------------------------------------------------------------------------- /kubeflow/components/model/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery 2 | google-cloud-storage 3 | elasticsearch 4 | numpy 5 | requests 6 | kubernetes 7 | pytest 8 | mock 9 | 
-------------------------------------------------------------------------------- /kubeflow/components/data/train/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery 2 | google-cloud-storage 3 | elasticsearch 4 | Cython 5 | pyClickModels 6 | numpy 7 | requests 8 | pytest 9 | mock 10 | -------------------------------------------------------------------------------- /kubernetes/es/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.elastic.co/elasticsearch/elasticsearch:7.3.1 2 | 3 | RUN ./bin/elasticsearch-plugin install -b http://es-learn-to-rank.labs.o19s.com/ltr-1.1.2-es7.3.1.zip 4 | -------------------------------------------------------------------------------- /kubeflow/components/data/train/tests/fixtures/features/f2.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "match": { 4 | "f2": "{{test}}" 5 | } 6 | }, 7 | "params": ["test"], 8 | "name": "test2" 9 | } 10 | -------------------------------------------------------------------------------- /kubeflow/components/data/train/tests/fixtures/features/f1.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "match": { 4 | "field1": "{{test}}" 5 | } 6 | }, 7 | "params": ["test"], 8 | "name": "test1" 9 | } 10 | -------------------------------------------------------------------------------- /kubeflow/components/model/tests/fixtures/es_query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": "test", 3 | "rescore": { 4 | "query": { 5 | "rescore_query": { 6 | "sltr": { 7 | "params": {} 8 | } 9 | } 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /kubeflow/components/model/queries/unittest/es_query.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "query": "test", 3 | "rescore": { 4 | "query": { 5 | "rescore_query": { 6 | "sltr": { 7 | "params": {} 8 | } 9 | } 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /bin/deploy_es.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | SERVICE=$(kubectl get service elasticsearch -n elastic-system | grep elasticsearch) 4 | 5 | # Only deploy if service doesn't already exists. 6 | if [ -z "$SERVICE" ]; then 7 | kubectl apply -f kubernetes/es/deploy_elasticsearch.yaml 8 | fi 9 | -------------------------------------------------------------------------------- /kubernetes/front/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6-jessie 2 | WORKDIR /front 3 | ADD kubernetes/front/ /front 4 | RUN pip install -r /front/requirements.txt 5 | ADD kubeflow/components/model/queries/lambdamart0/es_query.json /front/es_query.json 6 | ENV PORT 8088 7 | CMD ["gunicorn", "app:app", "--config=config.py"] 8 | -------------------------------------------------------------------------------- /kubeflow/pipe_role_biding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: pipeline-runner-binding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: Role 8 | name: pipeline-runner 9 | subjects: 10 | - kind: ServiceAccount 11 | name: default 12 | -------------------------------------------------------------------------------- /kubeflow/components/prepare_env/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.7-slim as python 2 | 3 | COPY kubeflow/components/prepare_env /prepare_env 4 | WORKDIR /prepare_env 5 | COPY ./key.json . 
6 | 7 | ENV GOOGLE_APPLICATION_CREDENTIALS=./key.json 8 | 9 | RUN pip install -r requirements.txt 10 | 11 | ENTRYPOINT ["python", "run.py"] 12 | -------------------------------------------------------------------------------- /kubeflow/nfs-server-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: nfs-server 5 | namespace: kubeflow 6 | spec: 7 | ports: 8 | - name: nfs 9 | port: 2049 10 | - name: mountd 11 | port: 20048 12 | - name: rpcbind 13 | port: 111 14 | selector: 15 | role: nfs-server 16 | -------------------------------------------------------------------------------- /kubeflow/components/data/validation/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.7-slim as python 2 | 3 | COPY kubeflow/components/data/validation/ /validation 4 | WORKDIR /validation 5 | COPY ./key.json . 6 | 7 | ENV GOOGLE_APPLICATION_CREDENTIALS=./key.json 8 | 9 | RUN pip install -r requirements.txt 10 | 11 | ENTRYPOINT ["python", "run.py"] 12 | -------------------------------------------------------------------------------- /kubeflow/components/data/train/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.7-slim as python 2 | 3 | COPY kubeflow/components/data/train/ /train 4 | WORKDIR /train 5 | COPY ./key.json . 
6 | 7 | ENV GOOGLE_APPLICATION_CREDENTIALS=key.json 8 | 9 | RUN pip install -r requirements.txt 10 | RUN pip install -U pyClickModels 11 | 12 | ENTRYPOINT ["sh", "-c"] 13 | -------------------------------------------------------------------------------- /kubeflow/build/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | SUBSTITUTIONS=\ 5 | _COMPUTE_ZONE='us-central1-a',\ 6 | _CLUSTER_NAME='pysearchml',\ 7 | _VERSION='0.0.0' 8 | 9 | ./kubeflow/build/manage_service_account.sh 10 | 11 | gcloud builds submit --no-source --config kubeflow/build/cloudbuild.yaml --substitutions $SUBSTITUTIONS --timeout=2h 12 | -------------------------------------------------------------------------------- /kubernetes/es/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | es: 5 | build: 6 | context: ./ 7 | dockerfile: Dockerfile 8 | ports: 9 | - "9200:9200" 10 | tty: true 11 | environment: 12 | - discovery.type=single-node 13 | ulimits: 14 | memlock: 15 | soft: -1 16 | hard: -1 17 | -------------------------------------------------------------------------------- /kubernetes/front/config.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | from os import environ as env 3 | 4 | 5 | PORT = int(env.get("PORT", 8088)) 6 | DEBUG_MODE = int(env.get("DEBUG_MODE", 1)) 7 | 8 | # Gunicorn config 9 | bind = ":" + str(PORT) 10 | workers = multiprocessing.cpu_count() * 2 + 1 11 | workers = 1 12 | threads = 2 * multiprocessing.cpu_count() 13 | threads = 2 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pySearchML 2 | 3 | A complete AI Based Search Engine built on top of Elasticsearch, Kubeflow and Katib. 
4 | 5 | Please refer to this [post](https://towardsdatascience.com/building-a-complete-ai-based-search-engine-with-elasticsearch-kubeflow-and-katib-590c7b27eb8f?source=friends_link&sk=dfbf728b708eaa6546edd877844a9a42) for a full detail of how the system works. 6 | -------------------------------------------------------------------------------- /kubeflow/components/data/train/tests/fixtures/msearch_call2.txt: -------------------------------------------------------------------------------- 1 | {"index": "test"} 2 | {"query": {"bool": {"filter": [{"terms": {"_id": ["doc0"]}}], "should": [{"sltr": {"_name": "logged_featureset", "featureset": "model_name_test", "params": {"search_term": "keyword2"}}}]}}, "_source": ["_id"], "ext": {"ltr_log": {"log_specs": {"name": "main", "named_query": "logged_featureset"}}}} 3 | -------------------------------------------------------------------------------- /kubeflow/components/prepare_env/lambdamart0/features/name.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "bool": { 4 | "minimum_should_match": 1, 5 | "should": [ 6 | { 7 | "match": { 8 | "name": "{{search_term}}" 9 | } 10 | } 11 | ] 12 | } 13 | }, 14 | "params": ["search_term"], 15 | "name": "BM25 name" 16 | } 17 | -------------------------------------------------------------------------------- /kubeflow/components/model/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8 as java 2 | COPY --from=python:3.7.7-slim / / 3 | 4 | COPY kubeflow/components/model /model 5 | COPY kubeflow/components/common/launch_crd.py /model/launch_crd.py 6 | WORKDIR /model 7 | COPY ./key.json . 
8 | 9 | ARG PROJECT_ID 10 | 11 | ENV GOOGLE_APPLICATION_CREDENTIALS=key.json \ 12 | PROJECT_ID=$PROJECT_ID 13 | 14 | RUN pip install -r requirements.txt 15 | 16 | ENTRYPOINT ["sh", "-c"] 17 | -------------------------------------------------------------------------------- /kubeflow/components/prepare_env/lambdamart0/features/category.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "bool": { 4 | "minimum_should_match": 1, 5 | "should": [ 6 | { 7 | "match": { 8 | "category": "{{search_term}}" 9 | } 10 | } 11 | ] 12 | } 13 | }, 14 | "params": ["search_term"], 15 | "name": "BM25 category" 16 | } 17 | -------------------------------------------------------------------------------- /kubeflow/pipelines/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM google/cloud-sdk:alpine as gcloud 2 | FROM python:3.7.7-slim as python 3 | 4 | COPY ./key.json /key.json 5 | 6 | ENV GOOGLE_APPLICATION_CREDENTIALS=/key.json 7 | ENV PATH=/google-cloud-sdk/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin 8 | 9 | RUN pip install kfp --upgrade 10 | RUN pip install fire 11 | 12 | COPY --from=gcloud /google-cloud-sdk /google-cloud-sdk 13 | 14 | ENTRYPOINT ["sh"] 15 | -------------------------------------------------------------------------------- /bin/get_pipe_host.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #HOST=$(kubectl describe configmap inverse-proxy-config -n kubeflow | grep googleusercontent.com) 4 | HOST="127.0.0.1:7067/pipeline" 5 | 6 | # Means Proxy is still being created 7 | if [ -z "$HOST" ]; then 8 | echo 'Sleeping 2 mins so Kubeflow Inverse Proxy is ready' 9 | sleep 2m 10 | HOST=$(kubectl describe configmap inverse-proxy-config -n kubeflow | grep googleusercontent.com) 11 | fi 12 | 13 | echo "$HOST" > k8_host.txt 14 | 
-------------------------------------------------------------------------------- /kubeflow/pv-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: pysearchml-nfs 5 | namespace: kubeflow 6 | spec: 7 | capacity: 8 | storage: 200Gi 9 | accessModes: 10 | - ReadWriteMany 11 | nfs: 12 | server: 13 | path: "/" 14 | 15 | --- 16 | apiVersion: v1 17 | kind: PersistentVolumeClaim 18 | metadata: 19 | name: pysearchml-nfs 20 | namespace: kubeflow 21 | spec: 22 | accessModes: 23 | - ReadWriteMany 24 | storageClassName: "" 25 | resources: 26 | requests: 27 | storage: 200Gi 28 | -------------------------------------------------------------------------------- /kubeflow/components/prepare_env/lambdamart0/features/avg_customer_price.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "function_score": { 4 | "query": { 5 | "match_all": {} 6 | }, 7 | "script_score" : { 8 | "script" : { 9 | "params": { 10 | "customer_avg_ticket": "{{customer_avg_ticket}}" 11 | }, 12 | "source": "return Math.log(1 + Math.abs(doc['price'].value - Float.parseFloat(params.customer_avg_ticket)))" 13 | } 14 | } 15 | } 16 | }, 17 | "params": ["customer_avg_ticket"], 18 | "name": "customer_avg_ticket" 19 | } 20 | -------------------------------------------------------------------------------- /kubeflow/components/data/train/tests/fixtures/msearch_call1.txt: -------------------------------------------------------------------------------- 1 | {"index": "test"} 2 | {"query": {"bool": {"filter": [{"terms": {"_id": ["doc0", "doc1", "doc2"]}}], "should": [{"sltr": {"_name": "logged_featureset", "featureset": "model_name_test", "params": {"search_term": "keyword0"}}}]}}, "_source": ["_id"], "ext": {"ltr_log": {"log_specs": {"name": "main", "named_query": "logged_featureset"}}}} 3 | {"index": "test"} 4 | {"query": {"bool": {"filter": [{"terms": {"_id": 
["doc", "doc1"]}}], "should": [{"sltr": {"_name": "logged_featureset", "featureset": "test_feature_set_name", "params": {"search_term": "keyword1"}}}]}}, "_source": ["_id"], "ext": {"ltr_log": {"log_specs": {"name": "main", "named_query": "logged_featureset"}}}} 5 | -------------------------------------------------------------------------------- /kubernetes/front/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Visualizer

6 | 7 |
8 | Input Query:
9 | 10 |
11 | Items to Return:
12 | 13 |
14 | Model Name:
15 | 16 |
17 | Channel Group:
18 | 19 |
20 | Customer Avg Ticket:
21 | 22 |
23 | 24 | 25 |
26 |
27 | 28 |
29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /kubeflow/components/model/post_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | from train import post_model_to_elasticsearch 5 | 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument( 10 | '--es_host', 11 | dest='es_host', 12 | type=str, 13 | help='Host address to reach Elasticsearch.' 14 | ) 15 | parser.add_argument( 16 | '--destination', 17 | dest='destination', 18 | type=str, 19 | help='Path where validation score is should be saved to.' 20 | ) 21 | parser.add_argument( 22 | '--model_name', 23 | dest='model_name', 24 | type=str, 25 | help='Name of featureset store as saved in Elasticsearch.' 26 | ) 27 | args, _ = parser.parse_known_args(sys.argv[1:]) 28 | post_model_to_elasticsearch(args.es_host, args.model_name, 29 | f'{args.destination}') 30 | -------------------------------------------------------------------------------- /kubeflow/disk-busybox.yaml: -------------------------------------------------------------------------------- 1 | #https://github.com/mappedinn/kubernetes-nfs-volume-on-gke/blob/master/config-yml-files/04-dep-busybox.yml 2 | apiVersion: extensions/v1beta1 3 | kind: Deployment 4 | metadata: 5 | name: nfs-busybox 6 | namespace: kubeflow 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | name: nfs-busybox 12 | template: 13 | metadata: 14 | labels: 15 | name: nfs-busybox 16 | spec: 17 | containers: 18 | - image: busybox 19 | command: 20 | - sh 21 | - -c 22 | - 'while true; do date > /mnt/index.html; hostname >> /mnt/index.html; sleep $(($RANDOM % 5 + 5)); done' 23 | imagePullPolicy: IfNotPresent 24 | name: busybox 25 | volumeMounts: 26 | - name: my-pvc-nfs 27 | mountPath: "/mnt" 28 | volumes: 29 | - name: my-pvc-nfs 30 | persistentVolumeClaim: 31 | claimName: pysearchml-nfs 32 | 
-------------------------------------------------------------------------------- /kubeflow/nfs-server.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: nfs-server 5 | namespace: kubeflow 6 | spec: 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | role: nfs-server 11 | template: 12 | metadata: 13 | labels: 14 | role: nfs-server 15 | spec: 16 | containers: 17 | - name: nfs-server 18 | image: gcr.io/google_containers/volume-nfs:0.8 19 | ports: 20 | - name: nfs 21 | containerPort: 2049 22 | - name: mountd 23 | containerPort: 20048 24 | - name: rpcbind 25 | containerPort: 111 26 | securityContext: 27 | privileged: true 28 | volumeMounts: 29 | - mountPath: /exports 30 | name: mypvc 31 | volumes: 32 | - name: mypvc 33 | gcePersistentDisk: 34 | pdName: pysearchml-nfs-disk 35 | fsType: ext4 36 | -------------------------------------------------------------------------------- /kubeflow/components/model/tests/test_validate.py: -------------------------------------------------------------------------------- 1 | import mock 2 | from collections import namedtuple 3 | 4 | from validate import validate_model 5 | 6 | 7 | def test_validate_model(monkeypatch, es_response): 8 | es_mock = mock.Mock() 9 | es_mock.return_value.msearch.side_effect = es_response 10 | monkeypatch.setattr('validate.Elasticsearch', es_mock) 11 | 12 | args = namedtuple( 13 | 'args', 14 | [ 15 | 'files_path', 16 | 'index', 17 | 'es_host', 18 | 'model_name', 19 | 'es_batch' 20 | ] 21 | ) 22 | args.files_path = 'tests/fixtures/' 23 | args.es_query_path = 'queries/unittest/es_query.json' 24 | args.index = 'index_test' 25 | args.es_host = 'es_host_test' 26 | args.model_name = 'unittest' 27 | args.es_batch = 2 28 | 29 | rank = validate_model( 30 | args.files_path, 31 | args.es_host, 32 | args.model_name, 33 | args.index, 34 | args.es_batch 35 | ) 36 | assert rank == 0.6 37 | 
-------------------------------------------------------------------------------- /kubernetes/front/templates/documents.j2: -------------------------------------------------------------------------------- 1 | {% macro build_product(id, score) %} 2 |
3 |
4 |

{{ id }}

5 |
6 |
7 | Score: {{ score }} 8 |
9 |
10 | {% endmacro %} 11 | 12 | 30 | 31 |
32 | {% for product in product_list %} 33 | {{ build_product( 34 | product['_id'], 35 | product['_score'], 36 | ) }} 37 | {% endfor %} 38 |
39 | -------------------------------------------------------------------------------- /kubernetes/front/app.yaml: -------------------------------------------------------------------------------- 1 | kind: Namespace 2 | apiVersion: v1 3 | metadata: 4 | name: front 5 | 6 | --- 7 | kind: Service 8 | apiVersion: v1 9 | metadata: 10 | name: front 11 | namespace: front 12 | labels: 13 | app: front 14 | spec: 15 | selector: 16 | app: front 17 | clusterIP: None 18 | ports: 19 | - port: 8088 20 | targetPort: 8088 21 | 22 | --- 23 | apiVersion: apps/v1beta2 24 | kind: Deployment 25 | metadata: 26 | name: front 27 | namespace: front 28 | labels: 29 | app: front 30 | spec: 31 | replicas: 1 32 | selector: 33 | matchLabels: 34 | app: front 35 | template: 36 | metadata: 37 | name: front 38 | namespace: front 39 | labels: 40 | app: front 41 | spec: 42 | containers: 43 | - name: front 44 | image: willfuks/pysearchml_front 45 | ports: 46 | - containerPort: 8088 47 | resources: 48 | requests: 49 | memory: 256Mi 50 | limits: 51 | memory: 512Mi 52 | env: 53 | - name: DEBUG_MODE 54 | value: "1" 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Willian Fuks 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /kubeflow/components/prepare_env/lambdamart0/features/channel_group.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "function_score": { 4 | "query": { 5 | "match_all": {} 6 | }, 7 | "script_score" : { 8 | "script" : { 9 | "params": { 10 | "channel_group": "{{channel_group}}" 11 | }, 12 | "source": "if (params.channel_group == 'paid_search') { return doc['performances.channel.paid_search.CTR'].value * 10 } else if (params.channel_group == 'referral') { return doc['performances.channel.referral.CTR'].value * 10 } else if (params.channel_group == 'organic_search') { return doc['performances.channel.organic_search.CTR'].value * 10 } else if (params.channel_group == 'social') { return doc['performances.channel.social.CTR'].value * 10 } else if (params.channel_group == 'display') { return doc['performances.channel.display.CTR'].value * 10 } else if (params.channel_group == 'direct') { return doc['performances.channel.direct.CTR'].value * 10 } else if (params.channel_group == 'affiliates') { return doc['performances.channel.affiliates.CTR'].value * 10 }" 13 | } 14 | } 15 | } 16 | }, 17 | "params": ["channel_group"], 18 | "name": "channel_group" 19 | } 20 | -------------------------------------------------------------------------------- /bin/manage_service_account.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -eu 4 | 5 | #PROJECT_ID=$(gcloud config get-value project) 6 | PROJECT_ID=$PROJECT_ID 7 | NAME=pysearchml 8 | SECRET_NAME=pysearchml-service-account 9 | 10 | if ! [ -z "$(gcloud secrets list | grep $SECRET_NAME)" ]; then 11 | gcloud secrets versions access latest --secret=$SECRET_NAME > key.json 12 | echo Downloaded Secret 13 | fi 14 | 15 | if [ -e key.json ]; then 16 | echo File key.json already available. 17 | else 18 | echo Creating service account and downloading key... 19 | 20 | SERVICE_ACCOUNT=$(gcloud iam service-accounts list --filter NAME=$NAME) 21 | if [ -z "$SERVICE_ACCOUNT" ]; then 22 | gcloud iam service-accounts create $NAME --project $PROJECT_ID \ 23 | --display-name $NAME 24 | 25 | for ROLE in roles/editor roles/storage.admin roles/bigquery.admin roles/storage.objectAdmin; 26 | do 27 | gcloud projects add-iam-policy-binding $PROJECT_ID \ 28 | --member=serviceAccount:$NAME@$PROJECT_ID.iam.gserviceaccount.com \ 29 | --role=$ROLE 30 | done 31 | echo Created Service Account $NAME 32 | fi 33 | 34 | gcloud iam service-accounts keys create ./key.json --iam-account $NAME@$PROJECT_ID.iam.gserviceaccount.com 35 | 36 | echo Creating Secret File 37 | gcloud secrets create $SECRET_NAME --data-file=key.json --replication-policy=automatic 38 | echo New Secret File Created 39 | 40 | echo Finished downloading secrets key.json file 41 | fi 42 | -------------------------------------------------------------------------------- /kubeflow/build/manage_service_account.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #set -e 4 | 5 | #PROJECT_ID=$(gcloud config get-value project) 6 | PROJECT_ID=$PROJECT_ID 7 | NAME=pysearchml 8 | SECRET_NAME=pysearchml-service-account 9 | 10 | if ! 
[ -z "$(gcloud secrets list | grep $SECRET_NAME)" ]; then 11 | gcloud secrets versions access latest --secret=$SECRET_NAME > key.json 12 | echo Downloaded Secret 13 | fi 14 | 15 | if [ -e key.json ]; then 16 | echo File key.json already available. 17 | else 18 | echo Creating service account and downloading key... 19 | 20 | SERVICE_ACCOUNT=$(gcloud iam service-accounts list --filter NAME=$NAME) 21 | if [ -z "$SERVICE_ACCOUNT" ]; then 22 | gcloud iam service-accounts create $NAME --project $PROJECT_ID \ 23 | --display-name $NAME 24 | 25 | for ROLE in roles/editor roles/storage.admin roles/bigquery.admin roles/storage.objectAdmin; 26 | do 27 | gcloud projects add-iam-policy-binding $PROJECT_ID \ 28 | --member=serviceAccount:$NAME@$PROJECT_ID.iam.gserviceaccount.com \ 29 | --role=$ROLE 30 | done 31 | echo Created Service Account $NAME 32 | fi 33 | 34 | gcloud iam service-accounts keys create ./key.json --iam-account $NAME@$PROJECT_ID.iam.gserviceaccount.com 35 | 36 | echo Creating Secret File 37 | gcloud secrets create $SECRET_NAME --data-file=key.json --replication-policy=automatic 38 | echo New Secret File Created 39 | 40 | echo Finished downloading secrets key.json file 41 | fi 42 | -------------------------------------------------------------------------------- /kubernetes/front/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from flask import Flask, request, jsonify 4 | from jinja2 import Environment, FileSystemLoader 5 | from elasticsearch import Elasticsearch 6 | 7 | 8 | es = Elasticsearch('elasticsearch.elastic-system.svc.cluster.local:9200') 9 | app = Flask(__name__) 10 | env = Environment(loader=FileSystemLoader('/front/templates')) 11 | 12 | 13 | @app.route("/", methods=['GET', 'POST']) 14 | def index(): 15 | index_html = env.get_template('index.html').render() 16 | return index_html 17 | 18 | 19 | @app.route("/searchresults", methods=['POST']) 20 | def search(): 21 | try: 22 | args = 
request.form.to_dict() 23 | es_query = open('/front/es_query.json').read() 24 | print(args) 25 | input_query = args['search_term'] 26 | size = args.pop('size') 27 | model_name = args.pop('model_name') 28 | 29 | es_query = es_query.replace('{query}', input_query) 30 | es_query = json.loads(es_query) 31 | es_query['size'] = size 32 | es_query['_source'] = [] 33 | 34 | es_query['rescore']['window_size'] = 500 35 | es_query['rescore']['query']['rescore_query']['sltr']['params'] = args 36 | es_query['rescore']['query']['rescore_query']['sltr']['model'] = model_name 37 | 38 | if 'ltr_flag' not in args: 39 | es_query.pop('rescore') 40 | 41 | r = es.search(index='pysearchml', body=es_query).get('hits', {}).get('hits') 42 | r = [(e['_id'], e['_score']) for e in r] 43 | return jsonify(r) 44 | # return env.get_template('documents.j2').render(product_list=r) 45 | except Exception as e: 46 | return str(e) 47 | -------------------------------------------------------------------------------- /kubeflow/components/model/test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from typing import List, NamedTuple 4 | 5 | from validate import validate_model 6 | 7 | 8 | def parse_args(args: List) -> NamedTuple: 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument( 11 | '--files_path', 12 | dest='files_path', 13 | type=str, 14 | help='Path to files containing data of customers searches and their purchases' 15 | ) 16 | parser.add_argument( 17 | '--index', 18 | dest='index', 19 | type=str, 20 | default='pysearchml', 21 | help='Name of Index where documents are stored in Elasticsearch.' 22 | ) 23 | parser.add_argument( 24 | '--es_host', 25 | dest='es_host', 26 | type=str, 27 | help='Host address to reach Elasticsearch.' 28 | ) 29 | parser.add_argument( 30 | '--model_name', 31 | dest='model_name', 32 | type=str, 33 | help='Assigns a name for the RankLib model. 
Each experiment on Kubeflow should ' 34 | 'have a specific name in order to preserver their results.' 35 | ) 36 | parser.add_argument( 37 | '--es_batch', 38 | dest='es_batch', 39 | type=int, 40 | default=1000, 41 | help='Determines how many items to send at once to Elasticsearch when using ' 42 | 'multisearch API.' 43 | ) 44 | args, _ = parser.parse_known_args(args) 45 | return args 46 | 47 | 48 | if __name__ == '__main__': 49 | args = parse_args(sys.argv[1:]) 50 | test_rank = validate_model( 51 | args.files_path, 52 | args.es_host, 53 | args.model_name, 54 | args.index, 55 | args.es_batch 56 | ) 57 | print(f'Test-rank={test_rank}') 58 | -------------------------------------------------------------------------------- /kubeflow/components/model/queries/lambdamart0/es_query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "function_score": { 4 | "query": { 5 | "bool": { 6 | "must": { 7 | "bool": { 8 | "minimum_should_match": 1, 9 | "should": [ 10 | { 11 | "multi_match": { 12 | "operator": "and", 13 | "query": "{query}", 14 | "type": "cross_fields", 15 | "fields": [ 16 | "sku", 17 | "name", 18 | "category" 19 | ] 20 | } 21 | } 22 | ] 23 | } 24 | } 25 | } 26 | }, 27 | "functions": [ 28 | { 29 | "field_value_factor": { 30 | "field": "performances.global.CTR", 31 | "factor": 10, 32 | "missing": 0, 33 | "modifier": "none" 34 | } 35 | } 36 | ], 37 | "boost_mode": "sum", 38 | "score_mode": "sum" 39 | } 40 | }, 41 | "rescore": { 42 | "window_size": "{window_size}", 43 | "query": { 44 | "rescore_query": { 45 | "sltr": { 46 | "params": "{search_keys}", 47 | "model": "{model_name}" 48 | } 49 | }, 50 | "rescore_query_weight": 20, 51 | "query_weight": 0.1, 52 | "score_mode": "total" 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /kubeflow/components/model/tests/conftest.py: -------------------------------------------------------------------------------- 1 | 
@pytest.fixture
def es_response():
    """Canned Elasticsearch ``msearch`` payloads for two consecutive calls.

    The first element carries two responses, the second carries one; every
    response lists hits doc3 down to doc0, mimicking a ranked result list.
    """
    def _ranked_hits():
        return {'hits': {'hits': [{'_id': f'doc{i}'} for i in (3, 2, 1, 0)]}}

    first_call = {'responses': [_ranked_hits(), _ranked_hits()]}
    second_call = {'responses': [_ranked_hits()]}
    return [first_call, second_call]
def main(action, host, **kwargs):
    """Dispatch CLI actions for managing the pysearchml Kubeflow pipeline.

    Args:
        action: Either ``'deploy-pipeline'`` or ``'run-pipeline'``.
        host: Address of the Kubeflow Pipelines API host.
        **kwargs: ``version`` for both actions; ``experiment_name`` is also
            required for ``'run-pipeline'``.

    Raises:
        ValueError: If ``action`` is not a recognized operation.
    """
    if action == 'deploy-pipeline':
        version = kwargs.get('version')
        deploy_pipeline(host, version)
    elif action == 'run-pipeline':
        # Bug fix: run_experiment has signature (host, version, experiment_name);
        # previously only experiment_name was passed, landing in the `host` slot
        # and raising a TypeError for the missing positional arguments.
        version = kwargs.get('version')
        experiment_name = kwargs['experiment_name']
        run_experiment(host, version, experiment_name)
    else:
        raise ValueError(f'Invalid operation name: {action}.')
9 | } 10 | }, 11 | "spec": { 12 | "objective": { 13 | "type": "minimize", 14 | "objectiveMetricName": "Validation-rank", 15 | "additionalMetricNames": [ 16 | "rank" 17 | ] 18 | }, 19 | "algorithm": { 20 | "algorithmName": "bayesianoptimization" 21 | }, 22 | "parallelTrialCount": 2, 23 | "maxTrialCount": 6, 24 | "maxFailedTrialCount": 1, 25 | "parameters": [], 26 | "trialTemplate": { 27 | "goTemplate": { 28 | "rawTemplate": { 29 | "apiVersion": "batch/v1", 30 | "kind": "Job", 31 | "metadata":{ 32 | "name": "{{.Trial}}", 33 | "namespace": "{{.NameSpace}}" 34 | }, 35 | "spec": { 36 | "template": { 37 | "spec": { 38 | "restartPolicy": "Never", 39 | "containers": [ 40 | { 41 | "name": "{{.Trial}}", 42 | "image": "gcr.io/{PROJECT_ID}/model", 43 | "command": [ 44 | "python /model/train.py --train_file_path={train_file_path} --validation_files_path={validation_files_path} --validation_train_files_path={validation_train_files_path} --es_host={es_host} --destination={destination} --model_name={model_name} --ranker={ranker} {{- with .HyperParameters}} {{- range .}} {{.Name}}={{.Value}} {{- end}} {{- end}}" 45 | ], 46 | "volumeMounts": [ 47 | { 48 | "mountPath": "/data", 49 | "name": "pysearchmlpvc", 50 | "readOnly": false 51 | } 52 | ] 53 | } 54 | ], 55 | "volumes": [ 56 | { 57 | "name": "pysearchmlpvc", 58 | "persistentVolumeClaim": { 59 | "claimName": "pysearchml-nfs", 60 | "readOnly": false 61 | } 62 | } 63 | ] 64 | } 65 | } 66 | } 67 | } 68 | } 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | !kubeflow/build 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | 
wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | # json key files 133 | key.json 134 | 135 | # KFP pipelines 136 | *.tar.gz 137 | kf_deploys 138 | -------------------------------------------------------------------------------- /kubernetes/es/deploy_elasticsearch.yaml: -------------------------------------------------------------------------------- 1 | kind: Namespace 2 | apiVersion: v1 3 | metadata: 4 | name: elastic-system 5 | 6 | --- 7 | kind: Service 8 | apiVersion: v1 9 | metadata: 10 | name: elasticsearch 11 | namespace: elastic-system 12 | labels: 13 | app: elasticsearch 14 | spec: 15 | selector: 16 | app: elasticsearch 17 | clusterIP: None 18 | ports: 19 | - port: 9200 20 | name: rest 21 | - port: 9300 22 | name: inter-node 23 | 24 | --- 25 | apiVersion: apps/v1 26 | kind: StatefulSet 27 | metadata: 28 | name: es-cluster 29 | namespace: elastic-system 30 | spec: 31 | serviceName: elasticsearch 32 | replicas: 1 33 | selector: 34 | matchLabels: 35 | app: elasticsearch 36 | template: 37 | metadata: 38 | labels: 39 | app: elasticsearch 40 | spec: 41 | containers: 42 | - name: elasticsearch 43 | #image: docker.elastic.co/elasticsearch/elasticsearch:7.3.1 44 | image: willfuks/ltres:0.1 45 | resources: 46 | limits: 47 | cpu: 1 48 | ports: 49 | - containerPort: 9200 50 | name: rest 51 | protocol: TCP 52 | - containerPort: 9300 53 | name: inter-node 54 | protocol: TCP 55 | volumeMounts: 56 | - name: data 57 | mountPath: 
#!/bin/sh
# Bootstraps the pysearchml GKE cluster: creates the cluster when it does not
# already exist, then installs Kubeflow Pipelines, an NFS server, Katib,
# Elasticsearch and the front-end app. Requires $PROJECT_ID and $COMPUTE_ZONE.

CLUSTER_EXISTS=true
CLUSTER_NAME=${CLUSTER_NAME:-"pysearchml"}

echo "cluster name: ${CLUSTER_NAME}"

gcloud config set project $PROJECT_ID 2>/dev/null
gcloud config set compute/zone $COMPUTE_ZONE 2>/dev/null

if [ -z $PROJECT_ID ] || [ -z $COMPUTE_ZONE ]; then
  echo Error: Please set properly env variables PROJECT_ID and COMPUTE_ZONE
  exit 1
fi

# If `describe` fails, the cluster is assumed not to exist yet.
gcloud container clusters describe $CLUSTER_NAME 2>/dev/null || CLUSTER_EXISTS=false

if [ $CLUSTER_EXISTS = false ]; then
  gcloud container clusters create $CLUSTER_NAME \
    --enable-autoupgrade \
    --scopes cloud-platform \
    --machine-type n1-standard-2 \
    --zone=$COMPUTE_ZONE \
    --disk-size=30GB \
    --num-nodes=2

  gcloud components install kubectl
  gcloud container clusters get-credentials $CLUSTER_NAME --zone=$COMPUTE_ZONE

  # Install Kubeflow Pipelines
  export PIPELINE_VERSION=0.5.1
  kubectl apply -k "github.com/kubeflow/pipelines/manifests/kustomize/cluster-scoped-resources?ref=$PIPELINE_VERSION"
  kubectl wait --for condition=established --timeout=60s crd/applications.app.k8s.io
  kubectl apply -k "github.com/kubeflow/pipelines/manifests/kustomize/env/platform-agnostic/?ref=$PIPELINE_VERSION"
  # this step can take a while
  kubectl wait applications/pipeline -n kubeflow --for condition=Ready --timeout=1800s
  # Update namespace to contain metric collector label
  kubectl apply -f kubeflow/namespace.yaml

  # Install NFS in Kubeflow Namespace
  # https://medium.com/platformer-blog/nfs-persistent-volumes-with-kubernetes-a-case-study-ce1ed6e2c266
  gcloud compute disks create --size=10GB --zone=${COMPUTE_ZONE} pysearchml-nfs-disk
  kubectl apply -f kubeflow/nfs-server.yaml
  kubectl apply -f kubeflow/nfs-server-service.yaml
  CLUSTER_IP=$(kubectl -n kubeflow get services nfs-server -o=jsonpath='{.spec.clusterIP}')
  # Inject the NFS server's cluster IP into the PV manifest before applying it.
  sed "0,/^\([[:space:]]*server: *\).*/s//\1$CLUSTER_IP/;" kubeflow/pv-pvc.yaml | kubectl apply -f -
  #yq w -d0 kubeflow/pv-pvc.yaml 'spec.nfs.server' ${CLUSTER_IP} | kubectl apply -f -

  # Install Kustomize
  curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash

  # Install Katib (hyperparameter tuning; fixed comment typo "Kabit")
  git clone git@github.com:kubeflow/manifests.git
  ./kustomize build manifests/katib/katib-crds/base | kubectl apply -f -
  ./kustomize build manifests/katib/katib-controller/base | kubectl apply -f -

  # https://www.digitalocean.com/community/tutorials/how-to-set-up-an-elasticsearch-fluentd-and-kibana-efk-logging-stack-on-kubernetes#step-3-%E2%80%94-creating-the-kibana-deployment-and-service
  kubectl apply -f kubernetes/es/deploy_elasticsearch.yaml

  # Install the visualizer front end
  kubectl apply -f kubernetes/front/app.yaml
fi
CASE WHEN channel = 'Affiliates' THEN STRUCT(COALESCE(IF(SUM(clicks) / COALESCE(SUM(impressions), 1) > 1, 1, SUM(clicks) / COALESCE(SUM(impressions), 1)), 0) AS CTR) ELSE STRUCT(0 AS CTR) END AS affiliates, 19 | CASE WHEN channel = 'Social' THEN STRUCT(COALESCE(IF(SUM(clicks) / COALESCE(SUM(impressions), 1) > 1, 1, SUM(clicks) / COALESCE(SUM(impressions), 1)), 0) AS CTR) ELSE STRUCT(0 AS CTR) END AS social 20 | ) AS channel 21 | ) AS performances 22 | FROM( 23 | SELECT DISTINCT 24 | sku, 25 | channel, 26 | name, 27 | REGEXP_REPLACE(category, '/', ' ') AS category, 28 | SUM(impressions) OVER(PARTITION BY sku) AS global_impressions, 29 | impressions, 30 | SUM(clicks) OVER(PARTITION BY sku) AS global_clicks, 31 | clicks, 32 | AVG(price) OVER(PARTITION BY sku) AS global_price, 33 | FROM( 34 | SELECT 35 | ARRAY( 36 | SELECT AS STRUCT 37 | channelGrouping AS channel, 38 | productSku AS sku, 39 | v2ProductCategory AS category, 40 | v2ProductName AS name, 41 | SUM(CAST(isImpression AS INT64)) AS impressions, 42 | SUM(CAST(isClick AS INT64)) AS clicks, 43 | AVG(productPrice / 1e6) AS price 44 | FROM UNNEST(hits), UNNEST(product) 45 | GROUP BY channel, sku, category, name 46 | ) AS products 47 | FROM `bigquery-public-data.google_analytics_sample.ga_sessions*` 48 | WHERE TRUE 49 | AND REGEXP_EXTRACT(_TABLE_SUFFIX, r'.*_(\d+)$') BETWEEN '20160801' AND '20170801' 50 | ), UNNEST(products) 51 | ) 52 | WHERE TRUE 53 | AND global_impressions > 0 54 | GROUP BY 55 | sku, 56 | channel, 57 | name, 58 | category, 59 | global_impressions, 60 | global_clicks, 61 | global_price 62 | -------------------------------------------------------------------------------- /kubeflow/components/data/validation/run.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import pathlib 5 | import uuid 6 | from shutil import rmtree 7 | 8 | from google.cloud import storage, bigquery 9 | 10 | 11 | PATH = 
def main(validation_init_date, validation_end_date, bucket, destination):
    """Build a validation dataset: query GA data in BigQuery, export it to GCS
    as gzipped NDJSON and download the files into ``destination``.

    Args:
        validation_init_date: Start date (GA table suffix) for the query window.
        validation_end_date: End date (GA table suffix) for the query window.
        bucket: GCS path ``bucket[/prefix]`` where the export lands; the prefix
            distinguishes regular validation from train-data validation.
        destination: Local folder that receives the downloaded ``*.gz`` files.
    """
    # Remove everything and deletes destination folder to receive new files.
    rmtree(destination, ignore_errors=True)
    os.makedirs(destination, exist_ok=True)

    storage_client = storage.Client()
    bq_client = bigquery.Client()

    ds_ref = bq_client.dataset('pysearchml')

    # Random table name so concurrent runs never clash; deleted at the end.
    table_id = str(uuid.uuid4().hex)
    table_ref = ds_ref.table(table_id)

    # Query GA data
    query_path = PATH / 'validation.sql'
    query = open(str(query_path)).read()
    query = query.format(validation_init_date=validation_init_date,
                         validation_end_date=validation_end_date)

    job_config = bigquery.QueryJobConfig()
    job_config.destination = f'{bq_client.project}.pysearchml.{table_id}'
    job_config.maximum_bytes_billed = 10 * (1024 ** 3)  # hard 10 GiB cost cap
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job = bq_client.query(query, job_config=job_config)
    job.result()  # block until the query job finishes

    # export BigQuery table to GCS
    # bucket will be set in accordance to which validation dataset is referenced, i.e.,
    # whether regular validation or validation for the training dataset.
    destination_uri = f"gs://{bucket}/validation*.gz"

    extract_config = bigquery.ExtractJobConfig()
    extract_config.compression = 'GZIP'
    extract_config.destination_format = 'NEWLINE_DELIMITED_JSON'
    job = bq_client.extract_table(table_ref, destination_uri, job_config=extract_config)
    job.result()  # block until the extract job finishes

    # Download data. Split `bucket` into bucket name (before first '/') and
    # blob prefix (after it); each blob is removed from GCS once downloaded.
    bucket_obj = storage_client.bucket(bucket.split('/')[0])
    blobs = bucket_obj.list_blobs(prefix=bucket.partition('/')[-1])
    for blob in blobs:
        blob.download_to_filename(f"{destination}/{blob.name.split('/')[-1]}")
        blob.delete()

    # delete BQ table
    bq_client.delete_table(table_ref)
86 | ) 87 | 88 | args, _ = parser.parse_known_args(sys.argv[1:]) 89 | main( 90 | args.validation_init_date, 91 | args.validation_end_date, 92 | args.bucket, 93 | args.destination 94 | ) 95 | -------------------------------------------------------------------------------- /kubeflow/components/data/validation/validation.sql: -------------------------------------------------------------------------------- 1 | CREATE TEMP FUNCTION PROCESS_SKUS_PURCHASED_FROM_SEARCH(searched_skus ARRAY, purchased_skus ARRAY) RETURNS ARRAY > AS ( 2 | /** 3 | Compares list of skus from the search results and the ones purchased; returns the intersection between the two. 4 | **/ 5 | ARRAY(SELECT AS STRUCT sku, IF(EXISTS(SELECT 1 FROM UNNEST(purchased_skus) AS p_sku WHERE sku = p_sku), TRUE, FALSE) FROM UNNEST(searched_skus) AS sku) 6 | ); 7 | 8 | CREATE TEMP FUNCTION PROCESS_CHANNEL_GROUP(channelGroup STRING) RETURNS STRING AS ( 9 | REGEXP_REPLACE(LOWER(channelGroup), ' ', '_') 10 | ); 11 | 12 | WITH search_data AS( 13 | SELECT 14 | fv, 15 | channel_group, 16 | ARRAY( 17 | SELECT AS STRUCT 18 | query, 19 | ARRAY_AGG(STRUCT(skus.sku AS sku, skus.purchase_flag AS purchase_flag)) AS skus 20 | FROM UNNEST(hits), UNNEST(skus) AS skus 21 | GROUP BY query 22 | ) AS hits 23 | FROM( 24 | SELECT 25 | fv, 26 | channel_group, 27 | ARRAY( 28 | SELECT AS STRUCT 29 | query, 30 | PROCESS_SKUS_PURCHASED_FROM_SEARCH(query_skus, purchased_skus) skus 31 | FROM UNNEST(hits) 32 | ) AS hits 33 | FROM( 34 | SELECT 35 | fullvisitorid AS fv, 36 | COALESCE(PROCESS_CHANNEL_GROUP(channelGrouping), '') AS channel_group, 37 | ARRAY( 38 | SELECT AS STRUCT 39 | REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_EXTRACT(page.pagepath, r'([^\/]+)$'), r'(\+)+', ' '), r't ', 't'), r' s ?', ' '), r'\.axd', '') AS query, 40 | ARRAY_AGG(DISTINCT productSKU IGNORE NULLS) AS query_skus, 41 | FROM UNNEST(hits) LEFT JOIN UNNEST(product) 42 | WHERE productSKU != '(not set)' 43 | AND NOT REGEXP_CONTAINS(page.pagepath, 
@dsl.pipeline(
    name='Train Lambda Mart Pipeline',
    description=('Responsible for generating all datasets and optimization process for'
                 ' the chosen Ranker algorithm.')
)
def build_pipeline(
    bucket='pysearchml',
    es_host='elasticsearch.elastic-system.svc.cluster.local:9200',
    force_restart=False,
    train_init_date='20170801',
    train_end_date='20170801',
    validation_init_date='20170802',
    validation_end_date='20170802',
    model_name='lambdamart0',
    ranker='lambdamart'
):
    """Assemble the KFP DAG: prepare_env -> {train, 2x validation} -> model.

    Each step is loaded from its component.yaml; update_op_project_id_img
    substitutes $PROJECT_ID into the component's container image reference.
    """

    components_path = PATH.parent / 'components'

    # component_path = main_path / 'gcs' / 'component.yaml'
    # gs_op_ = components.load_component_from_file(str(component_path))
    # gs_op_ = update_op_project_id_img(gs_op_)
    # gs_op = gs_op_('gs://pysearchml/requirements.txt', '.').set_display_name('GS')

    component_path = components_path / 'prepare_env' / 'component.yaml'
    prepare_op_ = components.load_component_from_file(str(component_path))
    prepare_op_ = update_op_project_id_img(prepare_op_)

    prepare_op = prepare_op_(
        bucket=bucket,
        es_host=es_host,
        force_restart=force_restart,
        model_name=model_name
    ).set_display_name('Preparing Environment')

    component_path = components_path / 'data' / 'validation' / 'component.yaml'
    validation_op_ = components.load_component_from_file(str(component_path))
    validation_op_ = update_op_project_id_img(validation_op_)

    # Same validation component is instantiated twice: once over the validation
    # date window and once over the train date window.
    val_reg_op = validation_op_(
        bucket=f'{bucket}/validation/regular',
        validation_init_date=validation_init_date,
        validation_end_date=validation_end_date
    ).set_display_name('Build Regular Validation Dataset.').after(prepare_op)

    val_train_op = validation_op_(
        bucket=f'{bucket}/validation/train',
        validation_init_date=train_init_date,
        validation_end_date=train_end_date
    ).set_display_name('Build Validation Dataset of Train Data.').after(prepare_op)

    component_path = components_path / 'data' / 'train' / 'component.yaml'
    train_op_ = components.load_component_from_file(str(component_path))
    train_op_ = update_op_project_id_img(train_op_)

    train_op = train_op_(
        bucket=bucket,
        train_init_date=train_init_date,
        train_end_date=train_end_date,
        es_host=es_host,
        model_name=model_name
    ).set_display_name('Build Train RankLib Dataset.').after(prepare_op)

    component_path = components_path / 'model' / 'component.yaml'
    model_op_ = components.load_component_from_file(str(component_path))
    model_op_ = update_op_project_id_img(model_op_)

    # Katib optimization step consumes the file paths produced by the three
    # dataset steps, hence the explicit .after(...) fan-in.
    model_op = model_op_(
        name='lambdamart',
        train_file_path=train_op.outputs['destination'],
        validation_files_path=val_reg_op.outputs['destination'],
        validation_train_files_path=val_train_op.outputs['destination'],
        es_host=es_host,
        model_name=model_name,
        ranker=ranker
    ).set_display_name('Launch Katib Optimization').after(val_reg_op,
                                                          val_train_op,
                                                          train_op)

    # Final step just echoes the model step's output for visibility in the UI.
    _ = dsl.ContainerOp(
        name="my-out-cop",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["echo hyperparameter: %s" % model_op.output],
    )
position, MAX(IF(purchase = 1, position, NULL)) OVER() AS max_position FROM UNNEST(session_docs) ORDER BY position) AS session_docs 25 | FROM UNNEST(hits) 26 | WHERE EXISTS(SELECT 1 FROM UNNEST(session_docs) WHERE click = 1) AND (SELECT SUM(purchase) FROM UNNEST(session_docs)) <= 1 27 | ) AS hits 28 | FROM( 29 | SELECT 30 | fv, 31 | ARRAY( 32 | SELECT AS STRUCT 33 | query, 34 | ARRAY_AGG(STRUCT(h.doc AS doc, IF(purchase = 1, 1, click) AS click, purchase, position)) AS session_docs 35 | FROM UNNEST(hits) AS h 36 | GROUP BY query 37 | ) AS hits 38 | FROM( 39 | SELECT 40 | fv, 41 | ARRAY( 42 | SELECT AS STRUCT 43 | query, 44 | doc, 45 | MAX(click) AS click, 46 | MAX(IF(EXISTS(SELECT 1 FROM UNNEST(purchased_docs) AS purchased_doc where purchased_doc = doc), 1, 0)) AS purchase, 47 | MIN(position) AS position 48 | FROM UNNEST(hits) 49 | GROUP BY query, doc 50 | ) AS hits 51 | FROM( 52 | SELECT 53 | fullvisitorid as fv, 54 | ARRAY( 55 | SELECT AS STRUCT 56 | REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_EXTRACT(page.pagepath, r'([^\/]+)$'), r'(\+)+', ' '), r't ', 't'), r' s ?', ' '), r'\.axd', '') AS query, 57 | productSKU AS doc, 58 | IF(isClick, 1, 0) AS click, 59 | ROW_NUMBER() OVER() AS position 60 | FROM UNNEST(hits) LEFT JOIN UNNEST(product) 61 | WHERE TRUE 62 | AND productSKU != '(not set)' 63 | AND NOT REGEXP_CONTAINS(page.pagepath, r'\.html|home') AND REGEXP_CONTAINS(page.pagepath, r'google\+redesign') 64 | 65 | ) AS hits, 66 | ARRAY(SELECT productSKU FROM UNNEST(hits), UNNEST(product) WHERE ecommerceAction.action_type = '6') AS purchased_docs 67 | FROM `bigquery-public-data.google_analytics_sample.ga_sessions*` 68 | WHERE TRUE 69 | AND REGEXP_EXTRACT(_TABLE_SUFFIX, r'.*_(\d+)$') BETWEEN '{train_init_date}' AND '{train_end_date}' 70 | ) 71 | ) 72 | ) 73 | ) 74 | ), 75 | customer_data AS( 76 | SELECT 77 | fv, 78 | channel_group, 79 | CAST(COALESCE((SELECT AVG(avg_ticket) FROM UNNEST(ticket_array) AS avg_ticket), 0) AS INT64) AS avg_ticket 80 | 
def test_train_model(monkeypatch, tmpdir_factory):
    """Exercise train.main three times and assert best-model bookkeeping.

    RankLib invocation (os.system), Elasticsearch posting and the validator
    pool are all mocked; each run writes a fake model file and the mocked
    Pool.map yields [rank_val, rank_train]. The test checks that results.txt
    accumulates every run while best_rank/best_model only improve.
    """
    post_mock = mock.Mock()
    os_system_mock = mock.Mock()

    tmp_folder = tmpdir_factory.mktemp('unittest')

    def write_file(tmp_folder: str, context: int):
        # Simulates RankLib saving a model; `context` tags the file content so
        # assertions can tell which "model" is currently on disk.
        os.makedirs(str(tmp_folder), exist_ok=True)
        with open(f'{tmp_folder}/model.txt', 'w') as f:
            f.write(f'model definition: {context}')

    # write_file returns None; calling it here is for its side effect only.
    os_system_mock.return_value = write_file(str(tmp_folder), 1)

    datetime_mock = mock.Mock()
    datetime_mock.today.return_value.strftime.return_value = 'todays date'

    partial_mock = mock.Mock()
    partial_mock.return_value = 'partial function'

    pool_mock = mock.Mock()
    # One [rank_val, rank_train] pair per main() invocation below.
    pool_mock.return_value.map.side_effect = [
        [0.3, 0.2],
        [0.2, 0.1],
        [0.4, 0.3]
    ]

    monkeypatch.setattr('train.post_model_to_elasticsearch', post_mock)
    monkeypatch.setattr('train.os.system', os_system_mock)
    monkeypatch.setattr('train.get_partiated_validator', partial_mock)
    monkeypatch.setattr('train.Pool', pool_mock)
    monkeypatch.setattr('train.datetime', datetime_mock)

    args = namedtuple(
        'args',
        [
            'train_file_path',
            'validation_files_path',
            'validation_train_files_path',
            'es_host',
            'model_name',
            'es_batch',  # bug fix: missing comma used to fuse this with the
            'destination',  # next entry into a single 'es_batchdestination' field
            'ranker',
            'index'
        ]
    )
    args.train_file_path = '/test/train_dataset.txt'
    args.validation_files_path = '/validation/regular'
    args.validation_train_files_path = '/validation/train'
    args.es_host = 'es_host_test'
    args.model_name = 'unittest'
    args.es_batch = 2
    args.destination = str(tmp_folder)
    args.ranker = 'lambdamart'
    args.index = 'index_test'

    X = ['--var1 val1 --var2 val2']

    main(args, X)
    expected_call = (
        'java -jar ranklib/RankLib-2.14.jar -ranker 6 -train '
        f'/test/train_dataset.txt -norm sum -save {str(tmp_folder)}/model.txt '
        '-var1 val1 -var2 val2 -metric2t ERR'
    )
    os_system_mock.assert_any_call(expected_call)
    post_mock.assert_any_call('es_host_test', 'unittest',
                              f'{args.destination}/model.txt')
    partial_mock.assert_any_call('es_host_test', 'index_test', 'unittest', 2)
    data = open(f'{args.destination}/results.txt').read()
    assert data == 'todays date,--var1 val1 --var2 val2,rank_train=0.2,rank_val=0.3\n'
    data = open(f'{args.destination}/best_rank.txt').read()
    assert data == '0.3'
    data = open(f'{args.destination}/best_model.txt').read()
    assert data == 'model definition: 1'

    # Test if new best model gets replaced
    os_system_mock.return_value = write_file(str(tmp_folder), 2)
    main(args, X)
    data = open(f'{args.destination}/results.txt').read()
    assert data == (
        'todays date,--var1 val1 --var2 val2,rank_train=0.2,rank_val=0.3\n'
        'todays date,--var1 val1 --var2 val2,rank_train=0.1,rank_val=0.2\n'
    )
    data = open(f'{args.destination}/best_rank.txt').read()
    assert data == '0.2'
    data = open(f'{args.destination}/best_model.txt').read()
    assert data == 'model definition: 2'

    # Test if new worse model is ignored
    os_system_mock.return_value = write_file(str(tmp_folder), 3)
    main(args, X)
    data = open(f'{args.destination}/results.txt').read()
    assert data == (
        'todays date,--var1 val1 --var2 val2,rank_train=0.2,rank_val=0.3\n'
        'todays date,--var1 val1 --var2 val2,rank_train=0.1,rank_val=0.2\n'
        'todays date,--var1 val1 --var2 val2,rank_train=0.3,rank_val=0.4\n'
    )
    data = open(f'{args.destination}/best_rank.txt').read()
    assert data == '0.2'
    data = open(f'{args.destination}/best_model.txt').read()
    assert data == 'model definition: 2'
"properties": { 42 | "CTR": { 43 | "type": "float" 44 | } 45 | } 46 | }, 47 | "direct": { 48 | "type": "object", 49 | "properties": { 50 | "CTR": { 51 | "type": "float" 52 | } 53 | } 54 | }, 55 | "referral": { 56 | "type": "object", 57 | "properties": { 58 | "CTR": { 59 | "type": "float" 60 | } 61 | } 62 | }, 63 | "paid_search": { 64 | "type": "object", 65 | "properties": { 66 | "CTR": { 67 | "type": "float" 68 | } 69 | } 70 | }, 71 | "display": { 72 | "type": "object", 73 | "properties": { 74 | "CTR": { 75 | "type": "float" 76 | } 77 | } 78 | }, 79 | "affiliates": { 80 | "type": "object", 81 | "properties": { 82 | "CTR": { 83 | "type": "float" 84 | } 85 | } 86 | }, 87 | "social": { 88 | "type": "object", 89 | "properties": { 90 | "CTR": { 91 | "type": "float" 92 | } 93 | } 94 | } 95 | } 96 | } 97 | } 98 | } 99 | } 100 | } 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /kubeflow/build/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | 3 | # Transfer secret keys from GCP to Cloud Build environemnt 4 | - name: gcr.io/cloud-builders/gcloud 5 | entrypoint: 'bash' 6 | args: 7 | - '-c' 8 | - | 9 | gcloud secrets versions access latest --secret=pysearchml-git-secret > /root/.ssh/id_github 10 | gcloud secrets versions access latest --secret=pysearchml-service-account > key.json 11 | volumes: 12 | - name: 'ssh' 13 | path: /root/.ssh 14 | id: 'Get Secret Keys' 15 | 16 | # Update known_hosts 17 | - name: 'gcr.io/cloud-builders/git' 18 | entrypoint: 'bash' 19 | args: 20 | - '-c' 21 | - | 22 | chmod 600 /root/.ssh/id_github 23 | ssh-keyscan -t rsa github.com >> /root/.ssh/known_hosts 24 | cat </root/.ssh/config 25 | Hostname github.com 26 | IdentityFile /root/.ssh/id_github 27 | EOF 28 | volumes: 29 | - name: 'ssh' 30 | path: /root/.ssh 31 | id: 'Prepare Git known_hosts' 32 | 33 | # Clones Repository And Copies Service Account Key 34 | - name: 
'gcr.io/cloud-builders/git' 35 | entrypoint: 'bash' 36 | args: 37 | - '-c' 38 | - | 39 | git clone git@github.com:WillianFuks/pySearchML.git 40 | cp key.json pySearchML/ 41 | volumes: 42 | - name: 'ssh' 43 | path: /root/.ssh 44 | id: 'Clone Git Repo' 45 | 46 | # Build KFP Cluster 47 | - name: 'gcr.io/cloud-builders/gcloud' 48 | entrypoint: 'bash' 49 | args: 50 | - '-c' 51 | - | 52 | ./bin/create_k8s.sh 53 | dir: 'pySearchML' 54 | env: 55 | - 'PROJECT_ID=$PROJECT_ID' 56 | - 'CLUSTER_NAME=${_CLUSTER_NAME}' 57 | - 'COMPUTE_ZONE=${_COMPUTE_ZONE}' 58 | id: 'Build KFP Cluster' 59 | volumes: 60 | - name: 'ssh' 61 | path: /root/.ssh 62 | 63 | # Build Docker Images 64 | - name: 'gcr.io/cloud-builders/docker' 65 | entrypoint: 'bash' 66 | args: 67 | - '-c' 68 | - | 69 | docker build -t gcr.io/$PROJECT_ID/prepare_env -f kubeflow/components/prepare_env/Dockerfile . 70 | docker build -t gcr.io/$PROJECT_ID/pipelines -f kubeflow/pipelines/Dockerfile . 71 | docker build -t gcr.io/$PROJECT_ID/data_validation -f kubeflow/components/data/validation/Dockerfile . 72 | docker build -t gcr.io/$PROJECT_ID/data_train -f kubeflow/components/data/train/Dockerfile . 73 | docker build -t gcr.io/$PROJECT_ID/model -f kubeflow/components/model/Dockerfile . --build-arg PROJECT_ID=$PROJECT_ID 74 | docker push gcr.io/$PROJECT_ID/prepare_env 75 | docker push gcr.io/$PROJECT_ID/pipelines 76 | docker push gcr.io/$PROJECT_ID/data_validation 77 | docker push gcr.io/$PROJECT_ID/data_train 78 | docker push gcr.io/$PROJECT_ID/model 79 | dir: 'pySearchML' 80 | id: 'Build Docker Images' 81 | waitFor: ['Clone Git Repo'] 82 | 83 | # Unit Test Data Train 84 | - name: 'gcr.io/$PROJECT_ID/data_train' 85 | args: 86 | - export PYTHONPATH=. 87 | - pytest 88 | id: 'Unittest Data Train' 89 | waitFor: ['Clone Git Repo', 'Build Docker Images'] 90 | 91 | # Unit Test Train Model 92 | - name: 'gcr.io/$PROJECT_ID/model' 93 | args: 94 | - export PYTHONPATH=. 
import mock
import json
import gzip
import os
import subprocess
import shutil
from collections import namedtuple

from run import main


def test_main(monkeypatch, es_log_features, tmpdir_factory):
    """
    End-to-end test of ``run.main``: mocks the data download, the DBN click
    model and the Elasticsearch client, then asserts the generated judgments
    file, the msearch bodies sent to ES and the final RankLib train dataset.
    """
    shutil.rmtree('/tmp/pysearchml/unittest', ignore_errors=True)
    clickmodel_path = '/tmp/pysearchml/unittest/model'
    os.makedirs(clickmodel_path)
    tmp_dir = tmpdir_factory.mktemp('unittest')

    args = namedtuple(
        'args',
        [
            'train_init_date',
            'train_end_date',
            'bucket',
            'es_host',
            'es_batch',
            'destination',
            'model_name',
            'index'
        ]
    )
    # FIX: the original assigned `args.init_day_train` / `args.end_day_train`,
    # names that do not exist in the declared field list above; assign the
    # declared `train_init_date` / `train_end_date` instead.
    args.train_init_date = '20200101'
    args.train_end_date = '20200101'
    args.bucket = 'gcp_bucket'
    args.index = 'index_test'
    args.es_host = 'es_host_test'
    args.model_name = 'unittest'
    args.es_batch = 2
    args.destination = str(tmp_dir)

    download_mock = mock.Mock()

    class MockModel:
        # Stand-in for the DBN click model: fit is a no-op and
        # export_judgments just copies a pre-built fixture file.
        def fit(self, *args, **kwargs):
            return self

        def export_judgments(self, model_path: str):
            subprocess.call(
                [f'cp -r tests/fixtures/model.gz {model_path}'],
                stdout=subprocess.PIPE,
                shell=True
            )

    dbn_mock = mock.Mock()
    dbn_mock.DBNModel.return_value = MockModel()
    es_client_mock = mock.Mock()

    monkeypatch.setattr('run.download_data', download_mock)
    monkeypatch.setattr('run.DBN', dbn_mock)
    monkeypatch.setattr('run.Elasticsearch', es_client_mock)
    es_client_mock.msearch.side_effect = es_log_features

    main(args, es_client_mock)
    download_mock.assert_called_with(args)
    data_reader = gzip.GzipFile('/tmp/pysearchml/unittest/judgments/judgments.gz', 'rb')
    data = json.loads(data_reader.readline())
    expected = {
        "search_keys": {"search_term": "keyword0", "var1": "val1"},
        "judgment_keys": [
            {"doc": "doc0", "judgment": 0},
            {"doc": "doc1", "judgment": 4},
            {"doc": "doc2", "judgment": 2}
        ]
    }
    assert expected == data

    data = json.loads(data_reader.readline())
    expected = {
        "search_keys": {"search_term": "keyword1", "var1": "val1"},
        "judgment_keys": [
            {"doc": "doc1", "judgment": 0},
            {"doc": "doc2", "judgment": 4}
        ]
    }
    assert expected == data

    data = json.loads(data_reader.readline())
    expected = {
        "search_keys": {"search_term": "keyword2", "var1": "val1"},
        "judgment_keys": [
            {"doc": "doc3", "judgment": 0},
            {"doc": "doc4", "judgment": 4}
        ]
    }
    assert expected == data

    body1 = (
        '{"index": "index_test"}\n{"query": {"bool": {"filter": [{"terms": {"_id": '
        '["doc0", "doc1", "doc2"]}}], "should": [{"sltr": {"_name": "logged_featureset"'
        ', "featureset": "unittest", "params": {"search_term": "keyword0", "var1": '
        '"val1"}}}]}}, "_source": ["_id"], "ext": {"ltr_log": {"log_specs": {"name": '
        '"main", "named_query": "logged_featureset"}}}}\n{"index": "index_test"}\n{"'
        'query": {"bool": {"filter": [{"terms": {"_id": ["doc1", "doc2"]}}], "should": '
        '[{"sltr": {"_name": "logged_featureset", "featureset": "unittest", "params": '
        '{"search_term": "keyword1", "var1": "val1"}}}]}}, "_source": ["_id"], "ext": '
        '{"ltr_log": {"log_specs": {"name": "main", "named_query": "logged_featureset"}'
        '}}}'
    )
    body2 = (
        '{"index": "index_test"}\n{"query": {"bool": {"filter": [{"terms": {"_id": ["'
        'doc3", "doc4"]}}], "should": [{"sltr": {"_name": "logged_featureset", "'
        'featureset": "unittest", "params": {"search_term": "keyword2", "var1": "val1"'
        '}}}]}}, "_source": ["_id"], "ext": {"ltr_log": {"log_specs": {"name": "main", '
        '"named_query": "logged_featureset"}}}}'
    )
    es_client_mock.msearch.assert_any_call(body=body1, request_timeout=60)
    es_client_mock.msearch.assert_any_call(body=body2, request_timeout=60)

    rank_data = open(f'{str(tmp_dir)}/train_dataset.txt').read()

    expected = (
        '0\tqid:0\t1:0.01\t2:0.02\n4\tqid:0\t1:0.03\t2:0.04\n2\tqid:0\t1:0.05\t'
        '2:0.06\n0\tqid:1\t1:0.03\t2:0.04\n4\tqid:1\t1:0.05\t2:0\n'
    )

    assert rank_data == expected
    shutil.rmtree('/tmp/pysearchml/unittest', ignore_errors=True)
# Copyright 2019 kubeflow.org.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
This script uses as reference the following code:

https://github.com/kubeflow/pipelines/blob/848fbe0bceb786c8db72e88b3bc986d42ac768b9/components/kubeflow/common/launch_crd.py
"""

import datetime
import time

from kubernetes import client as k8s_client
from kubernetes.client import rest


class K8sCR(object):
    """Thin helper around the Kubernetes CustomObjectsApi for one CRD type
    (identified by its API group, plural name and version)."""

    def __init__(self, group, plural, version, client):
        self.group = group
        self.plural = plural
        self.version = version
        self.client = k8s_client.CustomObjectsApi(client)

    def wait_for_condition(self,
                           namespace,
                           name,
                           expected_conditions=[],
                           timeout=datetime.timedelta(days=365),
                           polling_interval=datetime.timedelta(seconds=30),
                           status_callback=None):
        """Waits until any of the specified conditions occur.
        Args:
          namespace: namespace for the CR.
          name: Name of the CR.
          expected_conditions: A list of conditions. Function waits until any of the
            supplied conditions is reached.
          timeout: How long to wait for the CR.
          polling_interval: How often to poll for the status of the CR.
          status_callback: (Optional): Callable. If supplied this callable is
            invoked after we poll the CR. Callable takes a single argument which
            is the CR.
        """
        end_time = datetime.datetime.now() + timeout
        while True:
            try:
                results = self.client.get_namespaced_custom_object(
                    self.group, self.version, namespace, self.plural, name)
            except Exception as e:
                print("There was a problem waiting for %s/%s %s in namespace %s; "
                      "Exception: %s" % (self.group, self.plural, name, namespace, e))
                raise

            if results:
                if status_callback:
                    status_callback(results)
                expected, condition = self.is_expected_conditions(
                    results, expected_conditions)
                if expected:
                    print("%s/%s %s in namespace %s has reached the expected condition: "
                          "%s." % (self.group, self.plural, name, namespace, condition))
                    return results
                else:
                    if condition:
                        print("Current condition of %s/%s %s in namespace %s is %s." % (
                            self.group, self.plural, name, namespace, condition))

            # Give up once one more polling round would overrun the deadline.
            if datetime.datetime.now() + polling_interval > end_time:
                raise Exception(
                    "Timeout waiting for {0}/{1} {2} in namespace {3} to enter one of "
                    "the conditions {4}.".format(
                        self.group, self.plural, name, namespace, expected_conditions))

            time.sleep(polling_interval.seconds)

    def is_expected_conditions(self, cr_object, expected_conditions):
        # Base implementation never matches; subclasses inspect the CR status.
        return False, ""

    def create(self, spec):
        """Create a CR.
        Args:
          spec: The spec for the CR.
        Raises:
          rest.ApiException: if the API server rejects the creation.
        """
        try:
            # Create a Resource
            namespace = spec["metadata"].get("namespace", "default")
            print("Creating %s/%s %s in namespace %s." % (
                self.group, self.plural, spec["metadata"]["name"], namespace))
            api_response = self.client.create_namespaced_custom_object(
                self.group, self.version, namespace, self.plural, spec)
            print("Created %s/%s %s in namespace %s." % (
                self.group, self.plural, spec["metadata"]["name"], namespace))
            return api_response
        except rest.ApiException as e:
            # FIX: the original printed the error and silently returned None,
            # so callers could not tell creation failed; re-raise instead.
            print("Exception when creating %s/%s: %s" % (self.group, self.plural, e))
            raise

    def delete(self, name, namespace):
        """Delete a CR.
        Args:
          name: Name of the CR.
          namespace: Namespace the CR lives in.
        Raises:
          rest.ApiException: if the API server rejects the deletion.
        """
        try:
            body = {
                # Set garbage collection so that CR won't be deleted until all
                # owned references are deleted.
                "propagationPolicy": "Foreground",
            }
            print("Deleting %s/%s %s in namespace %s." % (
                self.group, self.plural, name, namespace))
            api_response = self.client.delete_namespaced_custom_object(
                group=self.group,
                version=self.version,
                namespace=namespace,
                plural=self.plural,
                name=name,
                body=body)
            print("Deleted %s/%s %s in namespace %s." % (
                self.group, self.plural, name, namespace))
            return api_response
        except rest.ApiException as e:
            # FIX: the original fell through to a print referencing the
            # undefined names `action` and `ex`, raising NameError instead of
            # reporting the API error; log the real exception and re-raise.
            print("Exception when deleting %s/%s: %s" % (self.group, self.plural, e))
            raise
@dsl.pipeline(
    name='Train Lambda Mart Pipeline',
    description=('Responsible for generating all datasets and optimization process for'
                 ' the chosen Ranker algorithm.')
)
def build_pipeline(
    bucket='pysearchml',
    es_host='elasticsearch.elastic-system.svc.cluster.local:9200',
    force_restart=False,
    train_init_date='20160801',
    train_end_date='20160801',
    validation_init_date='20160802',
    validation_end_date='20160802',
    test_init_date='20160803',
    test_end_date='20160803',
    model_name='lambdamart0',
    ranker='lambdamart',
    index='pysearchml'
):
    """
    Kubeflow pipeline: prepares the Elasticsearch environment, builds the
    training dataset and the regular/train/test validation datasets, runs the
    Katib optimization process, posts the best RankLib model to Elasticsearch
    and finally evaluates it against the test dataset. All steps share the
    `pysearchml-nfs` volume mounted at /data.
    """
    pvc = dsl.PipelineVolume(pvc='pysearchml-nfs')

    prepare_op = dsl.ContainerOp(
        name='prepare env',
        image=f'gcr.io/{PROJECT_ID}/prepare_env',
        arguments=[
            f'--force_restart={force_restart}',
            f'--es_host={es_host}',
            f'--bucket={bucket}',
            f'--model_name={model_name}'
        ],
        pvolumes={'/data': pvc}
    )

    val_reg_dataset_op = dsl.ContainerOp(
        name='validation regular dataset',
        image=f'gcr.io/{PROJECT_ID}/data_validation',
        arguments=[
            f'--bucket={bucket}/validation/regular',
            f'--validation_init_date={validation_init_date}',
            f'--validation_end_date={validation_end_date}',
            f'--destination=/data/pysearchml/{model_name}/validation_regular'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Regular Validation Dataset').after(prepare_op)

    val_train_dataset_op = dsl.ContainerOp(
        name='validation train dataset',
        image=f'gcr.io/{PROJECT_ID}/data_validation',
        arguments=[
            f'--bucket={bucket}/validation/train',
            f'--validation_init_date={train_init_date}',
            f'--validation_end_date={train_end_date}',
            f'--destination=/data/pysearchml/{model_name}/validation_train'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Train Validation Dataset').after(prepare_op)

    val_test_dataset_op = dsl.ContainerOp(
        name='validation test dataset',
        image=f'gcr.io/{PROJECT_ID}/data_validation',
        arguments=[
            f'--bucket={bucket}/validation/test',
            f'--validation_init_date={test_init_date}',
            f'--validation_end_date={test_end_date}',
            f'--destination=/data/pysearchml/{model_name}/validation_test'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Test Validation Dataset').after(prepare_op)

    train_dataset_op = dsl.ContainerOp(
        name='train dataset',
        image=f'gcr.io/{PROJECT_ID}/data_train',
        command=['python', '/train/run.py'],
        arguments=[
            f'--bucket={bucket}',
            f'--train_init_date={train_init_date}',
            f'--train_end_date={train_end_date}',
            f'--es_host={es_host}',
            f'--model_name={model_name}',
            f'--index={index}',
            f'--destination=/data/pysearchml/{model_name}/train'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Training Dataset').after(prepare_op)

    katib_op = dsl.ContainerOp(
        name='pySearchML Bayesian Optimization Model',
        image=f'gcr.io/{PROJECT_ID}/model',
        command=['python', '/model/launch_katib.py'],
        arguments=[
            f'--es_host={es_host}',
            f'--model_name={model_name}',
            f'--ranker={ranker}',
            '--name=pysearchml',
            f'--train_file_path=/data/pysearchml/{model_name}/train/train_dataset.txt',
            f'--validation_files_path=/data/pysearchml/{model_name}/validation_regular',
            # FIX: this was a plain (non-f) string, so the literal text
            # '{model_name}' was passed instead of the interpolated model name,
            # pointing Katib at a nonexistent validation path.
            ('--validation_train_files_path='
             f'/data/pysearchml/{model_name}/validation_train'),
            f'--destination=/data/pysearchml/{model_name}/'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Katib Optimization Process').after(
        val_reg_dataset_op, val_train_dataset_op, val_test_dataset_op, train_dataset_op
    )

    post_model_op = dsl.ContainerOp(
        name='Post Best RankLib Model to ES',
        image=f'gcr.io/{PROJECT_ID}/model',
        command=['python', '/model/post_model.py'],
        arguments=[
            f'--es_host={es_host}',
            f'--model_name={model_name}',
            f'--destination=/data/pysearchml/{model_name}/best_model.txt'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Post RankLib Model to ES').after(katib_op)

    _ = dsl.ContainerOp(
        name='Test Model',
        image=f'gcr.io/{PROJECT_ID}/model',
        command=['python', '/model/test.py'],
        arguments=[
            f'--files_path=/data/pysearchml/{model_name}/validation_test',
            f'--index={index}',
            f'--es_host={es_host}',
            f'--model_name={model_name}',
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Run Test Step').after(post_model_op)
import os
import argparse
import json
import uuid
import pathlib
from typing import Dict, List, Any

import launch_crd
from kubernetes import client as k8s_client
from kubernetes import config


# https://github.com/kubeflow/pipelines/tree/de2e0f2ec0edc1afd16ad79d8cd9719d1b01cb1f/components/kubeflow/katib-launcher


PATH = pathlib.Path(__file__).parent


def get_ranker_parameters(ranker: str) -> List[Dict[str, Any]]:
    """Return the Katib hyperparameter search-space definition for `ranker`,
    or None when the ranker is unknown."""
    return {
        'lambdamart': [
            {
                "name": "--tree",
                "parameterType": "int",
                "feasibleSpace": {
                    "min": "1",
                    "max": "500"
                }
            },
            {
                "name": "--leaf",
                "parameterType": "int",
                "feasibleSpace": {
                    "min": "2",
                    "max": "40"
                }
            },
            {
                "name": "--shrinkage",
                "parameterType": "double",
                "feasibleSpace": {
                    "min": "0.01",
                    "max": "0.2"
                }
            },
            {
                "name": "--tc",
                "parameterType": "int",
                "feasibleSpace": {
                    "min": "-1",
                    "max": "300"
                }
            },
            {
                "name": "--mls",
                "parameterType": "int",
                "feasibleSpace": {
                    "min": "1",
                    "max": "10"
                }
            }
        ]
    }.get(ranker)


class Experiment(launch_crd.K8sCR):
    """Katib Experiment custom resource (kubeflow.org/v1alpha3)."""

    def __init__(self, client=None):
        super().__init__('kubeflow.org', 'experiments', 'v1alpha3', client)

    def is_expected_conditions(self, instance, expected_conditions):
        # Katib appends conditions over time; only the latest one decides the
        # experiment's current state.
        conditions = instance.get('status', {}).get('conditions')
        if not conditions:
            return False, ''
        if conditions[-1]['type'] in expected_conditions:
            return True, conditions[-1]['type']
        else:
            return False, conditions[-1]['type']


def main(argv=None):
    """
    Builds a Katib Experiment from experiment.json (interpolating the CLI
    arguments into the trial template), launches it, waits for it to finish,
    prints the best trial found and finally deletes the experiment.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--name',
        dest='name',
        type=str,
        help='Experiment name.'
    )
    parser.add_argument(
        '--destination',
        dest='destination',
        type=str,
        help='The file which stores the best trial of the experiment.'
    )
    parser.add_argument(
        '--train_file_path',
        dest='train_file_path',
        type=str,
        help='Location where training data is located.'
    )
    parser.add_argument(
        '--validation_files_path',
        dest='validation_files_path',
        type=str,
        help='Location where validation data is located.'
    )
    parser.add_argument(
        '--validation_train_files_path',
        dest='validation_train_files_path',
        type=str,
        help='Location where validation of training data is located.'
    )
    parser.add_argument(
        '--es_host',
        dest='es_host',
        type=str,
        help='Name host of Elasticsearch.'
    )
    parser.add_argument(
        '--model_name',
        dest='model_name',
        type=str,
        help='Name of feature set saved in Elasticsearch.'
    )
    parser.add_argument(
        '--ranker',
        dest='ranker',
        type=str,
        # FIX: typo "algorith" in user-facing help text.
        help='RankLib algorithm to use.'
    )

    args = parser.parse_args()

    # Remove leftovers from previous runs so the trials start fresh.
    files = [f'{args.destination}/best_rank.txt', f'{args.destination}/best_model.txt']
    for file_ in files:
        if os.path.isfile(file_):
            os.remove(file_)

    exp_json_file = PATH / 'experiment.json'
    exp_def = json.loads(open(str(exp_json_file)).read())

    # FIX: str.replace raises a cryptic TypeError when PROJECT_ID is unset;
    # fail early with a clear message instead.
    project_id = os.getenv('PROJECT_ID')
    if not project_id:
        raise RuntimeError('The PROJECT_ID environment variable must be set.')

    raw_template = json.dumps(
        exp_def['spec']['trialTemplate']['goTemplate']['rawTemplate']
    )
    raw_template = raw_template\
        .replace('{PROJECT_ID}', project_id)\
        .replace('{train_file_path}', args.train_file_path)\
        .replace('{validation_files_path}', args.validation_files_path)\
        .replace('{validation_train_files_path}', args.validation_train_files_path)\
        .replace('{es_host}', args.es_host)\
        .replace('{destination}', args.destination)\
        .replace('{model_name}', args.model_name)\
        .replace('{ranker}', args.ranker)

    exp_def['spec']['trialTemplate']['goTemplate']['rawTemplate'] = raw_template

    config.load_incluster_config()
    api_client = k8s_client.ApiClient()
    experiment = Experiment(client=api_client)
    # Unique per run; truncated because K8s object names are length-limited.
    exp_name = f'{args.name}-{uuid.uuid4().hex}'[:33]

    exp_def['spec']['parameters'] = get_ranker_parameters(args.ranker)
    exp_def['metadata']['name'] = exp_name
    print('this is exp_def: ', json.dumps(exp_def))

    create_response = experiment.create(exp_def)
    print('create response: ', create_response)

    expected_conditions = ["Succeeded", "Failed"]
    current_exp = experiment.wait_for_condition('kubeflow', exp_name,
                                                expected_conditions)
    print('current_exp: ', json.dumps(current_exp))

    expected, _ = experiment.is_expected_conditions(current_exp, ["Succeeded"])

    if expected:
        best_rank = current_exp["status"]["currentOptimalTrial"]["observation"][
            'metrics'][0]['value']
        print('Best Rank Found: ', best_rank)
        params = current_exp["status"]["currentOptimalTrial"]["parameterAssignments"]
        print(json.dumps(params))
        os.makedirs(os.path.dirname(args.destination), exist_ok=True)
        if os.path.isfile(args.destination):
            os.remove(args.destination)

    experiment.delete(exp_name, 'kubeflow')


if __name__ == "__main__":
    main()
54 | """ 55 | host = f'http://{es_host}' 56 | feature_store_url = urljoin(host, '_ltr') 57 | requests.delete(feature_store_url) 58 | requests.put(feature_store_url) 59 | 60 | 61 | def create_feature_set(es_host: str, model_name: str) -> None: 62 | """ 63 | Defines each feature that should be used for the RankLib model. It's expected the 64 | features will be available at a specific path when this script runs (this is 65 | accomplished by running previous steps on Kubeflow that prepares this data). 66 | 67 | Args 68 | ---- 69 | es_host: str 70 | Hostname of Elasticsearch. 71 | model_name: str 72 | Name that specificies current experiment in Kubeflow. 73 | """ 74 | features_path = PATH / f'{model_name}' / 'features' 75 | feature_set = { 76 | 'featureset': { 77 | 'name': model_name, 78 | 'features': [process_feature_file(str(filename)) for filename in 79 | features_path.glob('*')] 80 | } 81 | } 82 | post_feature_set(feature_set, model_name, es_host) 83 | 84 | 85 | def post_feature_set( 86 | feature_set: Dict[str, Any], 87 | model_name: str, 88 | es_host: str 89 | ) -> None: 90 | """ 91 | POST feature definition to Elasticsearch under the name of `model_name`. 92 | 93 | Args 94 | ---- 95 | feature_set: Dict[str, Any] 96 | Definition of features to be stored on Elasticsearch. 97 | model_name: str 98 | Defined for each Kubeflow experiment. 99 | es_host: str 100 | Hostname where Elasticsearch is located. 
def post_feature_set(
    feature_set: Dict[str, Any],
    model_name: str,
    es_host: str
) -> None:
    """
    POST feature definition to Elasticsearch under the name of `model_name`.

    Args
    ----
    feature_set: Dict[str, Any]
        Definition of features to be stored on Elasticsearch.
    model_name: str
        Defined for each Kubeflow experiment.
    es_host: str
        Hostname where Elasticsearch is located.

    Raises
    ------
    Exception
        If Elasticsearch answers with a non-2xx status code.
    """
    host = f'http://{es_host}'
    url = f'_ltr/_featureset/{model_name}'
    url = urljoin(host, url)
    header = {'Content-Type': 'application/json'}
    resp = requests.post(url, data=json.dumps(feature_set), headers=header)
    if not resp.ok:
        raise Exception(resp.content)


def main(args: NamedTuple):
    """
    Prepares the whole environment: rebuilds the Elasticsearch index from GA data
    staged through BigQuery/GCS and (re)creates the RankLib feature store and
    feature set.

    Args
    ----
    args: NamedTuple
        force_restart: bool
            If True, drop and rebuild the index even if it already exists.
        es_host: str
            Hostname where to reach Elasticsearch.
        bucket: str
            GCS bucket used as staging area for the BigQuery extract.
        model_name: str
            Name that identifies the current Kubeflow experiment.
    """
    # Local imports keep these heavier dependencies scoped to the entry point.
    import json
    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    es = Elasticsearch(hosts=[args.es_host])
    es_mapping_path = PATH / f'{args.model_name}' / 'es_mapping.json'
    schema = json.loads(open(str(es_mapping_path)).read())
    # The mapping file also carries the index name; the remaining `schema` is the
    # body handed to the create-index call.
    index = schema.pop('index')

    storage_client = storage.Client()
    bq_client = bigquery.Client()

    def read_file(bucket, storage_client=storage_client, bq_client=bq_client):
        # Generator: materializes GA data in BigQuery, extracts it to GCS, then
        # streams each downloaded row as an Elasticsearch bulk action.
        ds_ref = bq_client.dataset('pysearchml')
        bq_client.create_dataset(ds_ref, exists_ok=True)

        table_id = 'es_docs'
        table_ref = ds_ref.table(table_id)

        bucket_obj = storage_client.bucket(bucket)
        if not bucket_obj.exists():
            bucket_obj.create()

        # Query GA data
        query_path = PATH / f'{args.model_name}' / 'ga_data.sql'
        query = open(str(query_path)).read()
        print(query)
        job_config = bigquery.QueryJobConfig()
        job_config.destination = f'{bq_client.project}.pysearchml.{table_id}'
        # Billing safety cap: at most 10 GB scanned per query.
        job_config.maximum_bytes_billed = 10 * (1024 ** 3)
        job_config.write_disposition = 'WRITE_TRUNCATE'
        job = bq_client.query(query, job_config=job_config)
        job.result()

        # export BigQuery table to GCS
        destination_uri = f'gs://{bucket}/es_docs.gz'

        extract_config = bigquery.ExtractJobConfig()
        extract_config.compression = 'GZIP'
        extract_config.destination_format = 'NEWLINE_DELIMITED_JSON'
        job = bq_client.extract_table(table_ref, destination_uri,
                                      job_config=extract_config)
        job.result()

        # Download data into an in-memory buffer (no temp file on disk).
        blob = bucket_obj.blob('es_docs.gz')
        file_obj = gzip.io.BytesIO()
        blob.download_to_file(file_obj)

        file_obj.seek(0)

        c = 0
        for row in gzip.GzipFile(fileobj=file_obj, mode='rb'):
            row = json.loads(row)
            yield {
                '_index': index,
                '_source': row,
                '_id': row['sku']
            }
            c += 1
            # Progress heartbeat every 1000 documents.
            if not c % 1000:
                print(c)

        # Delete BQ Table
        bq_client.delete_table(table_ref)

    if args.force_restart or not es.indices.exists(index):
        es.indices.delete(index, ignore=[400, 404])
        print('deleted index')
        es.indices.create(index, **schema)
        print('schema created')
        bulk(es, read_file(args.bucket), request_timeout=30)
        create_feature_store(args.es_host)
        create_feature_set(args.es_host, args.model_name)
    print('Finished preparing environment.')
def main(args: NamedTuple, X: Sequence[str]) -> None:
    """
    X contains a list of input arguments, such as `[--var1=val1]`. These args are sent
    to RankLib to setup a training run specification.

    Args
    ----
    args: NamedTuple
        ranker: str
            Name of which ranker to use available in RankLib.
        train_file_path: str
            Path where RankLib training file is located.
        validation_files_path: str
            Path where regular validation files are located.
        validation_train_files_path: str
            Path where validation files of training period are located.
        es_host: str
            Hostname where Elasticsearch is located.
        model_name: str
            RankLib featureset Model Name as saved in Elasticsearch.
        destination: str
            File name where to save results.
    X: Sequence[str]
        Values for input of RankLib parameters.

    Raises
    ------
    ValueError
        If `args.ranker` is not a ranker known to RankLib.
    """
    ranker = get_ranker_index(args.ranker)
    if not ranker:
        raise ValueError(f'Invalid value for ranker: "{args.ranker}"')

    # Katib-style flags (`--key=value`) are translated into RankLib flags
    # (`-key value`) before being appended to the java command.
    cmd = ('java -jar ranklib/RankLib-2.14.jar -ranker '
           f'{ranker} -train {args.train_file_path} -norm sum -save '
           f'{args.destination}/model.txt '
           f'{(" ".join(X)).replace("--", "-").replace("=", " ")} -metric2t ERR')

    # {args.destination}/model.txt contains the specification of the
    # final trained model
    os.system(cmd)
    post_model_to_elasticsearch(args.es_host, args.model_name,
                                f'{args.destination}/model.txt')
    partiated_validator = get_partiated_validator(args.es_host, args.index,
                                                  args.model_name, args.es_batch)
    # Validate the regular and training periods in parallel. The context manager
    # guarantees the worker processes are terminated — the original leaked the
    # Pool (never closed/joined).
    with Pool() as pool:
        rank_val, rank_train = pool.map(
            partiated_validator,
            [args.validation_files_path, args.validation_train_files_path]
        )

    # Katib tracks down the StdOut searching for the string 'Validation-rank=value'
    print(f'Validation-rank={rank_val}')
    write_results(X, rank_train, rank_val, args.destination, args.model_name)


def get_ranker_index(ranker: str) -> str:
    """Map a human-readable ranker name to RankLib's `-ranker` index.

    Returns None for unknown names (caller treats that as an error).
    """
    return {
        'mart': '0',
        'ranknet': '1',
        'rankboost': '2',
        'adarank': '3',
        'coordinate ascent': '4',
        'lambdamart': '6',
        'listnet': '7',
        'random forest': '8'
    }.get(ranker)
def write_results(X: Sequence[str], rank_train: float, rank_val: float,
                  destination: str, model_name: str):
    """
    Write results in persistent disk. Uses the folder of `destination` as main
    reference.

    Args
    ----
    X: Sequence[str]
        Input arguments as suggested by Katib. It sets the hyperparameters of the
        models.
    rank_train: float
        Rank value of training data.
    rank_val: float
        Rank value for validation data.
    destination: str
        File path where to save results.
    model_name: str
        Name that identifies model being tested.
    """
    dir_ = pathlib.Path(destination)
    os.makedirs(str(dir_), exist_ok=True)
    # Append this run's hyperparameters and ranks to the history log.
    with open(str(dir_ / 'results.txt'), 'a') as f:
        today_str = datetime.today().strftime('%Y%m%d %H:%M:%S')
        f.write(
            f'{today_str},{" ".join(X)},rank_train={rank_train},'
            f'rank_val={rank_val}{os.linesep}'
        )
    best_model_file = dir_ / 'best_model.txt'
    best_rank_file = dir_ / 'best_rank.txt'
    # Lower rank is better. Snapshot the current model as "best" when there is no
    # previous best or the current run improves on it. (The original duplicated
    # this update logic in both branches and leaked the rank-file handle.)
    if os.path.isfile(str(best_rank_file)):
        with open(str(best_rank_file)) as f:
            best_rank = float(f.readline())
        is_best = rank_val < best_rank
    else:
        is_best = True
    if is_best:
        with open(str(best_rank_file), 'w') as f:
            f.write(str(rank_val))
        copyfile(f'{destination}/model.txt', str(best_model_file))


def get_partiated_validator(
    es_host: str,
    index: str,
    model_name: str,
    es_batch: int = 1000
):
    """Bind the static validation arguments, leaving only `files_path` free.

    The returned partial is pickleable, which lets `multiprocessing.Pool.map`
    run both validation periods in parallel.
    """
    return partial(validate_model, es_host=es_host, index=index, model_name=model_name,
                   es_batch=es_batch)


def post_model_to_elasticsearch(es_host, model_name, model_path) -> None:
    """
    Exports trained model to Elasticsearch.

    Deletes any previous model of the same name, then registers the RankLib
    definition under the featureset `model_name`.

    Raises
    ------
    Exception
        If the create-model request answers with a non-2xx status code.
    """
    model_definition = open(model_path).read()
    model_request = {
        'model': {
            'name': model_name,
            'model': {
                'type': 'model/ranklib',
                'definition': model_definition
            }
        }
    }
    # Best-effort delete of a stale model; response intentionally ignored.
    path = f'http://{es_host}/_ltr/_model/{model_name}'
    requests.delete(path)

    path = f'http://{es_host}/_ltr/_featureset/{model_name}/_createmodel'
    header = {'Content-Type': 'application/json'}
    response = requests.post(path, json=model_request, headers=header)
    if not response.ok:
        raise Exception(response.content)
if __name__ == '__main__':
    # Known flags configure the training run; any *unknown* flags are deliberately
    # kept and forwarded verbatim to RankLib as hyperparameters (see `main`).
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--train_file_path',
        dest='train_file_path',
        type=str,
        help='Path where RankLib training file data is located.'
    )
    parser.add_argument(
        '--validation_files_path',
        dest='validation_files_path',
        type=str,
        help='Path where validation data path is located.'
    )
    parser.add_argument(
        '--validation_train_files_path',
        dest='validation_train_files_path',
        type=str,
        help='Path where validation train data path is located.'
    )
    parser.add_argument(
        '--es_host',
        dest='es_host',
        type=str,
        help='Host address to reach Elasticsearch.'
    )
    parser.add_argument(
        '--es_batch',
        dest='es_batch',
        type=int,
        default=1000,
        help=('Determines how many items to send at once to Elasticsearch when using '
              'multisearch API.')
    )
    parser.add_argument(
        '--destination',
        dest='destination',
        type=str,
        help='Path where validation score is should be saved to.'
    )
    parser.add_argument(
        '--model_name',
        dest='model_name',
        type=str,
        help='Name of featureset store as saved in Elasticsearch.'
    )
    parser.add_argument(
        '--ranker',
        dest='ranker',
        type=str,
        help='Name of ranker algorithm to be used from RankLib.'
    )
    parser.add_argument(
        '--index',
        dest='index',
        default='pysearchml',
        type=str,
        help='ES Index name to use.'
    )
    # `unknown` carries the Katib-suggested hyperparameters for RankLib.
    args, unknown = parser.parse_known_args(sys.argv[1:])
    print('thi is unknown: ', unknown)
    main(args, unknown)
def parse_args(args: List) -> NamedTuple:
    """
    Builds the CLI parser for the validation step and parses `args`, silently
    ignoring any flags it does not know about.
    """
    parser = argparse.ArgumentParser()
    # Table-driven flag declarations: (flag, add_argument keyword arguments).
    option_specs = [
        ('--files_path',
         dict(dest='files_path', type=str,
              help='Path to files containing data of customers searches and their '
                   'purchases')),
        ('--index',
         dict(dest='index', type=str, default='pysearchml',
              help='Name of Index where documents are stored in Elasticsearch.')),
        ('--es_host',
         dict(dest='es_host', type=str,
              help='Host address to reach Elasticsearch.')),
        ('--model_name',
         dict(dest='model_name', type=str,
              help='Assigns a name for the RankLib model. Each experiment on '
                   'Kubeflow should have a specific name in order to preserver '
                   'their results.')),
        ('--es_batch',
         dict(dest='es_batch', type=int, default=1000,
              help='Determines how many items to send at once to Elasticsearch '
                   'when using multisearch API.')),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    known, _ = parser.parse_known_args(args)
    return known
def validate_model(
    files_path: str,
    es_host: str,
    model_name: str,
    index: str = 'pysearchml',
    es_batch: int = 1000
) -> float:
    """
    Reads through an input file of searches and customers purchases. For each search,
    sends the query against Elasticsearch to retrieve a list of documents. This list
    is then compared with what customers purchased for evaluating a rank metric.

    The rank formula is defined in this paper:

        http://yifanhu.net/PUB/cf.pdf

    And is expressed by:

        rank = \\frac{\\sum_{u,i} r^t_{ui} rank_{ui}}{\\sum_{u,i} r^t_{ui}}

    `u` is an identification of a given customer, `i` represents items. `r_{ui}` is
    the score a given customer u gave to item i. This score is implicit and here we
    just consider it equal to 1.

    The rank formula is an average of what percentile the purchased items from
    customers are located in the retrieved list of documents from Elasticsearch
    operating already with the trained RankLib model.

    Notice that we use this function in parallel with multiprocessing. That means
    that its input must be pickleable so the Elasticsearch client is instantiated
    inside the function instead of being an input argument. It breaks the Dependency
    Injection principle but still works fine.

    Args
    ----
    files_path: str
        Path with gzipped validation files (JSON rows of searches and purchases).
    es_host: str
        Host address to reach Elasticsearch.
    model_name: str
        Name of the RankLib model/featureset as stored on Elasticsearch.
    index: str
        Index on Elasticsearch where documents are stored.
    es_batch: int
        How many searches to accumulate before issuing one msearch request.

    Returns
    -------
    float
        The rank metric; 0.5 when no purchased document was retrieved at all.
    """
    counter = 1
    search_arr, purchase_arr = [], []
    # Defined as lists which works as pointers
    rank_num, rank_den = [0], [0]
    es_client = Elasticsearch(hosts=[es_host])

    files = glob.glob(os.path.join(files_path, '*.gz'))

    for file_ in files:
        for row in gzip.GzipFile(file_):
            row = json.loads(row)
            search_keys, docs = row['search_keys'], row['docs']
            purchase_arr.append(docs)

            # msearch bodies alternate header line / query line per search.
            search_arr.append(json.dumps({'index': index}))
            search_arr.append(
                json.dumps(get_es_query(search_keys, model_name, es_batch))
            )

            # Flush one full batch to Elasticsearch and reset the accumulators.
            if counter % es_batch == 0:
                compute_rank(search_arr, purchase_arr, rank_num, rank_den, es_client)
                search_arr, purchase_arr = [], []

            counter += 1

    # Flush the final partial batch, if any.
    if search_arr:
        compute_rank(search_arr, purchase_arr, rank_num, rank_den, es_client)
    # return rank=50% if no document was retrieved from Elasticsearch and purchased
    # by customers.
    return rank_num[0] / rank_den[0] if rank_den[0] else 0.5
def compute_rank(
    search_arr: List[str],
    purchase_arr: List[List[Dict[str, List[str]]]],
    rank_num: List[float],
    rank_den: List[float],
    es_client: 'Elasticsearch'
) -> None:
    """
    Sends queries against Elasticsearch and compares results with what customers
    purchased. Computes the average rank position of where the purchased documents
    fall within the retrieved items.

    Args
    ----
    search_arr: List[str]
        Searches made by customers as observed in validation data. We send those
        against Elasticsearch and compare results with purchased data.
    purchase_arr: List[List[Dict[str, List[str]]]]
        List of documents that were purchased by customers; `purchase_arr[i]`
        corresponds to the i-th query in this batch.
    rank_num: List[float]
        Numerator value of the rank equation. Defined as list to emulate a pointer.
    rank_den: List[float]
        Denominator of the rank equation; also defined as a one-element list.
    es_client: Elasticsearch
        Python Elasticsearch client.
    """
    if not search_arr:
        return

    request = os.linesep.join(search_arr)
    response = es_client.msearch(body=request, request_timeout=60)

    # Bug fix: the original only advanced `idx` for responses with >= 2 hits, so
    # after any skipped response every later one was paired with the wrong
    # purchase list. `enumerate` keeps responses and purchases aligned 1:1.
    for idx, hit in enumerate(response['responses']):
        docs = [doc['_id'] for doc in hit['hits'].get('hits', [])]

        # Fewer than 2 docs carries no ranking signal (len(docs) - 1 degenerates).
        if len(docs) < 2:
            continue

        purchased_docs = [
            doc for purch in purchase_arr[idx] for doc in purch['purchased']
        ]
        ranks = np.where(np.in1d(docs, purchased_docs))[0]

        if ranks.size == 0:
            continue

        # Each matched position contributes its percentile within the result list.
        rank_num[0] += ranks.sum() / (len(docs) - 1)
        rank_den[0] += ranks.size

    print('rank num: ', rank_num[0])
    print('rank den: ', rank_den[0])
def get_es_query(
    search_keys: Dict[str, Any],
    model_name: str,
    es_batch: int = 1000
) -> Dict[str, Any]:
    """
    Builds the Elasticsearch query to be used when retrieving data.

    Args
    ----
    search_keys: Dict[str, Any]
        Search query sent by the customer as well as other variables that sets its
        context, such as region, favorite brand and so on.
    model_name: str
        Name of RankLib model saved on Elasticsearch.
    es_batch: int
        How many documents to retrieve.

    Returns
    -------
    query: Dict[str, Any]
        Final query dict, ready to be serialized into a msearch body.
        (The original annotated/documented the return as `str`, but a dict is
        what's returned and what callers json.dumps afterwards.)
    """
    # it's expected that a ES query will be available at:
    # ./queries/{model_name}/es_query.json
    with open(f'queries/{model_name}/es_query.json') as f:
        query = f.read()
    query = json.loads(query.replace('{query}', search_keys['search_term']))
    # We just want to retrieve the id of the document to evaluate the ranks between
    # customers purchases and the retrieved list result
    query['_source'] = '_id'
    query['size'] = es_batch
    query['rescore']['window_size'] = 50  # Hardcoded to optimize first 50 skus
    query['rescore']['query']['rescore_query']['sltr']['params'] = search_keys
    query['rescore']['query']['rescore_query']['sltr']['model'] = model_name
    return query
def build_judgment_files(model_name: str) -> None:
    """
    Uses DBN Models and the clickstream data to come up with the Judgments
    inferences.

    Args
    ----
    model_name: str
        Name that identifies the current Kubeflow experiment; used to build the
        /tmp working paths.
    """
    model = DBN.DBNModel()

    # clickstream has the browsing patterns of searches, clicks and purchases from
    # customers.
    clickstream_files_path = f'/tmp/pysearchml/{model_name}/clickstream/'

    # model is the output from pyClickModels. It contains JSON NEWLINE DELIMITED data
    # where each row contains a JSON with search queries and its context and then
    # a dictionary of skus and their correspondent judgment for the respective query.
    model_path = f'/tmp/pysearchml/{model_name}/model/model.gz'

    # Start from clean working directories on every run.
    rmtree(os.path.dirname(model_path), ignore_errors=True)
    os.makedirs(os.path.dirname(model_path))

    # finally judgment files is where the final judgments are written.
    judgment_files_path = f'/tmp/pysearchml/{model_name}/judgments/judgments.gz'
    rmtree(os.path.dirname(judgment_files_path), ignore_errors=True)
    os.makedirs(os.path.dirname(judgment_files_path))

    model.fit(clickstream_files_path, iters=10)
    model.export_judgments(model_path)

    with gzip.GzipFile(judgment_files_path, 'wb') as f:
        for row in gzip.GzipFile(model_path):
            row = json.loads(row)
            result = []

            # search_keys is something like:
            # {"search_term:query|brand:brand_name|context:value}
            # Notice that only the `search_term` is always available. Other keys
            # depends on the chosen context when training the model, i.e., one can
            # choose to add the brand information or not and so on.
            search_keys = list(row.keys())[0]
            docs_judgments = row[search_keys]
            search_keys = dict(e.split(':') for e in search_keys.split('|'))

            judgments_list = [judge for doc, judge in docs_judgments.items()]

            # It means all judgments expectations are equal which is not desirable
            if all(x == judgments_list[0] for x in judgments_list):
                continue

            # We derive judgments based on percentiles from 20% up to 100%
            percentiles = np.percentile(judgments_list, [20, 40, 60, 80, 100])

            judgment_keys = [
                {
                    'doc': doc,
                    'judgment': process_judgment(percentiles, judgment)
                }
                for doc, judgment in docs_judgments.items()
            ]

            result = {
                'search_keys': search_keys,
                'judgment_keys': judgment_keys
            }
            f.write(json.dumps(result).encode() + '\n'.encode())
def process_judgment(percentiles: list, judgment: float) -> int:
    """
    Returns which quantile the current value of `judgment` belongs to. The result is
    already transformed to range between integers 0 and 4 inclusive.

    Args
    ----
    percentiles: list
        The [20, 40, 60, 80, 100] percentiles computed over all judgments of a
        given query. (The original docstring documented a nonexistent
        `judgents_list` parameter instead.)
    judgment: float
        Current judgment value being computed.

    Returns
    -------
    judgment: int
        Integer belonging to 0 and 4, inclusive. 0 means the current document is not
        appropriate for current query whereas 4 means it's a perfect fit.
    """
    if judgment <= percentiles[0]:
        return 0
    if judgment <= percentiles[1]:
        return 1
    if judgment <= percentiles[2]:
        return 2
    if judgment <= percentiles[3]:
        return 3
    # Anything above the 60th/80th percentile cut-offs is a perfect fit. The
    # original fell through to an implicit `None` for values strictly above
    # percentiles[4] (possible with float rounding); always return 4 instead.
    return 4


def build_train_file(
    model_name: str,
    es_batch: int,
    es_client: 'Elasticsearch',
    destination: str,
    index: str
) -> None:
    """
    After the input file has been updated with judgment data, logs features from
    Elasticsearch which results in the final text file used as input for RankLib.

    Args
    ----
    model_name: str
        Name of feature set store on Elasticsearch.
    es_batch: int
        Sets how many queries to aggregate when using multisearch API.
    es_client: Elasticsearch
        Python Elasticsearch client.
    destination: str
        Path where to write results to.
    index: str
        Name of the Elasticsearch index to query.
    """
    counter = 1
    # works as a pointer
    queries_counter = [0]
    search_arr, judge_list = [], []
    os.makedirs(destination, exist_ok=True)
    # write_features appends, so make sure we start from a clean training file.
    if os.path.isfile(f'{destination}/train_dataset.txt'):
        os.remove(f'{destination}/train_dataset.txt')

    for search_keys, docs, judgments in read_judgment_files(model_name):
        judge_list.append(judgments)

        search_arr.append(json.dumps({'index': f'{index}'}))
        search_arr.append(json.dumps(get_logging_query(model_name, docs, search_keys)))

        if counter % es_batch == 0:
            write_features(search_arr, judge_list, queries_counter, es_client,
                           destination)
            search_arr, judge_list = [], []

        counter += 1

    # Flush the final partial batch.
    if search_arr:
        write_features(search_arr, judge_list, queries_counter, es_client, destination)
def write_features(
    search_arr: List[str],
    judge_list: List[List[str]],
    queries_counter: List[int],
    es_client: Elasticsearch,
    destination: str
) -> None:
    """
    Sends the query to Elasticsearch and uses the result to write final RankLib
    training file.

    Args
    ----
    search_arr: List[str]
        Array containing multiple queries to send against Elasticsearch.
    judge_list: List[List[str]]
        Each index contains list of judgments associated to a respective search.
    queries_counter: List[int]
        Counter of how many queries were processed so far. It's used to build the
        RankLib file with appropriate values. It's a list so it works as a C pointer.
    es_client: Elasticsearch
        Python client for interacting with Elasticsearch.
    destination: str
        Path where to save results to.
    """
    if not search_arr:
        return

    multi_request = os.linesep.join(search_arr)
    features_log = es_client.msearch(body=multi_request, request_timeout=60)

    rows = []
    for i in range(len(judge_list)):
        es_result = features_log['responses'][i].get('hits', {}).get('hits')

        # Skip empty or single-hit results: one document alone carries no
        # ranking signal for a pairwise/listwise learner.
        if not es_result or len(es_result) == 1:
            continue

        # NOTE(review): `judge_list[i][j]` is paired with the j-th hit returned by
        # Elasticsearch, which assumes the hits come back in the same order as the
        # docs sent in the logging query — TODO confirm this assumption holds.
        for j in range(len(es_result)):
            logs = es_result[j]['fields']['_ltrlog'][0]['main']
            # RankLib features are 1-indexed: "<feature_no>:<value>".
            features = [
                f'{idx+1}:{logs[idx].get("value", 0)}' for idx in range(len(logs))
            ]
            features = '\t'.join(features)
            ranklib_entry = f'{judge_list[i][j]}\tqid:{queries_counter[0]}\t{features}'
            rows.append(ranklib_entry)
        # One qid per processed search query.
        queries_counter[0] += 1

    if rows:
        print(rows[0])
        path = f'{destination}/train_dataset.txt'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'a') as f:
            f.write(os.linesep.join(rows) + os.linesep)
def get_logging_query(
    model_name: str,
    docs: List[str],
    search_keys: Dict[str, Any]
) -> Dict[str, Any]:
    """
    The process to extract features from Elasticsearch involves sending what is known
    as the "logging query". The result of the logging query is the values, for a
    given search query, of each feature as defined in the feature set.

    Args
    ----
    model_name: str
        Each Kubeflow run receives a model_name so it's possible to discern each
        experiment. This value is used to store different models on Elasticsearch.
    docs: List[str]
        List containing several documents (skus for instance) to be inserted in the
        query so it's possible to send several requests in just one request.
    search_keys: Dict[str, Any]
        Those are the keys that describe the search context. It can contain
        data such as the region of customers, their favorite brands, their average
        purchasing ticket and so on.

    Returns
    -------
    log_query: Dict[str, Any]
        Query to be sent against Elasticsearch in order to find the values of each
        feature as defined in featureset.
    """
    # Build the final dict in one literal instead of mutating placeholders after
    # the fact: `docs` and `search_keys` are spliced in directly.
    return {
        "query": {
            "bool": {
                "filter": [
                    {
                        "terms": {
                            "_id": docs
                        }
                    }
                ],
                "should": [
                    {
                        "sltr": {
                            "_name": "logged_featureset",
                            "featureset": model_name,
                            "params": search_keys
                        }
                    }
                ]
            }
        },
        "_source": ['_id'],
        "ext": {
            "ltr_log": {
                "log_specs": {
                    "name": "main",
                    "named_query": "logged_featureset"
                }
            }
        }
    }
def read_judgment_files(
    model_name: str
) -> Iterator[Tuple[Dict[str, Any], List[str], List[str]]]:
    """
    Reads resulting files of the judgments updating process.

    Yields
    ------
    (search_keys, docs, judgments)
        search_keys: dict describing the search context; docs: list of documents;
        judgments: list of judgments aligned index-by-index with `docs`.
    """
    files = glob.glob(f'/tmp/pysearchml/{model_name}/judgments/*.gz')
    for file_ in files:
        for row in gzip.GzipFile(file_):
            row = json.loads(row)
            search_keys = row['search_keys']
            judgment_keys = row['judgment_keys']
            docs = [e['doc'] for e in judgment_keys]
            judgments = [e['judgment'] for e in judgment_keys]
            yield search_keys, docs, judgments


def download_data(args: NamedTuple):
    """
    Queries over GA data for input training dataset creation. The table is first
    exported to GCS and then downloaded to respective folder, as is.

    Args
    ----
    args: NamedTuple
        train_init_date: str
            Follows format %Y%M%D, represents from where the query should start
            retrieving data from.
        train_end_date: str
            End date for the query; same format.
        bucket: str
            GCS bucket used as staging area for the BigQuery extract.
        model_name: str
            Name to identify model being trained.
    """
    path_to_download = f'/tmp/pysearchml/{args.model_name}/clickstream'
    # Start from a clean download directory every run.
    rmtree(path_to_download, ignore_errors=True)
    os.makedirs(path_to_download, exist_ok=True)

    storage_client = storage.Client()
    bq_client = bigquery.Client()

    ds_ref = bq_client.dataset('pysearchml')
    # Random table name so concurrent runs don't clash on the staging table.
    table_id = str(uuid.uuid4()).replace('-', '')
    table_ref = ds_ref.table(table_id)

    # Query GA data
    query_path = PATH / 'train.sql'
    query = open(str(query_path)).read()
    query = query.format(train_init_date=args.train_init_date,
                         train_end_date=args.train_end_date)

    job_config = bigquery.QueryJobConfig()
    job_config.destination = f'{bq_client.project}.pysearchml.{table_id}'
    # Billing safety cap: at most 10 GB scanned per query.
    job_config.maximum_bytes_billed = 10 * (1024 ** 3)
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job = bq_client.query(query, job_config=job_config)
    job.result()

    # export BigQuery table to GCS
    destination_uri = f'gs://{args.bucket}/train/*.gz'

    extract_config = bigquery.ExtractJobConfig()
    extract_config.compression = 'GZIP'
    extract_config.destination_format = 'NEWLINE_DELIMITED_JSON'
    job = bq_client.extract_table(table_ref, destination_uri, job_config=extract_config)
    job.result()

    # Download data
    bucket_obj = storage_client.bucket(args.bucket)
    blobs = bucket_obj.list_blobs(prefix='train')
    for blob in blobs:
        blob.download_to_filename(
            f"{path_to_download}/judgments_{blob.name.split('/')[-1]}"
        )
        # Remove the staging blob once it has been downloaded.
        blob.delete()

    # Delete BQ Table
    bq_client.delete_table(table_ref)


def main(args: NamedTuple, es_client: Elasticsearch) -> None:
    """
    Uses as input data from Google Analytics containing customers clickstream for Search
    Result Pages. This data is processed with the Judgments model as described in
    [pyClickModels](https://github.com/WillianFuks/pyClickModels) and features are
    derived from Elasticsearch.

    The resulting text file is like:

        judgment qid feature_1 ... feature_N
        4 qid:1 1:0.56 ... 2:1.3
        4 qid:1 1:2.90 ... 2:1.09
        3 qid:1 1:3.00 ... 2:5.51

    This text file is what we use as input for training models in RankLib.

    Args
    ----
    args: List
        List of input arguments from `sys.argv`
    es_client: Elasticsearch
        Python Elasticsearch client
    """
    # Pipeline: fetch clickstream -> infer judgments -> log features -> write file.
    download_data(args)
    build_judgment_files(args.model_name)
    build_train_file(args.model_name, args.es_batch, es_client, args.destination,
                     args.index)
Format follows %Y%M%D.') 408 | ) 409 | parser.add_argument( 410 | '--train_end_date', 411 | dest='train_end_date', 412 | type=str, 413 | help=('Value to replace in BigQuery SQL. Represents date from where to start ' 414 | 'quering from. Format follows %Y%M%D.') 415 | ) 416 | parser.add_argument( 417 | '--bucket', 418 | dest='bucket', 419 | type=str, 420 | default='pysearchml', 421 | help='Google Cloud Storage Bucket where all data will be stored.' 422 | ) 423 | parser.add_argument( 424 | '--es_host', 425 | dest='es_host', 426 | type=str, 427 | help='Host address to reach Elasticsearch.' 428 | ) 429 | parser.add_argument( 430 | '--es_batch', 431 | dest='es_batch', 432 | type=int, 433 | default=1000, 434 | help=('Determines how many items to send at once to Elasticsearch when using ' 435 | 'multisearch API.') 436 | ) 437 | parser.add_argument( 438 | '--destination', 439 | dest='destination', 440 | type=str, 441 | help='Path where to write results to.' 442 | ) 443 | parser.add_argument( 444 | '--model_name', 445 | dest='model_name', 446 | type=str, 447 | help='Name of featureset store as saved in Elasticsearch.' 448 | ) 449 | parser.add_argument( 450 | '--index', 451 | dest='index', 452 | type=str, 453 | help='Name of index to use from in Elasticsearch.' 454 | ) 455 | 456 | args, _ = parser.parse_known_args(sys.argv[1:]) 457 | es_client = Elasticsearch(hosts=[args.es_host]) 458 | main(args, es_client) 459 | -------------------------------------------------------------------------------- /kubeflow/components/model/model.txt: -------------------------------------------------------------------------------- 1 | ## LambdaMART 2 | ## No. of trees = 10 3 | ## No. of leaves = 10 4 | ## No. 
of threshold candidates = 256 5 | ## Learning rate = 0.1 6 | ## Stop early = 100 7 | 8 | 9 | 10 | 11 | 4 12 | 0.25 13 | 14 | 1 15 | 0.3315219 16 | 17 | 2 18 | 0.54972833 19 | 20 | 1 21 | 0.21562068 22 | 23 | -1.7084366083145142 24 | 25 | 26 | 1 27 | 0.21851821 28 | 29 | 1.096567153930664 30 | 31 | 32 | -0.7644314169883728 33 | 34 | 35 | 36 | 37 | 2.0 38 | 39 | 40 | 41 | 1 42 | 0.46480832 43 | 44 | 0.787030041217804 45 | 46 | 47 | 0.1781938225030899 48 | 49 | 50 | 51 | 52 | 1 53 | 0.22141574 54 | 55 | 1 56 | 0.21272315 57 | 58 | 1.1048166751861572 59 | 60 | 61 | 2.0 62 | 63 | 64 | 65 | 1 66 | 0.24169846 67 | 68 | -2.0 69 | 70 | 71 | 0.6879376769065857 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 4 80 | 0.24529763 81 | 82 | 1 83 | 0.20692809 84 | 85 | -1.5049853324890137 86 | 87 | 88 | 2 89 | 0.55973893 90 | 91 | 1 92 | 0.3315219 93 | 94 | 1 95 | 0.21851821 96 | 97 | 0.41835230588912964 98 | 99 | 100 | -0.7194477915763855 101 | 102 | 103 | 104 | 1 105 | 0.46480832 106 | 107 | 0.7153688073158264 108 | 109 | 110 | 0.16227617859840393 111 | 112 | 113 | 114 | 115 | 1.8283854722976685 116 | 117 | 118 | 119 | 120 | 1 121 | 0.22141574 122 | 123 | 1.1201153993606567 124 | 125 | 126 | 1 127 | 0.24169846 128 | 129 | -1.7444206476211548 130 | 131 | 132 | 1 133 | 0.25908363 134 | 135 | 1.2497144937515259 136 | 137 | 138 | 0.32585909962654114 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 4 148 | 0.24529763 149 | 150 | 1 151 | 0.3315219 152 | 153 | 2 154 | 0.54972833 155 | 156 | 1 157 | 0.21562068 158 | 159 | -1.3644481897354126 160 | 161 | 162 | -0.4869944751262665 163 | 164 | 165 | 166 | 1.6992262601852417 167 | 168 | 169 | 170 | 1 171 | 0.46480832 172 | 173 | 0.6203141212463379 174 | 175 | 176 | 0.14612968266010284 177 | 178 | 179 | 180 | 181 | 3 182 | 0.0 183 | 184 | 1 185 | 0.25908363 186 | 187 | 4 188 | 0.6060418 189 | 190 | 1 191 | 0.24169846 192 | 193 | 0.12054406106472015 194 | 195 | 196 | 1.2553942203521729 197 | 198 | 199 | 200 | 1.622399091720581 201 | 202 | 203 | 
204 | 0.05810323357582092 205 | 206 | 207 | 208 | 1.7710267305374146 209 | 210 | 211 | 212 | 213 | 214 | 215 | 4 216 | 0.31724432 217 | 218 | 1 219 | 0.3315219 220 | 221 | 3 222 | 0.0 223 | 224 | 2 225 | 0.54972833 226 | 227 | 1 228 | 0.21562068 229 | 230 | -1.0552788972854614 231 | 232 | 233 | 1 234 | 0.21851821 235 | 236 | 0.8310797810554504 237 | 238 | 239 | 1 240 | 0.23010834 241 | 242 | -1.702104926109314 243 | 244 | 245 | 1 246 | 0.23300587 247 | 248 | 1.8411474227905273 249 | 250 | 251 | -0.6400606632232666 252 | 253 | 254 | 255 | 256 | 257 | 258 | 1.5750422477722168 259 | 260 | 261 | 262 | 1.279455542564392 263 | 264 | 265 | 266 | 0.3588203489780426 267 | 268 | 269 | 270 | 1 271 | 0.25908363 272 | 273 | 0.7923973798751831 274 | 275 | 276 | 0.5431618690490723 277 | 278 | 279 | 280 | 281 | 282 | 283 | 4 284 | 0.31724432 285 | 286 | 1 287 | 0.3315219 288 | 289 | 2 290 | 0.54972833 291 | 292 | 3 293 | 0.0 294 | 295 | 1 296 | 0.21562068 297 | 298 | -0.9644893407821655 299 | 300 | 301 | 1 302 | 0.21851821 303 | 304 | 0.935935378074646 305 | 306 | 307 | 1 308 | 0.23010834 309 | 310 | -1.568875789642334 311 | 312 | 313 | -0.4440454840660095 314 | 315 | 316 | 317 | 318 | 319 | 1.155909776687622 320 | 321 | 322 | 323 | 1.5210938453674316 324 | 325 | 326 | 327 | 1 328 | 0.3402145 329 | 330 | 1.1632524728775024 331 | 332 | 333 | 0.24559912085533142 334 | 335 | 336 | 337 | 338 | 1 339 | 0.25908363 340 | 341 | 0.7345250844955444 342 | 343 | 344 | 0.45409226417541504 345 | 346 | 347 | 348 | 349 | 350 | 351 | 4 352 | 0.31724432 353 | 354 | 1 355 | 0.38077992 356 | 357 | 2 358 | 0.0 359 | 360 | 3 361 | 0.0 362 | 363 | 1 364 | 0.1895429 365 | 366 | -1.232733130455017 367 | 368 | 369 | 4 370 | 0.3172443 371 | 372 | 4 373 | 0.24529763 374 | 375 | 1 376 | 0.21562068 377 | 378 | -1.1072427034378052 379 | 380 | 381 | 1 382 | 0.21851821 383 | 384 | 0.8476951718330383 385 | 386 | 387 | -0.44101765751838684 388 | 389 | 390 | 391 | 392 | 0.2138904184103012 393 | 394 | 395 | 396 | 
-1.8764989376068115 397 | 398 | 399 | 400 | 401 | 1.0571155548095703 402 | 403 | 404 | 405 | 1.0903085470199585 406 | 407 | 408 | 409 | 0.4524942934513092 410 | 411 | 412 | 413 | 0.5950737595558167 414 | 415 | 416 | 417 | 418 | 419 | 4 420 | 0.31724432 421 | 422 | 1 423 | 0.38077992 424 | 425 | 2 426 | 0.0 427 | 428 | 3 429 | 0.0 430 | 431 | 1 432 | 0.1895429 433 | 434 | -1.1662677526474 435 | 436 | 437 | 4 438 | 0.3030209 439 | 440 | 4 441 | 0.30263692 442 | 443 | 1 444 | 0.21562068 445 | 446 | -1.0524094104766846 447 | 448 | 449 | 1 450 | 0.21851821 451 | 452 | 0.758584201335907 453 | 454 | 455 | -0.3663649559020996 456 | 457 | 458 | 459 | 460 | 1.0178462266921997 461 | 462 | 463 | 464 | -1.6032209396362305 465 | 466 | 467 | 468 | 469 | 0.9786844849586487 470 | 471 | 472 | 473 | 1.0436853170394897 474 | 475 | 476 | 477 | 0.4028914272785187 478 | 479 | 480 | 481 | 0.56407630443573 482 | 483 | 484 | 485 | 486 | 487 | 4 488 | 0.31724432 489 | 490 | 2 491 | 0.55973893 492 | 493 | 1 494 | 0.38077992 495 | 496 | 3 497 | 0.0 498 | 499 | 2 500 | 0.0 501 | 502 | 1 503 | 0.1895429 504 | 505 | -1.1066094636917114 506 | 507 | 508 | 4 509 | 0.3030209 510 | 511 | 4 512 | 0.30263692 513 | 514 | 1 515 | 0.21562068 516 | 517 | -0.9751008152961731 518 | 519 | 520 | -0.2530476748943329 521 | 522 | 523 | 524 | 0.9325478076934814 525 | 526 | 527 | 528 | -1.4579707384109497 529 | 530 | 531 | 532 | 533 | 0.8370009064674377 534 | 535 | 536 | 537 | 0.8957659006118774 538 | 539 | 540 | 541 | 0.36041855812072754 542 | 543 | 544 | 545 | 1.4393270015716553 546 | 547 | 548 | 549 | 0.5142536759376526 550 | 551 | 552 | 553 | 554 | 555 | 4 556 | 0.5 557 | 558 | 1 559 | 0.19533797 560 | 561 | -0.9102288484573364 562 | 563 | 564 | 2 565 | 0.55973893 566 | 567 | 4 568 | 0.24529763 569 | 570 | 1 571 | 0.3315219 572 | 573 | 1 574 | 0.21562068 575 | 576 | -1.2571520805358887 577 | 578 | 579 | 1 580 | 0.21851821 581 | 582 | 0.6894375681877136 583 | 584 | 585 | -0.48254841566085815 586 | 587 | 588 | 589 
| 590 | 3 591 | 0.0 592 | 593 | 0.2937770485877991 594 | 595 | 596 | -2.1294875144958496 597 | 598 | 599 | 600 | 601 | 1 602 | 0.25908363 603 | 604 | 0.7393473386764526 605 | 606 | 607 | -0.15804332494735718 608 | 609 | 610 | 611 | 612 | 1.3993260860443115 613 | 614 | 615 | 616 | 617 | 1.0860182046890259 618 | 619 | 620 | 621 | 622 | 623 | 4 624 | 0.5 625 | 626 | 1 627 | 0.19533797 628 | 629 | -0.8420385122299194 630 | 631 | 632 | 2 633 | 0.55973893 634 | 635 | 2 636 | 0.54972833 637 | 638 | 4 639 | 0.31724432 640 | 641 | 1 642 | 0.38077992 643 | 644 | 3 645 | 0.0 646 | 647 | 4 648 | 0.3030209 649 | 650 | 4 651 | 0.30263692 652 | 653 | -0.2686406373977661 654 | 655 | 656 | 1.390183448791504 657 | 658 | 659 | 660 | -1.3712384700775146 661 | 662 | 663 | 664 | 0.8090718984603882 665 | 666 | 667 | 668 | 0.3620189130306244 669 | 670 | 671 | 672 | 0.3657061755657196 673 | 674 | 675 | 676 | -3.2265491485595703 677 | 678 | 679 | 680 | 1.3460023403167725 681 | 682 | 683 | 684 | 685 | 1.007249355316162 686 | 687 | 688 | 689 | 690 | --------------------------------------------------------------------------------