├── conftest.py ├── .flake8 ├── kubeflow ├── components │ ├── data │ │ ├── train │ │ │ ├── __init__.py │ │ │ ├── tests │ │ │ │ ├── fixtures │ │ │ │ │ ├── model.gz │ │ │ │ │ ├── features │ │ │ │ │ │ ├── f2.json │ │ │ │ │ │ └── f1.json │ │ │ │ │ ├── msearch_call2.txt │ │ │ │ │ └── msearch_call1.txt │ │ │ │ ├── test_run.py │ │ │ │ └── conftest.py │ │ │ ├── requirements.txt │ │ │ ├── Dockerfile │ │ │ ├── train.sql │ │ │ └── run.py │ │ └── validation │ │ │ ├── requirements.txt │ │ │ ├── Dockerfile │ │ │ ├── run.py │ │ │ └── validation.sql │ ├── prepare_env │ │ ├── requirements.txt │ │ ├── Dockerfile │ │ ├── lambdamart0 │ │ │ ├── features │ │ │ │ ├── name.json │ │ │ │ ├── category.json │ │ │ │ ├── avg_customer_price.json │ │ │ │ └── channel_group.json │ │ │ ├── ga_data.sql │ │ │ └── es_mapping.json │ │ └── run.py │ ├── model │ │ ├── ranklib │ │ │ └── RankLib-2.14.jar │ │ ├── tests │ │ │ ├── fixtures │ │ │ │ ├── validation.gz │ │ │ │ └── es_query.json │ │ │ ├── test_validate.py │ │ │ ├── conftest.py │ │ │ └── test_train.py │ │ ├── requirements.txt │ │ ├── queries │ │ │ ├── unittest │ │ │ │ └── es_query.json │ │ │ └── lambdamart0 │ │ │ │ └── es_query.json │ │ ├── Dockerfile │ │ ├── post_model.py │ │ ├── test.py │ │ ├── experiment.json │ │ ├── launch_katib.py │ │ ├── train.py │ │ ├── validate.py │ │ └── model.txt │ └── common │ │ └── launch_crd.py ├── namespace.yaml ├── pipe_role_biding.yaml ├── nfs-server-service.yaml ├── build │ ├── build.sh │ ├── manage_service_account.sh │ └── cloudbuild.yaml ├── pipelines │ ├── Dockerfile │ ├── helper.py │ ├── pipeline.py │ └── pipeline2.py ├── pv-pvc.yaml ├── disk-busybox.yaml └── nfs-server.yaml ├── kubernetes ├── front │ ├── run_app.sh │ ├── requirements.txt │ ├── Dockerfile │ ├── config.py │ ├── templates │ │ ├── index.html │ │ └── documents.j2 │ ├── app.yaml │ └── app.py └── es │ ├── Dockerfile │ ├── docker-compose.yaml │ └── deploy_elasticsearch.yaml ├── requirements.txt ├── Dockerfile ├── bin ├── deploy_es.sh ├── 
get_pipe_host.sh ├── manage_service_account.sh └── create_k8s.sh ├── README.md ├── LICENSE └── .gitignore /conftest.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=90 3 | -------------------------------------------------------------------------------- /kubeflow/components/data/train/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kubernetes/front/run_app.sh: -------------------------------------------------------------------------------- 1 | gunicorn app:app --config=config.py 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery 2 | google-cloud-storage 3 | elasticsearch 4 | fire 5 | -------------------------------------------------------------------------------- /kubeflow/components/data/validation/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery 2 | google-cloud-storage 3 | -------------------------------------------------------------------------------- /kubernetes/front/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==1.0.2 2 | gunicorn==19.9.0 3 | Jinja2 4 | elasticsearch 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.7-alpine3.12 2 | 3 | ADD ./test.py / 4 | 5 | ENTRYPOINT ["python", "test.py"] 6 | 
-------------------------------------------------------------------------------- /kubeflow/components/prepare_env/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery 2 | google-cloud-storage 3 | elasticsearch 4 | fire 5 | -------------------------------------------------------------------------------- /kubeflow/components/model/ranklib/RankLib-2.14.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WillianFuks/pySearchML/master/kubeflow/components/model/ranklib/RankLib-2.14.jar -------------------------------------------------------------------------------- /kubeflow/namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: kubeflow 5 | labels: 6 | katib-metricscollector-injection: enabled 7 | -------------------------------------------------------------------------------- /kubeflow/components/data/train/tests/fixtures/model.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WillianFuks/pySearchML/master/kubeflow/components/data/train/tests/fixtures/model.gz -------------------------------------------------------------------------------- /kubeflow/components/model/tests/fixtures/validation.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WillianFuks/pySearchML/master/kubeflow/components/model/tests/fixtures/validation.gz -------------------------------------------------------------------------------- /kubeflow/components/model/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery 2 | google-cloud-storage 3 | elasticsearch 4 | numpy 5 | requests 6 | kubernetes 7 | pytest 8 | mock 9 | 
-------------------------------------------------------------------------------- /kubeflow/components/data/train/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery 2 | google-cloud-storage 3 | elasticsearch 4 | Cython 5 | pyClickModels 6 | numpy 7 | requests 8 | pytest 9 | mock 10 | -------------------------------------------------------------------------------- /kubernetes/es/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.elastic.co/elasticsearch/elasticsearch:7.3.1 2 | 3 | RUN ./bin/elasticsearch-plugin install -b http://es-learn-to-rank.labs.o19s.com/ltr-1.1.2-es7.3.1.zip 4 | -------------------------------------------------------------------------------- /kubeflow/components/data/train/tests/fixtures/features/f2.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "match": { 4 | "f2": "{{test}}" 5 | } 6 | }, 7 | "params": ["test"], 8 | "name": "test2" 9 | } 10 | -------------------------------------------------------------------------------- /kubeflow/components/data/train/tests/fixtures/features/f1.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "match": { 4 | "field1": "{{test}}" 5 | } 6 | }, 7 | "params": ["test"], 8 | "name": "test1" 9 | } 10 | -------------------------------------------------------------------------------- /kubeflow/components/model/tests/fixtures/es_query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": "test", 3 | "rescore": { 4 | "query": { 5 | "rescore_query": { 6 | "sltr": { 7 | "params": {} 8 | } 9 | } 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /kubeflow/components/model/queries/unittest/es_query.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "query": "test", 3 | "rescore": { 4 | "query": { 5 | "rescore_query": { 6 | "sltr": { 7 | "params": {} 8 | } 9 | } 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /bin/deploy_es.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | SERVICE=$(kubectl get service elasticsearch -n elastic-system | grep elasticsearch) 4 | 5 | # Only deploy if service doesn't already exists. 6 | if [ -z "$SERVICE" ]; then 7 | kubectl apply -f kubernetes/es/deploy_elasticsearch.yaml 8 | fi 9 | -------------------------------------------------------------------------------- /kubernetes/front/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6-jessie 2 | WORKDIR /front 3 | ADD kubernetes/front/ /front 4 | RUN pip install -r /front/requirements.txt 5 | ADD kubeflow/components/model/queries/lambdamart0/es_query.json /front/es_query.json 6 | ENV PORT 8088 7 | CMD ["gunicorn", "app:app", "--config=config.py"] 8 | -------------------------------------------------------------------------------- /kubeflow/pipe_role_biding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: pipeline-runner-binding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: Role 8 | name: pipeline-runner 9 | subjects: 10 | - kind: ServiceAccount 11 | name: default 12 | -------------------------------------------------------------------------------- /kubeflow/components/prepare_env/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.7-slim as python 2 | 3 | COPY kubeflow/components/prepare_env /prepare_env 4 | WORKDIR /prepare_env 5 | COPY ./key.json . 
6 | 7 | ENV GOOGLE_APPLICATION_CREDENTIALS=./key.json 8 | 9 | RUN pip install -r requirements.txt 10 | 11 | ENTRYPOINT ["python", "run.py"] 12 | -------------------------------------------------------------------------------- /kubeflow/nfs-server-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: nfs-server 5 | namespace: kubeflow 6 | spec: 7 | ports: 8 | - name: nfs 9 | port: 2049 10 | - name: mountd 11 | port: 20048 12 | - name: rpcbind 13 | port: 111 14 | selector: 15 | role: nfs-server 16 | -------------------------------------------------------------------------------- /kubeflow/components/data/validation/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.7-slim as python 2 | 3 | COPY kubeflow/components/data/validation/ /validation 4 | WORKDIR /validation 5 | COPY ./key.json . 6 | 7 | ENV GOOGLE_APPLICATION_CREDENTIALS=./key.json 8 | 9 | RUN pip install -r requirements.txt 10 | 11 | ENTRYPOINT ["python", "run.py"] 12 | -------------------------------------------------------------------------------- /kubeflow/components/data/train/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.7-slim as python 2 | 3 | COPY kubeflow/components/data/train/ /train 4 | WORKDIR /train 5 | COPY ./key.json . 
6 | 7 | ENV GOOGLE_APPLICATION_CREDENTIALS=key.json 8 | 9 | RUN pip install -r requirements.txt 10 | RUN pip install -U pyClickModels 11 | 12 | ENTRYPOINT ["sh", "-c"] 13 | -------------------------------------------------------------------------------- /kubeflow/build/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | SUBSTITUTIONS=\ 5 | _COMPUTE_ZONE='us-central1-a',\ 6 | _CLUSTER_NAME='pysearchml',\ 7 | _VERSION='0.0.0' 8 | 9 | ./kubeflow/build/manage_service_account.sh 10 | 11 | gcloud builds submit --no-source --config kubeflow/build/cloudbuild.yaml --substitutions $SUBSTITUTIONS --timeout=2h 12 | -------------------------------------------------------------------------------- /kubernetes/es/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | es: 5 | build: 6 | context: ./ 7 | dockerfile: Dockerfile 8 | ports: 9 | - "9200:9200" 10 | tty: true 11 | environment: 12 | - discovery.type=single-node 13 | ulimits: 14 | memlock: 15 | soft: -1 16 | hard: -1 17 | -------------------------------------------------------------------------------- /kubernetes/front/config.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | from os import environ as env 3 | 4 | 5 | PORT = int(env.get("PORT", 8088)) 6 | DEBUG_MODE = int(env.get("DEBUG_MODE", 1)) 7 | 8 | # Gunicorn config 9 | bind = ":" + str(PORT) 10 | workers = multiprocessing.cpu_count() * 2 + 1 11 | workers = 1 12 | threads = 2 * multiprocessing.cpu_count() 13 | threads = 2 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pySearchML 2 | 3 | A complete AI Based Search Engine built on top of Elasticsearch, Kubeflow and Katib. 
4 | 5 | Please refer to this [post](https://towardsdatascience.com/building-a-complete-ai-based-search-engine-with-elasticsearch-kubeflow-and-katib-590c7b27eb8f?source=friends_link&sk=dfbf728b708eaa6546edd877844a9a42) for a full detail of how the system works. 6 | -------------------------------------------------------------------------------- /kubeflow/components/data/train/tests/fixtures/msearch_call2.txt: -------------------------------------------------------------------------------- 1 | {"index": "test"} 2 | {"query": {"bool": {"filter": [{"terms": {"_id": ["doc0"]}}], "should": [{"sltr": {"_name": "logged_featureset", "featureset": "model_name_test", "params": {"search_term": "keyword2"}}}]}}, "_source": ["_id"], "ext": {"ltr_log": {"log_specs": {"name": "main", "named_query": "logged_featureset"}}}} 3 | -------------------------------------------------------------------------------- /kubeflow/components/prepare_env/lambdamart0/features/name.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "bool": { 4 | "minimum_should_match": 1, 5 | "should": [ 6 | { 7 | "match": { 8 | "name": "{{search_term}}" 9 | } 10 | } 11 | ] 12 | } 13 | }, 14 | "params": ["search_term"], 15 | "name": "BM25 name" 16 | } 17 | -------------------------------------------------------------------------------- /kubeflow/components/model/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8 as java 2 | COPY --from=python:3.7.7-slim / / 3 | 4 | COPY kubeflow/components/model /model 5 | COPY kubeflow/components/common/launch_crd.py /model/launch_crd.py 6 | WORKDIR /model 7 | COPY ./key.json . 
8 | 9 | ARG PROJECT_ID 10 | 11 | ENV GOOGLE_APPLICATION_CREDENTIALS=key.json \ 12 | PROJECT_ID=$PROJECT_ID 13 | 14 | RUN pip install -r requirements.txt 15 | 16 | ENTRYPOINT ["sh", "-c"] 17 | -------------------------------------------------------------------------------- /kubeflow/components/prepare_env/lambdamart0/features/category.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "bool": { 4 | "minimum_should_match": 1, 5 | "should": [ 6 | { 7 | "match": { 8 | "category": "{{search_term}}" 9 | } 10 | } 11 | ] 12 | } 13 | }, 14 | "params": ["search_term"], 15 | "name": "BM25 category" 16 | } 17 | -------------------------------------------------------------------------------- /kubeflow/pipelines/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM google/cloud-sdk:alpine as gcloud 2 | FROM python:3.7.7-slim as python 3 | 4 | COPY ./key.json /key.json 5 | 6 | ENV GOOGLE_APPLICATION_CREDENTIALS=/key.json 7 | ENV PATH=/google-cloud-sdk/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin 8 | 9 | RUN pip install kfp --upgrade 10 | RUN pip install fire 11 | 12 | COPY --from=gcloud /google-cloud-sdk /google-cloud-sdk 13 | 14 | ENTRYPOINT ["sh"] 15 | -------------------------------------------------------------------------------- /bin/get_pipe_host.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #HOST=$(kubectl describe configmap inverse-proxy-config -n kubeflow | grep googleusercontent.com) 4 | HOST="127.0.0.1:7067/pipeline" 5 | 6 | # Means Proxy is still being created 7 | if [ -z "$HOST" ]; then 8 | echo 'Sleeping 2 mins so Kubeflow Inverse Proxy is ready' 9 | sleep 2m 10 | HOST=$(kubectl describe configmap inverse-proxy-config -n kubeflow | grep googleusercontent.com) 11 | fi 12 | 13 | echo "$HOST" > k8_host.txt 14 | 
-------------------------------------------------------------------------------- /kubeflow/pv-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: pysearchml-nfs 5 | namespace: kubeflow 6 | spec: 7 | capacity: 8 | storage: 200Gi 9 | accessModes: 10 | - ReadWriteMany 11 | nfs: 12 | server: 13 | path: "/" 14 | 15 | --- 16 | apiVersion: v1 17 | kind: PersistentVolumeClaim 18 | metadata: 19 | name: pysearchml-nfs 20 | namespace: kubeflow 21 | spec: 22 | accessModes: 23 | - ReadWriteMany 24 | storageClassName: "" 25 | resources: 26 | requests: 27 | storage: 200Gi 28 | -------------------------------------------------------------------------------- /kubeflow/components/prepare_env/lambdamart0/features/avg_customer_price.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "function_score": { 4 | "query": { 5 | "match_all": {} 6 | }, 7 | "script_score" : { 8 | "script" : { 9 | "params": { 10 | "customer_avg_ticket": "{{customer_avg_ticket}}" 11 | }, 12 | "source": "return Math.log(1 + Math.abs(doc['price'].value - Float.parseFloat(params.customer_avg_ticket)))" 13 | } 14 | } 15 | } 16 | }, 17 | "params": ["customer_avg_ticket"], 18 | "name": "customer_avg_ticket" 19 | } 20 | -------------------------------------------------------------------------------- /kubeflow/components/data/train/tests/fixtures/msearch_call1.txt: -------------------------------------------------------------------------------- 1 | {"index": "test"} 2 | {"query": {"bool": {"filter": [{"terms": {"_id": ["doc0", "doc1", "doc2"]}}], "should": [{"sltr": {"_name": "logged_featureset", "featureset": "model_name_test", "params": {"search_term": "keyword0"}}}]}}, "_source": ["_id"], "ext": {"ltr_log": {"log_specs": {"name": "main", "named_query": "logged_featureset"}}}} 3 | {"index": "test"} 4 | {"query": {"bool": {"filter": [{"terms": {"_id": 
["doc", "doc1"]}}], "should": [{"sltr": {"_name": "logged_featureset", "featureset": "test_feature_set_name", "params": {"search_term": "keyword1"}}}]}}, "_source": ["_id"], "ext": {"ltr_log": {"log_specs": {"name": "main", "named_query": "logged_featureset"}}}} 5 | -------------------------------------------------------------------------------- /kubernetes/front/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Visualizer

6 | 7 |
8 | Input Query:
9 | 10 |
11 | Items to Return:
12 | 13 |
14 | Model Name:
15 | 16 |
17 | Channel Group:
18 | 19 |
20 | Customer Avg Ticket:
21 | 22 |
23 | 24 | 25 |
26 |
27 | 28 |
29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /kubeflow/components/model/post_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | from train import post_model_to_elasticsearch 5 | 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument( 10 | '--es_host', 11 | dest='es_host', 12 | type=str, 13 | help='Host address to reach Elasticsearch.' 14 | ) 15 | parser.add_argument( 16 | '--destination', 17 | dest='destination', 18 | type=str, 19 | help='Path where validation score is should be saved to.' 20 | ) 21 | parser.add_argument( 22 | '--model_name', 23 | dest='model_name', 24 | type=str, 25 | help='Name of featureset store as saved in Elasticsearch.' 26 | ) 27 | args, _ = parser.parse_known_args(sys.argv[1:]) 28 | post_model_to_elasticsearch(args.es_host, args.model_name, 29 | f'{args.destination}') 30 | -------------------------------------------------------------------------------- /kubeflow/disk-busybox.yaml: -------------------------------------------------------------------------------- 1 | #https://github.com/mappedinn/kubernetes-nfs-volume-on-gke/blob/master/config-yml-files/04-dep-busybox.yml 2 | apiVersion: extensions/v1beta1 3 | kind: Deployment 4 | metadata: 5 | name: nfs-busybox 6 | namespace: kubeflow 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | name: nfs-busybox 12 | template: 13 | metadata: 14 | labels: 15 | name: nfs-busybox 16 | spec: 17 | containers: 18 | - image: busybox 19 | command: 20 | - sh 21 | - -c 22 | - 'while true; do date > /mnt/index.html; hostname >> /mnt/index.html; sleep $(($RANDOM % 5 + 5)); done' 23 | imagePullPolicy: IfNotPresent 24 | name: busybox 25 | volumeMounts: 26 | - name: my-pvc-nfs 27 | mountPath: "/mnt" 28 | volumes: 29 | - name: my-pvc-nfs 30 | persistentVolumeClaim: 31 | claimName: pysearchml-nfs 32 | 
-------------------------------------------------------------------------------- /kubeflow/nfs-server.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: nfs-server 5 | namespace: kubeflow 6 | spec: 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | role: nfs-server 11 | template: 12 | metadata: 13 | labels: 14 | role: nfs-server 15 | spec: 16 | containers: 17 | - name: nfs-server 18 | image: gcr.io/google_containers/volume-nfs:0.8 19 | ports: 20 | - name: nfs 21 | containerPort: 2049 22 | - name: mountd 23 | containerPort: 20048 24 | - name: rpcbind 25 | containerPort: 111 26 | securityContext: 27 | privileged: true 28 | volumeMounts: 29 | - mountPath: /exports 30 | name: mypvc 31 | volumes: 32 | - name: mypvc 33 | gcePersistentDisk: 34 | pdName: pysearchml-nfs-disk 35 | fsType: ext4 36 | -------------------------------------------------------------------------------- /kubeflow/components/model/tests/test_validate.py: -------------------------------------------------------------------------------- 1 | import mock 2 | from collections import namedtuple 3 | 4 | from validate import validate_model 5 | 6 | 7 | def test_validate_model(monkeypatch, es_response): 8 | es_mock = mock.Mock() 9 | es_mock.return_value.msearch.side_effect = es_response 10 | monkeypatch.setattr('validate.Elasticsearch', es_mock) 11 | 12 | args = namedtuple( 13 | 'args', 14 | [ 15 | 'files_path', 16 | 'index', 17 | 'es_host', 18 | 'model_name', 19 | 'es_batch' 20 | ] 21 | ) 22 | args.files_path = 'tests/fixtures/' 23 | args.es_query_path = 'queries/unittest/es_query.json' 24 | args.index = 'index_test' 25 | args.es_host = 'es_host_test' 26 | args.model_name = 'unittest' 27 | args.es_batch = 2 28 | 29 | rank = validate_model( 30 | args.files_path, 31 | args.es_host, 32 | args.model_name, 33 | args.index, 34 | args.es_batch 35 | ) 36 | assert rank == 0.6 37 | 
-------------------------------------------------------------------------------- /kubernetes/front/templates/documents.j2: -------------------------------------------------------------------------------- 1 | {% macro build_product(id, score) %} 2 |
3 |
4 |

{{ id }}

5 |
6 |
7 | Score: {{ score }} 8 |
9 |
10 | {% endmacro %} 11 | 12 | 30 | 31 |
32 | {% for product in product_list %} 33 | {{ build_product( 34 | product['_id'], 35 | product['_score'], 36 | ) }} 37 | {% endfor %} 38 |
39 | -------------------------------------------------------------------------------- /kubernetes/front/app.yaml: -------------------------------------------------------------------------------- 1 | kind: Namespace 2 | apiVersion: v1 3 | metadata: 4 | name: front 5 | 6 | --- 7 | kind: Service 8 | apiVersion: v1 9 | metadata: 10 | name: front 11 | namespace: front 12 | labels: 13 | app: front 14 | spec: 15 | selector: 16 | app: front 17 | clusterIP: None 18 | ports: 19 | - port: 8088 20 | targetPort: 8088 21 | 22 | --- 23 | apiVersion: apps/v1beta2 24 | kind: Deployment 25 | metadata: 26 | name: front 27 | namespace: front 28 | labels: 29 | app: front 30 | spec: 31 | replicas: 1 32 | selector: 33 | matchLabels: 34 | app: front 35 | template: 36 | metadata: 37 | name: front 38 | namespace: front 39 | labels: 40 | app: front 41 | spec: 42 | containers: 43 | - name: front 44 | image: willfuks/pysearchml_front 45 | ports: 46 | - containerPort: 8088 47 | resources: 48 | requests: 49 | memory: 256Mi 50 | limits: 51 | memory: 512Mi 52 | env: 53 | - name: DEBUG_MODE 54 | value: "1" 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Willian Fuks 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /kubeflow/components/prepare_env/lambdamart0/features/channel_group.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "function_score": { 4 | "query": { 5 | "match_all": {} 6 | }, 7 | "script_score" : { 8 | "script" : { 9 | "params": { 10 | "channel_group": "{{channel_group}}" 11 | }, 12 | "source": "if (params.channel_group == 'paid_search') { return doc['performances.channel.paid_search.CTR'].value * 10 } else if (params.channel_group == 'referral') { return doc['performances.channel.referral.CTR'].value * 10 } else if (params.channel_group == 'organic_search') { return doc['performances.channel.organic_search.CTR'].value * 10 } else if (params.channel_group == 'social') { return doc['performances.channel.social.CTR'].value * 10 } else if (params.channel_group == 'display') { return doc['performances.channel.display.CTR'].value * 10 } else if (params.channel_group == 'direct') { return doc['performances.channel.direct.CTR'].value * 10 } else if (params.channel_group == 'affiliates') { return doc['performances.channel.affiliates.CTR'].value * 10 }" 13 | } 14 | } 15 | } 16 | }, 17 | "params": ["channel_group"], 18 | "name": "channel_group" 19 | } 20 | -------------------------------------------------------------------------------- /bin/manage_service_account.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -eu 4 | 5 | #PROJECT_ID=$(gcloud config get-value project) 6 | PROJECT_ID=$PROJECT_ID 7 | NAME=pysearchml 8 | SECRET_NAME=pysearchml-service-account 9 | 10 | if ! [ -z "$(gcloud secrets list | grep $SECRET_NAME)" ]; then 11 | gcloud secrets versions access latest --secret=$SECRET_NAME > key.json 12 | echo Downloaded Secret 13 | fi 14 | 15 | if [ -e key.json ]; then 16 | echo File key.json already available. 17 | else 18 | echo Creating service account and downloading key... 19 | 20 | SERVICE_ACCOUNT=$(gcloud iam service-accounts list --filter NAME=$NAME) 21 | if [ -z "$SERVICE_ACCOUNT" ]; then 22 | gcloud iam service-accounts create $NAME --project $PROJECT_ID \ 23 | --display-name $NAME 24 | 25 | for ROLE in roles/editor roles/storage.admin roles/bigquery.admin roles/storage.objectAdmin; 26 | do 27 | gcloud projects add-iam-policy-binding $PROJECT_ID \ 28 | --member=serviceAccount:$NAME@$PROJECT_ID.iam.gserviceaccount.com \ 29 | --role=$ROLE 30 | done 31 | echo Created Service Account $NAME 32 | fi 33 | 34 | gcloud iam service-accounts keys create ./key.json --iam-account $NAME@$PROJECT_ID.iam.gserviceaccount.com 35 | 36 | echo Creating Secret File 37 | gcloud secrets create $SECRET_NAME --data-file=key.json --replication-policy=automatic 38 | echo New Secret File Created 39 | 40 | echo Finished downloading secrets key.json file 41 | fi 42 | -------------------------------------------------------------------------------- /kubeflow/build/manage_service_account.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #set -e 4 | 5 | #PROJECT_ID=$(gcloud config get-value project) 6 | PROJECT_ID=$PROJECT_ID 7 | NAME=pysearchml 8 | SECRET_NAME=pysearchml-service-account 9 | 10 | if ! 
[ -z "$(gcloud secrets list | grep $SECRET_NAME)" ]; then 11 | gcloud secrets versions access latest --secret=$SECRET_NAME > key.json 12 | echo Downloaded Secret 13 | fi 14 | 15 | if [ -e key.json ]; then 16 | echo File key.json already available. 17 | else 18 | echo Creating service account and downloading key... 19 | 20 | SERVICE_ACCOUNT=$(gcloud iam service-accounts list --filter NAME=$NAME) 21 | if [ -z "$SERVICE_ACCOUNT" ]; then 22 | gcloud iam service-accounts create $NAME --project $PROJECT_ID \ 23 | --display-name $NAME 24 | 25 | for ROLE in roles/editor roles/storage.admin roles/bigquery.admin roles/storage.objectAdmin; 26 | do 27 | gcloud projects add-iam-policy-binding $PROJECT_ID \ 28 | --member=serviceAccount:$NAME@$PROJECT_ID.iam.gserviceaccount.com \ 29 | --role=$ROLE 30 | done 31 | echo Created Service Account $NAME 32 | fi 33 | 34 | gcloud iam service-accounts keys create ./key.json --iam-account $NAME@$PROJECT_ID.iam.gserviceaccount.com 35 | 36 | echo Creating Secret File 37 | gcloud secrets create $SECRET_NAME --data-file=key.json --replication-policy=automatic 38 | echo New Secret File Created 39 | 40 | echo Finished downloading secrets key.json file 41 | fi 42 | -------------------------------------------------------------------------------- /kubernetes/front/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from flask import Flask, request, jsonify 4 | from jinja2 import Environment, FileSystemLoader 5 | from elasticsearch import Elasticsearch 6 | 7 | 8 | es = Elasticsearch('elasticsearch.elastic-system.svc.cluster.local:9200') 9 | app = Flask(__name__) 10 | env = Environment(loader=FileSystemLoader('/front/templates')) 11 | 12 | 13 | @app.route("/", methods=['GET', 'POST']) 14 | def index(): 15 | index_html = env.get_template('index.html').render() 16 | return index_html 17 | 18 | 19 | @app.route("/searchresults", methods=['POST']) 20 | def search(): 21 | try: 22 | args = 
request.form.to_dict() 23 | es_query = open('/front/es_query.json').read() 24 | print(args) 25 | input_query = args['search_term'] 26 | size = args.pop('size') 27 | model_name = args.pop('model_name') 28 | 29 | es_query = es_query.replace('{query}', input_query) 30 | es_query = json.loads(es_query) 31 | es_query['size'] = size 32 | es_query['_source'] = [] 33 | 34 | es_query['rescore']['window_size'] = 500 35 | es_query['rescore']['query']['rescore_query']['sltr']['params'] = args 36 | es_query['rescore']['query']['rescore_query']['sltr']['model'] = model_name 37 | 38 | if 'ltr_flag' not in args: 39 | es_query.pop('rescore') 40 | 41 | r = es.search(index='pysearchml', body=es_query).get('hits', {}).get('hits') 42 | r = [(e['_id'], e['_score']) for e in r] 43 | return jsonify(r) 44 | # return env.get_template('documents.j2').render(product_list=r) 45 | except Exception as e: 46 | return str(e) 47 | -------------------------------------------------------------------------------- /kubeflow/components/model/test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from typing import List, NamedTuple 4 | 5 | from validate import validate_model 6 | 7 | 8 | def parse_args(args: List) -> NamedTuple: 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument( 11 | '--files_path', 12 | dest='files_path', 13 | type=str, 14 | help='Path to files containing data of customers searches and their purchases' 15 | ) 16 | parser.add_argument( 17 | '--index', 18 | dest='index', 19 | type=str, 20 | default='pysearchml', 21 | help='Name of Index where documents are stored in Elasticsearch.' 22 | ) 23 | parser.add_argument( 24 | '--es_host', 25 | dest='es_host', 26 | type=str, 27 | help='Host address to reach Elasticsearch.' 28 | ) 29 | parser.add_argument( 30 | '--model_name', 31 | dest='model_name', 32 | type=str, 33 | help='Assigns a name for the RankLib model. 
Each experiment on Kubeflow should ' 34 | 'have a specific name in order to preserver their results.' 35 | ) 36 | parser.add_argument( 37 | '--es_batch', 38 | dest='es_batch', 39 | type=int, 40 | default=1000, 41 | help='Determines how many items to send at once to Elasticsearch when using ' 42 | 'multisearch API.' 43 | ) 44 | args, _ = parser.parse_known_args(args) 45 | return args 46 | 47 | 48 | if __name__ == '__main__': 49 | args = parse_args(sys.argv[1:]) 50 | test_rank = validate_model( 51 | args.files_path, 52 | args.es_host, 53 | args.model_name, 54 | args.index, 55 | args.es_batch 56 | ) 57 | print(f'Test-rank={test_rank}') 58 | -------------------------------------------------------------------------------- /kubeflow/components/model/queries/lambdamart0/es_query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "function_score": { 4 | "query": { 5 | "bool": { 6 | "must": { 7 | "bool": { 8 | "minimum_should_match": 1, 9 | "should": [ 10 | { 11 | "multi_match": { 12 | "operator": "and", 13 | "query": "{query}", 14 | "type": "cross_fields", 15 | "fields": [ 16 | "sku", 17 | "name", 18 | "category" 19 | ] 20 | } 21 | } 22 | ] 23 | } 24 | } 25 | } 26 | }, 27 | "functions": [ 28 | { 29 | "field_value_factor": { 30 | "field": "performances.global.CTR", 31 | "factor": 10, 32 | "missing": 0, 33 | "modifier": "none" 34 | } 35 | } 36 | ], 37 | "boost_mode": "sum", 38 | "score_mode": "sum" 39 | } 40 | }, 41 | "rescore": { 42 | "window_size": "{window_size}", 43 | "query": { 44 | "rescore_query": { 45 | "sltr": { 46 | "params": "{search_keys}", 47 | "model": "{model_name}" 48 | } 49 | }, 50 | "rescore_query_weight": 20, 51 | "query_weight": 0.1, 52 | "score_mode": "total" 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /kubeflow/components/model/tests/conftest.py: -------------------------------------------------------------------------------- 1 | 
@pytest.fixture
def es_response():
    """Canned Elasticsearch ``msearch`` payloads for two consecutive calls.

    The first element carries two responses, the second carries one; every
    response lists hits doc3 down to doc0, mimicking a ranked result list.
    """
    def _ranked_hits():
        return {'hits': {'hits': [{'_id': f'doc{i}'} for i in (3, 2, 1, 0)]}}

    first_call = {'responses': [_ranked_hits(), _ranked_hits()]}
    second_call = {'responses': [_ranked_hits()]}
    return [first_call, second_call]
def main(action, host, **kwargs):
    """Dispatch CLI actions for managing the pysearchml Kubeflow pipeline.

    Args:
        action: Either ``'deploy-pipeline'`` or ``'run-pipeline'``.
        host: Address of the Kubeflow Pipelines API host.
        **kwargs: ``version`` for both actions; ``experiment_name`` is also
            required for ``'run-pipeline'``.

    Raises:
        ValueError: If ``action`` is not a recognized operation.
    """
    if action == 'deploy-pipeline':
        version = kwargs.get('version')
        deploy_pipeline(host, version)
    elif action == 'run-pipeline':
        # Bug fix: run_experiment has signature (host, version, experiment_name);
        # previously only experiment_name was passed, landing in the `host` slot
        # and raising a TypeError for the missing positional arguments.
        version = kwargs.get('version')
        experiment_name = kwargs['experiment_name']
        run_experiment(host, version, experiment_name)
    else:
        raise ValueError(f'Invalid operation name: {action}.')
9 | } 10 | }, 11 | "spec": { 12 | "objective": { 13 | "type": "minimize", 14 | "objectiveMetricName": "Validation-rank", 15 | "additionalMetricNames": [ 16 | "rank" 17 | ] 18 | }, 19 | "algorithm": { 20 | "algorithmName": "bayesianoptimization" 21 | }, 22 | "parallelTrialCount": 2, 23 | "maxTrialCount": 6, 24 | "maxFailedTrialCount": 1, 25 | "parameters": [], 26 | "trialTemplate": { 27 | "goTemplate": { 28 | "rawTemplate": { 29 | "apiVersion": "batch/v1", 30 | "kind": "Job", 31 | "metadata":{ 32 | "name": "{{.Trial}}", 33 | "namespace": "{{.NameSpace}}" 34 | }, 35 | "spec": { 36 | "template": { 37 | "spec": { 38 | "restartPolicy": "Never", 39 | "containers": [ 40 | { 41 | "name": "{{.Trial}}", 42 | "image": "gcr.io/{PROJECT_ID}/model", 43 | "command": [ 44 | "python /model/train.py --train_file_path={train_file_path} --validation_files_path={validation_files_path} --validation_train_files_path={validation_train_files_path} --es_host={es_host} --destination={destination} --model_name={model_name} --ranker={ranker} {{- with .HyperParameters}} {{- range .}} {{.Name}}={{.Value}} {{- end}} {{- end}}" 45 | ], 46 | "volumeMounts": [ 47 | { 48 | "mountPath": "/data", 49 | "name": "pysearchmlpvc", 50 | "readOnly": false 51 | } 52 | ] 53 | } 54 | ], 55 | "volumes": [ 56 | { 57 | "name": "pysearchmlpvc", 58 | "persistentVolumeClaim": { 59 | "claimName": "pysearchml-nfs", 60 | "readOnly": false 61 | } 62 | } 63 | ] 64 | } 65 | } 66 | } 67 | } 68 | } 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | !kubeflow/build 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | 
wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | # json key files 133 | key.json 134 | 135 | # KFP pipelines 136 | *.tar.gz 137 | kf_deploys 138 | -------------------------------------------------------------------------------- /kubernetes/es/deploy_elasticsearch.yaml: -------------------------------------------------------------------------------- 1 | kind: Namespace 2 | apiVersion: v1 3 | metadata: 4 | name: elastic-system 5 | 6 | --- 7 | kind: Service 8 | apiVersion: v1 9 | metadata: 10 | name: elasticsearch 11 | namespace: elastic-system 12 | labels: 13 | app: elasticsearch 14 | spec: 15 | selector: 16 | app: elasticsearch 17 | clusterIP: None 18 | ports: 19 | - port: 9200 20 | name: rest 21 | - port: 9300 22 | name: inter-node 23 | 24 | --- 25 | apiVersion: apps/v1 26 | kind: StatefulSet 27 | metadata: 28 | name: es-cluster 29 | namespace: elastic-system 30 | spec: 31 | serviceName: elasticsearch 32 | replicas: 1 33 | selector: 34 | matchLabels: 35 | app: elasticsearch 36 | template: 37 | metadata: 38 | labels: 39 | app: elasticsearch 40 | spec: 41 | containers: 42 | - name: elasticsearch 43 | #image: docker.elastic.co/elasticsearch/elasticsearch:7.3.1 44 | image: willfuks/ltres:0.1 45 | resources: 46 | limits: 47 | cpu: 1 48 | ports: 49 | - containerPort: 9200 50 | name: rest 51 | protocol: TCP 52 | - containerPort: 9300 53 | name: inter-node 54 | protocol: TCP 55 | volumeMounts: 56 | - name: data 57 | mountPath: 
#!/bin/sh
# Bootstraps the pysearchml GKE cluster: creates the cluster when it does not
# already exist, then installs Kubeflow Pipelines, an NFS server, Katib,
# Elasticsearch and the front-end app. Requires $PROJECT_ID and $COMPUTE_ZONE.

CLUSTER_EXISTS=true
CLUSTER_NAME=${CLUSTER_NAME:-"pysearchml"}

echo "cluster name: ${CLUSTER_NAME}"

gcloud config set project $PROJECT_ID 2>/dev/null
gcloud config set compute/zone $COMPUTE_ZONE 2>/dev/null

if [ -z $PROJECT_ID ] || [ -z $COMPUTE_ZONE ]; then
  echo Error: Please set properly env variables PROJECT_ID and COMPUTE_ZONE
  exit 1
fi

# If `describe` fails, the cluster is assumed not to exist yet.
gcloud container clusters describe $CLUSTER_NAME 2>/dev/null || CLUSTER_EXISTS=false

if [ $CLUSTER_EXISTS = false ]; then
  gcloud container clusters create $CLUSTER_NAME \
    --enable-autoupgrade \
    --scopes cloud-platform \
    --machine-type n1-standard-2 \
    --zone=$COMPUTE_ZONE \
    --disk-size=30GB \
    --num-nodes=2

  gcloud components install kubectl
  gcloud container clusters get-credentials $CLUSTER_NAME --zone=$COMPUTE_ZONE

  # Install Kubeflow Pipelines
  export PIPELINE_VERSION=0.5.1
  kubectl apply -k "github.com/kubeflow/pipelines/manifests/kustomize/cluster-scoped-resources?ref=$PIPELINE_VERSION"
  kubectl wait --for condition=established --timeout=60s crd/applications.app.k8s.io
  kubectl apply -k "github.com/kubeflow/pipelines/manifests/kustomize/env/platform-agnostic/?ref=$PIPELINE_VERSION"
  # this step can take a while
  kubectl wait applications/pipeline -n kubeflow --for condition=Ready --timeout=1800s
  # Update namespace to contain metric collector label
  kubectl apply -f kubeflow/namespace.yaml

  # Install NFS in Kubeflow Namespace
  # https://medium.com/platformer-blog/nfs-persistent-volumes-with-kubernetes-a-case-study-ce1ed6e2c266
  gcloud compute disks create --size=10GB --zone=${COMPUTE_ZONE} pysearchml-nfs-disk
  kubectl apply -f kubeflow/nfs-server.yaml
  kubectl apply -f kubeflow/nfs-server-service.yaml
  CLUSTER_IP=$(kubectl -n kubeflow get services nfs-server -o=jsonpath='{.spec.clusterIP}')
  # Inject the NFS server's cluster IP into the PV manifest before applying it.
  sed "0,/^\([[:space:]]*server: *\).*/s//\1$CLUSTER_IP/;" kubeflow/pv-pvc.yaml | kubectl apply -f -
  #yq w -d0 kubeflow/pv-pvc.yaml 'spec.nfs.server' ${CLUSTER_IP} | kubectl apply -f -

  # Install Kustomize
  curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash

  # Install Katib (hyperparameter tuning; fixed comment typo "Kabit")
  git clone git@github.com:kubeflow/manifests.git
  ./kustomize build manifests/katib/katib-crds/base | kubectl apply -f -
  ./kustomize build manifests/katib/katib-controller/base | kubectl apply -f -

  # https://www.digitalocean.com/community/tutorials/how-to-set-up-an-elasticsearch-fluentd-and-kibana-efk-logging-stack-on-kubernetes#step-3-%E2%80%94-creating-the-kibana-deployment-and-service
  kubectl apply -f kubernetes/es/deploy_elasticsearch.yaml

  # Install the visualizer front end
  kubectl apply -f kubernetes/front/app.yaml
fi
CASE WHEN channel = 'Affiliates' THEN STRUCT(COALESCE(IF(SUM(clicks) / COALESCE(SUM(impressions), 1) > 1, 1, SUM(clicks) / COALESCE(SUM(impressions), 1)), 0) AS CTR) ELSE STRUCT(0 AS CTR) END AS affiliates, 19 | CASE WHEN channel = 'Social' THEN STRUCT(COALESCE(IF(SUM(clicks) / COALESCE(SUM(impressions), 1) > 1, 1, SUM(clicks) / COALESCE(SUM(impressions), 1)), 0) AS CTR) ELSE STRUCT(0 AS CTR) END AS social 20 | ) AS channel 21 | ) AS performances 22 | FROM( 23 | SELECT DISTINCT 24 | sku, 25 | channel, 26 | name, 27 | REGEXP_REPLACE(category, '/', ' ') AS category, 28 | SUM(impressions) OVER(PARTITION BY sku) AS global_impressions, 29 | impressions, 30 | SUM(clicks) OVER(PARTITION BY sku) AS global_clicks, 31 | clicks, 32 | AVG(price) OVER(PARTITION BY sku) AS global_price, 33 | FROM( 34 | SELECT 35 | ARRAY( 36 | SELECT AS STRUCT 37 | channelGrouping AS channel, 38 | productSku AS sku, 39 | v2ProductCategory AS category, 40 | v2ProductName AS name, 41 | SUM(CAST(isImpression AS INT64)) AS impressions, 42 | SUM(CAST(isClick AS INT64)) AS clicks, 43 | AVG(productPrice / 1e6) AS price 44 | FROM UNNEST(hits), UNNEST(product) 45 | GROUP BY channel, sku, category, name 46 | ) AS products 47 | FROM `bigquery-public-data.google_analytics_sample.ga_sessions*` 48 | WHERE TRUE 49 | AND REGEXP_EXTRACT(_TABLE_SUFFIX, r'.*_(\d+)$') BETWEEN '20160801' AND '20170801' 50 | ), UNNEST(products) 51 | ) 52 | WHERE TRUE 53 | AND global_impressions > 0 54 | GROUP BY 55 | sku, 56 | channel, 57 | name, 58 | category, 59 | global_impressions, 60 | global_clicks, 61 | global_price 62 | -------------------------------------------------------------------------------- /kubeflow/components/data/validation/run.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import pathlib 5 | import uuid 6 | from shutil import rmtree 7 | 8 | from google.cloud import storage, bigquery 9 | 10 | 11 | PATH = 
def main(validation_init_date, validation_end_date, bucket, destination):
    """Build a validation dataset: query GA data in BigQuery, export it to GCS
    as gzipped NDJSON and download the files into ``destination``.

    Args:
        validation_init_date: Start date (GA table suffix) for the query window.
        validation_end_date: End date (GA table suffix) for the query window.
        bucket: GCS path ``bucket[/prefix]`` where the export lands; the prefix
            distinguishes regular validation from train-data validation.
        destination: Local folder that receives the downloaded ``*.gz`` files.
    """
    # Remove everything and deletes destination folder to receive new files.
    rmtree(destination, ignore_errors=True)
    os.makedirs(destination, exist_ok=True)

    storage_client = storage.Client()
    bq_client = bigquery.Client()

    ds_ref = bq_client.dataset('pysearchml')

    # Random table name so concurrent runs never clash; deleted at the end.
    table_id = str(uuid.uuid4().hex)
    table_ref = ds_ref.table(table_id)

    # Query GA data
    query_path = PATH / 'validation.sql'
    query = open(str(query_path)).read()
    query = query.format(validation_init_date=validation_init_date,
                         validation_end_date=validation_end_date)

    job_config = bigquery.QueryJobConfig()
    job_config.destination = f'{bq_client.project}.pysearchml.{table_id}'
    job_config.maximum_bytes_billed = 10 * (1024 ** 3)  # hard 10 GiB cost cap
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job = bq_client.query(query, job_config=job_config)
    job.result()  # block until the query job finishes

    # export BigQuery table to GCS
    # bucket will be set in accordance to which validation dataset is referenced, i.e.,
    # whether regular validation or validation for the training dataset.
    destination_uri = f"gs://{bucket}/validation*.gz"

    extract_config = bigquery.ExtractJobConfig()
    extract_config.compression = 'GZIP'
    extract_config.destination_format = 'NEWLINE_DELIMITED_JSON'
    job = bq_client.extract_table(table_ref, destination_uri, job_config=extract_config)
    job.result()  # block until the extract job finishes

    # Download data. Split `bucket` into bucket name (before first '/') and
    # blob prefix (after it); each blob is removed from GCS once downloaded.
    bucket_obj = storage_client.bucket(bucket.split('/')[0])
    blobs = bucket_obj.list_blobs(prefix=bucket.partition('/')[-1])
    for blob in blobs:
        blob.download_to_filename(f"{destination}/{blob.name.split('/')[-1]}")
        blob.delete()

    # delete BQ table
    bq_client.delete_table(table_ref)
86 | ) 87 | 88 | args, _ = parser.parse_known_args(sys.argv[1:]) 89 | main( 90 | args.validation_init_date, 91 | args.validation_end_date, 92 | args.bucket, 93 | args.destination 94 | ) 95 | -------------------------------------------------------------------------------- /kubeflow/components/data/validation/validation.sql: -------------------------------------------------------------------------------- 1 | CREATE TEMP FUNCTION PROCESS_SKUS_PURCHASED_FROM_SEARCH(searched_skus ARRAY, purchased_skus ARRAY) RETURNS ARRAY > AS ( 2 | /** 3 | Compares list of skus from the search results and the ones purchased; returns the intersection between the two. 4 | **/ 5 | ARRAY(SELECT AS STRUCT sku, IF(EXISTS(SELECT 1 FROM UNNEST(purchased_skus) AS p_sku WHERE sku = p_sku), TRUE, FALSE) FROM UNNEST(searched_skus) AS sku) 6 | ); 7 | 8 | CREATE TEMP FUNCTION PROCESS_CHANNEL_GROUP(channelGroup STRING) RETURNS STRING AS ( 9 | REGEXP_REPLACE(LOWER(channelGroup), ' ', '_') 10 | ); 11 | 12 | WITH search_data AS( 13 | SELECT 14 | fv, 15 | channel_group, 16 | ARRAY( 17 | SELECT AS STRUCT 18 | query, 19 | ARRAY_AGG(STRUCT(skus.sku AS sku, skus.purchase_flag AS purchase_flag)) AS skus 20 | FROM UNNEST(hits), UNNEST(skus) AS skus 21 | GROUP BY query 22 | ) AS hits 23 | FROM( 24 | SELECT 25 | fv, 26 | channel_group, 27 | ARRAY( 28 | SELECT AS STRUCT 29 | query, 30 | PROCESS_SKUS_PURCHASED_FROM_SEARCH(query_skus, purchased_skus) skus 31 | FROM UNNEST(hits) 32 | ) AS hits 33 | FROM( 34 | SELECT 35 | fullvisitorid AS fv, 36 | COALESCE(PROCESS_CHANNEL_GROUP(channelGrouping), '') AS channel_group, 37 | ARRAY( 38 | SELECT AS STRUCT 39 | REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_EXTRACT(page.pagepath, r'([^\/]+)$'), r'(\+)+', ' '), r't ', 't'), r' s ?', ' '), r'\.axd', '') AS query, 40 | ARRAY_AGG(DISTINCT productSKU IGNORE NULLS) AS query_skus, 41 | FROM UNNEST(hits) LEFT JOIN UNNEST(product) 42 | WHERE productSKU != '(not set)' 43 | AND NOT REGEXP_CONTAINS(page.pagepath, 
@dsl.pipeline(
    name='Train Lambda Mart Pipeline',
    description=('Responsible for generating all datasets and optimization process for'
                 ' the chosen Ranker algorithm.')
)
def build_pipeline(
    bucket='pysearchml',
    es_host='elasticsearch.elastic-system.svc.cluster.local:9200',
    force_restart=False,
    train_init_date='20170801',
    train_end_date='20170801',
    validation_init_date='20170802',
    validation_end_date='20170802',
    model_name='lambdamart0',
    ranker='lambdamart'
):
    """Assemble the KFP DAG: prepare_env -> {train, 2x validation} -> model.

    Each step is loaded from its component.yaml; update_op_project_id_img
    substitutes $PROJECT_ID into the component's container image reference.
    """

    components_path = PATH.parent / 'components'

    # component_path = main_path / 'gcs' / 'component.yaml'
    # gs_op_ = components.load_component_from_file(str(component_path))
    # gs_op_ = update_op_project_id_img(gs_op_)
    # gs_op = gs_op_('gs://pysearchml/requirements.txt', '.').set_display_name('GS')

    component_path = components_path / 'prepare_env' / 'component.yaml'
    prepare_op_ = components.load_component_from_file(str(component_path))
    prepare_op_ = update_op_project_id_img(prepare_op_)

    prepare_op = prepare_op_(
        bucket=bucket,
        es_host=es_host,
        force_restart=force_restart,
        model_name=model_name
    ).set_display_name('Preparing Environment')

    component_path = components_path / 'data' / 'validation' / 'component.yaml'
    validation_op_ = components.load_component_from_file(str(component_path))
    validation_op_ = update_op_project_id_img(validation_op_)

    # Same validation component is instantiated twice: once over the validation
    # date window and once over the train date window.
    val_reg_op = validation_op_(
        bucket=f'{bucket}/validation/regular',
        validation_init_date=validation_init_date,
        validation_end_date=validation_end_date
    ).set_display_name('Build Regular Validation Dataset.').after(prepare_op)

    val_train_op = validation_op_(
        bucket=f'{bucket}/validation/train',
        validation_init_date=train_init_date,
        validation_end_date=train_end_date
    ).set_display_name('Build Validation Dataset of Train Data.').after(prepare_op)

    component_path = components_path / 'data' / 'train' / 'component.yaml'
    train_op_ = components.load_component_from_file(str(component_path))
    train_op_ = update_op_project_id_img(train_op_)

    train_op = train_op_(
        bucket=bucket,
        train_init_date=train_init_date,
        train_end_date=train_end_date,
        es_host=es_host,
        model_name=model_name
    ).set_display_name('Build Train RankLib Dataset.').after(prepare_op)

    component_path = components_path / 'model' / 'component.yaml'
    model_op_ = components.load_component_from_file(str(component_path))
    model_op_ = update_op_project_id_img(model_op_)

    # Katib optimization step consumes the file paths produced by the three
    # dataset steps, hence the explicit .after(...) fan-in.
    model_op = model_op_(
        name='lambdamart',
        train_file_path=train_op.outputs['destination'],
        validation_files_path=val_reg_op.outputs['destination'],
        validation_train_files_path=val_train_op.outputs['destination'],
        es_host=es_host,
        model_name=model_name,
        ranker=ranker
    ).set_display_name('Launch Katib Optimization').after(val_reg_op,
                                                          val_train_op,
                                                          train_op)

    # Final step just echoes the model step's output for visibility in the UI.
    _ = dsl.ContainerOp(
        name="my-out-cop",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["echo hyperparameter: %s" % model_op.output],
    )
position, MAX(IF(purchase = 1, position, NULL)) OVER() AS max_position FROM UNNEST(session_docs) ORDER BY position) AS session_docs 25 | FROM UNNEST(hits) 26 | WHERE EXISTS(SELECT 1 FROM UNNEST(session_docs) WHERE click = 1) AND (SELECT SUM(purchase) FROM UNNEST(session_docs)) <= 1 27 | ) AS hits 28 | FROM( 29 | SELECT 30 | fv, 31 | ARRAY( 32 | SELECT AS STRUCT 33 | query, 34 | ARRAY_AGG(STRUCT(h.doc AS doc, IF(purchase = 1, 1, click) AS click, purchase, position)) AS session_docs 35 | FROM UNNEST(hits) AS h 36 | GROUP BY query 37 | ) AS hits 38 | FROM( 39 | SELECT 40 | fv, 41 | ARRAY( 42 | SELECT AS STRUCT 43 | query, 44 | doc, 45 | MAX(click) AS click, 46 | MAX(IF(EXISTS(SELECT 1 FROM UNNEST(purchased_docs) AS purchased_doc where purchased_doc = doc), 1, 0)) AS purchase, 47 | MIN(position) AS position 48 | FROM UNNEST(hits) 49 | GROUP BY query, doc 50 | ) AS hits 51 | FROM( 52 | SELECT 53 | fullvisitorid as fv, 54 | ARRAY( 55 | SELECT AS STRUCT 56 | REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_EXTRACT(page.pagepath, r'([^\/]+)$'), r'(\+)+', ' '), r't ', 't'), r' s ?', ' '), r'\.axd', '') AS query, 57 | productSKU AS doc, 58 | IF(isClick, 1, 0) AS click, 59 | ROW_NUMBER() OVER() AS position 60 | FROM UNNEST(hits) LEFT JOIN UNNEST(product) 61 | WHERE TRUE 62 | AND productSKU != '(not set)' 63 | AND NOT REGEXP_CONTAINS(page.pagepath, r'\.html|home') AND REGEXP_CONTAINS(page.pagepath, r'google\+redesign') 64 | 65 | ) AS hits, 66 | ARRAY(SELECT productSKU FROM UNNEST(hits), UNNEST(product) WHERE ecommerceAction.action_type = '6') AS purchased_docs 67 | FROM `bigquery-public-data.google_analytics_sample.ga_sessions*` 68 | WHERE TRUE 69 | AND REGEXP_EXTRACT(_TABLE_SUFFIX, r'.*_(\d+)$') BETWEEN '{train_init_date}' AND '{train_end_date}' 70 | ) 71 | ) 72 | ) 73 | ) 74 | ), 75 | customer_data AS( 76 | SELECT 77 | fv, 78 | channel_group, 79 | CAST(COALESCE((SELECT AVG(avg_ticket) FROM UNNEST(ticket_array) AS avg_ticket), 0) AS INT64) AS avg_ticket 80 | 
def test_train_model(monkeypatch, tmpdir_factory):
    """Exercise train.main three times and assert best-model bookkeeping.

    RankLib invocation (os.system), Elasticsearch posting and the validator
    pool are all mocked; each run writes a fake model file and the mocked
    Pool.map yields [rank_val, rank_train]. The test checks that results.txt
    accumulates every run while best_rank/best_model only improve.
    """
    post_mock = mock.Mock()
    os_system_mock = mock.Mock()

    tmp_folder = tmpdir_factory.mktemp('unittest')

    def write_file(tmp_folder: str, context: int):
        # Simulates RankLib saving a model; `context` tags the file content so
        # assertions can tell which "model" is currently on disk.
        os.makedirs(str(tmp_folder), exist_ok=True)
        with open(f'{tmp_folder}/model.txt', 'w') as f:
            f.write(f'model definition: {context}')

    # write_file returns None; calling it here is for its side effect only.
    os_system_mock.return_value = write_file(str(tmp_folder), 1)

    datetime_mock = mock.Mock()
    datetime_mock.today.return_value.strftime.return_value = 'todays date'

    partial_mock = mock.Mock()
    partial_mock.return_value = 'partial function'

    pool_mock = mock.Mock()
    # One [rank_val, rank_train] pair per main() invocation below.
    pool_mock.return_value.map.side_effect = [
        [0.3, 0.2],
        [0.2, 0.1],
        [0.4, 0.3]
    ]

    monkeypatch.setattr('train.post_model_to_elasticsearch', post_mock)
    monkeypatch.setattr('train.os.system', os_system_mock)
    monkeypatch.setattr('train.get_partiated_validator', partial_mock)
    monkeypatch.setattr('train.Pool', pool_mock)
    monkeypatch.setattr('train.datetime', datetime_mock)

    args = namedtuple(
        'args',
        [
            'train_file_path',
            'validation_files_path',
            'validation_train_files_path',
            'es_host',
            'model_name',
            'es_batch',  # bug fix: missing comma used to fuse this with the
            'destination',  # next entry into a single 'es_batchdestination' field
            'ranker',
            'index'
        ]
    )
    args.train_file_path = '/test/train_dataset.txt'
    args.validation_files_path = '/validation/regular'
    args.validation_train_files_path = '/validation/train'
    args.es_host = 'es_host_test'
    args.model_name = 'unittest'
    args.es_batch = 2
    args.destination = str(tmp_folder)
    args.ranker = 'lambdamart'
    args.index = 'index_test'

    X = ['--var1 val1 --var2 val2']

    main(args, X)
    expected_call = (
        'java -jar ranklib/RankLib-2.14.jar -ranker 6 -train '
        f'/test/train_dataset.txt -norm sum -save {str(tmp_folder)}/model.txt '
        '-var1 val1 -var2 val2 -metric2t ERR'
    )
    os_system_mock.assert_any_call(expected_call)
    post_mock.assert_any_call('es_host_test', 'unittest',
                              f'{args.destination}/model.txt')
    partial_mock.assert_any_call('es_host_test', 'index_test', 'unittest', 2)
    data = open(f'{args.destination}/results.txt').read()
    assert data == 'todays date,--var1 val1 --var2 val2,rank_train=0.2,rank_val=0.3\n'
    data = open(f'{args.destination}/best_rank.txt').read()
    assert data == '0.3'
    data = open(f'{args.destination}/best_model.txt').read()
    assert data == 'model definition: 1'

    # Test if new best model gets replaced
    os_system_mock.return_value = write_file(str(tmp_folder), 2)
    main(args, X)
    data = open(f'{args.destination}/results.txt').read()
    assert data == (
        'todays date,--var1 val1 --var2 val2,rank_train=0.2,rank_val=0.3\n'
        'todays date,--var1 val1 --var2 val2,rank_train=0.1,rank_val=0.2\n'
    )
    data = open(f'{args.destination}/best_rank.txt').read()
    assert data == '0.2'
    data = open(f'{args.destination}/best_model.txt').read()
    assert data == 'model definition: 2'

    # Test if new worse model is ignored
    os_system_mock.return_value = write_file(str(tmp_folder), 3)
    main(args, X)
    data = open(f'{args.destination}/results.txt').read()
    assert data == (
        'todays date,--var1 val1 --var2 val2,rank_train=0.2,rank_val=0.3\n'
        'todays date,--var1 val1 --var2 val2,rank_train=0.1,rank_val=0.2\n'
        'todays date,--var1 val1 --var2 val2,rank_train=0.3,rank_val=0.4\n'
    )
    data = open(f'{args.destination}/best_rank.txt').read()
    assert data == '0.2'
    data = open(f'{args.destination}/best_model.txt').read()
    assert data == 'model definition: 2'
"properties": { 42 | "CTR": { 43 | "type": "float" 44 | } 45 | } 46 | }, 47 | "direct": { 48 | "type": "object", 49 | "properties": { 50 | "CTR": { 51 | "type": "float" 52 | } 53 | } 54 | }, 55 | "referral": { 56 | "type": "object", 57 | "properties": { 58 | "CTR": { 59 | "type": "float" 60 | } 61 | } 62 | }, 63 | "paid_search": { 64 | "type": "object", 65 | "properties": { 66 | "CTR": { 67 | "type": "float" 68 | } 69 | } 70 | }, 71 | "display": { 72 | "type": "object", 73 | "properties": { 74 | "CTR": { 75 | "type": "float" 76 | } 77 | } 78 | }, 79 | "affiliates": { 80 | "type": "object", 81 | "properties": { 82 | "CTR": { 83 | "type": "float" 84 | } 85 | } 86 | }, 87 | "social": { 88 | "type": "object", 89 | "properties": { 90 | "CTR": { 91 | "type": "float" 92 | } 93 | } 94 | } 95 | } 96 | } 97 | } 98 | } 99 | } 100 | } 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /kubeflow/build/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | 3 | # Transfer secret keys from GCP to Cloud Build environemnt 4 | - name: gcr.io/cloud-builders/gcloud 5 | entrypoint: 'bash' 6 | args: 7 | - '-c' 8 | - | 9 | gcloud secrets versions access latest --secret=pysearchml-git-secret > /root/.ssh/id_github 10 | gcloud secrets versions access latest --secret=pysearchml-service-account > key.json 11 | volumes: 12 | - name: 'ssh' 13 | path: /root/.ssh 14 | id: 'Get Secret Keys' 15 | 16 | # Update known_hosts 17 | - name: 'gcr.io/cloud-builders/git' 18 | entrypoint: 'bash' 19 | args: 20 | - '-c' 21 | - | 22 | chmod 600 /root/.ssh/id_github 23 | ssh-keyscan -t rsa github.com >> /root/.ssh/known_hosts 24 | cat </root/.ssh/config 25 | Hostname github.com 26 | IdentityFile /root/.ssh/id_github 27 | EOF 28 | volumes: 29 | - name: 'ssh' 30 | path: /root/.ssh 31 | id: 'Prepare Git known_hosts' 32 | 33 | # Clones Repository And Copies Service Account Key 34 | - name: 
'gcr.io/cloud-builders/git' 35 | entrypoint: 'bash' 36 | args: 37 | - '-c' 38 | - | 39 | git clone git@github.com:WillianFuks/pySearchML.git 40 | cp key.json pySearchML/ 41 | volumes: 42 | - name: 'ssh' 43 | path: /root/.ssh 44 | id: 'Clone Git Repo' 45 | 46 | # Build KFP Cluster 47 | - name: 'gcr.io/cloud-builders/gcloud' 48 | entrypoint: 'bash' 49 | args: 50 | - '-c' 51 | - | 52 | ./bin/create_k8s.sh 53 | dir: 'pySearchML' 54 | env: 55 | - 'PROJECT_ID=$PROJECT_ID' 56 | - 'CLUSTER_NAME=${_CLUSTER_NAME}' 57 | - 'COMPUTE_ZONE=${_COMPUTE_ZONE}' 58 | id: 'Build KFP Cluster' 59 | volumes: 60 | - name: 'ssh' 61 | path: /root/.ssh 62 | 63 | # Build Docker Images 64 | - name: 'gcr.io/cloud-builders/docker' 65 | entrypoint: 'bash' 66 | args: 67 | - '-c' 68 | - | 69 | docker build -t gcr.io/$PROJECT_ID/prepare_env -f kubeflow/components/prepare_env/Dockerfile . 70 | docker build -t gcr.io/$PROJECT_ID/pipelines -f kubeflow/pipelines/Dockerfile . 71 | docker build -t gcr.io/$PROJECT_ID/data_validation -f kubeflow/components/data/validation/Dockerfile . 72 | docker build -t gcr.io/$PROJECT_ID/data_train -f kubeflow/components/data/train/Dockerfile . 73 | docker build -t gcr.io/$PROJECT_ID/model -f kubeflow/components/model/Dockerfile . --build-arg PROJECT_ID=$PROJECT_ID 74 | docker push gcr.io/$PROJECT_ID/prepare_env 75 | docker push gcr.io/$PROJECT_ID/pipelines 76 | docker push gcr.io/$PROJECT_ID/data_validation 77 | docker push gcr.io/$PROJECT_ID/data_train 78 | docker push gcr.io/$PROJECT_ID/model 79 | dir: 'pySearchML' 80 | id: 'Build Docker Images' 81 | waitFor: ['Clone Git Repo'] 82 | 83 | # Unit Test Data Train 84 | - name: 'gcr.io/$PROJECT_ID/data_train' 85 | args: 86 | - export PYTHONPATH=. 87 | - pytest 88 | id: 'Unittest Data Train' 89 | waitFor: ['Clone Git Repo', 'Build Docker Images'] 90 | 91 | # Unit Test Train Model 92 | - name: 'gcr.io/$PROJECT_ID/model' 93 | args: 94 | - export PYTHONPATH=. 
import mock
import json
import gzip
import os
import subprocess
import shutil
from collections import namedtuple

from run import main


def test_main(monkeypatch, es_log_features, tmpdir_factory):
    """
    End-to-end test of ``run.main``: mocks the data download, the DBN click
    model and the Elasticsearch client, then asserts the generated judgments
    file, the msearch bodies sent to ES and the final RankLib train dataset.
    """
    shutil.rmtree('/tmp/pysearchml/unittest', ignore_errors=True)
    clickmodel_path = '/tmp/pysearchml/unittest/model'
    os.makedirs(clickmodel_path)
    tmp_dir = tmpdir_factory.mktemp('unittest')

    args = namedtuple(
        'args',
        [
            'train_init_date',
            'train_end_date',
            'bucket',
            'es_host',
            'es_batch',
            'destination',
            'model_name',
            'index'
        ]
    )
    # FIX: the original assigned `args.init_day_train` / `args.end_day_train`,
    # names that do not exist in the declared field list above; assign the
    # declared `train_init_date` / `train_end_date` instead.
    args.train_init_date = '20200101'
    args.train_end_date = '20200101'
    args.bucket = 'gcp_bucket'
    args.index = 'index_test'
    args.es_host = 'es_host_test'
    args.model_name = 'unittest'
    args.es_batch = 2
    args.destination = str(tmp_dir)

    download_mock = mock.Mock()

    class MockModel:
        # Stand-in for the DBN click model: fit is a no-op and
        # export_judgments just copies a pre-built fixture file.
        def fit(self, *args, **kwargs):
            return self

        def export_judgments(self, model_path: str):
            subprocess.call(
                [f'cp -r tests/fixtures/model.gz {model_path}'],
                stdout=subprocess.PIPE,
                shell=True
            )

    dbn_mock = mock.Mock()
    dbn_mock.DBNModel.return_value = MockModel()
    es_client_mock = mock.Mock()

    monkeypatch.setattr('run.download_data', download_mock)
    monkeypatch.setattr('run.DBN', dbn_mock)
    monkeypatch.setattr('run.Elasticsearch', es_client_mock)
    es_client_mock.msearch.side_effect = es_log_features

    main(args, es_client_mock)
    download_mock.assert_called_with(args)
    data_reader = gzip.GzipFile('/tmp/pysearchml/unittest/judgments/judgments.gz', 'rb')
    data = json.loads(data_reader.readline())
    expected = {
        "search_keys": {"search_term": "keyword0", "var1": "val1"},
        "judgment_keys": [
            {"doc": "doc0", "judgment": 0},
            {"doc": "doc1", "judgment": 4},
            {"doc": "doc2", "judgment": 2}
        ]
    }
    assert expected == data

    data = json.loads(data_reader.readline())
    expected = {
        "search_keys": {"search_term": "keyword1", "var1": "val1"},
        "judgment_keys": [
            {"doc": "doc1", "judgment": 0},
            {"doc": "doc2", "judgment": 4}
        ]
    }
    assert expected == data

    data = json.loads(data_reader.readline())
    expected = {
        "search_keys": {"search_term": "keyword2", "var1": "val1"},
        "judgment_keys": [
            {"doc": "doc3", "judgment": 0},
            {"doc": "doc4", "judgment": 4}
        ]
    }
    assert expected == data

    body1 = (
        '{"index": "index_test"}\n{"query": {"bool": {"filter": [{"terms": {"_id": '
        '["doc0", "doc1", "doc2"]}}], "should": [{"sltr": {"_name": "logged_featureset"'
        ', "featureset": "unittest", "params": {"search_term": "keyword0", "var1": '
        '"val1"}}}]}}, "_source": ["_id"], "ext": {"ltr_log": {"log_specs": {"name": '
        '"main", "named_query": "logged_featureset"}}}}\n{"index": "index_test"}\n{"'
        'query": {"bool": {"filter": [{"terms": {"_id": ["doc1", "doc2"]}}], "should": '
        '[{"sltr": {"_name": "logged_featureset", "featureset": "unittest", "params": '
        '{"search_term": "keyword1", "var1": "val1"}}}]}}, "_source": ["_id"], "ext": '
        '{"ltr_log": {"log_specs": {"name": "main", "named_query": "logged_featureset"}'
        '}}}'
    )
    body2 = (
        '{"index": "index_test"}\n{"query": {"bool": {"filter": [{"terms": {"_id": ["'
        'doc3", "doc4"]}}], "should": [{"sltr": {"_name": "logged_featureset", "'
        'featureset": "unittest", "params": {"search_term": "keyword2", "var1": "val1"'
        '}}}]}}, "_source": ["_id"], "ext": {"ltr_log": {"log_specs": {"name": "main", '
        '"named_query": "logged_featureset"}}}}'
    )
    es_client_mock.msearch.assert_any_call(body=body1, request_timeout=60)
    es_client_mock.msearch.assert_any_call(body=body2, request_timeout=60)

    rank_data = open(f'{str(tmp_dir)}/train_dataset.txt').read()

    expected = (
        '0\tqid:0\t1:0.01\t2:0.02\n4\tqid:0\t1:0.03\t2:0.04\n2\tqid:0\t1:0.05\t'
        '2:0.06\n0\tqid:1\t1:0.03\t2:0.04\n4\tqid:1\t1:0.05\t2:0\n'
    )

    assert rank_data == expected
    shutil.rmtree('/tmp/pysearchml/unittest', ignore_errors=True)
# Copyright 2019 kubeflow.org.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
This script uses as reference the following code:

https://github.com/kubeflow/pipelines/blob/848fbe0bceb786c8db72e88b3bc986d42ac768b9/components/kubeflow/common/launch_crd.py
"""

import datetime
import time

from kubernetes import client as k8s_client
from kubernetes.client import rest


class K8sCR(object):
    """Thin helper around the Kubernetes CustomObjectsApi for one CRD type
    (identified by its API group, plural name and version)."""

    def __init__(self, group, plural, version, client):
        self.group = group
        self.plural = plural
        self.version = version
        self.client = k8s_client.CustomObjectsApi(client)

    def wait_for_condition(self,
                           namespace,
                           name,
                           expected_conditions=[],
                           timeout=datetime.timedelta(days=365),
                           polling_interval=datetime.timedelta(seconds=30),
                           status_callback=None):
        """Waits until any of the specified conditions occur.
        Args:
          namespace: namespace for the CR.
          name: Name of the CR.
          expected_conditions: A list of conditions. Function waits until any of the
            supplied conditions is reached.
          timeout: How long to wait for the CR.
          polling_interval: How often to poll for the status of the CR.
          status_callback: (Optional): Callable. If supplied this callable is
            invoked after we poll the CR. Callable takes a single argument which
            is the CR.
        """
        end_time = datetime.datetime.now() + timeout
        while True:
            try:
                results = self.client.get_namespaced_custom_object(
                    self.group, self.version, namespace, self.plural, name)
            except Exception as e:
                print("There was a problem waiting for %s/%s %s in namespace %s; "
                      "Exception: %s" % (self.group, self.plural, name, namespace, e))
                raise

            if results:
                if status_callback:
                    status_callback(results)
                expected, condition = self.is_expected_conditions(
                    results, expected_conditions)
                if expected:
                    print("%s/%s %s in namespace %s has reached the expected condition: "
                          "%s." % (self.group, self.plural, name, namespace, condition))
                    return results
                else:
                    if condition:
                        print("Current condition of %s/%s %s in namespace %s is %s." % (
                            self.group, self.plural, name, namespace, condition))

            # Give up once one more polling round would overrun the deadline.
            if datetime.datetime.now() + polling_interval > end_time:
                raise Exception(
                    "Timeout waiting for {0}/{1} {2} in namespace {3} to enter one of "
                    "the conditions {4}.".format(
                        self.group, self.plural, name, namespace, expected_conditions))

            time.sleep(polling_interval.seconds)

    def is_expected_conditions(self, cr_object, expected_conditions):
        # Base implementation never matches; subclasses inspect the CR status.
        return False, ""

    def create(self, spec):
        """Create a CR.
        Args:
          spec: The spec for the CR.
        Raises:
          rest.ApiException: if the API server rejects the creation.
        """
        try:
            # Create a Resource
            namespace = spec["metadata"].get("namespace", "default")
            print("Creating %s/%s %s in namespace %s." % (
                self.group, self.plural, spec["metadata"]["name"], namespace))
            api_response = self.client.create_namespaced_custom_object(
                self.group, self.version, namespace, self.plural, spec)
            print("Created %s/%s %s in namespace %s." % (
                self.group, self.plural, spec["metadata"]["name"], namespace))
            return api_response
        except rest.ApiException as e:
            # FIX: the original printed the error and silently returned None,
            # so callers could not tell creation failed; re-raise instead.
            print("Exception when creating %s/%s: %s" % (self.group, self.plural, e))
            raise

    def delete(self, name, namespace):
        """Delete a CR.
        Args:
          name: Name of the CR.
          namespace: Namespace the CR lives in.
        Raises:
          rest.ApiException: if the API server rejects the deletion.
        """
        try:
            body = {
                # Set garbage collection so that CR won't be deleted until all
                # owned references are deleted.
                "propagationPolicy": "Foreground",
            }
            print("Deleting %s/%s %s in namespace %s." % (
                self.group, self.plural, name, namespace))
            api_response = self.client.delete_namespaced_custom_object(
                group=self.group,
                version=self.version,
                namespace=namespace,
                plural=self.plural,
                name=name,
                body=body)
            print("Deleted %s/%s %s in namespace %s." % (
                self.group, self.plural, name, namespace))
            return api_response
        except rest.ApiException as e:
            # FIX: the original fell through to a print referencing the
            # undefined names `action` and `ex`, raising NameError instead of
            # reporting the API error; log the real exception and re-raise.
            print("Exception when deleting %s/%s: %s" % (self.group, self.plural, e))
            raise
@dsl.pipeline(
    name='Train Lambda Mart Pipeline',
    description=('Responsible for generating all datasets and optimization process for'
                 ' the chosen Ranker algorithm.')
)
def build_pipeline(
    bucket='pysearchml',
    es_host='elasticsearch.elastic-system.svc.cluster.local:9200',
    force_restart=False,
    train_init_date='20160801',
    train_end_date='20160801',
    validation_init_date='20160802',
    validation_end_date='20160802',
    test_init_date='20160803',
    test_end_date='20160803',
    model_name='lambdamart0',
    ranker='lambdamart',
    index='pysearchml'
):
    """
    Kubeflow pipeline: prepares the Elasticsearch environment, builds the
    training dataset and the regular/train/test validation datasets, runs the
    Katib optimization process, posts the best RankLib model to Elasticsearch
    and finally evaluates it against the test dataset. All steps share the
    `pysearchml-nfs` volume mounted at /data.
    """
    pvc = dsl.PipelineVolume(pvc='pysearchml-nfs')

    prepare_op = dsl.ContainerOp(
        name='prepare env',
        image=f'gcr.io/{PROJECT_ID}/prepare_env',
        arguments=[
            f'--force_restart={force_restart}',
            f'--es_host={es_host}',
            f'--bucket={bucket}',
            f'--model_name={model_name}'
        ],
        pvolumes={'/data': pvc}
    )

    val_reg_dataset_op = dsl.ContainerOp(
        name='validation regular dataset',
        image=f'gcr.io/{PROJECT_ID}/data_validation',
        arguments=[
            f'--bucket={bucket}/validation/regular',
            f'--validation_init_date={validation_init_date}',
            f'--validation_end_date={validation_end_date}',
            f'--destination=/data/pysearchml/{model_name}/validation_regular'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Regular Validation Dataset').after(prepare_op)

    val_train_dataset_op = dsl.ContainerOp(
        name='validation train dataset',
        image=f'gcr.io/{PROJECT_ID}/data_validation',
        arguments=[
            f'--bucket={bucket}/validation/train',
            f'--validation_init_date={train_init_date}',
            f'--validation_end_date={train_end_date}',
            f'--destination=/data/pysearchml/{model_name}/validation_train'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Train Validation Dataset').after(prepare_op)

    val_test_dataset_op = dsl.ContainerOp(
        name='validation test dataset',
        image=f'gcr.io/{PROJECT_ID}/data_validation',
        arguments=[
            f'--bucket={bucket}/validation/test',
            f'--validation_init_date={test_init_date}',
            f'--validation_end_date={test_end_date}',
            f'--destination=/data/pysearchml/{model_name}/validation_test'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Test Validation Dataset').after(prepare_op)

    train_dataset_op = dsl.ContainerOp(
        name='train dataset',
        image=f'gcr.io/{PROJECT_ID}/data_train',
        command=['python', '/train/run.py'],
        arguments=[
            f'--bucket={bucket}',
            f'--train_init_date={train_init_date}',
            f'--train_end_date={train_end_date}',
            f'--es_host={es_host}',
            f'--model_name={model_name}',
            f'--index={index}',
            f'--destination=/data/pysearchml/{model_name}/train'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Training Dataset').after(prepare_op)

    katib_op = dsl.ContainerOp(
        name='pySearchML Bayesian Optimization Model',
        image=f'gcr.io/{PROJECT_ID}/model',
        command=['python', '/model/launch_katib.py'],
        arguments=[
            f'--es_host={es_host}',
            f'--model_name={model_name}',
            f'--ranker={ranker}',
            '--name=pysearchml',
            f'--train_file_path=/data/pysearchml/{model_name}/train/train_dataset.txt',
            f'--validation_files_path=/data/pysearchml/{model_name}/validation_regular',
            # FIX: this was a plain (non-f) string, so the literal text
            # '{model_name}' was passed instead of the interpolated model name,
            # pointing Katib at a nonexistent validation path.
            ('--validation_train_files_path='
             f'/data/pysearchml/{model_name}/validation_train'),
            f'--destination=/data/pysearchml/{model_name}/'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Katib Optimization Process').after(
        val_reg_dataset_op, val_train_dataset_op, val_test_dataset_op, train_dataset_op
    )

    post_model_op = dsl.ContainerOp(
        name='Post Best RankLib Model to ES',
        image=f'gcr.io/{PROJECT_ID}/model',
        command=['python', '/model/post_model.py'],
        arguments=[
            f'--es_host={es_host}',
            f'--model_name={model_name}',
            f'--destination=/data/pysearchml/{model_name}/best_model.txt'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Post RankLib Model to ES').after(katib_op)

    _ = dsl.ContainerOp(
        name='Test Model',
        image=f'gcr.io/{PROJECT_ID}/model',
        command=['python', '/model/test.py'],
        arguments=[
            f'--files_path=/data/pysearchml/{model_name}/validation_test',
            f'--index={index}',
            f'--es_host={es_host}',
            f'--model_name={model_name}',
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Run Test Step').after(post_model_op)
import os
import argparse
import json
import uuid
import pathlib
from typing import Dict, List, Any

import launch_crd
from kubernetes import client as k8s_client
from kubernetes import config


# https://github.com/kubeflow/pipelines/tree/de2e0f2ec0edc1afd16ad79d8cd9719d1b01cb1f/components/kubeflow/katib-launcher


PATH = pathlib.Path(__file__).parent


def get_ranker_parameters(ranker: str) -> List[Dict[str, Any]]:
    """Return the Katib hyperparameter search-space definition for `ranker`,
    or None when the ranker is unknown."""
    return {
        'lambdamart': [
            {
                "name": "--tree",
                "parameterType": "int",
                "feasibleSpace": {
                    "min": "1",
                    "max": "500"
                }
            },
            {
                "name": "--leaf",
                "parameterType": "int",
                "feasibleSpace": {
                    "min": "2",
                    "max": "40"
                }
            },
            {
                "name": "--shrinkage",
                "parameterType": "double",
                "feasibleSpace": {
                    "min": "0.01",
                    "max": "0.2"
                }
            },
            {
                "name": "--tc",
                "parameterType": "int",
                "feasibleSpace": {
                    "min": "-1",
                    "max": "300"
                }
            },
            {
                "name": "--mls",
                "parameterType": "int",
                "feasibleSpace": {
                    "min": "1",
                    "max": "10"
                }
            }
        ]
    }.get(ranker)


class Experiment(launch_crd.K8sCR):
    """Katib Experiment custom resource (kubeflow.org/v1alpha3)."""

    def __init__(self, client=None):
        super().__init__('kubeflow.org', 'experiments', 'v1alpha3', client)

    def is_expected_conditions(self, instance, expected_conditions):
        # Katib appends conditions over time; only the latest one decides the
        # experiment's current state.
        conditions = instance.get('status', {}).get('conditions')
        if not conditions:
            return False, ''
        if conditions[-1]['type'] in expected_conditions:
            return True, conditions[-1]['type']
        else:
            return False, conditions[-1]['type']


def main(argv=None):
    """
    Builds a Katib Experiment from experiment.json (interpolating the CLI
    arguments into the trial template), launches it, waits for it to finish,
    prints the best trial found and finally deletes the experiment.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--name',
        dest='name',
        type=str,
        help='Experiment name.'
    )
    parser.add_argument(
        '--destination',
        dest='destination',
        type=str,
        help='The file which stores the best trial of the experiment.'
    )
    parser.add_argument(
        '--train_file_path',
        dest='train_file_path',
        type=str,
        help='Location where training data is located.'
    )
    parser.add_argument(
        '--validation_files_path',
        dest='validation_files_path',
        type=str,
        help='Location where validation data is located.'
    )
    parser.add_argument(
        '--validation_train_files_path',
        dest='validation_train_files_path',
        type=str,
        help='Location where validation of training data is located.'
    )
    parser.add_argument(
        '--es_host',
        dest='es_host',
        type=str,
        help='Name host of Elasticsearch.'
    )
    parser.add_argument(
        '--model_name',
        dest='model_name',
        type=str,
        help='Name of feature set saved in Elasticsearch.'
    )
    parser.add_argument(
        '--ranker',
        dest='ranker',
        type=str,
        # FIX: typo "algorith" in user-facing help text.
        help='RankLib algorithm to use.'
    )

    args = parser.parse_args()

    # Remove leftovers from previous runs so the trials start fresh.
    files = [f'{args.destination}/best_rank.txt', f'{args.destination}/best_model.txt']
    for file_ in files:
        if os.path.isfile(file_):
            os.remove(file_)

    exp_json_file = PATH / 'experiment.json'
    exp_def = json.loads(open(str(exp_json_file)).read())

    # FIX: str.replace raises a cryptic TypeError when PROJECT_ID is unset;
    # fail early with a clear message instead.
    project_id = os.getenv('PROJECT_ID')
    if not project_id:
        raise RuntimeError('The PROJECT_ID environment variable must be set.')

    raw_template = json.dumps(
        exp_def['spec']['trialTemplate']['goTemplate']['rawTemplate']
    )
    raw_template = raw_template\
        .replace('{PROJECT_ID}', project_id)\
        .replace('{train_file_path}', args.train_file_path)\
        .replace('{validation_files_path}', args.validation_files_path)\
        .replace('{validation_train_files_path}', args.validation_train_files_path)\
        .replace('{es_host}', args.es_host)\
        .replace('{destination}', args.destination)\
        .replace('{model_name}', args.model_name)\
        .replace('{ranker}', args.ranker)

    exp_def['spec']['trialTemplate']['goTemplate']['rawTemplate'] = raw_template

    config.load_incluster_config()
    api_client = k8s_client.ApiClient()
    experiment = Experiment(client=api_client)
    # Unique per run; truncated because K8s object names are length-limited.
    exp_name = f'{args.name}-{uuid.uuid4().hex}'[:33]

    exp_def['spec']['parameters'] = get_ranker_parameters(args.ranker)
    exp_def['metadata']['name'] = exp_name
    print('this is exp_def: ', json.dumps(exp_def))

    create_response = experiment.create(exp_def)
    print('create response: ', create_response)

    expected_conditions = ["Succeeded", "Failed"]
    current_exp = experiment.wait_for_condition('kubeflow', exp_name,
                                                expected_conditions)
    print('current_exp: ', json.dumps(current_exp))

    expected, _ = experiment.is_expected_conditions(current_exp, ["Succeeded"])

    if expected:
        best_rank = current_exp["status"]["currentOptimalTrial"]["observation"][
            'metrics'][0]['value']
        print('Best Rank Found: ', best_rank)
        params = current_exp["status"]["currentOptimalTrial"]["parameterAssignments"]
        print(json.dumps(params))
        os.makedirs(os.path.dirname(args.destination), exist_ok=True)
        if os.path.isfile(args.destination):
            os.remove(args.destination)

    experiment.delete(exp_name, 'kubeflow')


if __name__ == "__main__":
    main()
54 | """ 55 | host = f'http://{es_host}' 56 | feature_store_url = urljoin(host, '_ltr') 57 | requests.delete(feature_store_url) 58 | requests.put(feature_store_url) 59 | 60 | 61 | def create_feature_set(es_host: str, model_name: str) -> None: 62 | """ 63 | Defines each feature that should be used for the RankLib model. It's expected the 64 | features will be available at a specific path when this script runs (this is 65 | accomplished by running previous steps on Kubeflow that prepares this data). 66 | 67 | Args 68 | ---- 69 | es_host: str 70 | Hostname of Elasticsearch. 71 | model_name: str 72 | Name that specificies current experiment in Kubeflow. 73 | """ 74 | features_path = PATH / f'{model_name}' / 'features' 75 | feature_set = { 76 | 'featureset': { 77 | 'name': model_name, 78 | 'features': [process_feature_file(str(filename)) for filename in 79 | features_path.glob('*')] 80 | } 81 | } 82 | post_feature_set(feature_set, model_name, es_host) 83 | 84 | 85 | def post_feature_set( 86 | feature_set: Dict[str, Any], 87 | model_name: str, 88 | es_host: str 89 | ) -> None: 90 | """ 91 | POST feature definition to Elasticsearch under the name of `model_name`. 92 | 93 | Args 94 | ---- 95 | feature_set: Dict[str, Any] 96 | Definition of features to be stored on Elasticsearch. 97 | model_name: str 98 | Defined for each Kubeflow experiment. 99 | es_host: str 100 | Hostname where Elasticsearch is located. 
def post_feature_set(
    feature_set: Dict[str, Any],
    model_name: str,
    es_host: str
) -> None:
    """
    POST feature definition to Elasticsearch under the name of `model_name`.

    Args
    ----
    feature_set: Dict[str, Any]
        Definition of features to be stored on Elasticsearch.
    model_name: str
        Defined for each Kubeflow experiment.
    es_host: str
        Hostname where Elasticsearch is located.

    Raises
    ------
    Exception
        If Elasticsearch answers with a non-2xx status code.
    """
    host = f'http://{es_host}'
    url = f'_ltr/_featureset/{model_name}'
    url = urljoin(host, url)
    header = {'Content-Type': 'application/json'}
    resp = requests.post(url, data=json.dumps(feature_set), headers=header)
    if not resp.ok:
        raise Exception(resp.content)


def main(args: NamedTuple):
    """
    Prepares the whole environment: rebuilds the Elasticsearch index from GA data
    staged through BigQuery/GCS and (re)creates the RankLib feature store and
    feature set.

    Args
    ----
    args: NamedTuple
        force_restart: bool
            If True, drop and rebuild the index even if it already exists.
        es_host: str
            Hostname where to reach Elasticsearch.
        bucket: str
            GCS bucket used as staging area for the BigQuery extract.
        model_name: str
            Name that identifies the current Kubeflow experiment.
    """
    # Local imports keep these heavier dependencies scoped to the entry point.
    import json
    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    es = Elasticsearch(hosts=[args.es_host])
    es_mapping_path = PATH / f'{args.model_name}' / 'es_mapping.json'
    schema = json.loads(open(str(es_mapping_path)).read())
    # The mapping file also carries the index name; the remaining `schema` is the
    # body handed to the create-index call.
    index = schema.pop('index')

    storage_client = storage.Client()
    bq_client = bigquery.Client()

    def read_file(bucket, storage_client=storage_client, bq_client=bq_client):
        # Generator: materializes GA data in BigQuery, extracts it to GCS, then
        # streams each downloaded row as an Elasticsearch bulk action.
        ds_ref = bq_client.dataset('pysearchml')
        bq_client.create_dataset(ds_ref, exists_ok=True)

        table_id = 'es_docs'
        table_ref = ds_ref.table(table_id)

        bucket_obj = storage_client.bucket(bucket)
        if not bucket_obj.exists():
            bucket_obj.create()

        # Query GA data
        query_path = PATH / f'{args.model_name}' / 'ga_data.sql'
        query = open(str(query_path)).read()
        print(query)
        job_config = bigquery.QueryJobConfig()
        job_config.destination = f'{bq_client.project}.pysearchml.{table_id}'
        # Billing safety cap: at most 10 GB scanned per query.
        job_config.maximum_bytes_billed = 10 * (1024 ** 3)
        job_config.write_disposition = 'WRITE_TRUNCATE'
        job = bq_client.query(query, job_config=job_config)
        job.result()

        # export BigQuery table to GCS
        destination_uri = f'gs://{bucket}/es_docs.gz'

        extract_config = bigquery.ExtractJobConfig()
        extract_config.compression = 'GZIP'
        extract_config.destination_format = 'NEWLINE_DELIMITED_JSON'
        job = bq_client.extract_table(table_ref, destination_uri,
                                      job_config=extract_config)
        job.result()

        # Download data into an in-memory buffer (no temp file on disk).
        blob = bucket_obj.blob('es_docs.gz')
        file_obj = gzip.io.BytesIO()
        blob.download_to_file(file_obj)

        file_obj.seek(0)

        c = 0
        for row in gzip.GzipFile(fileobj=file_obj, mode='rb'):
            row = json.loads(row)
            yield {
                '_index': index,
                '_source': row,
                '_id': row['sku']
            }
            c += 1
            # Progress heartbeat every 1000 documents.
            if not c % 1000:
                print(c)

        # Delete BQ Table
        bq_client.delete_table(table_ref)

    if args.force_restart or not es.indices.exists(index):
        es.indices.delete(index, ignore=[400, 404])
        print('deleted index')
        es.indices.create(index, **schema)
        print('schema created')
        bulk(es, read_file(args.bucket), request_timeout=30)
        create_feature_store(args.es_host)
        create_feature_set(args.es_host, args.model_name)
    print('Finished preparing environment.')
def main(args: NamedTuple, X: Sequence[str]) -> None:
    """
    X contains a list of input arguments, such as `[--var1=val1]`. These args are sent
    to RankLib to setup a training run specification.

    Args
    ----
    args: NamedTuple
        ranker: str
            Name of which ranker to use available in RankLib.
        train_file_path: str
            Path where RankLib training file is located.
        validation_files_path: str
            Path where regular validation files are located.
        validation_train_files_path: str
            Path where validation files of training period are located.
        es_host: str
            Hostname where Elasticsearch is located.
        model_name: str
            RankLib featureset Model Name as saved in Elasticsearch.
        destination: str
            File name where to save results.
    X: Sequence[str]
        Values for input of RankLib parameters.

    Raises
    ------
    ValueError
        If `args.ranker` is not a ranker known to RankLib.
    """
    ranker = get_ranker_index(args.ranker)
    if not ranker:
        raise ValueError(f'Invalid value for ranker: "{args.ranker}"')

    # Katib-style flags (`--key=value`) are translated into RankLib flags
    # (`-key value`) before being appended to the java command.
    cmd = ('java -jar ranklib/RankLib-2.14.jar -ranker '
           f'{ranker} -train {args.train_file_path} -norm sum -save '
           f'{args.destination}/model.txt '
           f'{(" ".join(X)).replace("--", "-").replace("=", " ")} -metric2t ERR')

    # {args.destination}/model.txt contains the specification of the
    # final trained model
    os.system(cmd)
    post_model_to_elasticsearch(args.es_host, args.model_name,
                                f'{args.destination}/model.txt')
    partiated_validator = get_partiated_validator(args.es_host, args.index,
                                                  args.model_name, args.es_batch)
    # Validate the regular and training periods in parallel. The context manager
    # guarantees the worker processes are terminated — the original leaked the
    # Pool (never closed/joined).
    with Pool() as pool:
        rank_val, rank_train = pool.map(
            partiated_validator,
            [args.validation_files_path, args.validation_train_files_path]
        )

    # Katib tracks down the StdOut searching for the string 'Validation-rank=value'
    print(f'Validation-rank={rank_val}')
    write_results(X, rank_train, rank_val, args.destination, args.model_name)


def get_ranker_index(ranker: str) -> str:
    """Map a human-readable ranker name to RankLib's `-ranker` index.

    Returns None for unknown names (caller treats that as an error).
    """
    return {
        'mart': '0',
        'ranknet': '1',
        'rankboost': '2',
        'adarank': '3',
        'coordinate ascent': '4',
        'lambdamart': '6',
        'listnet': '7',
        'random forest': '8'
    }.get(ranker)
def write_results(X: Sequence[str], rank_train: float, rank_val: float,
                  destination: str, model_name: str):
    """
    Write results in persistent disk. Uses the folder of `destination` as main
    reference.

    Args
    ----
    X: Sequence[str]
        Input arguments as suggested by Katib. It sets the hyperparameters of the
        models.
    rank_train: float
        Rank value of training data.
    rank_val: float
        Rank value for validation data.
    destination: str
        File path where to save results.
    model_name: str
        Name that identifies model being tested.
    """
    dir_ = pathlib.Path(destination)
    os.makedirs(str(dir_), exist_ok=True)
    # Append this run's hyperparameters and ranks to the history log.
    with open(str(dir_ / 'results.txt'), 'a') as f:
        today_str = datetime.today().strftime('%Y%m%d %H:%M:%S')
        f.write(
            f'{today_str},{" ".join(X)},rank_train={rank_train},'
            f'rank_val={rank_val}{os.linesep}'
        )
    best_model_file = dir_ / 'best_model.txt'
    best_rank_file = dir_ / 'best_rank.txt'
    # Lower rank is better. Snapshot the current model as "best" when there is no
    # previous best or the current run improves on it. (The original duplicated
    # this update logic in both branches and leaked the rank-file handle.)
    if os.path.isfile(str(best_rank_file)):
        with open(str(best_rank_file)) as f:
            best_rank = float(f.readline())
        is_best = rank_val < best_rank
    else:
        is_best = True
    if is_best:
        with open(str(best_rank_file), 'w') as f:
            f.write(str(rank_val))
        copyfile(f'{destination}/model.txt', str(best_model_file))


def get_partiated_validator(
    es_host: str,
    index: str,
    model_name: str,
    es_batch: int = 1000
):
    """Bind the static validation arguments, leaving only `files_path` free.

    The returned partial is pickleable, which lets `multiprocessing.Pool.map`
    run both validation periods in parallel.
    """
    return partial(validate_model, es_host=es_host, index=index, model_name=model_name,
                   es_batch=es_batch)


def post_model_to_elasticsearch(es_host, model_name, model_path) -> None:
    """
    Exports trained model to Elasticsearch.

    Deletes any previous model of the same name, then registers the RankLib
    definition under the featureset `model_name`.

    Raises
    ------
    Exception
        If the create-model request answers with a non-2xx status code.
    """
    model_definition = open(model_path).read()
    model_request = {
        'model': {
            'name': model_name,
            'model': {
                'type': 'model/ranklib',
                'definition': model_definition
            }
        }
    }
    # Best-effort delete of a stale model; response intentionally ignored.
    path = f'http://{es_host}/_ltr/_model/{model_name}'
    requests.delete(path)

    path = f'http://{es_host}/_ltr/_featureset/{model_name}/_createmodel'
    header = {'Content-Type': 'application/json'}
    response = requests.post(path, json=model_request, headers=header)
    if not response.ok:
        raise Exception(response.content)
if __name__ == '__main__':
    # Known flags configure the training run; any *unknown* flags are deliberately
    # kept and forwarded verbatim to RankLib as hyperparameters (see `main`).
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--train_file_path',
        dest='train_file_path',
        type=str,
        help='Path where RankLib training file data is located.'
    )
    parser.add_argument(
        '--validation_files_path',
        dest='validation_files_path',
        type=str,
        help='Path where validation data path is located.'
    )
    parser.add_argument(
        '--validation_train_files_path',
        dest='validation_train_files_path',
        type=str,
        help='Path where validation train data path is located.'
    )
    parser.add_argument(
        '--es_host',
        dest='es_host',
        type=str,
        help='Host address to reach Elasticsearch.'
    )
    parser.add_argument(
        '--es_batch',
        dest='es_batch',
        type=int,
        default=1000,
        help=('Determines how many items to send at once to Elasticsearch when using '
              'multisearch API.')
    )
    parser.add_argument(
        '--destination',
        dest='destination',
        type=str,
        help='Path where validation score is should be saved to.'
    )
    parser.add_argument(
        '--model_name',
        dest='model_name',
        type=str,
        help='Name of featureset store as saved in Elasticsearch.'
    )
    parser.add_argument(
        '--ranker',
        dest='ranker',
        type=str,
        help='Name of ranker algorithm to be used from RankLib.'
    )
    parser.add_argument(
        '--index',
        dest='index',
        default='pysearchml',
        type=str,
        help='ES Index name to use.'
    )
    # `unknown` carries the Katib-suggested hyperparameters for RankLib.
    args, unknown = parser.parse_known_args(sys.argv[1:])
    print('thi is unknown: ', unknown)
    main(args, unknown)
def parse_args(args: List) -> NamedTuple:
    """
    Builds the CLI parser for the validation step and parses `args`, silently
    ignoring any flags it does not know about.
    """
    parser = argparse.ArgumentParser()
    # Table-driven flag declarations: (flag, add_argument keyword arguments).
    option_specs = [
        ('--files_path',
         dict(dest='files_path', type=str,
              help='Path to files containing data of customers searches and their '
                   'purchases')),
        ('--index',
         dict(dest='index', type=str, default='pysearchml',
              help='Name of Index where documents are stored in Elasticsearch.')),
        ('--es_host',
         dict(dest='es_host', type=str,
              help='Host address to reach Elasticsearch.')),
        ('--model_name',
         dict(dest='model_name', type=str,
              help='Assigns a name for the RankLib model. Each experiment on '
                   'Kubeflow should have a specific name in order to preserver '
                   'their results.')),
        ('--es_batch',
         dict(dest='es_batch', type=int, default=1000,
              help='Determines how many items to send at once to Elasticsearch '
                   'when using multisearch API.')),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    known, _ = parser.parse_known_args(args)
    return known
def validate_model(
    files_path: str,
    es_host: str,
    model_name: str,
    index: str = 'pysearchml',
    es_batch: int = 1000
) -> float:
    """
    Reads through an input file of searches and customers purchases. For each search,
    sends the query against Elasticsearch to retrieve a list of documents. This list
    is then compared with what customers purchased for evaluating a rank metric.

    The rank formula is defined in this paper:

        http://yifanhu.net/PUB/cf.pdf

    And is expressed by:

        rank = \\frac{\\sum_{u,i} r^t_{ui} rank_{ui}}{\\sum_{u,i} r^t_{ui}}

    `u` is an identification of a given customer, `i` represents items. `r_{ui}` is
    the score a given customer u gave to item i. This score is implicit and here we
    just consider it equal to 1.

    The rank formula is an average of what percentile the purchased items from
    customers are located in the retrieved list of documents from Elasticsearch
    operating already with the trained RankLib model.

    Notice that we use this function in parallel with multiprocessing. That means
    that its input must be pickleable so the Elasticsearch client is instantiated
    inside the function instead of being an input argument. It breaks the Dependency
    Injection principle but still works fine.

    Args
    ----
    files_path: str
        Path with gzipped validation files (JSON rows of searches and purchases).
    es_host: str
        Host address to reach Elasticsearch.
    model_name: str
        Name of the RankLib model/featureset as stored on Elasticsearch.
    index: str
        Index on Elasticsearch where documents are stored.
    es_batch: int
        How many searches to accumulate before issuing one msearch request.

    Returns
    -------
    float
        The rank metric; 0.5 when no purchased document was retrieved at all.
    """
    counter = 1
    search_arr, purchase_arr = [], []
    # Defined as lists which works as pointers
    rank_num, rank_den = [0], [0]
    es_client = Elasticsearch(hosts=[es_host])

    files = glob.glob(os.path.join(files_path, '*.gz'))

    for file_ in files:
        for row in gzip.GzipFile(file_):
            row = json.loads(row)
            search_keys, docs = row['search_keys'], row['docs']
            purchase_arr.append(docs)

            # msearch bodies alternate header line / query line per search.
            search_arr.append(json.dumps({'index': index}))
            search_arr.append(
                json.dumps(get_es_query(search_keys, model_name, es_batch))
            )

            # Flush one full batch to Elasticsearch and reset the accumulators.
            if counter % es_batch == 0:
                compute_rank(search_arr, purchase_arr, rank_num, rank_den, es_client)
                search_arr, purchase_arr = [], []

            counter += 1

    # Flush the final partial batch, if any.
    if search_arr:
        compute_rank(search_arr, purchase_arr, rank_num, rank_den, es_client)
    # return rank=50% if no document was retrieved from Elasticsearch and purchased
    # by customers.
    return rank_num[0] / rank_den[0] if rank_den[0] else 0.5
def compute_rank(
    search_arr: List[str],
    purchase_arr: List[List[Dict[str, List[str]]]],
    rank_num: List[float],
    rank_den: List[float],
    es_client: 'Elasticsearch'
) -> None:
    """
    Sends queries against Elasticsearch and compares results with what customers
    purchased. Computes the average rank position of where the purchased documents
    fall within the retrieved items.

    Args
    ----
    search_arr: List[str]
        Searches made by customers as observed in validation data. We send those
        against Elasticsearch and compare results with purchased data.
    purchase_arr: List[List[Dict[str, List[str]]]]
        List of documents that were purchased by customers; `purchase_arr[i]`
        corresponds to the i-th query in this batch.
    rank_num: List[float]
        Numerator value of the rank equation. Defined as list to emulate a pointer.
    rank_den: List[float]
        Denominator of the rank equation; also defined as a one-element list.
    es_client: Elasticsearch
        Python Elasticsearch client.
    """
    if not search_arr:
        return

    request = os.linesep.join(search_arr)
    response = es_client.msearch(body=request, request_timeout=60)

    # Bug fix: the original only advanced `idx` for responses with >= 2 hits, so
    # after any skipped response every later one was paired with the wrong
    # purchase list. `enumerate` keeps responses and purchases aligned 1:1.
    for idx, hit in enumerate(response['responses']):
        docs = [doc['_id'] for doc in hit['hits'].get('hits', [])]

        # Fewer than 2 docs carries no ranking signal (len(docs) - 1 degenerates).
        if len(docs) < 2:
            continue

        purchased_docs = [
            doc for purch in purchase_arr[idx] for doc in purch['purchased']
        ]
        ranks = np.where(np.in1d(docs, purchased_docs))[0]

        if ranks.size == 0:
            continue

        # Each matched position contributes its percentile within the result list.
        rank_num[0] += ranks.sum() / (len(docs) - 1)
        rank_den[0] += ranks.size

    print('rank num: ', rank_num[0])
    print('rank den: ', rank_den[0])
def get_es_query(
    search_keys: Dict[str, Any],
    model_name: str,
    es_batch: int = 1000
) -> Dict[str, Any]:
    """
    Builds the Elasticsearch query to be used when retrieving data.

    Args
    ----
    search_keys: Dict[str, Any]
        Search query sent by the customer as well as other variables that sets its
        context, such as region, favorite brand and so on.
    model_name: str
        Name of RankLib model saved on Elasticsearch.
    es_batch: int
        How many documents to retrieve.

    Returns
    -------
    query: Dict[str, Any]
        Final query dict, ready to be serialized into a msearch body.
        (The original annotated/documented the return as `str`, but a dict is
        what's returned and what callers json.dumps afterwards.)
    """
    # it's expected that a ES query will be available at:
    # ./queries/{model_name}/es_query.json
    with open(f'queries/{model_name}/es_query.json') as f:
        query = f.read()
    query = json.loads(query.replace('{query}', search_keys['search_term']))
    # We just want to retrieve the id of the document to evaluate the ranks between
    # customers purchases and the retrieved list result
    query['_source'] = '_id'
    query['size'] = es_batch
    query['rescore']['window_size'] = 50  # Hardcoded to optimize first 50 skus
    query['rescore']['query']['rescore_query']['sltr']['params'] = search_keys
    query['rescore']['query']['rescore_query']['sltr']['model'] = model_name
    return query
def build_judgment_files(model_name: str) -> None:
    """
    Uses DBN Models and the clickstream data to come up with the Judgments
    inferences.

    Args
    ----
    model_name: str
        Name that identifies the current Kubeflow experiment; used to build the
        /tmp working paths.
    """
    model = DBN.DBNModel()

    # clickstream has the browsing patterns of searches, clicks and purchases from
    # customers.
    clickstream_files_path = f'/tmp/pysearchml/{model_name}/clickstream/'

    # model is the output from pyClickModels. It contains JSON NEWLINE DELIMITED data
    # where each row contains a JSON with search queries and its context and then
    # a dictionary of skus and their correspondent judgment for the respective query.
    model_path = f'/tmp/pysearchml/{model_name}/model/model.gz'

    # Start from clean working directories on every run.
    rmtree(os.path.dirname(model_path), ignore_errors=True)
    os.makedirs(os.path.dirname(model_path))

    # finally judgment files is where the final judgments are written.
    judgment_files_path = f'/tmp/pysearchml/{model_name}/judgments/judgments.gz'
    rmtree(os.path.dirname(judgment_files_path), ignore_errors=True)
    os.makedirs(os.path.dirname(judgment_files_path))

    model.fit(clickstream_files_path, iters=10)
    model.export_judgments(model_path)

    with gzip.GzipFile(judgment_files_path, 'wb') as f:
        for row in gzip.GzipFile(model_path):
            row = json.loads(row)
            result = []

            # search_keys is something like:
            # {"search_term:query|brand:brand_name|context:value}
            # Notice that only the `search_term` is always available. Other keys
            # depends on the chosen context when training the model, i.e., one can
            # choose to add the brand information or not and so on.
            search_keys = list(row.keys())[0]
            docs_judgments = row[search_keys]
            search_keys = dict(e.split(':') for e in search_keys.split('|'))

            judgments_list = [judge for doc, judge in docs_judgments.items()]

            # It means all judgments expectations are equal which is not desirable
            if all(x == judgments_list[0] for x in judgments_list):
                continue

            # We derive judgments based on percentiles from 20% up to 100%
            percentiles = np.percentile(judgments_list, [20, 40, 60, 80, 100])

            judgment_keys = [
                {
                    'doc': doc,
                    'judgment': process_judgment(percentiles, judgment)
                }
                for doc, judgment in docs_judgments.items()
            ]

            result = {
                'search_keys': search_keys,
                'judgment_keys': judgment_keys
            }
            f.write(json.dumps(result).encode() + '\n'.encode())
def process_judgment(percentiles: list, judgment: float) -> int:
    """
    Returns which quantile the current value of `judgment` belongs to. The result is
    already transformed to range between integers 0 and 4 inclusive.

    Args
    ----
    percentiles: list
        The [20, 40, 60, 80, 100] percentiles computed over all judgments of a
        given query. (The original docstring documented a nonexistent
        `judgents_list` parameter instead.)
    judgment: float
        Current judgment value being computed.

    Returns
    -------
    judgment: int
        Integer belonging to 0 and 4, inclusive. 0 means the current document is not
        appropriate for current query whereas 4 means it's a perfect fit.
    """
    if judgment <= percentiles[0]:
        return 0
    if judgment <= percentiles[1]:
        return 1
    if judgment <= percentiles[2]:
        return 2
    if judgment <= percentiles[3]:
        return 3
    # Anything above the 60th/80th percentile cut-offs is a perfect fit. The
    # original fell through to an implicit `None` for values strictly above
    # percentiles[4] (possible with float rounding); always return 4 instead.
    return 4


def build_train_file(
    model_name: str,
    es_batch: int,
    es_client: 'Elasticsearch',
    destination: str,
    index: str
) -> None:
    """
    After the input file has been updated with judgment data, logs features from
    Elasticsearch which results in the final text file used as input for RankLib.

    Args
    ----
    model_name: str
        Name of feature set store on Elasticsearch.
    es_batch: int
        Sets how many queries to aggregate when using multisearch API.
    es_client: Elasticsearch
        Python Elasticsearch client.
    destination: str
        Path where to write results to.
    index: str
        Name of the Elasticsearch index to query.
    """
    counter = 1
    # works as a pointer
    queries_counter = [0]
    search_arr, judge_list = [], []
    os.makedirs(destination, exist_ok=True)
    # write_features appends, so make sure we start from a clean training file.
    if os.path.isfile(f'{destination}/train_dataset.txt'):
        os.remove(f'{destination}/train_dataset.txt')

    for search_keys, docs, judgments in read_judgment_files(model_name):
        judge_list.append(judgments)

        search_arr.append(json.dumps({'index': f'{index}'}))
        search_arr.append(json.dumps(get_logging_query(model_name, docs, search_keys)))

        if counter % es_batch == 0:
            write_features(search_arr, judge_list, queries_counter, es_client,
                           destination)
            search_arr, judge_list = [], []

        counter += 1

    # Flush the final partial batch.
    if search_arr:
        write_features(search_arr, judge_list, queries_counter, es_client, destination)
def write_features(
    search_arr: List[str],
    judge_list: List[List[str]],
    queries_counter: List[int],
    es_client: Elasticsearch,
    destination: str
) -> None:
    """
    Sends the query to Elasticsearch and uses the result to write final RankLib
    training file.

    Args
    ----
    search_arr: List[str]
        Array containing multiple queries to send against Elasticsearch.
    judge_list: List[List[str]]
        Each index contains list of judgments associated to a respective search.
    queries_counter: List[int]
        Counter of how many queries were processed so far. It's used to build the
        RankLib file with appropriate values. It's a list so it works as a C pointer.
    es_client: Elasticsearch
        Python client for interacting with Elasticsearch.
    destination: str
        Path where to save results to.
    """
    if not search_arr:
        return

    multi_request = os.linesep.join(search_arr)
    features_log = es_client.msearch(body=multi_request, request_timeout=60)

    rows = []
    for i in range(len(judge_list)):
        es_result = features_log['responses'][i].get('hits', {}).get('hits')

        # Skip empty or single-hit results: one document alone carries no
        # ranking signal for a pairwise/listwise learner.
        if not es_result or len(es_result) == 1:
            continue

        # NOTE(review): `judge_list[i][j]` is paired with the j-th hit returned by
        # Elasticsearch, which assumes the hits come back in the same order as the
        # docs sent in the logging query — TODO confirm this assumption holds.
        for j in range(len(es_result)):
            logs = es_result[j]['fields']['_ltrlog'][0]['main']
            # RankLib features are 1-indexed: "<feature_no>:<value>".
            features = [
                f'{idx+1}:{logs[idx].get("value", 0)}' for idx in range(len(logs))
            ]
            features = '\t'.join(features)
            ranklib_entry = f'{judge_list[i][j]}\tqid:{queries_counter[0]}\t{features}'
            rows.append(ranklib_entry)
        # One qid per processed search query.
        queries_counter[0] += 1

    if rows:
        print(rows[0])
        path = f'{destination}/train_dataset.txt'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'a') as f:
            f.write(os.linesep.join(rows) + os.linesep)
def get_logging_query(
    model_name: str,
    docs: List[str],
    search_keys: Dict[str, Any]
) -> Dict[str, Any]:
    """
    The process to extract features from Elasticsearch involves sending what is known
    as the "logging query". The result of the logging query is the values, for a
    given search query, of each feature as defined in the feature set.

    Args
    ----
    model_name: str
        Each Kubeflow run receives a model_name so it's possible to discern each
        experiment. This value is used to store different models on Elasticsearch.
    docs: List[str]
        List containing several documents (skus for instance) to be inserted in the
        query so it's possible to send several requests in just one request.
    search_keys: Dict[str, Any]
        Those are the keys that describe the search context. It can contain
        data such as the region of customers, their favorite brands, their average
        purchasing ticket and so on.

    Returns
    -------
    log_query: Dict[str, Any]
        Query to be sent against Elasticsearch in order to find the values of each
        feature as defined in featureset.
    """
    # Build the final dict in one literal instead of mutating placeholders after
    # the fact: `docs` and `search_keys` are spliced in directly.
    return {
        "query": {
            "bool": {
                "filter": [
                    {
                        "terms": {
                            "_id": docs
                        }
                    }
                ],
                "should": [
                    {
                        "sltr": {
                            "_name": "logged_featureset",
                            "featureset": model_name,
                            "params": search_keys
                        }
                    }
                ]
            }
        },
        "_source": ['_id'],
        "ext": {
            "ltr_log": {
                "log_specs": {
                    "name": "main",
                    "named_query": "logged_featureset"
                }
            }
        }
    }
def read_judgment_files(
    model_name: str
) -> Iterator[Tuple[Dict[str, Any], List[str], List[str]]]:
    """
    Reads resulting files of the judgments updating process.

    Yields
    ------
    (search_keys, docs, judgments)
        search_keys: dict describing the search context; docs: list of documents;
        judgments: list of judgments aligned index-by-index with `docs`.
    """
    files = glob.glob(f'/tmp/pysearchml/{model_name}/judgments/*.gz')
    for file_ in files:
        for row in gzip.GzipFile(file_):
            row = json.loads(row)
            search_keys = row['search_keys']
            judgment_keys = row['judgment_keys']
            docs = [e['doc'] for e in judgment_keys]
            judgments = [e['judgment'] for e in judgment_keys]
            yield search_keys, docs, judgments


def download_data(args: NamedTuple):
    """
    Queries over GA data for input training dataset creation. The table is first
    exported to GCS and then downloaded to respective folder, as is.

    Args
    ----
    args: NamedTuple
        train_init_date: str
            Follows format %Y%M%D, represents from where the query should start
            retrieving data from.
        train_end_date: str
            End date for the query; same format.
        bucket: str
            GCS bucket used as staging area for the BigQuery extract.
        model_name: str
            Name to identify model being trained.
    """
    path_to_download = f'/tmp/pysearchml/{args.model_name}/clickstream'
    # Start from a clean download directory every run.
    rmtree(path_to_download, ignore_errors=True)
    os.makedirs(path_to_download, exist_ok=True)

    storage_client = storage.Client()
    bq_client = bigquery.Client()

    ds_ref = bq_client.dataset('pysearchml')
    # Random table name so concurrent runs don't clash on the staging table.
    table_id = str(uuid.uuid4()).replace('-', '')
    table_ref = ds_ref.table(table_id)

    # Query GA data
    query_path = PATH / 'train.sql'
    query = open(str(query_path)).read()
    query = query.format(train_init_date=args.train_init_date,
                         train_end_date=args.train_end_date)

    job_config = bigquery.QueryJobConfig()
    job_config.destination = f'{bq_client.project}.pysearchml.{table_id}'
    # Billing safety cap: at most 10 GB scanned per query.
    job_config.maximum_bytes_billed = 10 * (1024 ** 3)
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job = bq_client.query(query, job_config=job_config)
    job.result()

    # export BigQuery table to GCS
    destination_uri = f'gs://{args.bucket}/train/*.gz'

    extract_config = bigquery.ExtractJobConfig()
    extract_config.compression = 'GZIP'
    extract_config.destination_format = 'NEWLINE_DELIMITED_JSON'
    job = bq_client.extract_table(table_ref, destination_uri, job_config=extract_config)
    job.result()

    # Download data
    bucket_obj = storage_client.bucket(args.bucket)
    blobs = bucket_obj.list_blobs(prefix='train')
    for blob in blobs:
        blob.download_to_filename(
            f"{path_to_download}/judgments_{blob.name.split('/')[-1]}"
        )
        # Remove the staging blob once it has been downloaded.
        blob.delete()

    # Delete BQ Table
    bq_client.delete_table(table_ref)


def main(args: NamedTuple, es_client: Elasticsearch) -> None:
    """
    Uses as input data from Google Analytics containing customers clickstream for Search
    Result Pages. This data is processed with the Judgments model as described in
    [pyClickModels](https://github.com/WillianFuks/pyClickModels) and features are
    derived from Elasticsearch.

    The resulting text file is like:

        judgment qid feature_1 ... feature_N
        4 qid:1 1:0.56 ... 2:1.3
        4 qid:1 1:2.90 ... 2:1.09
        3 qid:1 1:3.00 ... 2:5.51

    This text file is what we use as input for training models in RankLib.

    Args
    ----
    args: List
        List of input arguments from `sys.argv`
    es_client: Elasticsearch
        Python Elasticsearch client
    """
    # Pipeline: fetch clickstream -> infer judgments -> log features -> write file.
    download_data(args)
    build_judgment_files(args.model_name)
    build_train_file(args.model_name, args.es_batch, es_client, args.destination,
                     args.index)
Format follows %Y%M%D.') 408 | ) 409 | parser.add_argument( 410 | '--train_end_date', 411 | dest='train_end_date', 412 | type=str, 413 | help=('Value to replace in BigQuery SQL. Represents date from where to start ' 414 | 'quering from. Format follows %Y%M%D.') 415 | ) 416 | parser.add_argument( 417 | '--bucket', 418 | dest='bucket', 419 | type=str, 420 | default='pysearchml', 421 | help='Google Cloud Storage Bucket where all data will be stored.' 422 | ) 423 | parser.add_argument( 424 | '--es_host', 425 | dest='es_host', 426 | type=str, 427 | help='Host address to reach Elasticsearch.' 428 | ) 429 | parser.add_argument( 430 | '--es_batch', 431 | dest='es_batch', 432 | type=int, 433 | default=1000, 434 | help=('Determines how many items to send at once to Elasticsearch when using ' 435 | 'multisearch API.') 436 | ) 437 | parser.add_argument( 438 | '--destination', 439 | dest='destination', 440 | type=str, 441 | help='Path where to write results to.' 442 | ) 443 | parser.add_argument( 444 | '--model_name', 445 | dest='model_name', 446 | type=str, 447 | help='Name of featureset store as saved in Elasticsearch.' 448 | ) 449 | parser.add_argument( 450 | '--index', 451 | dest='index', 452 | type=str, 453 | help='Name of index to use from in Elasticsearch.' 454 | ) 455 | 456 | args, _ = parser.parse_known_args(sys.argv[1:]) 457 | es_client = Elasticsearch(hosts=[args.es_host]) 458 | main(args, es_client) 459 | -------------------------------------------------------------------------------- /kubeflow/components/model/model.txt: -------------------------------------------------------------------------------- 1 | ## LambdaMART 2 | ## No. of trees = 10 3 | ## No. of leaves = 10 4 | ## No. 
of threshold candidates = 256 5 | ## Learning rate = 0.1 6 | ## Stop early = 100 7 | 8 | 9 | 10 | 11 | 4 12 | 0.25 13 | 14 | 1 15 | 0.3315219 16 | 17 | 2 18 | 0.54972833 19 | 20 | 1 21 | 0.21562068 22 | 23 | -1.7084366083145142 24 | 25 | 26 | 1 27 | 0.21851821 28 | 29 | 1.096567153930664 30 | 31 | 32 | -0.7644314169883728 33 | 34 | 35 | 36 | 37 | 2.0 38 | 39 | 40 | 41 | 1 42 | 0.46480832 43 | 44 | 0.787030041217804 45 | 46 | 47 | 0.1781938225030899 48 | 49 | 50 | 51 | 52 | 1 53 | 0.22141574 54 | 55 | 1 56 | 0.21272315 57 | 58 | 1.1048166751861572 59 | 60 | 61 | 2.0 62 | 63 | 64 | 65 | 1 66 | 0.24169846 67 | 68 | -2.0 69 | 70 | 71 | 0.6879376769065857 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 4 80 | 0.24529763 81 | 82 | 1 83 | 0.20692809 84 | 85 | -1.5049853324890137 86 | 87 | 88 | 2 89 | 0.55973893 90 | 91 | 1 92 | 0.3315219 93 | 94 | 1 95 | 0.21851821 96 | 97 | 0.41835230588912964 98 | 99 | 100 | -0.7194477915763855 101 | 102 | 103 | 104 | 1 105 | 0.46480832 106 | 107 | 0.7153688073158264 108 | 109 | 110 | 0.16227617859840393 111 | 112 | 113 | 114 | 115 | 1.8283854722976685 116 | 117 | 118 | 119 | 120 | 1 121 | 0.22141574 122 | 123 | 1.1201153993606567 124 | 125 | 126 | 1 127 | 0.24169846 128 | 129 | -1.7444206476211548 130 | 131 | 132 | 1 133 | 0.25908363 134 | 135 | 1.2497144937515259 136 | 137 | 138 | 0.32585909962654114 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 4 148 | 0.24529763 149 | 150 | 1 151 | 0.3315219 152 | 153 | 2 154 | 0.54972833 155 | 156 | 1 157 | 0.21562068 158 | 159 | -1.3644481897354126 160 | 161 | 162 | -0.4869944751262665 163 | 164 | 165 | 166 | 1.6992262601852417 167 | 168 | 169 | 170 | 1 171 | 0.46480832 172 | 173 | 0.6203141212463379 174 | 175 | 176 | 0.14612968266010284 177 | 178 | 179 | 180 | 181 | 3 182 | 0.0 183 | 184 | 1 185 | 0.25908363 186 | 187 | 4 188 | 0.6060418 189 | 190 | 1 191 | 0.24169846 192 | 193 | 0.12054406106472015 194 | 195 | 196 | 1.2553942203521729 197 | 198 | 199 | 200 | 1.622399091720581 201 | 202 | 203 | 
204 | 0.05810323357582092 205 | 206 | 207 | 208 | 1.7710267305374146 209 | 210 | 211 | 212 | 213 | 214 | 215 | 4 216 | 0.31724432 217 | 218 | 1 219 | 0.3315219 220 | 221 | 3 222 | 0.0 223 | 224 | 2 225 | 0.54972833 226 | 227 | 1 228 | 0.21562068 229 | 230 | -1.0552788972854614 231 | 232 | 233 | 1 234 | 0.21851821 235 | 236 | 0.8310797810554504 237 | 238 | 239 | 1 240 | 0.23010834 241 | 242 | -1.702104926109314 243 | 244 | 245 | 1 246 | 0.23300587 247 | 248 | 1.8411474227905273 249 | 250 | 251 | -0.6400606632232666 252 | 253 | 254 | 255 | 256 | 257 | 258 | 1.5750422477722168 259 | 260 | 261 | 262 | 1.279455542564392 263 | 264 | 265 | 266 | 0.3588203489780426 267 | 268 | 269 | 270 | 1 271 | 0.25908363 272 | 273 | 0.7923973798751831 274 | 275 | 276 | 0.5431618690490723 277 | 278 | 279 | 280 | 281 | 282 | 283 | 4 284 | 0.31724432 285 | 286 | 1 287 | 0.3315219 288 | 289 | 2 290 | 0.54972833 291 | 292 | 3 293 | 0.0 294 | 295 | 1 296 | 0.21562068 297 | 298 | -0.9644893407821655 299 | 300 | 301 | 1 302 | 0.21851821 303 | 304 | 0.935935378074646 305 | 306 | 307 | 1 308 | 0.23010834 309 | 310 | -1.568875789642334 311 | 312 | 313 | -0.4440454840660095 314 | 315 | 316 | 317 | 318 | 319 | 1.155909776687622 320 | 321 | 322 | 323 | 1.5210938453674316 324 | 325 | 326 | 327 | 1 328 | 0.3402145 329 | 330 | 1.1632524728775024 331 | 332 | 333 | 0.24559912085533142 334 | 335 | 336 | 337 | 338 | 1 339 | 0.25908363 340 | 341 | 0.7345250844955444 342 | 343 | 344 | 0.45409226417541504 345 | 346 | 347 | 348 | 349 | 350 | 351 | 4 352 | 0.31724432 353 | 354 | 1 355 | 0.38077992 356 | 357 | 2 358 | 0.0 359 | 360 | 3 361 | 0.0 362 | 363 | 1 364 | 0.1895429 365 | 366 | -1.232733130455017 367 | 368 | 369 | 4 370 | 0.3172443 371 | 372 | 4 373 | 0.24529763 374 | 375 | 1 376 | 0.21562068 377 | 378 | -1.1072427034378052 379 | 380 | 381 | 1 382 | 0.21851821 383 | 384 | 0.8476951718330383 385 | 386 | 387 | -0.44101765751838684 388 | 389 | 390 | 391 | 392 | 0.2138904184103012 393 | 394 | 395 | 396 | 
-1.8764989376068115 397 | 398 | 399 | 400 | 401 | 1.0571155548095703 402 | 403 | 404 | 405 | 1.0903085470199585 406 | 407 | 408 | 409 | 0.4524942934513092 410 | 411 | 412 | 413 | 0.5950737595558167 414 | 415 | 416 | 417 | 418 | 419 | 4 420 | 0.31724432 421 | 422 | 1 423 | 0.38077992 424 | 425 | 2 426 | 0.0 427 | 428 | 3 429 | 0.0 430 | 431 | 1 432 | 0.1895429 433 | 434 | -1.1662677526474 435 | 436 | 437 | 4 438 | 0.3030209 439 | 440 | 4 441 | 0.30263692 442 | 443 | 1 444 | 0.21562068 445 | 446 | -1.0524094104766846 447 | 448 | 449 | 1 450 | 0.21851821 451 | 452 | 0.758584201335907 453 | 454 | 455 | -0.3663649559020996 456 | 457 | 458 | 459 | 460 | 1.0178462266921997 461 | 462 | 463 | 464 | -1.6032209396362305 465 | 466 | 467 | 468 | 469 | 0.9786844849586487 470 | 471 | 472 | 473 | 1.0436853170394897 474 | 475 | 476 | 477 | 0.4028914272785187 478 | 479 | 480 | 481 | 0.56407630443573 482 | 483 | 484 | 485 | 486 | 487 | 4 488 | 0.31724432 489 | 490 | 2 491 | 0.55973893 492 | 493 | 1 494 | 0.38077992 495 | 496 | 3 497 | 0.0 498 | 499 | 2 500 | 0.0 501 | 502 | 1 503 | 0.1895429 504 | 505 | -1.1066094636917114 506 | 507 | 508 | 4 509 | 0.3030209 510 | 511 | 4 512 | 0.30263692 513 | 514 | 1 515 | 0.21562068 516 | 517 | -0.9751008152961731 518 | 519 | 520 | -0.2530476748943329 521 | 522 | 523 | 524 | 0.9325478076934814 525 | 526 | 527 | 528 | -1.4579707384109497 529 | 530 | 531 | 532 | 533 | 0.8370009064674377 534 | 535 | 536 | 537 | 0.8957659006118774 538 | 539 | 540 | 541 | 0.36041855812072754 542 | 543 | 544 | 545 | 1.4393270015716553 546 | 547 | 548 | 549 | 0.5142536759376526 550 | 551 | 552 | 553 | 554 | 555 | 4 556 | 0.5 557 | 558 | 1 559 | 0.19533797 560 | 561 | -0.9102288484573364 562 | 563 | 564 | 2 565 | 0.55973893 566 | 567 | 4 568 | 0.24529763 569 | 570 | 1 571 | 0.3315219 572 | 573 | 1 574 | 0.21562068 575 | 576 | -1.2571520805358887 577 | 578 | 579 | 1 580 | 0.21851821 581 | 582 | 0.6894375681877136 583 | 584 | 585 | -0.48254841566085815 586 | 587 | 588 | 589 
| 590 | 3 591 | 0.0 592 | 593 | 0.2937770485877991 594 | 595 | 596 | -2.1294875144958496 597 | 598 | 599 | 600 | 601 | 1 602 | 0.25908363 603 | 604 | 0.7393473386764526 605 | 606 | 607 | -0.15804332494735718 608 | 609 | 610 | 611 | 612 | 1.3993260860443115 613 | 614 | 615 | 616 | 617 | 1.0860182046890259 618 | 619 | 620 | 621 | 622 | 623 | 4 624 | 0.5 625 | 626 | 1 627 | 0.19533797 628 | 629 | -0.8420385122299194 630 | 631 | 632 | 2 633 | 0.55973893 634 | 635 | 2 636 | 0.54972833 637 | 638 | 4 639 | 0.31724432 640 | 641 | 1 642 | 0.38077992 643 | 644 | 3 645 | 0.0 646 | 647 | 4 648 | 0.3030209 649 | 650 | 4 651 | 0.30263692 652 | 653 | -0.2686406373977661 654 | 655 | 656 | 1.390183448791504 657 | 658 | 659 | 660 | -1.3712384700775146 661 | 662 | 663 | 664 | 0.8090718984603882 665 | 666 | 667 | 668 | 0.3620189130306244 669 | 670 | 671 | 672 | 0.3657061755657196 673 | 674 | 675 | 676 | -3.2265491485595703 677 | 678 | 679 | 680 | 1.3460023403167725 681 | 682 | 683 | 684 | 685 | 1.007249355316162 686 | 687 | 688 | 689 | 690 | --------------------------------------------------------------------------------