├── telco_churn
│   ├── pipelines
│   │   ├── __init__.py
│   │   ├── sample_test_job.py
│   │   ├── model_inference_batch_job.py
│   │   ├── model_deployment_job.py
│   │   ├── feature_table_creator_job.py
│   │   ├── model_train_job.py
│   │   └── demo_setup_job.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── get_spark.py
│   │   ├── logger_utils.py
│   │   ├── notebook_utils.py
│   │   ├── feature_store_utils.py
│   │   └── evaluation_utils.py
│   ├── __init__.py
│   ├── model_train_pipeline.py
│   ├── model_inference.py
│   ├── featurize.py
│   ├── feature_table_creator.py
│   ├── common.py
│   ├── model_train.py
│   └── model_deployment.py
├── conf
│   ├── staging
│   │   └── .staging.env
│   ├── pipeline_configs
│   │   ├── sample_test.yml
│   │   ├── model_deployment.yml
│   │   ├── model_train.yml
│   │   ├── model_inference_batch.yml
│   │   ├── feature_table_creator.yml
│   │   └── demo_setup.yml
│   ├── dev
│   │   └── .dev.env
│   ├── prod
│   │   └── .prod.env
│   ├── .base_data_params.env
│   └── deployment.yml
├── requirements.txt
├── pytest.ini
├── unit-requirements.txt
├── .coveragerc
├── setup.py
├── .gitignore
├── .dbx
│   └── project.json
├── .github
│   └── workflows
│       ├── onpullrequest.yml
│       └── onrelease.yml
├── tests
│   ├── integration
│   │   └── sample_test.py
│   └── unit
│       ├── model_train_pipeline_test.py
│       └── conftest.py
├── notebooks
│   ├── model_inference_batch.py
│   ├── feature_table_creator.py
│   ├── model_train.py
│   ├── model_deployment.py
│   └── demo_setup.py
└── README.md

--------------------------------------------------------------------------------
/telco_churn/pipelines/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/telco_churn/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/conf/staging/.staging.env:
--------------------------------------------------------------------------------
env=staging
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
python-dotenv==0.20.0
--------------------------------------------------------------------------------
/telco_churn/__init__.py:
--------------------------------------------------------------------------------
__version__ = "0.0.1"
--------------------------------------------------------------------------------
/conf/pipeline_configs/sample_test.yml:
--------------------------------------------------------------------------------
output_format: 'delta'
output_path: 'dbfs:/dbx/tmp/test/e2e_mlops'
--------------------------------------------------------------------------------
/telco_churn/utils/get_spark.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
--------------------------------------------------------------------------------
/conf/pipeline_configs/model_deployment.yml:
--------------------------------------------------------------------------------
model_comparison_params:
  metric: 'roc_auc_score'
  higher_is_better: True
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
[pytest]
addopts = -s -p no:warnings
log_cli = 1
log_cli_level = INFO
log_cli_format = [pytest][%(asctime)s][%(levelname)s][%(module)s][%(funcName)s] %(message)s
log_cli_date_format = %Y-%m-%d %H:%M:%S
log_level = INFO
--------------------------------------------------------------------------------
/unit-requirements.txt:
--------------------------------------------------------------------------------
setuptools==58.0.4
wheel==0.37.0
pyspark
numpy==1.20.3
pandas==1.3.4
scikit-learn==0.24.2
pyyaml==6.0
pytest==7.1.2
pytest-cov==3.0.0
dbx==0.5.0
delta-spark
python-dotenv==0.20.0
--------------------------------------------------------------------------------
/conf/pipeline_configs/model_train.yml:
--------------------------------------------------------------------------------
mlflow_params:
  run_name: 'random_forest_baseline'


pipeline_params:
  test_size: 0.25
  random_state: 42

model_params:
  n_estimators: 100
  max_depth: 4
  min_samples_leaf: 1
  max_features: 'auto'
  random_state: 42
--------------------------------------------------------------------------------
/conf/pipeline_configs/model_inference_batch.yml:
--------------------------------------------------------------------------------
mlflow_params:
  model_registry_stage: 'production'

data_input:
  # Input table to score the model on - must contain column(s) for lookup keys
  # to join feature data from Feature Store
  table_name: 'e2e_mlops_prod.churn_labels'

data_output:
  mode: 'overwrite'
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
[run]
branch = True
source = telco_churn

[report]
exclude_lines =
    if self.debug:
    pragma: no cover
    raise NotImplementedError
    if __name__ == .__main__.:

ignore_errors = True
omit =
    tests/*
    setup.py
    # this file is autogenerated by dbx
    telco_churn/common.py
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import find_packages, setup
from telco_churn import __version__

setup(
    name='telco_churn',
    packages=find_packages(exclude=['tests', 'tests.*']),
    setup_requires=['wheel'],
    version=__version__,
    description='Demo repository implementing an end-to-end MLOps workflow on Databricks. Project derived from dbx '
                'basic python template',
    author='Joseph Bradley, Rafi Kurlansik, Matthew Thomson, Niall Turbitt'
)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
*.egg-info/
.eggs
build
dist

# venv
venv

# Unit test / coverage reports
.coverage
coverage.xml
junit/*
htmlcov/*

# Caches
.pytest_cache/

# VSCode
.vscode/

# Idea
.idea/
*.iml

# MacOS
.DS_Store

# Databricks eXtensions
.dbx/lock.json

# local mlflow files
mlruns/
--------------------------------------------------------------------------------
/conf/pipeline_configs/feature_table_creator.yml:
--------------------------------------------------------------------------------
input_table: 'ibm_telco_churn.bronze_customers'

data_prep_params:
  label_col: 'churnString'
  ohe: False
  # Only require cat_cols if ohe=True
  # cat_cols: ['gender', 'partner', 'dependents',
  #            'phoneService', 'multipleLines', 'internetService',
  #            'onlineSecurity', 'onlineBackup', 'deviceProtection',
  #            'techSupport', 'streamingTV', 'streamingMovies',
  #            'contract', 'paperlessBilling', 'paymentMethod']
  drop_missing: False
--------------------------------------------------------------------------------
/conf/pipeline_configs/demo_setup.yml:
--------------------------------------------------------------------------------
# Delete MLflow Registry model
# Model name set in deployment.yml
delete_model_registry: True

# Delete MLflow Tracking experiments (both the training experiment and the deployment experiment)
# Experiment paths/id set in deployment.yml
delete_mlflow_experiments: True

# Drop Feature Store feature table if it exists
# Feature table name set in deployment.yml
drop_feature_table: True

# Drop labels table if it exists
# Label table name set in deployment.yml
drop_labels_table: True
--------------------------------------------------------------------------------
/telco_churn/pipelines/sample_test_job.py:
--------------------------------------------------------------------------------
from telco_churn.common import Workload


class SampleJob(Workload):

    def launch(self):
        self.logger.info('Launching sample job')

        listing = self.dbutils.fs.ls('dbfs:/')

        for l in listing:
            self.logger.info(f'DBFS directory: {l}')

        df = self.spark.range(0, 1000)

        df.write.format(self.conf['output_format']).mode('overwrite').save(
            self.conf['output_path']
        )

        self.logger.info('Sample job finished!')


if __name__ == '__main__':
    job = SampleJob()
    job.launch()
--------------------------------------------------------------------------------
/.dbx/project.json:
--------------------------------------------------------------------------------
{
  "environments": {
    "dev": {
      "profile": "e2-demo-west",
      "workspace_dir": "/Shared/e2e_mlops/dev/dbx/e2e_mlops_dev",
      "artifact_location": "dbfs:/Shared/e2e_mlops/dev/dbx/projects/e2e_mlops_dbx_dev"
    },
    "staging": {
      "profile": "e2-demo-west",
      "workspace_dir": "/Shared/e2e_mlops/staging/dbx/e2e_mlops_staging",
      "artifact_location": "dbfs:/Shared/e2e_mlops/staging/dbx/projects/e2e_mlops_dbx_staging"
    },
    "prod": {
      "profile": "e2-demo-west",
      "workspace_dir": "/Shared/e2e_mlops/prod/dbx/e2e_mlops_prod",
      "artifact_location": "dbfs:/Shared/e2e_mlops/prod/dbx/projects/e2e_mlops_dbx_prod"
    }
  }
}
"/Shared/e2e_mlops/staging/dbx/e2e_mlops_staging", 11 | "artifact_location": "dbfs:/Shared/e2e_mlops/staging/dbx/projects/e2e_mlops_dbx_staging" 12 | }, 13 | "prod": { 14 | "profile": "e2-demo-west", 15 | "workspace_dir": "/Shared/e2e_mlops/prod/dbx/e2e_mlops_prod", 16 | "artifact_location": "dbfs:/Shared/e2e_mlops/prod/dbx/projects/e2e_mlops_dbx_prod" 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /conf/dev/.dev.env: -------------------------------------------------------------------------------- 1 | env=dev 2 | 3 | // Global MLflow params for dev 4 | model_train_experiment_path='/Shared/e2e_mlops/dev/telco_churn_experiment_dev' 5 | model_name=e2e_mlops_telco_churn_dev 6 | model_deploy_experiment_path='/Shared/e2e_mlops/dev/telco_churn_deployment_dev' 7 | 8 | // Feature Store params 9 | feature_store_database_name='e2e_mlops_dev' 10 | 11 | // Labels table params 12 | labels_table_database_name='e2e_mlops_dev' 13 | //tmp directory for demo purposes 14 | labels_table_dbfs_path='dbfs:/tmp/e2e_mlops/dev/churn_labels.delta' 15 | 16 | // Batch inference predictions table params 17 | predictions_table_database_name='e2e_mlops_dev' 18 | predictions_table_name = 'churn_predictions' 19 | 20 | // Reference table params - table to use for comparing staging vs production models 21 | reference_table_database_name='e2e_mlops_dev' -------------------------------------------------------------------------------- /conf/prod/.prod.env: -------------------------------------------------------------------------------- 1 | env=prod 2 | 3 | // Global MLflow params for prod 4 | model_train_experiment_path='/Shared/e2e_mlops/prod/telco_churn_experiment_prod' 5 | model_name='e2e_mlops_telco_churn_prod' 6 | model_deploy_experiment_path='/Shared/e2e_mlops/prod/telco_churn_deployment_prod' 7 | 8 | // Feature Store params 9 | feature_store_database_name='e2e_mlops_prod' 10 | 11 | // Labels table params 12 | labels_table_database_name='e2e_mlops_prod' 13 | // tmp directory for demo purposes 14 | labels_table_dbfs_path='dbfs:/tmp/e2e_mlops/prod/churn_labels.delta' 15 | 16 | // Batch inference predictions table params 17 | predictions_table_database_name='e2e_mlops_prod' 18 | predictions_table_name = 'churn_predictions' 19 | 20 | // Reference table params - table to use for comparing staging vs production models 21 | reference_table_database_name='e2e_mlops_prod' 22 | 23 | -------------------------------------------------------------------------------- /telco_churn/utils/logger_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class NoReceivedCommandFilter(logging.Filter): 5 | def filter(self, record): 6 | if 'Received command c' not in record.getMessage(): 7 | return record.getMessage() 8 | 9 | 10 | class NoPythonDotEnvFilter(logging.Filter): 11 | def filter(self, record): 12 | if 'Python-dotenv' not in record.getMessage(): 13 | return record.getMessage() 14 | 15 | 16 | def get_logger(): 17 | logging.getLogger('py4j.java_gateway').setLevel(logging.ERROR) 18 | logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO) 19 | logger = logging.getLogger(__name__) 20 | 21 | filter_1 = NoReceivedCommandFilter() 22 | filter_2 = NoPythonDotEnvFilter() 23 | logger.addFilter(filter_1) 24 | logger.addFilter(filter_2) 25 | 26 | return logger 27 | -------------------------------------------------------------------------------- /conf/.base_data_params.env: 
--------------------------------------------------------------------------------
/conf/.base_data_params.env:
--------------------------------------------------------------------------------
// Base data params which are consistent across environments

// Feature Store params
feature_store_table_name='churn_features'
feature_store_table_primary_keys='customerID'
feature_store_table_description='These features are derived from the ibm_telco_churn.bronze_customers table in the lakehouse. We created dummy variables for the categorical columns, cleaned up their names, and added a boolean flag for whether the customer churned or not. No aggregations were performed.'

// Labels table params
labels_table_name='churn_labels'
labels_table_label_col='churn'

// Batch inference input table params
// For demo purposes we use the churn_labels table
inference_table_name='churn_labels'

// Batch inference predictions table params
predictions_table_name='churn_predictions'

// Reference table params - table to use for comparing staging vs production models
// For demo purposes we use the churn_labels table. However, in practice this reference table would be a curated dataset
// Note: this table must contain column(s) for lookup keys to join feature data from Feature Store
reference_table_name='churn_labels'
reference_table_label_col='churn'
--------------------------------------------------------------------------------
/telco_churn/model_train_pipeline.py:
--------------------------------------------------------------------------------
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier


class ModelTrainPipeline:

    @classmethod
    def create_train_pipeline(cls, model_params: dict) -> Pipeline:

        preprocessor = ColumnTransformer(
            transformers=[
                ('numeric_transformer',
                 SimpleImputer(strategy='median'),
                 make_column_selector(dtype_exclude='object')
                 ),
                ('categorical_transformer',
                 OneHotEncoder(handle_unknown='ignore'),
                 make_column_selector(dtype_include='object')
                 ),
            ],
            remainder='passthrough',
            sparse_threshold=0
        )

        rf_classifier = RandomForestClassifier(**model_params)

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', rf_classifier),
        ])

        return pipeline
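For illustration, a minimal sketch (not part of the repo) of fitting the pipeline returned by create_train_pipeline on a toy pandas DataFrame; the column names and data here are made up, and model_params mirrors conf/pipeline_configs/model_train.yml:

    import pandas as pd

    from telco_churn.model_train_pipeline import ModelTrainPipeline

    # Toy data: one numeric and one categorical feature (columns are illustrative)
    X = pd.DataFrame({'tenure': [1.0, 24.0, 60.0, 12.0],
                      'contract': ['Month-to-month', 'Two year', 'One year', 'Month-to-month']})
    y = [1, 0, 0, 1]

    pipeline = ModelTrainPipeline.create_train_pipeline(model_params={'n_estimators': 100,
                                                                      'max_depth': 4,
                                                                      'random_state': 42})
    # Numerics are median-imputed, categoricals one-hot encoded, then the random forest is fit
    pipeline.fit(X, y)
    print(pipeline.predict(X))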
--------------------------------------------------------------------------------
/.github/workflows/onpullrequest.yml:
--------------------------------------------------------------------------------
name: CI pipeline

on:
  pull_request:
    branches:
      - main
    tags-ignore:
      - 'v*' # this tag type is used for release pipelines

jobs:
  ci-pipeline:

    runs-on: ubuntu-latest
    strategy:
      max-parallel: 4

    env:
      DATABRICKS_HOST: ${{ secrets.DATABRICKS_STAGING_HOST }}
      DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_STAGING_TOKEN }}

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: 3.9.5

      - name: Install pip
        run: |
          python -m pip install --upgrade pip

      - name: Install dependencies and project in dev mode
        run: |
          pip install -r unit-requirements.txt
          pip install -e .

      - name: Run unit tests
        run: |
          echo "Launching unit tests"
          pytest tests/unit

      - name: Deploy integration test [staging environment]
        run: |
          dbx deploy --jobs=STAGING-telco-churn-sample-integration-test --environment=staging --files-only

      - name: Run integration test [staging environment]
        run: |
          dbx launch --job=STAGING-telco-churn-sample-integration-test --environment=staging --as-run-submit --trace
--------------------------------------------------------------------------------
/telco_churn/utils/notebook_utils.py:
--------------------------------------------------------------------------------
import os
import pathlib
import dotenv
import yaml
import pprint
from typing import Dict, Any


def load_and_set_env_vars(env: str) -> Dict[str, Any]:
    """
    Utility function to use in Databricks notebooks to load .env files and set them as environment variables via os
    Return a dict of set environment variables

    Parameters
    ----------
    env : str
        Name of deployment environment. One of 'dev', 'staging' or 'prod'

    Returns
    -------
    Dictionary of set environment variables
    """
    env_vars_path = os.path.join(os.pardir, 'conf', env, f'.{env}.env')
    dotenv.load_dotenv(env_vars_path)

    base_data_vars_path = os.path.join(os.pardir, 'conf', '.base_data_params.env')
    dotenv.load_dotenv(base_data_vars_path)

    os_dict = dict(os.environ)
    pprint.pprint(os_dict)

    return os_dict


def load_config(pipeline_name: str) -> Dict[str, Any]:
    """
    Utility function to use in Databricks notebooks to load the config yaml file for a given pipeline
    Return dict of specified config params

    Parameters
    ----------
    pipeline_name : str
        Name of pipeline

    Returns
    -------
    Dictionary of config params
    """
    config_path = os.path.join(os.pardir, 'conf', 'pipeline_configs', f'{pipeline_name}.yml')
    pipeline_config = yaml.safe_load(pathlib.Path(config_path).read_text())
    pprint.pprint(pipeline_config)

    return pipeline_config
--------------------------------------------------------------------------------
/telco_churn/utils/feature_store_utils.py:
--------------------------------------------------------------------------------
from typing import Union, List

import pyspark

import databricks
from databricks.feature_store import FeatureStoreClient


def create_and_write_feature_table(df: pyspark.sql.DataFrame,
                                   feature_table_name: str,
                                   primary_keys: Union[str, List[str]],
                                   description: str) -> databricks.feature_store.entities.feature_table.FeatureTable:
    """
    Create and return a feature table with the given name and primary keys, writing the provided Spark DataFrame
    to the feature table

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Data to create this feature table from
    feature_table_name : str
        A feature table name of the form <database_name>.<table_name>, for example dev.user_features.
    primary_keys : Union[str, List[str]]
        The feature table’s primary keys. If multiple columns are required, specify a list of column names, for
        example ['customer_id', 'region'].
    description : str
        Description of the feature table.

    Returns
    -------
    databricks.feature_store.entities.feature_table.FeatureTable
    """
    fs = FeatureStoreClient()

    feature_table = fs.create_table(
        name=feature_table_name,
        primary_keys=primary_keys,
        schema=df.schema,
        description=description
    )

    fs.write_table(df=df, name=feature_table_name, mode='overwrite')

    return feature_table
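A hedged usage sketch of create_and_write_feature_table (table, database and column names are illustrative; this assumes a Databricks runtime where the Feature Store client is available):

    from telco_churn.utils.get_spark import spark
    from telco_churn.utils.feature_store_utils import create_and_write_feature_table

    # Toy feature DataFrame keyed by customerID (illustrative data)
    features_df = spark.createDataFrame(
        [('0001-ABCD', 12.0, 1), ('0002-EFGH', 48.0, 0)],
        schema='customerID string, tenure double, seniorCitizen int'
    )

    feature_table = create_and_write_feature_table(df=features_df,
                                                   feature_table_name='e2e_mlops_dev.churn_features',
                                                   primary_keys='customerID',
                                                   description='Illustrative churn features table')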
--------------------------------------------------------------------------------
/tests/integration/sample_test.py:
--------------------------------------------------------------------------------
import unittest
from uuid import uuid4

from pyspark.dbutils import DBUtils  # noqa

from telco_churn.pipelines.sample_test_job import SampleJob


class SampleJobIntegrationTest(unittest.TestCase):
    def setUp(self):

        self.test_dir = 'dbfs:/tmp/tests/sample/%s' % str(uuid4())
        self.test_config = {'output_format': 'delta', 'output_path': self.test_dir}

        self.job = SampleJob(init_conf=self.test_config)
        self.dbutils = DBUtils(self.job.spark)
        self.spark = self.job.spark

    def test_sample(self):

        self.job.launch()

        output_count = (
            self.spark.read.format(self.test_config['output_format'])
            .load(self.test_config['output_path'])
            .count()
        )

        self.assertGreater(output_count, 0)

    def tearDown(self):
        self.dbutils.fs.rm(self.test_dir, True)


if __name__ == '__main__':
    # please don't change the logic of test result checks here
    # it's intentionally done in this way to comply with pipelines run result checks
    # for other tests, please simply replace the SampleJobIntegrationTest with your custom class name
    loader = unittest.TestLoader()
    tests = loader.loadTestsFromTestCase(SampleJobIntegrationTest)
    runner = unittest.TextTestRunner(verbosity=2)
    result = runner.run(tests)
    if not result.wasSuccessful():
        raise RuntimeError(
            'One or multiple tests failed. Please check job logs for additional information.'
        )
--------------------------------------------------------------------------------
/telco_churn/pipelines/model_inference_batch_job.py:
--------------------------------------------------------------------------------
from telco_churn.common import Workload
from telco_churn.model_inference import ModelInference
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()


class ModelInferenceJob(Workload):

    def _get_model_uri(self) -> str:
        model_name = self.env_vars['model_name']
        model_registry_stage = self.conf['mlflow_params']['model_registry_stage']
        model_uri = f'models:/{model_name}/{model_registry_stage}'

        return model_uri

    def _get_input_table_name(self) -> str:
        """
        Get the name of the input table to perform inference on
        """
        return self.conf['data_input']['table_name']

    def _get_predictions_output_table_name(self) -> str:
        """
        Get the full (database.table) predictions table name to pass to run_and_write_batch of ModelInference
        """
        predictions_table_database_name = self.env_vars['predictions_table_database_name']
        predictions_table_name = f'{predictions_table_database_name}.{self.env_vars["predictions_table_name"]}'

        return predictions_table_name

    def launch(self):
        _logger.info('Launching Batch ModelInferenceJob job')
        _logger.info(f'Running model-inference-batch in {self.env_vars["env"]} environment')
        ModelInference(model_uri=self._get_model_uri(),
                       input_table_name=self._get_input_table_name(),
                       output_table_name=self._get_predictions_output_table_name())\
            .run_and_write_batch(mode=self.conf['data_output']['mode'])
        _logger.info('Batch ModelInferenceJob job finished')


if __name__ == '__main__':
    job = ModelInferenceJob()
    job.launch()
--------------------------------------------------------------------------------
/telco_churn/utils/evaluation_utils.py:
--------------------------------------------------------------------------------
from typing import Dict

import pandas as pd
from sklearn.metrics import roc_auc_score


class ModelEvaluation:

    @staticmethod
    def _roc_auc_score(y_true: pd.Series, y_score: pd.Series):
        """
        Compute ROC AUC score using sklearn. Computed in the same way as MLflow utils
        https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
        By default, for roc_auc_score, we pick `average` to be `weighted` and `multi_class` to be `ovo`,
        to make the output more insensitive to dataset imbalance.

        Parameters
        ----------
        y_true : array-like of shape (n_samples,) or (n_samples, n_classes)
            True labels or binary label indicators
        y_score : array-like of shape (n_samples,) or (n_samples, n_classes)
            Target scores.

        Returns
        -------
        auc : float
        """
        return roc_auc_score(y_true=y_true,
                             y_score=y_score,
                             average='weighted',
                             multi_class='ovo')

    def evaluate(self, y_true: pd.Series, y_score: pd.Series, metric_prefix: str = '') -> Dict:
        """
        Compute evaluation metrics for the given true labels and target scores

        Parameters
        ----------
        y_true : array-like of shape (n_samples,) or (n_samples, n_classes)
            True labels or binary label indicators
        y_score : array-like of shape (n_samples,) or (n_samples, n_classes)
            Target scores.
        metric_prefix : str
            Prefix for each metric key in the returned dictionary

        Returns
        -------
        Dictionary of (metric name, computed value)
        """
        return {
            f'{metric_prefix}roc_auc_score': self._roc_auc_score(y_true, y_score),
        }
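For clarity, a small illustrative example of ModelEvaluation (labels and scores are made up):

    from telco_churn.utils.evaluation_utils import ModelEvaluation

    y_true = [0, 1, 1, 0, 1]
    y_score = [0.2, 0.8, 0.6, 0.4, 0.9]  # e.g. predicted probability of churn

    # Every positive example is scored above every negative one, so AUC is 1.0 here
    metrics = ModelEvaluation().evaluate(y_true, y_score, metric_prefix='staging_')
    print(metrics)  # {'staging_roc_auc_score': 1.0}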
--------------------------------------------------------------------------------
/.github/workflows/onrelease.yml:
--------------------------------------------------------------------------------
name: Release pipeline

on:
  push:
    tags:
      - 'v*' # Push events to matching v*, i.e. v1.0, v20.15.10

jobs:
  release-pipeline:

    runs-on: ubuntu-latest
    strategy:
      max-parallel: 4
      matrix:
        python-version: [ 3.9 ]

    env:
      DATABRICKS_HOST: ${{ secrets.DATABRICKS_PROD_HOST }}
      DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_PROD_TOKEN }}

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: 3.9.5

      - name: Install pip
        run: |
          python -m pip install --upgrade pip

      - name: Install dependencies
        run: |
          pip install -r unit-requirements.txt

      - name: Deploy PROD-telco-churn-model-train job [prod environment]
        run: |
          dbx deploy --deployment-file conf/deployment.yml --jobs=PROD-telco-churn-model-train --environment=prod

      - name: Deploy PROD-telco-churn-model-deployment job [prod environment]
        run: |
          dbx deploy --deployment-file conf/deployment.yml --jobs=PROD-telco-churn-model-deployment --environment=prod

      - name: Deploy PROD-telco-churn-model-inference-batch job [prod environment]
        run: |
          dbx deploy --deployment-file conf/deployment.yml --jobs=PROD-telco-churn-model-inference-batch --environment=prod

      - name: Create Release
        id: create_release
        uses: actions/create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
        with:
          tag_name: ${{ github.ref }}
          release_name: ${{ github.ref }}
          body: |
            Release for version ${{ github.ref }}.
          draft: false
          prerelease: false
--------------------------------------------------------------------------------
/telco_churn/pipelines/model_deployment_job.py:
--------------------------------------------------------------------------------
from telco_churn.common import Workload, MLflowTrackingConfig
from telco_churn.model_deployment import ModelDeployment, ModelDeploymentConfig
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()


class ModelDeploymentJob(Workload):

    def _get_mlflow_tracking_cfg(self):
        return MLflowTrackingConfig(experiment_path=self.env_vars['model_deploy_experiment_path'],
                                    run_name='staging_vs_prod_comparison',
                                    model_name=self.env_vars['model_name'])

    def _get_reference_data(self) -> str:
        reference_table_database_name = self.env_vars['reference_table_database_name']
        reference_table_name = self.env_vars['reference_table_name']
        return f'{reference_table_database_name}.{reference_table_name}'

    def _get_reference_data_label_col(self) -> str:
        return self.env_vars['reference_table_label_col']

    def _get_model_comparison_params(self) -> dict:
        return self.conf['model_comparison_params']

    def launch(self):
        _logger.info('Launching ModelDeploymentJob job')
        _logger.info(f'Running model-deployment pipeline in {self.env_vars["env"]} environment')
        cfg = ModelDeploymentConfig(mlflow_tracking_cfg=self._get_mlflow_tracking_cfg(),
                                    reference_data=self._get_reference_data(),
                                    label_col=self._get_reference_data_label_col(),
                                    comparison_metric=self._get_model_comparison_params()['metric'],
                                    higher_is_better=self._get_model_comparison_params()['higher_is_better'])
        ModelDeployment(cfg).run()
        _logger.info('ModelDeploymentJob job finished!')


if __name__ == '__main__':
    job = ModelDeploymentJob()
    job.launch()
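The promotion decision this job delegates to ModelDeployment.run() (and which the model_deployment notebook below describes) boils down to the comparison sketched here; this is an illustrative sketch of the described logic, not the actual implementation:

    def decide_promotion(staging_metric: float, prod_metric: float, higher_is_better: bool) -> str:
        """Return the action to take on the candidate Staging model (illustrative sketch)."""
        staging_wins = staging_metric > prod_metric if higher_is_better else staging_metric < prod_metric
        if staging_wins:
            return 'promote Staging model to Production, archive current Production model'
        return 'archive Staging model, keep current Production model'


    # e.g. with roc_auc_score and higher_is_better=True (per conf/pipeline_configs/model_deployment.yml)
    print(decide_promotion(staging_metric=0.91, prod_metric=0.88, higher_is_better=True))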
--------------------------------------------------------------------------------
/tests/unit/model_train_pipeline_test.py:
--------------------------------------------------------------------------------
import unittest
from dataclasses import dataclass

import numpy as np
import pandas as pd

from telco_churn.model_train_pipeline import ModelTrainPipeline


class ModelTrainPipelineTest(unittest.TestCase):

    def test_create_train_pipeline(self):
        @dataclass
        class Example:
            contract: str
            dependents: str
            deviceProtection: str
            gender: str
            internetService: str
            monthlyCharges: float
            multipleLines: str
            onlineBackup: str
            onlineSecurity: str
            paperlessBilling: str
            partner: str
            paymentMethod: str
            phoneService: str
            seniorCitizen: float
            streamingMovies: str
            streamingTV: str
            techSupport: str
            tenure: float
            totalCharges: float

        X = pd.DataFrame(data=[
            Example('Two year', 'Yes', 'No', 'Female', 'DSL', 53.65, 'No phone service', 'Yes', 'No', 'No', 'Yes',
                    'Credit card (automatic)', 'No', 0.0, 'Yes', 'Yes', 'Yes', 72.0, 3784.0),
            Example('Month-to-month', 'No', 'No', 'Male', 'Fiber optic', 74.9, 'Yes', 'No', 'No', 'Yes', 'No',
                    'Electronic check', 'Yes', 0.0, 'No', 'No', 'No', 1.0, 74.9),
            Example('Month-to-month', 'No', 'No', 'Female', 'Fiber optic', 100.4, 'Yes', 'No', 'No', 'Yes', 'Yes',
                    'Bank transfer (automatic)', 'Yes', 1.0, 'Yes', 'Yes', 'Yes', 58.0, 5749.8),
        ])
        y = np.random.randint(2, size=3)

        model_params = {'n_estimators': 4,
                        'max_depth': 4,
                        'min_samples_leaf': 1,
                        'max_features': 'auto',
                        'random_state': 42}

        pipeline = ModelTrainPipeline.create_train_pipeline(model_params=model_params)
        pipeline.fit(X, y)
        y_pred = pipeline.predict(X)

        # Predictions should be binary (0/1) labels
        assert np.array_equal(y_pred, y_pred.astype(bool))
--------------------------------------------------------------------------------
/telco_churn/pipelines/feature_table_creator_job.py:
--------------------------------------------------------------------------------
from telco_churn.common import Workload, FeatureStoreTableConfig, LabelsTableConfig
from telco_churn.feature_table_creator import FeatureTableCreator, FeatureTableCreatorConfig
from telco_churn.featurize import FeaturizerConfig
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()


class FeatureTableCreatorJob(Workload):

    def _get_input_table(self) -> str:
        return self.conf['input_table']

    def _get_data_prep_params(self) -> FeaturizerConfig:
        return FeaturizerConfig(**self.conf['data_prep_params'])

    def _get_feature_store_table_cfg(self) -> FeatureStoreTableConfig:
        return FeatureStoreTableConfig(database_name=self.env_vars['feature_store_database_name'],
                                       table_name=self.env_vars['feature_store_table_name'],
                                       primary_keys=self.env_vars['feature_store_table_primary_keys'],
                                       description=self.env_vars['feature_store_table_description'])

    def _get_labels_table_cfg(self) -> LabelsTableConfig:
        return LabelsTableConfig(database_name=self.env_vars['labels_table_database_name'],
                                 table_name=self.env_vars['labels_table_name'],
                                 label_col=self.env_vars['labels_table_label_col'],
                                 dbfs_path=self.env_vars['labels_table_dbfs_path'])

    def launch(self) -> None:
        """
        Launch FeatureTableCreator job
        """
        _logger.info('Launching FeatureTableCreator job')
        _logger.info(f'Running feature-table-creation pipeline in {self.env_vars["env"]} environment')
        cfg = FeatureTableCreatorConfig(input_table=self._get_input_table(),
                                        featurizer_cfg=self._get_data_prep_params(),
                                        feature_store_table_cfg=self._get_feature_store_table_cfg(),
                                        labels_table_cfg=self._get_labels_table_cfg())
        FeatureTableCreator(cfg).run()
        _logger.info('FeatureTableCreator job finished!')


if __name__ == '__main__':
    job = FeatureTableCreatorJob()
    job.launch()
--------------------------------------------------------------------------------
/notebooks/model_inference_batch.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %md
# MAGIC
# MAGIC # `model_inference_batch`
# MAGIC
# MAGIC Pipeline to execute model inference.
# MAGIC Apply the model at the specified URI for batch inference on the table with name `input_table_name`, writing results to the table with name `output_table_name`

# COMMAND ----------

# DBTITLE 1,pip install requirements.txt
# MAGIC %pip install -r ../requirements.txt

# COMMAND ----------

# DBTITLE 1,Set env
dbutils.widgets.dropdown('env', 'dev', ['dev', 'staging', 'prod'], 'Environment Name')

# COMMAND ----------

# DBTITLE 1,Module Imports
from telco_churn.utils.notebook_utils import load_and_set_env_vars, load_config
from telco_churn.model_inference import ModelInference
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()

# COMMAND ----------

# DBTITLE 1,Load pipeline config params
# Set pipeline name
pipeline_name = 'model_inference_batch'

# Load pipeline config yaml file (../conf/pipeline_configs/{pipeline_name}.yml)
pipeline_config = load_config(pipeline_name)

# Load and set arbitrary params via spark_env_vars
# Params passed via ../conf/{env}/.{env}.env and ../conf/.base_data_params.env
env_vars = load_and_set_env_vars(env=dbutils.widgets.get('env'))

# COMMAND ----------

# DBTITLE 1,Pipeline Config
# Fetch model_uri
model_name = env_vars['model_name']
model_registry_stage = pipeline_config['mlflow_params']['model_registry_stage']
model_uri = f'models:/{model_name}/{model_registry_stage}'
print(f'model_uri: {model_uri}')

# Set input table name
input_table_name = pipeline_config['data_input']['table_name']
print(f'input_table_name: {input_table_name}')

# Set output table name
predictions_table_database_name = env_vars['predictions_table_database_name']
predictions_table_name = f'{predictions_table_database_name}.{env_vars["predictions_table_name"]}'
print(f'predictions_table_name: {predictions_table_name}')

# COMMAND ----------

# DBTITLE 1,Execute Pipeline
# Instantiate model inference pipeline
model_inference_pipeline = ModelInference(model_uri=model_uri,
                                          input_table_name=input_table_name,
                                          output_table_name=predictions_table_name)

model_inference_pipeline.run_and_write_batch(mode=pipeline_config['data_output']['mode'])
--------------------------------------------------------------------------------
/telco_churn/pipelines/model_train_job.py:
--------------------------------------------------------------------------------
from telco_churn.common import Workload, MLflowTrackingConfig, FeatureStoreTableConfig, LabelsTableConfig
from telco_churn.model_train import ModelTrain, ModelTrainConfig
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()


class ModelTrainJob(Workload):

    def _get_mlflow_tracking_cfg(self):
        try:
            experiment_id = self.env_vars['model_train_experiment_id']
        except KeyError:
            experiment_id = None
        try:
            experiment_path = self.env_vars['model_train_experiment_path']
        except KeyError:
            experiment_path = None

        return MLflowTrackingConfig(run_name=self.conf['mlflow_params']['run_name'],
                                    experiment_id=experiment_id,
                                    experiment_path=experiment_path,
                                    model_name=self.env_vars['model_name'])

    def _get_feature_store_table_cfg(self):
        return FeatureStoreTableConfig(database_name=self.env_vars['feature_store_database_name'],
                                       table_name=self.env_vars['feature_store_table_name'],
                                       primary_keys=self.env_vars['feature_store_table_primary_keys'])

    def _get_labels_table_cfg(self):
        return LabelsTableConfig(database_name=self.env_vars['labels_table_database_name'],
                                 table_name=self.env_vars['labels_table_name'],
                                 label_col=self.env_vars['labels_table_label_col'])

    def _get_pipeline_params(self):
        return self.conf['pipeline_params']

    def _get_model_params(self):
        return self.conf['model_params']

    def launch(self):
        _logger.info('Launching ModelTrainJob job')
        _logger.info(f'Running model-train pipeline in {self.env_vars["env"]} environment')
        cfg = ModelTrainConfig(mlflow_tracking_cfg=self._get_mlflow_tracking_cfg(),
                               feature_store_table_cfg=self._get_feature_store_table_cfg(),
                               labels_table_cfg=self._get_labels_table_cfg(),
                               pipeline_params=self._get_pipeline_params(),
                               model_params=self._get_model_params(),
                               conf=self.conf,
                               env_vars=self.env_vars)
        ModelTrain(cfg).run()
        _logger.info('ModelTrainJob job finished!')


if __name__ == '__main__':
    job = ModelTrainJob()
    job.launch()
--------------------------------------------------------------------------------
/notebooks/feature_table_creator.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %md
# MAGIC
# MAGIC # `feature_table_creator`
# MAGIC
# MAGIC Pipeline to create a Feature Store feature table, along with a separate labels table

# COMMAND ----------

# DBTITLE 1,pip install requirements.txt
# MAGIC %pip install -r ../requirements.txt

# COMMAND ----------

# DBTITLE 1,Set env
dbutils.widgets.dropdown('env', 'dev', ['dev', 'staging', 'prod'], 'Environment Name')

# COMMAND ----------

# DBTITLE 1,Module Imports
from telco_churn.utils.notebook_utils import load_and_set_env_vars, load_config

from telco_churn.common import FeatureStoreTableConfig, LabelsTableConfig
from telco_churn.feature_table_creator import FeatureTableCreator, FeatureTableCreatorConfig
from telco_churn.featurize import FeaturizerConfig
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()

# COMMAND ----------

# DBTITLE 1,Load pipeline config params
# Set pipeline name
pipeline_name = 'feature_table_creator'

# Load pipeline config yaml file (../conf/pipeline_configs/{pipeline_name}.yml)
pipeline_config = load_config(pipeline_name)

# Load and set arbitrary params via spark_env_vars
# Params passed via ../conf/{env}/.{env}.env and ../conf/.base_data_params.env
env_vars = load_and_set_env_vars(env=dbutils.widgets.get('env'))

# COMMAND ----------

# DBTITLE 1,Pipeline Config
# Set FeaturizerConfig - data preparation config
featurizer_cfg = FeaturizerConfig(**pipeline_config['data_prep_params'])

# Set Feature Store feature table config
feature_store_table_cfg = FeatureStoreTableConfig(database_name=env_vars['feature_store_database_name'],
                                                  table_name=env_vars['feature_store_table_name'],
                                                  primary_keys=env_vars['feature_store_table_primary_keys'],
                                                  description=env_vars['feature_store_table_description'])

# Set Labels Table config
labels_table_cfg = LabelsTableConfig(database_name=env_vars['labels_table_database_name'],
                                     table_name=env_vars['labels_table_name'],
                                     label_col=env_vars['labels_table_label_col'],
                                     dbfs_path=env_vars['labels_table_dbfs_path'])

# Set FeatureTableCreatorConfig
cfg = FeatureTableCreatorConfig(input_table=pipeline_config['input_table'],
                                featurizer_cfg=featurizer_cfg,
                                feature_store_table_cfg=feature_store_table_cfg,
                                labels_table_cfg=labels_table_cfg)

# COMMAND ----------

# DBTITLE 1,Execute Pipeline
# Instantiate pipeline
feature_table_creator_pipeline = FeatureTableCreator(cfg)
feature_table_creator_pipeline.run()
--------------------------------------------------------------------------------
/notebooks/model_train.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %md
# MAGIC
# MAGIC # `model_train`
# MAGIC
# MAGIC Pipeline to execute model training. Params, metrics and model artifacts will be tracked to MLflow Tracking.
# MAGIC Optionally, the resulting model will be registered to the MLflow Model Registry if a model name is provided.

# COMMAND ----------

# DBTITLE 1,pip install requirements.txt
# MAGIC %pip install -r ../requirements.txt

# COMMAND ----------

# DBTITLE 1,Set env
dbutils.widgets.dropdown('env', 'dev', ['dev', 'staging', 'prod'], 'Environment Name')

# COMMAND ----------

# DBTITLE 1,Module Imports
from telco_churn.utils.notebook_utils import load_and_set_env_vars, load_config

from telco_churn.common import MLflowTrackingConfig, FeatureStoreTableConfig, LabelsTableConfig
from telco_churn.model_train import ModelTrain, ModelTrainConfig
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()

# COMMAND ----------

# DBTITLE 1,Load pipeline config params
# Set pipeline name
pipeline_name = 'model_train'

# Load pipeline config yaml file (../conf/pipeline_configs/{pipeline_name}.yml)
pipeline_config = load_config(pipeline_name)

# Load and set arbitrary params via spark_env_vars
# Params passed via ../conf/{env}/.{env}.env and ../conf/.base_data_params.env
env_vars = load_and_set_env_vars(env=dbutils.widgets.get('env'))

# COMMAND ----------

# DBTITLE 1,Pipeline Config
# Set MLflowTrackingConfig
mlflow_tracking_cfg = MLflowTrackingConfig(run_name=pipeline_config['mlflow_params']['run_name'],
                                           experiment_path=env_vars['model_train_experiment_path'],
                                           model_name=env_vars['model_name'])


# Set FeatureStoreTableConfig
feature_store_table_cfg = FeatureStoreTableConfig(database_name=env_vars['feature_store_database_name'],
                                                  table_name=env_vars['feature_store_table_name'],
                                                  primary_keys=env_vars['feature_store_table_primary_keys'])

# Set LabelsTableConfig
labels_table_cfg = LabelsTableConfig(database_name=env_vars['labels_table_database_name'],
                                     table_name=env_vars['labels_table_name'],
                                     label_col=env_vars['labels_table_label_col'])

# Set pipeline_params
pipeline_params = pipeline_config['pipeline_params']

# Set model_params
model_params = pipeline_config['model_params']

# Define ModelTrainConfig
cfg = ModelTrainConfig(mlflow_tracking_cfg=mlflow_tracking_cfg,
                       feature_store_table_cfg=feature_store_table_cfg,
                       labels_table_cfg=labels_table_cfg,
                       pipeline_params=pipeline_params,
                       model_params=model_params,
                       conf=pipeline_config,  # Track pipeline_config to mlflow
                       env_vars=env_vars  # Track env_vars to mlflow
                       )

# COMMAND ----------

# DBTITLE 1,Execute Pipeline
# Instantiate pipeline
model_train_pipeline = ModelTrain(cfg)
model_train_pipeline.run()
--------------------------------------------------------------------------------
/notebooks/model_deployment.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %md
# MAGIC
# MAGIC # `model_deployment`
# MAGIC
# MAGIC Pipeline to execute model deployment. This pipeline orchestrates the comparison of the current Production model versus the current Staging model.
# MAGIC The Production model will be the most recent model version registered in the MLflow Model Registry under the provided model_name, for stage="Production". Likewise for Staging.
# MAGIC Execution will involve loading the models and performing batch inference for a specified reference dataset.
# MAGIC The two models will be compared using the specified comparison_metric.
# MAGIC `higher_is_better` indicates whether a higher value for the evaluation metric equates to a better performing model.
# MAGIC Dependent on this comparison, the candidate Staging model will either be promoted to Production (and the current
# MAGIC Production model archived) if it performs better, or be archived if it does not perform better than the current Production model.
# MAGIC
# MAGIC Metrics computed when comparing the two models will be logged to MLflow, under the provided experiment_id or experiment_path.
# MAGIC
# MAGIC **Pipeline Steps**:
# MAGIC 1. Set MLflow Tracking experiment. Used to track metrics computed when comparing Staging versus Production models.
# MAGIC 1. Load Staging and Production models and score against the reference dataset provided. The reference data specified must currently be a table.
# MAGIC 1. Compute evaluation metric for both Staging and Production model predictions against reference data
# MAGIC 1. If higher_is_better=True, the Staging model will be promoted in place of the Production model iff the Staging model evaluation metric is higher than the Production model evaluation metric. If higher_is_better=False, the Staging model will be promoted in place of the Production model iff the Staging model evaluation metric is lower than the Production model evaluation metric.

# COMMAND ----------

# DBTITLE 1,pip install requirements.txt
# MAGIC %pip install -r ../requirements.txt

# COMMAND ----------

# DBTITLE 1,Set env
dbutils.widgets.dropdown('env', 'dev', ['dev', 'staging', 'prod'], 'Environment Name')

# COMMAND ----------

# DBTITLE 1,Module Imports
from telco_churn.utils.notebook_utils import load_and_set_env_vars, load_config

from telco_churn.common import MLflowTrackingConfig
from telco_churn.model_deployment import ModelDeployment, ModelDeploymentConfig
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()

# COMMAND ----------

# DBTITLE 1,Load pipeline config params
# Set pipeline name
pipeline_name = 'model_deployment'

# Load pipeline config yaml file (../conf/pipeline_configs/{pipeline_name}.yml)
pipeline_config = load_config(pipeline_name)

# Load and set arbitrary params via spark_env_vars
# Params passed via ../conf/{env}/.{env}.env and ../conf/.base_data_params.env
env_vars = load_and_set_env_vars(env=dbutils.widgets.get('env'))

# COMMAND ----------

# DBTITLE 1,Pipeline Config
# Set MLflowTrackingConfig - comparison metrics logged to MLflow
mlflow_tracking_cfg = MLflowTrackingConfig(experiment_path=env_vars['model_deploy_experiment_path'],
                                           run_name='staging_vs_prod_comparison',
                                           model_name=env_vars['model_name'])

# Define reference dataset
reference_table_database_name = env_vars['reference_table_database_name']
reference_table_name = f'{reference_table_database_name}.{env_vars["reference_table_name"]}'

# Set label col from reference dataset
label_col = env_vars['reference_table_label_col']

# Params defining how to compare staging vs prod models
model_comparison_params = pipeline_config['model_comparison_params']

# Define ModelDeploymentConfig
cfg = ModelDeploymentConfig(mlflow_tracking_cfg=mlflow_tracking_cfg,
                            reference_data=reference_table_name,
                            label_col=label_col,
                            comparison_metric=model_comparison_params['metric'],
                            higher_is_better=model_comparison_params['higher_is_better'])

# COMMAND ----------

# DBTITLE 1,Execute Pipeline
# Instantiate pipeline
model_deployment_pipeline = ModelDeployment(cfg)
model_deployment_pipeline.run()
--------------------------------------------------------------------------------
/tests/unit/conftest.py:
--------------------------------------------------------------------------------
"""
This conftest.py contains handy components that prepare SparkSession and other relevant objects.
"""

import os
from pathlib import Path
import shutil
import tempfile
from typing import Iterator
from unittest.mock import patch

import mlflow
import pytest
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
import logging
from dataclasses import dataclass


@dataclass
class FileInfoFixture:
    """
    This class mocks the DBUtils FileInfo object
    """
    path: str
    name: str
    size: int
    modificationTime: int


class DBUtilsFixture:
    """
    This class is used for mocking the behaviour of DBUtils inside tests.
34 | """ 35 | 36 | def __init__(self): 37 | self.fs = self 38 | 39 | def cp(self, src: str, dest: str, recurse: bool = False): 40 | copy_func = shutil.copytree if recurse else shutil.copy 41 | copy_func(src, dest) 42 | 43 | def ls(self, path: str): 44 | _paths = Path(path).glob("*") 45 | _objects = [ 46 | FileInfoFixture( 47 | str(p.absolute()), p.name, p.stat().st_size, int(p.stat().st_mtime) 48 | ) 49 | for p in _paths 50 | ] 51 | return _objects 52 | 53 | def mkdirs(self, path: str): 54 | Path(path).mkdir(parents=True, exist_ok=True) 55 | 56 | def mv(self, src: str, dest: str, recurse: bool = False): 57 | copy_func = shutil.copytree if recurse else shutil.copy 58 | shutil.move(src, dest, copy_function=copy_func) 59 | 60 | def put(self, path: str, content: str, overwrite: bool = False): 61 | _f = Path(path) 62 | 63 | if _f.exists() and not overwrite: 64 | raise FileExistsError("File already exists") 65 | 66 | _f.write_text(content, encoding="utf-8") 67 | 68 | def rm(self, path: str, recurse: bool = False): 69 | deletion_func = shutil.rmtree if recurse else os.remove 70 | deletion_func(path) 71 | 72 | 73 | @pytest.fixture(scope="session") 74 | def spark() -> SparkSession: 75 | """ 76 | This fixture provides preconfigured SparkSession with Hive and Delta support. 77 | After the test session, temporary warehouse directory is deleted. 78 | :return: SparkSession 79 | """ 80 | logging.info("Configuring Spark session for testing environment") 81 | warehouse_dir = tempfile.TemporaryDirectory().name 82 | _builder = ( 83 | SparkSession.builder.master("local[1]") 84 | .config("spark.hive.metastore.warehouse.dir", Path(warehouse_dir).as_uri()) 85 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 86 | .config( 87 | "spark.sql.catalog.spark_catalog", 88 | "org.apache.spark.sql.delta.catalog.DeltaCatalog", 89 | ) 90 | ) 91 | spark: SparkSession = configure_spark_with_delta_pip(_builder).getOrCreate() 92 | logging.info("Spark session configured") 93 | yield spark 94 | logging.info("Shutting down Spark session") 95 | spark.stop() 96 | if Path(warehouse_dir).exists(): 97 | shutil.rmtree(warehouse_dir) 98 | 99 | 100 | @pytest.fixture(scope="session", autouse=True) 101 | def mlflow_local(): 102 | """ 103 | This fixture provides local instance of mlflow with support for tracking and registry functions. 104 | After the test session: 105 | * temporary storage for tracking and registry is deleted. 106 | * Active run will be automatically stopped to avoid verbose errors. 107 | :return: None 108 | """ 109 | logging.info("Configuring local MLflow instance") 110 | tracking_uri = tempfile.TemporaryDirectory().name 111 | registry_uri = f"sqlite:///{tempfile.TemporaryDirectory().name}" 112 | 113 | mlflow.set_tracking_uri(Path(tracking_uri).as_uri()) 114 | mlflow.set_registry_uri(registry_uri) 115 | logging.info("MLflow instance configured") 116 | yield None 117 | 118 | mlflow.end_run() 119 | 120 | if Path(tracking_uri).exists(): 121 | shutil.rmtree(tracking_uri) 122 | 123 | if Path(registry_uri).exists(): 124 | Path(registry_uri).unlink() 125 | logging.info("Test session finished, unrolling the MLflow instance") 126 | 127 | 128 | @pytest.fixture(scope="session", autouse=True) 129 | def dbutils_fixture() -> Iterator[None]: 130 | """ 131 | This fixture patches the `get_dbutils` function. 132 | Please note that patch is applied on a string name of the function. 133 | If you change the name or location of it, patching won't work. 
--------------------------------------------------------------------------------
/telco_churn/model_inference.py:
--------------------------------------------------------------------------------
import pyspark.sql.dataframe
from databricks.feature_store import FeatureStoreClient

from telco_churn.utils.get_spark import spark
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()


class ModelInference:
    """
    Class to execute model inference.
    Apply the model at the specified URI for batch inference on the table with name input_table_name,
    writing results to the table with name output_table_name
    """
    def __init__(self, model_uri: str, input_table_name: str, output_table_name: str = None):
        """

        Parameters
        ----------
        model_uri : str
            MLflow model uri. The model must have been logged using the Feature Store API
        input_table_name : str
            Table name to load as a Spark DataFrame to score the model on. Must contain column(s)
            for lookup keys to join feature data from Feature Store
        output_table_name : str
            Output table name to write results to
        """
        self.model_uri = model_uri
        self.input_table_name = input_table_name
        self.output_table_name = output_table_name

    def _load_input_table(self) -> pyspark.sql.DataFrame:
        """
        Load Spark DataFrame containing lookup keys to join feature data from Feature Store

        Returns
        -------
        pyspark.sql.DataFrame
        """
        input_table_name = self.input_table_name
        _logger.info(f"Loading lookup keys from input table: {input_table_name}")
        return spark.table(input_table_name)

    def fs_score_batch(self, df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
        """
        Load and apply model from MLflow Model Registry using the Feature Store API. Features will be automatically
        retrieved from the Feature Store. This method requires that the registered model must have been logged
        with FeatureStoreClient.log_model(), which packages the model with feature metadata. Unless present in df,
        these features will be looked up from Feature Store and joined with df prior to scoring the model.

        Parameters
        ----------
        df : pyspark.sql.DataFrame
            The DataFrame to score the model on. Feature Store features will be joined with df prior to scoring the
            model. df must:

                1. Contain columns for lookup keys required to join feature data from Feature Store, as specified in
                   the feature_spec.yaml artifact.
                2. Contain columns for all source keys required to score the model, as specified in the
                   feature_spec.yaml artifact.
                3. Not contain a column prediction, which is reserved for the modelʼs predictions. df may contain
                   additional columns.

        Returns
        -------
        pyspark.sql.DataFrame:
            A Spark DataFrame containing:
                1. All columns of df.
                2. All feature values retrieved from Feature Store.
                3. A column prediction containing the output of the model.
        """
        fs = FeatureStoreClient()
        _logger.info(f"Loading model from Model Registry: {self.model_uri}")

        return fs.score_batch(self.model_uri, df)

    def run_batch(self) -> pyspark.sql.DataFrame:
        """
        Load inference lookup keys, feature data from Feature Store, and score using the loaded model from the MLflow
        Model Registry

        Returns
        -------
        pyspark.sql.DataFrame:
            A Spark DataFrame containing:
                1. All columns of the inference df.
                2. All feature values retrieved from Feature Store.
                3. A column prediction containing the output of the model.
        """
        input_df = self._load_input_table()
        pred_df = self.fs_score_batch(input_df)

        return pred_df

    def run_and_write_batch(self, mode: str = 'overwrite') -> None:
        """
        Run batch inference, save as Delta table to `self.output_table_name`

        Parameters
        ----------
        mode : str
            Specify behavior when predictions data already exists.
            Options include:
                * "append": Append contents of this :class:`DataFrame` to existing data.
                * "overwrite": Overwrite existing data.

        Returns
        -------
        None
        """
        _logger.info("==========Running batch model inference==========")
        pred_df = self.run_batch()

        _logger.info("==========Writing predictions==========")
        _logger.info(f"mode={mode}")
        _logger.info(f"Predictions written to {self.output_table_name}")
        # Model predictions are written to the Delta table provided as input.
        # Delta is the default format in Databricks Runtime 8.0 and above.
        pred_df.write.format("delta").mode(mode).saveAsTable(self.output_table_name)

        _logger.info("==========Batch model inference completed==========")
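A minimal usage sketch of ModelInference (mirroring the model_inference_batch notebook above); the model and table names are illustrative, and this assumes a model logged via the Feature Store API is registered under the given name:

    from telco_churn.model_inference import ModelInference

    model_inference_pipeline = ModelInference(model_uri='models:/e2e_mlops_telco_churn_dev/production',
                                              input_table_name='e2e_mlops_dev.churn_labels',
                                              output_table_name='e2e_mlops_dev.churn_predictions')

    # Looks up features from the Feature Store, scores the model, and writes a Delta predictions table
    model_inference_pipeline.run_and_write_batch(mode='overwrite')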
72 | """ 73 | fs = FeatureStoreClient() 74 | _logger.info(f"Loading model from Model Registry: {self.model_uri}") 75 | 76 | return fs.score_batch(self.model_uri, df) 77 | 78 | def run_batch(self) -> pyspark.sql.DataFrame: 79 | """ 80 | Load inference lookup keys, feature data from Feature Store, and score using the loaded model from MLflow 81 | model registry 82 | 83 | Returns 84 | ------- 85 | pyspark.sql.DataFrame: 86 | A Spark DataFrame containing: 87 | 1. All columns of inference df. 88 | 2. All feature values retrieved from Feature Store. 89 | 3. A column prediction containing the output of the model. 90 | """ 91 | input_df = self._load_input_table() 92 | pred_df = self.fs_score_batch(input_df) 93 | 94 | return pred_df 95 | 96 | def run_and_write_batch(self, mode: str = 'overwrite') -> None: 97 | """ 98 | Run batch inference, save as Delta table to `self.output_table_name` 99 | 100 | Parameters 101 | ---------- 102 | mode : str 103 | Specify behavior when predictions data already exists. 104 | Options include: 105 | * "append": Append contents of this :class:`DataFrame` to existing data. 106 | * "overwrite": Overwrite existing data. 107 | 108 | Returns 109 | ------- 110 | 111 | """ 112 | _logger.info("==========Running batch model inference==========") 113 | pred_df = self.run_batch() 114 | 115 | _logger.info("==========Writing predictions==========") 116 | _logger.info(f"mode={mode}") 117 | _logger.info(f"Predictions written to {self.output_table_name}") 118 | # Model predictions are written to the Delta table provided as input. 119 | # Delta is the default format in Databricks Runtime 8.0 and above. 120 | pred_df.write.format("delta").mode(mode).saveAsTable(self.output_table_name) 121 | 122 | _logger.info("==========Batch model inference completed==========") 123 | -------------------------------------------------------------------------------- /telco_churn/featurize.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import pyspark 4 | import pyspark.pandas as ps 5 | 6 | from telco_churn.utils.logger_utils import get_logger 7 | 8 | _logger = get_logger() 9 | 10 | 11 | @dataclass 12 | class FeaturizerConfig: 13 | """ 14 | Attributes: 15 | label_col (str): Name of original label column in input data 16 | ohe (bool): Flag to indicate whether or not to one hot encode categorical columns 17 | cat_cols (list): List of categorical columns. Only required if ohe=True 18 | drop_missing (bool): Flag to indicate whether or not to drop missing values 19 | """ 20 | label_col: str = 'churnString' 21 | ohe: bool = False 22 | cat_cols: list = None 23 | drop_missing: bool = True 24 | 25 | 26 | class Featurizer: 27 | """ 28 | Class containing featurization logic to apply to input Spark DataFrame 29 | """ 30 | def __init__(self, cfg: FeaturizerConfig): 31 | self.cfg = cfg 32 | 33 | @staticmethod 34 | def pyspark_pandas_ohe(psdf: ps.DataFrame, cat_cols: list) -> pyspark.pandas.DataFrame: 35 | """ 36 | Take a pyspark.pandas DataFrame and convert a list of categorical variables (columns) into dummy/indicator 37 | variables, also known as one hot encoding. 
38 | 39 | Parameters 40 | ---------- 41 | psdf : pyspark.pandas.DataFrame 42 | pyspark.pandas DataFrame to OHE 43 | cat_cols : list 44 | List of categorical features 45 | Returns 46 | ------- 47 | pyspark.pandas.DataFrame 48 | """ 49 | return ps.get_dummies(psdf, columns=cat_cols, dtype='int64') 50 | 51 | def process_label(self, psdf: pyspark.pandas.DataFrame, rename_to: str = 'churn') -> pyspark.pandas.DataFrame: 52 | """ 53 | Convert label to int and rename label column 54 | TODO: add test 55 | 56 | Parameters 57 | ---------- 58 | psdf : pyspark.pandas.DataFrame 59 | pyspark.pandas DataFrame 60 | rename_to : str 61 | New name for the label column 62 | Returns 63 | ------- 64 | pyspark.pandas.DataFrame 65 | """ 66 | psdf[self.cfg.label_col] = psdf[self.cfg.label_col].map({'Yes': 1, 'No': 0}) 67 | psdf = psdf.astype({self.cfg.label_col: 'int32'}) 68 | psdf = psdf.rename(columns={self.cfg.label_col: rename_to}) 69 | 70 | return psdf 71 | 72 | @staticmethod 73 | def process_col_names(psdf: pyspark.pandas.DataFrame) -> pyspark.pandas.DataFrame: 74 | """ 75 | Clean up existing column names: remove spaces, replace '(' with '_' and strip ')' 76 | TODO: add test 77 | 78 | Parameters 79 | ---------- 80 | psdf : pyspark.pandas.DataFrame 81 | pyspark.pandas DataFrame 82 | Returns 83 | ------- 84 | pyspark.pandas.DataFrame 85 | """ 86 | cols = psdf.columns.to_list() 87 | new_col_names = [col.replace(' ', '').replace('(', '_').replace(')', '') for col in cols] 88 | 89 | # Rename columns to the cleaned-up names 90 | psdf.columns = new_col_names 91 | 92 | return psdf 93 | 94 | @staticmethod 95 | def drop_missing_values(psdf: pyspark.pandas.DataFrame) -> pyspark.pandas.DataFrame: 96 | """ 97 | Remove missing values 98 | 99 | Parameters 100 | ---------- 101 | psdf : pyspark.pandas.DataFrame 102 | Returns 103 | ------- 104 | pyspark.pandas.DataFrame 105 | """ 106 | return psdf.dropna() 107 | 108 | def run(self, df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame: 109 | """ 110 | Run all data preprocessing steps. Consists of the following: 111 | 112 | 1. Convert PySpark DataFrame to pandas_on_spark DataFrame 113 | 2. Process the label column - converting to int and renaming col to 'churn' 114 | 3. Apply OHE if specified in the config 115 | 4. Drop any missing values if specified in the config 116 | 5.
Return resulting preprocessed dataset as a PySpark DataFrame 117 | 118 | Parameters 119 | ---------- 120 | df : pyspark.sql.DataFrame 121 | Input PySpark DataFrame to preprocess 122 | 123 | Returns 124 | ------- 125 | pyspark.sql.DataFrame 126 | Preprocessed dataset of features and label column 127 | """ 128 | _logger.info('Running Data Preprocessing steps...') 129 | 130 | # Convert Spark DataFrame to pandas on Spark DataFrame 131 | psdf = df.pandas_api() 132 | 133 | # Convert label to int and rename column 134 | _logger.info(f'Processing label: {self.cfg.label_col}') 135 | psdf = self.process_label(psdf, rename_to='churn') 136 | 137 | # OHE 138 | if self.cfg.ohe: 139 | _logger.info('Applying one-hot-encoding') 140 | if self.cfg.cat_cols is None: 141 | raise RuntimeError('cat_cols must be provided if ohe=True') 142 | psdf = self.pyspark_pandas_ohe(psdf, self.cfg.cat_cols) 143 | 144 | # Clean up column names resulting from OHE 145 | _logger.info(f'Renaming columns') 146 | psdf = self.process_col_names(psdf) 147 | 148 | # Drop missing values 149 | if self.cfg.drop_missing: 150 | _logger.info(f'Dropping missing values') 151 | psdf = self.drop_missing_values(psdf) 152 | 153 | # Return as Spark DataFrame 154 | preproc_df = psdf.to_spark() 155 | 156 | return preproc_df 157 | -------------------------------------------------------------------------------- /telco_churn/feature_table_creator.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import pyspark.sql.dataframe 4 | 5 | from telco_churn import featurize 6 | from telco_churn.common import FeatureStoreTableConfig, LabelsTableConfig 7 | from telco_churn.featurize import FeaturizerConfig 8 | from telco_churn.utils import feature_store_utils 9 | from telco_churn.utils.get_spark import spark 10 | from telco_churn.utils.logger_utils import get_logger 11 | 12 | _logger = get_logger() 13 | 14 | 15 | @dataclass 16 | class FeatureTableCreatorConfig: 17 | """ 18 | Attributes: 19 | input_table (str): 20 | Name of the table to use as input for creating features 21 | featurizer_cfg (FeaturizerConfig): 22 | Featurization config to specify label_col, ohe, cat_cols and drop_missing params 23 | feature_store_table_cfg (FeatureStoreTableConfig): 24 | Feature Store table config to specify database_name, table_name, primary_keys and description 25 | labels_table_cfg (LabelsTableConfig): 26 | Labels table config to specify database_name, table_name, label_col and dbfs_path 27 | """ 28 | input_table: str 29 | featurizer_cfg: FeaturizerConfig 30 | feature_store_table_cfg: FeatureStoreTableConfig 31 | labels_table_cfg: LabelsTableConfig 32 | 33 | 34 | class FeatureTableCreator: 35 | """ 36 | Class to execute a pipeline to create a Feature Store table, and separate labels table 37 | """ 38 | def __init__(self, cfg: FeatureTableCreatorConfig): 39 | self.cfg = cfg 40 | 41 | @staticmethod 42 | def setup(database_name: str, table_name: str) -> None: 43 | """ 44 | Set up database to use. Create the database {database_name} if it doesn't exist, and drop the table {table_name} 45 | if it exists 46 | 47 | Parameters 48 | ---------- 49 | database_name : str 50 | Database to create if it doesn't exist. 
Otherwise, use the database of that name 51 | table_name : str 52 | Drop table if it already exists 53 | """ 54 | _logger.info(f'Creating database {database_name} if not exists') 55 | spark.sql(f'CREATE DATABASE IF NOT EXISTS {database_name};') 56 | spark.sql(f'USE {database_name};') 57 | spark.sql(f'DROP TABLE IF EXISTS {table_name};') 58 | 59 | def run_data_ingest(self) -> pyspark.sql.DataFrame: 60 | """ 61 | Run data ingest step 62 | 63 | Returns 64 | ------- 65 | pyspark.sql.DataFrame 66 | Input Spark DataFrame 67 | """ 68 | return spark.table(self.cfg.input_table) 69 | 70 | def run_data_prep(self, input_df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame: 71 | """ 72 | Run data preparation step, using Featurizer to run featurization logic to create features from the input 73 | DataFrame. 74 | 75 | Parameters 76 | ---------- 77 | input_df : pyspark.sql.DataFrame 78 | Input Spark DataFrame 79 | 80 | Returns 81 | ------- 82 | pyspark.sql.DataFrame 83 | Processed Spark DataFrame containing features 84 | """ 85 | featurizer = featurize.Featurizer(self.cfg.featurizer_cfg) 86 | proc_df = featurizer.run(input_df) 87 | 88 | return proc_df 89 | 90 | def run_feature_table_create(self, df: pyspark.sql.DataFrame) -> None: 91 | """ 92 | Method to create feature table in Databricks Feature Store. When run, this method will create the 93 | feature table from scratch. As such, we first create (if it doesn't exist) the database specified, and drop the table if it 94 | already exists. 95 | 96 | The feature table is created from the Spark DataFrame provided, dropping the label column if it exists in the 97 | DataFrame. The label column cannot be present in the feature table when later constructing a feature store 98 | training set from the feature table. The feature table will be created using the primary keys and description 99 | provided via feature_store_table_cfg. 100 | 101 | Parameters 102 | ---------- 103 | df : pyspark.sql.DataFrame 104 | Spark DataFrame from which to create the feature table. 105 | """ 106 | feature_store_table_cfg = self.cfg.feature_store_table_cfg 107 | 108 | # Create database if not exists, drop table if it already exists 109 | self.setup(database_name=feature_store_table_cfg.database_name, 110 | table_name=feature_store_table_cfg.table_name) 111 | 112 | # Store only features for each customerID, storing customerID, churn in separate churn_labels table 113 | # During model training, the churn_labels table is used to join features onto the labels 114 | features_df = df.drop(self.cfg.labels_table_cfg.label_col) 115 | feature_table_name = f'{feature_store_table_cfg.database_name}.{feature_store_table_cfg.table_name}' 116 | _logger.info(f'Creating and writing features to feature table: {feature_table_name}') 117 | feature_store_utils.create_and_write_feature_table(features_df, 118 | feature_table_name, 119 | primary_keys=feature_store_table_cfg.primary_keys, 120 | description=feature_store_table_cfg.description) 121 | 122 | def run_labels_table_create(self, df: pyspark.sql.DataFrame) -> None: 123 | """ 124 | Method to create labels table. This table will consist of the primary key column(s) and the label column 125 | 126 | Create the table using params specified in labels_table_cfg. The DataFrame is written as a Delta table at dbfs_path, and a 127 | table is then created against this Delta location.
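For example (illustrative), with primary key 'customerID' and label column 'churn', the resulting labels table consists of exactly those two columns.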
128 | 129 | Parameters 130 | ---------- 131 | df : pyspark.sql.DataFrame 132 | Spark DataFrame containing the primary key column(s) and label column 133 | """ 134 | feature_store_table_cfg = self.cfg.feature_store_table_cfg 135 | labels_table_cfg = self.cfg.labels_table_cfg 136 | 137 | if isinstance(feature_store_table_cfg.primary_keys, str): 138 | labels_table_cols = [feature_store_table_cfg.primary_keys, 139 | labels_table_cfg.label_col] 140 | elif isinstance(feature_store_table_cfg.primary_keys, list): 141 | labels_table_cols = feature_store_table_cfg.primary_keys + \ 142 | [labels_table_cfg.label_col] 143 | else: 144 | raise RuntimeError('Feature Store table primary keys must be of either str or list type') 145 | 146 | labels_database_name = labels_table_cfg.database_name 147 | labels_table_name = labels_table_cfg.table_name 148 | labels_dbfs_path = labels_table_cfg.dbfs_path 149 | # Create database if not exists, drop table if it already exists 150 | self.setup(database_name=labels_database_name, table_name=labels_table_name) 151 | # DataFrame of customerID/churn labels 152 | labels_df = df.select(labels_table_cols) 153 | _logger.info(f'Writing labels to DBFS: {labels_dbfs_path}') 154 | labels_df.write.format('delta').mode('overwrite').save(labels_dbfs_path) 155 | spark.sql(f"""CREATE TABLE {labels_database_name}.{labels_table_name} 156 | USING DELTA LOCATION '{labels_dbfs_path}';""") 157 | _logger.info(f'Created labels table: {labels_database_name}.{labels_table_name}') 158 | 159 | def run(self) -> None: 160 | """ 161 | Run feature table creation pipeline 162 | """ 163 | _logger.info('==========Data Ingest==========') 164 | input_df = self.run_data_ingest() 165 | 166 | _logger.info('==========Data Prep==========') 167 | proc_df = self.run_data_prep(input_df) 168 | 169 | _logger.info('==========Create Feature Table==========') 170 | self.run_feature_table_create(proc_df) 171 | 172 | _logger.info('==========Create Labels Table==========') 173 | self.run_labels_table_create(proc_df) 174 | -------------------------------------------------------------------------------- /conf/deployment.yml: -------------------------------------------------------------------------------- 1 | custom: 2 | 3 | # Cluster configs for each environment 4 | default-cluster-spec: &default-cluster-spec 5 | spark_version: '11.0.x-cpu-ml-scala2.12' 6 | node_type_id: 'i3.xlarge' # NOTE: this is an AWS-specific instance type. Change accordingly if running on Azure or GCP. 7 | driver_node_type_id: 'i3.xlarge' # NOTE: this is an AWS-specific instance type. Change accordingly if running on Azure or GCP. 8 | num_workers: 1 9 | # To reduce start-up time for each job, it is advisable to use a cluster pool. To do so, supply the following 10 | # two fields with a pool_id from which to acquire both the driver and worker instances. 11 | # If driver_instance_pool_id and instance_pool_id are set, both node_type_id and driver_node_type_id CANNOT be supplied.
12 | # As such, if providing a pool_id for driver and worker instances, please ensure that node_type_id and driver_node_type_id are not present 13 | # driver_instance_pool_id: '0617-151415-bells2-pool-hh7h6tjm' 14 | # instance_pool_id: '0617-151415-bells2-pool-hh7h6tjm' 15 | 16 | dev-cluster-config: &dev-cluster-config 17 | new_cluster: 18 | <<: *default-cluster-spec 19 | 20 | staging-cluster-config: &staging-cluster-config 21 | new_cluster: 22 | <<: *default-cluster-spec 23 | 24 | prod-cluster-config: &prod-cluster-config 25 | new_cluster: 26 | <<: *default-cluster-spec 27 | 28 | # Databricks Jobs definitions 29 | # please note that we're using FUSE reference for config, and env files, hence we're going to load this file using its local FS path 30 | environments: 31 | 32 | dev: 33 | strict_path_adjustment_policy: true 34 | jobs: 35 | - name: 'DEV-telco-churn-demo-setup' 36 | <<: *dev-cluster-config 37 | spark_python_task: 38 | python_file: 'file://telco_churn/pipelines/demo_setup_job.py' 39 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 40 | '--env', 'file:fuse://conf/dev/.dev.env', 41 | '--conf-file', 'file:fuse://conf/pipeline_configs/demo_setup.yml'] 42 | - name: 'DEV-telco-churn-feature-table-creation' 43 | <<: *dev-cluster-config 44 | spark_python_task: 45 | python_file: 'file://telco_churn/pipelines/feature_table_creator_job.py' 46 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 47 | '--env', 'file:fuse://conf/dev/.dev.env', 48 | '--conf-file', 'file:fuse://conf/pipeline_configs/feature_table_creator.yml'] 49 | - name: 'DEV-telco-churn-model-train' 50 | <<: 51 | - *dev-cluster-config 52 | spark_python_task: 53 | python_file: 'file://telco_churn/pipelines/model_train_job.py' 54 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 55 | '--env', 'file:fuse://conf/dev/.dev.env', 56 | '--conf-file', 'file:fuse://conf/pipeline_configs/model_train.yml'] 57 | - name: 'DEV-telco-churn-model-deployment' 58 | <<: 59 | - *dev-cluster-config 60 | spark_python_task: 61 | python_file: 'file://telco_churn/pipelines/model_deployment_job.py' 62 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 63 | '--env', 'file:fuse://conf/dev/.dev.env', 64 | '--conf-file', 'file:fuse://conf/pipeline_configs/model_deployment.yml'] 65 | - name: 'DEV-telco-churn-model-inference-batch' 66 | <<: 67 | - *dev-cluster-config 68 | spark_python_task: 69 | python_file: 'file://telco_churn/pipelines/model_inference_batch_job.py' 70 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 71 | '--env', 'file:fuse://conf/dev/.dev.env', 72 | '--conf-file', 'file:fuse://conf/pipeline_configs/model_inference_batch.yml'] 73 | - name: 'DEV-telco-churn-sample-integration-test' 74 | <<: 75 | - *dev-cluster-config 76 | spark_python_task: 77 | python_file: 'file://tests/integration/sample_test.py' 78 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 79 | '--env', 'file:fuse://conf/dev/.dev.env', 80 | '--conf-file', 'file:fuse://conf/pipeline_configs/sample_test.yml'] 81 | 82 | staging: 83 | strict_path_adjustment_policy: true 84 | jobs: 85 | - name: 'STAGING-telco-churn-sample-integration-test' 86 | <<: 87 | - *staging-cluster-config 88 | spark_python_task: 89 | python_file: 'file://tests/integration/sample_test.py' 90 | parameters: ['--env', 'file:fuse://conf/staging/.staging.env', 91 | '--conf-file', 'file:fuse://conf/pipeline_configs/sample_test.yml'] 92 | 93 | prod: 
94 | strict_path_adjustment_policy: true 95 | jobs: 96 | - name: 'PROD-telco-churn-demo-setup' 97 | <<: *prod-cluster-config 98 | spark_python_task: 99 | python_file: 'file://telco_churn/pipelines/demo_setup_job.py' 100 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 101 | '--env', 'file:fuse://conf/prod/.prod.env', 102 | '--conf-file', 'file:fuse://conf/pipeline_configs/demo_setup.yml'] 103 | - name: 'PROD-telco-churn-initial-model-train-register' 104 | tasks: 105 | - task_key: 'demo-setup' 106 | <<: 107 | - *prod-cluster-config 108 | spark_python_task: 109 | python_file: 'file://telco_churn/pipelines/demo_setup_job.py' 110 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 111 | '--env', 'file:fuse://conf/prod/.prod.env', 112 | '--conf-file', 'file:fuse://conf/pipeline_configs/demo_setup.yml'] 113 | - task_key: 'feature-table-creation' 114 | <<: *prod-cluster-config 115 | depends_on: 116 | - task_key: 'demo-setup' 117 | spark_python_task: 118 | python_file: 'file://telco_churn/pipelines/feature_table_creator_job.py' 119 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 120 | '--env', 'file:fuse://conf/prod/.prod.env', 121 | '--conf-file', 'file:fuse://conf/pipeline_configs/feature_table_creator.yml'] 122 | - task_key: 'model-train' 123 | <<: *prod-cluster-config 124 | depends_on: 125 | - task_key: 'demo-setup' 126 | - task_key: 'feature-table-creation' 127 | spark_python_task: 128 | python_file: 'file://telco_churn/pipelines/model_train_job.py' 129 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 130 | '--env', 'file:fuse://conf/prod/.prod.env', 131 | '--conf-file', 'file:fuse://conf/pipeline_configs/model_train.yml'] 132 | - name: 'PROD-telco-churn-model-train' 133 | <<: 134 | - *prod-cluster-config 135 | spark_python_task: 136 | python_file: 'file://telco_churn/pipelines/model_train_job.py' 137 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 138 | '--env', 'file:fuse://conf/prod/.prod.env', 139 | '--conf-file', 'file:fuse://conf/pipeline_configs/model_train.yml'] 140 | - name: 'PROD-telco-churn-model-deployment' 141 | <<: 142 | - *prod-cluster-config 143 | spark_python_task: 144 | python_file: 'file://telco_churn/pipelines/model_deployment_job.py' 145 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 146 | '--env', 'file:fuse://conf/prod/.prod.env', 147 | '--conf-file', 'file:fuse://conf/pipeline_configs/model_deployment.yml'] 148 | - name: 'PROD-telco-churn-model-inference-batch' 149 | <<: 150 | - *prod-cluster-config 151 | spark_python_task: 152 | python_file: 'file://telco_churn/pipelines/model_inference_batch_job.py' 153 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 154 | '--env', 'file:fuse://conf/prod/.prod.env', 155 | '--conf-file', 'file:fuse://conf/pipeline_configs/model_inference_batch.yml'] -------------------------------------------------------------------------------- /telco_churn/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module containing common data classes used throughout different pipelines, in addition to Workload class which is 3 | extended to run pipelines/tasks. 
4 | """ 5 | import os 6 | import sys 7 | from dataclasses import dataclass 8 | 9 | import yaml 10 | import pathlib 11 | import dotenv 12 | from abc import ABC, abstractmethod 13 | from argparse import ArgumentParser 14 | from logging import Logger 15 | from typing import Dict, Any, Union, List 16 | from pyspark.sql import SparkSession 17 | 18 | 19 | @dataclass 20 | class MLflowTrackingConfig: 21 | """ 22 | Configuration data class used to unpack MLflow parameters during a model training run. 23 | 24 | Attributes: 25 | run_name (str) 26 | Name of MLflow run 27 | experiment_id (int) 28 | ID of the MLflow experiment to be activated. If an experiment with this ID does not exist, raise an exception. 29 | experiment_path (str) 30 | Case sensitive name of the experiment to be activated. If an experiment with this name does not exist, 31 | a new experiment wth this name is created. 32 | model_name (str) 33 | Name of the registered model under which to create a new model version. If a registered model with the given 34 | name does not exist, it will be created automatically. 35 | """ 36 | run_name: str 37 | experiment_id: int = None 38 | experiment_path: str = None 39 | model_name: str = None 40 | 41 | 42 | @dataclass 43 | class FeatureStoreTableConfig: 44 | """ 45 | Configuration data class used to unpack parameters when creating or loading a Feature Store table. 46 | 47 | Attributes: 48 | database_name (str) 49 | Name of database to use for creating the feature table 50 | table_name (str) 51 | Name of feature table 52 | primary_keys (string or list) 53 | String or list of strings, of columns to use as the primary key(s). Use single column (customerID) as the 54 | primary key for the telco churn example. 55 | description (str) 56 | [Optional] string containing description to attribute to the feature table in the Feature Store. 57 | Only used when creating a Feature Store table. 58 | """ 59 | database_name: str 60 | table_name: str 61 | primary_keys: Union[str, List[str]] 62 | description: str = None 63 | 64 | 65 | @dataclass 66 | class LabelsTableConfig: 67 | """ 68 | Configuration data class used to unpack parameters when creating or loading labels table. 69 | 70 | Attributes: 71 | database_name (str) 72 | Name of database to use for creating the labels table 73 | table_name (str) 74 | Name of labels table within the database 75 | label_col (str) 76 | Name of column to use as the label column (in telco churn example we rename this column to 'churn') 77 | primary_keys (string or list) 78 | [Optional] String or list of strings, of columns to use as the primary key(s) 79 | dbfs_path (str) 80 | [Optional] DBFS path to use for the labels table (saving as a Delta table) 81 | """ 82 | database_name: str 83 | table_name: str 84 | label_col: str 85 | primary_keys: Union[str, List[str]] = None 86 | dbfs_path: str = None 87 | 88 | 89 | class Workload(ABC): 90 | """ 91 | This is an abstract class that provides handy interfaces to implement workloads (e.g. pipelines or job tasks). 92 | Create a child from this class and implement the abstract launch method. 
93 | Class provides access to the following useful objects: 94 | * self.spark is a SparkSession 95 | * self.dbutils provides access to the DBUtils 96 | * self.logger provides access to the Spark-compatible logger 97 | * self.conf provides access to the parsed configuration of the job 98 | * self.env_vars provides access to the parsed environment variables of the job 99 | """ 100 | def __init__(self, spark=None, init_conf=None): 101 | self.spark = self._prepare_spark(spark) 102 | self.logger = self._prepare_logger() 103 | self.dbutils = self.get_dbutils() 104 | if init_conf: 105 | self.conf = init_conf 106 | else: 107 | self.conf = self._provide_config() 108 | self._log_conf() 109 | self.env_vars = self.get_env_vars_as_dict() 110 | self._log_env_vars() 111 | 112 | @staticmethod 113 | def _prepare_spark(spark) -> SparkSession: 114 | if not spark: 115 | return SparkSession.builder.getOrCreate() 116 | else: 117 | return spark 118 | 119 | @staticmethod 120 | def _get_dbutils(spark: SparkSession): 121 | try: 122 | from pyspark.dbutils import DBUtils # noqa 123 | 124 | if 'dbutils' not in locals(): 125 | utils = DBUtils(spark) 126 | return utils 127 | else: 128 | return locals().get('dbutils') 129 | except ImportError: 130 | return None 131 | 132 | def get_dbutils(self): 133 | utils = self._get_dbutils(self.spark) 134 | 135 | if not utils: 136 | self.logger.warn('No DBUtils defined in the runtime') 137 | else: 138 | self.logger.info('DBUtils class initialized') 139 | 140 | return utils 141 | 142 | def _provide_config(self): 143 | self.logger.info('Reading configuration from --conf-file job option') 144 | conf_file = self._get_conf_file() 145 | if not conf_file: 146 | self.logger.info( 147 | 'No conf file was provided, setting configuration to empty dict.' 148 | 'Please override configuration in subclass init method' 149 | ) 150 | return {} 151 | else: 152 | self.logger.info(f'Conf file was provided, reading configuration from {conf_file}') 153 | return self._read_config(conf_file) 154 | 155 | @staticmethod 156 | def _get_conf_file(): 157 | p = ArgumentParser() 158 | p.add_argument('--conf-file', required=False, type=str) 159 | namespace = p.parse_known_args(sys.argv[1:])[0] 160 | return namespace.conf_file 161 | 162 | @staticmethod 163 | def _read_config(conf_file) -> Dict[str, Any]: 164 | config = yaml.safe_load(pathlib.Path(conf_file).read_text()) 165 | return config 166 | 167 | @staticmethod 168 | def _get_base_data_params(): 169 | p = ArgumentParser() 170 | p.add_argument('--base-data-params', required=False, type=str) 171 | namespace = p.parse_known_args(sys.argv[1:])[0] 172 | return namespace.base_data_params 173 | 174 | @staticmethod 175 | def _get_env(): 176 | p = ArgumentParser() 177 | p.add_argument('--env', required=False, type=str) 178 | namespace = p.parse_known_args(sys.argv[1:])[0] 179 | return namespace.env 180 | 181 | @staticmethod 182 | def _set_environ(env_vars): 183 | dotenv.load_dotenv(env_vars) 184 | 185 | def get_env_vars_as_dict(self): 186 | base_data_params = self._get_base_data_params() 187 | self._set_environ(base_data_params) 188 | 189 | env = self._get_env() 190 | self._set_environ(env) 191 | 192 | return dict(os.environ) 193 | 194 | def _prepare_logger(self) -> Logger: 195 | log4j_logger = self.spark._jvm.org.apache.log4j # noqa 196 | return log4j_logger.LogManager.getLogger(self.__class__.__name__) 197 | 198 | def _log_conf(self): 199 | # log parameters 200 | self.logger.info('Launching job with configuration parameters:') 201 | for key, item in self.conf.items(): 
202 | self.logger.info('\t Parameter: %-30s with value => %-30s' % (key, item)) 203 | 204 | def _log_env_vars(self): 205 | # log parameters 206 | self.logger.info('Using environment variables:') 207 | for key, item in self.env_vars.items(): 208 | self.logger.info('\t Parameter: %-30s with value => %-30s' % (key, item)) 209 | 210 | @abstractmethod 211 | def launch(self): 212 | """ 213 | Main method of the job. 214 | :return: 215 | """ 216 | pass 217 | 218 | 219 | def get_dbutils( 220 | spark: SparkSession, 221 | ): # please note that this function is used in mocking by its name 222 | try: 223 | from pyspark.dbutils import DBUtils # noqa 224 | 225 | if "dbutils" not in locals(): 226 | utils = DBUtils(spark) 227 | return utils 228 | else: 229 | return locals().get("dbutils") 230 | except ImportError: 231 | return None 232 | -------------------------------------------------------------------------------- /telco_churn/model_train.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List, Dict, Any 3 | import pprint 4 | 5 | import pandas as pd 6 | import sklearn 7 | from sklearn.model_selection import train_test_split 8 | import mlflow 9 | from mlflow.models import infer_signature 10 | 11 | import databricks 12 | from databricks.feature_store import FeatureStoreClient, FeatureLookup 13 | 14 | from telco_churn.common import MLflowTrackingConfig, FeatureStoreTableConfig, LabelsTableConfig 15 | from telco_churn.model_train_pipeline import ModelTrainPipeline 16 | from telco_churn.utils.get_spark import spark 17 | from telco_churn.utils.logger_utils import get_logger 18 | 19 | fs = FeatureStoreClient() 20 | _logger = get_logger() 21 | 22 | 23 | @dataclass 24 | class ModelTrainConfig: 25 | """ 26 | Configuration data class used to execute ModelTrain pipeline. 27 | 28 | Attributes: 29 | mlflow_tracking_cfg (MLflowTrackingConfig) 30 | Configuration data class used to unpack MLflow parameters during a model training run. 31 | feature_store_table_cfg (FeatureStoreTableConfig): 32 | Configuration data class used to unpack parameters when loading the Feature Store table. 33 | labels_table_cfg (LabelsTableConfig): 34 | Configuration data class used to unpack parameters when loading labels table. 35 | pipeline_params (dict): 36 | Params to use in preprocessing pipeline. Read from model_train.yml 37 | - test_size: Proportion of input data to hold out as the test set 38 | - random_state: Random state to enable reproducible train-test split 39 | model_params (dict): 40 | Dictionary of params for model. Read from model_train.yml 41 | conf (dict): 42 | [Optional] dictionary of conf file used to trigger pipeline. If provided, it will be tracked as a yml 43 | file in MLflow tracking. 44 | env_vars (dict): 45 | [Optional] dictionary of environment variables to trigger pipeline. If provided, it will be tracked as a yml 46 | file in MLflow tracking. 47 | """ 48 | mlflow_tracking_cfg: MLflowTrackingConfig 49 | feature_store_table_cfg: FeatureStoreTableConfig 50 | labels_table_cfg: LabelsTableConfig 51 | pipeline_params: Dict[str, Any] 52 | model_params: Dict[str, Any] 53 | conf: Dict[str, Any] = None 54 | env_vars: Dict[str, str] = None 55 | 56 | 57 | class ModelTrain: 58 | """ 59 | Class to execute model training. Params, metrics and model artifacts will be tracked to MLflow Tracking. 60 | Optionally, the resulting model will be registered to the MLflow Model Registry if model_name is provided.
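A minimal usage sketch (illustrative; experiment, model and feature table names are hypothetical, while the pipeline/model params mirror conf/pipeline_configs/model_train.yml):

    cfg = ModelTrainConfig(
        mlflow_tracking_cfg=MLflowTrackingConfig(run_name='random_forest_baseline',
                                                 experiment_path='/Shared/telco_churn_train',
                                                 model_name='telco_churn_model'),
        feature_store_table_cfg=FeatureStoreTableConfig(database_name='e2e_mlops_prod',
                                                        table_name='churn_features',
                                                        primary_keys='customerID'),
        labels_table_cfg=LabelsTableConfig(database_name='e2e_mlops_prod',
                                           table_name='churn_labels',
                                           label_col='churn'),
        pipeline_params={'test_size': 0.25, 'random_state': 42},
        model_params={'n_estimators': 100, 'max_depth': 4, 'min_samples_leaf': 1, 'random_state': 42})
    ModelTrain(cfg).run()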
61 | """ 62 | def __init__(self, cfg: ModelTrainConfig): 63 | self.cfg = cfg 64 | 65 | @staticmethod 66 | def _set_experiment(mlflow_tracking_cfg: MLflowTrackingConfig): 67 | """ 68 | Set MLflow experiment. Use one of either experiment_id or experiment_path 69 | """ 70 | if mlflow_tracking_cfg.experiment_id is not None: 71 | _logger.info(f'MLflow experiment_id: {mlflow_tracking_cfg.experiment_id}') 72 | mlflow.set_experiment(experiment_id=mlflow_tracking_cfg.experiment_id) 73 | elif mlflow_tracking_cfg.experiment_path is not None: 74 | _logger.info(f'MLflow experiment_path: {mlflow_tracking_cfg.experiment_path}') 75 | mlflow.set_experiment(experiment_name=mlflow_tracking_cfg.experiment_path) 76 | else: 77 | raise RuntimeError('MLflow experiment_id or experiment_path must be set in mlflow_params') 78 | 79 | def _get_feature_table_lookup(self) -> List[databricks.feature_store.entities.feature_lookup.FeatureLookup]: 80 | """ 81 | Create list of FeatureLookup for single feature store table. The FeatureLookup is a value class used to specify 82 | features to use in a TrainingSet. 83 | 84 | Returns 85 | ------- 86 | List[databricks.feature_store.entities.feature_lookup.FeatureLookup] 87 | """ 88 | feature_store_table_cfg = self.cfg.feature_store_table_cfg 89 | 90 | _logger.info('Creating feature lookups...') 91 | feature_table_name = f'{feature_store_table_cfg.database_name}.{feature_store_table_cfg.table_name}' 92 | feature_lookup = FeatureLookup(table_name=feature_table_name, 93 | lookup_key=feature_store_table_cfg.primary_keys) 94 | # Lookup for single feature table 95 | feature_table_lookup = [feature_lookup] 96 | 97 | return feature_table_lookup 98 | 99 | def get_fs_training_set(self) -> databricks.feature_store.training_set.TrainingSet: 100 | """ 101 | Create the Feature Store TrainingSet 102 | 103 | Returns 104 | ------- 105 | databricks.feature_store.training_set.TrainingSet 106 | """ 107 | feature_store_table_cfg = self.cfg.feature_store_table_cfg 108 | labels_table_cfg = self.cfg.labels_table_cfg 109 | labels_df = spark.table(f'{labels_table_cfg.database_name}.{labels_table_cfg.table_name}') 110 | 111 | feature_table_lookup = self._get_feature_table_lookup() 112 | _logger.info('Creating Feature Store training set...') 113 | return fs.create_training_set(df=labels_df, 114 | feature_lookups=feature_table_lookup, 115 | label=labels_table_cfg.label_col, 116 | exclude_columns=feature_store_table_cfg.primary_keys) 117 | 118 | def create_train_test_split(self, fs_training_set: databricks.feature_store.training_set.TrainingSet): 119 | """ 120 | Load the TrainingSet for training. The loaded DataFrame has columns specified by fs_training_set. 121 | Loaded Spark DataFrame is converted to pandas DataFrame and split into train/test splits. 
122 | 123 | Parameters 124 | ---------- 125 | fs_training_set : databricks.feature_store.training_set.TrainingSet 126 | Feature Store TrainingSet 127 | 128 | Returns 129 | ------- 130 | train-test splits 131 | """ 132 | labels_table_cfg = self.cfg.labels_table_cfg 133 | 134 | _logger.info('Load training set from Feature Store, converting to pandas DataFrame') 135 | training_set_pdf = fs_training_set.load_df().toPandas() 136 | 137 | X = training_set_pdf.drop(labels_table_cfg.label_col, axis=1) 138 | y = training_set_pdf[labels_table_cfg.label_col] 139 | 140 | _logger.info(f'Splitting into train/test splits - test_size: {self.cfg.pipeline_params["test_size"]}') 141 | X_train, X_test, y_train, y_test = train_test_split(X, y, 142 | random_state=self.cfg.pipeline_params['random_state'], 143 | test_size=self.cfg.pipeline_params['test_size'], 144 | stratify=y) 145 | 146 | return X_train, X_test, y_train, y_test 147 | 148 | def fit_pipeline(self, X_train: pd.DataFrame, y_train: pd.Series) -> sklearn.pipeline.Pipeline: 149 | """ 150 | Create sklearn pipeline and fit pipeline. 151 | 152 | Parameters 153 | ---------- 154 | X_train : pd.DataFrame 155 | Training data 156 | 157 | y_train : pd.Series 158 | Training labels 159 | 160 | Returns 161 | ------- 162 | scikit-learn pipeline with fitted steps. 163 | """ 164 | _logger.info('Creating sklearn pipeline...') 165 | pipeline = ModelTrainPipeline.create_train_pipeline(self.cfg.model_params) 166 | 167 | _logger.info('Fitting sklearn RandomForestClassifier...') 168 | _logger.info(f'Model params: {pprint.pformat(self.cfg.model_params)}') 169 | model = pipeline.fit(X_train, y_train) 170 | 171 | return model 172 | 173 | def run(self): 174 | """ 175 | Method to trigger model training, and tracking to MLflow. 176 | 177 | Steps: 178 | 1. Set MLflow experiment (creating a new experiment if it does not already exist) 179 | 2. Start MLflow run 180 | 3. Create Databricks Feature Store training set 181 | 4. Create train-test splits to be used to train and evaluate the model 182 | 5. Define sklearn pipeline using ModelTrainPipeline, and fit on train data 183 | 6. Log trained model using the Databricks Feature Store API. Model will be logged to MLflow with associated 184 | feature table metadata. 185 | 7. 
Register the model to MLflow model registry if model_name is provided in mlflow_params 186 | """ 187 | _logger.info('==========Running model training==========') 188 | mlflow_tracking_cfg = self.cfg.mlflow_tracking_cfg 189 | 190 | _logger.info('==========Setting MLflow experiment==========') 191 | self._set_experiment(mlflow_tracking_cfg) 192 | # Enable automatic logging of input samples, metrics, parameters, and models 193 | mlflow.sklearn.autolog(log_input_examples=True, silent=True) 194 | 195 | _logger.info('==========Starting MLflow run==========') 196 | with mlflow.start_run(run_name=mlflow_tracking_cfg.run_name) as mlflow_run: 197 | 198 | if self.cfg.conf is not None: 199 | # Log config file 200 | mlflow.log_dict(self.cfg.conf, 'conf.yml') 201 | if self.cfg.env_vars is not None: 202 | # Log config file 203 | mlflow.log_dict(self.cfg.env_vars, 'env_vars.yml') 204 | 205 | # Create Feature Store Training Set 206 | _logger.info('==========Creating Feature Store training set==========') 207 | fs_training_set = self.get_fs_training_set() 208 | 209 | # Load and preprocess data into train/test splits 210 | _logger.info('==========Creating train/test splits==========') 211 | X_train, X_test, y_train, y_test = self.create_train_test_split(fs_training_set) 212 | 213 | # Fit pipeline with RandomForestClassifier 214 | _logger.info('==========Fitting RandomForestClassifier model==========') 215 | model = self.fit_pipeline(X_train, y_train) 216 | 217 | # Log model using Feature Store API 218 | _logger.info('Logging model to MLflow using Feature Store API') 219 | fs.log_model( 220 | model, 221 | 'fs_model', 222 | flavor=mlflow.sklearn, 223 | training_set=fs_training_set, 224 | input_example=X_train[:100], 225 | signature=infer_signature(X_train, y_train)) 226 | 227 | # Training metrics are logged by MLflow autologging 228 | # Log metrics for the test set 229 | _logger.info('==========Model Evaluation==========') 230 | _logger.info('Evaluating and logging metrics') 231 | test_metrics = mlflow.sklearn.eval_and_log_metrics(model, X_test, y_test, prefix='test_') 232 | print(pd.DataFrame(test_metrics, index=[0])) 233 | 234 | # Register model to MLflow Model Registry if provided 235 | if mlflow_tracking_cfg.model_name is not None: 236 | _logger.info('==========MLflow Model Registry==========') 237 | _logger.info(f'Registering model: {mlflow_tracking_cfg.model_name}') 238 | mlflow.register_model(f'runs:/{mlflow_run.info.run_id}/fs_model', 239 | name=mlflow_tracking_cfg.model_name) 240 | 241 | _logger.info('==========Model training completed==========') 242 | -------------------------------------------------------------------------------- /telco_churn/pipelines/demo_setup_job.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import mlflow 4 | from mlflow.tracking import MlflowClient 5 | from mlflow.exceptions import RestException 6 | 7 | from telco_churn.common import Workload 8 | from telco_churn.utils.logger_utils import get_logger 9 | 10 | from databricks.feature_store.client import FeatureStoreClient 11 | 12 | client = MlflowClient() 13 | fs = FeatureStoreClient() 14 | _logger = get_logger() 15 | 16 | 17 | class DemoSetup(Workload): 18 | 19 | def _get_train_experiment_id(self): 20 | try: 21 | return self.env_vars['model_train_experiment_id'] 22 | except KeyError: 23 | return None 24 | 25 | def _get_train_experiment_path(self): 26 | try: 27 | return self.env_vars['model_train_experiment_path'] 28 | except KeyError: 29 | return None 30 | 31 | def 
_get_deploy_experiment_id(self): 32 | try: 33 | return self.env_vars['model_deploy_experiment_id'] 34 | except KeyError: 35 | return None 36 | 37 | def _get_deploy_experiment_path(self): 38 | try: 39 | return self.env_vars['model_deploy_experiment_path'] 40 | except KeyError: 41 | return None 42 | 43 | @staticmethod 44 | def _check_mlflow_model_registry_exists(model_name) -> bool: 45 | """ 46 | Check if model exists in MLflow Model Registry. 47 | Returns True if model exists in Model Registry, False if not 48 | """ 49 | try: 50 | client.get_registered_model(name=model_name) 51 | _logger.info(f'MLflow Model Registry name: {model_name} exists') 52 | return True 53 | except RestException: 54 | _logger.info(f'MLflow Model Registry name: {model_name} DOES NOT exist') 55 | return False 56 | 57 | @staticmethod 58 | def _archive_registered_models(model_name): 59 | """ 60 | Archive any model versions which are not already under stage='Archived' 61 | """ 62 | registered_model = client.get_registered_model(name=model_name) 63 | latest_versions_list = registered_model.latest_versions 64 | 65 | _logger.info(f'MLflow Model Registry name: {model_name}') 66 | for model_version in latest_versions_list: 67 | if model_version.current_stage != 'Archived': 68 | _logger.info(f'Archiving model version: {model_version.version}') 69 | client.transition_model_version_stage( 70 | name=model_name, 71 | version=model_version.version, 72 | stage='Archived' 73 | ) 74 | 75 | def _delete_registered_model(self, model_name): 76 | """ 77 | Archive any active model versions, then delete the registered model from the Model Registry. 78 | """ 79 | self._archive_registered_models(model_name) 80 | client.delete_registered_model(name=model_name) 81 | _logger.info(f'Deleted MLflow Model Registry model: {model_name}') 82 | 83 | def _check_mlflow_experiments_exists(self) -> dict: 84 | """ 85 | The demo workflow consists of creating 2 MLflow Tracking experiments: 86 | * train_experiment - Experiment used to track params, metrics, artifacts during model training 87 | * deploy_experiment - Experiment used to track metrics when comparing models during the model deployment step 88 | 89 | This method checks the demo_setup config dict for either the experiment_id or experiment_path for both 90 | experiments.
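For example, a return value of {'train_exp_exists': True, 'deploy_exp_exists': False} indicates that only the training experiment currently exists.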
91 | 92 | A dictionary containing the keys train_exp_exists and deploy_exp_exists along with boolean values is returned 93 | 94 | Returns 95 | ------- 96 | Dictionary indicating whether train and deploy MLflow experiments currently exist 97 | """ 98 | train_experiment_id = self._get_train_experiment_id() 99 | train_experiment_path = self._get_train_experiment_path() 100 | deploy_experiment_id = self._get_deploy_experiment_id() 101 | deploy_experiment_path = self._get_deploy_experiment_path() 102 | 103 | def check_by_experiment_id(experiment_id): 104 | try: 105 | mlflow.get_experiment(experiment_id=experiment_id) 106 | _logger.info(f'MLflow Tracking experiment_id: {experiment_id} exists') 107 | return True 108 | except RestException: 109 | _logger.info(f'MLflow Tracking experiment_id: {experiment_id} DOES NOT exist') 110 | return False 111 | 112 | def check_by_experiment_path(experiment_path): 113 | experiment = mlflow.get_experiment_by_name(name=experiment_path) 114 | if experiment is not None: 115 | _logger.info(f'MLflow Tracking experiment_path: {experiment_path} exists') 116 | return True 117 | else: 118 | _logger.info(f'MLflow Tracking experiment_path: {experiment_path} DOES NOT exist') 119 | return False 120 | 121 | if train_experiment_id is not None: 122 | train_exp_exists = check_by_experiment_id(train_experiment_id) 123 | elif train_experiment_path is not None: 124 | train_exp_exists = check_by_experiment_path(train_experiment_path) 125 | else: 126 | raise RuntimeError('Either model_train_experiment_id or model_train_experiment_path should be passed in ' 127 | 'deployment.yml') 128 | 129 | if deploy_experiment_id is not None: 130 | deploy_exp_exists = check_by_experiment_id(deploy_experiment_id) 131 | elif deploy_experiment_path is not None: 132 | deploy_exp_exists = check_by_experiment_path(deploy_experiment_path) 133 | else: 134 | raise RuntimeError('Either model_deploy_experiment_id or model_deploy_experiment_path should be passed in ' 135 | 'deployment.yml') 136 | 137 | return {'train_exp_exists': train_exp_exists, 138 | 'deploy_exp_exists': deploy_exp_exists} 139 | 140 | def _delete_mlflow_experiments(self, exp_exists_dict: dict): 141 | """ 142 | Check exp_exists_dict for whether train_exp_exists or deploy_exp_exists is True.
Delete experiments if they exist 143 | 144 | Parameters 145 | ---------- 146 | exp_exists_dict : dict 147 | A dictionary containing the keys train_exp_exists and deploy_exp_exists along with boolean values 148 | """ 149 | delete_experiments = [exp for exp, exists in exp_exists_dict.items() if exists] 150 | if len(delete_experiments) == 0: 151 | _logger.info('No existing experiments to delete') 152 | if 'train_exp_exists' in delete_experiments: 153 | if self.env_vars.get('model_train_experiment_path') is not None: 154 | experiment = mlflow.get_experiment_by_name(name=self.env_vars['model_train_experiment_path']) 155 | mlflow.delete_experiment(experiment_id=experiment.experiment_id) 156 | _logger.info(f'Deleted existing experiment_path: {self.env_vars["model_train_experiment_path"]}') 157 | elif self.env_vars.get('model_train_experiment_id') is not None: 158 | mlflow.delete_experiment(experiment_id=self.env_vars['model_train_experiment_id']) 159 | _logger.info(f'Deleted existing experiment_id: {self.env_vars["model_train_experiment_id"]}') 160 | else: 161 | raise RuntimeError('Either model_train_experiment_id or model_train_experiment_path should be passed ' 162 | 'in deployment.yml') 163 | 164 | if 'deploy_exp_exists' in delete_experiments: 165 | if self.env_vars.get('model_deploy_experiment_path') is not None: 166 | experiment = mlflow.get_experiment_by_name(name=self.env_vars['model_deploy_experiment_path']) 167 | mlflow.delete_experiment(experiment_id=experiment.experiment_id) 168 | _logger.info( 169 | f'Deleted existing experiment_path: {self.env_vars["model_deploy_experiment_path"]}') 170 | elif self.env_vars.get('model_deploy_experiment_id') is not None: 171 | mlflow.delete_experiment(experiment_id=self.env_vars['model_deploy_experiment_id']) 172 | _logger.info(f'Deleted existing experiment_id: {self.env_vars["model_deploy_experiment_id"]}') 173 | 174 | @staticmethod 175 | def _check_feature_table_exists(feature_store_table) -> bool: 176 | """ 177 | Check if Feature Store feature table exists 178 | Returns True if feature table exists in Feature Store, False if not 179 | """ 180 | try: 181 | fs.get_table(name=feature_store_table) 182 | _logger.info(f'Feature Store feature table: {feature_store_table} exists') 183 | return True 184 | except Exception: 185 | _logger.info(f'Feature Store feature table: {feature_store_table} DOES NOT exist') 186 | return False 187 | 188 | @staticmethod 189 | def _drop_feature_table(feature_store_table): 190 | """ 191 | Delete Feature Store feature table 192 | """ 193 | try: 194 | fs.drop_table( 195 | name=feature_store_table 196 | ) 197 | _logger.info(f'Deleted Feature Store feature table: {feature_store_table}') 198 | except ValueError: 199 | _logger.info(f'Feature Store feature table: {feature_store_table} does not exist') 200 | 201 | def _check_labels_delta_table_exists(self, labels_table_dbfs_path) -> bool: 202 | """ 203 | Check if Delta table exists in DBFS 204 | 205 | Parameters 206 | ---------- 207 | labels_table_dbfs_path : str 208 | Path to Delta table in DBFS 209 | 210 | Returns 211 | ------- 212 | bool 213 | """ 214 | try: 215 | self.dbutils.fs.ls(labels_table_dbfs_path) 216 | _logger.info(f'Labels Delta table: {labels_table_dbfs_path} exists') 217 | return True 218 | except Exception: 219 | _logger.info(f'Labels Delta table: {labels_table_dbfs_path} DOES NOT exist') 220 | return False 221 | 222 | def _delete_labels_delta_table(self, labels_table_dbfs_path): 223 | self.dbutils.fs.rm(labels_table_dbfs_path, True) 224 | _logger.info(f'Deleted labels
Delta table: {labels_table_dbfs_path}') 225 | 226 | def setup(self): 227 | """ 228 | Demo setup steps: 229 | * Delete the Model Registry model if it exists (archiving any active model versions) 230 | * Delete MLflow experiments if they exist 231 | * Drop the feature table and labels table if they exist 232 | """ 233 | _logger.info('==========Demo Setup=========') 234 | _logger.info(f'Running demo-setup pipeline in {self.env_vars["env"]} environment') 235 | 236 | if self.conf['delete_model_registry']: 237 | _logger.info('Checking MLflow Model Registry...') 238 | model_name = self.env_vars['model_name'] 239 | if self._check_mlflow_model_registry_exists(model_name): 240 | self._delete_registered_model(model_name) 241 | 242 | if self.conf['delete_mlflow_experiments']: 243 | _logger.info('Checking MLflow Tracking...') 244 | exp_exists_dict = self._check_mlflow_experiments_exists() 245 | self._delete_mlflow_experiments(exp_exists_dict) 246 | 247 | if self.conf['drop_feature_table']: 248 | _logger.info('Checking Feature Store...') 249 | feature_store_database_name = self.env_vars['feature_store_database_name'] 250 | feature_store_table_name = self.env_vars['feature_store_table_name'] 251 | feature_store_table = f'{feature_store_database_name}.{feature_store_table_name}' 252 | if self._check_feature_table_exists(feature_store_table=feature_store_table): 253 | self._drop_feature_table(feature_store_table=feature_store_table) 254 | 255 | if self.conf['drop_labels_table']: 256 | _logger.info('Checking existing labels table...') 257 | labels_table_dbfs_path = self.env_vars['labels_table_dbfs_path'] 258 | if self._check_labels_delta_table_exists(labels_table_dbfs_path=labels_table_dbfs_path): 259 | self._delete_labels_delta_table(labels_table_dbfs_path=labels_table_dbfs_path) 260 | 261 | _logger.info('==========Demo Setup Complete=========') 262 | 263 | def launch(self) -> None: 264 | """ 265 | Launch DemoSetup job 266 | """ 267 | _logger.info('Launching DemoSetup job') 268 | self.setup() 269 | _logger.info('DemoSetup job finished!') 270 | 271 | 272 | if __name__ == '__main__': 273 | job = DemoSetup() 274 | job.launch() 275 | -------------------------------------------------------------------------------- /telco_churn/model_deployment.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import mlflow 4 | import pandas as pd 5 | import pyspark.sql 6 | from mlflow.tracking import MlflowClient 7 | 8 | from telco_churn.common import MLflowTrackingConfig 9 | from telco_churn.model_inference import ModelInference 10 | from telco_churn.utils.evaluation_utils import ModelEvaluation 11 | from telco_churn.utils.logger_utils import get_logger 12 | 13 | _logger = get_logger() 14 | 15 | 16 | @dataclass 17 | class ModelDeploymentConfig: 18 | """ 19 | Attributes: 20 | mlflow_tracking_cfg (MLflowTrackingConfig) 21 | Configuration data class used to unpack MLflow parameters during a model deployment run. 22 | reference_data (str): Name of table to use as a reference DataFrame to score the loaded model against.
23 | Must contain column(s) for lookup keys to join feature data from Feature Store 24 | label_col (str): Name of label column in input data 25 | comparison_metric (str): Name of evaluation metric to use when comparing models 26 | higher_is_better (bool): Boolean indicating whether a higher value for the evaluation metric equates to better 27 | model performance 28 | """ 29 | mlflow_tracking_cfg: MLflowTrackingConfig 30 | reference_data: str 31 | label_col: str = 'churn' 32 | comparison_metric: str = 'roc_auc_score' 33 | higher_is_better: bool = True 34 | 35 | 36 | class ModelDeployment: 37 | """ 38 | Class to execute model deployment. This class orchestrates the comparison of the current Production model versus 39 | the Staging model. The Production model is the most recent model version registered in the MLflow Model 40 | Registry under the provided model_name with stage="Production". Likewise for Staging. 41 | 42 | Execution will involve loading the models and performing batch inference for a specified reference dataset. 43 | The two models will be compared using the specified comparison_metric. 44 | higher_is_better indicates whether a higher value for the evaluation metric equates to a better performing model. 45 | Depending on this comparison, the candidate Staging model will either be promoted to Production (and the current 46 | Production model archived) if it performs better, or archived if it does not perform 47 | better than the current Production model. 48 | 49 | Metrics computed when comparing the two models will be logged to MLflow, under the provided experiment_id or 50 | experiment_path. 51 | """ 52 | def __init__(self, cfg: ModelDeploymentConfig): 53 | self.cfg = cfg 54 | 55 | @staticmethod 56 | def _set_experiment(mlflow_tracking_cfg: MLflowTrackingConfig): 57 | """ 58 | Set MLflow experiment. Use one of either experiment_id or experiment_path 59 | """ 60 | if mlflow_tracking_cfg.experiment_id is not None: 61 | _logger.info(f'MLflow experiment_id: {mlflow_tracking_cfg.experiment_id}') 62 | mlflow.set_experiment(experiment_id=mlflow_tracking_cfg.experiment_id) 63 | elif mlflow_tracking_cfg.experiment_path is not None: 64 | _logger.info(f'MLflow experiment_path: {mlflow_tracking_cfg.experiment_path}') 65 | mlflow.set_experiment(experiment_name=mlflow_tracking_cfg.experiment_path) 66 | else: 67 | raise RuntimeError('MLflow experiment_id or experiment_path must be set in MLflowTrackingConfig') 68 | 69 | def _get_model_uri_by_stage(self, stage: str): 70 | return f'models:/{self.cfg.mlflow_tracking_cfg.model_name}/{stage}' 71 | 72 | def _batch_inference_by_stage(self, stage: str) -> pyspark.sql.DataFrame: 73 | """ 74 | Load a model from the specified MLflow Model Registry stage and compute batch inference. 75 | Inference is computed on the specified reference data. The model will use this reference data to look up feature 76 | values for primary keys, and use the loaded features as input for model scoring. 77 | The most recent model under the specified stage will be loaded. The registered model must have been logged to 78 | MLflow using the Feature Store API.
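For example, stage='staging' resolves to the model URI models:/<model_name>/staging, from which the latest Staging version is loaded.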
79 | 80 | Parameters 81 | ---------- 82 | stage : str 83 | MLflow Model Registry stage 84 | 85 | Returns 86 | ------- 87 | Spark DataFrame containing primary keys of the reference data, the loaded features from the feature store and 88 | a prediction column from model scoring 89 | """ 90 | model_uri = self._get_model_uri_by_stage(stage=stage) 91 | _logger.info(f'Computing batch inference using: {model_uri}') 92 | _logger.info(f'Reference data: {self.cfg.reference_data}') 93 | model_inference = ModelInference(model_uri=model_uri, 94 | input_table_name=self.cfg.reference_data) 95 | 96 | return model_inference.run_batch() 97 | 98 | @staticmethod 99 | def _get_evaluation_metric(y_true: pd.Series, y_score: pd.Series, metric: str, stage: str) -> float: 100 | """ 101 | Trigger evaluation and return the specified evaluation metric. A dictionary of evaluation metrics will be tracked to 102 | MLflow Tracking. 103 | 104 | Parameters 105 | ---------- 106 | y_true : array-like of shape (n_samples,) or (n_samples, n_classes) 107 | True labels or binary label indicators 108 | y_score : array-like of shape (n_samples,) or (n_samples, n_classes) 109 | Target scores. 110 | metric : str 111 | Name of metric to retrieve from evaluation dictionary 112 | stage : str 113 | Name of the corresponding MLflow Model Registry stage. Used as a prefix when logging metrics 114 | 115 | Returns 116 | ------- 117 | Evaluation metric 118 | """ 119 | metric_prefix = stage + "_" 120 | eval_dict = ModelEvaluation().evaluate(y_true, y_score, metric_prefix=metric_prefix) 121 | mlflow.log_metrics(eval_dict) 122 | eval_metric = eval_dict[metric_prefix + metric] 123 | 124 | return eval_metric 125 | 126 | def _run_promotion_logic(self, staging_eval_metric: float, production_eval_metric: float): 127 | """ 128 | Basic logic to either promote a candidate Staging model that performs better than the current Production model, 129 | or archive the Staging model if it does not outperform the current Production model.
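For example (illustrative numbers), with comparison_metric='roc_auc_score' and higher_is_better=True, a Staging score of 0.85 against a Production score of 0.80 promotes the Staging model and archives the existing Production model; a Staging score of 0.78 archives the Staging model instead.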
130 | 
131 |         Parameters
132 |         ----------
133 |         staging_eval_metric : float
134 |             Evaluation metric computed using Staging model
135 |         production_eval_metric : float
136 |             Evaluation metric computed using Production model
137 |         """
138 |         client = MlflowClient()
139 |         model_name = self.cfg.mlflow_tracking_cfg.model_name
140 |         staging_model_version = client.get_latest_versions(name=model_name, stages=['staging'])[0]
141 | 
142 |         _logger.info(f'metric={self.cfg.comparison_metric}')
143 |         _logger.info(f'higher_is_better={self.cfg.higher_is_better}')
144 |         if self.cfg.higher_is_better:
145 |             if staging_eval_metric <= production_eval_metric:
146 |                 _logger.info('Candidate Staging model DOES NOT perform better than current Production model')
147 |                 _logger.info('Transition candidate model from stage="staging" to stage="archived"')
148 |                 client.transition_model_version_stage(name=model_name,
149 |                                                       version=staging_model_version.version,
150 |                                                       stage='archived')
151 | 
152 |             elif staging_eval_metric > production_eval_metric:
153 |                 _logger.info('Candidate Staging model DOES perform better than current Production model')
154 |                 _logger.info('Transition candidate model from stage="staging" to stage="production"')
155 |                 _logger.info('Existing Production model will be archived')
156 |                 client.transition_model_version_stage(name=model_name,
157 |                                                       version=staging_model_version.version,
158 |                                                       stage='production',
159 |                                                       archive_existing_versions=True)
160 | 
161 |         else:
162 |             if staging_eval_metric >= production_eval_metric:
163 |                 _logger.info('Candidate Staging model DOES NOT perform better than current Production model')
164 |                 _logger.info('Transition candidate model from stage="staging" to stage="archived"')
165 |                 client.transition_model_version_stage(name=model_name,
166 |                                                       version=staging_model_version.version,
167 |                                                       stage='archived')
168 | 
169 |             elif staging_eval_metric < production_eval_metric:
170 |                 _logger.info('Candidate Staging model DOES perform better than current Production model')
171 |                 _logger.info('Transition candidate model from stage="staging" to stage="production"')
172 |                 _logger.info('Existing Production model will be archived')
173 |                 client.transition_model_version_stage(name=model_name,
174 |                                                       version=staging_model_version.version,
175 |                                                       stage='production',
176 |                                                       archive_existing_versions=True)
177 | 
178 |     def run(self):
179 |         """
180 |         Runner method to orchestrate model comparison and potential model promotion.
181 | 
182 |         Steps:
183 |             1. Set MLflow Tracking experiment. Used to track metrics computed when comparing Staging versus Production
184 |                models.
185 |             2. Load Staging and Production models and score against reference dataset provided. The reference data
186 |                specified must currently be a table.
187 |             3. Compute evaluation metric for both Staging and Production model predictions against reference data
188 |             4. If higher_is_better=True, the Staging model will be promoted in place of the Production model iff the
189 |                Staging model evaluation metric is higher than the Production model evaluation metric.
190 |                If higher_is_better=False, the Staging model will be promoted in place of the Production model iff the
191 |                Staging model evaluation metric is lower than the Production model evaluation metric.
192 | 
193 |         """
194 |         _logger.info('==========Running model deployment==========')
195 | 
196 |         _logger.info('==========Setting MLflow experiment==========')
197 |         mlflow_tracking_cfg = self.cfg.mlflow_tracking_cfg
198 |         self._set_experiment(mlflow_tracking_cfg)
199 | 
200 |         with mlflow.start_run(run_name=mlflow_tracking_cfg.run_name):
201 | 
202 |             _logger.info('==========Batch inference: staging model==========')
203 |             staging_inference_pred_df = self._batch_inference_by_stage(stage='staging')
204 |             staging_inference_pred_pdf = staging_inference_pred_df.toPandas()
205 |             _logger.info('==========Batch inference: production model==========')
206 |             prod_inference_pred_df = self._batch_inference_by_stage(stage='production')
207 |             prod_inference_pred_pdf = prod_inference_pred_df.toPandas()
208 | 
209 |             _logger.info('==========Model evaluation: staging model==========')
210 |             staging_eval_metric = self._get_evaluation_metric(y_true=staging_inference_pred_pdf[self.cfg.label_col],
211 |                                                               y_score=staging_inference_pred_pdf['prediction'],
212 |                                                               metric=self.cfg.comparison_metric,
213 |                                                               stage='staging')
214 |             _logger.info(f'Candidate Staging model (stage="staging") {self.cfg.comparison_metric}: {staging_eval_metric}')
215 | 
216 |             _logger.info('==========Model evaluation: production model==========')
217 |             production_eval_metric = self._get_evaluation_metric(y_true=prod_inference_pred_pdf[self.cfg.label_col],
218 |                                                                  y_score=prod_inference_pred_pdf['prediction'],
219 |                                                                  metric=self.cfg.comparison_metric,
220 |                                                                  stage='production')
221 |             _logger.info(
222 |                 f'Current Production model (stage="production") {self.cfg.comparison_metric}: {production_eval_metric}')
223 | 
224 |             _logger.info('==========Model comparison: candidate staging model vs current production model==========')
225 |             self._run_promotion_logic(staging_eval_metric, production_eval_metric)
226 | 
227 |         _logger.info('==========Model deployment completed==========')
228 | 
--------------------------------------------------------------------------------
/notebooks/demo_setup.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # MAGIC %md
3 | # MAGIC
4 | # MAGIC # `demo_setup`
5 | # MAGIC
6 | # MAGIC Pipeline to ensure that we can run the demo from a clean setup. Executing `DemoSetup.run()` will perform the following steps:
7 | # MAGIC
8 | # MAGIC - Delete Model Registry model if it exists (archiving any existing model versions)
9 | # MAGIC - Delete MLflow experiments if they exist
10 | # MAGIC - Delete Feature Table if it exists
11 | 
12 | # COMMAND ----------
13 | 
14 | # DBTITLE 1,pip install requirements.txt
15 | # MAGIC %pip install -r ../requirements.txt
16 | 
17 | # COMMAND ----------
18 | 
19 | # DBTITLE 1,Set env
20 | dbutils.widgets.dropdown('env', 'dev', ['dev', 'staging', 'prod'], 'Environment Name')
21 | 
22 | # COMMAND ----------
23 | 
24 | # DBTITLE 1,Module Imports
25 | import mlflow
26 | from mlflow.tracking import MlflowClient
27 | from mlflow.exceptions import RestException
28 | 
29 | from telco_churn.utils.notebook_utils import load_and_set_env_vars, load_config
30 | from telco_churn.utils.logger_utils import get_logger
31 | 
32 | from databricks.feature_store.client import FeatureStoreClient
33 | 
34 | client = MlflowClient()
35 | fs = FeatureStoreClient()
36 | _logger = get_logger()
37 | 
38 | # COMMAND ----------
39 | 
40 | # DBTITLE 1,Load pipeline config params
41 | # Set pipeline name
42 | pipeline_name = 'demo_setup'
43 | 
44 | # Load pipeline config yaml file (../conf/pipeline_configs/{pipeline_name}.yml)
45 | pipeline_config = load_config(pipeline_name)
46 | 
47 | # Load and set arbitrary params via spark_env_vars
48 | # Params passed via ../conf/{env}/.{env}.env and ../conf/.base_data_params
49 | env_vars = load_and_set_env_vars(env=dbutils.widgets.get('env'))
50 | 
51 | # COMMAND ----------
52 | 
53 | # DBTITLE 1,Pipeline Class
54 | class DemoSetup:
55 | 
56 |     def __init__(self, conf: dict, env_vars: dict):
57 | 
58 |         self.conf = conf
59 |         self.env_vars = env_vars
60 | 
61 |     def _get_train_experiment_id(self):
62 |         try:
63 |             return self.env_vars['model_train_experiment_id']
64 |         except KeyError:
65 |             return None
66 | 
67 |     def _get_train_experiment_path(self):
68 |         try:
69 |             return self.env_vars['model_train_experiment_path']
70 |         except KeyError:
71 |             return None
72 | 
73 |     def _get_deploy_experiment_id(self):
74 |         try:
75 |             return self.env_vars['model_deploy_experiment_id']
76 |         except KeyError:
77 |             return None
78 | 
79 |     def _get_deploy_experiment_path(self):
80 |         try:
81 |             return self.env_vars['model_deploy_experiment_path']
82 |         except KeyError:
83 |             return None
84 | 
85 |     @staticmethod
86 |     def _check_mlflow_model_registry_exists(model_name) -> bool:
87 |         """
88 |         Check if model exists in MLflow Model Registry.
89 |         Returns True if model exists in Model Registry, False if not
90 |         """
91 |         try:
92 |             client.get_registered_model(name=model_name)
93 |             _logger.info(f'MLflow Model Registry name: {model_name} exists')
94 |             return True
95 |         except RestException:
96 |             _logger.info(f'MLflow Model Registry name: {model_name} DOES NOT exist')
97 |             return False
98 | 
99 |     @staticmethod
100 |     def _archive_registered_models(model_name):
101 |         """
102 |         Archive any model versions which are not already under stage='Archived'
103 |         """
104 |         registered_model = client.get_registered_model(name=model_name)
105 |         latest_versions_list = registered_model.latest_versions
106 | 
107 |         _logger.info(f'MLflow Model Registry name: {model_name}')
108 |         for model_version in latest_versions_list:
109 |             if model_version.current_stage != 'Archived':
110 |                 _logger.info(f'Archiving model version: {model_version.version}')
111 |                 client.transition_model_version_stage(
112 |                     name=model_name,
113 |                     version=model_version.version,
114 |                     stage='Archived'
115 |                 )
116 | 
117 |     def _delete_registered_model(self, model_name):
118 |         """
119 |         Archive all latest model versions, then delete the registered model from the MLflow Model Registry.
120 |         """
121 |         self._archive_registered_models(model_name)
122 |         client.delete_registered_model(name=model_name)
123 |         _logger.info(f'Deleted MLflow Model Registry model: {model_name}')
124 | 
125 |     def _check_mlflow_experiments_exists(self) -> dict:
126 |         """
127 |         The demo workflow consists of creating 2 MLflow Tracking experiments:
128 |             * train_experiment - Experiment used to track params, metrics, artifacts during model training
129 |             * deploy_experiment - Experiment used to track metrics when comparing models during the model deployment step
130 | 
131 |         This method checks the demo_setup config dict for either the experiment_id or experiment_path for both
132 |         experiments.
133 | 
134 |         A dictionary containing the keys train_exp_exists and deploy_exp_exists along with boolean values is returned
135 | 
136 |         Returns
137 |         -------
138 |         Dictionary indicating whether train and deploy MLflow experiments currently exist
139 |         """
140 |         train_experiment_id = self._get_train_experiment_id()
141 |         train_experiment_path = self._get_train_experiment_path()
142 |         deploy_experiment_id = self._get_deploy_experiment_id()
143 |         deploy_experiment_path = self._get_deploy_experiment_path()
144 | 
145 |         def check_by_experiment_id(experiment_id):
146 |             try:
147 |                 mlflow.get_experiment(experiment_id=experiment_id)
148 |                 _logger.info(f'MLflow Tracking experiment_id: {experiment_id} exists')
149 |                 return True
150 |             except RestException:
151 |                 _logger.info(f'MLflow Tracking experiment_id: {experiment_id} DOES NOT exist')
152 |                 return False
153 | 
154 |         def check_by_experiment_path(experiment_path):
155 |             experiment = mlflow.get_experiment_by_name(name=experiment_path)
156 |             if experiment is not None:
157 |                 _logger.info(f'MLflow Tracking experiment_path: {experiment_path} exists')
158 |                 return True
159 |             else:
160 |                 _logger.info(f'MLflow Tracking experiment_path: {experiment_path} DOES NOT exist')
161 |                 return False
162 | 
163 |         if train_experiment_id is not None:
164 |             train_exp_exists = check_by_experiment_id(train_experiment_id)
165 |         elif train_experiment_path is not None:
166 |             train_exp_exists = check_by_experiment_path(train_experiment_path)
167 |         else:
168 |             raise RuntimeError('Either model_train_experiment_id or model_train_experiment_path should be passed in '
169 |                                'deployment.yml')
170 | 
171 |         if deploy_experiment_id is not None:
172 |             deploy_exp_exists = check_by_experiment_id(deploy_experiment_id)
173 |         elif deploy_experiment_path is not None:
174 |             deploy_exp_exists = check_by_experiment_path(deploy_experiment_path)
175 |         else:
176 |             raise RuntimeError('Either model_deploy_experiment_id or model_deploy_experiment_path should be passed in '
177 |                                'deployment.yml')
178 | 
179 |         return {'train_exp_exists': train_exp_exists,
180 |                 'deploy_exp_exists': deploy_exp_exists}
181 | 
182 |     def _delete_mlflow_experiments(self, exp_exists_dict: dict):
183 |         """
184 |         Check exp_exists_dict if train_exp_exists or deploy_exp_exists is True. Delete experiments if they exist.
185 | 
186 |         Parameters
187 |         ----------
188 |         exp_exists_dict : dict
189 |             A dictionary containing the keys train_exp_exists and deploy_exp_exists along with boolean values
190 |         """
191 |         delete_experiments = [exp for exp, exists in exp_exists_dict.items() if exists]
192 |         if len(delete_experiments) == 0:
193 |             _logger.info('No existing experiments to delete')
194 |         if 'train_exp_exists' in delete_experiments:
195 |             if self.env_vars.get('model_train_experiment_path') is not None:
196 |                 experiment = mlflow.get_experiment_by_name(name=self.env_vars['model_train_experiment_path'])
197 |                 mlflow.delete_experiment(experiment_id=experiment.experiment_id)
198 |                 _logger.info(f'Deleted existing experiment_path: {self.env_vars["model_train_experiment_path"]}')
199 |             elif self.env_vars.get('model_train_experiment_id') is not None:
200 |                 mlflow.delete_experiment(experiment_id=self.env_vars['model_train_experiment_id'])
201 |                 _logger.info(f'Deleted existing experiment_id: {self.env_vars["model_train_experiment_id"]}')
202 |             else:
203 |                 raise RuntimeError('Either model_train_experiment_id or model_train_experiment_path should be passed '
204 |                                    'in deployment.yml')
205 | 
206 |         if 'deploy_exp_exists' in delete_experiments:
207 |             if self.env_vars.get('model_deploy_experiment_path') is not None:
208 |                 experiment = mlflow.get_experiment_by_name(name=self.env_vars['model_deploy_experiment_path'])
209 |                 mlflow.delete_experiment(experiment_id=experiment.experiment_id)
210 |                 _logger.info(
211 |                     f'Deleted existing experiment_path: {self.env_vars["model_deploy_experiment_path"]}')
212 |             elif self.env_vars.get('model_deploy_experiment_id') is not None:
213 |                 mlflow.delete_experiment(experiment_id=self.env_vars['model_deploy_experiment_id'])
214 |                 _logger.info(f'Deleted existing experiment_id: {self.env_vars["model_deploy_experiment_id"]}')
215 | 
216 |     @staticmethod
217 |     def _check_feature_table_exists(feature_store_table) -> bool:
218 |         """
219 |         Check if Feature Store feature table exists.
220 |         Returns True if feature table exists in Feature Store, False if not
221 |         """
222 |         try:
223 |             fs.get_table(name=feature_store_table)
224 |             _logger.info(f'Feature Store feature table: {feature_store_table} exists')
225 |             return True
226 |         except Exception:
227 |             _logger.info(f'Feature Store feature table: {feature_store_table} DOES NOT exist')
228 |             return False
229 | 
230 |     @staticmethod
231 |     def _drop_feature_table(feature_store_table):
232 |         """
233 |         Delete Feature Store feature table
234 |         """
235 |         try:
236 |             fs.drop_table(
237 |                 name=feature_store_table
238 |             )
239 |             _logger.info(f'Deleted Feature Store feature table: {feature_store_table}')
240 |         except ValueError:
241 |             _logger.info(f'Feature Store feature table: {feature_store_table} does not exist')
242 | 
243 |     def _check_labels_delta_table_exists(self, labels_table_dbfs_path) -> bool:
244 |         """
245 |         Check if Delta table exists in DBFS
246 | 
247 |         Parameters
248 |         ----------
249 |         labels_table_dbfs_path : str
250 |             Path to Delta table in DBFS
251 | 
252 |         Returns
253 |         -------
254 |         bool
255 |         """
256 |         try:
257 |             dbutils.fs.ls(labels_table_dbfs_path)  # dbutils is available globally within Databricks notebooks
258 |             _logger.info(f'Labels Delta table: {labels_table_dbfs_path} exists')
259 |             return True
260 |         except Exception:
261 |             _logger.info(f'Labels Delta table: {labels_table_dbfs_path} DOES NOT exist')
262 |             return False
263 | 
264 |     def _delete_labels_delta_table(self, labels_table_dbfs_path):
265 |         dbutils.fs.rm(labels_table_dbfs_path, True)
266 |         _logger.info(f'Deleted labels Delta table: {labels_table_dbfs_path}')
267 | 
268 |     def run(self):
269 |         """
270 |         Demo setup steps:
271 |             * Delete Model Registry model if it exists (archiving any existing model versions)
272 |             * Delete MLflow experiments if they exist
273 |             * Delete Feature Table if it exists
274 |         """
275 |         _logger.info('==========Demo Setup==========')
276 |         _logger.info(f'Running demo-setup pipeline in {self.env_vars["env"]} environment')
277 | 
278 |         if self.conf['delete_model_registry']:
279 |             _logger.info('Checking MLflow Model Registry...')
280 |             model_name = self.env_vars['model_name']
281 |             if self._check_mlflow_model_registry_exists(model_name):
282 |                 self._delete_registered_model(model_name)
283 | 
284 |         if self.conf['delete_mlflow_experiments']:
285 |             _logger.info('Checking MLflow Tracking...')
286 |             exp_exists_dict = self._check_mlflow_experiments_exists()
287 |             self._delete_mlflow_experiments(exp_exists_dict)
288 | 
289 |         if self.conf['drop_feature_table']:
290 |             _logger.info('Checking Feature Store...')
291 |             feature_store_database_name = self.env_vars['feature_store_database_name']
292 |             feature_store_table_name = self.env_vars['feature_store_table_name']
293 |             feature_store_table = f'{feature_store_database_name}.{feature_store_table_name}'
294 |             if self._check_feature_table_exists(feature_store_table=feature_store_table):
295 |                 self._drop_feature_table(feature_store_table=feature_store_table)
296 | 
297 |         if self.conf['drop_labels_table']:
298 |             _logger.info('Checking existing labels table...')
299 |             labels_table_dbfs_path = self.env_vars['labels_table_dbfs_path']
300 |             if self._check_labels_delta_table_exists(labels_table_dbfs_path=labels_table_dbfs_path):
301 |                 self._delete_labels_delta_table(labels_table_dbfs_path=labels_table_dbfs_path)
302 | 
303 |         _logger.info('==========Demo Setup Complete==========')
304 | 
305 | # COMMAND ----------
306 | 
307 | # DBTITLE 1,Execute Pipeline
308 | # Instantiate pipeline
309 | demo_setup_pipeline = DemoSetup(conf=pipeline_config, env_vars=env_vars)
310 | demo_setup_pipeline.run()
311 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # [DEPRECATED] e2e-mlops
2 | 
3 | > **NOTE:** This repository is deprecated as of 2022/11/11. The end-to-end MLOps workflow demonstrated in this project was designed with recommended tooling available at the time. Since the release of this repo, Databricks has built a product-supported MLOps template called MLOps Stacks (currently in private preview). The repo for MLOps Stacks can be found [here](https://github.com/databricks/mlops-stack). If you would like to express interest and enroll in this private preview, please complete [this questionnaire](https://docs.google.com/forms/d/e/1FAIpQLSfHXCmkbsEURjQQvtUGObgh2D5q1eD4YRHnUxZ0M4Hu0W63WA/viewform).
4 | 
5 | ---
6 | 
7 | This repo is intended to demonstrate an end-to-end MLOps workflow on Databricks, where a model is deployed along with its ancillary pipelines to a specified (currently single) Databricks workspace.
8 | Each pipeline (e.g. model training pipeline, model deployment pipeline) is deployed as a [Databricks job](https://docs.databricks.com/data-engineering/jobs/jobs.html); these jobs are deployed to a Databricks workspace using Databricks Labs' [`dbx`](https://dbx.readthedocs.io/en/latest/index.html) tool.
9 | 
10 | The use case at hand is a churn prediction problem. We use the [IBM Telco Customer Churn dataset](https://community.ibm.com/community/user/businessanalytics/blogs/steven-macko/2019/07/11/telco-customer-churn-1113) to build a simple classifier to predict whether a customer will churn from a fictional telco company.
11 | 
12 | Note that the package is solely developed via an IDE, and as such there are no Databricks Notebooks in the repository. All jobs are executed via a command-line-based workflow using [`dbx`](https://dbx.readthedocs.io/en/latest/).
13 | 
14 | ## Pipelines
15 | 
16 | The following pipelines are currently defined within the package:
17 | - `demo-setup`
18 |     - Deletes existing feature store tables, existing MLflow experiments and models registered to MLflow Model Registry,
19 |       in order to start afresh for a demo.
20 | - `feature-table-creation`
21 |     - Creates a new feature table and separate labels Delta table.
22 | - `model-train`
23 |     - Trains a scikit-learn Random Forest model.
24 | - `model-deployment`
25 |     - Compares the Staging versus Production models in the MLflow Model Registry. Transitions the Staging model to
26 |       Production if it outperforms the current Production model.
27 | - `model-inference-batch`
28 |     - Loads a model from MLflow Model Registry, loads features from Feature Store and scores the batch.
29 | 
30 | ## Demo
31 | The following outlines the workflow to demo the repo.
32 | 
33 | ### Set up
34 | 1. Fork https://github.com/niall-turbitt/e2e-mlops
35 | 1. Configure [Databricks CLI connection profile](https://docs.databricks.com/dev-tools/cli/index.html#connection-profiles)
36 |     - The project is designed to use 3 different Databricks CLI connection profiles: dev, staging and prod.
37 |       These profiles are set in [e2e-mlops/.dbx/project.json](https://github.com/niall-turbitt/e2e-mlops/blob/main/.dbx/project.json).
38 |     - Note that for demo purposes we use the same connection profile for each of the 3 environments.
39 |       **In practice each profile would correspond to separate dev, staging and prod Databricks workspaces.**
40 |     - This [project.json](https://github.com/niall-turbitt/e2e-mlops/blob/main/.dbx/project.json) file will have to be
41 |       adjusted to match the connection profiles a user has configured on their local machine.
42 | 1. Configure Databricks secrets for GitHub Actions (ensure GitHub Actions are enabled for your forked project, as they are off by default in a forked repo).
43 |     - Within the GitHub project navigate to Secrets under the project settings
44 |     - To run the GitHub Actions workflows we require the following GitHub Actions secrets:
45 |         - `DATABRICKS_STAGING_HOST`
46 |             - URL of Databricks staging workspace
47 |         - `DATABRICKS_STAGING_TOKEN`
48 |             - [Databricks access token](https://docs.databricks.com/dev-tools/api/latest/authentication.html) for staging workspace
49 |         - `DATABRICKS_PROD_HOST`
50 |             - URL of Databricks production workspace
51 |         - `DATABRICKS_PROD_TOKEN`
52 |             - [Databricks access token](https://docs.databricks.com/dev-tools/api/latest/authentication.html) for production workspace
53 |         - `GH_TOKEN`
54 |             - GitHub [personal access token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token)
55 | 
56 | #### ASIDE: Starting from scratch
57 | 
58 | The following resources should not be present if starting from scratch:
59 | - Feature table must be deleted
60 |     - The table e2e_mlops_testing.churn_features will be created when the feature-table-creation pipeline is triggered.
61 | - MLflow experiment
62 |     - MLflow experiments are used during model training and model deployment in both the dev and prod environments.
63 |       The paths to these experiments are configured in [conf/deployment.yml](https://github.com/niall-turbitt/e2e-mlops/blob/main/conf/deployment.yml).
64 |     - For demo purposes, we delete these experiments if they exist to begin from a blank slate.
65 | - Model Registry
66 |     - Delete the model in the MLflow Model Registry if it exists.
67 | 
68 | **NOTE:** As part of the `initial-model-train-register` multitask job, the first task `demo-setup` will delete these,
69 | as specified in [`demo_setup.yml`](https://github.com/niall-turbitt/e2e-mlops/blob/main/conf/job_configs/demo_setup.yml).
70 | 
71 | ### Workflow
72 | 
73 | 1. **Run `PROD-telco-churn-initial-model-train-register` multitask job in prod environment**
74 | 
75 |     - To demonstrate a CICD workflow, we want to start from a “steady state” where there is a current model in production.
76 |       As such, we will manually trigger a multitask job to do the following steps:
77 |         1. Set up the workspace for the demo by deleting existing MLflow experiments and registered models, along with
78 |            existing Feature Store and labels tables.
79 |         1. Create a new Feature Store table to be used by the model training pipeline.
80 |         1. Train an initial “baseline” model
81 |     - There is then a final manual step to promote this newly trained model to production via the MLflow Model Registry UI.
82 | 
83 |     - Outlined below are the detailed steps to do this:
84 | 
85 |     1. Run the multitask `PROD-telco-churn-initial-model-train-register` job via an automated job cluster in the prod environment
86 |         - **NOTE:** multitask jobs can currently only be run via `dbx deploy; dbx launch`.
87 |         ```
88 |         dbx deploy --jobs=PROD-telco-churn-initial-model-train-register --environment=prod --files-only
89 |         dbx launch --job=PROD-telco-churn-initial-model-train-register --environment=prod --as-run-submit --trace
90 |         ```
91 |         See the Limitations section below regarding running multitask jobs. In order to reduce cluster start up time
92 |         you may want to consider using a [Databricks pool](https://docs.databricks.com/clusters/instance-pools/index.html),
93 |         and specify this pool ID in [`conf/deployment.yml`](https://github.com/niall-turbitt/e2e-mlops/blob/main/conf/deployment.yml).
94 |     - `PROD-telco-churn-initial-model-train-register` tasks:
95 |         1. Demo setup task steps ([`demo-setup`](https://github.com/niall-turbitt/e2e-mlops/blob/main/telco_churn/jobs/demo_setup_job.py))
96 |             1. Delete Model Registry model if it exists (archiving any existing model versions).
97 |             1. Delete MLflow experiment if it exists.
98 |             1. Delete Feature Table if it exists.
99 |         1. Feature table creation task steps (`feature-table-creation`)
100 |             1. Creates new churn_features feature table in the Feature Store.
101 |                 - **NOTE:** `ibm_telco_churn.bronze_customers` is a table created from the following [dataset](https://www.kaggle.com/datasets/yeanzc/telco-customer-churn-ibm-dataset). This will not be automatically available in your Databricks workspace. You will have to create this table (or update the `feature-table-creation` config to point at this dataset) in your own workspace.
102 |         1. Model train task steps (`model-train`)
103 |             1. Train initial “baseline” classifier (RandomForestClassifier - `max_depth=4`)
104 |                 - **NOTE:** no changes to config need to be made at this point
105 |             1. Register the model. Model version 1 will be registered to `stage=None` upon successful model training.
106 |     1. **Manual Step**: MLflow Model Registry UI promotion to `stage='Production'`
107 |         - Go to MLflow Model Registry and manually promote the model to `stage='Production'`.
108 | 
109 | 
110 | 2. **Code change / model update (Continuous Integration)**
111 | 
112 |     - Create new “dev/new_model” branch
113 |         - `git checkout -b dev/new_model`
114 |     - Make a change to the [`model_train.yml`](https://github.com/niall-turbitt/e2e-mlops/blob/main/conf/job_configs/model_train.yml) config file, updating `max_depth` under model_params from 4 to 8
115 |     - Optional: change the run name under mlflow params in the [`model_train.yml`](https://github.com/niall-turbitt/e2e-mlops/blob/main/conf/job_configs/model_train.yml) config file
116 |     - Create a pull request to merge the branch dev/new_model into main.
117 | 
118 |     * On pull request the following steps are triggered in the GitHub Actions workflow:
119 |         1. Trigger unit tests
120 |         1. Trigger integration tests
121 |     * Note that once the tests pass, this merge request will have to be confirmed in GitHub.
122 | 
123 | 
124 | 3. **Cut release**
125 | 
126 |     - Create tag (e.g. `v0.0.1`)
127 |         - `git tag -a <tag-name> -m "Message"`
128 |         - Note that tags are matched to `v*`, e.g. `v1.0`, `v20.15.10`
129 |     - Push tag
130 |         - `git push origin <tag-name>`
131 | 
132 |     - On pushing the tag, the following steps are triggered in the [`onrelease.yml`](https://github.com/niall-turbitt/e2e-mlops/blob/main/.github/workflows/onrelease.yml) GitHub Actions workflow:
133 |         1. Trigger unit tests.
134 |         1. Deploy `PROD-telco-churn-model-train` job to the prod environment.
135 |         1. Deploy `PROD-telco-churn-model-deployment` job to the prod environment.
136 |         1. Deploy `PROD-telco-churn-model-inference-batch` job to the prod environment.
137 |     - These jobs will now all be present in the specified workspace, and visible under the [Workflows](https://docs.databricks.com/data-engineering/jobs/index.html) tab.
138 | 
139 | 
140 | 4. **Run `PROD-telco-churn-model-train` job in the prod environment**
141 |     - Manually trigger the job via the UI
142 |         - In the Databricks workspace (prod environment) go to `Workflows` > `Jobs`, where the `PROD-telco-churn-model-train` job will be present.
143 |         - Click into PROD-telco-churn-model-train and select ‘Run Now’. Doing so will trigger the job on the specified cluster configuration.
144 |     - Alternatively you can trigger the job using the Databricks CLI:
145 |         - `databricks jobs run-now --job-id JOB_ID`
146 | 
147 |     - Model train job steps (`PROD-telco-churn-model-train`)
148 |         1. Train improved “new” classifier (RandomForestClassifier - `max_depth=8`)
149 |         1. Register the model. Model version 2 will be registered to `stage=None` upon successful model training.
150 |         1. **Manual Step**: MLflow Model Registry UI promotion to `stage='Staging'`
151 |             - Go to Model Registry and manually promote the model to `stage='Staging'`
152 | 
153 |     **ASIDE:** At this point, there should now be two model versions registered in MLflow Model Registry:
154 | 
155 |     - Version 1 (Production): RandomForestClassifier (`max_depth=4`)
156 |     - Version 2 (Staging): RandomForestClassifier (`max_depth=8`)
157 | 
158 | 
159 | 5. **Run `PROD-telco-churn-model-deployment` job (Continuous Deployment) in the prod environment**
160 |     - Manually trigger the job via the UI
161 |         - In the Databricks workspace go to `Workflows` > `Jobs`, where the `PROD-telco-churn-model-deployment` job will be present.
162 |         - Click into PROD-telco-churn-model-deployment and click ‘Run Now’. Doing so will trigger the job on the specified cluster configuration.
163 |     - Alternatively you can trigger the job using the Databricks CLI:
164 |         - `databricks jobs run-now --job-id JOB_ID`
165 | 
166 |     - Model deployment job steps (`PROD-telco-churn-model-deployment`)
167 |         1. Compare the new “candidate model” in `stage='Staging'` versus the current Production model in `stage='Production'`.
168 |             1. Comparison criteria are set through [`model_deployment.yml`](https://github.com/niall-turbitt/e2e-mlops/blob/main/conf/job_configs/model_deployment.yml)
169 |             1. Compute predictions using both models against a specified reference dataset
170 |             1. If the Staging model performs better than the Production model, promote the Staging model to Production and archive the existing Production model
171 |             1. If the Staging model performs worse than the Production model, archive the Staging model
172 | 
173 | 
174 | 6. **Run `PROD-telco-churn-model-inference-batch` job in the prod environment**
175 |     - Manually trigger the job via the UI
176 |         - In the Databricks workspace go to `Workflows` > `Jobs`, where the `PROD-telco-churn-model-inference-batch` job will be present.
177 |         - Click into PROD-telco-churn-model-inference-batch and click ‘Run Now’. Doing so will trigger the job on the specified cluster configuration.
178 |     - Alternatively you can trigger the job using the Databricks CLI:
179 |         - `databricks jobs run-now --job-id JOB_ID`
180 | 
181 |     - Batch model inference steps (`PROD-telco-churn-model-inference-batch`)
182 |         1. Load model from stage=Production in Model Registry
183 |             - **NOTE:** the model must have been logged to MLflow using the Feature Store API
184 |         1. Use primary keys in the specified inference input data to load features from the feature store
185 |         1. Apply the loaded model to the loaded features
186 |         1. Write predictions to the specified Delta path
187 | 
188 | ## Limitations
189 | - Multitask jobs running against the same cluster
190 |     - The pipeline initial-model-train-register is a [multitask job](https://docs.databricks.com/data-engineering/jobs/index.html)
191 |       which stitches together the demo setup, feature table creation and model train pipelines.
192 |     - At present, each of these tasks within the multitask job is executed on a different automated job cluster,
193 |       rather than all tasks being executed on the same cluster. As such, there will be time incurred for each task to acquire
194 |       cluster resources and install dependencies.
195 |     - As above, we recommend using a pool from which instances can be acquired when jobs are launched to reduce cluster start up time.
196 | 
197 | ---
198 | ## Development
199 | 
200 | To work on this project, you need Python 3.X and `pip` or `conda` for package management.
201 | 
202 | ### Installing project requirements
203 | 
204 | ```bash
205 | pip install -r unit-requirements.txt
206 | ```
207 | 
208 | ### Install project package in development mode
209 | 
210 | ```bash
211 | pip install -e .
212 | ```
213 | 
214 | ### Testing
215 | 
216 | #### Running unit tests
217 | 
218 | For unit testing, please use `pytest`:
219 | ```
220 | pytest tests/unit --cov
221 | ```
222 | 
223 | Please check the directory `tests/unit` for more details on how to use unit tests.
224 | In `tests/unit/conftest.py` you'll also find useful testing primitives, such as a local Spark instance with Delta support, a local MLflow setup and a DBUtils fixture; a sketch of such a fixture is shown below.
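
For illustration, here is a minimal sketch of what a Delta-enabled local Spark fixture can look like. The fixture name and configuration values are illustrative assumptions (not the exact contents of `tests/unit/conftest.py`), relying only on the `pytest`, `pyspark` and `delta-spark` packages listed in `unit-requirements.txt`:

```python
# Illustrative sketch of a conftest.py-style fixture providing a local Spark
# session with Delta Lake support. Names/configs are assumptions; see
# tests/unit/conftest.py for the actual primitives used in this repo.
import pytest
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip


@pytest.fixture(scope="session")
def spark() -> SparkSession:
    builder = (
        SparkSession.builder.master("local[1]")
        .appName("unit-tests")
        # Enable Delta Lake support in the local session
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog",
                "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    )
    # configure_spark_with_delta_pip wires in the delta-core dependency
    spark = configure_spark_with_delta_pip(builder).getOrCreate()
    yield spark
    spark.stop()
```

Tests can then simply declare a `spark` argument to receive the session, e.g. `def test_featurize(spark): ...`.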
225 | 
226 | #### Running integration tests
227 | 
228 | There are two options for running integration tests:
229 | 
230 | - On an interactive cluster via `dbx execute`
231 | - On a job cluster via `dbx launch`
232 | 
233 | For quicker startup of the job clusters we recommend using instance pools ([AWS](https://docs.databricks.com/clusters/instance-pools/index.html), [Azure](https://docs.microsoft.com/en-us/azure/databricks/clusters/instance-pools/), [GCP](https://docs.gcp.databricks.com/clusters/instance-pools/index.html)).
234 | 
235 | For an integration test on an interactive cluster, use the following command:
236 | ```
237 | dbx execute --cluster-name=<name-of-cluster> --job=<name-of-job>
238 | ```
239 | 
240 | For a test on an automated job cluster, deploy the job files and then launch:
241 | ```
242 | dbx deploy --jobs=<name-of-job> --files-only
243 | dbx launch --job=<name-of-job> --as-run-submit --trace
244 | ```
245 | 
246 | Please note that for testing we recommend using [jobless deployments](https://dbx.readthedocs.io/en/latest/guidance/run_submit.html), so you won't affect existing job definitions.
247 | 
248 | ### Interactive execution and development on Databricks clusters
249 | 
250 | 1. `dbx` expects that the cluster for interactive execution supports `%pip` and `%conda` magic [commands](https://docs.databricks.com/libraries/notebooks-python-libraries.html).
251 | 2. Please configure your job in the `conf/deployment.yml` file.
252 | 3. To execute the code interactively, provide either `--cluster-id` or `--cluster-name`.
253 | ```bash
254 | dbx execute \
255 |     --cluster-name="<some-cluster-name>" \
256 |     --job=<job-name>
257 | ```
258 | 
259 | Multiple users can also use the same cluster for development; libraries will be isolated per execution context.
260 | 
261 | ### Working with notebooks and Repos
262 | 
263 | To start working with your notebooks from [Repos](https://docs.databricks.com/repos/index.html), do the following steps:
264 | 
265 | 1. Add your git provider token to your user settings
266 | 2. Add your repository to Repos. This could be done via the UI, or via the CLI command below:
267 | ```bash
268 | databricks repos create --url <your-repo-url> --provider <your-provider>
269 | ```
270 | This command will create your personal repository under `/Repos/<username>/telco_churn`.
271 | 3. To set up the CI/CD pipeline with the notebook, create a separate `Staging` repo:
272 | ```bash
273 | databricks repos create --url <your-repo-url> --provider <your-provider> --path /Repos/Staging/telco_churn
274 | ```
275 | 
--------------------------------------------------------------------------------