├── telco_churn
│   ├── pipelines
│   │   ├── __init__.py
│   │   ├── sample_test_job.py
│   │   ├── model_inference_batch_job.py
│   │   ├── model_deployment_job.py
│   │   ├── feature_table_creator_job.py
│   │   ├── model_train_job.py
│   │   └── demo_setup_job.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── get_spark.py
│   │   ├── logger_utils.py
│   │   ├── notebook_utils.py
│   │   ├── feature_store_utils.py
│   │   └── evaluation_utils.py
│   ├── __init__.py
│   ├── model_train_pipeline.py
│   ├── model_inference.py
│   ├── featurize.py
│   ├── feature_table_creator.py
│   ├── common.py
│   ├── model_train.py
│   └── model_deployment.py
├── conf
│   ├── staging
│   │   └── .staging.env
│   ├── pipeline_configs
│   │   ├── sample_test.yml
│   │   ├── model_deployment.yml
│   │   ├── model_train.yml
│   │   ├── model_inference_batch.yml
│   │   ├── feature_table_creator.yml
│   │   └── demo_setup.yml
│   ├── dev
│   │   └── .dev.env
│   ├── prod
│   │   └── .prod.env
│   ├── .base_data_params.env
│   └── deployment.yml
├── requirements.txt
├── pytest.ini
├── unit-requirements.txt
├── .coveragerc
├── setup.py
├── .gitignore
├── .dbx
│   └── project.json
├── .github
│   └── workflows
│       ├── onpullrequest.yml
│       └── onrelease.yml
├── tests
│   ├── integration
│   │   └── sample_test.py
│   └── unit
│       ├── model_train_pipeline_test.py
│       └── conftest.py
├── notebooks
│   ├── model_inference_batch.py
│   ├── feature_table_creator.py
│   ├── model_train.py
│   ├── model_deployment.py
│   └── demo_setup.py
└── README.md

--------------------------------------------------------------------------------
/telco_churn/pipelines/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/telco_churn/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/conf/staging/.staging.env:
--------------------------------------------------------------------------------
env=staging
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
python-dotenv==0.20.0
--------------------------------------------------------------------------------
/telco_churn/__init__.py:
--------------------------------------------------------------------------------
__version__ = "0.0.1"
--------------------------------------------------------------------------------
/conf/pipeline_configs/sample_test.yml:
--------------------------------------------------------------------------------
output_format: 'delta'
output_path: 'dbfs:/dbx/tmp/test/e2e_mlops'
--------------------------------------------------------------------------------
/telco_churn/utils/get_spark.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
--------------------------------------------------------------------------------
/conf/pipeline_configs/model_deployment.yml:
--------------------------------------------------------------------------------
model_comparison_params:
  metric: 'roc_auc_score'
  higher_is_better: True
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
[pytest]
addopts = -s -p no:warnings
log_cli = 1
log_cli_level = INFO
log_cli_format = [pytest][%(asctime)s][%(levelname)s][%(module)s][%(funcName)s] %(message)s
log_cli_date_format = %Y-%m-%d %H:%M:%S
log_level = INFO
--------------------------------------------------------------------------------
/unit-requirements.txt:
--------------------------------------------------------------------------------
setuptools==58.0.4
wheel==0.37.0
pyspark
numpy==1.20.3
pandas==1.3.4
scikit-learn==0.24.2
pyyaml==6.0
pytest==7.1.2
pytest-cov==3.0.0
dbx==0.5.0
delta-spark
python-dotenv==0.20.0
--------------------------------------------------------------------------------
/conf/pipeline_configs/model_train.yml:
--------------------------------------------------------------------------------
mlflow_params:
  run_name: 'random_forest_baseline'


pipeline_params:
  test_size: 0.25
  random_state: 42

model_params:
  n_estimators: 100
  max_depth: 4
  min_samples_leaf: 1
  max_features: 'auto'
  random_state: 42
--------------------------------------------------------------------------------
/conf/pipeline_configs/model_inference_batch.yml:
--------------------------------------------------------------------------------
mlflow_params:
  model_registry_stage: 'production'

data_input:
  # Input table to score the model on - must contain column(s) for lookup keys
  # to join feature data from Feature Store
  table_name: 'e2e_mlops_prod.churn_labels'

data_output:
  mode: 'overwrite'
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
[run]
branch = True
source = telco_churn

[report]
exclude_lines =
    if self.debug:
    pragma: no cover
    raise NotImplementedError
    if __name__ == .__main__.:

ignore_errors = True
omit =
    tests/*
    setup.py
    # this file is autogenerated by dbx
    telco_churn/common.py
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import find_packages, setup
from telco_churn import __version__

setup(
    name='telco_churn',
    packages=find_packages(exclude=['tests', 'tests.*']),
    setup_requires=['wheel'],
    version=__version__,
    description='Demo repository implementing an end-to-end MLOps workflow on Databricks. Project derived from dbx '
                'basic python template',
    author='Joseph Bradley, Rafi Kurlansik, Matthew Thomson, Niall Turbitt'
)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
*.egg-info/
.eggs
build
dist

# venv
venv

# Unit test / coverage reports
.coverage
coverage.xml
junit/*
htmlcov/*

# Caches
.pytest_cache/

# VSCode
.vscode/

# Idea
.idea/
*.iml

# MacOS
.DS_Store

# Databricks eXtensions
.dbx/lock.json

# local mlflow files
mlruns/
--------------------------------------------------------------------------------
/conf/pipeline_configs/feature_table_creator.yml:
--------------------------------------------------------------------------------
input_table: 'ibm_telco_churn.bronze_customers'

data_prep_params:
  label_col: 'churnString'
  ohe: False
  # Only require cat_cols if ohe=True
  # cat_cols: ['gender', 'partner', 'dependents',
  #            'phoneService', 'multipleLines', 'internetService',
  #            'onlineSecurity', 'onlineBackup', 'deviceProtection',
  #            'techSupport', 'streamingTV', 'streamingMovies',
  #            'contract', 'paperlessBilling', 'paymentMethod']
  drop_missing: False
--------------------------------------------------------------------------------
/conf/pipeline_configs/demo_setup.yml:
--------------------------------------------------------------------------------
# Delete MLflow Registry model
# Model name set in deployment.yml
delete_model_registry: True

# Delete MLflow Tracking experiments (both the training experiment and the deployment experiment)
# Experiment paths/id set in deployment.yml
delete_mlflow_experiments: True

# Drop Feature Store feature table if it exists
# Feature table name set in deployment.yml
drop_feature_table: True

# Drop labels table if it exists
# Label table name set in deployment.yml
drop_labels_table: True
--------------------------------------------------------------------------------
/telco_churn/pipelines/sample_test_job.py:
--------------------------------------------------------------------------------
from telco_churn.common import Workload


class SampleJob(Workload):

    def launch(self):
        self.logger.info('Launching sample job')

        listing = self.dbutils.fs.ls('dbfs:/')

        for l in listing:
            self.logger.info(f'DBFS directory: {l}')

        df = self.spark.range(0, 1000)

        df.write.format(self.conf['output_format']).mode('overwrite').save(
            self.conf['output_path']
        )

        self.logger.info('Sample job finished!')


if __name__ == '__main__':
    job = SampleJob()
    job.launch()
--------------------------------------------------------------------------------
/.dbx/project.json:
--------------------------------------------------------------------------------
{
  "environments": {
    "dev": {
      "profile": "e2-demo-west",
      "workspace_dir": "/Shared/e2e_mlops/dev/dbx/e2e_mlops_dev",
      "artifact_location": "dbfs:/Shared/e2e_mlops/dev/dbx/projects/e2e_mlops_dbx_dev"
    },
    "staging": {
      "profile": "e2-demo-west",
      "workspace_dir": "/Shared/e2e_mlops/staging/dbx/e2e_mlops_staging",
      "artifact_location": "dbfs:/Shared/e2e_mlops/staging/dbx/projects/e2e_mlops_dbx_staging"
    },
    "prod": {
      "profile": "e2-demo-west",
      "workspace_dir": "/Shared/e2e_mlops/prod/dbx/e2e_mlops_prod",
      "artifact_location": "dbfs:/Shared/e2e_mlops/prod/dbx/projects/e2e_mlops_dbx_prod"
    }
  }
}
"/Shared/e2e_mlops/staging/dbx/e2e_mlops_staging", 11 | "artifact_location": "dbfs:/Shared/e2e_mlops/staging/dbx/projects/e2e_mlops_dbx_staging" 12 | }, 13 | "prod": { 14 | "profile": "e2-demo-west", 15 | "workspace_dir": "/Shared/e2e_mlops/prod/dbx/e2e_mlops_prod", 16 | "artifact_location": "dbfs:/Shared/e2e_mlops/prod/dbx/projects/e2e_mlops_dbx_prod" 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /conf/dev/.dev.env: -------------------------------------------------------------------------------- 1 | env=dev 2 | 3 | // Global MLflow params for dev 4 | model_train_experiment_path='/Shared/e2e_mlops/dev/telco_churn_experiment_dev' 5 | model_name=e2e_mlops_telco_churn_dev 6 | model_deploy_experiment_path='/Shared/e2e_mlops/dev/telco_churn_deployment_dev' 7 | 8 | // Feature Store params 9 | feature_store_database_name='e2e_mlops_dev' 10 | 11 | // Labels table params 12 | labels_table_database_name='e2e_mlops_dev' 13 | //tmp directory for demo purposes 14 | labels_table_dbfs_path='dbfs:/tmp/e2e_mlops/dev/churn_labels.delta' 15 | 16 | // Batch inference predictions table params 17 | predictions_table_database_name='e2e_mlops_dev' 18 | predictions_table_name = 'churn_predictions' 19 | 20 | // Reference table params - table to use for comparing staging vs production models 21 | reference_table_database_name='e2e_mlops_dev' -------------------------------------------------------------------------------- /conf/prod/.prod.env: -------------------------------------------------------------------------------- 1 | env=prod 2 | 3 | // Global MLflow params for prod 4 | model_train_experiment_path='/Shared/e2e_mlops/prod/telco_churn_experiment_prod' 5 | model_name='e2e_mlops_telco_churn_prod' 6 | model_deploy_experiment_path='/Shared/e2e_mlops/prod/telco_churn_deployment_prod' 7 | 8 | // Feature Store params 9 | feature_store_database_name='e2e_mlops_prod' 10 | 11 | // Labels table params 12 | labels_table_database_name='e2e_mlops_prod' 13 | // tmp directory for demo purposes 14 | labels_table_dbfs_path='dbfs:/tmp/e2e_mlops/prod/churn_labels.delta' 15 | 16 | // Batch inference predictions table params 17 | predictions_table_database_name='e2e_mlops_prod' 18 | predictions_table_name = 'churn_predictions' 19 | 20 | // Reference table params - table to use for comparing staging vs production models 21 | reference_table_database_name='e2e_mlops_prod' 22 | 23 | -------------------------------------------------------------------------------- /telco_churn/utils/logger_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class NoReceivedCommandFilter(logging.Filter): 5 | def filter(self, record): 6 | if 'Received command c' not in record.getMessage(): 7 | return record.getMessage() 8 | 9 | 10 | class NoPythonDotEnvFilter(logging.Filter): 11 | def filter(self, record): 12 | if 'Python-dotenv' not in record.getMessage(): 13 | return record.getMessage() 14 | 15 | 16 | def get_logger(): 17 | logging.getLogger('py4j.java_gateway').setLevel(logging.ERROR) 18 | logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO) 19 | logger = logging.getLogger(__name__) 20 | 21 | filter_1 = NoReceivedCommandFilter() 22 | filter_2 = NoPythonDotEnvFilter() 23 | logger.addFilter(filter_1) 24 | logger.addFilter(filter_2) 25 | 26 | return logger 27 | -------------------------------------------------------------------------------- /conf/.base_data_params.env: 
--------------------------------------------------------------------------------
/conf/.base_data_params.env:
--------------------------------------------------------------------------------
// Base data params which are consistent across environments

// Feature Store params
feature_store_table_name='churn_features'
feature_store_table_primary_keys='customerID'
feature_store_table_description='These features are derived from the ibm_telco_churn.bronze_customers table in the lakehouse. We created dummy variables for the categorical columns, cleaned up their names, and added a boolean flag for whether the customer churned or not. No aggregations were performed.'

// Labels table params
labels_table_name='churn_labels'
labels_table_label_col='churn'

// Batch inference input table params
// For demo purposes we use the churn_labels table
inference_table_name='churn_labels'

// Batch inference predictions table params
predictions_table_name='churn_predictions'

// Reference table params - table to use for comparing staging vs production models
// For demo purposes we use the churn_labels table. However, in practice this reference table would be a curated dataset
// Note: this table must contain column(s) for lookup keys to join feature data from Feature Store
reference_table_name='churn_labels'
reference_table_label_col='churn'
--------------------------------------------------------------------------------
/telco_churn/model_train_pipeline.py:
--------------------------------------------------------------------------------
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier


class ModelTrainPipeline:

    @classmethod
    def create_train_pipeline(cls, model_params: dict) -> Pipeline:

        preprocessor = ColumnTransformer(
            transformers=[
                ('numeric_transformer',
                 SimpleImputer(strategy='median'),
                 make_column_selector(dtype_exclude='object')
                 ),
                ('categorical_transformer',
                 OneHotEncoder(handle_unknown='ignore'),
                 make_column_selector(dtype_include='object')
                 ),
            ],
            remainder='passthrough',
            sparse_threshold=0
        )

        rf_classifier = RandomForestClassifier(**model_params)

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', rf_classifier),
        ])

        return pipeline
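For illustration, a minimal sketch (not part of the repo) of fitting the pipeline returned by create_train_pipeline on a toy pandas DataFrame; the column names and data here are made up, and model_params mirrors conf/pipeline_configs/model_train.yml:

    import pandas as pd

    from telco_churn.model_train_pipeline import ModelTrainPipeline

    # Toy data: one numeric and one categorical feature (columns are illustrative)
    X = pd.DataFrame({'tenure': [1.0, 24.0, 60.0, 12.0],
                      'contract': ['Month-to-month', 'Two year', 'One year', 'Month-to-month']})
    y = [1, 0, 0, 1]

    pipeline = ModelTrainPipeline.create_train_pipeline(model_params={'n_estimators': 100,
                                                                      'max_depth': 4,
                                                                      'random_state': 42})
    # Numerics are median-imputed, categoricals one-hot encoded, then the random forest is fit
    pipeline.fit(X, y)
    print(pipeline.predict(X))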
--------------------------------------------------------------------------------
/.github/workflows/onpullrequest.yml:
--------------------------------------------------------------------------------
name: CI pipeline

on:
  pull_request:
    branches:
      - main
    tags-ignore:
      - 'v*' # this tag type is used for release pipelines

jobs:
  ci-pipeline:

    runs-on: ubuntu-latest
    strategy:
      max-parallel: 4

    env:
      DATABRICKS_HOST: ${{ secrets.DATABRICKS_STAGING_HOST }}
      DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_STAGING_TOKEN }}

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: 3.9.5

      - name: Install pip
        run: |
          python -m pip install --upgrade pip

      - name: Install dependencies and project in dev mode
        run: |
          pip install -r unit-requirements.txt
          pip install -e .

      - name: Run unit tests
        run: |
          echo "Launching unit tests"
          pytest tests/unit

      - name: Deploy integration test [staging environment]
        run: |
          dbx deploy --jobs=STAGING-telco-churn-sample-integration-test --environment=staging --files-only

      - name: Run integration test [staging environment]
        run: |
          dbx launch --job=STAGING-telco-churn-sample-integration-test --environment=staging --as-run-submit --trace
--------------------------------------------------------------------------------
/telco_churn/utils/notebook_utils.py:
--------------------------------------------------------------------------------
import os
import pathlib
import dotenv
import yaml
import pprint
from typing import Dict, Any


def load_and_set_env_vars(env: str) -> Dict[str, Any]:
    """
    Utility function to use in Databricks notebooks to load .env files and set them as environment variables via os
    Return a dict of set environment variables

    Parameters
    ----------
    env : str
        Name of deployment environment. One of 'dev', 'staging' or 'prod'

    Returns
    -------
    Dictionary of set environment variables
    """
    env_vars_path = os.path.join(os.pardir, 'conf', env, f'.{env}.env')
    dotenv.load_dotenv(env_vars_path)

    base_data_vars_path = os.path.join(os.pardir, 'conf', '.base_data_params.env')
    dotenv.load_dotenv(base_data_vars_path)

    os_dict = dict(os.environ)
    pprint.pprint(os_dict)

    return os_dict


def load_config(pipeline_name: str) -> Dict[str, Any]:
    """
    Utility function to use in Databricks notebooks to load the config yaml file for a given pipeline
    Return dict of specified config params

    Parameters
    ----------
    pipeline_name : str
        Name of pipeline

    Returns
    -------
    Dictionary of config params
    """
    config_path = os.path.join(os.pardir, 'conf', 'pipeline_configs', f'{pipeline_name}.yml')
    pipeline_config = yaml.safe_load(pathlib.Path(config_path).read_text())
    pprint.pprint(pipeline_config)

    return pipeline_config
--------------------------------------------------------------------------------
/telco_churn/utils/feature_store_utils.py:
--------------------------------------------------------------------------------
from typing import Union, List

import pyspark

import databricks
from databricks.feature_store import FeatureStoreClient


def create_and_write_feature_table(df: pyspark.sql.DataFrame,
                                   feature_table_name: str,
                                   primary_keys: Union[str, List[str]],
                                   description: str) -> databricks.feature_store.entities.feature_table.FeatureTable:
    """
    Create and return a feature table with the given name and primary keys, writing the provided Spark DataFrame
    to the feature table

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Data to create this feature table from
    feature_table_name : str
        A feature table name of the form <database_name>.<table_name>, for example dev.user_features.
    primary_keys : Union[str, List[str]]
        The feature table’s primary keys. If multiple columns are required, specify a list of column names, for
        example ['customer_id', 'region'].
    description : str
        Description of the feature table.

    Returns
    -------
    databricks.feature_store.entities.feature_table.FeatureTable
    """
    fs = FeatureStoreClient()

    feature_table = fs.create_table(
        name=feature_table_name,
        primary_keys=primary_keys,
        schema=df.schema,
        description=description
    )

    fs.write_table(df=df, name=feature_table_name, mode='overwrite')

    return feature_table
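A hedged usage sketch of create_and_write_feature_table (table, database and column names are illustrative; this assumes a Databricks runtime where the Feature Store client is available):

    from telco_churn.utils.get_spark import spark
    from telco_churn.utils.feature_store_utils import create_and_write_feature_table

    # Toy feature DataFrame keyed by customerID (illustrative data)
    features_df = spark.createDataFrame(
        [('0001-ABCD', 12.0, 1), ('0002-EFGH', 48.0, 0)],
        schema='customerID string, tenure double, seniorCitizen int'
    )

    feature_table = create_and_write_feature_table(df=features_df,
                                                   feature_table_name='e2e_mlops_dev.churn_features',
                                                   primary_keys='customerID',
                                                   description='Illustrative churn features table')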
--------------------------------------------------------------------------------
/tests/integration/sample_test.py:
--------------------------------------------------------------------------------
import unittest
from uuid import uuid4

from pyspark.dbutils import DBUtils  # noqa

from telco_churn.pipelines.sample_test_job import SampleJob


class SampleJobIntegrationTest(unittest.TestCase):
    def setUp(self):

        self.test_dir = 'dbfs:/tmp/tests/sample/%s' % str(uuid4())
        self.test_config = {'output_format': 'delta', 'output_path': self.test_dir}

        self.job = SampleJob(init_conf=self.test_config)
        self.dbutils = DBUtils(self.job.spark)
        self.spark = self.job.spark

    def test_sample(self):

        self.job.launch()

        output_count = (
            self.spark.read.format(self.test_config['output_format'])
            .load(self.test_config['output_path'])
            .count()
        )

        self.assertGreater(output_count, 0)

    def tearDown(self):
        self.dbutils.fs.rm(self.test_dir, True)


if __name__ == '__main__':
    # please don't change the logic of test result checks here
    # it's intentionally done in this way to comply with pipelines run result checks
    # for other tests, please simply replace the SampleJobIntegrationTest with your custom class name
    loader = unittest.TestLoader()
    tests = loader.loadTestsFromTestCase(SampleJobIntegrationTest)
    runner = unittest.TextTestRunner(verbosity=2)
    result = runner.run(tests)
    if not result.wasSuccessful():
        raise RuntimeError(
            'One or multiple tests failed. Please check job logs for additional information.'
        )
--------------------------------------------------------------------------------
/telco_churn/pipelines/model_inference_batch_job.py:
--------------------------------------------------------------------------------
from telco_churn.common import Workload
from telco_churn.model_inference import ModelInference
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()


class ModelInferenceJob(Workload):

    def _get_model_uri(self) -> str:
        model_name = self.env_vars['model_name']
        model_registry_stage = self.conf['mlflow_params']['model_registry_stage']
        model_uri = f'models:/{model_name}/{model_registry_stage}'

        return model_uri

    def _get_input_table_name(self) -> str:
        """
        Get the name of the input table to perform inference on
        """
        return self.conf['data_input']['table_name']

    def _get_predictions_output_table_name(self) -> str:
        """
        Get the full (database.table) predictions table name to pass to run_and_write_batch of ModelInference
        """
        predictions_table_database_name = self.env_vars['predictions_table_database_name']
        predictions_table_name = f'{predictions_table_database_name}.{self.env_vars["predictions_table_name"]}'

        return predictions_table_name

    def launch(self):
        _logger.info('Launching Batch ModelInferenceJob job')
        _logger.info(f'Running model-inference-batch in {self.env_vars["env"]} environment')
        ModelInference(model_uri=self._get_model_uri(),
                       input_table_name=self._get_input_table_name(),
                       output_table_name=self._get_predictions_output_table_name())\
            .run_and_write_batch(mode=self.conf['data_output']['mode'])
        _logger.info('Batch ModelInferenceJob job finished')


if __name__ == '__main__':
    job = ModelInferenceJob()
    job.launch()
--------------------------------------------------------------------------------
/telco_churn/utils/evaluation_utils.py:
--------------------------------------------------------------------------------
from typing import Dict

import pandas as pd
from sklearn.metrics import roc_auc_score


class ModelEvaluation:

    @staticmethod
    def _roc_auc_score(y_true: pd.Series, y_score: pd.Series):
        """
        Compute ROC AUC score using sklearn. Computed in the same way as MLflow utils
        https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
        By default, for roc_auc_score, we pick `average` to be `weighted` and `multi_class` to be `ovo`,
        to make the output more insensitive to dataset imbalance.

        Parameters
        ----------
        y_true : array-like of shape (n_samples,) or (n_samples, n_classes)
            True labels or binary label indicators
        y_score : array-like of shape (n_samples,) or (n_samples, n_classes)
            Target scores.

        Returns
        -------
        auc : float
        """
        return roc_auc_score(y_true=y_true,
                             y_score=y_score,
                             average='weighted',
                             multi_class='ovo')

    def evaluate(self, y_true: pd.Series, y_score: pd.Series, metric_prefix: str = '') -> Dict:
        """
        Compute evaluation metrics for the given true labels and target scores

        Parameters
        ----------
        y_true : array-like of shape (n_samples,) or (n_samples, n_classes)
            True labels or binary label indicators
        y_score : array-like of shape (n_samples,) or (n_samples, n_classes)
            Target scores.
        metric_prefix : str
            Prefix for each metric key in the returned dictionary

        Returns
        -------
        Dictionary of (metric name, computed value)
        """
        return {
            f'{metric_prefix}roc_auc_score': self._roc_auc_score(y_true, y_score),
        }
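For clarity, a small illustrative example of ModelEvaluation (labels and scores are made up):

    from telco_churn.utils.evaluation_utils import ModelEvaluation

    y_true = [0, 1, 1, 0, 1]
    y_score = [0.2, 0.8, 0.6, 0.4, 0.9]  # e.g. predicted probability of churn

    # Every positive example is scored above every negative one, so AUC is 1.0 here
    metrics = ModelEvaluation().evaluate(y_true, y_score, metric_prefix='staging_')
    print(metrics)  # {'staging_roc_auc_score': 1.0}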
--------------------------------------------------------------------------------
/.github/workflows/onrelease.yml:
--------------------------------------------------------------------------------
name: Release pipeline

on:
  push:
    tags:
      - 'v*' # Push events to matching v*, i.e. v1.0, v20.15.10

jobs:
  release-pipeline:

    runs-on: ubuntu-latest
    strategy:
      max-parallel: 4
      matrix:
        python-version: [ 3.9 ]

    env:
      DATABRICKS_HOST: ${{ secrets.DATABRICKS_PROD_HOST }}
      DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_PROD_TOKEN }}

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: 3.9.5

      - name: Install pip
        run: |
          python -m pip install --upgrade pip

      - name: Install dependencies
        run: |
          pip install -r unit-requirements.txt

      - name: Deploy PROD-telco-churn-model-train job [prod environment]
        run: |
          dbx deploy --deployment-file conf/deployment.yml --jobs=PROD-telco-churn-model-train --environment=prod

      - name: Deploy PROD-telco-churn-model-deployment job [prod environment]
        run: |
          dbx deploy --deployment-file conf/deployment.yml --jobs=PROD-telco-churn-model-deployment --environment=prod

      - name: Deploy PROD-telco-churn-model-inference-batch job [prod environment]
        run: |
          dbx deploy --deployment-file conf/deployment.yml --jobs=PROD-telco-churn-model-inference-batch --environment=prod

      - name: Create Release
        id: create_release
        uses: actions/create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
        with:
          tag_name: ${{ github.ref }}
          release_name: ${{ github.ref }}
          body: |
            Release for version ${{ github.ref }}.
          draft: false
          prerelease: false
--------------------------------------------------------------------------------
/telco_churn/pipelines/model_deployment_job.py:
--------------------------------------------------------------------------------
from telco_churn.common import Workload, MLflowTrackingConfig
from telco_churn.model_deployment import ModelDeployment, ModelDeploymentConfig
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()


class ModelDeploymentJob(Workload):

    def _get_mlflow_tracking_cfg(self):
        return MLflowTrackingConfig(experiment_path=self.env_vars['model_deploy_experiment_path'],
                                    run_name='staging_vs_prod_comparison',
                                    model_name=self.env_vars['model_name'])

    def _get_reference_data(self) -> str:
        reference_table_database_name = self.env_vars['reference_table_database_name']
        reference_table_name = self.env_vars['reference_table_name']
        return f'{reference_table_database_name}.{reference_table_name}'

    def _get_reference_data_label_col(self) -> str:
        return self.env_vars['reference_table_label_col']

    def _get_model_comparison_params(self) -> dict:
        return self.conf['model_comparison_params']

    def launch(self):
        _logger.info('Launching ModelDeploymentJob job')
        _logger.info(f'Running model-deployment pipeline in {self.env_vars["env"]} environment')
        cfg = ModelDeploymentConfig(mlflow_tracking_cfg=self._get_mlflow_tracking_cfg(),
                                    reference_data=self._get_reference_data(),
                                    label_col=self._get_reference_data_label_col(),
                                    comparison_metric=self._get_model_comparison_params()['metric'],
                                    higher_is_better=self._get_model_comparison_params()['higher_is_better'])
        ModelDeployment(cfg).run()
        _logger.info('ModelDeploymentJob job finished!')


if __name__ == '__main__':
    job = ModelDeploymentJob()
    job.launch()
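The promotion decision this job delegates to ModelDeployment.run() (and which the model_deployment notebook below describes) boils down to the comparison sketched here; this is an illustrative sketch of the described logic, not the actual implementation:

    def decide_promotion(staging_metric: float, prod_metric: float, higher_is_better: bool) -> str:
        """Return the action to take on the candidate Staging model (illustrative sketch)."""
        staging_wins = staging_metric > prod_metric if higher_is_better else staging_metric < prod_metric
        if staging_wins:
            return 'promote Staging model to Production, archive current Production model'
        return 'archive Staging model, keep current Production model'


    # e.g. with roc_auc_score and higher_is_better=True (per conf/pipeline_configs/model_deployment.yml)
    print(decide_promotion(staging_metric=0.91, prod_metric=0.88, higher_is_better=True))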
--------------------------------------------------------------------------------
/tests/unit/model_train_pipeline_test.py:
--------------------------------------------------------------------------------
import unittest
from dataclasses import dataclass

import numpy as np
import pandas as pd

from telco_churn.model_train_pipeline import ModelTrainPipeline


class ModelTrainPipelineTest(unittest.TestCase):

    def test_create_train_pipeline(self):
        @dataclass
        class Example:
            contract: str
            dependents: str
            deviceProtection: str
            gender: str
            internetService: str
            monthlyCharges: float
            multipleLines: str
            onlineBackup: str
            onlineSecurity: str
            paperlessBilling: str
            partner: str
            paymentMethod: str
            phoneService: str
            seniorCitizen: float
            streamingMovies: str
            streamingTV: str
            techSupport: str
            tenure: float
            totalCharges: float

        X = pd.DataFrame(data=[
            Example('Two year', 'Yes', 'No', 'Female', 'DSL', 53.65, 'No phone service', 'Yes', 'No', 'No', 'Yes',
                    'Credit card (automatic)', 'No', 0.0, 'Yes', 'Yes', 'Yes', 72.0, 3784.0),
            Example('Month-to-month', 'No', 'No', 'Male', 'Fiber optic', 74.9, 'Yes', 'No', 'No', 'Yes', 'No',
                    'Electronic check', 'Yes', 0.0, 'No', 'No', 'No', 1.0, 74.9),
            Example('Month-to-month', 'No', 'No', 'Female', 'Fiber optic', 100.4, 'Yes', 'No', 'No', 'Yes', 'Yes',
                    'Bank transfer (automatic)', 'Yes', 1.0, 'Yes', 'Yes', 'Yes', 58.0, 5749.8),
        ])
        y = np.random.randint(2, size=3)

        model_params = {'n_estimators': 4,
                        'max_depth': 4,
                        'min_samples_leaf': 1,
                        'max_features': 'auto',
                        'random_state': 42}

        pipeline = ModelTrainPipeline.create_train_pipeline(model_params=model_params)
        pipeline.fit(X, y)
        y_pred = pipeline.predict(X)

        # Predictions should be binary (0/1) labels
        assert np.array_equal(y_pred, y_pred.astype(bool))
--------------------------------------------------------------------------------
/telco_churn/pipelines/feature_table_creator_job.py:
--------------------------------------------------------------------------------
from telco_churn.common import Workload, FeatureStoreTableConfig, LabelsTableConfig
from telco_churn.feature_table_creator import FeatureTableCreator, FeatureTableCreatorConfig
from telco_churn.featurize import FeaturizerConfig
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()


class FeatureTableCreatorJob(Workload):

    def _get_input_table(self) -> str:
        return self.conf['input_table']

    def _get_data_prep_params(self) -> FeaturizerConfig:
        return FeaturizerConfig(**self.conf['data_prep_params'])

    def _get_feature_store_table_cfg(self) -> FeatureStoreTableConfig:
        return FeatureStoreTableConfig(database_name=self.env_vars['feature_store_database_name'],
                                       table_name=self.env_vars['feature_store_table_name'],
                                       primary_keys=self.env_vars['feature_store_table_primary_keys'],
                                       description=self.env_vars['feature_store_table_description'])

    def _get_labels_table_cfg(self) -> LabelsTableConfig:
        return LabelsTableConfig(database_name=self.env_vars['labels_table_database_name'],
                                 table_name=self.env_vars['labels_table_name'],
                                 label_col=self.env_vars['labels_table_label_col'],
                                 dbfs_path=self.env_vars['labels_table_dbfs_path'])

    def launch(self) -> None:
        """
        Launch FeatureTableCreator job
        """
        _logger.info('Launching FeatureTableCreator job')
        _logger.info(f'Running feature-table-creation pipeline in {self.env_vars["env"]} environment')
        cfg = FeatureTableCreatorConfig(input_table=self._get_input_table(),
                                        featurizer_cfg=self._get_data_prep_params(),
                                        feature_store_table_cfg=self._get_feature_store_table_cfg(),
                                        labels_table_cfg=self._get_labels_table_cfg())
        FeatureTableCreator(cfg).run()
        _logger.info('FeatureTableCreator job finished!')


if __name__ == '__main__':
    job = FeatureTableCreatorJob()
    job.launch()
--------------------------------------------------------------------------------
/notebooks/model_inference_batch.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %md
# MAGIC
# MAGIC # `model_inference_batch`
# MAGIC
# MAGIC Pipeline to execute model inference.
# MAGIC Apply the model at the specified URI for batch inference on the table with name `input_table_name`, writing results to the table with name `output_table_name`

# COMMAND ----------

# DBTITLE 1,pip install requirements.txt
# MAGIC %pip install -r ../requirements.txt

# COMMAND ----------

# DBTITLE 1,Set env
dbutils.widgets.dropdown('env', 'dev', ['dev', 'staging', 'prod'], 'Environment Name')

# COMMAND ----------

# DBTITLE 1,Module Imports
from telco_churn.utils.notebook_utils import load_and_set_env_vars, load_config
from telco_churn.model_inference import ModelInference
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()

# COMMAND ----------

# DBTITLE 1,Load pipeline config params
# Set pipeline name
pipeline_name = 'model_inference_batch'

# Load pipeline config yaml file (../conf/pipeline_configs/{pipeline_name}.yml)
pipeline_config = load_config(pipeline_name)

# Load and set arbitrary params via spark_env_vars
# Params passed via ../conf/{env}/.{env}.env and ../conf/.base_data_params.env
env_vars = load_and_set_env_vars(env=dbutils.widgets.get('env'))

# COMMAND ----------

# DBTITLE 1,Pipeline Config
# Fetch model_uri
model_name = env_vars['model_name']
model_registry_stage = pipeline_config['mlflow_params']['model_registry_stage']
model_uri = f'models:/{model_name}/{model_registry_stage}'
print(f'model_uri: {model_uri}')

# Set input table name
input_table_name = pipeline_config['data_input']['table_name']
print(f'input_table_name: {input_table_name}')

# Set output table name
predictions_table_database_name = env_vars['predictions_table_database_name']
predictions_table_name = f'{predictions_table_database_name}.{env_vars["predictions_table_name"]}'
print(f'predictions_table_name: {predictions_table_name}')

# COMMAND ----------

# DBTITLE 1,Execute Pipeline
# Instantiate model inference pipeline
model_inference_pipeline = ModelInference(model_uri=model_uri,
                                          input_table_name=input_table_name,
                                          output_table_name=predictions_table_name)

model_inference_pipeline.run_and_write_batch(mode=pipeline_config['data_output']['mode'])
--------------------------------------------------------------------------------
/telco_churn/pipelines/model_train_job.py:
--------------------------------------------------------------------------------
from telco_churn.common import Workload, MLflowTrackingConfig, FeatureStoreTableConfig, LabelsTableConfig
from telco_churn.model_train import ModelTrain, ModelTrainConfig
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()


class ModelTrainJob(Workload):

    def _get_mlflow_tracking_cfg(self):
        try:
            experiment_id = self.env_vars['model_train_experiment_id']
        except KeyError:
            experiment_id = None
        try:
            experiment_path = self.env_vars['model_train_experiment_path']
        except KeyError:
            experiment_path = None

        return MLflowTrackingConfig(run_name=self.conf['mlflow_params']['run_name'],
                                    experiment_id=experiment_id,
                                    experiment_path=experiment_path,
                                    model_name=self.env_vars['model_name'])

    def _get_feature_store_table_cfg(self):
        return FeatureStoreTableConfig(database_name=self.env_vars['feature_store_database_name'],
                                       table_name=self.env_vars['feature_store_table_name'],
                                       primary_keys=self.env_vars['feature_store_table_primary_keys'])

    def _get_labels_table_cfg(self):
        return LabelsTableConfig(database_name=self.env_vars['labels_table_database_name'],
                                 table_name=self.env_vars['labels_table_name'],
                                 label_col=self.env_vars['labels_table_label_col'])

    def _get_pipeline_params(self):
        return self.conf['pipeline_params']

    def _get_model_params(self):
        return self.conf['model_params']

    def launch(self):
        _logger.info('Launching ModelTrainJob job')
        _logger.info(f'Running model-train pipeline in {self.env_vars["env"]} environment')
        cfg = ModelTrainConfig(mlflow_tracking_cfg=self._get_mlflow_tracking_cfg(),
                               feature_store_table_cfg=self._get_feature_store_table_cfg(),
                               labels_table_cfg=self._get_labels_table_cfg(),
                               pipeline_params=self._get_pipeline_params(),
                               model_params=self._get_model_params(),
                               conf=self.conf,
                               env_vars=self.env_vars)
        ModelTrain(cfg).run()
        _logger.info('ModelTrainJob job finished!')


if __name__ == '__main__':
    job = ModelTrainJob()
    job.launch()
--------------------------------------------------------------------------------
/notebooks/feature_table_creator.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %md
# MAGIC
# MAGIC # `feature_table_creator`
# MAGIC
# MAGIC Pipeline to create a Feature Store feature table, along with a separate labels table

# COMMAND ----------

# DBTITLE 1,pip install requirements.txt
# MAGIC %pip install -r ../requirements.txt

# COMMAND ----------

# DBTITLE 1,Set env
dbutils.widgets.dropdown('env', 'dev', ['dev', 'staging', 'prod'], 'Environment Name')

# COMMAND ----------

# DBTITLE 1,Module Imports
from telco_churn.utils.notebook_utils import load_and_set_env_vars, load_config

from telco_churn.common import FeatureStoreTableConfig, LabelsTableConfig
from telco_churn.feature_table_creator import FeatureTableCreator, FeatureTableCreatorConfig
from telco_churn.featurize import FeaturizerConfig
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()

# COMMAND ----------

# DBTITLE 1,Load pipeline config params
# Set pipeline name
pipeline_name = 'feature_table_creator'

# Load pipeline config yaml file (../conf/pipeline_configs/{pipeline_name}.yml)
pipeline_config = load_config(pipeline_name)

# Load and set arbitrary params via spark_env_vars
# Params passed via ../conf/{env}/.{env}.env and ../conf/.base_data_params.env
env_vars = load_and_set_env_vars(env=dbutils.widgets.get('env'))

# COMMAND ----------

# DBTITLE 1,Pipeline Config
# Set FeaturizerConfig - data preparation config
featurizer_cfg = FeaturizerConfig(**pipeline_config['data_prep_params'])

# Set Feature Store feature table config
feature_store_table_cfg = FeatureStoreTableConfig(database_name=env_vars['feature_store_database_name'],
                                                  table_name=env_vars['feature_store_table_name'],
                                                  primary_keys=env_vars['feature_store_table_primary_keys'],
                                                  description=env_vars['feature_store_table_description'])

# Set Labels Table config
labels_table_cfg = LabelsTableConfig(database_name=env_vars['labels_table_database_name'],
                                     table_name=env_vars['labels_table_name'],
                                     label_col=env_vars['labels_table_label_col'],
                                     dbfs_path=env_vars['labels_table_dbfs_path'])

# Set FeatureTableCreatorConfig
cfg = FeatureTableCreatorConfig(input_table=pipeline_config['input_table'],
                                featurizer_cfg=featurizer_cfg,
                                feature_store_table_cfg=feature_store_table_cfg,
                                labels_table_cfg=labels_table_cfg)

# COMMAND ----------

# DBTITLE 1,Execute Pipeline
# Instantiate pipeline
feature_table_creator_pipeline = FeatureTableCreator(cfg)
feature_table_creator_pipeline.run()
--------------------------------------------------------------------------------
/notebooks/model_train.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %md
# MAGIC
# MAGIC # `model_train`
# MAGIC
# MAGIC Pipeline to execute model training. Params, metrics and model artifacts will be tracked to MLflow Tracking.
# MAGIC Optionally, the resulting model will be registered to the MLflow Model Registry if a model name is provided.

# COMMAND ----------

# DBTITLE 1,pip install requirements.txt
# MAGIC %pip install -r ../requirements.txt

# COMMAND ----------

# DBTITLE 1,Set env
dbutils.widgets.dropdown('env', 'dev', ['dev', 'staging', 'prod'], 'Environment Name')

# COMMAND ----------

# DBTITLE 1,Module Imports
from telco_churn.utils.notebook_utils import load_and_set_env_vars, load_config

from telco_churn.common import MLflowTrackingConfig, FeatureStoreTableConfig, LabelsTableConfig
from telco_churn.model_train import ModelTrain, ModelTrainConfig
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()

# COMMAND ----------

# DBTITLE 1,Load pipeline config params
# Set pipeline name
pipeline_name = 'model_train'

# Load pipeline config yaml file (../conf/pipeline_configs/{pipeline_name}.yml)
pipeline_config = load_config(pipeline_name)

# Load and set arbitrary params via spark_env_vars
# Params passed via ../conf/{env}/.{env}.env and ../conf/.base_data_params.env
env_vars = load_and_set_env_vars(env=dbutils.widgets.get('env'))

# COMMAND ----------

# DBTITLE 1,Pipeline Config
# Set MLflowTrackingConfig
mlflow_tracking_cfg = MLflowTrackingConfig(run_name=pipeline_config['mlflow_params']['run_name'],
                                           experiment_path=env_vars['model_train_experiment_path'],
                                           model_name=env_vars['model_name'])


# Set FeatureStoreTableConfig
feature_store_table_cfg = FeatureStoreTableConfig(database_name=env_vars['feature_store_database_name'],
                                                  table_name=env_vars['feature_store_table_name'],
                                                  primary_keys=env_vars['feature_store_table_primary_keys'])

# Set LabelsTableConfig
labels_table_cfg = LabelsTableConfig(database_name=env_vars['labels_table_database_name'],
                                     table_name=env_vars['labels_table_name'],
                                     label_col=env_vars['labels_table_label_col'])

# Set pipeline_params
pipeline_params = pipeline_config['pipeline_params']

# Set model_params
model_params = pipeline_config['model_params']

# Define ModelTrainConfig
cfg = ModelTrainConfig(mlflow_tracking_cfg=mlflow_tracking_cfg,
                       feature_store_table_cfg=feature_store_table_cfg,
                       labels_table_cfg=labels_table_cfg,
                       pipeline_params=pipeline_params,
                       model_params=model_params,
                       conf=pipeline_config,  # Track pipeline_config to mlflow
                       env_vars=env_vars  # Track env_vars to mlflow
                       )

# COMMAND ----------

# DBTITLE 1,Execute Pipeline
# Instantiate pipeline
model_train_pipeline = ModelTrain(cfg)
model_train_pipeline.run()
--------------------------------------------------------------------------------
/notebooks/model_deployment.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %md
# MAGIC
# MAGIC # `model_deployment`
# MAGIC
# MAGIC Pipeline to execute model deployment. This pipeline orchestrates the comparison of the current Production model versus the current Staging model.
# MAGIC The Production model will be the most recent model version registered in the MLflow Model Registry under the provided model_name, for stage="Production". Likewise for Staging.
# MAGIC Execution will involve loading the models and performing batch inference for a specified reference dataset.
# MAGIC The two models will be compared using the specified comparison_metric.
# MAGIC `higher_is_better` indicates whether a higher value for the evaluation metric equates to a better performing model.
# MAGIC Dependent on this comparison, the candidate Staging model will either be promoted to Production (and the current
# MAGIC Production model archived) if it performs better, or be archived if it does not perform better than the current Production model.
# MAGIC
# MAGIC Metrics computed when comparing the two models will be logged to MLflow, under the provided experiment_id or experiment_path.
# MAGIC
# MAGIC **Pipeline Steps**:
# MAGIC 1. Set MLflow Tracking experiment. Used to track metrics computed when comparing Staging versus Production models.
# MAGIC 1. Load Staging and Production models and score against the reference dataset provided. The reference data specified must currently be a table.
# MAGIC 1. Compute evaluation metric for both Staging and Production model predictions against reference data
# MAGIC 1. If higher_is_better=True, the Staging model will be promoted in place of the Production model iff the Staging model evaluation metric is higher than the Production model evaluation metric. If higher_is_better=False, the Staging model will be promoted in place of the Production model iff the Staging model evaluation metric is lower than the Production model evaluation metric.

# COMMAND ----------

# DBTITLE 1,pip install requirements.txt
# MAGIC %pip install -r ../requirements.txt

# COMMAND ----------

# DBTITLE 1,Set env
dbutils.widgets.dropdown('env', 'dev', ['dev', 'staging', 'prod'], 'Environment Name')

# COMMAND ----------

# DBTITLE 1,Module Imports
from telco_churn.utils.notebook_utils import load_and_set_env_vars, load_config

from telco_churn.common import MLflowTrackingConfig
from telco_churn.model_deployment import ModelDeployment, ModelDeploymentConfig
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()

# COMMAND ----------

# DBTITLE 1,Load pipeline config params
# Set pipeline name
pipeline_name = 'model_deployment'

# Load pipeline config yaml file (../conf/pipeline_configs/{pipeline_name}.yml)
pipeline_config = load_config(pipeline_name)

# Load and set arbitrary params via spark_env_vars
# Params passed via ../conf/{env}/.{env}.env and ../conf/.base_data_params.env
env_vars = load_and_set_env_vars(env=dbutils.widgets.get('env'))

# COMMAND ----------

# DBTITLE 1,Pipeline Config
# Set MLflowTrackingConfig - comparison metrics logged to MLflow
mlflow_tracking_cfg = MLflowTrackingConfig(experiment_path=env_vars['model_deploy_experiment_path'],
                                           run_name='staging_vs_prod_comparison',
                                           model_name=env_vars['model_name'])

# Define reference dataset
reference_table_database_name = env_vars['reference_table_database_name']
reference_table_name = f'{reference_table_database_name}.{env_vars["reference_table_name"]}'

# Set label col from reference dataset
label_col = env_vars['reference_table_label_col']

# Params defining how to compare staging vs prod models
model_comparison_params = pipeline_config['model_comparison_params']

# Define ModelDeploymentConfig
cfg = ModelDeploymentConfig(mlflow_tracking_cfg=mlflow_tracking_cfg,
                            reference_data=reference_table_name,
                            label_col=label_col,
                            comparison_metric=model_comparison_params['metric'],
                            higher_is_better=model_comparison_params['higher_is_better'])

# COMMAND ----------

# DBTITLE 1,Execute Pipeline
# Instantiate pipeline
model_deployment_pipeline = ModelDeployment(cfg)
model_deployment_pipeline.run()
--------------------------------------------------------------------------------
/tests/unit/conftest.py:
--------------------------------------------------------------------------------
"""
This conftest.py contains handy components that prepare SparkSession and other relevant objects.
"""

import os
from pathlib import Path
import shutil
import tempfile
from typing import Iterator
from unittest.mock import patch

import mlflow
import pytest
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
import logging
from dataclasses import dataclass


@dataclass
class FileInfoFixture:
    """
    This class mocks the DBUtils FileInfo object
    """
    path: str
    name: str
    size: int
    modificationTime: int


class DBUtilsFixture:
    """
    This class is used for mocking the behaviour of DBUtils inside tests.
34 | """ 35 | 36 | def __init__(self): 37 | self.fs = self 38 | 39 | def cp(self, src: str, dest: str, recurse: bool = False): 40 | copy_func = shutil.copytree if recurse else shutil.copy 41 | copy_func(src, dest) 42 | 43 | def ls(self, path: str): 44 | _paths = Path(path).glob("*") 45 | _objects = [ 46 | FileInfoFixture( 47 | str(p.absolute()), p.name, p.stat().st_size, int(p.stat().st_mtime) 48 | ) 49 | for p in _paths 50 | ] 51 | return _objects 52 | 53 | def mkdirs(self, path: str): 54 | Path(path).mkdir(parents=True, exist_ok=True) 55 | 56 | def mv(self, src: str, dest: str, recurse: bool = False): 57 | copy_func = shutil.copytree if recurse else shutil.copy 58 | shutil.move(src, dest, copy_function=copy_func) 59 | 60 | def put(self, path: str, content: str, overwrite: bool = False): 61 | _f = Path(path) 62 | 63 | if _f.exists() and not overwrite: 64 | raise FileExistsError("File already exists") 65 | 66 | _f.write_text(content, encoding="utf-8") 67 | 68 | def rm(self, path: str, recurse: bool = False): 69 | deletion_func = shutil.rmtree if recurse else os.remove 70 | deletion_func(path) 71 | 72 | 73 | @pytest.fixture(scope="session") 74 | def spark() -> SparkSession: 75 | """ 76 | This fixture provides preconfigured SparkSession with Hive and Delta support. 77 | After the test session, temporary warehouse directory is deleted. 78 | :return: SparkSession 79 | """ 80 | logging.info("Configuring Spark session for testing environment") 81 | warehouse_dir = tempfile.TemporaryDirectory().name 82 | _builder = ( 83 | SparkSession.builder.master("local[1]") 84 | .config("spark.hive.metastore.warehouse.dir", Path(warehouse_dir).as_uri()) 85 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 86 | .config( 87 | "spark.sql.catalog.spark_catalog", 88 | "org.apache.spark.sql.delta.catalog.DeltaCatalog", 89 | ) 90 | ) 91 | spark: SparkSession = configure_spark_with_delta_pip(_builder).getOrCreate() 92 | logging.info("Spark session configured") 93 | yield spark 94 | logging.info("Shutting down Spark session") 95 | spark.stop() 96 | if Path(warehouse_dir).exists(): 97 | shutil.rmtree(warehouse_dir) 98 | 99 | 100 | @pytest.fixture(scope="session", autouse=True) 101 | def mlflow_local(): 102 | """ 103 | This fixture provides local instance of mlflow with support for tracking and registry functions. 104 | After the test session: 105 | * temporary storage for tracking and registry is deleted. 106 | * Active run will be automatically stopped to avoid verbose errors. 107 | :return: None 108 | """ 109 | logging.info("Configuring local MLflow instance") 110 | tracking_uri = tempfile.TemporaryDirectory().name 111 | registry_uri = f"sqlite:///{tempfile.TemporaryDirectory().name}" 112 | 113 | mlflow.set_tracking_uri(Path(tracking_uri).as_uri()) 114 | mlflow.set_registry_uri(registry_uri) 115 | logging.info("MLflow instance configured") 116 | yield None 117 | 118 | mlflow.end_run() 119 | 120 | if Path(tracking_uri).exists(): 121 | shutil.rmtree(tracking_uri) 122 | 123 | if Path(registry_uri).exists(): 124 | Path(registry_uri).unlink() 125 | logging.info("Test session finished, unrolling the MLflow instance") 126 | 127 | 128 | @pytest.fixture(scope="session", autouse=True) 129 | def dbutils_fixture() -> Iterator[None]: 130 | """ 131 | This fixture patches the `get_dbutils` function. 132 | Please note that patch is applied on a string name of the function. 133 | If you change the name or location of it, patching won't work. 
--------------------------------------------------------------------------------
/telco_churn/model_inference.py:
--------------------------------------------------------------------------------
import pyspark.sql.dataframe
from databricks.feature_store import FeatureStoreClient

from telco_churn.utils.get_spark import spark
from telco_churn.utils.logger_utils import get_logger

_logger = get_logger()


class ModelInference:
    """
    Class to execute model inference.
    Apply the model at the specified URI for batch inference on the table with name input_table_name,
    writing results to the table with name output_table_name
    """
    def __init__(self, model_uri: str, input_table_name: str, output_table_name: str = None):
        """

        Parameters
        ----------
        model_uri : str
            MLflow model uri. The model must have been logged using the Feature Store API
        input_table_name : str
            Table name to load as a Spark DataFrame to score the model on. Must contain column(s)
            for lookup keys to join feature data from Feature Store
        output_table_name : str
            Output table name to write results to
        """
        self.model_uri = model_uri
        self.input_table_name = input_table_name
        self.output_table_name = output_table_name

    def _load_input_table(self) -> pyspark.sql.DataFrame:
        """
        Load Spark DataFrame containing lookup keys to join feature data from Feature Store

        Returns
        -------
        pyspark.sql.DataFrame
        """
        input_table_name = self.input_table_name
        _logger.info(f"Loading lookup keys from input table: {input_table_name}")
        return spark.table(input_table_name)

    def fs_score_batch(self, df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
        """
        Load and apply model from MLflow Model Registry using the Feature Store API. Features will be automatically
        retrieved from the Feature Store. This method requires that the registered model must have been logged
        with FeatureStoreClient.log_model(), which packages the model with feature metadata. Unless present in df,
        these features will be looked up from Feature Store and joined with df prior to scoring the model.

        Parameters
        ----------
        df : pyspark.sql.DataFrame
            The DataFrame to score the model on. Feature Store features will be joined with df prior to scoring the
            model. df must:

                1. Contain columns for lookup keys required to join feature data from Feature Store, as specified in
                   the feature_spec.yaml artifact.
                2. Contain columns for all source keys required to score the model, as specified in the
                   feature_spec.yaml artifact.
                3. Not contain a column prediction, which is reserved for the modelʼs predictions. df may contain
                   additional columns.

        Returns
        -------
        pyspark.sql.DataFrame:
            A Spark DataFrame containing:
                1. All columns of df.
                2. All feature values retrieved from Feature Store.
                3. A column prediction containing the output of the model.
        """
        fs = FeatureStoreClient()
        _logger.info(f"Loading model from Model Registry: {self.model_uri}")

        return fs.score_batch(self.model_uri, df)

    def run_batch(self) -> pyspark.sql.DataFrame:
        """
        Load inference lookup keys, feature data from Feature Store, and score using the loaded model from the MLflow
        Model Registry

        Returns
        -------
        pyspark.sql.DataFrame:
            A Spark DataFrame containing:
                1. All columns of the inference df.
                2. All feature values retrieved from Feature Store.
                3. A column prediction containing the output of the model.
        """
        input_df = self._load_input_table()
        pred_df = self.fs_score_batch(input_df)

        return pred_df

    def run_and_write_batch(self, mode: str = 'overwrite') -> None:
        """
        Run batch inference, save as Delta table to `self.output_table_name`

        Parameters
        ----------
        mode : str
            Specify behavior when predictions data already exists.
            Options include:
                * "append": Append contents of this :class:`DataFrame` to existing data.
                * "overwrite": Overwrite existing data.

        Returns
        -------
        None
        """
        _logger.info("==========Running batch model inference==========")
        pred_df = self.run_batch()

        _logger.info("==========Writing predictions==========")
        _logger.info(f"mode={mode}")
        _logger.info(f"Predictions written to {self.output_table_name}")
        # Model predictions are written to the Delta table provided as input.
        # Delta is the default format in Databricks Runtime 8.0 and above.
        pred_df.write.format("delta").mode(mode).saveAsTable(self.output_table_name)

        _logger.info("==========Batch model inference completed==========")
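A minimal usage sketch of ModelInference (mirroring the model_inference_batch notebook above); the model and table names are illustrative, and this assumes a model logged via the Feature Store API is registered under the given name:

    from telco_churn.model_inference import ModelInference

    model_inference_pipeline = ModelInference(model_uri='models:/e2e_mlops_telco_churn_dev/production',
                                              input_table_name='e2e_mlops_dev.churn_labels',
                                              output_table_name='e2e_mlops_dev.churn_predictions')

    # Looks up features from the Feature Store, scores the model, and writes a Delta predictions table
    model_inference_pipeline.run_and_write_batch(mode='overwrite')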
72 | """ 73 | fs = FeatureStoreClient() 74 | _logger.info(f"Loading model from Model Registry: {self.model_uri}") 75 | 76 | return fs.score_batch(self.model_uri, df) 77 | 78 | def run_batch(self) -> pyspark.sql.DataFrame: 79 | """ 80 | Load inference lookup keys, feature data from Feature Store, and score using the loaded model from MLflow 81 | model registry 82 | 83 | Returns 84 | ------- 85 | pyspark.sql.DataFrame: 86 | A Spark DataFrame containing: 87 | 1. All columns of inference df. 88 | 2. All feature values retrieved from Feature Store. 89 | 3. A column prediction containing the output of the model. 90 | """ 91 | input_df = self._load_input_table() 92 | pred_df = self.fs_score_batch(input_df) 93 | 94 | return pred_df 95 | 96 | def run_and_write_batch(self, mode: str = 'overwrite') -> None: 97 | """ 98 | Run batch inference, save as Delta table to `self.output_table_name` 99 | 100 | Parameters 101 | ---------- 102 | mode : str 103 | Specify behavior when predictions data already exists. 104 | Options include: 105 | * "append": Append contents of this :class:`DataFrame` to existing data. 106 | * "overwrite": Overwrite existing data. 107 | 108 | Returns 109 | ------- 110 | 111 | """ 112 | _logger.info("==========Running batch model inference==========") 113 | pred_df = self.run_batch() 114 | 115 | _logger.info("==========Writing predictions==========") 116 | _logger.info(f"mode={mode}") 117 | _logger.info(f"Predictions written to {self.output_table_name}") 118 | # Model predictions are written to the Delta table provided as input. 119 | # Delta is the default format in Databricks Runtime 8.0 and above. 120 | pred_df.write.format("delta").mode(mode).saveAsTable(self.output_table_name) 121 | 122 | _logger.info("==========Batch model inference completed==========") 123 | -------------------------------------------------------------------------------- /telco_churn/featurize.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import pyspark 4 | import pyspark.pandas as ps 5 | 6 | from telco_churn.utils.logger_utils import get_logger 7 | 8 | _logger = get_logger() 9 | 10 | 11 | @dataclass 12 | class FeaturizerConfig: 13 | """ 14 | Attributes: 15 | label_col (str): Name of original label column in input data 16 | ohe (bool): Flag to indicate whether or not to one hot encode categorical columns 17 | cat_cols (list): List of categorical columns. Only required if ohe=True 18 | drop_missing (bool): Flag to indicate whether or not to drop missing values 19 | """ 20 | label_col: str = 'churnString' 21 | ohe: bool = False 22 | cat_cols: list = None 23 | drop_missing: bool = True 24 | 25 | 26 | class Featurizer: 27 | """ 28 | Class containing featurization logic to apply to input Spark DataFrame 29 | """ 30 | def __init__(self, cfg: FeaturizerConfig): 31 | self.cfg = cfg 32 | 33 | @staticmethod 34 | def pyspark_pandas_ohe(psdf: ps.DataFrame, cat_cols: list) -> pyspark.pandas.DataFrame: 35 | """ 36 | Take a pyspark.pandas DataFrame and convert a list of categorical variables (columns) into dummy/indicator 37 | variables, also known as one hot encoding. 
38 | 39 | Parameters 40 | ---------- 41 | psdf : pyspark.pandas.DataFrame 42 | pyspark.pandas DataFrame to OHE 43 | cat_cols : list 44 | List of categorical features 45 | Returns 46 | ------- 47 | pyspark.pandas.DataFrame 48 | """ 49 | return ps.get_dummies(psdf, columns=cat_cols, dtype='int64') 50 | 51 | def process_label(self, psdf: pyspark.pandas.DataFrame, rename_to: str = 'churn') -> pyspark.pandas.DataFrame: 52 | """ 53 | Convert label to int and rename label column 54 | TODO: add test 55 | 56 | Parameters 57 | ---------- 58 | psdf : pyspark.pandas.DataFrame 59 | pyspark.pandas DataFrame 60 | rename_to : str 61 | New name for the label column 62 | Returns 63 | ------- 64 | pyspark.pandas.DataFrame 65 | """ 66 | psdf[self.cfg.label_col] = psdf[self.cfg.label_col].map({'Yes': 1, 'No': 0}) 67 | psdf = psdf.astype({self.cfg.label_col: 'int32'}) 68 | psdf = psdf.rename(columns={self.cfg.label_col: rename_to}) 69 | 70 | return psdf 71 | 72 | @staticmethod 73 | def process_col_names(psdf: pyspark.pandas.DataFrame) -> pyspark.pandas.DataFrame: 74 | """ 75 | Clean up existing column names: remove spaces, replace '(' with '_' and strip ')' 76 | TODO: add test 77 | 78 | Parameters 79 | ---------- 80 | psdf : pyspark.pandas.DataFrame 81 | pyspark.pandas DataFrame 82 | Returns 83 | ------- 84 | pyspark.pandas.DataFrame 85 | """ 86 | cols = psdf.columns.to_list() 87 | new_col_names = [col.replace(' ', '').replace('(', '_').replace(')', '') for col in cols] 88 | 89 | # Rename columns to the cleaned-up names 90 | psdf.columns = new_col_names 91 | 92 | return psdf 93 | 94 | @staticmethod 95 | def drop_missing_values(psdf: pyspark.pandas.DataFrame) -> pyspark.pandas.DataFrame: 96 | """ 97 | Remove missing values 98 | 99 | Parameters 100 | ---------- 101 | psdf : pyspark.pandas.DataFrame 102 | Returns 103 | ------- 104 | pyspark.pandas.DataFrame 105 | """ 106 | return psdf.dropna() 107 | 108 | def run(self, df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame: 109 | """ 110 | Run all data preprocessing steps. Consists of the following: 111 | 112 | 1. Convert PySpark DataFrame to pandas_on_spark DataFrame 113 | 2. Process the label column - converting to int and renaming col to 'churn' 114 | 3. Apply OHE if specified in the config 115 | 4. Drop any missing values if specified in the config 116 | 5.
Return resulting preprocessed dataset as a PySpark DataFrame 117 | 118 | Parameters 119 | ---------- 120 | df : pyspark.sql.DataFrame 121 | Input PySpark DataFrame to preprocess 122 | 123 | Returns 124 | ------- 125 | pyspark.sql.DataFrame 126 | Preprocessed dataset of features and label column 127 | """ 128 | _logger.info('Running Data Preprocessing steps...') 129 | 130 | # Convert Spark DataFrame to pandas on Spark DataFrame 131 | psdf = df.pandas_api() 132 | 133 | # Convert label to int and rename column 134 | _logger.info(f'Processing label: {self.cfg.label_col}') 135 | psdf = self.process_label(psdf, rename_to='churn') 136 | 137 | # OHE 138 | if self.cfg.ohe: 139 | _logger.info('Applying one-hot-encoding') 140 | if self.cfg.cat_cols is None: 141 | raise RuntimeError('cat_cols must be provided if ohe=True') 142 | psdf = self.pyspark_pandas_ohe(psdf, self.cfg.cat_cols) 143 | 144 | # Clean up column names resulting from OHE 145 | _logger.info(f'Renaming columns') 146 | psdf = self.process_col_names(psdf) 147 | 148 | # Drop missing values 149 | if self.cfg.drop_missing: 150 | _logger.info(f'Dropping missing values') 151 | psdf = self.drop_missing_values(psdf) 152 | 153 | # Return as Spark DataFrame 154 | preproc_df = psdf.to_spark() 155 | 156 | return preproc_df 157 | -------------------------------------------------------------------------------- /telco_churn/feature_table_creator.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import pyspark.sql.dataframe 4 | 5 | from telco_churn import featurize 6 | from telco_churn.common import FeatureStoreTableConfig, LabelsTableConfig 7 | from telco_churn.featurize import FeaturizerConfig 8 | from telco_churn.utils import feature_store_utils 9 | from telco_churn.utils.get_spark import spark 10 | from telco_churn.utils.logger_utils import get_logger 11 | 12 | _logger = get_logger() 13 | 14 | 15 | @dataclass 16 | class FeatureTableCreatorConfig: 17 | """ 18 | Attributes: 19 | input_table (str): 20 | Name of the table to use as input for creating features 21 | featurizer_cfg (FeaturizerConfig): 22 | Featurization config to specify label_col, ohe, cat_cols and drop_missing params 23 | feature_store_table_cfg (FeatureStoreTableConfig): 24 | Feature Store table config to specify database_name, table_name, primary_keys and description 25 | labels_table_cfg (LabelsTableConfig): 26 | Labels table config to specify database_name, table_name, label_col and dbfs_path 27 | """ 28 | input_table: str 29 | featurizer_cfg: FeaturizerConfig 30 | feature_store_table_cfg: FeatureStoreTableConfig 31 | labels_table_cfg: LabelsTableConfig 32 | 33 | 34 | class FeatureTableCreator: 35 | """ 36 | Class to execute a pipeline to create a Feature Store table, and separate labels table 37 | """ 38 | def __init__(self, cfg: FeatureTableCreatorConfig): 39 | self.cfg = cfg 40 | 41 | @staticmethod 42 | def setup(database_name: str, table_name: str) -> None: 43 | """ 44 | Set up database to use. Create the database {database_name} if it doesn't exist, and drop the table {table_name} 45 | if it exists 46 | 47 | Parameters 48 | ---------- 49 | database_name : str 50 | Database to create if it doesn't exist. 
Otherwise, use the database of that name 51 | table_name : str 52 | Drop table if it already exists 53 | """ 54 | _logger.info(f'Creating database {database_name} if not exists') 55 | spark.sql(f'CREATE DATABASE IF NOT EXISTS {database_name};') 56 | spark.sql(f'USE {database_name};') 57 | spark.sql(f'DROP TABLE IF EXISTS {table_name};') 58 | 59 | def run_data_ingest(self) -> pyspark.sql.DataFrame: 60 | """ 61 | Run data ingest step 62 | 63 | Returns 64 | ------- 65 | pyspark.sql.DataFrame 66 | Input Spark DataFrame 67 | """ 68 | return spark.table(self.cfg.input_table) 69 | 70 | def run_data_prep(self, input_df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame: 71 | """ 72 | Run data preparation step, using Featurizer to run featurization logic to create features from the input 73 | DataFrame. 74 | 75 | Parameters 76 | ---------- 77 | input_df : pyspark.sql.DataFrame 78 | Input Spark DataFrame 79 | 80 | Returns 81 | ------- 82 | pyspark.sql.DataFrame 83 | Processed Spark DataFrame containing features 84 | """ 85 | featurizer = featurize.Featurizer(self.cfg.featurizer_cfg) 86 | proc_df = featurizer.run(input_df) 87 | 88 | return proc_df 89 | 90 | def run_feature_table_create(self, df: pyspark.sql.DataFrame) -> None: 91 | """ 92 | Method to create feature table in Databricks Feature Store. When run, this method will create the 93 | feature table from scratch. As such, we first create (if it doesn't exist) the database specified, and drop the table if it 94 | already exists. 95 | 96 | The feature table is created from the Spark DataFrame provided, dropping the label column if it exists in the 97 | DataFrame. The label column cannot be present in the feature table when later constructing a feature store 98 | training set from the feature table. The feature table will be created using the primary keys and description 99 | provided via feature_store_table_cfg. 100 | 101 | Parameters 102 | ---------- 103 | df : pyspark.sql.DataFrame 104 | Spark DataFrame from which to create the feature table. 105 | """ 106 | feature_store_table_cfg = self.cfg.feature_store_table_cfg 107 | 108 | # Create database if not exists, drop table if it already exists 109 | self.setup(database_name=feature_store_table_cfg.database_name, 110 | table_name=feature_store_table_cfg.table_name) 111 | 112 | # Store only features for each customerID, storing customerID, churn in separate churn_labels table 113 | # During model training, the churn_labels table is used to join features onto the labels 114 | features_df = df.drop(self.cfg.labels_table_cfg.label_col) 115 | feature_table_name = f'{feature_store_table_cfg.database_name}.{feature_store_table_cfg.table_name}' 116 | _logger.info(f'Creating and writing features to feature table: {feature_table_name}') 117 | feature_store_utils.create_and_write_feature_table(features_df, 118 | feature_table_name, 119 | primary_keys=feature_store_table_cfg.primary_keys, 120 | description=feature_store_table_cfg.description) 121 | 122 | def run_labels_table_create(self, df: pyspark.sql.DataFrame) -> None: 123 | """ 124 | Method to create labels table. This table will consist of the primary key column(s) and the label column 125 | 126 | Create the table using params specified in labels_table_cfg. The DataFrame is written as a Delta table at dbfs_path, and a 127 | table is then created against this Delta location.
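For example (illustrative), with primary key 'customerID' and label column 'churn', the resulting labels table consists of exactly those two columns.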
128 | 129 | Parameters 130 | ---------- 131 | df : pyspark.sql.DataFrame 132 | Spark DataFrame containing the primary key column(s) and label column 133 | """ 134 | feature_store_table_cfg = self.cfg.feature_store_table_cfg 135 | labels_table_cfg = self.cfg.labels_table_cfg 136 | 137 | if isinstance(feature_store_table_cfg.primary_keys, str): 138 | labels_table_cols = [feature_store_table_cfg.primary_keys, 139 | labels_table_cfg.label_col] 140 | elif isinstance(feature_store_table_cfg.primary_keys, list): 141 | labels_table_cols = feature_store_table_cfg.primary_keys + \ 142 | [labels_table_cfg.label_col] 143 | else: 144 | raise RuntimeError('Feature Store table primary keys must be of either str or list type') 145 | 146 | labels_database_name = labels_table_cfg.database_name 147 | labels_table_name = labels_table_cfg.table_name 148 | labels_dbfs_path = labels_table_cfg.dbfs_path 149 | # Create database if not exists, drop table if it already exists 150 | self.setup(database_name=labels_database_name, table_name=labels_table_name) 151 | # DataFrame of customerID/churn labels 152 | labels_df = df.select(labels_table_cols) 153 | _logger.info(f'Writing labels to DBFS: {labels_dbfs_path}') 154 | labels_df.write.format('delta').mode('overwrite').save(labels_dbfs_path) 155 | spark.sql(f"""CREATE TABLE {labels_database_name}.{labels_table_name} 156 | USING DELTA LOCATION '{labels_dbfs_path}';""") 157 | _logger.info(f'Created labels table: {labels_database_name}.{labels_table_name}') 158 | 159 | def run(self) -> None: 160 | """ 161 | Run feature table creation pipeline 162 | """ 163 | _logger.info('==========Data Ingest==========') 164 | input_df = self.run_data_ingest() 165 | 166 | _logger.info('==========Data Prep==========') 167 | proc_df = self.run_data_prep(input_df) 168 | 169 | _logger.info('==========Create Feature Table==========') 170 | self.run_feature_table_create(proc_df) 171 | 172 | _logger.info('==========Create Labels Table==========') 173 | self.run_labels_table_create(proc_df) 174 | -------------------------------------------------------------------------------- /conf/deployment.yml: -------------------------------------------------------------------------------- 1 | custom: 2 | 3 | # Cluster configs for each environment 4 | default-cluster-spec: &default-cluster-spec 5 | spark_version: '11.0.x-cpu-ml-scala2.12' 6 | node_type_id: 'i3.xlarge' # NOTE: this is an AWS-specific instance type. Change accordingly if running on Azure or GCP. 7 | driver_node_type_id: 'i3.xlarge' # NOTE: this is an AWS-specific instance type. Change accordingly if running on Azure or GCP. 8 | num_workers: 1 9 | # To reduce start-up time for each job, it is advisable to use a cluster pool. To do so, supply the following 10 | # two fields with a pool_id from which to acquire both the driver and worker instances. 11 | # If driver_instance_pool_id and instance_pool_id are set, both node_type_id and driver_node_type_id CANNOT be supplied.
12 | # As such, if providing a pool_id for driver and worker instances, please ensure that node_type_id and driver_node_type_id are not present 13 | # driver_instance_pool_id: '0617-151415-bells2-pool-hh7h6tjm' 14 | # instance_pool_id: '0617-151415-bells2-pool-hh7h6tjm' 15 | 16 | dev-cluster-config: &dev-cluster-config 17 | new_cluster: 18 | <<: *default-cluster-spec 19 | 20 | staging-cluster-config: &staging-cluster-config 21 | new_cluster: 22 | <<: *default-cluster-spec 23 | 24 | prod-cluster-config: &prod-cluster-config 25 | new_cluster: 26 | <<: *default-cluster-spec 27 | 28 | # Databricks Jobs definitions 29 | # please note that we're using FUSE reference for config, and env files, hence we're going to load this file using its local FS path 30 | environments: 31 | 32 | dev: 33 | strict_path_adjustment_policy: true 34 | jobs: 35 | - name: 'DEV-telco-churn-demo-setup' 36 | <<: *dev-cluster-config 37 | spark_python_task: 38 | python_file: 'file://telco_churn/pipelines/demo_setup_job.py' 39 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 40 | '--env', 'file:fuse://conf/dev/.dev.env', 41 | '--conf-file', 'file:fuse://conf/pipeline_configs/demo_setup.yml'] 42 | - name: 'DEV-telco-churn-feature-table-creation' 43 | <<: *dev-cluster-config 44 | spark_python_task: 45 | python_file: 'file://telco_churn/pipelines/feature_table_creator_job.py' 46 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 47 | '--env', 'file:fuse://conf/dev/.dev.env', 48 | '--conf-file', 'file:fuse://conf/pipeline_configs/feature_table_creator.yml'] 49 | - name: 'DEV-telco-churn-model-train' 50 | <<: 51 | - *dev-cluster-config 52 | spark_python_task: 53 | python_file: 'file://telco_churn/pipelines/model_train_job.py' 54 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 55 | '--env', 'file:fuse://conf/dev/.dev.env', 56 | '--conf-file', 'file:fuse://conf/pipeline_configs/model_train.yml'] 57 | - name: 'DEV-telco-churn-model-deployment' 58 | <<: 59 | - *dev-cluster-config 60 | spark_python_task: 61 | python_file: 'file://telco_churn/pipelines/model_deployment_job.py' 62 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 63 | '--env', 'file:fuse://conf/dev/.dev.env', 64 | '--conf-file', 'file:fuse://conf/pipeline_configs/model_deployment.yml'] 65 | - name: 'DEV-telco-churn-model-inference-batch' 66 | <<: 67 | - *dev-cluster-config 68 | spark_python_task: 69 | python_file: 'file://telco_churn/pipelines/model_inference_batch_job.py' 70 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 71 | '--env', 'file:fuse://conf/dev/.dev.env', 72 | '--conf-file', 'file:fuse://conf/pipeline_configs/model_inference_batch.yml'] 73 | - name: 'DEV-telco-churn-sample-integration-test' 74 | <<: 75 | - *dev-cluster-config 76 | spark_python_task: 77 | python_file: 'file://tests/integration/sample_test.py' 78 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 79 | '--env', 'file:fuse://conf/dev/.dev.env', 80 | '--conf-file', 'file:fuse://conf/pipeline_configs/sample_test.yml'] 81 | 82 | staging: 83 | strict_path_adjustment_policy: true 84 | jobs: 85 | - name: 'STAGING-telco-churn-sample-integration-test' 86 | <<: 87 | - *staging-cluster-config 88 | spark_python_task: 89 | python_file: 'file://tests/integration/sample_test.py' 90 | parameters: ['--env', 'file:fuse://conf/staging/.staging.env', 91 | '--conf-file', 'file:fuse://conf/pipeline_configs/sample_test.yml'] 92 | 93 | prod: 
94 | strict_path_adjustment_policy: true 95 | jobs: 96 | - name: 'PROD-telco-churn-demo-setup' 97 | <<: *prod-cluster-config 98 | spark_python_task: 99 | python_file: 'file://telco_churn/pipelines/demo_setup_job.py' 100 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 101 | '--env', 'file:fuse://conf/prod/.prod.env', 102 | '--conf-file', 'file:fuse://conf/pipeline_configs/demo_setup.yml'] 103 | - name: 'PROD-telco-churn-initial-model-train-register' 104 | tasks: 105 | - task_key: 'demo-setup' 106 | <<: 107 | - *prod-cluster-config 108 | spark_python_task: 109 | python_file: 'file://telco_churn/pipelines/demo_setup_job.py' 110 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 111 | '--env', 'file:fuse://conf/prod/.prod.env', 112 | '--conf-file', 'file:fuse://conf/pipeline_configs/demo_setup.yml'] 113 | - task_key: 'feature-table-creation' 114 | <<: *prod-cluster-config 115 | depends_on: 116 | - task_key: 'demo-setup' 117 | spark_python_task: 118 | python_file: 'file://telco_churn/pipelines/feature_table_creator_job.py' 119 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 120 | '--env', 'file:fuse://conf/prod/.prod.env', 121 | '--conf-file', 'file:fuse://conf/pipeline_configs/feature_table_creator.yml'] 122 | - task_key: 'model-train' 123 | <<: *prod-cluster-config 124 | depends_on: 125 | - task_key: 'demo-setup' 126 | - task_key: 'feature-table-creation' 127 | spark_python_task: 128 | python_file: 'file://telco_churn/pipelines/model_train_job.py' 129 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 130 | '--env', 'file:fuse://conf/prod/.prod.env', 131 | '--conf-file', 'file:fuse://conf/pipeline_configs/model_train.yml'] 132 | - name: 'PROD-telco-churn-model-train' 133 | <<: 134 | - *prod-cluster-config 135 | spark_python_task: 136 | python_file: 'file://telco_churn/pipelines/model_train_job.py' 137 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 138 | '--env', 'file:fuse://conf/prod/.prod.env', 139 | '--conf-file', 'file:fuse://conf/pipeline_configs/model_train.yml'] 140 | - name: 'PROD-telco-churn-model-deployment' 141 | <<: 142 | - *prod-cluster-config 143 | spark_python_task: 144 | python_file: 'file://telco_churn/pipelines/model_deployment_job.py' 145 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 146 | '--env', 'file:fuse://conf/prod/.prod.env', 147 | '--conf-file', 'file:fuse://conf/pipeline_configs/model_deployment.yml'] 148 | - name: 'PROD-telco-churn-model-inference-batch' 149 | <<: 150 | - *prod-cluster-config 151 | spark_python_task: 152 | python_file: 'file://telco_churn/pipelines/model_inference_batch_job.py' 153 | parameters: ['--base-data-params', 'file:fuse://conf/.base_data_params.env', 154 | '--env', 'file:fuse://conf/prod/.prod.env', 155 | '--conf-file', 'file:fuse://conf/pipeline_configs/model_inference_batch.yml'] -------------------------------------------------------------------------------- /telco_churn/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module containing common data classes used throughout different pipelines, in addition to Workload class which is 3 | extended to run pipelines/tasks. 
4 | """ 5 | import os 6 | import sys 7 | from dataclasses import dataclass 8 | 9 | import yaml 10 | import pathlib 11 | import dotenv 12 | from abc import ABC, abstractmethod 13 | from argparse import ArgumentParser 14 | from logging import Logger 15 | from typing import Dict, Any, Union, List 16 | from pyspark.sql import SparkSession 17 | 18 | 19 | @dataclass 20 | class MLflowTrackingConfig: 21 | """ 22 | Configuration data class used to unpack MLflow parameters during a model training run. 23 | 24 | Attributes: 25 | run_name (str) 26 | Name of MLflow run 27 | experiment_id (int) 28 | ID of the MLflow experiment to be activated. If an experiment with this ID does not exist, raise an exception. 29 | experiment_path (str) 30 | Case sensitive name of the experiment to be activated. If an experiment with this name does not exist, 31 | a new experiment wth this name is created. 32 | model_name (str) 33 | Name of the registered model under which to create a new model version. If a registered model with the given 34 | name does not exist, it will be created automatically. 35 | """ 36 | run_name: str 37 | experiment_id: int = None 38 | experiment_path: str = None 39 | model_name: str = None 40 | 41 | 42 | @dataclass 43 | class FeatureStoreTableConfig: 44 | """ 45 | Configuration data class used to unpack parameters when creating or loading a Feature Store table. 46 | 47 | Attributes: 48 | database_name (str) 49 | Name of database to use for creating the feature table 50 | table_name (str) 51 | Name of feature table 52 | primary_keys (string or list) 53 | String or list of strings, of columns to use as the primary key(s). Use single column (customerID) as the 54 | primary key for the telco churn example. 55 | description (str) 56 | [Optional] string containing description to attribute to the feature table in the Feature Store. 57 | Only used when creating a Feature Store table. 58 | """ 59 | database_name: str 60 | table_name: str 61 | primary_keys: Union[str, List[str]] 62 | description: str = None 63 | 64 | 65 | @dataclass 66 | class LabelsTableConfig: 67 | """ 68 | Configuration data class used to unpack parameters when creating or loading labels table. 69 | 70 | Attributes: 71 | database_name (str) 72 | Name of database to use for creating the labels table 73 | table_name (str) 74 | Name of labels table within the database 75 | label_col (str) 76 | Name of column to use as the label column (in telco churn example we rename this column to 'churn') 77 | primary_keys (string or list) 78 | [Optional] String or list of strings, of columns to use as the primary key(s) 79 | dbfs_path (str) 80 | [Optional] DBFS path to use for the labels table (saving as a Delta table) 81 | """ 82 | database_name: str 83 | table_name: str 84 | label_col: str 85 | primary_keys: Union[str, List[str]] = None 86 | dbfs_path: str = None 87 | 88 | 89 | class Workload(ABC): 90 | """ 91 | This is an abstract class that provides handy interfaces to implement workloads (e.g. pipelines or job tasks). 92 | Create a child from this class and implement the abstract launch method. 
93 | Class provides access to the following useful objects: 94 | * self.spark is a SparkSession 95 | * self.dbutils provides access to the DBUtils 96 | * self.logger provides access to the Spark-compatible logger 97 | * self.conf provides access to the parsed configuration of the job 98 | * self.env_vars provides access to the parsed environment variables of the job 99 | """ 100 | def __init__(self, spark=None, init_conf=None): 101 | self.spark = self._prepare_spark(spark) 102 | self.logger = self._prepare_logger() 103 | self.dbutils = self.get_dbutils() 104 | if init_conf: 105 | self.conf = init_conf 106 | else: 107 | self.conf = self._provide_config() 108 | self._log_conf() 109 | self.env_vars = self.get_env_vars_as_dict() 110 | self._log_env_vars() 111 | 112 | @staticmethod 113 | def _prepare_spark(spark) -> SparkSession: 114 | if not spark: 115 | return SparkSession.builder.getOrCreate() 116 | else: 117 | return spark 118 | 119 | @staticmethod 120 | def _get_dbutils(spark: SparkSession): 121 | try: 122 | from pyspark.dbutils import DBUtils # noqa 123 | 124 | if 'dbutils' not in locals(): 125 | utils = DBUtils(spark) 126 | return utils 127 | else: 128 | return locals().get('dbutils') 129 | except ImportError: 130 | return None 131 | 132 | def get_dbutils(self): 133 | utils = self._get_dbutils(self.spark) 134 | 135 | if not utils: 136 | self.logger.warn('No DBUtils defined in the runtime') 137 | else: 138 | self.logger.info('DBUtils class initialized') 139 | 140 | return utils 141 | 142 | def _provide_config(self): 143 | self.logger.info('Reading configuration from --conf-file job option') 144 | conf_file = self._get_conf_file() 145 | if not conf_file: 146 | self.logger.info( 147 | 'No conf file was provided, setting configuration to empty dict.' 148 | 'Please override configuration in subclass init method' 149 | ) 150 | return {} 151 | else: 152 | self.logger.info(f'Conf file was provided, reading configuration from {conf_file}') 153 | return self._read_config(conf_file) 154 | 155 | @staticmethod 156 | def _get_conf_file(): 157 | p = ArgumentParser() 158 | p.add_argument('--conf-file', required=False, type=str) 159 | namespace = p.parse_known_args(sys.argv[1:])[0] 160 | return namespace.conf_file 161 | 162 | @staticmethod 163 | def _read_config(conf_file) -> Dict[str, Any]: 164 | config = yaml.safe_load(pathlib.Path(conf_file).read_text()) 165 | return config 166 | 167 | @staticmethod 168 | def _get_base_data_params(): 169 | p = ArgumentParser() 170 | p.add_argument('--base-data-params', required=False, type=str) 171 | namespace = p.parse_known_args(sys.argv[1:])[0] 172 | return namespace.base_data_params 173 | 174 | @staticmethod 175 | def _get_env(): 176 | p = ArgumentParser() 177 | p.add_argument('--env', required=False, type=str) 178 | namespace = p.parse_known_args(sys.argv[1:])[0] 179 | return namespace.env 180 | 181 | @staticmethod 182 | def _set_environ(env_vars): 183 | dotenv.load_dotenv(env_vars) 184 | 185 | def get_env_vars_as_dict(self): 186 | base_data_params = self._get_base_data_params() 187 | self._set_environ(base_data_params) 188 | 189 | env = self._get_env() 190 | self._set_environ(env) 191 | 192 | return dict(os.environ) 193 | 194 | def _prepare_logger(self) -> Logger: 195 | log4j_logger = self.spark._jvm.org.apache.log4j # noqa 196 | return log4j_logger.LogManager.getLogger(self.__class__.__name__) 197 | 198 | def _log_conf(self): 199 | # log parameters 200 | self.logger.info('Launching job with configuration parameters:') 201 | for key, item in self.conf.items(): 
202 | self.logger.info('\t Parameter: %-30s with value => %-30s' % (key, item)) 203 | 204 | def _log_env_vars(self): 205 | # log parameters 206 | self.logger.info('Using environment variables:') 207 | for key, item in self.env_vars.items(): 208 | self.logger.info('\t Parameter: %-30s with value => %-30s' % (key, item)) 209 | 210 | @abstractmethod 211 | def launch(self): 212 | """ 213 | Main method of the job. 214 | :return: 215 | """ 216 | pass 217 | 218 | 219 | def get_dbutils( 220 | spark: SparkSession, 221 | ): # please note that this function is used in mocking by its name 222 | try: 223 | from pyspark.dbutils import DBUtils # noqa 224 | 225 | if "dbutils" not in locals(): 226 | utils = DBUtils(spark) 227 | return utils 228 | else: 229 | return locals().get("dbutils") 230 | except ImportError: 231 | return None 232 | -------------------------------------------------------------------------------- /telco_churn/model_train.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List, Dict, Any 3 | import pprint 4 | 5 | import pandas as pd 6 | import sklearn 7 | from sklearn.model_selection import train_test_split 8 | import mlflow 9 | from mlflow.models import infer_signature 10 | 11 | import databricks 12 | from databricks.feature_store import FeatureStoreClient, FeatureLookup 13 | 14 | from telco_churn.common import MLflowTrackingConfig, FeatureStoreTableConfig, LabelsTableConfig 15 | from telco_churn.model_train_pipeline import ModelTrainPipeline 16 | from telco_churn.utils.get_spark import spark 17 | from telco_churn.utils.logger_utils import get_logger 18 | 19 | fs = FeatureStoreClient() 20 | _logger = get_logger() 21 | 22 | 23 | @dataclass 24 | class ModelTrainConfig: 25 | """ 26 | Configuration data class used to execute ModelTrain pipeline. 27 | 28 | Attributes: 29 | mlflow_tracking_cfg (MLflowTrackingConfig) 30 | Configuration data class used to unpack MLflow parameters during a model training run. 31 | feature_store_table_cfg (FeatureStoreTableConfig): 32 | Configuration data class used to unpack parameters when loading the Feature Store table. 33 | labels_table_cfg (LabelsTableConfig): 34 | Configuration data class used to unpack parameters when loading labels table. 35 | pipeline_params (dict): 36 | Params to use in preprocessing pipeline. Read from model_train.yml 37 | - test_size: Proportion of input data to hold out as the test set 38 | - random_state: Random state to enable reproducible train-test split 39 | model_params (dict): 40 | Dictionary of params for model. Read from model_train.yml 41 | conf (dict): 42 | [Optional] dictionary of conf file used to trigger pipeline. If provided, it will be tracked as a yml 43 | file in MLflow tracking. 44 | env_vars (dict): 45 | [Optional] dictionary of environment variables to trigger pipeline. If provided, it will be tracked as a yml 46 | file in MLflow tracking. 47 | """ 48 | mlflow_tracking_cfg: MLflowTrackingConfig 49 | feature_store_table_cfg: FeatureStoreTableConfig 50 | labels_table_cfg: LabelsTableConfig 51 | pipeline_params: Dict[str, Any] 52 | model_params: Dict[str, Any] 53 | conf: Dict[str, Any] = None 54 | env_vars: Dict[str, str] = None 55 | 56 | 57 | class ModelTrain: 58 | """ 59 | Class to execute model training. Params, metrics and model artifacts will be tracked to MLflow Tracking. 60 | Optionally, the resulting model will be registered to the MLflow Model Registry if model_name is provided.
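A minimal usage sketch (illustrative; experiment, model and feature table names are hypothetical, while the pipeline/model params mirror conf/pipeline_configs/model_train.yml):

    cfg = ModelTrainConfig(
        mlflow_tracking_cfg=MLflowTrackingConfig(run_name='random_forest_baseline',
                                                 experiment_path='/Shared/telco_churn_train',
                                                 model_name='telco_churn_model'),
        feature_store_table_cfg=FeatureStoreTableConfig(database_name='e2e_mlops_prod',
                                                        table_name='churn_features',
                                                        primary_keys='customerID'),
        labels_table_cfg=LabelsTableConfig(database_name='e2e_mlops_prod',
                                           table_name='churn_labels',
                                           label_col='churn'),
        pipeline_params={'test_size': 0.25, 'random_state': 42},
        model_params={'n_estimators': 100, 'max_depth': 4, 'min_samples_leaf': 1, 'random_state': 42})
    ModelTrain(cfg).run()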
61 | """ 62 | def __init__(self, cfg: ModelTrainConfig): 63 | self.cfg = cfg 64 | 65 | @staticmethod 66 | def _set_experiment(mlflow_tracking_cfg: MLflowTrackingConfig): 67 | """ 68 | Set MLflow experiment. Use one of either experiment_id or experiment_path 69 | """ 70 | if mlflow_tracking_cfg.experiment_id is not None: 71 | _logger.info(f'MLflow experiment_id: {mlflow_tracking_cfg.experiment_id}') 72 | mlflow.set_experiment(experiment_id=mlflow_tracking_cfg.experiment_id) 73 | elif mlflow_tracking_cfg.experiment_path is not None: 74 | _logger.info(f'MLflow experiment_path: {mlflow_tracking_cfg.experiment_path}') 75 | mlflow.set_experiment(experiment_name=mlflow_tracking_cfg.experiment_path) 76 | else: 77 | raise RuntimeError('MLflow experiment_id or experiment_path must be set in mlflow_params') 78 | 79 | def _get_feature_table_lookup(self) -> List[databricks.feature_store.entities.feature_lookup.FeatureLookup]: 80 | """ 81 | Create list of FeatureLookup for single feature store table. The FeatureLookup is a value class used to specify 82 | features to use in a TrainingSet. 83 | 84 | Returns 85 | ------- 86 | List[databricks.feature_store.entities.feature_lookup.FeatureLookup] 87 | """ 88 | feature_store_table_cfg = self.cfg.feature_store_table_cfg 89 | 90 | _logger.info('Creating feature lookups...') 91 | feature_table_name = f'{feature_store_table_cfg.database_name}.{feature_store_table_cfg.table_name}' 92 | feature_lookup = FeatureLookup(table_name=feature_table_name, 93 | lookup_key=feature_store_table_cfg.primary_keys) 94 | # Lookup for single feature table 95 | feature_table_lookup = [feature_lookup] 96 | 97 | return feature_table_lookup 98 | 99 | def get_fs_training_set(self) -> databricks.feature_store.training_set.TrainingSet: 100 | """ 101 | Create the Feature Store TrainingSet 102 | 103 | Returns 104 | ------- 105 | databricks.feature_store.training_set.TrainingSet 106 | """ 107 | feature_store_table_cfg = self.cfg.feature_store_table_cfg 108 | labels_table_cfg = self.cfg.labels_table_cfg 109 | labels_df = spark.table(f'{labels_table_cfg.database_name}.{labels_table_cfg.table_name}') 110 | 111 | feature_table_lookup = self._get_feature_table_lookup() 112 | _logger.info('Creating Feature Store training set...') 113 | return fs.create_training_set(df=labels_df, 114 | feature_lookups=feature_table_lookup, 115 | label=labels_table_cfg.label_col, 116 | exclude_columns=feature_store_table_cfg.primary_keys) 117 | 118 | def create_train_test_split(self, fs_training_set: databricks.feature_store.training_set.TrainingSet): 119 | """ 120 | Load the TrainingSet for training. The loaded DataFrame has columns specified by fs_training_set. 121 | Loaded Spark DataFrame is converted to pandas DataFrame and split into train/test splits. 
122 | 123 | Parameters 124 | ---------- 125 | fs_training_set : databricks.feature_store.training_set.TrainingSet 126 | Feature Store TrainingSet 127 | 128 | Returns 129 | ------- 130 | train-test splits 131 | """ 132 | labels_table_cfg = self.cfg.labels_table_cfg 133 | 134 | _logger.info('Load training set from Feature Store, converting to pandas DataFrame') 135 | training_set_pdf = fs_training_set.load_df().toPandas() 136 | 137 | X = training_set_pdf.drop(labels_table_cfg.label_col, axis=1) 138 | y = training_set_pdf[labels_table_cfg.label_col] 139 | 140 | _logger.info(f'Splitting into train/test splits - test_size: {self.cfg.pipeline_params["test_size"]}') 141 | X_train, X_test, y_train, y_test = train_test_split(X, y, 142 | random_state=self.cfg.pipeline_params['random_state'], 143 | test_size=self.cfg.pipeline_params['test_size'], 144 | stratify=y) 145 | 146 | return X_train, X_test, y_train, y_test 147 | 148 | def fit_pipeline(self, X_train: pd.DataFrame, y_train: pd.Series) -> sklearn.pipeline.Pipeline: 149 | """ 150 | Create sklearn pipeline and fit pipeline. 151 | 152 | Parameters 153 | ---------- 154 | X_train : pd.DataFrame 155 | Training data 156 | 157 | y_train : pd.Series 158 | Training labels 159 | 160 | Returns 161 | ------- 162 | scikit-learn pipeline with fitted steps. 163 | """ 164 | _logger.info('Creating sklearn pipeline...') 165 | pipeline = ModelTrainPipeline.create_train_pipeline(self.cfg.model_params) 166 | 167 | _logger.info('Fitting sklearn RandomForestClassifier...') 168 | _logger.info(f'Model params: {pprint.pformat(self.cfg.model_params)}') 169 | model = pipeline.fit(X_train, y_train) 170 | 171 | return model 172 | 173 | def run(self): 174 | """ 175 | Method to trigger model training, and tracking to MLflow. 176 | 177 | Steps: 178 | 1. Set MLflow experiment (creating a new experiment if it does not already exist) 179 | 2. Start MLflow run 180 | 3. Create Databricks Feature Store training set 181 | 4. Create train-test splits to be used to train and evaluate the model 182 | 5. Define sklearn pipeline using ModelTrainPipeline, and fit on train data 183 | 6. Log trained model using the Databricks Feature Store API. Model will be logged to MLflow with associated 184 | feature table metadata. 185 | 7. 
Register the model to MLflow model registry if model_name is provided in mlflow_params 186 | """ 187 | _logger.info('==========Running model training==========') 188 | mlflow_tracking_cfg = self.cfg.mlflow_tracking_cfg 189 | 190 | _logger.info('==========Setting MLflow experiment==========') 191 | self._set_experiment(mlflow_tracking_cfg) 192 | # Enable automatic logging of input samples, metrics, parameters, and models 193 | mlflow.sklearn.autolog(log_input_examples=True, silent=True) 194 | 195 | _logger.info('==========Starting MLflow run==========') 196 | with mlflow.start_run(run_name=mlflow_tracking_cfg.run_name) as mlflow_run: 197 | 198 | if self.cfg.conf is not None: 199 | # Log config file 200 | mlflow.log_dict(self.cfg.conf, 'conf.yml') 201 | if self.cfg.env_vars is not None: 202 | # Log config file 203 | mlflow.log_dict(self.cfg.env_vars, 'env_vars.yml') 204 | 205 | # Create Feature Store Training Set 206 | _logger.info('==========Creating Feature Store training set==========') 207 | fs_training_set = self.get_fs_training_set() 208 | 209 | # Load and preprocess data into train/test splits 210 | _logger.info('==========Creating train/test splits==========') 211 | X_train, X_test, y_train, y_test = self.create_train_test_split(fs_training_set) 212 | 213 | # Fit pipeline with RandomForestClassifier 214 | _logger.info('==========Fitting RandomForestClassifier model==========') 215 | model = self.fit_pipeline(X_train, y_train) 216 | 217 | # Log model using Feature Store API 218 | _logger.info('Logging model to MLflow using Feature Store API') 219 | fs.log_model( 220 | model, 221 | 'fs_model', 222 | flavor=mlflow.sklearn, 223 | training_set=fs_training_set, 224 | input_example=X_train[:100], 225 | signature=infer_signature(X_train, y_train)) 226 | 227 | # Training metrics are logged by MLflow autologging 228 | # Log metrics for the test set 229 | _logger.info('==========Model Evaluation==========') 230 | _logger.info('Evaluating and logging metrics') 231 | test_metrics = mlflow.sklearn.eval_and_log_metrics(model, X_test, y_test, prefix='test_') 232 | print(pd.DataFrame(test_metrics, index=[0])) 233 | 234 | # Register model to MLflow Model Registry if provided 235 | if mlflow_tracking_cfg.model_name is not None: 236 | _logger.info('==========MLflow Model Registry==========') 237 | _logger.info(f'Registering model: {mlflow_tracking_cfg.model_name}') 238 | mlflow.register_model(f'runs:/{mlflow_run.info.run_id}/fs_model', 239 | name=mlflow_tracking_cfg.model_name) 240 | 241 | _logger.info('==========Model training completed==========') 242 | -------------------------------------------------------------------------------- /telco_churn/pipelines/demo_setup_job.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import mlflow 4 | from mlflow.tracking import MlflowClient 5 | from mlflow.exceptions import RestException 6 | 7 | from telco_churn.common import Workload 8 | from telco_churn.utils.logger_utils import get_logger 9 | 10 | from databricks.feature_store.client import FeatureStoreClient 11 | 12 | client = MlflowClient() 13 | fs = FeatureStoreClient() 14 | _logger = get_logger() 15 | 16 | 17 | class DemoSetup(Workload): 18 | 19 | def _get_train_experiment_id(self): 20 | try: 21 | return self.env_vars['model_train_experiment_id'] 22 | except KeyError: 23 | return None 24 | 25 | def _get_train_experiment_path(self): 26 | try: 27 | return self.env_vars['model_train_experiment_path'] 28 | except KeyError: 29 | return None 30 | 31 | def 
_get_deploy_experiment_id(self): 32 | try: 33 | return self.env_vars['model_deploy_experiment_id'] 34 | except KeyError: 35 | return None 36 | 37 | def _get_deploy_experiment_path(self): 38 | try: 39 | return self.env_vars['model_deploy_experiment_path'] 40 | except KeyError: 41 | return None 42 | 43 | @staticmethod 44 | def _check_mlflow_model_registry_exists(model_name) -> bool: 45 | """ 46 | Check if model exists in MLflow Model Registry. 47 | Returns True if model exists in Model Registry, False if not 48 | """ 49 | try: 50 | client.get_registered_model(name=model_name) 51 | _logger.info(f'MLflow Model Registry name: {model_name} exists') 52 | return True 53 | except RestException: 54 | _logger.info(f'MLflow Model Registry name: {model_name} DOES NOT exist') 55 | return False 56 | 57 | @staticmethod 58 | def _archive_registered_models(model_name): 59 | """ 60 | Archive any model versions which are not already under stage='Archived' 61 | """ 62 | registered_model = client.get_registered_model(name=model_name) 63 | latest_versions_list = registered_model.latest_versions 64 | 65 | _logger.info(f'MLflow Model Registry name: {model_name}') 66 | for model_version in latest_versions_list: 67 | if model_version.current_stage != 'Archived': 68 | _logger.info(f'Archiving model version: {model_version.version}') 69 | client.transition_model_version_stage( 70 | name=model_name, 71 | version=model_version.version, 72 | stage='Archived' 73 | ) 74 | 75 | def _delete_registered_model(self, model_name): 76 | """ 77 | Archive any active model versions, then delete the registered model from the Model Registry. 78 | """ 79 | self._archive_registered_models(model_name) 80 | client.delete_registered_model(name=model_name) 81 | _logger.info(f'Deleted MLflow Model Registry model: {model_name}') 82 | 83 | def _check_mlflow_experiments_exists(self) -> dict: 84 | """ 85 | The demo workflow consists of creating 2 MLflow Tracking experiments: 86 | * train_experiment - Experiment used to track params, metrics, artifacts during model training 87 | * deploy_experiment - Experiment used to track metrics when comparing models during the model deployment step 88 | 89 | This method checks the demo_setup config dict for either the experiment_id or experiment_path for both 90 | experiments.
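For example, a return value of {'train_exp_exists': True, 'deploy_exp_exists': False} indicates that only the training experiment currently exists.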
91 | 92 | A dictionary containing the keys train_exp_exists and deploy_exp_exists along with boolean values is returned 93 | 94 | Returns 95 | ------- 96 | Dictionary indicating whether train and deploy MLflow experiments currently exist 97 | """ 98 | train_experiment_id = self._get_train_experiment_id() 99 | train_experiment_path = self._get_train_experiment_path() 100 | deploy_experiment_id = self._get_deploy_experiment_id() 101 | deploy_experiment_path = self._get_deploy_experiment_path() 102 | 103 | def check_by_experiment_id(experiment_id): 104 | try: 105 | mlflow.get_experiment(experiment_id=experiment_id) 106 | _logger.info(f'MLflow Tracking experiment_id: {experiment_id} exists') 107 | return True 108 | except RestException: 109 | _logger.info(f'MLflow Tracking experiment_id: {experiment_id} DOES NOT exist') 110 | return False 111 | 112 | def check_by_experiment_path(experiment_path): 113 | experiment = mlflow.get_experiment_by_name(name=experiment_path) 114 | if experiment is not None: 115 | _logger.info(f'MLflow Tracking experiment_path: {experiment_path} exists') 116 | return True 117 | else: 118 | _logger.info(f'MLflow Tracking experiment_path: {experiment_path} DOES NOT exist') 119 | return False 120 | 121 | if train_experiment_id is not None: 122 | train_exp_exists = check_by_experiment_id(train_experiment_id) 123 | elif train_experiment_path is not None: 124 | train_exp_exists = check_by_experiment_path(train_experiment_path) 125 | else: 126 | raise RuntimeError('Either model_train_experiment_id or model_train_experiment_path should be passed in ' 127 | 'deployment.yml') 128 | 129 | if deploy_experiment_id is not None: 130 | deploy_exp_exists = check_by_experiment_id(deploy_experiment_id) 131 | elif deploy_experiment_path is not None: 132 | deploy_exp_exists = check_by_experiment_path(deploy_experiment_path) 133 | else: 134 | raise RuntimeError('Either model_deploy_experiment_id or model_deploy_experiment_path should be passed in ' 135 | 'deployment.yml') 136 | 137 | return {'train_exp_exists': train_exp_exists, 138 | 'deploy_exp_exists': deploy_exp_exists} 139 | 140 | def _delete_mlflow_experiments(self, exp_exists_dict: dict): 141 | """ 142 | Check exp_exists_dict for whether train_exp_exists or deploy_exp_exists is True.
Delete experiments if they exist 143 | 144 | Parameters 145 | ---------- 146 | exp_exists_dict : dict 147 | A dictionary containing the keys train_exp_exists and deploy_exp_exists along with boolean values 148 | """ 149 | delete_experiments = [exp for exp, exists in exp_exists_dict.items() if exists] 150 | if len(delete_experiments) == 0: 151 | _logger.info('No existing experiments to delete') 152 | if 'train_exp_exists' in delete_experiments: 153 | if self.env_vars.get('model_train_experiment_path') is not None: 154 | experiment = mlflow.get_experiment_by_name(name=self.env_vars['model_train_experiment_path']) 155 | mlflow.delete_experiment(experiment_id=experiment.experiment_id) 156 | _logger.info(f'Deleted existing experiment_path: {self.env_vars["model_train_experiment_path"]}') 157 | elif self.env_vars.get('model_train_experiment_id') is not None: 158 | mlflow.delete_experiment(experiment_id=self.env_vars['model_train_experiment_id']) 159 | _logger.info(f'Deleted existing experiment_id: {self.env_vars["model_train_experiment_id"]}') 160 | else: 161 | raise RuntimeError('Either model_train_experiment_id or model_train_experiment_path should be passed ' 162 | 'in deployment.yml') 163 | 164 | if 'deploy_exp_exists' in delete_experiments: 165 | if self.env_vars.get('model_deploy_experiment_path') is not None: 166 | experiment = mlflow.get_experiment_by_name(name=self.env_vars['model_deploy_experiment_path']) 167 | mlflow.delete_experiment(experiment_id=experiment.experiment_id) 168 | _logger.info( 169 | f'Deleted existing experiment_path: {self.env_vars["model_deploy_experiment_path"]}') 170 | elif self.env_vars.get('model_deploy_experiment_id') is not None: 171 | mlflow.delete_experiment(experiment_id=self.env_vars['model_deploy_experiment_id']) 172 | _logger.info(f'Deleted existing experiment_id: {self.env_vars["model_deploy_experiment_id"]}') 173 | 174 | @staticmethod 175 | def _check_feature_table_exists(feature_store_table) -> bool: 176 | """ 177 | Check if Feature Store feature table exists 178 | Returns True if feature table exists in Feature Store, False if not 179 | """ 180 | try: 181 | fs.get_table(name=feature_store_table) 182 | _logger.info(f'Feature Store feature table: {feature_store_table} exists') 183 | return True 184 | except Exception: 185 | _logger.info(f'Feature Store feature table: {feature_store_table} DOES NOT exist') 186 | return False 187 | 188 | @staticmethod 189 | def _drop_feature_table(feature_store_table): 190 | """ 191 | Delete Feature Store feature table 192 | """ 193 | try: 194 | fs.drop_table( 195 | name=feature_store_table 196 | ) 197 | _logger.info(f'Deleted Feature Store feature table: {feature_store_table}') 198 | except ValueError: 199 | _logger.info(f'Feature Store feature table: {feature_store_table} does not exist') 200 | 201 | def _check_labels_delta_table_exists(self, labels_table_dbfs_path) -> bool: 202 | """ 203 | Check if Delta table exists in DBFS 204 | 205 | Parameters 206 | ---------- 207 | labels_table_dbfs_path : str 208 | Path to Delta table in DBFS 209 | 210 | Returns 211 | ------- 212 | bool 213 | """ 214 | try: 215 | self.dbutils.fs.ls(labels_table_dbfs_path) 216 | _logger.info(f'Labels Delta table: {labels_table_dbfs_path} exists') 217 | return True 218 | except Exception: 219 | _logger.info(f'Labels Delta table: {labels_table_dbfs_path} DOES NOT exist') 220 | return False 221 | 222 | def _delete_labels_delta_table(self, labels_table_dbfs_path): 223 | self.dbutils.fs.rm(labels_table_dbfs_path, True) 224 | _logger.info(f'Deleted labels
Delta table: {labels_table_dbfs_path}') 225 | 226 | def setup(self): 227 | """ 228 | Demo setup steps: 229 | * Delete the Model Registry model if it exists (archiving any active model versions) 230 | * Delete MLflow experiments if they exist 231 | * Drop the feature table and labels table if they exist 232 | """ 233 | _logger.info('==========Demo Setup=========') 234 | _logger.info(f'Running demo-setup pipeline in {self.env_vars["env"]} environment') 235 | 236 | if self.conf['delete_model_registry']: 237 | _logger.info('Checking MLflow Model Registry...') 238 | model_name = self.env_vars['model_name'] 239 | if self._check_mlflow_model_registry_exists(model_name): 240 | self._delete_registered_model(model_name) 241 | 242 | if self.conf['delete_mlflow_experiments']: 243 | _logger.info('Checking MLflow Tracking...') 244 | exp_exists_dict = self._check_mlflow_experiments_exists() 245 | self._delete_mlflow_experiments(exp_exists_dict) 246 | 247 | if self.conf['drop_feature_table']: 248 | _logger.info('Checking Feature Store...') 249 | feature_store_database_name = self.env_vars['feature_store_database_name'] 250 | feature_store_table_name = self.env_vars['feature_store_table_name'] 251 | feature_store_table = f'{feature_store_database_name}.{feature_store_table_name}' 252 | if self._check_feature_table_exists(feature_store_table=feature_store_table): 253 | self._drop_feature_table(feature_store_table=feature_store_table) 254 | 255 | if self.conf['drop_labels_table']: 256 | _logger.info('Checking existing labels table...') 257 | labels_table_dbfs_path = self.env_vars['labels_table_dbfs_path'] 258 | if self._check_labels_delta_table_exists(labels_table_dbfs_path=labels_table_dbfs_path): 259 | self._delete_labels_delta_table(labels_table_dbfs_path=labels_table_dbfs_path) 260 | 261 | _logger.info('==========Demo Setup Complete=========') 262 | 263 | def launch(self) -> None: 264 | """ 265 | Launch DemoSetup job 266 | """ 267 | _logger.info('Launching DemoSetup job') 268 | self.setup() 269 | _logger.info('DemoSetup job finished!') 270 | 271 | 272 | if __name__ == '__main__': 273 | job = DemoSetup() 274 | job.launch() 275 | -------------------------------------------------------------------------------- /telco_churn/model_deployment.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import mlflow 4 | import pandas as pd 5 | import pyspark.sql 6 | from mlflow.tracking import MlflowClient 7 | 8 | from telco_churn.common import MLflowTrackingConfig 9 | from telco_churn.model_inference import ModelInference 10 | from telco_churn.utils.evaluation_utils import ModelEvaluation 11 | from telco_churn.utils.logger_utils import get_logger 12 | 13 | _logger = get_logger() 14 | 15 | 16 | @dataclass 17 | class ModelDeploymentConfig: 18 | """ 19 | Attributes: 20 | mlflow_tracking_cfg (MLflowTrackingConfig) 21 | Configuration data class used to unpack MLflow parameters during a model deployment run. 22 | reference_data (str): Name of table to use as a reference DataFrame to score the loaded model against.
23 | Must contain column(s) for lookup keys to join feature data from Feature Store 24 | label_col (str): Name of label column in input data 25 | comparison_metric (str): Name of evaluation metric to use when comparing models 26 | higher_is_better (bool): Boolean indicating whether a higher value for the evaluation metric equates to better 27 | model performance 28 | """ 29 | mlflow_tracking_cfg: MLflowTrackingConfig 30 | reference_data: str 31 | label_col: str = 'churn' 32 | comparison_metric: str = 'roc_auc_score' 33 | higher_is_better: bool = True 34 | 35 | 36 | class ModelDeployment: 37 | """ 38 | Class to execute model deployment. This class orchestrates the comparison of the current Production model versus 39 | the Staging model. The Production model is the most recent model version registered in the MLflow Model 40 | Registry under the provided model_name with stage="Production". Likewise for Staging. 41 | 42 | Execution will involve loading the models and performing batch inference for a specified reference dataset. 43 | The two models will be compared using the specified comparison_metric. 44 | higher_is_better indicates whether a higher value for the evaluation metric equates to a better performing model. 45 | Depending on this comparison, the candidate Staging model will either be promoted to Production (and the current 46 | Production model archived) if it performs better, or archived if it does not perform 47 | better than the current Production model. 48 | 49 | Metrics computed when comparing the two models will be logged to MLflow, under the provided experiment_id or 50 | experiment_path. 51 | """ 52 | def __init__(self, cfg: ModelDeploymentConfig): 53 | self.cfg = cfg 54 | 55 | @staticmethod 56 | def _set_experiment(mlflow_tracking_cfg: MLflowTrackingConfig): 57 | """ 58 | Set MLflow experiment. Use one of either experiment_id or experiment_path 59 | """ 60 | if mlflow_tracking_cfg.experiment_id is not None: 61 | _logger.info(f'MLflow experiment_id: {mlflow_tracking_cfg.experiment_id}') 62 | mlflow.set_experiment(experiment_id=mlflow_tracking_cfg.experiment_id) 63 | elif mlflow_tracking_cfg.experiment_path is not None: 64 | _logger.info(f'MLflow experiment_path: {mlflow_tracking_cfg.experiment_path}') 65 | mlflow.set_experiment(experiment_name=mlflow_tracking_cfg.experiment_path) 66 | else: 67 | raise RuntimeError('MLflow experiment_id or experiment_path must be set in MLflowTrackingConfig') 68 | 69 | def _get_model_uri_by_stage(self, stage: str): 70 | return f'models:/{self.cfg.mlflow_tracking_cfg.model_name}/{stage}' 71 | 72 | def _batch_inference_by_stage(self, stage: str) -> pyspark.sql.DataFrame: 73 | """ 74 | Load a model from the specified MLflow Model Registry stage and compute batch inference. 75 | Inference is computed on the specified reference data. The model will use this reference data to look up feature 76 | values for primary keys, and use the loaded features as input for model scoring. 77 | The most recent model under the specified stage will be loaded. The registered model must have been logged to 78 | MLflow using the Feature Store API.
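For example, stage='staging' resolves to the model URI models:/<model_name>/staging, from which the latest Staging version is loaded.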
79 | 80 | Parameters 81 | ---------- 82 | stage : str 83 | MLflow Model Registry stage 84 | 85 | Returns 86 | ------- 87 | Spark DataFrame containing primary keys of the reference data, the loaded features from the feature store and 88 | a prediction column from model scoring 89 | """ 90 | model_uri = self._get_model_uri_by_stage(stage=stage) 91 | _logger.info(f'Computing batch inference using: {model_uri}') 92 | _logger.info(f'Reference data: {self.cfg.reference_data}') 93 | model_inference = ModelInference(model_uri=model_uri, 94 | input_table_name=self.cfg.reference_data) 95 | 96 | return model_inference.run_batch() 97 | 98 | @staticmethod 99 | def _get_evaluation_metric(y_true: pd.Series, y_score: pd.Series, metric: str, stage: str) -> float: 100 | """ 101 | Trigger evaluation and return the specified evaluation metric. A dictionary of evaluation metrics will be tracked to 102 | MLflow Tracking. 103 | 104 | Parameters 105 | ---------- 106 | y_true : array-like of shape (n_samples,) or (n_samples, n_classes) 107 | True labels or binary label indicators 108 | y_score : array-like of shape (n_samples,) or (n_samples, n_classes) 109 | Target scores. 110 | metric : str 111 | Name of metric to retrieve from evaluation dictionary 112 | stage : str 113 | Name of the corresponding MLflow Model Registry stage. Used as a prefix when logging metrics 114 | 115 | Returns 116 | ------- 117 | Evaluation metric 118 | """ 119 | metric_prefix = stage + "_" 120 | eval_dict = ModelEvaluation().evaluate(y_true, y_score, metric_prefix=metric_prefix) 121 | mlflow.log_metrics(eval_dict) 122 | eval_metric = eval_dict[metric_prefix + metric] 123 | 124 | return eval_metric 125 | 126 | def _run_promotion_logic(self, staging_eval_metric: float, production_eval_metric: float): 127 | """ 128 | Basic logic to either promote a candidate Staging model that performs better than the current Production model, 129 | or archive the Staging model if it does not outperform the current Production model.
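For example (illustrative numbers), with comparison_metric='roc_auc_score' and higher_is_better=True, a Staging score of 0.85 against a Production score of 0.80 promotes the Staging model and archives the existing Production model; a Staging score of 0.78 archives the Staging model instead.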
130 | 
131 |         Parameters
132 |         ----------
133 |         staging_eval_metric : float
134 |             Evaluation metric computed using Staging model
135 |         production_eval_metric : float
136 |             Evaluation metric computed using Production model
137 |         """
138 |         client = MlflowClient()
139 |         model_name = self.cfg.mlflow_tracking_cfg.model_name
140 |         staging_model_version = client.get_latest_versions(name=model_name, stages=['staging'])[0]
141 | 
142 |         _logger.info(f'metric={self.cfg.comparison_metric}')
143 |         _logger.info(f'higher_is_better={self.cfg.higher_is_better}')
144 |         if self.cfg.higher_is_better:
145 |             if staging_eval_metric <= production_eval_metric:
146 |                 _logger.info('Candidate Staging model DOES NOT perform better than current Production model')
147 |                 _logger.info('Transition candidate model from stage="staging" to stage="archived"')
148 |                 client.transition_model_version_stage(name=model_name,
149 |                                                       version=staging_model_version.version,
150 |                                                       stage='archived')
151 | 
152 |             elif staging_eval_metric > production_eval_metric:
153 |                 _logger.info('Candidate Staging model DOES perform better than current Production model')
154 |                 _logger.info('Transition candidate model from stage="staging" to stage="production"')
155 |                 _logger.info('Existing Production model will be archived')
156 |                 client.transition_model_version_stage(name=model_name,
157 |                                                       version=staging_model_version.version,
158 |                                                       stage='production',
159 |                                                       archive_existing_versions=True)
160 | 
161 |         else:
162 |             if staging_eval_metric >= production_eval_metric:
163 |                 _logger.info('Candidate Staging model DOES NOT perform better than current Production model')
164 |                 _logger.info('Transition candidate model from stage="staging" to stage="archived"')
165 |                 client.transition_model_version_stage(name=model_name,
166 |                                                       version=staging_model_version.version,
167 |                                                       stage='archived')
168 | 
169 |             elif staging_eval_metric < production_eval_metric:
170 |                 _logger.info('Candidate Staging model DOES perform better than current Production model')
171 |                 _logger.info('Transition candidate model from stage="staging" to stage="production"')
172 |                 _logger.info('Existing Production model will be archived')
173 |                 client.transition_model_version_stage(name=model_name,
174 |                                                       version=staging_model_version.version,
175 |                                                       stage='production',
176 |                                                       archive_existing_versions=True)
177 | 
178 |     def run(self):
179 |         """
180 |         Runner method to orchestrate model comparison and potential model promotion.
181 | 
182 |         Steps:
183 |             1. Set MLflow Tracking experiment. Used to track metrics computed when comparing Staging versus Production
184 |                models.
185 |             2. Load Staging and Production models and score against reference dataset provided. The reference data
186 |                specified must currently be a table.
187 |             3. Compute evaluation metric for both Staging and Production model predictions against reference data
188 |             4. If higher_is_better=True, the Staging model will be promoted in place of the Production model iff the
189 |                Staging model evaluation metric is higher than the Production model evaluation metric.
190 |                If higher_is_better=False, the Staging model will be promoted in place of the Production model iff the
191 |                Staging model evaluation metric is lower than the Production model evaluation metric.
192 | 
193 |         """
194 |         _logger.info('==========Running model deployment==========')
195 | 
196 |         _logger.info('==========Setting MLflow experiment==========')
197 |         mlflow_tracking_cfg = self.cfg.mlflow_tracking_cfg
198 |         self._set_experiment(mlflow_tracking_cfg)
199 | 
200 |         with mlflow.start_run(run_name=mlflow_tracking_cfg.run_name):
201 | 
202 |             _logger.info('==========Batch inference: staging model==========')
203 |             staging_inference_pred_df = self._batch_inference_by_stage(stage='staging')
204 |             staging_inference_pred_pdf = staging_inference_pred_df.toPandas()
205 |             _logger.info('==========Batch inference: production model==========')
206 |             prod_inference_pred_df = self._batch_inference_by_stage(stage='production')
207 |             prod_inference_pred_pdf = prod_inference_pred_df.toPandas()
208 | 
209 |             _logger.info('==========Model evaluation: staging model==========')
210 |             staging_eval_metric = self._get_evaluation_metric(y_true=staging_inference_pred_pdf[self.cfg.label_col],
211 |                                                               y_score=staging_inference_pred_pdf['prediction'],
212 |                                                               metric=self.cfg.comparison_metric,
213 |                                                               stage='staging')
214 |             _logger.info(f'Candidate Staging model (stage="staging") {self.cfg.comparison_metric}: {staging_eval_metric}')
215 | 
216 |             _logger.info('==========Model evaluation: production model==========')
217 |             production_eval_metric = self._get_evaluation_metric(y_true=prod_inference_pred_pdf[self.cfg.label_col],
218 |                                                                  y_score=prod_inference_pred_pdf['prediction'],
219 |                                                                  metric=self.cfg.comparison_metric,
220 |                                                                  stage='production')
221 |             _logger.info(
222 |                 f'Current Production model (stage="production") {self.cfg.comparison_metric}: {production_eval_metric}')
223 | 
224 |             _logger.info('==========Model comparison: candidate staging model vs current production model==========')
225 |             self._run_promotion_logic(staging_eval_metric, production_eval_metric)
226 | 
227 |         _logger.info('==========Model deployment completed==========')
228 | 
--------------------------------------------------------------------------------
/notebooks/demo_setup.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # MAGIC %md
3 | # MAGIC
4 | # MAGIC # `demo_setup`
5 | # MAGIC
6 | # MAGIC Pipeline to ensure that we can run the demo from a clean setup. Executing `DemoSetup.run()` will perform the following steps:
7 | # MAGIC
8 | # MAGIC - Delete Model Registry model if it exists (archiving any existing model versions)
9 | # MAGIC - Delete MLflow experiments if they exist
10 | # MAGIC - Delete Feature Table if it exists
11 | 
12 | # COMMAND ----------
13 | 
14 | # DBTITLE 1,pip install requirements.txt
15 | # MAGIC %pip install -r ../requirements.txt
16 | 
17 | # COMMAND ----------
18 | 
19 | # DBTITLE 1,Set env
20 | dbutils.widgets.dropdown('env', 'dev', ['dev', 'staging', 'prod'], 'Environment Name')
21 | 
22 | # COMMAND ----------
23 | 
24 | # DBTITLE 1,Module Imports
25 | import mlflow
26 | from mlflow.tracking import MlflowClient
27 | from mlflow.exceptions import RestException
28 | 
29 | from telco_churn.utils.notebook_utils import load_and_set_env_vars, load_config
30 | from telco_churn.utils.logger_utils import get_logger
31 | 
32 | from databricks.feature_store.client import FeatureStoreClient
33 | 
34 | client = MlflowClient()
35 | fs = FeatureStoreClient()
36 | _logger = get_logger()
37 | 
38 | # COMMAND ----------
39 | 
40 | # DBTITLE 1,Load pipeline config params
41 | # Set pipeline name
42 | pipeline_name = 'demo_setup'
43 | 
44 | # Load pipeline config yaml file (../conf/pipeline_configs/{pipeline_name}.yml)
45 | pipeline_config = load_config(pipeline_name)
46 | 
47 | # Load and set arbitrary params via spark_env_vars
48 | # Params passed via ../conf/{env}/.{env}.env and ../conf/.base_data_params
49 | env_vars = load_and_set_env_vars(env=dbutils.widgets.get('env'))
50 | 
51 | # COMMAND ----------
52 | 
53 | # DBTITLE 1,Pipeline Class
54 | class DemoSetup:
55 | 
56 |     def __init__(self, conf: dict, env_vars: dict):
57 | 
58 |         self.conf = conf
59 |         self.env_vars = env_vars
60 | 
61 |     def _get_train_experiment_id(self):
62 |         try:
63 |             return self.env_vars['model_train_experiment_id']
64 |         except KeyError:
65 |             return None
66 | 
67 |     def _get_train_experiment_path(self):
68 |         try:
69 |             return self.env_vars['model_train_experiment_path']
70 |         except KeyError:
71 |             return None
72 | 
73 |     def _get_deploy_experiment_id(self):
74 |         try:
75 |             return self.env_vars['model_deploy_experiment_id']
76 |         except KeyError:
77 |             return None
78 | 
79 |     def _get_deploy_experiment_path(self):
80 |         try:
81 |             return self.env_vars['model_deploy_experiment_path']
82 |         except KeyError:
83 |             return None
84 | 
85 |     @staticmethod
86 |     def _check_mlflow_model_registry_exists(model_name) -> bool:
87 |         """
88 |         Check if model exists in MLflow Model Registry.
89 |         Returns True if model exists in Model Registry, False if not
90 |         """
91 |         try:
92 |             client.get_registered_model(name=model_name)
93 |             _logger.info(f'MLflow Model Registry name: {model_name} exists')
94 |             return True
95 |         except RestException:
96 |             _logger.info(f'MLflow Model Registry name: {model_name} DOES NOT exist')
97 |             return False
98 | 
99 |     @staticmethod
100 |     def _archive_registered_models(model_name):
101 |         """
102 |         Archive any model versions which are not already under stage='Archived'
103 |         """
104 |         registered_model = client.get_registered_model(name=model_name)
105 |         latest_versions_list = registered_model.latest_versions
106 | 
107 |         _logger.info(f'MLflow Model Registry name: {model_name}')
108 |         for model_version in latest_versions_list:
109 |             if model_version.current_stage != 'Archived':
110 |                 _logger.info(f'Archiving model version: {model_version.version}')
111 |                 client.transition_model_version_stage(
112 |                     name=model_name,
113 |                     version=model_version.version,
114 |                     stage='Archived'
115 |                 )
116 | 
117 |     def _delete_registered_model(self, model_name):
118 |         """
119 |         Archive all latest model versions, then delete the registered model from the MLflow Model Registry.
120 |         """
121 |         self._archive_registered_models(model_name)
122 |         client.delete_registered_model(name=model_name)
123 |         _logger.info(f'Deleted MLflow Model Registry model: {model_name}')
124 | 
125 |     def _check_mlflow_experiments_exists(self) -> dict:
126 |         """
127 |         The demo workflow consists of creating 2 MLflow Tracking experiments:
128 |             * train_experiment - Experiment used to track params, metrics, artifacts during model training
129 |             * deploy_experiment - Experiment used to track metrics when comparing models during the model deployment step
130 | 
131 |         This method checks the demo_setup config dict for either the experiment_id or experiment_path for both
132 |         experiments.
133 | 
134 |         A dictionary containing the keys train_exp_exists and deploy_exp_exists along with boolean values is returned
135 | 
136 |         Returns
137 |         -------
138 |         Dictionary indicating whether train and deploy MLflow experiments currently exist
139 |         """
140 |         train_experiment_id = self._get_train_experiment_id()
141 |         train_experiment_path = self._get_train_experiment_path()
142 |         deploy_experiment_id = self._get_deploy_experiment_id()
143 |         deploy_experiment_path = self._get_deploy_experiment_path()
144 | 
145 |         def check_by_experiment_id(experiment_id):
146 |             try:
147 |                 mlflow.get_experiment(experiment_id=experiment_id)
148 |                 _logger.info(f'MLflow Tracking experiment_id: {experiment_id} exists')
149 |                 return True
150 |             except RestException:
151 |                 _logger.info(f'MLflow Tracking experiment_id: {experiment_id} DOES NOT exist')
152 |                 return False
153 | 
154 |         def check_by_experiment_path(experiment_path):
155 |             experiment = mlflow.get_experiment_by_name(name=experiment_path)
156 |             if experiment is not None:
157 |                 _logger.info(f'MLflow Tracking experiment_path: {experiment_path} exists')
158 |                 return True
159 |             else:
160 |                 _logger.info(f'MLflow Tracking experiment_path: {experiment_path} DOES NOT exist')
161 |                 return False
162 | 
163 |         if train_experiment_id is not None:
164 |             train_exp_exists = check_by_experiment_id(train_experiment_id)
165 |         elif train_experiment_path is not None:
166 |             train_exp_exists = check_by_experiment_path(train_experiment_path)
167 |         else:
168 |             raise RuntimeError('Either model_train_experiment_id or model_train_experiment_path should be passed in '
169 |                                'deployment.yml')
170 | 
171 |         if deploy_experiment_id is not None:
172 |             deploy_exp_exists = check_by_experiment_id(deploy_experiment_id)
173 |         elif deploy_experiment_path is not None:
174 |             deploy_exp_exists = check_by_experiment_path(deploy_experiment_path)
175 |         else:
176 |             raise RuntimeError('Either model_deploy_experiment_id or model_deploy_experiment_path should be passed in '
177 |                                'deployment.yml')
178 | 
179 |         return {'train_exp_exists': train_exp_exists,
180 |                 'deploy_exp_exists': deploy_exp_exists}
181 | 
182 |     def _delete_mlflow_experiments(self, exp_exists_dict: dict):
183 |         """
184 |         Check exp_exists_dict if train_exp_exists or deploy_exp_exists is True. Delete experiments if they exist.
185 | 
186 |         Parameters
187 |         ----------
188 |         exp_exists_dict : dict
189 |             A dictionary containing the keys train_exp_exists and deploy_exp_exists along with boolean values
190 |         """
191 |         delete_experiments = [exp for exp, exists in exp_exists_dict.items() if exists]
192 |         if len(delete_experiments) == 0:
193 |             _logger.info('No existing experiments to delete')
194 |         if 'train_exp_exists' in delete_experiments:
195 |             if self.env_vars.get('model_train_experiment_path') is not None:
196 |                 experiment = mlflow.get_experiment_by_name(name=self.env_vars['model_train_experiment_path'])
197 |                 mlflow.delete_experiment(experiment_id=experiment.experiment_id)
198 |                 _logger.info(f'Deleted existing experiment_path: {self.env_vars["model_train_experiment_path"]}')
199 |             elif self.env_vars.get('model_train_experiment_id') is not None:
200 |                 mlflow.delete_experiment(experiment_id=self.env_vars['model_train_experiment_id'])
201 |                 _logger.info(f'Deleted existing experiment_id: {self.env_vars["model_train_experiment_id"]}')
202 |             else:
203 |                 raise RuntimeError('Either model_train_experiment_id or model_train_experiment_path should be passed '
204 |                                    'in deployment.yml')
205 | 
206 |         if 'deploy_exp_exists' in delete_experiments:
207 |             if self.env_vars.get('model_deploy_experiment_path') is not None:
208 |                 experiment = mlflow.get_experiment_by_name(name=self.env_vars['model_deploy_experiment_path'])
209 |                 mlflow.delete_experiment(experiment_id=experiment.experiment_id)
210 |                 _logger.info(
211 |                     f'Deleted existing experiment_path: {self.env_vars["model_deploy_experiment_path"]}')
212 |             elif self.env_vars.get('model_deploy_experiment_id') is not None:
213 |                 mlflow.delete_experiment(experiment_id=self.env_vars['model_deploy_experiment_id'])
214 |                 _logger.info(f'Deleted existing experiment_id: {self.env_vars["model_deploy_experiment_id"]}')
215 | 
216 |     @staticmethod
217 |     def _check_feature_table_exists(feature_store_table) -> bool:
218 |         """
219 |         Check if Feature Store feature table exists.
220 |         Returns True if feature table exists in Feature Store, False if not
221 |         """
222 |         try:
223 |             fs.get_table(name=feature_store_table)
224 |             _logger.info(f'Feature Store feature table: {feature_store_table} exists')
225 |             return True
226 |         except Exception:
227 |             _logger.info(f'Feature Store feature table: {feature_store_table} DOES NOT exist')
228 |             return False
229 | 
230 |     @staticmethod
231 |     def _drop_feature_table(feature_store_table):
232 |         """
233 |         Delete Feature Store feature table
234 |         """
235 |         try:
236 |             fs.drop_table(
237 |                 name=feature_store_table
238 |             )
239 |             _logger.info(f'Deleted Feature Store feature table: {feature_store_table}')
240 |         except ValueError:
241 |             _logger.info(f'Feature Store feature table: {feature_store_table} does not exist')
242 | 
243 |     def _check_labels_delta_table_exists(self, labels_table_dbfs_path) -> bool:
244 |         """
245 |         Check if Delta table exists in DBFS
246 | 
247 |         Parameters
248 |         ----------
249 |         labels_table_dbfs_path : str
250 |             Path to Delta table in DBFS
251 | 
252 |         Returns
253 |         -------
254 |         bool
255 |         """
256 |         try:
257 |             dbutils.fs.ls(labels_table_dbfs_path)  # dbutils is available globally within Databricks notebooks
258 |             _logger.info(f'Labels Delta table: {labels_table_dbfs_path} exists')
259 |             return True
260 |         except Exception:
261 |             _logger.info(f'Labels Delta table: {labels_table_dbfs_path} DOES NOT exist')
262 |             return False
263 | 
264 |     def _delete_labels_delta_table(self, labels_table_dbfs_path):
265 |         dbutils.fs.rm(labels_table_dbfs_path, True)
266 |         _logger.info(f'Deleted labels Delta table: {labels_table_dbfs_path}')
267 | 
268 |     def run(self):
269 |         """
270 |         Demo setup steps:
271 |             * Delete Model Registry model if it exists (archiving any existing model versions)
272 |             * Delete MLflow experiments if they exist
273 |             * Delete Feature Table if it exists
274 |         """
275 |         _logger.info('==========Demo Setup==========')
276 |         _logger.info(f'Running demo-setup pipeline in {self.env_vars["env"]} environment')
277 | 
278 |         if self.conf['delete_model_registry']:
279 |             _logger.info('Checking MLflow Model Registry...')
280 |             model_name = self.env_vars['model_name']
281 |             if self._check_mlflow_model_registry_exists(model_name):
282 |                 self._delete_registered_model(model_name)
283 | 
284 |         if self.conf['delete_mlflow_experiments']:
285 |             _logger.info('Checking MLflow Tracking...')
286 |             exp_exists_dict = self._check_mlflow_experiments_exists()
287 |             self._delete_mlflow_experiments(exp_exists_dict)
288 | 
289 |         if self.conf['drop_feature_table']:
290 |             _logger.info('Checking Feature Store...')
291 |             feature_store_database_name = self.env_vars['feature_store_database_name']
292 |             feature_store_table_name = self.env_vars['feature_store_table_name']
293 |             feature_store_table = f'{feature_store_database_name}.{feature_store_table_name}'
294 |             if self._check_feature_table_exists(feature_store_table=feature_store_table):
295 |                 self._drop_feature_table(feature_store_table=feature_store_table)
296 | 
297 |         if self.conf['drop_labels_table']:
298 |             _logger.info('Checking existing labels table...')
299 |             labels_table_dbfs_path = self.env_vars['labels_table_dbfs_path']
300 |             if self._check_labels_delta_table_exists(labels_table_dbfs_path=labels_table_dbfs_path):
301 |                 self._delete_labels_delta_table(labels_table_dbfs_path=labels_table_dbfs_path)
302 | 
303 |         _logger.info('==========Demo Setup Complete==========')
304 | 
305 | # COMMAND ----------
306 | 
307 | # DBTITLE 1,Execute Pipeline
308 | # Instantiate pipeline
309 | demo_setup_pipeline = DemoSetup(conf=pipeline_config, env_vars=env_vars)
310 | demo_setup_pipeline.run()
311 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # [DEPRECATED] e2e-mlops
2 | 
3 | > **NOTE:** This repository is deprecated as of 2022/11/11. The end-to-end MLOps workflow demonstrated in this project was designed with recommended tooling available at the time. Since the release of this repo, Databricks has built a product-supported MLOps template called MLOps Stacks (currently in private preview). The repo for MLOps Stacks can be found [here](https://github.com/databricks/mlops-stack). If you would like to express interest and enroll in this private preview, please complete [this questionnaire](https://docs.google.com/forms/d/e/1FAIpQLSfHXCmkbsEURjQQvtUGObgh2D5q1eD4YRHnUxZ0M4Hu0W63WA/viewform).
4 | 
5 | ---
6 | 
7 | This repo is intended to demonstrate an end-to-end MLOps workflow on Databricks, where a model is deployed along with its ancillary pipelines to a specified (currently single) Databricks workspace.
8 | Each pipeline (e.g. model training pipeline, model deployment pipeline) is deployed as a [Databricks job](https://docs.databricks.com/data-engineering/jobs/jobs.html); these jobs are deployed to a Databricks workspace using Databricks Labs' [`dbx`](https://dbx.readthedocs.io/en/latest/index.html) tool.
9 | 
10 | The use case at hand is a churn prediction problem. We use the [IBM Telco Customer Churn dataset](https://community.ibm.com/community/user/businessanalytics/blogs/steven-macko/2019/07/11/telco-customer-churn-1113) to build a simple classifier to predict whether a customer will churn from a fictional telco company.
11 | 
12 | Note that the package is solely developed via an IDE, and as such there are no Databricks Notebooks in the repository. All jobs are executed via a command-line-based workflow using [`dbx`](https://dbx.readthedocs.io/en/latest/).
13 | 
14 | ## Pipelines
15 | 
16 | The following pipelines are currently defined within the package:
17 | - `demo-setup`
18 |     - Deletes existing feature store tables, existing MLflow experiments and models registered to MLflow Model Registry,
19 |       in order to start afresh for a demo.
20 | - `feature-table-creation`
21 |     - Creates a new feature table and separate labels Delta table.
22 | - `model-train`
23 |     - Trains a scikit-learn Random Forest model.
24 | - `model-deployment`
25 |     - Compares the Staging versus Production models in the MLflow Model Registry. Transitions the Staging model to
26 |       Production if it outperforms the current Production model.
27 | - `model-inference-batch`
28 |     - Loads a model from MLflow Model Registry, loads features from Feature Store and scores the batch.
29 | 
30 | ## Demo
31 | The following outlines the workflow to demo the repo.
32 | 
33 | ### Set up
34 | 1. Fork https://github.com/niall-turbitt/e2e-mlops
35 | 1. Configure [Databricks CLI connection profile](https://docs.databricks.com/dev-tools/cli/index.html#connection-profiles)
36 |     - The project is designed to use 3 different Databricks CLI connection profiles: dev, staging and prod.
37 |       These profiles are set in [e2e-mlops/.dbx/project.json](https://github.com/niall-turbitt/e2e-mlops/blob/main/.dbx/project.json).
38 |     - Note that for demo purposes we use the same connection profile for each of the 3 environments.
39 |       **In practice each profile would correspond to separate dev, staging and prod Databricks workspaces.**
40 |     - This [project.json](https://github.com/niall-turbitt/e2e-mlops/blob/main/.dbx/project.json) file will have to be
41 |       adjusted to match the connection profiles a user has configured on their local machine.
42 | 1. Configure Databricks secrets for GitHub Actions (ensure GitHub Actions are enabled for your forked project, as they are off by default in a forked repo).
43 |     - Within the GitHub project navigate to Secrets under the project settings
44 |     - To run the GitHub Actions workflows we require the following GitHub Actions secrets:
45 |         - `DATABRICKS_STAGING_HOST`
46 |             - URL of Databricks staging workspace
47 |         - `DATABRICKS_STAGING_TOKEN`
48 |             - [Databricks access token](https://docs.databricks.com/dev-tools/api/latest/authentication.html) for staging workspace
49 |         - `DATABRICKS_PROD_HOST`
50 |             - URL of Databricks production workspace
51 |         - `DATABRICKS_PROD_TOKEN`
52 |             - [Databricks access token](https://docs.databricks.com/dev-tools/api/latest/authentication.html) for production workspace
53 |         - `GH_TOKEN`
54 |             - GitHub [personal access token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token)
55 | 
56 | #### ASIDE: Starting from scratch
57 | 
58 | The following resources should not be present if starting from scratch:
59 | - Feature table must be deleted
60 |     - The table e2e_mlops_testing.churn_features will be created when the feature-table-creation pipeline is triggered.
61 | - MLflow experiment
62 |     - MLflow experiments are used during model training and model deployment in both the dev and prod environments.
63 |       The paths to these experiments are configured in [conf/deployment.yml](https://github.com/niall-turbitt/e2e-mlops/blob/main/conf/deployment.yml).
64 |     - For demo purposes, we delete these experiments if they exist to begin from a blank slate.
65 | - Model Registry
66 |     - Delete the model in the MLflow Model Registry if it exists.
67 | 
68 | **NOTE:** As part of the `initial-model-train-register` multitask job, the first task `demo-setup` will delete these,
69 | as specified in [`demo_setup.yml`](https://github.com/niall-turbitt/e2e-mlops/blob/main/conf/job_configs/demo_setup.yml).
70 | 
71 | ### Workflow
72 | 
73 | 1. **Run `PROD-telco-churn-initial-model-train-register` multitask job in prod environment**
74 | 
75 |     - To demonstrate a CICD workflow, we want to start from a “steady state” where there is a current model in production.
76 |       As such, we will manually trigger a multitask job to do the following steps:
77 |         1. Set up the workspace for the demo by deleting existing MLflow experiments and registered models, along with
78 |            existing Feature Store and labels tables.
79 |         1. Create a new Feature Store table to be used by the model training pipeline.
80 |         1. Train an initial “baseline” model
81 |     - There is then a final manual step to promote this newly trained model to production via the MLflow Model Registry UI.
82 | 
83 |     - Outlined below are the detailed steps to do this:
84 | 
85 |     1. Run the multitask `PROD-telco-churn-initial-model-train-register` job via an automated job cluster in the prod environment
86 |         - **NOTE:** multitask jobs can currently only be run via `dbx deploy; dbx launch`.
87 |         ```
88 |         dbx deploy --jobs=PROD-telco-churn-initial-model-train-register --environment=prod --files-only
89 |         dbx launch --job=PROD-telco-churn-initial-model-train-register --environment=prod --as-run-submit --trace
90 |         ```
91 |         See the Limitations section below regarding running multitask jobs. In order to reduce cluster start up time
92 |         you may want to consider using a [Databricks pool](https://docs.databricks.com/clusters/instance-pools/index.html),
93 |         and specify this pool ID in [`conf/deployment.yml`](https://github.com/niall-turbitt/e2e-mlops/blob/main/conf/deployment.yml).
94 |     - `PROD-telco-churn-initial-model-train-register` tasks:
95 |         1. Demo setup task steps ([`demo-setup`](https://github.com/niall-turbitt/e2e-mlops/blob/main/telco_churn/jobs/demo_setup_job.py))
96 |             1. Delete Model Registry model if it exists (archiving any existing model versions).
97 |             1. Delete MLflow experiment if it exists.
98 |             1. Delete Feature Table if it exists.
99 |         1. Feature table creation task steps (`feature-table-creation`)
100 |             1. Creates new churn_features feature table in the Feature Store.
101 |                 - **NOTE:** `ibm_telco_churn.bronze_customers` is a table created from the following [dataset](https://www.kaggle.com/datasets/yeanzc/telco-customer-churn-ibm-dataset). This will not be automatically available in your Databricks workspace. You will have to create this table (or update the `feature-table-creation` config to point at this dataset) in your own workspace.
102 |         1. Model train task steps (`model-train`)
103 |             1. Train initial “baseline” classifier (RandomForestClassifier - `max_depth=4`)
104 |                 - **NOTE:** no changes to config need to be made at this point
105 |             1. Register the model. Model version 1 will be registered to `stage=None` upon successful model training.
106 |     1. **Manual Step**: MLflow Model Registry UI promotion to `stage='Production'`
107 |         - Go to MLflow Model Registry and manually promote the model to `stage='Production'`.
108 | 
109 | 
110 | 2. **Code change / model update (Continuous Integration)**
111 | 
112 |     - Create new “dev/new_model” branch
113 |         - `git checkout -b dev/new_model`
114 |     - Make a change to the [`model_train.yml`](https://github.com/niall-turbitt/e2e-mlops/blob/main/conf/job_configs/model_train.yml) config file, updating `max_depth` under model_params from 4 to 8
115 |     - Optional: change the run name under mlflow params in the [`model_train.yml`](https://github.com/niall-turbitt/e2e-mlops/blob/main/conf/job_configs/model_train.yml) config file
116 |     - Create a pull request to merge the branch dev/new_model into main.
117 | 
118 |     * On pull request the following steps are triggered in the GitHub Actions workflow:
119 |         1. Trigger unit tests
120 |         1. Trigger integration tests
121 |     * Note that once the tests pass, this merge request will have to be confirmed in GitHub.
122 | 
123 | 
124 | 3. **Cut release**
125 | 
126 |     - Create tag (e.g. `v0.0.1`)
127 |         - `git tag -a <tag-name> -m "Message"`
128 |         - Note that tags are matched to `v*`, e.g. `v1.0`, `v20.15.10`
129 |     - Push tag
130 |         - `git push origin <tag-name>`
131 | 
132 |     - On pushing the tag, the following steps are triggered in the [`onrelease.yml`](https://github.com/niall-turbitt/e2e-mlops/blob/main/.github/workflows/onrelease.yml) GitHub Actions workflow:
133 |         1. Trigger unit tests.
134 |         1. Deploy `PROD-telco-churn-model-train` job to the prod environment.
135 |         1. Deploy `PROD-telco-churn-model-deployment` job to the prod environment.
136 |         1. Deploy `PROD-telco-churn-model-inference-batch` job to the prod environment.
137 |     - These jobs will now all be present in the specified workspace, and visible under the [Workflows](https://docs.databricks.com/data-engineering/jobs/index.html) tab.
138 | 
139 | 
140 | 4. **Run `PROD-telco-churn-model-train` job in the prod environment**
141 |     - Manually trigger the job via the UI
142 |         - In the Databricks workspace (prod environment) go to `Workflows` > `Jobs`, where the `PROD-telco-churn-model-train` job will be present.
143 |         - Click into PROD-telco-churn-model-train and select ‘Run Now’. Doing so will trigger the job on the specified cluster configuration.
144 |     - Alternatively you can trigger the job using the Databricks CLI:
145 |         - `databricks jobs run-now --job-id JOB_ID`
146 | 
147 |     - Model train job steps (`PROD-telco-churn-model-train`)
148 |         1. Train improved “new” classifier (RandomForestClassifier - `max_depth=8`)
149 |         1. Register the model. Model version 2 will be registered to `stage=None` upon successful model training.
150 |         1. **Manual Step**: MLflow Model Registry UI promotion to `stage='Staging'`
151 |             - Go to Model Registry and manually promote the model to `stage='Staging'`
152 | 
153 |     **ASIDE:** At this point, there should now be two model versions registered in MLflow Model Registry:
154 | 
155 |     - Version 1 (Production): RandomForestClassifier (`max_depth=4`)
156 |     - Version 2 (Staging): RandomForestClassifier (`max_depth=8`)
157 | 
158 | 
159 | 5. **Run `PROD-telco-churn-model-deployment` job (Continuous Deployment) in the prod environment**
160 |     - Manually trigger the job via the UI
161 |         - In the Databricks workspace go to `Workflows` > `Jobs`, where the `PROD-telco-churn-model-deployment` job will be present.
162 |         - Click into PROD-telco-churn-model-deployment and click ‘Run Now’. Doing so will trigger the job on the specified cluster configuration.
163 |     - Alternatively you can trigger the job using the Databricks CLI:
164 |         - `databricks jobs run-now --job-id JOB_ID`
165 | 
166 |     - Model deployment job steps (`PROD-telco-churn-model-deployment`)
167 |         1. Compare the new “candidate model” in `stage='Staging'` versus the current Production model in `stage='Production'`.
168 |             1. Comparison criteria are set through [`model_deployment.yml`](https://github.com/niall-turbitt/e2e-mlops/blob/main/conf/job_configs/model_deployment.yml)
169 |             1. Compute predictions using both models against a specified reference dataset
170 |             1. If the Staging model performs better than the Production model, promote the Staging model to Production and archive the existing Production model
171 |             1. If the Staging model performs worse than the Production model, archive the Staging model
172 | 
173 | 
174 | 6. **Run `PROD-telco-churn-model-inference-batch` job in the prod environment**
175 |     - Manually trigger the job via the UI
176 |         - In the Databricks workspace go to `Workflows` > `Jobs`, where the `PROD-telco-churn-model-inference-batch` job will be present.
177 |         - Click into PROD-telco-churn-model-inference-batch and click ‘Run Now’. Doing so will trigger the job on the specified cluster configuration.
178 |     - Alternatively you can trigger the job using the Databricks CLI:
179 |         - `databricks jobs run-now --job-id JOB_ID`
180 | 
181 |     - Batch model inference steps (`PROD-telco-churn-model-inference-batch`)
182 |         1. Load model from stage=Production in Model Registry
183 |             - **NOTE:** the model must have been logged to MLflow using the Feature Store API
184 |         1. Use primary keys in the specified inference input data to load features from the feature store
185 |         1. Apply the loaded model to the loaded features
186 |         1. Write predictions to the specified Delta path
187 | 
188 | ## Limitations
189 | - Multitask jobs running against the same cluster
190 |     - The pipeline initial-model-train-register is a [multitask job](https://docs.databricks.com/data-engineering/jobs/index.html)
191 |       which stitches together the demo setup, feature table creation and model train pipelines.
192 |     - At present, each of these tasks within the multitask job is executed on a different automated job cluster,
193 |       rather than all tasks being executed on the same cluster. As such, there will be time incurred for each task to acquire
194 |       cluster resources and install dependencies.
195 |     - As above, we recommend using a pool from which instances can be acquired when jobs are launched to reduce cluster start up time.
196 | 
197 | ---
198 | ## Development
199 | 
200 | To work on this project, you need Python 3.X and `pip` or `conda` for package management.
201 | 
202 | ### Installing project requirements
203 | 
204 | ```bash
205 | pip install -r unit-requirements.txt
206 | ```
207 | 
208 | ### Install project package in development mode
209 | 
210 | ```bash
211 | pip install -e .
212 | ```
213 | 
214 | ### Testing
215 | 
216 | #### Running unit tests
217 | 
218 | For unit testing, please use `pytest`:
219 | ```
220 | pytest tests/unit --cov
221 | ```
222 | 
223 | Please check the directory `tests/unit` for more details on how to use unit tests.
224 | In `tests/unit/conftest.py` you'll also find useful testing primitives, such as a local Spark instance with Delta support, a local MLflow setup and a DBUtils fixture; a sketch of such a fixture is shown below.
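
For illustration, here is a minimal sketch of what a Delta-enabled local Spark fixture can look like. The fixture name and configuration values are illustrative assumptions (not the exact contents of `tests/unit/conftest.py`), relying only on the `pytest`, `pyspark` and `delta-spark` packages listed in `unit-requirements.txt`:

```python
# Illustrative sketch of a conftest.py-style fixture providing a local Spark
# session with Delta Lake support. Names/configs are assumptions; see
# tests/unit/conftest.py for the actual primitives used in this repo.
import pytest
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip


@pytest.fixture(scope="session")
def spark() -> SparkSession:
    builder = (
        SparkSession.builder.master("local[1]")
        .appName("unit-tests")
        # Enable Delta Lake support in the local session
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog",
                "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    )
    # configure_spark_with_delta_pip wires in the delta-core dependency
    spark = configure_spark_with_delta_pip(builder).getOrCreate()
    yield spark
    spark.stop()
```

Tests can then simply declare a `spark` argument to receive the session, e.g. `def test_featurize(spark): ...`.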
225 | 
226 | #### Running integration tests
227 | 
228 | There are two options for running integration tests:
229 | 
230 | - On an interactive cluster via `dbx execute`
231 | - On a job cluster via `dbx launch`
232 | 
233 | For quicker startup of the job clusters we recommend using instance pools ([AWS](https://docs.databricks.com/clusters/instance-pools/index.html), [Azure](https://docs.microsoft.com/en-us/azure/databricks/clusters/instance-pools/), [GCP](https://docs.gcp.databricks.com/clusters/instance-pools/index.html)).
234 | 
235 | For an integration test on an interactive cluster, use the following command:
236 | ```
237 | dbx execute --cluster-name=<name-of-cluster> --job=<name-of-job>
238 | ```
239 | 
240 | For a test on an automated job cluster, deploy the job files and then launch:
241 | ```
242 | dbx deploy --jobs=<name-of-job> --files-only
243 | dbx launch --job=<name-of-job> --as-run-submit --trace
244 | ```
245 | 
246 | Please note that for testing we recommend using [jobless deployments](https://dbx.readthedocs.io/en/latest/guidance/run_submit.html), so you won't affect existing job definitions.
247 | 
248 | ### Interactive execution and development on Databricks clusters
249 | 
250 | 1. `dbx` expects that the cluster for interactive execution supports `%pip` and `%conda` magic [commands](https://docs.databricks.com/libraries/notebooks-python-libraries.html).
251 | 2. Please configure your job in the `conf/deployment.yml` file.
252 | 3. To execute the code interactively, provide either `--cluster-id` or `--cluster-name`.
253 | ```bash
254 | dbx execute \
255 |     --cluster-name="<some-cluster-name>" \
256 |     --job=<job-name>
257 | ```
258 | 
259 | Multiple users can also use the same cluster for development; libraries will be isolated per execution context.
260 | 
261 | ### Working with notebooks and Repos
262 | 
263 | To start working with your notebooks from [Repos](https://docs.databricks.com/repos/index.html), do the following steps:
264 | 
265 | 1. Add your git provider token to your user settings
266 | 2. Add your repository to Repos. This could be done via the UI, or via the CLI command below:
267 | ```bash
268 | databricks repos create --url <your-repo-url> --provider <your-provider>
269 | ```
270 | This command will create your personal repository under `/Repos/<username>/telco_churn`.
271 | 3. To set up the CI/CD pipeline with the notebook, create a separate `Staging` repo:
272 | ```bash
273 | databricks repos create --url <your-repo-url> --provider <your-provider> --path /Repos/Staging/telco_churn
274 | ```
275 | 
--------------------------------------------------------------------------------