├── .amlignore ├── .azureml └── config.json ├── .gitignore ├── Code ├── Data_Acquisition_and_Understanding │ ├── Readme.md │ ├── define_dataset.py │ └── ingest_data.py ├── Modeling │ ├── Readme.md │ ├── hypertrain.py │ ├── hypertrain_submit.py │ ├── pipeline_train.py │ ├── score_realtime.py │ ├── train.py │ ├── train_datasets.py │ ├── train_submit.py │ └── train_submit_datasets.py ├── Operationalization │ ├── Readme.md │ ├── dashboards │ │ └── Readme.md │ └── monitoring │ │ ├── Readme.md │ │ └── monitoring_pipeline.py └── Readme.md ├── Docs ├── lab002 │ └── Readme.md └── lab05 │ └── Readme.md ├── LICENSE-CODE.TXT ├── LICENSE.TXT ├── NOTICE.TXT ├── README.md ├── SECURITY.md ├── Sample_Data ├── For_Modeling │ └── modelling.md ├── Processed │ └── processed.md ├── README.md └── Raw │ └── rawData.md ├── conda_dependencies.yml ├── infrastructure ├── README.md ├── arm-templates │ ├── appinsights │ │ ├── parameters.dev.json │ │ ├── parameters.test.json │ │ └── template.json │ ├── containerregistry │ │ ├── parameters.dev.json │ │ ├── parameters.test.json │ │ └── template.json │ ├── keyvault │ │ ├── parameters.dev.json │ │ ├── parameters.test.json │ │ └── template.json │ ├── mlcompute │ │ ├── parameters-vnet.dev.json │ │ ├── parameters-vnet.test.json │ │ ├── parameters.dev.json │ │ ├── parameters.test.json │ │ ├── template-vnet.json │ │ └── template.json │ ├── mlworkspace │ │ ├── parameters.dev.json │ │ ├── parameters.test.json │ │ └── template.json │ └── storage │ │ ├── parameters.dev.json │ │ ├── parameters.test.json │ │ └── template.json ├── build-and-release │ ├── deploy-infra.ps1 │ ├── deploy-infra.template.yml │ └── deploy-infra.yml ├── infra_stages.png ├── runconfigschema.json └── scripts │ ├── create-aks.sh │ ├── create-azmlcompute.sh │ └── create-workspace.sh └── labs ├── 01_setup.md ├── 02_experiments.md ├── 03_managedcompute.md ├── 04_datasets.md ├── 05_hypertune.md ├── 06_pipelines.md └── README.md /.amlignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | azureml-logs 3 | .azureml 4 | .git 5 | outputs 6 | azureml-setup 7 | docs 8 | -------------------------------------------------------------------------------- /.azureml/config.json: -------------------------------------------------------------------------------- 1 | {"Id": null, "Scope": "/subscriptions/cf4e1704-b4bc-4554-bcd7-309394f2ee56/resourceGroups/azuremlworkshoprgp/providers/Microsoft.MachineLearningServices/workspaces/azuremlworkshopws"} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore local folders and config files 2 | aml_config/*.json 3 | Sample_Data 4 | .vscode 5 | outputs 6 | assets 7 | 8 | # ignore python cache 9 | *.pyc 10 | 11 | # ignore faultive folders 12 | Code/Modeling/aml_config 13 | Code/Modeling/assets 14 | Code/Modeling/.amlignore 15 | 16 | # ignore tmp folders 17 | code/Data_Acquisition_and_Understanding/tmp/* 18 | -------------------------------------------------------------------------------- /Code/Data_Acquisition_and_Understanding/Readme.md: -------------------------------------------------------------------------------- 1 | # This folder hosts production-intended data preparation logic -------------------------------------------------------------------------------- /Code/Data_Acquisition_and_Understanding/define_dataset.py: -------------------------------------------------------------------------------- 1 | 
# Defines a tabular dataset on top of an Azure ML datastore 2 | from azureml.core import Workspace, Datastore, Dataset 3 | from azureml.data import DataType 4 | from azureml.core.authentication import AzureCliAuthentication 5 | 6 | # Retrieve a datastore from a ML workspace 7 | ws = Workspace.from_config(auth=AzureCliAuthentication()) 8 | datastore_name = 'workspaceblobstore' 9 | datastore = Datastore.get(ws, datastore_name) 10 | 11 | # Register dataset version for each data split 12 | for data_split in ['train', 'test']: 13 | # Create a TabularDataset from paths in datastore in split folder 14 | # Note that wildcards can be used 15 | datastore_paths = [ 16 | (datastore, '{}/*.csv'.format(data_split)) 17 | ] 18 | 19 | # Create a TabularDataset from paths in datastore 20 | dataset = Dataset.Tabular.from_delimited_files( 21 | path=datastore_paths, 22 | set_column_types={ 23 | 'text': DataType.to_string(), 24 | 'target': DataType.to_string() 25 | }, 26 | header=True 27 | ) 28 | 29 | # Register the defined dataset for later use 30 | dataset.register( 31 | workspace=ws, 32 | name='newsgroups_{}'.format(data_split), 33 | description='newsgroups data' 34 | ) 35 | -------------------------------------------------------------------------------- /Code/Data_Acquisition_and_Understanding/ingest_data.py: -------------------------------------------------------------------------------- 1 | # Pre-processes SKLearn sample data 2 | # Ingest the data into an Azure ML Datastore for training 3 | import pandas as pd 4 | import time 5 | import os 6 | from sklearn.datasets import fetch_20newsgroups 7 | from azureml.core import Workspace, Datastore 8 | from azureml.core.authentication import AzureCliAuthentication 9 | 10 | # Define newsgroup categories to be downloaded to generate sample dataset 11 | # @TODO add additional newsgroups 12 | categories = [ 13 | 'alt.atheism', 14 | 'talk.religion.misc', 15 | 'comp.graphics', 16 | 'sci.space', 17 | ] 18 | 19 | print("Loading 20 newsgroups dataset for categories:") 20 | print(categories if categories else "all") 21 | 22 | for data_split in ['train', 'test']: 23 | # retrieve newsgroup data 24 | newsgroupdata = fetch_20newsgroups( 25 | subset=data_split, 26 | categories=categories, 27 | shuffle=True, 28 | random_state=42 29 | ) 30 | 31 | # construct pandas data frame from loaded sklearn newsgroup data 32 | df = pd.DataFrame({ 33 | 'text': newsgroupdata.data, 34 | 'target': newsgroupdata.target 35 | }) 36 | 37 | print('data loaded') 38 | 39 | # pre-process: 40 | # remove line breaks 41 | # replace target index by newsgroup name 42 | target_names = newsgroupdata.target_names 43 | df.target = df.target.apply(lambda x: target_names[x]) 44 | df.text = df.text.replace('\n', ' ', regex=True) 45 | 46 | print(df.head(5)) 47 | 48 | # write to csv 49 | df.to_csv(os.path.join( 50 | os.path.dirname(os.path.realpath(__file__)), 51 | 'tmp', 52 | data_split, 53 | '{}.csv'.format(int(time.time())) # unique file name 54 | ), index=False, encoding="utf-8", line_terminator='\n') 55 | 56 | 57 | datastore_name = 'workspaceblobstore' 58 | 59 | # get existing ML workspace 60 | workspace = Workspace.from_config(auth=AzureCliAuthentication()) 61 | 62 | # retrieve an existing datastore in the workspace by name 63 | datastore = Datastore.get(workspace, datastore_name) 64 | 65 | # upload files 66 | datastore.upload( 67 | src_dir=os.path.join( 68 | os.path.dirname(os.path.realpath(__file__)), 69 | 'tmp' 70 | ), 71 | target_path=None, 72 | overwrite=True, 73 | show_progress=True 74 | ) 75 | 
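# Note: df.to_csv above assumes the 'tmp/train' and 'tmp/test' folders already
# exist next to this script. If they do not, create them before writing, for
# example with a guard like the following (a sketch, placed before the to_csv call):
#
#     os.makedirs(os.path.join(
#         os.path.dirname(os.path.realpath(__file__)),
#         'tmp', data_split), exist_ok=True)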
-------------------------------------------------------------------------------- /Code/Modeling/Readme.md: -------------------------------------------------------------------------------- 1 | # This folder contains code for modeling and related activities (such as feature engineering, model evaluation etc.) 2 | 3 | You can add detailed description in this markdown related to your specific data science project. 4 | 5 | The following project structure has been provided as an example. 6 | 7 | * modelpackage 8 | * tests e.g. unit tests and integration tests -------------------------------------------------------------------------------- /Code/Modeling/hypertrain.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from optparse import OptionParser 3 | import sys 4 | 5 | from sklearn.datasets import fetch_20newsgroups 6 | from sklearn.feature_extraction.text import TfidfVectorizer 7 | from sklearn.ensemble import RandomForestClassifier 8 | from sklearn import metrics 9 | from sklearn.feature_extraction.text import HashingVectorizer 10 | 11 | from sklearn.externals import joblib 12 | from azureml.core import Run 13 | 14 | # Display progress logs on stdout 15 | logging.basicConfig(level=logging.INFO, 16 | format='%(asctime)s %(levelname)s %(message)s') 17 | 18 | op = OptionParser() 19 | op.add_option("--all_categories", 20 | action="store_true", dest="all_categories", 21 | help="Whether to use all categories or not.") 22 | op.add_option("--use_hashing", 23 | action="store_true", 24 | help="Use a hashing vectorizer.") 25 | op.add_option("--n_features", 26 | action="store", 27 | type=int, 28 | default=2 ** 16, 29 | help="n_features when using the hashing vectorizer.") 30 | op.add_option("--max_depth", 31 | type=int, default=10) 32 | op.add_option("--n_estimators", 33 | type=int, default=100) 34 | op.add_option("--criterion", 35 | type=str, 36 | default='gini') 37 | op.add_option("--min_samples_split", 38 | type=int, 39 | default=2) 40 | 41 | 42 | def is_interactive(): 43 | return not hasattr(sys.modules['__main__'], '__file__') 44 | 45 | 46 | # work-around for Jupyter notebook and IPython console 47 | argv = [] if is_interactive() else sys.argv[1:] 48 | (opts, args) = op.parse_args(argv) 49 | if len(args) > 0: 50 | op.error("this script takes no arguments.") 51 | sys.exit(1) 52 | 53 | if opts.all_categories: 54 | categories = None 55 | else: 56 | categories = [ 57 | 'alt.atheism', 58 | 'talk.religion.misc', 59 | 'comp.graphics', 60 | 'sci.space', 61 | ] 62 | 63 | print("Loading 20 newsgroups dataset for categories:") 64 | print(categories if categories else "all") 65 | 66 | data_train = fetch_20newsgroups(subset='train', categories=categories, 67 | shuffle=True, random_state=42) 68 | 69 | data_test = fetch_20newsgroups(subset='test', categories=categories, 70 | shuffle=True, random_state=42) 71 | print('data loaded') 72 | 73 | # order of labels in `target_names` can be different from `categories` 74 | target_names = data_train.target_names 75 | 76 | 77 | def size_mb(docs): 78 | return sum(len(s.encode('utf-8')) for s in docs) / 1e6 79 | 80 | 81 | # split a training set and a test set 82 | y_train, y_test = data_train.target, data_test.target 83 | 84 | print("Extracting features from the training data using a sparse vectorizer") 85 | if opts.use_hashing: 86 | vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False, 87 | n_features=opts.n_features) 88 | X_train = vectorizer.transform(data_train.data) 89 | else: 90 | vectorizer = 
TfidfVectorizer(sublinear_tf=True, max_df=0.5, 91 | stop_words='english') 92 | X_train = vectorizer.fit_transform(data_train.data) 93 | 94 | print("Extracting features from the test data using the same vectorizer") 95 | X_test = vectorizer.transform(data_test.data) 96 | 97 | 98 | # mapping from integer feature name to original token string 99 | if opts.use_hashing: 100 | feature_names = None 101 | else: 102 | feature_names = vectorizer.get_feature_names() 103 | 104 | 105 | def trim(s): 106 | """Trim string to fit on terminal (assuming 80-column display)""" 107 | return s if len(s) <= 80 else s[:77] + "..." 108 | 109 | 110 | def benchmark(clf, name=""): 111 | """benchmark classifier performance""" 112 | 113 | # train a model 114 | print("\nTraining run with algorithm \n{}".format(clf)) 115 | clf.fit(X_train, y_train) 116 | 117 | # evaluate on test set 118 | pred = clf.predict(X_test) 119 | score = metrics.accuracy_score(y_test, pred) 120 | 121 | # log metrics 122 | run_logger = Run.get_context() 123 | run_logger.log("accuracy", float(score)) 124 | 125 | # save .pkl file 126 | model_name = "model" + ".pkl" 127 | filename = "outputs/" + model_name 128 | joblib.dump(value=clf, filename=filename) 129 | run_logger.upload_file(name=model_name, path_or_stream=filename) 130 | 131 | print("accuracy: %0.3f" % score) 132 | clf_descr = str(clf).split('(')[0] 133 | return clf_descr, score 134 | 135 | 136 | results = [] 137 | 138 | # Select the training hyperparameters. 139 | # Create a dict of hyperparameters from the input flags. 140 | hyperparameters = { 141 | "max_depth": opts.max_depth, 142 | "n_estimators": opts.n_estimators, 143 | "criterion": opts.criterion, 144 | "min_samples_split": opts.min_samples_split 145 | } 146 | 147 | # Select the training hyperparameters. 
148 | max_depth = hyperparameters["max_depth"] 149 | n_estimators = hyperparameters["n_estimators"] 150 | criterion = hyperparameters["criterion"] 151 | min_samples_split = hyperparameters["min_samples_split"] 152 | 153 | 154 | clf = RandomForestClassifier(max_depth=max_depth, 155 | n_estimators=n_estimators, criterion=criterion, 156 | min_samples_split=min_samples_split) 157 | 158 | model = benchmark(clf) 159 | -------------------------------------------------------------------------------- /Code/Modeling/hypertrain_submit.py: -------------------------------------------------------------------------------- 1 | from azureml.train.hyperdrive import ( 2 | RandomParameterSampling, 3 | HyperDriveConfig, PrimaryMetricGoal) 4 | from azureml.core import Workspace, Experiment 5 | from azureml.train.estimator import Estimator 6 | import pandas as pd 7 | import os 8 | from random import choice 9 | from azureml.core.authentication import AzureCliAuthentication 10 | 11 | # load Azure ML workspace 12 | workspace = Workspace.from_config(auth=AzureCliAuthentication()) 13 | 14 | cluster_name = 'hypetuning' 15 | 16 | # Define Run Configuration 17 | estimator = Estimator( 18 | entry_script='hypertrain.py', 19 | source_directory=os.path.dirname(os.path.realpath(__file__)), 20 | compute_target=workspace.compute_targets[cluster_name], 21 | pip_packages=[ 22 | 'numpy==1.15.4', 23 | 'pandas==0.23.4', 24 | 'scikit-learn==0.20.1', 25 | 'scipy==1.0.0', 26 | 'matplotlib==3.0.2', 27 | 'utils==0.9.0' 28 | ] 29 | ) 30 | 31 | # Set parameters for search 32 | param_sampling = RandomParameterSampling({ 33 | "max_depth": choice([100, 50, 20, 10]), 34 | "n_estimators": choice([50, 150, 200, 250]), 35 | "criterion": choice(['gini', 'entropy']), 36 | "min_samples_split": choice([2, 3, 4, 5]) 37 | } 38 | ) 39 | 40 | # Define multi-run configuration 41 | hyperdrive_run_config = HyperDriveConfig( 42 | estimator=estimator, 43 | hyperparameter_sampling=param_sampling, 44 | policy=None, 45 | primary_metric_name="accuracy", 46 | primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, 47 | max_total_runs=2, 48 | max_concurrent_runs=None 49 | ) 50 | 51 | # Define the ML experiment 52 | experiment = Experiment(workspace, "newsgroups_train_hypertune") 53 | 54 | hyperdrive_run = experiment.submit(hyperdrive_run_config) 55 | hyperdrive_run.wait_for_completion() 56 | 57 | # Select the best run from all submitted 58 | best_run = hyperdrive_run.get_best_run_by_primary_metric() 59 | best_run_metrics = best_run.get_metrics() 60 | 61 | # Log the best run's performance to the parent run 62 | hyperdrive_run.log("Accuracy", best_run_metrics['accuracy']) 63 | parameter_values = best_run.get_details()['runDefinition']['arguments'] 64 | 65 | # Print best set of parameters found 66 | best_parameters = dict(zip(parameter_values[::2], parameter_values[1::2])) 67 | pd.Series(best_parameters, name='Value').to_frame() 68 | 69 | best_model_parameters = best_parameters.copy() 70 | pd.Series(best_model_parameters, name='Value').to_frame() 71 | print(best_model_parameters) 72 | 73 | # Define a final training run with model's best parameters 74 | model_est = Estimator( 75 | entry_script='hypertrain.py', 76 | source_directory=os.path.dirname(os.path.realpath(__file__)), 77 | script_params=best_model_parameters, 78 | compute_target=workspace.compute_targets[cluster_name], 79 | pip_packages=[ 80 | 'numpy==1.15.4', 81 | 'pandas==0.23.4', 82 | 'scikit-learn==0.20.1', 83 | 'scipy==1.0.0', 84 | 'matplotlib==3.0.2', 85 | 'utils==0.9.0' 86 | ] 87 | ) 88 | 89 | # Submit the 
experiment 90 | model_run = experiment.submit(model_est) 91 | 92 | model_run_status = model_run.wait_for_completion(wait_post_processing=True) 93 | 94 | model = model_run.register_model(model_name='model', 95 | model_path=os.path.join('outputs', 'model.pkl')) 96 | -------------------------------------------------------------------------------- /Code/Modeling/pipeline_train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model Training Pipeline 3 | 4 | Note: ML Pipelines are executed on registered compute resources. 5 | Run configurations hence cannot reference local compute. 6 | """ 7 | import os 8 | from azureml.core import Experiment, Workspace 9 | from azureml.pipeline.core import Pipeline, PipelineData 10 | from azureml.pipeline.steps import PythonScriptStep 11 | from azureml.core import RunConfiguration 12 | from azureml.core.authentication import AzureCliAuthentication 13 | from azureml.data.data_reference import DataReference 14 | 15 | # Define run configuration (compute/environment/data references/..) 16 | run_config_name = 'dsvmcluster' 17 | exp_name = "Training_Pipeline" 18 | curr_dir = os.path.dirname(os.path.realpath(__file__)) 19 | output_dir = 'outputs' 20 | output_dir_local = os.path.join(curr_dir, '../../../', 'outputs') 21 | 22 | # Pipeline parameters 23 | run_experiment = True 24 | register_model = False 25 | publish_pipeline = False 26 | 27 | # load workspace config, load default datastore. 28 | ws = Workspace.from_config(auth=AzureCliAuthentication()) 29 | default_ds = ws.get_default_datastore() 30 | 31 | # load run config 32 | run_config = RunConfiguration.load( 33 | path=os.path.join(curr_dir, '../../../', 'aml_config'), 34 | name=run_config_name 35 | ) 36 | 37 | # define training pipeline with one AMLCompute step 38 | trainStep = PythonScriptStep( 39 | script_name="train.py", 40 | name="Model Training", 41 | arguments=[ 42 | '--data-dir', str(default_ds.as_mount()), 43 | '--output-dir', output_dir 44 | ], 45 | inputs=[ 46 | DataReference( 47 | datastore=default_ds, 48 | mode="mount" 49 | ) 50 | ], 51 | outputs=[ 52 | PipelineData( 53 | name="model", 54 | datastore=default_ds, 55 | output_path_on_compute="training" 56 | ) 57 | ], 58 | compute_target=run_config.target, 59 | runconfig=run_config, 60 | source_directory=os.path.join(curr_dir, '../') 61 | ) 62 | 63 | training_pipeline = Pipeline(workspace=ws, steps=[trainStep]) 64 | training_pipeline.validate() 65 | print("Pipeline validation complete") 66 | 67 | # Submit pipeline run 68 | pipeline_run = Experiment(ws, exp_name).submit(training_pipeline) 69 | pipeline_run.wait_for_completion() 70 | -------------------------------------------------------------------------------- /Code/Modeling/score_realtime.py: -------------------------------------------------------------------------------- 1 | """ 2 | Real Time Scoring Service 3 | @TODO 4 | """ 5 | import json 6 | import time 7 | import numpy as np 8 | from azureml.core.model import Model 9 | from sklearn.externals import joblib 10 | 11 | 12 | def init(): 13 | """ 14 | Load model and other dependencies for inferencing 15 | """ 16 | global model 17 | # Print statement for appinsights custom traces: 18 | print("model initialized" + time.strftime("%H:%M:%S")) 19 | 20 | # note here "sklearn_regression_model.pkl" is the name of the 21 | # model registered under the workspace this call should return 22 | # the path to the model.pkl file on the local disk. 
23 | model_path = Model.get_model_path(model_name='model.pkl') 24 | 25 | # deserialize the model file back into a sklearn model 26 | model = joblib.load(model_path) 27 | 28 | 29 | def run(raw_data): 30 | """ 31 | Score new data against model 32 | """ 33 | try: 34 | data = json.loads(raw_data)['data'] 35 | data = np.array(data) 36 | result = model.predict(data) 37 | 38 | # you can return any datatype as long as it is JSON-serializable 39 | return result.tolist() 40 | except Exception as e: 41 | error = str(e) 42 | print(error + time.strftime("%H:%M:%S")) 43 | return error 44 | -------------------------------------------------------------------------------- /Code/Modeling/train.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | from optparse import OptionParser 4 | import sys 5 | from time import time 6 | 7 | from sklearn.datasets import fetch_20newsgroups 8 | from sklearn.feature_extraction.text import TfidfVectorizer 9 | from sklearn.feature_extraction.text import HashingVectorizer 10 | from sklearn.feature_selection import SelectFromModel 11 | from sklearn.linear_model import RidgeClassifier 12 | from sklearn.pipeline import Pipeline 13 | from sklearn.svm import LinearSVC 14 | from sklearn.linear_model import SGDClassifier 15 | from sklearn.linear_model import Perceptron 16 | from sklearn.linear_model import PassiveAggressiveClassifier 17 | from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB 18 | from sklearn.neighbors import KNeighborsClassifier 19 | from sklearn.neighbors import NearestCentroid 20 | from sklearn.ensemble import RandomForestClassifier 21 | from sklearn import metrics 22 | 23 | # Display progress logs on stdout 24 | logging.basicConfig(level=logging.INFO, 25 | format='%(asctime)s %(levelname)s %(message)s') 26 | 27 | op = OptionParser() 28 | op.add_option("--all_categories", 29 | action="store_true", dest="all_categories", 30 | help="Whether to use all categories or not.") 31 | op.add_option("--use_hashing", 32 | action="store_true", 33 | help="Use a hashing vectorizer.") 34 | op.add_option("--n_features", 35 | action="store", type=int, default=2 ** 16, 36 | help="n_features when using the hashing vectorizer.") 37 | 38 | 39 | def is_interactive(): 40 | return not hasattr(sys.modules['__main__'], '__file__') 41 | 42 | 43 | # work-around for Jupyter notebook and IPython console 44 | argv = [] if is_interactive() else sys.argv[1:] 45 | (opts, args) = op.parse_args(argv) 46 | if len(args) > 0: 47 | op.error("this script takes no arguments.") 48 | sys.exit(1) 49 | 50 | if opts.all_categories: 51 | categories = None 52 | else: 53 | categories = [ 54 | 'alt.atheism', 55 | 'talk.religion.misc', 56 | 'comp.graphics', 57 | 'sci.space', 58 | ] 59 | 60 | 61 | print("Loading 20 newsgroups dataset for categories:") 62 | 63 | data_train = fetch_20newsgroups(subset='train', categories=categories, 64 | shuffle=True, random_state=42) 65 | 66 | data_test = fetch_20newsgroups(subset='test', categories=categories, 67 | shuffle=True, random_state=42) 68 | print('data loaded') 69 | 70 | # order of labels in `target_names` can be different from `categories` 71 | target_names = data_train.target_names 72 | 73 | 74 | def size_mb(docs): 75 | return sum(len(s.encode('utf-8')) for s in docs) / 1e6 76 | 77 | 78 | # split a training set and a test set 79 | y_train, y_test = data_train.target, data_test.target 80 | 81 | # Extracting features from the training data using a sparse vectorizer 82 | if 
opts.use_hashing: 83 | vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False, 84 | n_features=opts.n_features) 85 | X_train = vectorizer.transform(data_train.data) 86 | else: 87 | vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, 88 | stop_words='english') 89 | X_train = vectorizer.fit_transform(data_train.data) 90 | 91 | # Extracting features from the test data using the same vectorizer" 92 | X_test = vectorizer.transform(data_test.data) 93 | 94 | # mapping from integer feature name to original token string 95 | if opts.use_hashing: 96 | feature_names = None 97 | else: 98 | feature_names = vectorizer.get_feature_names() 99 | 100 | if feature_names: 101 | feature_names = np.asarray(feature_names) 102 | 103 | 104 | def trim(s): 105 | """Trim string to fit on terminal (assuming 80-column display)""" 106 | return s if len(s) <= 80 else s[:77] + "..." 107 | 108 | 109 | def benchmark(clf, name=""): 110 | """benchmark classifier performance""" 111 | 112 | # train a model 113 | print("\nTraining run with algorithm \n{}".format(clf)) 114 | clf.fit(X_train, y_train) 115 | 116 | # evaluate on test set 117 | pred = clf.predict(X_test) 118 | score = ? 119 | 120 | clf_descr = str(clf).split('(')[0] 121 | print("? %0.3f" % score) 122 | return clf_descr, score 123 | 124 | 125 | # Run benchmark and collect results with multiple classifiers 126 | for clf, name in ( 127 | (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"), 128 | (Perceptron(max_iter=50), "Perceptron"), 129 | (PassiveAggressiveClassifier(max_iter=50), 130 | "Passive-Aggressive"), 131 | (KNeighborsClassifier(n_neighbors=10), "kNN"), 132 | (RandomForestClassifier(), "Random forest")): 133 | # run benchmarking function for each 134 | benchmark(clf, name) 135 | 136 | 137 | # Run with different regularization techniques 138 | for penalty in ["l2", "l1"]: 139 | # Train Liblinear model 140 | name = penalty + "LinearSVC" 141 | benchmark( 142 | clf=LinearSVC( 143 | penalty=penalty, 144 | dual=False, 145 | tol=1e-3 146 | ), 147 | name=penalty + "LinearSVC" 148 | ) 149 | 150 | # Train SGD model 151 | benchmark( 152 | SGDClassifier( 153 | alpha=.0001, 154 | max_iter=50, 155 | penalty=penalty 156 | ), 157 | name=penalty + "SGDClassifier" 158 | ) 159 | 160 | # Train SGD with Elastic Net penalty 161 | benchmark( 162 | SGDClassifier( 163 | alpha=.0001, 164 | max_iter=50, 165 | penalty="elasticnet" 166 | ), 167 | name="Elastic-Net penalty" 168 | ) 169 | 170 | # Train NearestCentroid without threshold 171 | benchmark( 172 | NearestCentroid(), 173 | name="NearestCentroid (aka Rocchio classifier)" 174 | ) 175 | 176 | # Train sparse Naive Bayes classifiers 177 | benchmark( 178 | MultinomialNB(alpha=.01), 179 | name="Naive Bayes MultinomialNB" 180 | ) 181 | 182 | benchmark( 183 | BernoulliNB(alpha=.01), 184 | name="Naive Bayes BernoulliNB" 185 | ) 186 | 187 | benchmark( 188 | ComplementNB(alpha=.1), 189 | name="Naive Bayes ComplementNB" 190 | ) 191 | 192 | # The smaller C, the stronger the regularization. 193 | # The more regularization, the more sparsity. 
194 | benchmark( 195 | Pipeline([ 196 | ('feature_selection', 197 | SelectFromModel( 198 | LinearSVC( 199 | penalty="l1", 200 | dual=False, 201 | tol=1e-3 202 | ) 203 | )), 204 | ('classification', 205 | LinearSVC(penalty="l2")) 206 | ] 207 | ), 208 | name="LinearSVC with L1-based feature selection" 209 | ) -------------------------------------------------------------------------------- /Code/Modeling/train_datasets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from optparse import OptionParser 3 | from time import time 4 | from sklearn.feature_extraction.text import TfidfVectorizer 5 | from sklearn.feature_extraction.text import HashingVectorizer 6 | from sklearn.feature_selection import SelectFromModel 7 | from sklearn.feature_selection import SelectKBest, chi2 8 | from sklearn.linear_model import RidgeClassifier 9 | from sklearn.pipeline import Pipeline 10 | from sklearn.svm import LinearSVC 11 | from sklearn.linear_model import SGDClassifier 12 | from sklearn.linear_model import Perceptron 13 | from sklearn.linear_model import PassiveAggressiveClassifier 14 | from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB 15 | from sklearn.neighbors import KNeighborsClassifier 16 | from sklearn.neighbors import NearestCentroid 17 | from sklearn.ensemble import RandomForestClassifier 18 | from sklearn.utils.extmath import density 19 | from sklearn import metrics 20 | from sklearn.externals import joblib 21 | from azureml.core import Run 22 | 23 | op = OptionParser() 24 | op.add_option("--report", 25 | action="store_true", dest="print_report", 26 | help="Print a detailed classification report.") 27 | op.add_option("--chi2_select", 28 | action="store", type="int", dest="select_chi2", 29 | help="Select some number of features using a chi-squared test") 30 | op.add_option("--confusion_matrix", 31 | action="store_true", dest="print_cm", 32 | help="Print the confusion matrix.") 33 | op.add_option("--top10", 34 | action="store_true", dest="print_top10", 35 | help="Print ten most discriminative terms per class" 36 | " for every classifier.") 37 | op.add_option("--all_categories", 38 | action="store_true", dest="all_categories", 39 | help="Whether to use all categories or not.") 40 | op.add_option("--use_hashing", 41 | action="store_true", 42 | help="Use a hashing vectorizer.") 43 | op.add_option("--n_features", 44 | action="store", type=int, default=2 ** 16, 45 | help="n_features when using the hashing vectorizer.") 46 | op.add_option("--filtered", 47 | action="store_true", 48 | help="Remove newsgroup information that is easily overfit: " 49 | "headers, signatures, and quoting.") 50 | 51 | # Retrieve the run and its context (datasets etc.) 
52 | run = Run.get_context() 53 | 54 | # Load the input datasets from Azure ML 55 | dataset_train = run.input_datasets['train'].to_pandas_dataframe() 56 | dataset_test = run.input_datasets['test'].to_pandas_dataframe() 57 | 58 | # Pre-process df for sklearn 59 | # convert to numpy df 60 | data_train = dataset_train.text.values 61 | data_test = dataset_test.text.values 62 | 63 | # save orginal target names 64 | target_names = data_train.target_names 65 | 66 | # convert label to int 67 | y_train = dataset_train.target.values 68 | y_test = dataset_test.target.values 69 | 70 | # Extracting features from the training data using a sparse vectorizer") 71 | vectorizer = HashingVectorizer( 72 | stop_words='english', 73 | alternate_sign=False, 74 | n_features=op.n_features 75 | ) 76 | 77 | X_train = vectorizer.transform(data_train.data) 78 | 79 | # Extracting features from the test data using the same vectorizer 80 | X_test = vectorizer.transform(data_test.data) 81 | 82 | # mapping from integer feature name to original token string 83 | feature_names = vectorizer.get_feature_names() 84 | 85 | # # Extracting %d best features by a chi-squared test 86 | # ch2 = SelectKBest(chi2, k=op.select_chi2) 87 | # X_train = ch2.fit_transform(X_train, y_train) 88 | # X_test = ch2.transform(X_test) 89 | 90 | # keep selected feature names 91 | # feature_names = [feature_names[i] for i 92 | # in ch2.get_support(indices=True)] 93 | # feature_names = np.asarray(feature_names) 94 | 95 | 96 | def trim(s): 97 | """Trim string to fit on terminal (assuming 80-column display)""" 98 | return s if len(s) <= 80 else s[:77] + "..." 99 | 100 | 101 | def benchmark(clf, name): 102 | print('_' * 80) 103 | print("Training: ") 104 | print(clf) 105 | t0 = time() 106 | clf.fit(X_train, y_train) 107 | train_time = time() - t0 108 | print("train time: %0.3fs" % train_time) 109 | 110 | t0 = time() 111 | pred = clf.predict(X_test) 112 | test_time = time() - t0 113 | print("test time: %0.3fs" % test_time) 114 | score = metrics.accuracy_score(y_test, pred) 115 | 116 | child_run = run.child_run(name=name) 117 | child_run.log("accuracy", float(score)) 118 | model_name = "model" + str(name) + ".pkl" 119 | filename = "outputs/" + model_name 120 | joblib.dump(value=clf, filename=filename) 121 | child_run.upload_file(name=model_name, path_or_stream=filename) 122 | 123 | print("accuracy: %0.3f" % score) 124 | 125 | if hasattr(clf, 'coef_'): 126 | print("dimensionality: %d" % clf.coef_.shape[1]) 127 | print("density: %f" % density(clf.coef_)) 128 | 129 | if op.print_top10 and feature_names is not None: 130 | print("top 10 keywords per class:") 131 | for i, label in enumerate(target_names): 132 | top10 = np.argsort(clf.coef_[i])[-10:] 133 | print(trim("%s: %s" % (label, " ".join(feature_names[top10])))) 134 | print() 135 | 136 | if op.print_report: 137 | print("classification report:") 138 | print(metrics.classification_report(y_test, pred, 139 | target_names=target_names)) 140 | 141 | if op.print_cm: 142 | print("confusion matrix:") 143 | print(metrics.confusion_matrix(y_test, pred)) 144 | 145 | print() 146 | clf_descr = str(clf).split('(')[0] 147 | 148 | child_run.complete() 149 | return clf_descr, score, train_time, test_time 150 | 151 | 152 | results = [] 153 | 154 | for clf, name in ( 155 | (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"), 156 | (Perceptron(max_iter=50), "Perceptron"), 157 | (PassiveAggressiveClassifier(max_iter=50), 158 | "Passive-Aggressive"), 159 | (KNeighborsClassifier(n_neighbors=10), "kNN"), 160 | 
(RandomForestClassifier(), "Random forest")): 161 | print('=' * 80) 162 | print(name) 163 | results.append(benchmark(clf, name)) 164 | 165 | for penalty in ["l2", "l1"]: 166 | print('=' * 80) 167 | print("%s penalty" % penalty.upper()) 168 | # Train Liblinear model 169 | name = penalty + "LinearSVC" 170 | results.append(benchmark(LinearSVC(penalty=penalty, dual=False, 171 | tol=1e-3))) 172 | 173 | # Train SGD model 174 | name = penalty + "SGDClassifier" 175 | results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, 176 | penalty=penalty))) 177 | 178 | # Train SGD with Elastic Net penalty 179 | print('=' * 80) 180 | print("Elastic-Net penalty") 181 | name = "Elastic-Net penalty" 182 | results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, 183 | penalty="elasticnet"))) 184 | 185 | # Train NearestCentroid without threshold 186 | print('=' * 80) 187 | print("NearestCentroid (aka Rocchio classifier)") 188 | name ="NearestCentroid (aka Rocchio classifier)" 189 | results.append(benchmark(NearestCentroid())) 190 | 191 | 192 | # Train sparse Naive Bayes classifiers 193 | print('=' * 80) 194 | print("Naive Bayes") 195 | name = "Naive Bayes MultinomialNB" 196 | results.append(benchmark(MultinomialNB(alpha=.01))) 197 | 198 | name = "Naive Bayes BernoulliNB" 199 | results.append(benchmark(BernoulliNB(alpha=.01))) 200 | 201 | name = "Naive Bayes ComplementNB" 202 | results.append(benchmark(ComplementNB(alpha=.1))) 203 | 204 | print('=' * 80) 205 | print("LinearSVC with L1-based feature selection") 206 | # The smaller C, the stronger the regularization. 207 | # The more regularization, the more sparsity. 208 | name = "LinearSVC with L1-based feature selection" 209 | results.append(benchmark(Pipeline([ 210 | ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, 211 | tol=1e-3))), 212 | ('classification', LinearSVC(penalty="l2"))]))) 213 | -------------------------------------------------------------------------------- /Code/Modeling/train_submit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Training submitter 3 | 4 | Facilitates (remote) training execution through the Azure ML service. 
5 | """ 6 | import os 7 | from azureml.core import Workspace, Experiment 8 | from azureml.train.estimator import Estimator 9 | from azureml.core.authentication import AzureCliAuthentication 10 | 11 | # load Azure ML workspace 12 | workspace = Workspace.from_config(auth=AzureCliAuthentication()) 13 | 14 | # Define Run Configuration 15 | est = Estimator( 16 | entry_script='train.py', 17 | source_directory=os.path.dirname(os.path.realpath(__file__)), 18 | compute_target='local', 19 | conda_packages=[ 20 | 'pip==20.0.2' 21 | ], 22 | pip_packages=[ 23 | 'numpy==1.15.4', 24 | 'pandas==0.23.4', 25 | 'scikit-learn==0.20.1', 26 | 'scipy==1.0.0', 27 | 'matplotlib==3.0.2', 28 | 'utils==0.9.0' 29 | ], 30 | use_docker=False 31 | ) 32 | 33 | # Define the ML experiment 34 | experiment = Experiment(workspace, "newsgroups_train") 35 | 36 | # Submit experiment run, if compute is idle, this may take some time') 37 | run = experiment.submit(est) 38 | 39 | # wait for run completion of the run, while showing the logs 40 | run.wait_for_completion(show_output=True) 41 | -------------------------------------------------------------------------------- /Code/Modeling/train_submit_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Training submitter 3 | 4 | Facilitates (remote) training execution through the Azure ML service. 5 | """ 6 | import os 7 | from azureml.core import Workspace, Experiment 8 | from azureml.train.estimator import Estimator 9 | from azureml.core.authentication import AzureCliAuthentication 10 | 11 | # load Azure ML workspace 12 | workspace = Workspace.from_config(auth=AzureCliAuthentication()) 13 | 14 | # retrieve datasets used for training 15 | dataset_train = Dataset.get_by_name(workspace, name='newsgroups_train') 16 | dataset_test = Dataset.get_by_name(workspace, name='newsgroups_test') 17 | 18 | # Define Run Configuration 19 | est = Estimator( 20 | entry_script='train.py', 21 | source_directory=os.path.dirname(os.path.realpath(__file__)), 22 | compute_target='local', 23 | conda_packages=[ 24 | 'pip==20.0.2' 25 | ], 26 | pip_packages=[ 27 | 'numpy==1.15.4', 28 | 'pandas==0.23.4', 29 | 'scikit-learn==0.20.1', 30 | 'scipy==1.0.0', 31 | 'matplotlib==3.0.2', 32 | 'utils==0.9.0' 33 | ], 34 | use_docker=False, 35 | inputs=[ 36 | dataset_train.as_named_input('train'), 37 | dataset_train.as_named_input('test') 38 | ], 39 | ) 40 | 41 | # Define the ML experiment 42 | experiment = Experiment(workspace, "newsgroups_train") 43 | 44 | # Submit experiment run, if compute is idle, this may take some time') 45 | run = experiment.submit(est) 46 | 47 | # wait for run completion of the run, while showing the logs 48 | run.wait_for_completion(show_output=True) 49 | -------------------------------------------------------------------------------- /Code/Operationalization/Readme.md: -------------------------------------------------------------------------------- 1 | # This folder contains code for model deployment 2 | 3 | You can add detailed description in this markdown related to your specific data science project. 4 | -------------------------------------------------------------------------------- /Code/Operationalization/dashboards/Readme.md: -------------------------------------------------------------------------------- 1 | # This folder contains dashboards e.g. end-user facing or for reporting purposes # 2 | 3 | Use git-lfs for large binary files such as PowerBI reports. 
-------------------------------------------------------------------------------- /Code/Operationalization/monitoring/Readme.md: -------------------------------------------------------------------------------- 1 | # This folder contains scripts for the monitoring (e.g. drift analysis) of deployed models
2 | -------------------------------------------------------------------------------- /Code/Operationalization/monitoring/monitoring_pipeline.py: -------------------------------------------------------------------------------- 1 | """
2 | Model monitoring pipeline
3 |
4 | Runs monitoring script by schedule (e.g. using Azure ML Pipelines and
5 | Azure Data Factory or as an Azure Function)
6 | @TODO read model training data
7 | @TODO read model collected data
8 | """
9 | # Monitor Data Quality
10 |
11 | # Monitor Model Performance
12 |
13 | # Monitor Business KPIs
14 | -------------------------------------------------------------------------------- /Code/Readme.md: -------------------------------------------------------------------------------- 1 | # Code folder for hosting code for a Data Science Project
2 |
3 | This folder hosts all code for a data science project. It has three sub-folders, belonging to three stages of the Data Science Lifecycle:
4 |
5 | 1. Data_Acquisition_and_Understanding
6 | 2. Modeling
7 | 3. Operationalization
8 | -------------------------------------------------------------------------------- /Docs/lab002/Readme.md: -------------------------------------------------------------------------------- 1 | ## Lab 2: running experiments ##
2 |
3 | # Understand the non-Azure / open source ML model code #
4 | We start by understanding the training script. The training script is open source ML model code from https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html. It is an example showing how scikit-learn can be used to classify documents by topic using a bag-of-words approach. The example uses a scipy.sparse matrix to store the features and demonstrates various classifiers that can efficiently handle sparse matrices. The dataset used in this example is the 20 newsgroups dataset; it is automatically downloaded and then cached. The newsgroups dataset contains text documents that are classified into 20 categories.
5 |
6 | 1. Open the train.py document to inspect the code.
7 | The first step in the code is to load the 20 newsgroups dataset. In this example we only use a subset of the categories. Please state the categories we are going to use:
8 |
9 | ...
10 |
11 | The second step is to extract the features from the text. We do this with a sparse vectorizer. We also clean the data a bit. What operation do we apply to the data to clean the text?
12 |
13 | ...
14 |
15 | After we have reshaped our data and made sure the feature names are in the right place, we define the algorithm to fit the model. This step defines the benchmark. We fit the data and make predictions on the test set. To validate our model we need a metric to score it. There are many metrics we can use. Define in the code the metric that you want to use to validate your model and make sure the print statement outputs your metric. (Note: you can define multiple scores if you want. If so, make sure to return these scores.)
16 |
17 | ...
18 |
19 |
20 | The last step is to define the algorithms that we want to fit on our data. In this example we use 15 classification algorithms to fit the data.
We keep track of the metrics of all algorithms, so we can compare their performance and pick the best model. Look at the code and write down the different algorithms that we are going to test.
21 |
22 | ...
23 |
24 | # Run the training locally #
25 | We are now going to run the training script locally. The script will return the different metrics for all algorithms. Inspect the metrics that you specified. Which algorithm performs best?
26 |
27 | ...
28 |
29 | # Run the code via Azure ML #
30 | We are now going to run our code via Azure ML.
31 | We are going to make use of child runs. The experiment will perform a parent run that executes train.py. Within train.py we create child runs. For each of the 15 algorithms we want to create a child run and log the metrics separately. Within the child run we log the performance and the model .pkl file. This way we can easily track and compare our experiments in Azure ML.
32 |
33 | 1. Read the Experiment Tracking documentation
34 |
35 | 2. Read the How to Manage a Run documentation
36 |
37 | 3. Refactor the code to capture run metrics in train.py (a sketch is shown at the end of this lab)
38 |     1. Get the run context
39 |     2. Create a child run
40 |     3. Log the metric in the child run
41 |     4. Upload the .pkl file to the output folder of the child run
42 |     5. Close the child run
43 |
44 | 4. Alter the train_submit.py file
45 |
46 |     1. Load the Azure ML workspace from the config file
47 |     2. Create an estimator to define the run configuration
48 |     3. Define the ML experiment
49 |     4. Submit the experiment
50 |
51 | 5. Go to the portal to inspect the run history
52 |
53 |
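One possible shape for the refactored benchmark function is sketched below. This is illustrative only: it assumes the `X_train`, `X_test`, `y_train` and `y_test` variables already defined in train.py, uses accuracy as an example metric, and mirrors the pattern used in Code/Modeling/train_datasets.py.

    from azureml.core import Run
    from sklearn import metrics
    from sklearn.externals import joblib

    run = Run.get_context()                        # 1. get the run context

    def benchmark(clf, name=""):
        child_run = run.child_run(name=name)       # 2. create a child run
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        score = metrics.accuracy_score(y_test, pred)
        child_run.log("accuracy", float(score))    # 3. log the metric in the child run
        model_name = "model" + name + ".pkl"
        filename = "outputs/" + model_name
        joblib.dump(value=clf, filename=filename)
        child_run.upload_file(                     # 4. upload the .pkl file
            name=model_name, path_or_stream=filename)
        child_run.complete()                       # 5. close the child run
        return str(clf).split('(')[0], score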
-------------------------------------------------------------------------------- /Docs/lab05/Readme.md: -------------------------------------------------------------------------------- 1 | ## Lab 5: hypertune capabilities ##
2 |
3 | # Understand the goal #
4 | In this lab we are going to tune the hyperparameters of a random forest classifier, in order to find the model that best fits our data and gives the best predictive performance. In Azure ML we can use a special run type that is optimized for hyperparameter tuning.
5 |
6 | 1. Read the Hyperparameter tuning documentation
7 |
8 | # Define the hyperparameters #
9 | Before we start creating the hyperparameter run, we need to know and understand the parameters that we can tune for the random forest classifier.
10 |
11 | 2. Search for the sklearn RandomForestClassifier and identify the parameters that we can tune. Write them down below
12 |
13 | ...
14 |
15 | # Alter the hypertrain script #
16 | The hypertrain script is similar to the train script, but instead of running 15 different algorithms, we only run the RandomForestClassifier. As we have seen in the previous step, the RandomForestClassifier has a lot of parameters that we can tune. In this example, we will only tune max_depth, n_estimators, criterion and min_samples_split. (Note: if you want to add more hyperparameters you can do that in the same way as we are adding these parameters.)
17 |
18 | 3. Define the parameters as input arguments in the OptionParser(), define the input type and set the default value to the default provided in the documentation.
19 |
20 |     op.add_option("--max_depth",
21 |                   type=int, default=10)
22 |     op.add_option("--n_estimators",
23 |                   type=int, default=100)
24 |     op.add_option("--criterion",
25 |                   type=str,
26 |                   default='gini')
27 |     op.add_option("--min_samples_split",
28 |                   type=int,
29 |                   default=2)
30 |
31 | 4. Create a dict of hyperparameters from the input flags.
32 |
33 |     hyperparameters = {
34 |         "max_depth": opts.max_depth,
35 |         "n_estimators": opts.n_estimators,
36 |         "criterion": opts.criterion,
37 |         "min_samples_split": opts.min_samples_split
38 |     }
39 |
40 | 5. Select the training hyperparameters as input variables
41 |
42 |     max_depth = hyperparameters["max_depth"]
43 |     n_estimators = hyperparameters["n_estimators"]
44 |     criterion = hyperparameters["criterion"]
45 |     min_samples_split = hyperparameters["min_samples_split"]
46 |
47 | 6. Add the hyperparameters as input options to RandomForestClassifier()
48 |
49 | 7. Add metric logging to the script
50 |
51 | 8. Save the .pkl file
52 |
53 | # Understand differences in run configuration #
54 | The run configuration for hypertuning is slightly different from the standard run configuration. Azure ML has a dedicated package, azureml.train.hyperdrive, for creating a hyperparameter tuning run. From this package, we are going to make use of HyperDriveConfig to create the run configuration.
55 |
56 | 9. Create the estimator. (Note: the estimator for the hypertrain run is the same as for a normal run, but we are now running the script hypertrain.py)
57 |
58 | 10. Define the parameter sampling space and the search algorithm
59 | There are primarily three different ways to perform parameter sampling: Random, Grid and ....
60 | In this example we will make use of RandomParameterSampling.
61 | For every hyperparameter we can tune, we need to specify the search space. This search space can be continuous and defined by a uniform or normal distribution, or can be discrete and defined by a choice function.
62 |
63 | 11. Define the HyperDrive run configuration
64 | Make sure to use the parameter sampling as an input of the config file and to specify the primary metric and its goal.
65 |
66 | # Submit run on AML compute #
67 |
68 | # View results in the portal #
69 |
-------------------------------------------------------------------------------- /LICENSE-CODE.TXT: -------------------------------------------------------------------------------- 1 | The MIT License (MIT)
2 | Copyright (c) Microsoft Corporation. All rights reserved.
3 |
4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
5 | associated documentation files (the "Software"), to deal in the Software without restriction,
6 | including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 | and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
8 | subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all copies or substantial
11 | portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
14 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
15 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
16 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
17 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------------- /LICENSE.TXT: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/MLOps-TDSP-Template/011f5418bc3be25570a84ff8c58dca94d4b35a45/LICENSE.TXT -------------------------------------------------------------------------------- /NOTICE.TXT: -------------------------------------------------------------------------------- 1 | ##Legal Notices 2 | Microsoft and any contributors grant you a license to the Microsoft documentation and other content 3 | in this repository under the [Creative Commons Attribution 4.0 International Public License](https://creativecommons.org/licenses/by/4.0/legalcode), 4 | see the LICENSE file, and grant you a license to any code in the repository under the [MIT License](https://opensource.org/licenses/MIT), see the 5 | LICENSE-CODE file. 6 | 7 | Microsoft, Windows, Microsoft Azure and/or other Microsoft products and services referenced in the documentation 8 | may be either trademarks or registered trademarks of Microsoft in the United States and/or other countries. 9 | The licenses for this project do not grant you rights to use any Microsoft names, logos, or trademarks. 10 | Microsoft's general trademark guidelines can be found at http://go.microsoft.com/fwlink/?LinkID=254653. 11 | 12 | Privacy information can be found at https://privacy.microsoft.com/ 13 | 14 | Microsoft and any contributors reserve all others rights, whether under their respective copyrights, patents, 15 | or trademarks, whether by implication, estoppel or otherwise. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MLOps Quickstart Template # 2 | 3 | This repo provides a quickstarter template as a fork on TDSP (https://github.com/Azure/Azure-TDSP-ProjectTemplate), extending the template with a suggested structure for operationalization using Azure. The current code base includes ARM templates as IaC for resource deployment, template build and release pipelines to enable ML model CI/CD, template code for working with Azure ML. 4 | 5 | ## How to get started ## 6 | 7 | * Clone this repo 8 | * Make sure you have an Azure Subscription set up. 9 | * Make sure you have an Azure DevOps instance set up. 10 | * Import the build and release definitions ('Code'>'Operationalization'>'build_and_release') into Azure DevOps pipelines. 11 | * Update the build and release definitions to use your credentials i.e. Azure subscription. 12 | * Create an initial commit. 13 | * If everything is set up correctly, Azure DevOps will provision your Azure Resources as triggered by the CI. 14 | * Use the Azure CLI ML Extension (`az ml project attach` command) or Azure ML SDK to configure your local workspace to use the created Azure ML workspace. 15 | * Run `Code/Modeling/train_submit` to run your first AzureML experiment on remote compute. 
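
For the workspace-configuration step above, one option with the Azure ML SDK is to write a local `config.json` once. The sketch below is illustrative only; the workspace and resource group names are taken from `.azureml/config.json` in this repository, and the subscription id is a placeholder you must replace.

    from azureml.core import Workspace
    from azureml.core.authentication import AzureCliAuthentication

    ws = Workspace.get(
        name="azuremlworkshopws",
        subscription_id="<your-subscription-id>",
        resource_group="azuremlworkshoprgp",
        auth=AzureCliAuthentication()
    )
    # writes .azureml/config.json, later picked up by Workspace.from_config()
    ws.write_config(path=".azureml")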
16 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /Sample_Data/For_Modeling/modelling.md: -------------------------------------------------------------------------------- 1 | # List of feature sets 2 | | Feature Set Name | Link to the Full Feature Set | Full Feature Set Size (MB) | Link to Report | 3 | | ---:| ---: | ---: | ---: | 4 | | Feature Set 1 | [link](link/to/feature/set1) | 2,000 | [Feature Set 1 Report](link/to/report1)| 5 | | Feature Set 2 | [link](link/to/feature/set2) | 300 | [Feature Set 2 Report](link/to/report2)| 6 | 7 | If the link to the full dataset does not apply, provide some information on how to access the full dataset. 8 | 9 | If the data stays in an Azure file storage, please provide the link to the text file with the information of the file storage that has been checked in to the git repository. -------------------------------------------------------------------------------- /Sample_Data/Processed/processed.md: -------------------------------------------------------------------------------- 1 | ## List of Processed Datasets 2 | 3 | 4 | | Processed Dataset Name | Link to the Full Processed Dataset | Full Processed Dataset Size (MB) | Link to Report | 5 | | ---:| ---: | ---: | ---: | 6 | | Processed Dataset 1 | [link](link/to/processed/dataset1) | 2,000 | [Processed Dataset 1 Report](link/to/report1)| 7 | | Processed Dataset 2 | [link](link/to/processed/dataset2) | 300 | [Processed Dataset 2 Report](link/to/report2)| 8 | 9 | 10 | If the link to the full dataset does not apply, provide some information on how to access the full dataset. 11 | 12 | If the data stays in an Azure file storage, please provide the link to the text file with the information of the file storage that has been checked in to the git repository. -------------------------------------------------------------------------------- /Sample_Data/README.md: -------------------------------------------------------------------------------- 1 | The **Sample_Data** directory in the project git repository is the place to store **SAMPLE** datasets which should be of small size, **NOT** the entire datasets. If your client does not allow you to store even the sample data on the github repository, if possible, store a sample dataset with all confidential fields hashed. If still not allowed, please do not store sample data here. But, please still fill in the table in each sub-directory. 2 | 3 | The small sample datasets can be used to make your data preprocessing, feature engineering, or modeling scripts runnable. It can be helpful to quickly run the scripts that process or model the data, and understand what the scripts are doing. 4 | 5 | In each directory, there is a markdown file, which lists all datasets in each directory. Please provide the link to the full dataset in case one wants to access the full dataset. 
6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Sample_Data/Raw/rawData.md: -------------------------------------------------------------------------------- 1 | ## List of Raw Datasets 2 | 3 | 4 | | Raw Dataset Name | Link to the Full Dataset | Full Dataset Size (MB) | Link to Report | 5 | | ---:| ---: | ---: | ---: | 6 | | Raw Dataset 1 | [link](link/to/full/dataset1) | 2,000 | [Raw Dataset 1 Report](link/to/report1)| 7 | | Raw Dataset 2 | [link](link/to/full/dataset2) | 300 | [Raw Dataset 2 Report](link/to/report2)| 8 | 9 | If the link to the full dataset does not apply, provide some information on how to access the full dataset. 10 | 11 | If the data resides in Azure file storage, please provide a link to the text file, checked in to the git repository, that describes that file storage. 12 | 13 | -------------------------------------------------------------------------------- /conda_dependencies.yml: -------------------------------------------------------------------------------- 1 | name: project_environment 2 | dependencies: 3 | - python=3.6.2 4 | - pip: 5 | - numpy==1.15.4 6 | - pandas==0.23.4 7 | - scikit-learn==0.20.1 8 | - scipy==1.0.0 9 | - matplotlib==3.0.2 10 | - utils==0.9.0 11 | # Required packages for AzureML execution, history, and data preparation. 12 | - azureml-sdk==1.0.85 13 | - azureml-defaults==1.0.85 14 | - azure-cli==2.0.58 15 | # Dev Tools 16 | - setuptools 17 | - flake8 18 | - flake8_formatter_junit_xml 19 | - pytest 20 | -------------------------------------------------------------------------------- /infrastructure/README.md: -------------------------------------------------------------------------------- 1 | # Infrastructure as Code 2 | 3 | This folder contains examples of how to bootstrap your machine learning workflow. 4 | Azure Resource Manager (ARM) templates & Azure ML CLI commands can easily be used to bootstrap and provision workspaces for your data scientists before they begin data preparation & model training. 5 | 6 | * **[ARM-Templates](arm-templates)** contains infrastructure-as-code templates and parameter files for two sample environments (dev + test). The use of ARM templates gives you the most flexibility in customizing your Azure resources. 7 | * **[Scripts](scripts)** contains Azure CLI scripts for resource deployment. The use of CLI commands is the leanest way to deploy resources to Azure. 8 | * **[Build-and-Release](build-and-release)** contains pipeline definitions for Azure DevOps to automate infrastructure roll out. It also includes a PowerShell script that can be used for test deployments of the infrastructure resources. 9 | 10 | ## Automated roll out of infrastructure 11 | 12 | In this section you will learn how to use [Azure Pipelines](https://azure.microsoft.com/en-us/services/devops/pipelines/) for the automated deployment of infrastructure. This way of working enables you to incrementally deploy changes to your resources, stage the changes over different environments, and build confidence as your system grows more complex. 13 | 14 | ### Getting started 15 | 16 | Complete the steps below to set up your pipeline for infrastructure roll out. 17 | 18 | * Navigate to [Azure DevOps](http://dev.azure.com/) and create a new organization and project. You can also re-use an existing organization and/or project.
19 | * Create a new [service connection](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml) in Azure DevOps of the Azure Resource Manager connection type. Azure DevOps will authenticate using this connection to make deployments to your Azure subscription. 20 | * In [deploy-infra.yml](build-and-release/deploy-infra.yml) replace `` with the name of the service connection that you created in the previous step. 21 | * Some Azure resources require you to use globally unique names across Azure. This holds, for example, for storage account resources. Adapt the resource names in the ARM parameter files so that they are globally unique. Note that you should also update the parameter files for the ML workspace and ML compute resources once you update the names of the underlying resources. 22 | * Make a test deployment using the provided PowerShell script `deploy-infra.ps1`. 23 | * Set up a new pipeline in Azure DevOps with the option to re-use an existing template. Point to the pipeline definition [deploy-infra.yml](build-and-release/deploy-infra.yml) in your repository. 24 | * Run your pipeline from Azure DevOps. On completion, you should see a result like the one below. 25 | ![An example of a pipeline for Infrastructure roll out](infra_stages.png) 26 | 27 | ### Best practices on customizing the templates for your environment and team 28 | 29 | * Many teams already have existing resources in their Azure tenant, e.g. for Key Vault and Application Insights. These resources can be re-used by Azure Machine Learning. Simply point to these resources in the [Machine Learning Workspace template](arm-templates/mlworkspace/template.json). For ease of modification, we have provided separate templates for each of the resources in this repository. 30 | * In most situations data already resides on existing storage in Azure. The [Azure CLI ML Extension](https://docs.microsoft.com/en-us/azure/machine-learning/reference-azure-machine-learning-cli) allows for a lean way to add storage as a [Datastore](https://docs.microsoft.com/en-us/azure/machine-learning/concept-data) in Azure Machine Learning. The [Azure CLI task](https://docs.microsoft.com/en-us/azure/devops/pipelines/tasks/deploy/azure-cli?view=azure-devops) in Azure DevOps can help you to automate the datastore attachment process as part of the infrastructure roll out (a sketch of the equivalent step from the Python SDK is shown after this list). 31 | * Many teams choose to deploy multiple environments to work with, for example DEV, INT and PROD. In this way infrastructure can be rolled out in a phased way and with more confidence as your system becomes more complex. 32 | * As you roll out additional infrastructure resources, it becomes valuable to stage changes across the different environments. You could consider running a set of integration or component tests before rolling out to PRD. 33 | * It is a sound practice to ensure that roll outs of changes to PRD can only originate from the master branch. [Conditions](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/conditions?view=azure-devops&tabs=yaml) in Azure Pipelines can help you set controls like these. 34 | * One could specify a security group of users whose [approval](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/approvals?view=azure-devops&tabs=check-pass#approvals) is required to make roll outs to specific environments. 35 | * It is important to note that in the MLOps way of working, we make a separation of concerns between the roll out of infrastructure and the roll out of ML artifacts. Hence the two are rolled out at different moments and with different automation pipelines. 36 | * Multiple additional security controls (virtual network rules, role-based access control and custom identities) can be applied to the Azure resources in this repository. Controls can be added directly in the ARM templates. Consult the [documentation](https://docs.microsoft.com/en-us/azure/templates/) on Azure Resource Manager to find the possible modifications for each Azure resource. As an example of such modifications, this repository contains a [template](arm-templates/mlcompute/template-vnet.json) for Azure ML compute that adds an SSH user and virtual network controls to the managed compute virtual machines.
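As a complement to the datastore bullet above, here is a minimal sketch of registering existing blob storage as a datastore from the Azure ML Python SDK instead of the CLI. The datastore name, container, storage account, and key below are hypothetical placeholders, and the snippet assumes a workspace config file (such as `.azureml/config.json`) is present:

```python
# Minimal sketch: register an existing blob container as an Azure ML datastore.
# Assumes azureml-core is installed and a workspace config file is available;
# all names and the account key are placeholders for illustration only.
from azureml.core import Workspace
from azureml.core.datastore import Datastore

ws = Workspace.from_config()

datastore = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name="training_data",       # hypothetical datastore name
    container_name="datasets",            # hypothetical container name
    account_name="mlopssadatadev",        # hypothetical storage account
    account_key="<storage-account-key>",  # pass in from a secret store, not source control
)
print(datastore.name, datastore.container_name)
```

Running a step like this as part of the deployment pipeline keeps datastore attachment in step with the rest of the infrastructure roll out.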
-------------------------------------------------------------------------------- /infrastructure/arm-templates/appinsights/parameters.dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "appInsightsName": { 6 | "value": "mlops-ain-dev" 7 | }, 8 | "regionId": { 9 | "value": "westeurope" 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/appinsights/parameters.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "appInsightsName": { 6 | "value": "mlops-ain-test" 7 | }, 8 | "regionId": { 9 | "value": "westeurope" 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/appinsights/template.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://schema.management.azure.com/schemas/2014-04-01-preview/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "appInsightsName": { 6 | "type": "string" 7 | }, 8 | "regionId": { 9 | "type": "string" 10 | } 11 | }, 12 | "resources": [ 13 | { 14 | "type": "Microsoft.Insights/components", 15 | "location": "[parameters('regionId')]", 16 | "name": "[parameters('appInsightsName')]", 17 | "apiVersion": "2015-05-01", 18 | "kind": "web", 19 | "properties": { 20 | "Application_Type": "web" 21 | } 22 | } 23 | ] 24 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/containerregistry/parameters.dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "crname": { 6 | "value": "mlopscrdev" 7 | }, 8 | "location": { 9 | "value": "westeurope" 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/containerregistry/parameters.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema":
"https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "crname": { 6 | "value": "mlopscrtest" 7 | }, 8 | "location": { 9 | "value": "westeurope" 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/containerregistry/template.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "crname": { 6 | "type": "string" 7 | }, 8 | "location": { 9 | "type": "string" 10 | } 11 | }, 12 | "variables": {}, 13 | "resources": [ 14 | { 15 | "type": "Microsoft.ContainerRegistry/registries", 16 | "sku": { 17 | "name": "Basic", 18 | "tier": "Basic" 19 | }, 20 | "name": "[parameters('crname')]", 21 | "apiVersion": "2017-10-01", 22 | "location": "[parameters('location')]", 23 | "tags": {}, 24 | "scale": null, 25 | "properties": { 26 | "adminUserEnabled": true 27 | } 28 | } 29 | ] 30 | } 31 | -------------------------------------------------------------------------------- /infrastructure/arm-templates/keyvault/parameters.dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "kvname": { 6 | "value": "mlops-kv-dev" 7 | }, 8 | "location": { 9 | "value": "westeurope" 10 | }, 11 | "createMode": { 12 | "value": "default" 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/keyvault/parameters.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "kvname": { 6 | "value": "mlops-kv-test" 7 | }, 8 | "location": { 9 | "value": "westeurope" 10 | }, 11 | "createMode": { 12 | "value": "default" 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/keyvault/template.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "kvname": { 6 | "type": "string" 7 | }, 8 | "location": { 9 | "type": "string" 10 | }, 11 | "createMode": { 12 | "type": "string" 13 | } 14 | }, 15 | "variables": { 16 | }, 17 | "resources": [ 18 | { 19 | "type": "Microsoft.KeyVault/vaults", 20 | "name": "[parameters('kvname')]", 21 | "apiVersion": "2018-02-14", 22 | "location": "[parameters('location')]", 23 | "tags": {}, 24 | "scale": null, 25 | "properties": { 26 | "sku": { 27 | "family": "A", 28 | "name": "standard" 29 | }, 30 | "tenantId": "[subscription().tenantId]", 31 | "createMode": "[parameters('createMode')]", 32 | "enabledForTemplateDeployment": true, 33 | "accessPolicies": [] 34 | } 35 | } 36 | ] 37 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/mlcompute/parameters-vnet.dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": 
"https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "workspaceName": { 6 | "value": "mlops-mls-dev" 7 | }, 8 | "clusterName": { 9 | "value": "cpu-compute" 10 | }, 11 | "vmSize": { 12 | "value": "STANDARD_D3_V2" 13 | }, 14 | "minNodeCount": { 15 | "value": 0 16 | }, 17 | "maxNodeCount": { 18 | "value": 3 19 | }, 20 | "scaleDownTime": { 21 | "value": "PT15M" 22 | }, 23 | "subnetId": { 24 | "value": "/subscriptions/xxxx/resourceGroups/yyyy/providers/Microsoft.Network/virtualNetworks/zzzz/subnets/ssss" 25 | }, 26 | "adminUserName": { 27 | "value": "" 28 | }, 29 | "adminUserPassword": { 30 | "value": "" 31 | } 32 | } 33 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/mlcompute/parameters-vnet.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "workspaceName": { 6 | "value": "mlops-mls-test" 7 | }, 8 | "clusterName": { 9 | "value": "cpu-compute" 10 | }, 11 | "vmSize": { 12 | "value": "STANDARD_D3_V2" 13 | }, 14 | "minNodeCount": { 15 | "value": 0 16 | }, 17 | "maxNodeCount": { 18 | "value": 3 19 | }, 20 | "scaleDownTime": { 21 | "value": "PT15M" 22 | }, 23 | "subnetId": { 24 | "value": "/subscriptions/xxxx/resourceGroups/yyyy/providers/Microsoft.Network/virtualNetworks/zzzz/subnets/ssss" 25 | }, 26 | "adminUserName": { 27 | "value": "" 28 | }, 29 | "adminUserPassword": { 30 | "value": "" 31 | } 32 | } 33 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/mlcompute/parameters.dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "workspaceName": { 6 | "value": "mlops-mls-dev" 7 | }, 8 | "clusterName": { 9 | "value": "cpu-compute" 10 | }, 11 | "vmSize": { 12 | "value": "STANDARD_D3_V2" 13 | }, 14 | "minNodeCount": { 15 | "value": 0 16 | }, 17 | "maxNodeCount": { 18 | "value": 3 19 | }, 20 | "scaleDownTime": { 21 | "value": "PT15M" 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/mlcompute/parameters.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "workspaceName": { 6 | "value": "mlops-mls-test" 7 | }, 8 | "clusterName": { 9 | "value": "cpu-compute" 10 | }, 11 | "vmSize": { 12 | "value": "STANDARD_D3_V2" 13 | }, 14 | "minNodeCount": { 15 | "value": 0 16 | }, 17 | "maxNodeCount": { 18 | "value": 3 19 | }, 20 | "scaleDownTime": { 21 | "value": "PT15M" 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/mlcompute/template-vnet.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "workspaceName": { 6 | "type": "string" 7 | }, 8 | 
"clusterName": { 9 | "type": "string" 10 | }, 11 | "vmSize": { 12 | "type": "string" 13 | }, 14 | "minNodeCount": { 15 | "type": "int" 16 | }, 17 | "maxNodeCount": { 18 | "type": "int" 19 | }, 20 | "scaleDownTime": { 21 | "type": "string" 22 | }, 23 | "subnetId": { 24 | "type": "string" 25 | }, 26 | "adminUserName": { 27 | "type": "string" 28 | }, 29 | "adminUserPassword": { 30 | "type": "string" 31 | } 32 | }, 33 | "variables": {}, 34 | "resources": [ 35 | { 36 | "type": "Microsoft.MachineLearningServices/workspaces/computes", 37 | "name": "[concat(parameters('workspaceName'), '/', parameters('clusterName'))]", 38 | "apiVersion": "2018-11-19", 39 | "location" : "[resourceGroup().location]", 40 | "properties": { 41 | "computeType": "AmlCompute", 42 | "computeLocation" : "[resourceGroup().location]", 43 | "properties": 44 | { 45 | "scaleSettings": 46 | { 47 | "minNodeCount" : "[parameters('minNodeCount')]", 48 | "maxNodeCount" : "[parameters('maxNodeCount')]", 49 | "nodeIdleTimeBeforeScaleDown": "[parameters('scaleDownTime')]" 50 | }, 51 | "vmPriority": "Dedicated", 52 | "vmSize" : "[parameters('vmSize')]", 53 | "userAccountCredentials" : 54 | { 55 | "adminUserName" : "[parameters('adminUserName')]", 56 | "adminUserPassword" : "[parameters('adminUserPassword')]" 57 | }, 58 | "subnet" : 59 | { 60 | "id" : "[parameters('subnetId')]" 61 | } 62 | } 63 | } 64 | } 65 | ] 66 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/mlcompute/template.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "workspaceName": { 6 | "type": "string" 7 | }, 8 | "clusterName": { 9 | "type": "string" 10 | }, 11 | "vmSize": { 12 | "type": "string" 13 | }, 14 | "minNodeCount": { 15 | "type": "int" 16 | }, 17 | "maxNodeCount": { 18 | "type": "int" 19 | }, 20 | "scaleDownTime": { 21 | "type": "string" 22 | } 23 | }, 24 | "variables": {}, 25 | "resources": [ 26 | { 27 | "type": "Microsoft.MachineLearningServices/workspaces/computes", 28 | "name": "[concat(parameters('workspaceName'), '/', parameters('clusterName'))]", 29 | "apiVersion": "2018-11-19", 30 | "location" : "[resourceGroup().location]", 31 | "properties": { 32 | "computeType": "AmlCompute", 33 | "computeLocation" : "[resourceGroup().location]", 34 | "properties": 35 | { 36 | "scaleSettings": 37 | { 38 | "minNodeCount" : "[parameters('minNodeCount')]", 39 | "maxNodeCount" : "[parameters('maxNodeCount')]", 40 | "nodeIdleTimeBeforeScaleDown": "[parameters('scaleDownTime')]" 41 | }, 42 | "vmPriority": "Dedicated", 43 | "vmSize" : "[parameters('vmSize')]" 44 | } 45 | } 46 | } 47 | ] 48 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/mlworkspace/parameters.dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "workspaceName": { 6 | "value": "mlops-mls-dev" 7 | }, 8 | "keyVaultName": { 9 | "value": "mlops-kv-dev" 10 | }, 11 | "applicationInsightsName": { 12 | "value": "mlops-ain-dev" 13 | }, 14 | "containerRegistryName": { 15 | "value": "mlopscrdev" 16 | }, 17 | "storageAccountName": { 18 | "value": "mlopssadev" 19 | } 20 | } 21 | } 
-------------------------------------------------------------------------------- /infrastructure/arm-templates/mlworkspace/parameters.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "workspaceName": { 6 | "value": "mlops-mls-test" 7 | }, 8 | "keyVaultName": { 9 | "value": "mlops-kv-test" 10 | }, 11 | "applicationInsightsName": { 12 | "value": "mlops-ain-test" 13 | }, 14 | "containerRegistryName": { 15 | "value": "mlopscrtest" 16 | }, 17 | "storageAccountName": { 18 | "value": "mlopssatest" 19 | } 20 | } 21 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/mlworkspace/template.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "workspaceName": { 6 | "type": "string" 7 | }, 8 | "keyVaultName": { 9 | "type": "string" 10 | }, 11 | "applicationInsightsName": { 12 | "type": "string" 13 | }, 14 | "containerRegistryName": { 15 | "type": "string" 16 | }, 17 | "storageAccountName": { 18 | "type": "string" 19 | } 20 | }, 21 | "resources": [ 22 | { 23 | "name": "[parameters('workspaceName')]", 24 | "type": "Microsoft.MachineLearningServices/workspaces", 25 | "apiVersion": "2018-11-19", 26 | "location": "[resourceGroup().location]", 27 | "identity": { 28 | "type": "systemAssigned" 29 | }, 30 | "properties": { 31 | "keyVault": "[resourceId('Microsoft.KeyVault/vaults', parameters('keyVaultName'))]", 32 | "applicationInsights": "[resourceId('Microsoft.Insights/components', parameters('applicationInsightsName'))]", 33 | "containerRegistry": "[resourceId('Microsoft.ContainerRegistry/registries', parameters('containerRegistryName'))]", 34 | "storageAccount": "[resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccountName'))]" 35 | } 36 | } 37 | ] 38 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/storage/parameters.dev.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "saname": { 6 | "value": "mlopssadev" 7 | }, 8 | "location": { 9 | "value": "westeurope" 10 | }, 11 | "accountType": { 12 | "value": "Standard_RAGRS" 13 | }, 14 | "kind": { 15 | "value": "StorageV2" 16 | }, 17 | "accessTier": { 18 | "value": "Hot" 19 | } 20 | } 21 | } -------------------------------------------------------------------------------- /infrastructure/arm-templates/storage/parameters.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "saname": { 6 | "value": "mlopssatest" 7 | }, 8 | "location": { 9 | "value": "westeurope" 10 | }, 11 | "accountType": { 12 | "value": "Standard_RAGRS" 13 | }, 14 | "kind": { 15 | "value": "StorageV2" 16 | }, 17 | "accessTier": { 18 | "value": "Hot" 19 | } 20 | } 21 | } -------------------------------------------------------------------------------- 
/infrastructure/arm-templates/storage/template.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "saname": { 6 | "type": "string" 7 | }, 8 | "location": { 9 | "type": "string" 10 | }, 11 | "accountType": { 12 | "type": "string" 13 | }, 14 | "kind": { 15 | "type": "string" 16 | }, 17 | "accessTier": { 18 | "type": "string" 19 | } 20 | }, 21 | "variables": {}, 22 | "resources": [ 23 | { 24 | "type": "Microsoft.Storage/storageAccounts", 25 | "sku": { 26 | "name": "[parameters('accountType')]" 27 | }, 28 | "kind": "[parameters('kind')]", 29 | "name": "[parameters('saname')]", 30 | "apiVersion": "2018-07-01", 31 | "location": "[parameters('location')]", 32 | "properties": { 33 | "accessTier": "[parameters('accessTier')]", 34 | "supportsHttpsTrafficOnly": true 35 | } 36 | } 37 | ] 38 | } -------------------------------------------------------------------------------- /infrastructure/build-and-release/deploy-infra.ps1: -------------------------------------------------------------------------------- 1 | # Deployment script for machine learning resources 2 | # Run locally to debug changes in the resource configuration 3 | # Use `deploy-infra.yml` for automation of deployments. 4 | 5 | # Prompt users for resource group and location 6 | $resourceGroupName = Read-Host -Prompt "Provide a resource group name" 7 | $location = Read-Host -Prompt "Provide a datacenter location (e.g. westeurope)" 8 | 9 | # Create a Resource Group 10 | New-AzResourceGroup -Name $resourceGroupName -Location $location 11 | 12 | # Deploy Storage Account 13 | New-AzResourceGroupDeployment -ResourceGroupName $resourceGroupName ` 14 | -TemplateFile $PSScriptRoot/../arm-templates/storage/template.json ` 15 | -TemplateParameterFile $PSScriptRoot/../arm-templates/storage/parameters.dev.json 16 | 17 | # Deploy Container Registry 18 | New-AzResourceGroupDeployment -ResourceGroupName $resourceGroupName ` 19 | -TemplateFile $PSScriptRoot/../arm-templates/containerregistry/template.json ` 20 | -TemplateParameterFile $PSScriptRoot/../arm-templates/containerregistry/parameters.dev.json 21 | 22 | # Deploy Application Insights 23 | New-AzResourceGroupDeployment -ResourceGroupName $resourceGroupName ` 24 | -TemplateFile $PSScriptRoot/../arm-templates/appinsights/template.json ` 25 | -TemplateParameterFile $PSScriptRoot/../arm-templates/appinsights/parameters.dev.json 26 | 27 | # Deploy Key Vault 28 | New-AzResourceGroupDeployment -ResourceGroupName $resourceGroupName ` 29 | -TemplateFile $PSScriptRoot/../arm-templates/keyvault/template.json ` 30 | -TemplateParameterFile $PSScriptRoot/../arm-templates/keyvault/parameters.dev.json 31 | 32 | # Deploy Workspace 33 | New-AzResourceGroupDeployment -ResourceGroupName $resourceGroupName ` 34 | -TemplateFile $PSScriptRoot/../arm-templates/mlworkspace/template.json ` 35 | -TemplateParameterFile $PSScriptRoot/../arm-templates/mlworkspace/parameters.dev.json 36 | 37 | # Deploy Compute 38 | New-AzResourceGroupDeployment -ResourceGroupName $resourceGroupName ` 39 | -TemplateFile $PSScriptRoot/../arm-templates/mlcompute/template.json ` 40 | -TemplateParameterFile $PSScriptRoot/../arm-templates/mlcompute/parameters.dev.json 41 | -------------------------------------------------------------------------------- /infrastructure/build-and-release/deploy-infra.template.yml:
-------------------------------------------------------------------------------- 1 | # Azure Pipeline Template for ML Workspace Resources Deployment 2 | parameters: 3 | - name: environment 4 | type: string 5 | - name: serviceConnection 6 | type: string 7 | 8 | jobs: 9 | - deployment: DeployMLResources 10 | displayName: Deploy ML Resources 11 | pool: 12 | vmImage: ubuntu-16.04 13 | environment: ${{ parameters.environment }} 14 | variables: 15 | - name: resourceGroupName 16 | value: mlopsexample-${{ parameters.environment }} 17 | - name: resourceGroupLocation 18 | value: westeurope 19 | strategy: 20 | runOnce: 21 | deploy: 22 | steps: 23 | - download: current 24 | artifact: infratemplates 25 | - script: ls 26 | displayName: 'List dirs' 27 | 28 | - task: AzureResourceGroupDeployment@2 29 | displayName: 'Deploy Storage Account for AML' 30 | inputs: 31 | azureSubscription: ${{ parameters.serviceConnection }} 32 | resourceGroupName: $(resourceGroupName) 33 | location: $(resourceGroupLocation) 34 | csmFile: '$(Pipeline.Workspace)/infratemplates/storage/template.json' 35 | csmParametersFile: '$(Pipeline.Workspace)/infratemplates/storage/parameters.${{ parameters.environment }}.json' 36 | 37 | # Optional - Add a second storage account to host data for machine learning 38 | # - task: AzureResourceGroupDeployment@2 39 | # displayName: 'Deploy Storage Account for Data' 40 | # inputs: 41 | # azureSubscription: ${{ parameters.serviceConnection }} 42 | # resourceGroupName: $(resourceGroupName) 43 | # location: $(resourceGroupLocation) 44 | # csmFile: '$(Pipeline.Workspace)/infratemplates/storage/template.json' 45 | # csmParametersFile: '$(Pipeline.Workspace)/infratemplates/storage/parameters.${{ parameters.environment }}.json' 46 | # overrideParameters: | 47 | # -name "mlopssadata${{ parameters.environment }}" 48 | 49 | - task: AzureResourceGroupDeployment@2 50 | displayName: 'Deploy Container Registry' 51 | inputs: 52 | azureSubscription: ${{ parameters.serviceConnection }} 53 | resourceGroupName: $(resourceGroupName) 54 | location: $(resourceGroupLocation) 55 | csmFile: '$(Pipeline.Workspace)/infratemplates/containerregistry/template.json' 56 | csmParametersFile: '$(Pipeline.Workspace)/infratemplates/containerregistry/parameters.${{ parameters.environment }}.json' 57 | 58 | - task: AzureResourceGroupDeployment@2 59 | displayName: 'Deploy Application Insights' 60 | inputs: 61 | azureSubscription: ${{ parameters.serviceConnection }} 62 | resourceGroupName: $(resourceGroupName) 63 | location: $(resourceGroupLocation) 64 | csmFile: '$(Pipeline.Workspace)/infratemplates/appinsights/template.json' 65 | csmParametersFile: '$(Pipeline.Workspace)/infratemplates/appinsights/parameters.${{ parameters.environment }}.json' 66 | 67 | - task: AzureResourceGroupDeployment@2 68 | displayName: 'Deploy Key Vault' 69 | inputs: 70 | azureSubscription: ${{ parameters.serviceConnection }} 71 | resourceGroupName: $(resourceGroupName) 72 | location: $(resourceGroupLocation) 73 | csmFile: '$(Pipeline.Workspace)/infratemplates/keyvault/template.json' 74 | csmParametersFile: '$(Pipeline.Workspace)/infratemplates/keyvault/parameters.${{ parameters.environment }}.json' 75 | 76 | - task: AzureResourceGroupDeployment@2 77 | displayName: 'Deploy ML Workspace' 78 | inputs: 79 | azureSubscription: ${{ parameters.serviceConnection }} 80 | resourceGroupName: $(resourceGroupName) 81 | location: $(resourceGroupLocation) 82 | csmFile: '$(Pipeline.Workspace)/infratemplates/mlworkspace/template.json' 83 | csmParametersFile: 
'$(Pipeline.Workspace)/infratemplates/mlworkspace/parameters.${{ parameters.environment }}.json' 84 | 85 | # Optional - Add a second ML workspace using the same underlying infrastructure 86 | # - task: AzureResourceGroupDeployment@2 87 | # displayName: 'Deploy ML Workspace' 88 | # inputs: 89 | # azureSubscription: ${{ parameters.serviceConnection }} 90 | # resourceGroupName: $(resourceGroupName) 91 | # location: $(resourceGroupLocation) 92 | # csmFile: '$(Pipeline.Workspace)/infratemplates/mlworkspace/template.json' 93 | # csmParametersFile: '$(Pipeline.Workspace)/infratemplates/mlworkspace/parameters.${{ parameters.environment }}.json' 94 | # csmParametersFile: '$(Pipeline.Workspace)/infratemplates/storage/parameters.${{ parameters.environment }}.json' 95 | # overrideParameters: | 96 | # -name "mlops-mls2-${{ parameters.environment }}" 97 | 98 | - task: AzureResourceGroupDeployment@2 99 | displayName: 'Deploy ML Compute' 100 | inputs: 101 | azureSubscription: ${{ parameters.serviceConnection }} 102 | resourceGroupName: $(resourceGroupName) 103 | location: $(resourceGroupLocation) 104 | csmFile: '$(Pipeline.Workspace)/infratemplates/mlcompute/template.json' 105 | csmParametersFile: '$(Pipeline.Workspace)/infratemplates/mlcompute/parameters.${{ parameters.environment }}.json' 106 | -------------------------------------------------------------------------------- /infrastructure/build-and-release/deploy-infra.yml: -------------------------------------------------------------------------------- 1 | # Azure Pipeline Definition for Infrastructure Deployment 2 | 3 | # Trigger on changes in the infrastructure folder and on the master branch 4 | trigger: 5 | branches: 6 | include: 7 | - master 8 | 9 | paths: 10 | include: 11 | - infrastructure/* 12 | 13 | stages: 14 | - stage: Build 15 | displayName: 'IaC Build' 16 | jobs: 17 | - job: Build 18 | pool: 19 | vmImage: ubuntu-16.04 20 | steps: 21 | - task: CopyFiles@2 22 | displayName: 'Copy ARM templates' 23 | inputs: 24 | sourceFolder: 'infrastructure/arm-templates' 25 | targetFolder: '$(Build.ArtifactStagingDirectory)' 26 | - publish: '$(Build.ArtifactStagingDirectory)' 27 | artifact: infratemplates 28 | 29 | - stage: DEV 30 | displayName: 'DEV Deployment' 31 | jobs: 32 | - template: deploy-infra.template.yml 33 | parameters: 34 | environment: dev 35 | serviceConnection: 36 | 37 | - stage: TEST 38 | # only make deployments to TEST originating from the master branch 39 | condition: and(succeeded(), eq(variables['build.sourceBranch'], 'refs/heads/master')) 40 | displayName: 'TEST Deployment' 41 | jobs: 42 | - template: deploy-infra.template.yml 43 | parameters: 44 | environment: test 45 | serviceConnection: 46 | 47 | -------------------------------------------------------------------------------- /infrastructure/infra_stages.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/MLOps-TDSP-Template/011f5418bc3be25570a84ff8c58dca94d4b35a45/infrastructure/infra_stages.png -------------------------------------------------------------------------------- /infrastructure/runconfigschema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-04/schema#", 3 | "title": "RunConfiguration", 4 | "type": "object", 5 | "required": [ 6 | "script", 7 | "environment" 8 | ], 9 | "properties": { 10 | "script": { 11 | "type": "string", 12 | "description": "The relative path to the python script file. 
The file path is relative to the source_directory passed to submit run.\nExample: train.py", 13 | "minLength": 1 14 | }, 15 | "arguments": { 16 | "type": [ 17 | "array", 18 | "null" 19 | ], 20 | "description": "Command line arguments for the python script file.\nExample: [\"234\"]", 21 | "items": { 22 | "type": "string" 23 | } 24 | }, 25 | "sourceDirectoryDataStore": { 26 | "type": [ 27 | "null", 28 | "string" 29 | ], 30 | "description": "The attribute is used to configure the backing datastore for the project share.\nExample: my-source-store" 31 | }, 32 | "framework": { 33 | "description": "The supported frameworks are Python, PySpark, CNTK, TensorFlow, and PyTorch. Use Tensorflow for AmlCompute clusters, and Python for distributed training jobs.\nRemarks: If framework is set to PySpark, then spark field is required.\nIf framework is set to TensorFlow then tensorflow field is required.\nExample: Python", 34 | "oneOf": [ 35 | { 36 | "$ref": "#/definitions/Framework" 37 | } 38 | ] 39 | }, 40 | "communicator": { 41 | "description": "The supported communicators are None, ParameterServer, OpenMpi, and IntelMpi Keep in mind that OpenMpi requires a custom image with OpenMpi installed.\nUse ParameterServer or OpenMpi for AmlCompute clusters. Use IntelMpi for distributed training jobs.\nRemarks: If communicator is set to Mpi, then mpi field is required.\nExample: None", 42 | "oneOf": [ 43 | { 44 | "type": "null" 45 | }, 46 | { 47 | "$ref": "#/definitions/Communicator" 48 | } 49 | ] 50 | }, 51 | "target": { 52 | "type": [ 53 | "null", 54 | "string" 55 | ], 56 | "description": "Target refers to compute where the job is scheduled for execution. The default target is \"local\" refering to the local machine.\nRemarks: If target is amlcompute then amlCompute field is required.\nExample: amlcompute" 57 | }, 58 | "dataReferences": { 59 | "type": [ 60 | "null", 61 | "object" 62 | ], 63 | "description": "Data reference configuration details. All the data sources are made available to the run during execution based on each configuration.\n", 64 | "additionalProperties": { 65 | "$ref": "#/definitions/DataReferenceConfiguration" 66 | } 67 | }, 68 | "jobName": { 69 | "type": [ 70 | "null", 71 | "string" 72 | ], 73 | "description": "This is primarily intended for notebooks to override the default job name.\nDefaults to ArgumentVector[0] if not specified.\nExample: FindSquaresJob" 74 | }, 75 | "autoPrepareEnvironment": { 76 | "type": [ 77 | "boolean", 78 | "null" 79 | ], 80 | "description": "Defaulted to True, but if set to False the run will fail if no environment was found matching the requirements specified.\nThis can be set to False to fail fast when the environment is not found in the cache.\nExample: True" 81 | }, 82 | "maxRunDurationSeconds": { 83 | "type": [ 84 | "integer", 85 | "null" 86 | ], 87 | "description": "Maximum allowed time for the run. The system will attempt to automatically cancel the run if it took longer than this value.\nMaxRunDurationSeconds=null means infinite duration.\nExample: 84000", 88 | "format": "int64" 89 | }, 90 | "nodeCount": { 91 | "type": [ 92 | "integer", 93 | "null" 94 | ], 95 | "description": "Number of compute nodes to run the job on. 
Only applies to AMLCompute.\nExample: 1", 96 | "format": "int32" 97 | }, 98 | "environment": { 99 | "description": "The environment definition, This field configures the python environment.\nIt can be configured to use an existing Python environment or configured to setup a temp environment for the experiment.\nThe definition is also responsible for setting the required application dependencies.\n", 100 | "oneOf": [ 101 | { 102 | "$ref": "#/definitions/EnvironmentDefinition" 103 | } 104 | ] 105 | }, 106 | "history": { 107 | "description": "This section is used to disable and enable experiment history logging features.\n", 108 | "oneOf": [ 109 | { 110 | "type": "null" 111 | }, 112 | { 113 | "$ref": "#/definitions/HistoryConfiguration" 114 | } 115 | ] 116 | }, 117 | "spark": { 118 | "description": "Spark configuration details. When the platform is set to Pyspark, then the spark configuration is used to set the default sparkconf for the submitted job.\n", 119 | "oneOf": [ 120 | { 121 | "type": "null" 122 | }, 123 | { 124 | "$ref": "#/definitions/SparkConfiguration" 125 | } 126 | ] 127 | }, 128 | "batchAi": { 129 | "oneOf": [ 130 | { 131 | "type": "null" 132 | }, 133 | { 134 | "$ref": "#/definitions/BatchAiConfiguration" 135 | } 136 | ] 137 | }, 138 | "amlCompute": { 139 | "description": "The attribute is used to configure details of the compute target to be created during experiment.\nThe configuration only takes effect when the target is set to \"amlcompute\".\n", 140 | "oneOf": [ 141 | { 142 | "type": "null" 143 | }, 144 | { 145 | "$ref": "#/definitions/AMLComputeConfiguration" 146 | } 147 | ] 148 | }, 149 | "tensorflow": { 150 | "description": "The attribute is used to configure the distributed tensorflow parameters.\nThis attribute takes effect only when the framework is set to TensorFlow, and the communicator to ParameterServer.\nAmlCompute is the only supported compute for this configuration.\n", 151 | "oneOf": [ 152 | { 153 | "type": "null" 154 | }, 155 | { 156 | "$ref": "#/definitions/TensorflowConfiguration" 157 | } 158 | ] 159 | }, 160 | "mpi": { 161 | "description": "The attribute is used to configure the distributed MPI job parameters.\nThis attribute takes effect only when the framework is set to Python, and the communicator to OpenMpi or IntelMpi.\nAmlComppute is the only supported compute type for this configuration.\n", 162 | "oneOf": [ 163 | { 164 | "type": "null" 165 | }, 166 | { 167 | "$ref": "#/definitions/MpiConfiguration" 168 | } 169 | ] 170 | }, 171 | "hdi": { 172 | "description": "This attribute takes effect only when the target is set to an Azure HDI compute.\nThe HDI Configuration is used to set the YARN deployment mode. 
It is defaulted to cluster mode.\n", 173 | "oneOf": [ 174 | { 175 | "type": "null" 176 | }, 177 | { 178 | "$ref": "#/definitions/HdiConfiguration" 179 | } 180 | ] 181 | }, 182 | "containerInstance": { 183 | "oneOf": [ 184 | { 185 | "type": "null" 186 | }, 187 | { 188 | "$ref": "#/definitions/ContainerInstanceConfiguration" 189 | } 190 | ] 191 | }, 192 | "exposedPorts": { 193 | "type": [ 194 | "array", 195 | "null" 196 | ], 197 | "description": "Currently unused.\n", 198 | "items": { 199 | "type": "integer", 200 | "format": "int32" 201 | } 202 | }, 203 | "prepareEnvironment": { 204 | "type": [ 205 | "boolean", 206 | "null" 207 | ] 208 | } 209 | }, 210 | "definitions": { 211 | "Framework": { 212 | "type": "string", 213 | "description": "", 214 | "x-enumNames": [ 215 | "Python", 216 | "PySpark", 217 | "Cntk", 218 | "TensorFlow", 219 | "PyTorch", 220 | "TensorFlowParameterServer", 221 | "PythonMpi", 222 | "PythonIntelMpi", 223 | "PySparkInteractive" 224 | ], 225 | "enum": [ 226 | "Python", 227 | "PySpark", 228 | "Cntk", 229 | "TensorFlow", 230 | "PyTorch", 231 | "TensorFlowParameterServer", 232 | "PythonMpi", 233 | "PythonIntelMpi", 234 | "PySparkInteractive" 235 | ] 236 | }, 237 | "Communicator": { 238 | "type": "string", 239 | "description": "", 240 | "x-enumNames": [ 241 | "None", 242 | "ParameterServer", 243 | "OpenMpi", 244 | "IntelMpi", 245 | "Gloo", 246 | "Mpi" 247 | ], 248 | "enum": [ 249 | "None", 250 | "ParameterServer", 251 | "OpenMpi", 252 | "IntelMpi", 253 | "Gloo", 254 | "Mpi" 255 | ] 256 | }, 257 | "DataReferenceConfiguration": { 258 | "type": "object", 259 | "description": "A class for managing DataReferenceConfiguration.\n", 260 | "properties": { 261 | "dataStoreName": { 262 | "type": [ 263 | "null", 264 | "string" 265 | ], 266 | "description": "The name of the data store.\nExample: myblobstore" 267 | }, 268 | "mode": { 269 | "description": "Operation on the datastore, mount, download, upload.\nExample: Mount", 270 | "oneOf": [ 271 | { 272 | "$ref": "#/definitions/DataStoreMode" 273 | } 274 | ] 275 | }, 276 | "pathOnDataStore": { 277 | "type": [ 278 | "null", 279 | "string" 280 | ], 281 | "description": "Relative path on the datastore.\nExample: /images/validation" 282 | }, 283 | "pathOnCompute": { 284 | "type": [ 285 | "null", 286 | "string" 287 | ], 288 | "description": "The path on the compute target.\n" 289 | }, 290 | "overwrite": { 291 | "type": "boolean", 292 | "description": "Whether to overwrite the data if existing.\nExample: False" 293 | } 294 | }, 295 | "defaultSnippets": [ 296 | { 297 | "label": "Data references configuration template.", 298 | "description": "Data references configuration template.", 299 | "body": { 300 | "dataStoreName": "", 301 | "mode": "", 302 | "overwrite": "" 303 | } 304 | } 305 | ] 306 | }, 307 | "DataStoreMode": { 308 | "type": "string", 309 | "description": "", 310 | "x-enumNames": [ 311 | "Mount", 312 | "Download", 313 | "Upload" 314 | ], 315 | "enum": [ 316 | "Mount", 317 | "Download", 318 | "Upload" 319 | ] 320 | }, 321 | "EnvironmentDefinition": { 322 | "type": "object", 323 | "properties": { 324 | "name": { 325 | "type": [ 326 | "null", 327 | "string" 328 | ], 329 | "description": "The name of the environment.\nRemarks: Read-only from a contract perspective; set with URI fields on the relevant APIs.\nExample: mydevenvironment" 330 | }, 331 | "version": { 332 | "type": [ 333 | "null", 334 | "string" 335 | ], 336 | "description": "The environment version.\nRemarks: Read-only from a contract perspective; set with URI fields on the 
relevant APIs.\nExample: 1" 337 | }, 338 | "python": { 339 | "description": "Settings for a Python environment.\n", 340 | "oneOf": [ 341 | { 342 | "type": "null" 343 | }, 344 | { 345 | "$ref": "#/definitions/PythonSection" 346 | } 347 | ] 348 | }, 349 | "environmentVariables": { 350 | "type": [ 351 | "null", 352 | "object" 353 | ], 354 | "description": "Definition of environment variables to be defined in the environment.\n", 355 | "additionalProperties": { 356 | "type": "string" 357 | } 358 | }, 359 | "docker": { 360 | "description": "The definition of a Docker container.\n", 361 | "oneOf": [ 362 | { 363 | "type": "null" 364 | }, 365 | { 366 | "$ref": "#/definitions/DockerSection" 367 | } 368 | ] 369 | }, 370 | "spark": { 371 | "description": "The configuration for a Spark environment.\n", 372 | "oneOf": [ 373 | { 374 | "type": "null" 375 | }, 376 | { 377 | "$ref": "#/definitions/SparkSection" 378 | } 379 | ] 380 | } 381 | }, 382 | "defaultSnippets": [ 383 | { 384 | "label": "Environment definition configuration template.", 385 | "description": "Environment definition configuration template.", 386 | "body": { 387 | "python": { 388 | "interpreterPath": "python", 389 | "userManagedDependencies": false, 390 | "condaDependencies": { 391 | "dependencies": [ 392 | "python=3.6.2", 393 | { 394 | "pip": [ 395 | "azureml-defaults" 396 | ] 397 | } 398 | ] 399 | } 400 | }, 401 | "docker": { 402 | "baseImage": "mcr.microsoft.com/azureml/base:0.2.2", 403 | "enabled": false, 404 | "baseImageRegistry": { 405 | "address": "", 406 | "username": "", 407 | "password": "" 408 | } 409 | } 410 | } 411 | } 412 | ] 413 | }, 414 | "PythonSection": { 415 | "type": "object", 416 | "properties": { 417 | "interpreterPath": { 418 | "type": [ 419 | "null", 420 | "string" 421 | ], 422 | "description": "The python interpreter path. 
This is only used when user_managed_dependencies=True.\n" 423 | }, 424 | "userManagedDependencies": { 425 | "type": "boolean", 426 | "description": "True means that AzureML reuses an existing python environment; False means that AzureML will create a python environment based on the Conda dependencies specification.\n" 427 | }, 428 | "condaDependencies": { 429 | "description": "Conda dependencies for the run.\nRemarks: Specify conda dependencies in the json format here, or specify 'condaDependenciesFile' field and set its value to the conda file path, like '\\\"condaDependenciesFile\\\": \\\".azureml/conda_dependencies.yml\\\"'\n", 430 | "oneOf": [ 431 | {}, 432 | { 433 | "type": "null" 434 | } 435 | ] 436 | }, 437 | "baseCondaEnvironment": { 438 | "type": [ 439 | "null", 440 | "string" 441 | ] 442 | } 443 | }, 444 | "allOf": [ 445 | { 446 | "anyOf": [ 447 | { 448 | "anyOf": [ 449 | { 450 | "not": { 451 | "required": [ 452 | "userManagedDependencies" 453 | ], 454 | "properties": { 455 | "userManagedDependencies": { 456 | "enum": [ 457 | false 458 | ] 459 | } 460 | } 461 | } 462 | }, 463 | { 464 | "not": { 465 | "properties": { 466 | "condaDependenciesFile": { 467 | "enum": [ 468 | null 469 | ] 470 | } 471 | } 472 | }, 473 | "required": [ 474 | "condaDependenciesFile" 475 | ] 476 | } 477 | ] 478 | }, 479 | { 480 | "anyOf": [ 481 | { 482 | "not": { 483 | "required": [ 484 | "userManagedDependencies" 485 | ], 486 | "properties": { 487 | "userManagedDependencies": { 488 | "enum": [ 489 | false 490 | ] 491 | } 492 | } 493 | } 494 | }, 495 | { 496 | "not": { 497 | "properties": { 498 | "condaDependencies": { 499 | "enum": [ 500 | null 501 | ] 502 | } 503 | } 504 | }, 505 | "required": [ 506 | "condaDependencies" 507 | ] 508 | } 509 | ] 510 | } 511 | ] 512 | }, 513 | { 514 | "anyOf": [ 515 | { 516 | "not": { 517 | "required": [ 518 | "userManagedDependencies" 519 | ], 520 | "properties": { 521 | "userManagedDependencies": { 522 | "enum": [ 523 | true 524 | ] 525 | } 526 | } 527 | } 528 | }, 529 | { 530 | "not": { 531 | "properties": { 532 | "interpreterPath": { 533 | "enum": [ 534 | null 535 | ] 536 | } 537 | } 538 | }, 539 | "required": [ 540 | "interpreterPath" 541 | ] 542 | } 543 | ] 544 | } 545 | ], 546 | "defaultSnippets": [ 547 | { 548 | "label": "Python section configuration template.", 549 | "description": "Python section configuration template.", 550 | "body": { 551 | "interpreterPath": "python", 552 | "userManagedDependencies": false, 553 | "condaDependencies": { 554 | "dependencies": [ 555 | "python=3.6.2", 556 | { 557 | "pip": [ 558 | "azureml-defaults" 559 | ] 560 | } 561 | ] 562 | } 563 | } 564 | } 565 | ] 566 | }, 567 | "DockerSection": { 568 | "type": "object", 569 | "properties": { 570 | "baseImage": { 571 | "type": [ 572 | "null", 573 | "string" 574 | ], 575 | "description": "Base image used for Docker-based runs. 
If base image is not available in docker hub then please specify BaseImageRegistry field.\nExample: ubuntu:latest" 576 | }, 577 | "enabled": { 578 | "type": "boolean", 579 | "description": "Set True to perform this run inside a Docker container.\nExample: True" 580 | }, 581 | "sharedVolumes": { 582 | "type": "boolean", 583 | "description": "Set False if necessary to work around shared volume bugs on Windows.\nExample: True" 584 | }, 585 | "preparation": { 586 | "oneOf": [ 587 | { 588 | "type": "null" 589 | }, 590 | { 591 | "$ref": "#/definitions/Preparation" 592 | } 593 | ] 594 | }, 595 | "gpuSupport": { 596 | "type": "boolean", 597 | "description": "Run with NVidia Docker extension to support GPUs.\nExample: False" 598 | }, 599 | "shmSize": { 600 | "type": [ 601 | "null", 602 | "string" 603 | ], 604 | "description": "The shared memory size setting for NVidia GPUs.\nRemarks: 1GB is NVidia's recommended default shm size. In testing, more was not needed.\nExample: 1g" 605 | }, 606 | "arguments": { 607 | "type": [ 608 | "array", 609 | "null" 610 | ], 611 | "description": "Extra arguments to the Docker run command.\n", 612 | "items": { 613 | "type": "string" 614 | } 615 | }, 616 | "baseImageRegistry": { 617 | "description": "Image registry that contains the base image.\n", 618 | "oneOf": [ 619 | { 620 | "type": "null" 621 | }, 622 | { 623 | "$ref": "#/definitions/ContainerRegistry" 624 | } 625 | ] 626 | } 627 | }, 628 | "allOf": [ 629 | { 630 | "anyOf": [ 631 | { 632 | "properties": { 633 | "enabled": { 634 | "enum": [ 635 | false 636 | ] 637 | } 638 | } 639 | }, 640 | { 641 | "not": { 642 | "required": [ 643 | "enabled" 644 | ] 645 | } 646 | }, 647 | { 648 | "properties": { 649 | "baseImage": { 650 | "enum": [ 651 | null 652 | ] 653 | } 654 | } 655 | }, 656 | { 657 | "not": { 658 | "required": [ 659 | "baseImage" 660 | ] 661 | } 662 | }, 663 | { 664 | "not": { 665 | "properties": { 666 | "baseImageRegistry": { 667 | "enum": [ 668 | null 669 | ] 670 | } 671 | } 672 | }, 673 | "required": [ 674 | "baseImageRegistry" 675 | ] 676 | } 677 | ] 678 | } 679 | ], 680 | "defaultSnippets": [ 681 | { 682 | "label": "Docker section configuration template.", 683 | "description": "Docker section configuration template.", 684 | "body": { 685 | "baseImage": "mcr.microsoft.com/azureml/base:0.2.2", 686 | "enabled": false, 687 | "baseImageRegistry": { 688 | "address": "", 689 | "username": "", 690 | "password": "" 691 | } 692 | } 693 | } 694 | ] 695 | }, 696 | "Preparation": { 697 | "type": "object", 698 | "properties": { 699 | "commandLine": { 700 | "type": [ 701 | "null", 702 | "string" 703 | ] 704 | } 705 | } 706 | }, 707 | "ContainerRegistry": { 708 | "type": "object", 709 | "properties": { 710 | "address": { 711 | "type": [ 712 | "null", 713 | "string" 714 | ], 715 | "description": "DNS name or IP address of a container registry.\n" 716 | }, 717 | "username": { 718 | "type": [ 719 | "null", 720 | "string" 721 | ], 722 | "description": "The username for the container registry.\nRemarks: If username is specified then password is also required.\n" 723 | }, 724 | "password": { 725 | "type": [ 726 | "null", 727 | "string" 728 | ], 729 | "description": "The password for the container registry.\n" 730 | } 731 | }, 732 | "allOf": [ 733 | { 734 | "anyOf": [ 735 | { 736 | "properties": { 737 | "username": { 738 | "enum": [ 739 | null 740 | ] 741 | } 742 | } 743 | }, 744 | { 745 | "not": { 746 | "required": [ 747 | "username" 748 | ] 749 | } 750 | }, 751 | { 752 | "not": { 753 | "properties": { 754 | "password": { 755 
| "enum": [ 756 | null 757 | ] 758 | } 759 | } 760 | }, 761 | "required": [ 762 | "password" 763 | ] 764 | } 765 | ] 766 | } 767 | ], 768 | "defaultSnippets": [ 769 | { 770 | "label": "Container registry configuration template.", 771 | "description": "Container registry configuration template.", 772 | "body": { 773 | "address": "", 774 | "username": "", 775 | "password": "" 776 | } 777 | } 778 | ] 779 | }, 780 | "SparkSection": { 781 | "type": "object", 782 | "properties": { 783 | "repositories": { 784 | "type": [ 785 | "array", 786 | "null" 787 | ], 788 | "description": "The list of spark repositories.\n", 789 | "items": { 790 | "type": "string" 791 | } 792 | }, 793 | "packages": { 794 | "type": [ 795 | "array", 796 | "null" 797 | ], 798 | "description": "The Spark packages to use.\n", 799 | "items": { 800 | "$ref": "#/definitions/SparkMavenPackage" 801 | } 802 | }, 803 | "precachePackages": { 804 | "type": "boolean", 805 | "description": "Whether to preckage the packages.\nExample: True" 806 | } 807 | }, 808 | "defaultSnippets": [ 809 | { 810 | "label": "Spark section configuration template.", 811 | "description": "Spark section configuration template.", 812 | "body": { 813 | "repositories": [ 814 | "https://mmlspark.azureedge.net/maven" 815 | ], 816 | "packages": "", 817 | "precachePackages": true 818 | } 819 | } 820 | ] 821 | }, 822 | "SparkMavenPackage": { 823 | "type": "object", 824 | "properties": { 825 | "group": { 826 | "type": [ 827 | "null", 828 | "string" 829 | ] 830 | }, 831 | "artifact": { 832 | "type": [ 833 | "null", 834 | "string" 835 | ] 836 | }, 837 | "version": { 838 | "type": [ 839 | "null", 840 | "string" 841 | ] 842 | } 843 | }, 844 | "defaultSnippets": [ 845 | { 846 | "label": "Spark maven package configuration template.", 847 | "description": "Spark maven package configuration template.", 848 | "body": { 849 | "group": "com.microsoft.ml.spark", 850 | "artifact": "mmlspark_2.11", 851 | "version": "0.12" 852 | } 853 | } 854 | ] 855 | }, 856 | "HistoryConfiguration": { 857 | "type": "object", 858 | "additionalProperties": { 859 | "oneOf": [ 860 | {}, 861 | { 862 | "type": "null" 863 | } 864 | ] 865 | }, 866 | "properties": { 867 | "outputCollection": { 868 | "type": "boolean", 869 | "description": "Enable history tracking -- this allows status, logs, metrics, and outputs to be collected for a run.\"\nExample: True" 870 | }, 871 | "directoriesToWatch": { 872 | "type": [ 873 | "array", 874 | "null" 875 | ], 876 | "description": "The list of directories to monitor and upload files from.\nExample: [\"logs\", \"outputs\"]", 877 | "default": [ 878 | "logs" 879 | ], 880 | "items": { 881 | "type": "string" 882 | } 883 | } 884 | }, 885 | "defaultSnippets": [ 886 | { 887 | "label": "History configuration template.", 888 | "description": "History configuration template.", 889 | "body": { 890 | "outputCollection": true 891 | } 892 | } 893 | ] 894 | }, 895 | "SparkConfiguration": { 896 | "type": "object", 897 | "properties": { 898 | "configuration": { 899 | "type": [ 900 | "null", 901 | "object" 902 | ], 903 | "description": "The Spark configuration.\n", 904 | "additionalProperties": { 905 | "type": "string" 906 | } 907 | } 908 | }, 909 | "defaultSnippets": [ 910 | { 911 | "label": "Spark configuration template.", 912 | "description": "Spark configuration template.", 913 | "body": { 914 | "configuration": { 915 | "spark.app.name": "Azure ML Experiment", 916 | "spark.yarn.maxAppAttempts": "1" 917 | } 918 | } 919 | } 920 | ] 921 | }, 922 | "BatchAiConfiguration": { 923 | "type": 
"object", 924 | "properties": { 925 | "nodeCount": { 926 | "type": "integer", 927 | "format": "int32" 928 | } 929 | } 930 | }, 931 | "AMLComputeConfiguration": { 932 | "type": "object", 933 | "properties": { 934 | "name": { 935 | "type": [ 936 | "null", 937 | "string" 938 | ], 939 | "description": "Name of the cluster to be created. If not specified, runId will be used as cluster name.\nExample: my8nodeCluster" 940 | }, 941 | "vmSize": { 942 | "type": [ 943 | "null", 944 | "string" 945 | ], 946 | "description": "VM size of the Cluster to be created. Allowed values are Azure vm sizes.\nThe list of vm sizes is available in https://docs.microsoft.com/en-us/azure/cloud-services/cloud-services-sizes-specs\nExample: Standard_D2_v2" 947 | }, 948 | "vmPriority": { 949 | "type": [ 950 | "null", 951 | "string" 952 | ], 953 | "description": "VM priority of the Cluster to be created. Allowed values are dedicated and lowpriority.\nExample: dedicated" 954 | }, 955 | "retainCluster": { 956 | "type": "boolean", 957 | "description": "Setting to true will prevent the cluster from being deleted upon completion of the run.\nExample: False" 958 | }, 959 | "clusterMaxNodeCount": { 960 | "type": "integer", 961 | "description": "The maximum number of nodes that the cluster can scale up to.\nMinimum number of nodes will always be set to 0.\nExample: 10", 962 | "format": "int32" 963 | } 964 | }, 965 | "defaultSnippets": [ 966 | { 967 | "label": "AmlCompute configuration template.", 968 | "description": "AmlCompute configuration template.", 969 | "body": { 970 | "name": "", 971 | "retainCluster": false, 972 | "clusterMaxNodeCount": 1 973 | } 974 | } 975 | ] 976 | }, 977 | "TensorflowConfiguration": { 978 | "type": "object", 979 | "properties": { 980 | "workerCount": { 981 | "type": "integer", 982 | "description": "The number of workers.\nExample: 2", 983 | "format": "int32" 984 | }, 985 | "parameterServerCount": { 986 | "type": "integer", 987 | "description": "Number of parameter servers.\nExample: 1", 988 | "format": "int32" 989 | } 990 | }, 991 | "defaultSnippets": [ 992 | { 993 | "label": "Tensorflow configuration template.", 994 | "description": "Tensorflow configuration template.", 995 | "body": { 996 | "workerCount": 1, 997 | "parameterServerCount": 1 998 | } 999 | } 1000 | ] 1001 | }, 1002 | "MpiConfiguration": { 1003 | "type": "object", 1004 | "properties": { 1005 | "processCountPerNode": { 1006 | "type": "integer", 1007 | "description": "When using MPI, the number of processes per node.\nExample: 2", 1008 | "format": "int32" 1009 | } 1010 | }, 1011 | "defaultSnippets": [ 1012 | { 1013 | "label": "Mpi configuration template.", 1014 | "description": "Mpi configuration template.", 1015 | "body": { 1016 | "processCountPerNode": 1 1017 | } 1018 | } 1019 | ] 1020 | }, 1021 | "HdiConfiguration": { 1022 | "type": "object", 1023 | "properties": { 1024 | "yarnDeployMode": { 1025 | "description": "Yarn deploy mode.\n", 1026 | "oneOf": [ 1027 | { 1028 | "$ref": "#/definitions/YarnDeployMode" 1029 | } 1030 | ] 1031 | } 1032 | }, 1033 | "defaultSnippets": [ 1034 | { 1035 | "label": "Hdi configuration template.", 1036 | "description": "Hdi configuration template.", 1037 | "body": { 1038 | "yarnDeployMode": "" 1039 | } 1040 | } 1041 | ] 1042 | }, 1043 | "YarnDeployMode": { 1044 | "type": "string", 1045 | "description": "", 1046 | "x-enumNames": [ 1047 | "None", 1048 | "Client", 1049 | "Cluster" 1050 | ], 1051 | "enum": [ 1052 | "None", 1053 | "Client", 1054 | "Cluster" 1055 | ] 1056 | }, 1057 | 
"ContainerInstanceConfiguration": { 1058 | "type": "object", 1059 | "properties": { 1060 | "region": { 1061 | "type": [ 1062 | "null", 1063 | "string" 1064 | ], 1065 | "description": "Defaults to the region of the workspace.\nExample: eastus2" 1066 | }, 1067 | "cpuCores": { 1068 | "type": "number", 1069 | "description": "Default size corresponds to the largest container supported in all regions.\nDetails: https://docs.microsoft.com/en-us/azure/container-instances/container-instances-quotas\nExample: 2", 1070 | "format": "double" 1071 | }, 1072 | "memoryGb": { 1073 | "type": "number", 1074 | "description": "The memory available for the container instance.\nExample: 3.5", 1075 | "format": "double" 1076 | } 1077 | }, 1078 | "defaultSnippets": [ 1079 | { 1080 | "label": "Container instance configuration template.", 1081 | "description": "Container instance configuration template.", 1082 | "body": {} 1083 | } 1084 | ] 1085 | } 1086 | }, 1087 | "allOf": [ 1088 | { 1089 | "anyOf": [ 1090 | { 1091 | "not": { 1092 | "required": [ 1093 | "communicator" 1094 | ], 1095 | "properties": { 1096 | "communicator": { 1097 | "enum": [ 1098 | "Mpi" 1099 | ] 1100 | } 1101 | } 1102 | } 1103 | }, 1104 | { 1105 | "not": { 1106 | "properties": { 1107 | "mpi": { 1108 | "enum": [ 1109 | null 1110 | ] 1111 | } 1112 | } 1113 | }, 1114 | "required": [ 1115 | "mpi" 1116 | ] 1117 | } 1118 | ] 1119 | }, 1120 | { 1121 | "anyOf": [ 1122 | { 1123 | "not": { 1124 | "required": [ 1125 | "framework" 1126 | ], 1127 | "properties": { 1128 | "framework": { 1129 | "enum": [ 1130 | "PySpark" 1131 | ] 1132 | } 1133 | } 1134 | } 1135 | }, 1136 | { 1137 | "not": { 1138 | "properties": { 1139 | "spark": { 1140 | "enum": [ 1141 | null 1142 | ] 1143 | } 1144 | } 1145 | }, 1146 | "required": [ 1147 | "spark" 1148 | ] 1149 | } 1150 | ] 1151 | }, 1152 | { 1153 | "anyOf": [ 1154 | { 1155 | "not": { 1156 | "required": [ 1157 | "target" 1158 | ], 1159 | "properties": { 1160 | "target": { 1161 | "enum": [ 1162 | "amlcompute" 1163 | ] 1164 | } 1165 | } 1166 | } 1167 | }, 1168 | { 1169 | "not": { 1170 | "properties": { 1171 | "amlCompute": { 1172 | "enum": [ 1173 | null 1174 | ] 1175 | } 1176 | } 1177 | }, 1178 | "required": [ 1179 | "amlCompute" 1180 | ] 1181 | } 1182 | ] 1183 | }, 1184 | { 1185 | "anyOf": [ 1186 | { 1187 | "not": { 1188 | "required": [ 1189 | "framework" 1190 | ], 1191 | "properties": { 1192 | "framework": { 1193 | "enum": [ 1194 | "TensorFlow" 1195 | ] 1196 | } 1197 | } 1198 | } 1199 | }, 1200 | { 1201 | "not": { 1202 | "properties": { 1203 | "tensorflow": { 1204 | "enum": [ 1205 | null 1206 | ] 1207 | } 1208 | } 1209 | }, 1210 | "required": [ 1211 | "tensorflow" 1212 | ] 1213 | } 1214 | ] 1215 | }, 1216 | { 1217 | "anyOf": [ 1218 | { 1219 | "not": { 1220 | "required": [ 1221 | "target" 1222 | ], 1223 | "properties": { 1224 | "target": { 1225 | "enum": [ 1226 | "containerinstance" 1227 | ] 1228 | } 1229 | } 1230 | } 1231 | }, 1232 | { 1233 | "not": { 1234 | "properties": { 1235 | "containerInstance": { 1236 | "enum": [ 1237 | null 1238 | ] 1239 | } 1240 | } 1241 | }, 1242 | "required": [ 1243 | "containerInstance" 1244 | ] 1245 | } 1246 | ] 1247 | } 1248 | ], 1249 | "defaultSnippets": [ 1250 | { 1251 | "label": "RunConfiguration default template.", 1252 | "description": "RunConfiguration default template.", 1253 | "body": { 1254 | "script": "train.py", 1255 | "arguments": [], 1256 | "framework": "", 1257 | "communicator": "None", 1258 | "target": "local", 1259 | "environment": { 1260 | "python": { 1261 | "interpreterPath": 
"python", 1262 | "userManagedDependencies": false, 1263 | "condaDependencies": { 1264 | "dependencies": [ 1265 | "python=3.6.2", 1266 | { 1267 | "pip": [ 1268 | "azureml-defaults" 1269 | ] 1270 | } 1271 | ] 1272 | } 1273 | }, 1274 | "docker": { 1275 | "baseImage": "mcr.microsoft.com/azureml/base:0.2.2", 1276 | "enabled": false, 1277 | "baseImageRegistry": { 1278 | "address": "", 1279 | "username": "", 1280 | "password": "" 1281 | } 1282 | } 1283 | } 1284 | } 1285 | } 1286 | ] 1287 | } -------------------------------------------------------------------------------- /infrastructure/scripts/create-aks.sh: -------------------------------------------------------------------------------- 1 | az ml computetarget create aks -n myaks 2 | -------------------------------------------------------------------------------- /infrastructure/scripts/create-azmlcompute.sh: -------------------------------------------------------------------------------- 1 | az ml computetarget create amlcompute -n cpu --min-nodes 1 --max-nodes 1 -s STANDARD_D3_V2 2 | -------------------------------------------------------------------------------- /infrastructure/scripts/create-workspace.sh: -------------------------------------------------------------------------------- 1 | az group create -n myresourcegroup -l westus2 2 | az ml workspace create -w myworkspace -g myresourcegroup 3 | -------------------------------------------------------------------------------- /labs/01_setup.md: -------------------------------------------------------------------------------- 1 | # Lab 1: setting up the environment 2 | 3 | In this first lab, we'll set up our working environment. 4 | 5 | ## Requirements 6 | 7 | * Visual Studio Code 8 | Download and Install [Visual Studio Code](https://code.visualstudio.com/) 9 | 10 | * Miniconda 11 | Download and install [Miniconda](https://docs.conda.io/en/latest/miniconda.html) 12 | 13 | * Azure ML SDK 14 | From a command line window, run the following command to install the python client package for Azure ML: `pip install azureml-sdk` 15 | 16 | * Azure CLI 17 | From a command line window, run the following command to install the Azure CLI, used for authentication and management tasks: `pip install azure-cli` 18 | 19 | * A git client to clone the lab content 20 | For example Git SCM - https://git-scm.com/. 21 | 22 | ## Clone the repository 23 | 24 | Clone the following git repository: git clone https://github.com/Azure/MLOps-TDSP-Template 25 | 26 | ## Open the cloned git repository in VS Code or your favorite IDE 27 | 28 | ## Az Login 29 | From a terminal, login to your subscription on Azure using the azure cli. 30 | 31 | * `az login` 32 | 33 | If you have multiple subscriptions, you might want to set the right subscription by using the following command. 34 | 35 | * `az account set -s ` 36 | 37 | ## Deploy an ML workspace and dependent resources 38 | 39 | Execute the script `infrastructure/create_mlworkspace.py` to deploy the ML workspace resource and dependent resources such as a Keyvault instance and a Storage Account. 40 | 41 | ## Browse through the created resources in the portal 42 | 43 | You can now take a look over the created resources via the [Azure Portal](http://portal.azure.com/). 
44 | -------------------------------------------------------------------------------- /labs/02_experiments.md: -------------------------------------------------------------------------------- 1 | # Lab 2: running experiments 2 | 3 | ## Understand the non-Azure / open-source ML model code 4 | Observe print statements 5 | Observe performance metrics 6 | 7 | ## Run the training locally 8 | Inspect the results 9 | 10 | ## Run the code via Azure ML 11 | Observe additional metadata 12 | Observe run history 13 | 14 | ## Read the Experiment Tracking documentation 15 | 16 | ## Refactor the code to capture run metrics 17 | 18 | ## Submit the experiment again 19 | 20 | ## Refactor the code a little further, and then go to the portal to inspect the run history 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /labs/03_managedcompute.md: -------------------------------------------------------------------------------- 1 | # Lab 3: using remote compute 2 | 3 | ## Review compute management in the studio 4 | 5 | ## Create a compute cluster via the studio 6 | 7 | ## Refactor the training script to make use of the newly created compute 8 | 9 | ## Submit a new training run 10 | 11 | ## Observe the logs via the studio 12 | One dependency is missing, so the run on the compute target fails 13 | 14 | ## Fix the code and resubmit 15 | 16 | ## Observe cluster run statuses via the studio 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /labs/04_datasets.md: -------------------------------------------------------------------------------- 1 | # Lab 4: Datasets 2 | 3 | ## Open the studio, browse through the data store management and datasets tabs 4 | Understand the differences 5 | 6 | ## Ingest some data into a data store using the script 7 | 8 | ## Use the portal storage explorer to review the uploaded data 9 | 10 | ## Define a dataset over this data using the script 11 | 12 | ## Inspect the created datasets via the portal 13 | 14 | ## Open a dataset and note the exploration capabilities 15 | 16 | ## Review the train submit script that uses datasets 17 | 18 | ## Review the train script that uses datasets 19 | 20 | ## Submit a training run 21 | 22 | ## Observe from the run metadata which dataset was used for training 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /labs/05_hypertune.md: -------------------------------------------------------------------------------- 1 | # Lab 5: hypertune capabilities 2 | 3 | ## Understand the goal 4 | 5 | ## Walk through the hypertune code 6 | 7 | ## Understand the differences in run configuration 8 | 9 | ## Submit a run on AML compute 10 | 11 | ## View the results in the portal 12 | -------------------------------------------------------------------------------- /labs/06_pipelines.md: -------------------------------------------------------------------------------- 1 | # Lab 6: pipelines 2 | 3 | ## Refactor the hypertune code into an ML pipeline with: 4 | 1) data prep 5 | 2) hypertune 6 | 3) train 7 | 8 | -------------------------------------------------------------------------------- /labs/README.md: -------------------------------------------------------------------------------- 1 | # Folder for hosting all documents for a Data Science Project 2 | 3 | Documents will contain information about the following: 4 | 5 | 1. System architecture 6 | 2. Data dictionaries 7 | 3. Reports related to data understanding and modeling 8 | 4. Project management and planning docs 9 | 5. Information obtained from a business owner or client about the project 10 | 6. Docs and presentations prepared to share information about the project 11 | --------------------------------------------------------------------------------