├── README.md
├── best_run
│   ├── README.md
│   └── best_run.py
├── data
│   ├── README.md
│   └── sample_libsvm_data.txt
├── dump
│   ├── README.md
│   ├── dump_experiment.py
│   ├── dump_run.py
│   ├── dump_utils.py
│   ├── experiment.txt
│   └── run.txt
├── hello_world
│   ├── MLproject
│   ├── README.md
│   ├── cluster.json
│   ├── conda.yaml
│   ├── hello_world.ipynb
│   ├── hello_world.py
│   └── playbook.ipynb
├── pyspark
│   ├── MLproject
│   ├── README.md
│   ├── conda.yaml
│   ├── playbook.ipynb
│   ├── predict.py
│   └── train.py
├── scala_spark
│   ├── README.md
│   ├── playbook.ipynb
│   ├── pom.xml
│   ├── run_submit_existing_cluster.json
│   ├── run_submit_new_cluster.json
│   └── src
│       └── main
│           └── scala
│               └── org
│                   └── andre
│                       └── mlflow
│                           └── examples
│                               ├── MLeapUtils.scala
│                               ├── MLflowUtils.scala
│                               ├── decisiontree
│                               │   ├── PredictDecisionTree.scala
│                               │   └── TrainDecisionTree.scala
│                               └── hello
│                                   └── HelloWorld.scala
├── search
│   ├── README.md
│   └── search.py
└── sklearn
    ├── MLproject
    ├── README.md
    ├── Train_Wine_Quality.ipynb
    ├── conda.yaml
    ├── create_job_existing_cluster.json
    ├── create_job_new_cluster.json
    ├── data
    │   ├── wine-quality-red.csv
    │   ├── wine-quality-white.csv
    │   └── wine-quality.json
    ├── main.py
    ├── mlflow_run_cluster.json
    ├── pickle_predict.py
    ├── playbook.ipynb
    ├── pyfunc_predict.py
    ├── run_submit_existing_cluster.json
    ├── run_submit_new_cluster.json
    ├── scikit_predict.py
    ├── setup.py
    ├── spark_udf_predict.py
    ├── util.py
    └── wine_quality
        ├── __init__.py
        ├── plot_utils.py
        └── train.py

/README.md:
--------------------------------------------------------------------------------
1 | # mlflow-spark-summit-2019
2 | 
3 | MLflow code for Spark Summit 2019.
4 | 
5 | Session: [Managing the Complete Machine Learning Lifecycle with MLflow](https://databricks.com/sparkaisummit/north-america/sessions-single-2019?id=183).
6 | 
7 | ## Setup
8 | ```
9 | pip install mlflow==0.9.1
10 | pip install matplotlib
11 | pip install pyarrow
12 | ```
13 | 
14 | ## MLflow Server
15 | ```
16 | virtualenv mlflow_server
17 | source mlflow_server/bin/activate
18 | mlflow server --host 0.0.0.0 --port 5000 --backend-store-uri $PWD/mlruns --default-artifact-root $PWD/mlruns
19 | ```
20 | 
21 | ## Examples
22 | Before running an experiment:
23 | ```
24 | export MLFLOW_TRACKING_URI=http://localhost:5000
25 | ```
26 | 
27 | * [hello_world](hello_world) - Hello World
28 | * [sklearn](sklearn) - Scikit-learn model
29 | * [pyspark](pyspark) - PySpark model
30 | * [scala_spark](scala_spark) - Scala Spark ML model using the Java client
31 | * [search](search) - Shows the new [MLflow 0.9.1 Search](https://mlflow.org/docs/latest/search-syntax.html) feature
32 | * [dump](dump) - Shows usage of some [mlflow.tracking](https://mlflow.org/docs/latest/python_api/mlflow.tracking.html) package methods
33 | * [best_run](best_run) - Finds the best model run
34 | 
--------------------------------------------------------------------------------
/best_run/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # mlflow-spark-summit-2019 - best_run
3 | 
4 | Finds the best run of an experiment by searching for the min or max of a metric.
5 | 
6 | Ideally we would like to execute the search on the server side for scalability reasons.
7 | Since the [search](https://www.mlflow.org/docs/latest/search-syntax.html) syntax does not support min/max, we have to perform the search logic on the client side.
8 | 
9 | Two implementations:
10 | * Slow - Finds the best run by calling get_run for each run. Optimized for space, as response payloads are small.
11 | * Fast - Finds the best run by calling search once to get data for all of an experiment's runs. Optimized for time, but the response payload will be large for experiments with many runs.
12 | 
13 | Sample run for [best_run.py](best_run.py):
14 | ```
15 | python best_run.py --experiment_id 2 --metric rmse --ascending
16 | ```
17 | ```
18 | slow best: ('3d57e49ba31843ac9ea3f4443ac4fbac', 0.7585747707504502)
19 | fast best: ('3d57e49ba31843ac9ea3f4443ac4fbac', 0.7585747707504502)
20 | ```
21 | 
--------------------------------------------------------------------------------
/best_run/best_run.py:
--------------------------------------------------------------------------------
1 | 
2 | from argparse import ArgumentParser
3 | import mlflow
4 | client = mlflow.tracking.MlflowClient()
5 | 
6 | def lt(x,y): return x < y
7 | def gt(x,y): return x > y
8 | 
9 | def calc(metric, run, best, funk):
10 |     for m in run.data.metrics:
11 |         if m.key == metric and (best is None or funk(m.value, best[1])):
12 |             best = (run.info.run_uuid, m.value)
13 |     return best
14 | 
15 | def get_best_run_slow(experiment_id, metric, ascending=False):
16 |     """
17 |     Finds the best run by calling get_run for each run.
18 |     """
19 |     funk = lt if ascending else gt
20 |     best = None
21 |     infos = client.list_run_infos(experiment_id)
22 |     for info in infos:
23 |         run = client.get_run(info.run_uuid)
24 |         best = calc(metric, run, best, funk)
25 |     return best
26 | 
27 | def get_best_run_fast(experiment_id, metric, ascending=False):
28 |     """
29 |     Finds the best run by calling search once to get data for all of an experiment's runs.
30 |     """
31 |     funk = lt if ascending else gt
32 |     best = None
33 |     runs = client.search_runs([experiment_id], "")
34 |     for run in runs:
35 |         best = calc(metric, run, best, funk)
36 |     return best
37 | 
38 | if __name__ == "__main__":
39 |     parser = ArgumentParser()
40 |     parser.add_argument("--experiment_id", dest="experiment_id", help="Experiment ID", type=str, required=True)
41 |     parser.add_argument("--metric", dest="metric", help="Metric", type=str, required=True)
42 |     parser.add_argument("--ascending", dest="ascending", help="ascending", required=False, default=False, action="store_true")
43 |     parser.add_argument("--which", dest="which", help="Which: fast|slow|both", type=str, default="both")
44 |     args = parser.parse_args()
45 | 
46 |     if args.which in ['slow','both']:
47 |         best = get_best_run_slow(args.experiment_id, args.metric, args.ascending)
48 |         print("slow best:", best)
49 |     if args.which in ['fast','both']:
50 |         best = get_best_run_fast(args.experiment_id, args.metric, args.ascending)
51 |         print("fast best:", best)
52 | 
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | 
2 | sample_libsvm_data.txt:
3 | * https://github.com/apache/spark/tree/master/data/mllib/sample_libsvm_data.txt
4 | * $SPARK_HOME/data/mllib/sample_libsvm_data.txt
--------------------------------------------------------------------------------
/dump/README.md:
--------------------------------------------------------------------------------
1 | # mlflow-spark-summit-2019 - dump
2 | 
3 | Dumps all experiment or run information recursively.
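Both scripts boil down to two `mlflow.tracking` client calls plus a recursive walk over each run's artifacts. A minimal sketch against the 0.9.x API (experiment ID 2 matches the sample dumps referenced below):
```
import mlflow
client = mlflow.tracking.MlflowClient()
for info in client.list_run_infos(2):     # one RunInfo per run of the experiment
    run = client.get_run(info.run_uuid)   # full params, metrics and tags
    print(run.info.run_uuid, [(m.key, m.value) for m in run.data.metrics])
```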
4 | 
5 | **Dump Run**
6 | 
7 | * [dump_run.py](dump_run.py)
8 | * [sample dump](run.txt)
9 | 
10 | ```
11 | python dump_run.py --run_id 2cbab69842e4412c99bfb5e15344bc42 --artifact_max_level 5
12 | ```
13 | 
14 | **Dump Experiment**
15 | 
16 | * [dump_experiment.py](dump_experiment.py)
17 | * [sample dump](experiment.txt)
18 | 
19 | ```
20 | python dump_experiment.py --experiment_id_or_name 2 --show_runs --artifact_max_level 5
21 | ```
22 | 
--------------------------------------------------------------------------------
/dump/dump_experiment.py:
--------------------------------------------------------------------------------
1 | 
2 | """
3 | Recursively dumps all information about an experiment, including all details of its runs and their params, metrics and artifacts.
4 | Note that this can be expensive. Adjust your artifact_max_level.
5 | """
6 | 
7 | from __future__ import print_function
8 | import sys
9 | from argparse import ArgumentParser
10 | import mlflow
11 | from dump_utils import *
12 | 
13 | print("MLflow Version:", mlflow.version.VERSION)
14 | 
15 | def dump_experiment(exp):
16 |     print("Experiment Details:")
17 |     for k,v in exp.__dict__.items(): print("  {}: {}".format(k[1:],v))
18 | 
19 | def get_runs(client, infos, artifact_max_level):
20 |     for info in infos:
21 |         run = client.get_run(info.run_uuid)
22 |         dump_run(run)
23 |         dump_artifacts(client, info.run_uuid, "", INDENT_INC, artifact_max_level)
24 | 
25 | def dump(exp_id_or_name, artifact_max_level, show_runs):
26 |     print("Options:")
27 |     print("  exp_id_or_name:", exp_id_or_name)
28 |     print("  artifact_max_level:", artifact_max_level)
29 |     print("  show_runs:", show_runs)
30 |     client = mlflow.tracking.MlflowClient()
31 |     if exp_id_or_name.isdigit():
32 |         exp_id = int(exp_id_or_name)
33 |     else:
34 |         print("experiment_name:", exp_id_or_name)
35 |         exp_id = client.get_experiment_by_name(exp_id_or_name).experiment_id
36 |     print("experiment_id:", exp_id)
37 |     exp = client.get_experiment(exp_id)
38 |     dump_experiment(exp)
39 |     infos = client.list_run_infos(exp_id)
40 |     print("  #runs:", len(infos))
41 |     if not show_runs:
42 |         return
43 |     get_runs(client, infos, artifact_max_level)
44 |     print("#runs:", len(infos))
45 | 
46 | if __name__ == "__main__":
47 |     parser = ArgumentParser()
48 |     parser.add_argument("--experiment_id_or_name", dest="experiment_id", help="Experiment ID or name", required=True)
49 |     parser.add_argument("--artifact_max_level", dest="artifact_max_level", help="Number of artifact levels to recurse", required=False, default=1, type=int)
50 |     parser.add_argument("--show_runs", dest="show_runs", help="Show runs", required=False, default=False, action='store_true')
51 |     args = parser.parse_args()
52 |     dump(args.experiment_id, args.artifact_max_level, args.show_runs)
--------------------------------------------------------------------------------
/dump/dump_run.py:
--------------------------------------------------------------------------------
1 | 
2 | """
3 | Recursively dumps all information about a run, including params, metrics, tags and artifacts.
4 | """ 5 | 6 | from __future__ import print_function 7 | from argparse import ArgumentParser 8 | import mlflow 9 | from dump_utils import * 10 | 11 | print("MLflow Version:", mlflow.version.VERSION) 12 | 13 | def get_runs(client, infos, artifact_max_level): 14 | for info in infos: 15 | run = client.get_run(info.run_uuid) 16 | dump_run(run) 17 | dump_artifacts(client, info.run_uuid,"",INDENT_INC,artifact_max_level) 18 | 19 | def dump(run_id, artifact_max_level): 20 | print("Options:") 21 | print(" run_id:",run_id) 22 | print(" artifact_max_level:",artifact_max_level) 23 | client = mlflow.tracking.MlflowClient() 24 | run = client.get_run(run_id) 25 | dump_run(run) 26 | dump_artifacts(client, run_id,"",INDENT_INC, artifact_max_level) 27 | 28 | if __name__ == "__main__": 29 | parser = ArgumentParser() 30 | parser.add_argument("--run_id", dest="run_id", help="Run ID", required=True) 31 | parser.add_argument("--artifact_max_level", dest="artifact_max_level", help="Number of artifact levels to recurse", required=False, default=1, type=int) 32 | args = parser.parse_args() 33 | dump(args.run_id, args.artifact_max_level) 34 | -------------------------------------------------------------------------------- /dump/dump_utils.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Dump utilities. 4 | """ 5 | 6 | from __future__ import print_function 7 | import time 8 | 9 | INDENT_INC = " " 10 | MAX_LEVEL = 1 11 | 12 | def dump_run(run): 13 | print("Run {}".format(run.info.run_uuid)) 14 | for k,v in run.info.__dict__.items(): print(" {}: {}".format(k[1:],v)) 15 | print(" Params:") 16 | for e in run.data.params: 17 | print(" {}: {}".format(e.key,e.value)) 18 | print(" Metrics:") 19 | for e in run.data.metrics: 20 | sdt = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(e.timestamp/1000)) 21 | print(" {}: {} - timestamp: {} {}".format(e.key,e.value,e.timestamp,sdt)) 22 | print(" Tags:") 23 | for e in run.data.tags: 24 | print(" {}: {}".format(e.key,e.value)) 25 | 26 | def dump_run_info(info): 27 | print("Run run_uuid {}".format(info.run_uuid)) 28 | for k,v in info.__dict__.items(): print(" {}: {}".format(k,v)) 29 | 30 | def dump_artifact(art,indent="",level=0): 31 | print("{}Artifact - level {}:".format(indent,level)) 32 | for k,v in art.__dict__.items(): print(" {}{}: {}".format(indent,k[1:],v)) 33 | 34 | def _dump_artifacts(client, run_id, path, indent, level, max_level): 35 | level += 1 36 | if level > max_level: return 37 | artifacts = client.list_artifacts(run_id,path) 38 | for art in artifacts: 39 | dump_artifact(art,indent+INDENT_INC,level) 40 | if art.is_dir: 41 | _dump_artifacts(client, run_id, art.path, indent+INDENT_INC,level,max_level) 42 | 43 | def dump_artifacts(client, run_id, path="", indent="", max_level=MAX_LEVEL): 44 | print("{}Artifacts:".format(indent)) 45 | _dump_artifacts(client, run_id, path, indent, 0, max_level) 46 | 47 | -------------------------------------------------------------------------------- /dump/experiment.txt: -------------------------------------------------------------------------------- 1 | MLflow Version: 0.9.1 2 | Options: 3 | exp_id_or_name: 2 4 | artifact_max_level: 5 5 | show_runs: True 6 | experiment_id: 2 7 | Experiment Details: 8 | experiment_id: 2 9 | name: sklearn 10 | artifact_location: /opt/mlflow-server/mlruns/2 11 | lifecycle_stage: active 12 | #runs: 1 13 | Run 65b9e6f331da452a9ec0ff57fbfc26a1 14 | run_uuid: 65b9e6f331da452a9ec0ff57fbfc26a1 15 | experiment_id: 2 16 | name: 17 | source_type: 4 18 | 
source_name: train.py 19 | entry_point_name: 20 | user_id: andre 21 | status: 3 22 | start_time: 1556210939387 23 | end_time: 1556210940662 24 | source_version: 4d756b3812204510ffea2ff8d5af15e8e9cbe06e 25 | lifecycle_stage: active 26 | artifact_uri: /opt/mlflow-server/mlruns/2/65b9e6f331da452a9ec0ff57fbfc26a1/artifacts 27 | Params: 28 | alpha: 1.0 29 | l1_ratio: 0.5 30 | Metrics: 31 | mae: 0.6481010264813273 - timestamp: 1556210939 1970-01-19 00:16:50 32 | r2: 0.04618821720476163 - timestamp: 1556210939 1970-01-19 00:16:50 33 | rmse: 0.8593526200510287 - timestamp: 1556210939 1970-01-19 00:16:50 34 | Tags: 35 | data_path: data/wine-quality-white.csv 36 | exp_id: 2 37 | exp_name: sklearn 38 | mlflow.source.git.commit: 4d756b3812204510ffea2ff8d5af15e8e9cbe06e 39 | mlflow.source.name: train.py 40 | mlflow.source.type: LOCAL 41 | platform: Darwin 42 | run_origin: demo/egg/train.sh 43 | Artifacts: 44 | Artifact - level 1: 45 | path: model 46 | is_dir: True 47 | bytes: None 48 | Artifact - level 2: 49 | path: model/MLmodel 50 | is_dir: False 51 | bytes: 344 52 | Artifact - level 2: 53 | path: model/conda.yaml 54 | is_dir: False 55 | bytes: 120 56 | Artifact - level 2: 57 | path: model/model.pkl 58 | is_dir: False 59 | bytes: 673 60 | Artifact - level 1: 61 | path: wine_ElasticNet-paths.png 62 | is_dir: False 63 | bytes: 27773 64 | #runs: 1 65 | -------------------------------------------------------------------------------- /dump/run.txt: -------------------------------------------------------------------------------- 1 | MLflow Version: 0.9.1 2 | Options: 3 | run_id: 65b9e6f331da452a9ec0ff57fbfc26a1 4 | artifact_max_level: 5 5 | Run 65b9e6f331da452a9ec0ff57fbfc26a1 6 | run_uuid: 65b9e6f331da452a9ec0ff57fbfc26a1 7 | experiment_id: 2 8 | name: 9 | source_type: 4 10 | source_name: train.py 11 | entry_point_name: 12 | user_id: andre 13 | status: 3 14 | start_time: 1556210939387 15 | end_time: 1556210940662 16 | source_version: 4d756b3812204510ffea2ff8d5af15e8e9cbe06e 17 | lifecycle_stage: active 18 | artifact_uri: /opt/mlflow-server/mlruns/2/65b9e6f331da452a9ec0ff57fbfc26a1/artifacts 19 | Params: 20 | alpha: 1.0 21 | l1_ratio: 0.5 22 | Metrics: 23 | mae: 0.6481010264813273 - timestamp: 1556210939 1970-01-19 00:16:50 24 | r2: 0.04618821720476163 - timestamp: 1556210939 1970-01-19 00:16:50 25 | rmse: 0.8593526200510287 - timestamp: 1556210939 1970-01-19 00:16:50 26 | Tags: 27 | data_path: data/wine-quality-white.csv 28 | exp_id: 2 29 | exp_name: sklearn 30 | mlflow.source.git.commit: 4d756b3812204510ffea2ff8d5af15e8e9cbe06e 31 | mlflow.source.name: train.py 32 | mlflow.source.type: LOCAL 33 | platform: Darwin 34 | run_origin: demo/egg/train.sh 35 | Artifacts: 36 | Artifact - level 1: 37 | path: model 38 | is_dir: True 39 | bytes: None 40 | Artifact - level 2: 41 | path: model/MLmodel 42 | is_dir: False 43 | bytes: 344 44 | Artifact - level 2: 45 | path: model/conda.yaml 46 | is_dir: False 47 | bytes: 120 48 | Artifact - level 2: 49 | path: model/model.pkl 50 | is_dir: False 51 | bytes: 673 52 | Artifact - level 1: 53 | path: wine_ElasticNet-paths.png 54 | is_dir: False 55 | bytes: 27773 56 | -------------------------------------------------------------------------------- /hello_world/MLproject: -------------------------------------------------------------------------------- 1 | name: mlflow_demo_hello_world 2 | 3 | conda_env: conda.yaml 4 | 5 | entry_points: 6 | main: 7 | parameters: 8 | alpha: {type: string, default: "0.01" } 9 | run_origin: {type: string, default: "None" } 10 | log_artifact: {type: 
string, default: "False" }
11 |     command: "python hello_world.py
12 |               --alpha {alpha}
13 |               --run_origin {run_origin}
14 |               --log_artifact {log_artifact}"
--------------------------------------------------------------------------------
/hello_world/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # mlflow-spark-summit-2019 - hello_world
3 | 
4 | Simple Hello World that demonstrates the different ways to run an MLflow experiment.
5 | 
6 | For details see [MLflow documentation - Running Projects](https://mlflow.org/docs/latest/projects.html#running-projects).
7 | 
8 | Synopsis of [hello_world.py](hello_world.py):
9 | * Creates the experiment hello_world if it does not exist.
10 | * Logs parameters, metrics and tags.
11 | * Batch logging of parameters, metrics and tags.
12 | * No ML training.
13 | * Optionally writes an artifact.
14 | 
15 | The different ways to run an experiment:
16 | * Unmanaged without mlflow
17 |   * Command-line python
18 |   * Jupyter notebook
19 | * Using mlflow run with [MLproject](MLproject)
20 |   * mlflow run local
21 |   * mlflow run git
22 |   * mlflow run remote
23 | 
24 | ## Setup
25 | 
26 | **External tracking server**
27 | ```
28 | export MLFLOW_TRACKING_URI=http://localhost:5000
29 | ```
30 | 
31 | **Databricks managed tracking server**
32 | ```
33 | export MLFLOW_TRACKING_URI=databricks
34 | ```
35 | The token and tracking server URL will be picked up from the default profile of your Databricks CLI ~/.databrickscfg file.
36 | 
37 | ## Running
38 | 
39 | ### Unmanaged without mlflow run
40 | #### Command-line python
41 | ```
42 | python hello_world.py
43 | ```
44 | 
45 | #### Jupyter notebook
46 | See [hello_world.ipynb](hello_world.ipynb).
47 | ```
48 | export MLFLOW_TRACKING_URI=http://localhost:5000
49 | jupyter notebook
50 | ```
51 | 
52 | ### Using mlflow run
53 | 
54 | #### mlflow run local
55 | ```
56 | mlflow run . -Palpha=.01 -Prun_origin=LocalRun -Plog_artifact=True
57 | ```
58 | You can also specify an experiment ID:
59 | ```
60 | mlflow run . --experiment-id=2019 -Palpha=.01 -Prun_origin=LocalRun -Plog_artifact=True
61 | ```
62 | 
63 | #### mlflow run git
64 | ```
65 | mlflow run https://github.com/amesar/mlflow-fun.git#examples/hello_world \
66 |   --experiment-id=2019 \
67 |   -Palpha=100 -Prun_origin=GitRun -Plog_artifact=True
68 | ```
69 | #### mlflow run Databricks remote
70 | Run against Databricks. See [Remote Execution on Databricks](https://mlflow.org/docs/latest/projects.html#remote-execution-on-databricks) and [cluster.json](cluster.json).
71 | ``` 72 | mlflow run https://github.com/amesar/mlflow-fun.git#examples/hello_world \ 73 | --experiment-id=2019 \ 74 | -Palpha=100 -Prun_origin=RemoteRun -Plog_artifact=True \ 75 | -m databricks --cluster-spec cluster.json 76 | ``` 77 | -------------------------------------------------------------------------------- /hello_world/cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "spark_version": "5.3.x-scala2.11", 3 | "driver_node_type_id": "i3.xlarge", 4 | "node_type_id": "i3.xlarge", 5 | "num_workers": 1, 6 | "spark_env_vars": { 7 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 8 | }, 9 | "libraries": [ 10 | { "pypi": { "package": "mlflow" }} 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /hello_world/conda.yaml: -------------------------------------------------------------------------------- 1 | name: mlflow-demo-hello-world 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.6 6 | - pip: 7 | - mlflow==0.9.1 8 | -------------------------------------------------------------------------------- /hello_world/hello_world.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from __future__ import print_function\n", 10 | "import mlflow\n", 11 | "from mlflow.entities import Param,Metric,RunTag" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 4, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "MLflow Version: 0.9.1\n", 24 | "Tracking URI: http://localhost:5000\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "print(\"MLflow Version:\", mlflow.version.VERSION)\n", 30 | "mlflow.set_tracking_uri(\"http://localhost:5000\")\n", 31 | "print(\"Tracking URI:\", mlflow.tracking.get_tracking_uri())" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 5, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "experiment_name: hello_world\n", 44 | "experiment_id: 1\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "experiment_name = \"hello_world\"\n", 50 | "print(\"experiment_name:\",experiment_name)\n", 51 | "mlflow.set_experiment(experiment_name)\n", 52 | "\n", 53 | "client = mlflow.tracking.MlflowClient()\n", 54 | "experiment_id = client.get_experiment_by_name(experiment_name).experiment_id\n", 55 | "print(\"experiment_id:\",experiment_id)\n", 56 | "\n", 57 | "import time\n", 58 | "now = int(time.time()+.5)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 6, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "def run(alpha, run_origin, log_artifact):\n", 68 | " with mlflow.start_run(run_name=run_origin) as run:\n", 69 | " print(\"runId:\",run.info.run_uuid)\n", 70 | " print(\"artifact_uri:\",mlflow.get_artifact_uri())\n", 71 | " print(\"alpha:\",alpha)\n", 72 | " print(\"log_artifact:\",log_artifact)\n", 73 | " print(\"run_origin:\",run_origin)\n", 74 | " mlflow.log_param(\"alpha\", alpha)\n", 75 | " mlflow.log_metric(\"rmse\", 0.789)\n", 76 | " mlflow.set_tag(\"run_origin\", run_origin)\n", 77 | " mlflow.set_tag(\"log_artifact\", log_artifact)\n", 78 | " if log_artifact:\n", 79 | " with open(\"info.txt\", \"w\") as f:\n", 80 | " f.write(\"Hi artifact\")\n", 81 | " 
mlflow.log_artifact(\"info.txt\")\n", 82 | "\n", 83 | " params = [ Param(\"p1\",\"0.1\"), Param(\"p2\",\"0.2\") ]\n", 84 | " metrics = [ Metric(\"m1\",0.1,now), Metric(\"m2\",0.2,now) ]\n", 85 | " tags = [ RunTag(\"t1\",\"hi1\"), RunTag(\"t2\",\"hi2\") ]\n", 86 | " client.log_batch(run.info.run_uuid, metrics, params, tags)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 7, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "runId: 512955d89c6a40c09bd45429be8206aa\n", 99 | "artifact_uri: /Users/ander/work/mlflow/local_mlrun/mlruns/1/512955d89c6a40c09bd45429be8206aa/artifacts\n", 100 | "alpha: 0.1\n", 101 | "log_artifact: True\n", 102 | "run_origin: jupyter\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "run(\"0.1\", \"jupyter\", True)" 108 | ] 109 | } 110 | ], 111 | "metadata": { 112 | "kernelspec": { 113 | "display_name": "Python 3", 114 | "language": "python", 115 | "name": "python3" 116 | }, 117 | "language_info": { 118 | "codemirror_mode": { 119 | "name": "ipython", 120 | "version": 3 121 | }, 122 | "file_extension": ".py", 123 | "mimetype": "text/x-python", 124 | "name": "python", 125 | "nbconvert_exporter": "python", 126 | "pygments_lexer": "ipython3", 127 | "version": "3.6.8" 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 2 132 | } 133 | -------------------------------------------------------------------------------- /hello_world/hello_world.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | import time 4 | import mlflow 5 | from mlflow.entities import Param,Metric,RunTag 6 | from argparse import ArgumentParser 7 | 8 | print("MLflow Version:", mlflow.version.VERSION) 9 | print("Tracking URI:", mlflow.tracking.get_tracking_uri()) 10 | 11 | experiment_name = "hello_world" 12 | print("experiment_name:",experiment_name) 13 | mlflow.set_experiment(experiment_name) 14 | 15 | client = mlflow.tracking.MlflowClient() 16 | experiment_id = client.get_experiment_by_name(experiment_name).experiment_id 17 | print("experiment_id:",experiment_id) 18 | 19 | now = int(time.time()+.5) 20 | 21 | def run(alpha, run_origin, log_artifact): 22 | with mlflow.start_run(run_name=run_origin) as run: 23 | print("runId:",run.info.run_uuid) 24 | print("artifact_uri:",mlflow.get_artifact_uri()) 25 | print("alpha:",alpha) 26 | print("log_artifact:",log_artifact) 27 | print("run_origin:",run_origin) 28 | mlflow.log_param("alpha", alpha) 29 | mlflow.log_metric("rmse", 0.789) 30 | mlflow.set_tag("run_origin", run_origin) 31 | mlflow.set_tag("log_artifact", log_artifact) 32 | if log_artifact: 33 | with open("info.txt", "w") as f: 34 | f.write("Hi artifact") 35 | mlflow.log_artifact("info.txt") 36 | 37 | params = [ Param("p1","0.1"), Param("p2","0.2") ] 38 | metrics = [ Metric("m1",0.1,now), Metric("m2",0.2,now) ] 39 | tags = [ RunTag("t1","hi1"), RunTag("t2","hi2") ] 40 | client.log_batch(run.info.run_uuid, metrics, params, tags) 41 | 42 | import sys 43 | if __name__ == "__main__": 44 | parser = ArgumentParser() 45 | parser.add_argument("--alpha", dest="alpha", help="alpha", default=0.1, type=float ) 46 | parser.add_argument("--run_origin", dest="run_origin", help="run_origin", default="") 47 | parser.add_argument("--log_artifact", dest="log_artifact", help="Log artifact", type=str, default="False") 48 | args = parser.parse_args() 49 | run(args.alpha,args.run_origin,args.log_artifact=="True") 50 | 
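A quick way to sanity-check a finished run is to read it back with the same `mlflow.tracking` client API that hello_world.py uses. A minimal sketch (not part of the repo; substitute a run ID printed by hello_world.py):
```
import mlflow
client = mlflow.tracking.MlflowClient()
run = client.get_run("512955d89c6a40c09bd45429be8206aa")           # a runId printed by hello_world.py
print([(p.key, p.value) for p in run.data.params])                 # alpha plus batch params p1, p2
print([(m.key, m.value) for m in run.data.metrics])                # rmse plus batch metrics m1, m2
print([a.path for a in client.list_artifacts(run.info.run_uuid)])  # info.txt when log_artifact is True
```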
-------------------------------------------------------------------------------- /hello_world/playbook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## hello_world playbook" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "os.environ[\"MLFLOW_TRACKING_URI\"] = \"http://localhost:5000\"" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "MLflow Version: 0.9.1\n", 30 | "Tracking URI: http://localhost:5000\n", 31 | "experiment_name: hello_world\n", 32 | "experiment_id: 1\n", 33 | "runId: f12c8d9e6d56450280943ec814cdb32e\n", 34 | "artifact_uri: /Users/ander/work/mlflow/local_mlrun/mlruns/1/f12c8d9e6d56450280943ec814cdb32e/artifacts\n", 35 | "alpha: 0.1\n", 36 | "log_artifact: False\n", 37 | "run_origin: \n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "! python hello_world.py" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "MLflow Version: 0.9.1\n", 55 | "Tracking URI: http://localhost:5000\n", 56 | "experiment_name: hello_world\n", 57 | "experiment_id: 1\n", 58 | "runId: 34b98f61e9f94ada81d6b8be892c8f65\n", 59 | "artifact_uri: /Users/ander/work/mlflow/local_mlrun/mlruns/1/34b98f61e9f94ada81d6b8be892c8f65/artifacts\n", 60 | "alpha: 0.1\n", 61 | "log_artifact: True\n", 62 | "run_origin: \n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "! python hello_world.py --log_artifact True" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "2019/04/25 13:42:50 INFO mlflow.projects: === Created directory /var/folders/_9/tbkxzw0116v2cp_zq4f1_1cm0000gp/T/tmpdihQqS for downloading remote URIs passed to arguments of type 'path' ===\n", 80 | "2019/04/25 13:42:50 INFO mlflow.projects: === Running command 'source activate mlflow-aacce47b0cb7984f4aead56265692d3969388f30 && python hello_world.py --alpha .01 --run_origin LocalRun --log_artifact True' in run with ID '91bdea4b8e7f47379688d98bfeb424a0' === \n", 81 | "MLflow Version: 0.9.1\n", 82 | "Tracking URI: http://localhost:5000\n", 83 | "experiment_name: hello_world\n", 84 | "experiment_id: 1\n", 85 | "runId: 91bdea4b8e7f47379688d98bfeb424a0\n", 86 | "artifact_uri: /Users/ander/work/mlflow/local_mlrun/mlruns/0/91bdea4b8e7f47379688d98bfeb424a0/artifacts\n", 87 | "alpha: 0.01\n", 88 | "log_artifact: True\n", 89 | "run_origin: LocalRun\n", 90 | "2019/04/25 13:42:51 INFO mlflow.projects: === Run (ID '91bdea4b8e7f47379688d98bfeb424a0') succeeded ===\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "! mlflow run . 
-Palpha=.01 -Prun_origin=LocalRun -Plog_artifact=True"
96 |    ]
97 |   }
98 |  ],
99 |  "metadata": {
100 |   "kernelspec": {
101 |    "display_name": "Python 2",
102 |    "language": "python",
103 |    "name": "python2"
104 |   },
105 |   "language_info": {
106 |    "codemirror_mode": {
107 |     "name": "ipython",
108 |     "version": 2
109 |    },
110 |    "file_extension": ".py",
111 |    "mimetype": "text/x-python",
112 |    "name": "python",
113 |    "nbconvert_exporter": "python",
114 |    "pygments_lexer": "ipython2",
115 |    "version": "2.7.14"
116 |   }
117 |  },
118 |  "nbformat": 4,
119 |  "nbformat_minor": 2
120 | }
--------------------------------------------------------------------------------
/pyspark/MLproject:
--------------------------------------------------------------------------------
1 | name: mlflow_demo_pyspark
2 | 
3 | conda_env: conda.yaml
4 | 
5 | entry_points:
6 |   main:
7 |     parameters:
8 |       max_depth: {type: int, default: 2 }
9 |       max_bins: {type: int, default: 32 }
10 |     command: "spark-submit --master local[2] train.py --max_depth {max_depth} --max_bins {max_bins}"
--------------------------------------------------------------------------------
/pyspark/README.md:
--------------------------------------------------------------------------------
1 | # mlflow-spark-summit-2019 - pyspark
2 | 
3 | ## Overview
4 | 
5 | * PySpark Decision Tree Classification example
6 | * Source: [train.py](train.py) and [predict.py](predict.py)
7 | * Experiment name: pyspark
8 | 
9 | ## Train
10 | 
11 | ### Unmanaged without mlflow run
12 | 
13 | To run with the standard main function:
14 | ```
15 | spark-submit --master local[2] train.py --max_depth 16 --max_bins 32
16 | ```
17 | 
18 | ### Using mlflow run
19 | 
20 | These runs use the [MLproject](MLproject) file. For more details see [MLflow documentation - Running Projects](https://mlflow.org/docs/latest/projects.html#running-projects).
21 | 
22 | Note that `mlflow run` ignores the `set_experiment()` function, so you must specify the experiment with the `--experiment-id` argument.
23 | 
24 | **mlflow run local**
25 | ```
26 | mlflow run . -P max_depth=3 -P max_bins=24 --experiment-id=2019
27 | ```
28 | 
29 | **mlflow run github**
30 | ```
31 | mlflow run https://github.com/amesar/mlflow-fun.git#examples/pyspark \
32 |   -P max_depth=3 -P max_bins=24 \
33 |   --experiment-id=2019
34 | ```
35 | 
36 | ## Predict
37 | 
38 | See [predict.py](predict.py).
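At its core, predict.py reloads the logged pipeline by run ID with the 0.9.x `mlflow.spark` API and applies it to the LIBSVM data. The essential calls, taken from the script (`spark` and `run_id` come from its setup code):
```
import mlflow.spark as mlflow_spark
data = spark.read.format("libsvm").load("../data/sample_libsvm_data.txt")
model = mlflow_spark.load_model("spark-model", run_id=run_id)  # run_id of a prior training run
predictions = model.transform(data)
```
A sample invocation and its output: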
39 | 40 | ``` 41 | run_id=7b951173284249f7a3b27746450ac7b0 42 | spark-submit --master local[2] predict.py $run_id 43 | ``` 44 | 45 | ``` 46 | Predictions 47 | root 48 | |-- label: double (nullable = true) 49 | |-- features: vector (nullable = true) 50 | |-- indexedLabel: double (nullable = false) 51 | |-- indexedFeatures: vector (nullable = true) 52 | |-- rawPrediction: vector (nullable = true) 53 | |-- probability: vector (nullable = true) 54 | |-- prediction: double (nullable = false) 55 | 56 | +----------+------------+-----------+ 57 | |prediction|indexedLabel|probability| 58 | +----------+------------+-----------+ 59 | |0.0 |1.0 |[1.0,0.0] | 60 | |1.0 |0.0 |[0.0,1.0] | 61 | |1.0 |0.0 |[0.0,1.0] | 62 | +----------+------------+-----------+ 63 | ``` 64 | 65 | -------------------------------------------------------------------------------- /pyspark/conda.yaml: -------------------------------------------------------------------------------- 1 | name: mlflow-demo-pyspark 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.6 6 | - pip: 7 | - mlflow==0.9.1 8 | -------------------------------------------------------------------------------- /pyspark/playbook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## pyspark playbook" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 23, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "mlflow, version 0.9.1\r\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "! mlflow --version" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "### Train" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 24, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "os.environ[\"MLFLOW_TRACKING_URI\"] = \"http://localhost:5000\"" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 26, 47 | "metadata": { 48 | "scrolled": false 49 | }, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "19/04/24 21:07:26 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 56 | "19/04/24 21:07:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 57 | "19/04/24 21:07:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n", 58 | "MLflow Version: 0.9.1\n", 59 | "Tracking URI: http://localhost:5000\n", 60 | "experiment_name: pyspark\n", 61 | "MLflow Version: 0.9.1\n", 62 | "experiment_id: 3\n", 63 | "run_id: 71fc995eb9154fe7bf360f1686456ea6\n", 64 | "experiment_id: 3\n", 65 | "Parameters: max_depth: 16 max_bins: 32\n", 66 | "+----------+------------+--------------------+\n", 67 | "|prediction|indexedLabel| features|\n", 68 | "+----------+------------+--------------------+\n", 69 | "| 1.0| 1.0|(692,[98,99,100,1...|\n", 70 | "| 1.0| 1.0|(692,[123,124,125...|\n", 71 | "| 1.0| 1.0|(692,[124,125,126...|\n", 72 | "| 1.0| 1.0|(692,[124,125,126...|\n", 73 | "| 1.0| 1.0|(692,[126,127,128...|\n", 74 | "+----------+------------+--------------------+\n", 75 | "only showing top 5 rows\n", 76 | "\n", 77 | "Test Error = 0.033333333333333326 \n", 78 | "DecisionTreeClassificationModel (uid=DecisionTreeClassifier_2cea66fefc86) of depth 1 with 3 nodes\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "! spark-submit --master local[2] train.py --max_depth 16 --max_bins 32" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 27, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "2019/04/24 21:07:43 INFO mlflow.projects: === Fetching project from https://github.com/amesar/mlflow-fun.git#examples/pyspark into /var/folders/_9/tbkxzw0116v2cp_zq4f1_1cm0000gp/T/tmpg7fq0zy5 ===\n", 96 | "2019/04/24 21:07:47 INFO mlflow.projects: === Created directory /var/folders/_9/tbkxzw0116v2cp_zq4f1_1cm0000gp/T/tmpji7c7udh for downloading remote URIs passed to arguments of type 'path' ===\n", 97 | "2019/04/24 21:07:47 INFO mlflow.projects: === Running command 'source activate mlflow-95125872403f1ccbea3f04eea25e874f26a00372 && spark-submit --master local[2] train.py --max_depth 3 --max_bins 24' in run with ID '6aed788a20ea4fae90c423898fbfad58' === \n", 98 | "19/04/24 21:07:48 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 99 | "19/04/24 21:07:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 100 | "19/04/24 21:07:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", 101 | "/Users/ander/miniconda3/envs/mlflow-95125872403f1ccbea3f04eea25e874f26a00372/lib/python3.6/site-packages/mlflow/utils/environment.py:26: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", 102 | " env = yaml.load(_conda_header)\n", 103 | "MLflow Version: 0.8.2\n", 104 | "Tracking URI: http://localhost:5000\n", 105 | "experiment_name: py/spark/DecisionTree\n", 106 | "INFO: 'py/spark/DecisionTree' does not exist. 
Creating a new experiment\n", 107 | "MLflow Version: 0.8.2\n", 108 | "experiment_id: 6\n", 109 | "run_id: 6aed788a20ea4fae90c423898fbfad58\n", 110 | "experiment_id: 2\n", 111 | "Parameters: max_depth: 3 max_bins: 24\n", 112 | "+----------+------------+--------------------+\n", 113 | "|prediction|indexedLabel| features|\n", 114 | "+----------+------------+--------------------+\n", 115 | "| 1.0| 1.0|(692,[121,122,123...|\n", 116 | "| 1.0| 1.0|(692,[123,124,125...|\n", 117 | "| 1.0| 1.0|(692,[124,125,126...|\n", 118 | "| 1.0| 1.0|(692,[126,127,128...|\n", 119 | "| 1.0| 1.0|(692,[126,127,128...|\n", 120 | "+----------+------------+--------------------+\n", 121 | "only showing top 5 rows\n", 122 | "\n", 123 | "Test Error = 0.0 \n", 124 | "DecisionTreeClassificationModel (uid=DecisionTreeClassifier_a089dee78712) of depth 2 with 5 nodes\n", 125 | "2019/04/24 21:08:04 INFO mlflow.projects: === Run (ID '6aed788a20ea4fae90c423898fbfad58') succeeded ===\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "! mlflow run https://github.com/amesar/mlflow-fun.git#examples/pyspark \\\n", 131 | " -P max_depth=3 -P max_bins=24 \\\n", 132 | " --experiment-id=2 \\" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 28, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "2019/04/24 21:08:06 INFO mlflow.projects: === Fetching project from https://github.com/amesar/mlflow-fun.git#examples/pyspark into /var/folders/_9/tbkxzw0116v2cp_zq4f1_1cm0000gp/T/tmpmg4jv0rv ===\n", 145 | "2019/04/24 21:08:10 INFO mlflow.projects: === Created directory /var/folders/_9/tbkxzw0116v2cp_zq4f1_1cm0000gp/T/tmpi8i4vntc for downloading remote URIs passed to arguments of type 'path' ===\n", 146 | "2019/04/24 21:08:10 INFO mlflow.projects: === Running command 'source activate mlflow-95125872403f1ccbea3f04eea25e874f26a00372 && spark-submit --master local[2] train.py --max_depth 3 --max_bins 24' in run with ID '980be9f445184ff4854701b0dfd0889d' === \n", 147 | "19/04/24 21:08:11 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 148 | "19/04/24 21:08:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 149 | "19/04/24 21:08:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", 150 | "/Users/ander/miniconda3/envs/mlflow-95125872403f1ccbea3f04eea25e874f26a00372/lib/python3.6/site-packages/mlflow/utils/environment.py:26: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. 
Please read https://msg.pyyaml.org/load for full details.\n", 151 | " env = yaml.load(_conda_header)\n", 152 | "MLflow Version: 0.8.2\n", 153 | "Tracking URI: http://localhost:5000\n", 154 | "experiment_name: py/spark/DecisionTree\n", 155 | "MLflow Version: 0.8.2\n", 156 | "experiment_id: 6\n", 157 | "run_id: 980be9f445184ff4854701b0dfd0889d\n", 158 | "experiment_id: 2\n", 159 | "Parameters: max_depth: 3 max_bins: 24\n", 160 | "+----------+------------+--------------------+\n", 161 | "|prediction|indexedLabel| features|\n", 162 | "+----------+------------+--------------------+\n", 163 | "| 1.0| 1.0|(692,[98,99,100,1...|\n", 164 | "| 1.0| 1.0|(692,[100,101,102...|\n", 165 | "| 1.0| 1.0|(692,[121,122,123...|\n", 166 | "| 1.0| 1.0|(692,[122,123,124...|\n", 167 | "| 1.0| 1.0|(692,[124,125,126...|\n", 168 | "+----------+------------+--------------------+\n", 169 | "only showing top 5 rows\n", 170 | "\n", 171 | "Test Error = 0.02941176470588236 \n", 172 | "DecisionTreeClassificationModel (uid=DecisionTreeClassifier_a5830562b284) of depth 2 with 5 nodes\n", 173 | "2019/04/24 21:08:23 INFO mlflow.projects: === Run (ID '980be9f445184ff4854701b0dfd0889d') succeeded ===\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "! mlflow run https://github.com/amesar/mlflow-fun.git#examples/pyspark \\\n", 179 | " -P max_depth=3 -P max_bins=24 \\\n", 180 | " --experiment-id=2" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "### Predict" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 3, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "19/04/25 14:01:33 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 200 | "19/04/25 14:01:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 201 | "19/04/25 14:01:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", 202 | "MLflow Version: 0.9.1\n", 203 | "Tracking URI: /Users/ander/git/andre/clean/mlflow-spark-summit-2019/pyspark/mlruns\n", 204 | "run_id: 6ca69795529e491983d217181ab2dae9\n", 205 | "data_path: ../data/sample_libsvm_data.txt\n", 206 | "Traceback (most recent call last):\n", 207 | " File \"/Users/ander/git/andre/clean/mlflow-spark-summit-2019/pyspark/predict.py\", line 20, in \n", 208 | " model = mlflow_spark.load_model(\"spark-model\", run_id=run_id)\n", 209 | " File \"/Users/ander/venvs/mlflow-venv/lib/python2.7/site-packages/mlflow/spark.py\", line 348, in load_model\n", 210 | " path = mlflow.tracking.utils._get_model_log_dir(model_name=path, run_id=run_id)\n", 211 | " File \"/Users/ander/venvs/mlflow-venv/lib/python2.7/site-packages/mlflow/tracking/utils.py\", line 279, in _get_model_log_dir\n", 212 | " run = store.get_run(run_id)\n", 213 | " File \"/Users/ander/venvs/mlflow-venv/lib/python2.7/site-packages/mlflow/store/file_store.py\", line 368, in get_run\n", 214 | " run_info = self._get_run_info(run_uuid)\n", 215 | " File \"/Users/ander/venvs/mlflow-venv/lib/python2.7/site-packages/mlflow/store/file_store.py\", line 384, in _get_run_info\n", 216 | " databricks_pb2.RESOURCE_DOES_NOT_EXIST)\n", 217 | "mlflow.exceptions.MlflowException: Run '6ca69795529e491983d217181ab2dae9' not found\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "! 
spark-submit --master local[2] predict.py 6ca69795529e491983d217181ab2dae9" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [] 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": "Python 2", 236 | "language": "python", 237 | "name": "python2" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 2 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython2", 249 | "version": "2.7.14" 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 2 254 | } 255 | -------------------------------------------------------------------------------- /pyspark/predict.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | import sys 4 | import mlflow 5 | import mlflow.spark as mlflow_spark 6 | from pyspark.sql import SparkSession 7 | 8 | print("MLflow Version:", mlflow.version.VERSION) 9 | print("Tracking URI:", mlflow.tracking.get_tracking_uri()) 10 | 11 | if __name__ == "__main__": 12 | run_id = sys.argv[1] 13 | print("run_id:",run_id) 14 | spark = SparkSession.builder.appName("Predict").getOrCreate() 15 | 16 | data_path = "../data/sample_libsvm_data.txt" 17 | print("data_path:",data_path) 18 | data = spark.read.format("libsvm").load(data_path) 19 | 20 | model = mlflow_spark.load_model("spark-model", run_id=run_id) 21 | predictions = model.transform(data) 22 | 23 | print("Prediction Dataframe") 24 | predictions.printSchema() 25 | 26 | print("Filtered Prediction Dataframe") 27 | df = predictions.select("prediction", "indexedLabel","probability").filter("prediction <> indexedLabel") 28 | df.printSchema() 29 | df.show(5,False) 30 | -------------------------------------------------------------------------------- /pyspark/train.py: -------------------------------------------------------------------------------- 1 | """ 2 | PySpark Decision Tree Classification Example. 3 | """ 4 | from __future__ import print_function 5 | 6 | import sys,os 7 | from argparse import ArgumentParser 8 | from pyspark.ml import Pipeline 9 | from pyspark.ml.classification import DecisionTreeClassifier 10 | from pyspark.ml.feature import StringIndexer, VectorIndexer 11 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator 12 | from pyspark.sql import SparkSession 13 | import mlflow 14 | from mlflow import version 15 | from mlflow import spark as mlflow_spark 16 | 17 | print("MLflow Version:", mlflow.version.VERSION) 18 | print("Tracking URI:", mlflow.tracking.get_tracking_uri()) 19 | experiment_name = "pyspark" 20 | print("experiment_name:",experiment_name) 21 | mlflow.set_experiment(experiment_name) 22 | 23 | def train(max_depth, max_bins): 24 | print("Parameters: max_depth: {} max_bins: {}".format(max_depth,max_bins)) 25 | spark = SparkSession.builder.appName("DecisionTreeClassificationExample").getOrCreate() 26 | 27 | # Load the data stored in LIBSVM format as a DataFrame. 28 | data_path = "../data/sample_libsvm_data.txt" 29 | data = spark.read.format("libsvm").load(data_path) 30 | 31 | # Index labels, adding metadata to the label column. 32 | # Fit on whole dataset to include all labels in index. 33 | labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data) 34 | 35 | # Automatically identify categorical features, and index them. 
36 |     # We specify maxCategories so features with > 4 distinct values are treated as continuous.
37 |     featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)
38 | 
39 |     # Split the data into training and test sets.
40 |     (trainingData, testData) = data.randomSplit([0.7, 0.3])
41 | 
42 |     # Train a DecisionTree model.
43 |     mlflow.log_param("max_depth", max_depth)
44 |     mlflow.log_param("max_bins", max_bins)
45 |     dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxDepth=max_depth, maxBins=max_bins)
46 | 
47 |     # Chain indexers and tree in a Pipeline.
48 |     pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
49 | 
50 |     # Train model. This also runs the indexers.
51 |     model = pipeline.fit(trainingData)
52 | 
53 |     # Make predictions.
54 |     predictions = model.transform(testData)
55 | 
56 |     # Select example rows to display.
57 |     predictions.select("prediction", "indexedLabel", "features").show(5)
58 | 
59 |     # Select (prediction, true label) and compute test error.
60 |     evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
61 |     accuracy = evaluator.evaluate(predictions)
62 |     test_error = 1.0 - accuracy
63 |     print("Test Error = {}".format(test_error))
64 | 
65 |     mlflow.log_metric("accuracy", accuracy)
66 |     mlflow.log_metric("test_error", test_error)
67 | 
68 |     treeModel = model.stages[2]
69 |     print(treeModel)
70 | 
71 |     mlflow_spark.log_model(model, "spark-model")
72 |     #mlflow.mleap.log_model(model, testData, "mleap-model") # TODO: Bombs :(
73 | 
74 |     spark.stop()
75 | 
76 | if __name__ == "__main__":
77 |     parser = ArgumentParser()
78 |     parser.add_argument("--max_depth", dest="max_depth", help="max_depth", default=2, type=int)
79 |     parser.add_argument("--max_bins", dest="max_bins", help="max_bins", default=32, type=int)
80 |     args = parser.parse_args()
81 |     current_file = os.path.basename(__file__)
82 |     print("MLflow Version:", version.VERSION)
83 | 
84 |     client = mlflow.tracking.MlflowClient()
85 |     print("experiment_id:", client.get_experiment_by_name(experiment_name).experiment_id)
86 | 
87 |     with mlflow.start_run(source_name=current_file) as run:
88 |         print("run_id:", run.info.run_uuid)
89 |         print("experiment_id:", run.info.experiment_id)
90 |         train(args.max_depth, args.max_bins)
91 | 
92 | 
--------------------------------------------------------------------------------
/scala_spark/README.md:
--------------------------------------------------------------------------------
1 | # mlflow-spark-summit-2019 - scala_spark
2 | 
3 | Scala examples using the MLflow Java client:
4 | * Hello World - Simple MLflow example with no training.
5 | * Spark ML DecisionTree - saves and predicts Spark ML and MLeap model formats.
6 | 
7 | ## Build
8 | ```
9 | mvn clean package
10 | ```
11 | 
12 | ## Hello World Sample
13 | ### Run
14 | ```
15 | spark-submit --master local[2] \
16 |   --class org.andre.mlflow.examples.hello.HelloWorld \
17 |   target/mlflow-spark-examples-1.0-SNAPSHOT.jar \
18 |   http://localhost:5000
19 | ```
20 | ```
21 | Experiment name: scala_HelloWorld
22 | Experiment ID: 3
23 | Run ID: 81cc7941adae4860899ad5449df52802
24 | ```
25 | 
26 | ### Source
27 | Source snippet from [HelloWorld.scala](src/main/scala/org/andre/mlflow/examples/hello/HelloWorld.scala).
28 | ```
29 | // Create client
30 | val trackingUri = args(0)
31 | val mlflowClient = new MlflowClient(trackingUri)
32 | 
33 | // Create or get existing experiment
34 | val expName = "scala/HelloWorld"
35 | val expId = MLflowUtils.getOrCreateExperimentId(mlflowClient, expName)
36 | println("Experiment name: "+expName)
37 | println("Experiment ID: "+expId)
38 | 
39 | // Create run
40 | val sourceName = getClass().getSimpleName()+".scala"
41 | val runInfo = mlflowClient.createRun(expId, sourceName);
42 | val runId = runInfo.getRunUuid()
43 | 
44 | // Log params and metrics
45 | mlflowClient.logParam(runId, "p1", "hi")
46 | mlflowClient.logMetric(runId, "m1", 0.123F)
47 | 
48 | // Close run
49 | mlflowClient.setTerminated(runId, RunStatus.FINISHED, System.currentTimeMillis())
50 | ```
51 | 
52 | ## Spark ML DecisionTree Sample
53 | 
54 | This sample:
55 | * Trains a model
56 | * Saves the model in Spark ML and MLeap formats
57 | * Predicts from the Spark ML and MLeap formats
58 | 
59 | ### Train
60 | 
61 | Saves the model as Spark ML and MLeap artifacts in MLflow.
62 | 
63 | 
64 | #### Source
65 | 
66 | Source snippet from [TrainDecisionTree.scala](src/main/scala/org/andre/mlflow/examples/decisiontree/TrainDecisionTree.scala).
67 | ```
68 | import org.mlflow.tracking.MlflowClient
69 | import org.mlflow.api.proto.Service.RunStatus
70 | 
71 | // Create client
72 | val mlflowClient = new MlflowClient("http://localhost:5000")
73 | 
74 | // MLflow - create or get existing experiment
75 | val expName = "scala/SimpleDecisionTree"
76 | val expId = MLflowUtils.getOrCreateExperimentId(mlflowClient, expName)
77 | 
78 | // MLflow - create run
79 | val sourceName = getClass().getSimpleName()+".scala"
80 | val runInfo = mlflowClient.createRun(expId, sourceName);
81 | val runId = runInfo.getRunUuid()
82 | 
83 | // MLflow - log parameters
84 | mlflowClient.logParam(runId, "maxDepth", ""+dt.getMaxDepth)
85 | mlflowClient.logParam(runId, "maxBins", ""+dt.getMaxBins)
86 | 
87 | . . .
88 | 
89 | // MLflow - log metric
90 | mlflowClient.logMetric(runId, "rmse", rmse.toFloat)
91 | 
92 | // MLflow - save model as artifact
93 | //pipeline.save("tmp")
94 | clf.save("tmp")
95 | mlflowClient.logArtifacts(runId, new File("tmp"), "model")
96 | 
97 | // MLflow - save model as Spark ML artifact
98 | val sparkModelPath = "out/spark_model"
99 | model.write.overwrite().save(sparkModelPath)
100 | mlflowClient.logArtifacts(runId, new File(sparkModelPath), "spark_model")
101 | 
102 | // MLflow - save model as MLeap artifact
103 | val mleapModelDir = new File("out/mleap_model")
104 | mleapModelDir.mkdir
105 | MLeapUtils.save(model, predictions, "file:"+mleapModelDir.getAbsolutePath)
106 | mlflowClient.logArtifacts(runId, mleapModelDir, "mleap_model")
107 | 
108 | // MLflow - close run
109 | mlflowClient.setTerminated(runId, RunStatus.FINISHED, System.currentTimeMillis())
110 | ```
111 | 
112 | ### Run against local Spark and local MLflow tracking server
113 | 
114 | ```
115 | spark-submit --master local[2] \
116 |   --class org.andre.mlflow.examples.decisiontree.TrainDecisionTree \
117 |   target/mlflow-spark-examples-1.0-SNAPSHOT.jar \
118 |   --trackingUri http://localhost:5000 \
119 |   --experimentName scala_DecisionTree \
120 |   --dataPath ../data/sample_libsvm_data.txt \
121 |   --modelPath model_sample --maxDepth 5 --maxBins 5
122 | ```
123 | 
124 | ### Run against local Spark and Databricks hosted tracking server
125 | 
126 | ```
127 | spark-submit --master local[2] \
128 |   --class org.andre.mlflow.examples.decisiontree.TrainDecisionTree \
129 |   target/mlflow-spark-examples-1.0-SNAPSHOT.jar \
130 |   --trackingUri https://acme.cloud.databricks.com --token MY_TOKEN \
131 |   --experimentName spark_DecisionTree \
132 |   --dataPath ../data/sample_libsvm_data.txt \
133 |   --modelPath model_sample --maxDepth 5 --maxBins 5
134 | ```
135 | 
136 | ### Run in Databricks Cluster
137 | 
138 | You can also run your jar in a Databricks cluster with the standard Databricks REST API run endpoints.
139 | See [runs submit](https://docs.databricks.com/api/latest/jobs.html#runs-submit), [run now](https://docs.databricks.com/api/latest/jobs.html#run-now) and [spark_jar_task](https://docs.databricks.com/api/latest/jobs.html#jobssparkjartask).
140 | In this example we showcase runs_submit.
141 | 
142 | #### Setup
143 | 
144 | Upload the data file and jar to your Databricks cluster.
145 | ```
146 | databricks fs cp data/sample_libsvm_data.txt \
147 |   dbfs:/tmp/jobs/spark-scala-example/sample_libsvm_data.txt
148 | 
149 | databricks fs cp target/mlflow-spark-examples-1.0-SNAPSHOT.jar \
150 |   dbfs:/tmp/jobs/spark-scala-example/mlflow-spark-examples-1.0-SNAPSHOT.jar
151 | ```
152 | 
153 | Here is a snippet from
154 | [run_submit_new_cluster.json](run_submit_new_cluster.json) or
155 | [run_submit_existing_cluster.json](run_submit_existing_cluster.json).
156 | ```
157 | "libraries": [
158 |   { "pypi": { "package": "mlflow" } },
159 |   { "jar": "dbfs:/tmp/jobs/spark-scala-example/mlflow-spark-examples-1.0-SNAPSHOT.jar" }
160 | ],
161 | "spark_jar_task": {
162 |   "main_class_name": "org.andre.mlflow.examples.decisiontree.TrainDecisionTree",
163 |   "parameters": [
164 |     "--dataPath", "dbfs:/tmp/jobs/spark-scala-example/sample_libsvm_data.txt",
165 |     "--modelPath", "/dbfs/tmp/jobs/spark-scala-example/models",
166 |     "--runOrigin", "run_submit_new_cluster.json"
167 |   ]
168 | }
169 | ```
170 | 
171 | #### Run with new cluster
172 | 
173 | Create [run_submit_new_cluster.json](run_submit_new_cluster.json) and launch the run.
174 | ```
175 | curl -X POST -H "Authorization: Bearer MY_TOKEN" \
176 |   -d @run_submit_new_cluster.json \
177 |   https://acme.cloud.databricks.com/api/2.0/jobs/runs/submit
178 | ```
179 | 
180 | #### Run with existing cluster
181 | 
182 | Every time you build a new jar, you need to upload it to DBFS (as described above) and restart the cluster.
183 | ```
184 | databricks clusters restart --cluster-id 0113-005848-about166
185 | ```
186 | 
187 | Create [run_submit_existing_cluster.json](run_submit_existing_cluster.json) and launch the run.
188 | ```
189 | curl -X POST -H "Authorization: Bearer MY_TOKEN" \
190 |   -d @run_submit_existing_cluster.json \
191 |   https://acme.cloud.databricks.com/api/2.0/jobs/runs/submit
192 | ```
193 | 
194 | #### Run jar from Databricks notebook
195 | 
196 | Create a notebook with the following cell. Attach it to the existing cluster described above.
197 | ```
198 | import org.andre.mlflow.examples.decisiontree.TrainDecisionTree
199 | val dataPath = "dbfs:/tmp/jobs/spark-scala-example/sample_libsvm_data.txt"
200 | val modelPath = "/dbfs/tmp/jobs/spark-scala-example/models"
201 | val runOrigin = "run_from_jar_Notebook"
202 | TrainDecisionTree.train(spark, dataPath, modelPath, 5, 5, runOrigin)
203 | ```
204 | 
205 | ### Predict
206 | 
207 | Predicts from the Spark ML and MLeap models.
208 | 
209 | #### Run
210 | ```
211 | spark-submit --master local[2] \
212 |   --class org.andre.mlflow.examples.decisiontree.PredictDecisionTree \
213 |   target/mlflow-spark-examples-1.0-SNAPSHOT.jar \
214 |   --trackingUri http://localhost:5000 \
215 |   --dataPath ../data/sample_libsvm_data.txt \
216 |   --runId 3e422c4736a34046a74795384741ac33
217 | ```
218 | 
219 | ```
220 | +----------+-----+--------------------+
221 | |prediction|label|            features|
222 | +----------+-----+--------------------+
223 | |       0.0|  0.0|(692,[127,128,129...|
224 | |       1.0|  1.0|(692,[158,159,160...|
225 | |       1.0|  1.0|(692,[124,125,126...|
226 | |       1.0|  1.0|(692,[152,153,154...|
227 | +----------+-----+--------------------+
228 | ```
229 | 
230 | #### Source
231 | 
232 | Source snippet from [PredictDecisionTree.scala](src/main/scala/org/andre/mlflow/examples/decisiontree/PredictDecisionTree.scala).
233 | ``` 234 | val data = spark.read.format("libsvm").load(opts.dataPath) 235 | val model = PipelineModel.load(opts.modelPath) 236 | val predictions = model.transform(data) 237 | println("Prediction:") 238 | predictions.select("prediction", "label", "features").show(10,false) 239 | ``` 240 | -------------------------------------------------------------------------------- /scala_spark/playbook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## scala_spark playbook" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 8, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "os.environ[\"MLFLOW_TRACKING_URI\"] = \"http://localhost:5000\"" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### Train" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 10, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "19/04/24 21:11:52 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 37 | "19/04/24 21:11:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 38 | "19/04/24 21:11:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", 39 | "args: [Ljava.lang.String;@27216cd\n", 40 | "Tracking URI: http://localhost:5000\n", 41 | "Experiment name: scala_HelloWorld\n", 42 | "Experiment ID: 5\n", 43 | "Run ID: 5d44fd992c94459fbcb0a1c56a75db58\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "! spark-submit --master local[2] \\\n", 49 | " --class org.andre.mlflow.examples.hello.HelloWorld \\\n", 50 | " target/mlflow-spark-examples-1.0-SNAPSHOT.jar \\\n", 51 | " http://localhost:5000" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 9, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "19/04/24 21:11:29 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 64 | "19/04/24 21:11:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 65 | "19/04/24 21:11:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n", 66 | "Options:\n", 67 | " Tracking URI: http://localhost:5000\n", 68 | " token: null\n", 69 | " experimentName: scala_DecisionTree\n", 70 | " dataPath: ../data/sample_libsvm_data.txt\n", 71 | " modelPath: model_sample\n", 72 | " maxDepth: 5\n", 73 | " maxBins: 5\n", 74 | " runOrigin: None\n", 75 | "Experiment ID: 4\n", 76 | "Run ID: 1b95d7319070445c9b497e4c51ef3d01\n", 77 | "runOrigin: None\n", 78 | "Params:\n", 79 | " maxDepth: 5\n", 80 | " maxBins: 5\n", 81 | "Metrics:\n", 82 | " RMSE: 0.2970442628930023\n", 83 | " isLargerBetter: false\n", 84 | "Prediction:\n", 85 | "+----------+-----+--------------------+\n", 86 | "|prediction|label| features|\n", 87 | "+----------+-----+--------------------+\n", 88 | "| 0.0| 0.0|(692,[100,101,102...|\n", 89 | "| 0.0| 0.0|(692,[121,122,123...|\n", 90 | "| 0.0| 0.0|(692,[124,125,126...|\n", 91 | "| 0.0| 0.0|(692,[124,125,126...|\n", 92 | "| 0.0| 0.0|(692,[124,125,126...|\n", 93 | "+----------+-----+--------------------+\n", 94 | "only showing top 5 rows\n", 95 | "\n", 96 | "Learned regression tree model:\n", 97 | " DecisionTreeRegressionModel (uid=dtr_eaa0ec226e98) of depth 2 with 5 nodes\n", 98 | " If (feature 407 <= 9.5)\n", 99 | " If (feature 243 <= 4.0)\n", 100 | " Predict: 1.0\n", 101 | " Else (feature 243 > 4.0)\n", 102 | " Predict: 0.0\n", 103 | " Else (feature 407 > 9.5)\n", 104 | " Predict: 1.0\n", 105 | "\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "! spark-submit --master local[2] \\\n", 111 | " --class org.andre.mlflow.examples.decisiontree.TrainDecisionTree \\\n", 112 | " target/mlflow-spark-examples-1.0-SNAPSHOT.jar \\\n", 113 | " --trackingUri http://localhost:5000 \\\n", 114 | " --experimentName scala_DecisionTree \\\n", 115 | " --dataPath ../data/sample_libsvm_data.txt \\\n", 116 | " --modelPath model_sample --maxDepth 5 --maxBins 5" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "### Predict" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 9, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "19/04/25 14:05:42 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 136 | "19/04/25 14:05:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 137 | "19/04/25 14:05:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n", 138 | "Options:\n", 139 | " dataPath: ../data/sample_libsvm_data.txt\n", 140 | " tracking URI: null\n", 141 | " token: null\n", 142 | " runId: 3418ce2004454821b63e2fcff64177f4\n", 143 | "MLFLOW_TRACKING_URI: http://localhost:5000\n", 144 | "==== Spark ML\n", 145 | "+----------+-----+--------------------+\n", 146 | "|prediction|label| features|\n", 147 | "+----------+-----+--------------------+\n", 148 | "| 0.0| 0.0|(692,[127,128,129...|\n", 149 | "| 1.0| 1.0|(692,[158,159,160...|\n", 150 | "| 1.0| 1.0|(692,[124,125,126...|\n", 151 | "| 1.0| 1.0|(692,[152,153,154...|\n", 152 | "| 1.0| 1.0|(692,[151,152,153...|\n", 153 | "| 0.0| 0.0|(692,[129,130,131...|\n", 154 | "| 1.0| 1.0|(692,[158,159,160...|\n", 155 | "| 1.0| 1.0|(692,[99,100,101,...|\n", 156 | "| 0.0| 0.0|(692,[154,155,156...|\n", 157 | "| 0.0| 0.0|(692,[127,128,129...|\n", 158 | "+----------+-----+--------------------+\n", 159 | "only showing top 10 rows\n", 160 | "\n", 161 | "==== MLeap\n", 162 | "+----------+-----+--------------------+\n", 163 | "|prediction|label| features|\n", 164 | "+----------+-----+--------------------+\n", 165 | "| 0.0| 0.0|(692,[127,128,129...|\n", 166 | "| 0.0| 1.0|(692,[158,159,160...|\n", 167 | "| 0.0| 1.0|(692,[124,125,126...|\n", 168 | "| 0.0| 1.0|(692,[152,153,154...|\n", 169 | "| 0.0| 1.0|(692,[151,152,153...|\n", 170 | "| 0.0| 0.0|(692,[129,130,131...|\n", 171 | "| 0.0| 1.0|(692,[158,159,160...|\n", 172 | "| 0.0| 1.0|(692,[99,100,101,...|\n", 173 | "| 0.0| 0.0|(692,[154,155,156...|\n", 174 | "| 0.0| 0.0|(692,[127,128,129...|\n", 175 | "+----------+-----+--------------------+\n", 176 | "only showing top 10 rows\n", 177 | "\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "! spark-submit --class org.andre.mlflow.examples.decisiontree.PredictDecisionTree \\\n", 183 | " --master local[2] target/mlflow-spark-examples-1.0-SNAPSHOT.jar \\\n", 184 | " --dataPath ../data/sample_libsvm_data.txt \\\n", 185 | " --runId 3418ce2004454821b63e2fcff64177f4" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [] 194 | } 195 | ], 196 | "metadata": { 197 | "kernelspec": { 198 | "display_name": "Python 2", 199 | "language": "python", 200 | "name": "python2" 201 | }, 202 | "language_info": { 203 | "codemirror_mode": { 204 | "name": "ipython", 205 | "version": 2 206 | }, 207 | "file_extension": ".py", 208 | "mimetype": "text/x-python", 209 | "name": "python", 210 | "nbconvert_exporter": "python", 211 | "pygments_lexer": "ipython2", 212 | "version": "2.7.14" 213 | } 214 | }, 215 | "nbformat": 4, 216 | "nbformat_minor": 2 217 | } 218 | -------------------------------------------------------------------------------- /scala_spark/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | org.andre 5 | mlflow-spark-examples 6 | jar 7 | 1.0-SNAPSHOT 8 | 9 | 10 | 2.11.8 11 | 2.11 12 | 2.4.0 13 | 0.9.1 14 | 0.12.0 15 | 1.72 16 | 17 | 18 | 19 | 20 | 21 | org.scala-lang 22 | scala-library 23 | ${scala.version} 24 | provided 25 | 26 | 27 | 28 | org.apache.spark 29 | spark-core_${scala.binary.version} 30 | ${spark.version} 31 | provided 32 | 33 | 34 | 35 | org.apache.spark 36 | spark-sql_${scala.binary.version} 37 | ${spark.version} 38 | provided 39 | 40 | 41 | 42 | org.apache.spark 43 | spark-mllib_${scala.binary.version} 44 | ${spark.version} 45 | provided 46 | 47 | 48 | 49 | jar 50 | org.mlflow 51 | mlflow-client 52 | ${mlflow.version} 53 | 54 | 
55 | 56 | ml.combust.mleap 57 | mleap-spark_2.11 58 | ${mleap.version} 59 | 60 | 61 | ml.combust.mleap 62 | mleap-spark-base_2.11 63 | ${mleap.version} 64 | 65 | 66 | 67 | com.beust 68 | jcommander 69 | ${jcommander.version} 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | net.alchim31.maven 79 | scala-maven-plugin 80 | 3.2.0 81 | 82 | 83 | scala-compile-first 84 | process-resources 85 | 86 | compile 87 | 88 | 89 | 90 | scala-test-compile-first 91 | process-test-resources 92 | 93 | testCompile 94 | 95 | 96 | 97 | attach-scaladocs 98 | verify 99 | 100 | doc-jar 101 | 102 | 103 | 104 | 105 | 106 | org.apache.maven.plugins 107 | maven-shade-plugin 108 | 1.7.1 109 | 110 | 111 | 112 | *:* 113 | 114 | META-INF/*.SF 115 | META-INF/*.DSA 116 | META-INF/*.RSA 117 | 118 | 119 | 120 | 121 | 122 | 123 | package 124 | 125 | shade 126 | 127 | 128 | 129 | 130 | 131 | 132 | reference.conf 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /scala_spark/run_submit_existing_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "MLflow_RunSubmit_ExistingCluster", 3 | "existing_cluster_id": "0113-005848-about166", 4 | "timeout_seconds": 3600, 5 | "spark_jar_task": { 6 | "main_class_name": "org.andre.mlflow.examples.TrainDecisionTree", 7 | "parameters": [ 8 | "--experimentName", "/Shared/experiments/demo/scala_DecisionTree", 9 | "--dataPath", "dbfs:/tmp/jobs/spark-scala-example/sample_libsvm_data.txt", 10 | "--modelPath", "/dbfs/tmp/jobs/spark-scala-example/models", 11 | "--runOrigin", "run_submit_existing_cluster.json" 12 | ] 13 | }, 14 | "libraries": [ 15 | { 16 | "jar": "dbfs:/tmp/jobs/spark-scala-example/mlflow-spark-examples-1.0-SNAPSHOT.jar", 17 | "pypi-package": "mlflow" 18 | } 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /scala_spark/run_submit_new_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "MLflow_RunSubmit_NewCluster", 3 | "new_cluster": { 4 | "spark_version": "5.3.x-scala2.11", 5 | "node_type_id": "i3.xlarge", 6 | "num_workers": 1 7 | }, 8 | "timeout_seconds": 3600, 9 | "libraries": [ 10 | { "pypi": { "package": "mlflow" } }, 11 | { "jar": "dbfs:/tmp/jobs/spark-scala-example/mlflow-spark-examples-1.0-SNAPSHOT.jar" } 12 | ], 13 | "spark_jar_task": { 14 | "main_class_name": "org.andre.mlflow.examples.TrainDecisionTree", 15 | "parameters": [ 16 | "--experimentName", "/Shared/experiments/demo/scala_DecisionTree", 17 | "--dataPath", "dbfs:/tmp/jobs/spark-scala-example/sample_libsvm_data.txt", 18 | "--modelPath", "/dbfs/tmp/jobs/spark-scala-example/models", 19 | "--runOrigin", "run_submit_new_cluster.json" 20 | ] 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /scala_spark/src/main/scala/org/andre/mlflow/examples/MLeapUtils.scala: -------------------------------------------------------------------------------- 1 | package org.andre.mlflow.examples 2 | 3 | import ml.combust.bundle.BundleFile 4 | import ml.combust.mleap.spark.SparkSupport._ 5 | import resource.managed 6 | import org.apache.spark.ml.bundle.SparkBundleContext 7 | import org.apache.spark.ml.PipelineModel 8 | import org.apache.spark.sql.DataFrame 9 | 10 | /* 11 | MLeap URI formats: 12 | file:/tmp/mleap_scala_model_export/my-model 13 | jar:file:/tmp/mleap_scala_model_export/my-model.zip 14 | */ 15 | object MLeapUtils { 16 
| 17 | def saveModel(model: PipelineModel, df: DataFrame, bundlePath: String) { 18 | val context = SparkBundleContext().withDataset(df) 19 | (for(modelFile <- managed(BundleFile(bundlePath))) yield { 20 | model.writeBundle.save(modelFile)(context) 21 | }).tried.get 22 | } 23 | 24 | def readModel(bundlePath: String) = { 25 | val obundle = (for(bundle <- managed(BundleFile(bundlePath))) yield { 26 | bundle.loadSparkBundle().get 27 | }).opt 28 | obundle match { 29 | case Some(b) => b.root 30 | case None => throw new Exception(s"Cannot find bundle: $bundlePath") 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /scala_spark/src/main/scala/org/andre/mlflow/examples/MLflowUtils.scala: -------------------------------------------------------------------------------- 1 | package org.andre.mlflow.examples 2 | 3 | import scala.collection.JavaConversions._ 4 | import org.mlflow.tracking.MlflowClient 5 | import org.mlflow.tracking.creds.BasicMlflowHostCreds 6 | 7 | object MLflowUtils { 8 | 9 | def getOrCreateExperimentId(client: MlflowClient, experimentName: String) : String = { 10 | val expOpt = client.listExperiments() find (_.getName == experimentName) 11 | expOpt match { 12 | case Some(exp) => exp.getExperimentId 13 | case None => client.createExperiment(experimentName) 14 | } 15 | } 16 | 17 | def createMlflowClient(args: Array[String]) = { 18 | println("args: "+args) 19 | if (args.length == 0) { 20 | val env = System.getenv("MLFLOW_TRACKING_URI") 21 | println(s"MLFLOW_TRACKING_URI: $env") 22 | new MlflowClient() 23 | } else { 24 | val trackingUri = args(0) 25 | println(s"Tracking URI: $trackingUri") 26 | if (args.length > 1) { 27 | new MlflowClient(new BasicMlflowHostCreds(trackingUri,args(1))) 28 | } else { 29 | new MlflowClient(trackingUri) 30 | } 31 | } 32 | } 33 | 34 | def createMlflowClient(trackingUri: String, token: String) = { 35 | if (trackingUri == null) { 36 | val env = System.getenv("MLFLOW_TRACKING_URI") 37 | println(s"MLFLOW_TRACKING_URI: $env") 38 | new MlflowClient() 39 | } else { 40 | if (token != null) { 41 | new MlflowClient(new BasicMlflowHostCreds(trackingUri, token)) 42 | } else { 43 | new MlflowClient(trackingUri) 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /scala_spark/src/main/scala/org/andre/mlflow/examples/decisiontree/PredictDecisionTree.scala: -------------------------------------------------------------------------------- 1 | package org.andre.mlflow.examples.decisiontree 2 | 3 | import com.beust.jcommander.{JCommander, Parameter} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.DataFrame 6 | import org.apache.spark.ml.PipelineModel 7 | import org.apache.spark.ml.Transformer 8 | import org.mlflow.tracking.MlflowClient 9 | import org.mlflow.tracking.creds.BasicMlflowHostCreds 10 | import org.andre.mlflow.examples.{MLflowUtils,MLeapUtils} 11 | 12 | object PredictDecisionTree { 13 | 14 | def main(args: Array[String]) { 15 | new JCommander(opts, args.toArray: _*) 16 | println("Options:") 17 | println(s" dataPath: ${opts.dataPath}") 18 | println(s" tracking URI: ${opts.trackingUri}") 19 | println(s" token: ${opts.token}") 20 | println(s" runId: ${opts.runId}") 21 | 22 | val mlflowClient = MLflowUtils.createMlflowClient(opts.trackingUri, opts.token) 23 | val spark = SparkSession.builder.appName("Predict").getOrCreate() 24 | val data = spark.read.format("libsvm").load(opts.dataPath) 25 | 26 | val runInfo = 
mlflowClient.getRun(opts.runId).getInfo 27 | val uri = runInfo.getArtifactUri 28 | predictSparkML(uri, data) 29 | predictMLeap(uri, data) 30 | } 31 | 32 | def predictSparkML(uri: String, data: DataFrame) { 33 | println("==== Spark ML") 34 | val modelPath = s"${uri}/spark-model" 35 | val model = PipelineModel.load(modelPath) 36 | showPredictions(model, data) 37 | } 38 | 39 | def predictMLeap(uri: String, data: DataFrame) { 40 | println("==== MLeap") 41 | val modelPath = s"file:${uri}/mleap-model/mleap/model" 42 | val model = MLeapUtils.readModel(modelPath) 43 | showPredictions(model, data) 44 | } 45 | 46 | def showPredictions(model: Transformer, data: DataFrame) { 47 | val predictions = model.transform(data) 48 | val df = predictions.select("prediction", "label", "features") 49 | df.show(10) 50 | } 51 | 52 | object opts { 53 | @Parameter(names = Array("--dataPath" ), description = "Data path", required=true) 54 | var dataPath: String = null 55 | 56 | @Parameter(names = Array("--trackingUri" ), description = "Tracking Server URI", required=false) 57 | var trackingUri: String = null 58 | 59 | @Parameter(names = Array("--token" ), description = "REST API token", required=false) 60 | var token: String = null 61 | 62 | @Parameter(names = Array("--runId" ), description = "runId", required=true) 63 | var runId: String = null 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /scala_spark/src/main/scala/org/andre/mlflow/examples/decisiontree/TrainDecisionTree.scala: -------------------------------------------------------------------------------- 1 | package org.andre.mlflow.examples.decisiontree 2 | 3 | // From: https://github.com/apache/spark/blob/master/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala 4 | 5 | import java.io.{File,PrintWriter} 6 | import org.apache.spark.sql.{SparkSession,DataFrame} 7 | import org.apache.spark.ml.{Pipeline,PipelineModel} 8 | import org.apache.spark.ml.evaluation.RegressionEvaluator 9 | import org.apache.spark.ml.feature.{VectorIndexer,VectorIndexerModel} 10 | import org.apache.spark.ml.regression.{DecisionTreeRegressionModel,DecisionTreeRegressor} 11 | import org.mlflow.tracking.MlflowClient 12 | import org.mlflow.api.proto.Service.RunStatus 13 | import com.beust.jcommander.{JCommander, Parameter} 14 | import org.andre.mlflow.examples.{MLflowUtils,MLeapUtils} 15 | 16 | object TrainDecisionTree { 17 | case class DataHolder(trainingData: DataFrame, testData: DataFrame, featureIndexer: VectorIndexerModel) 18 | val spark = SparkSession.builder.appName("DecisionTreeRegressionExample").getOrCreate() 19 | val seed = 2019 20 | 21 | def main(args: Array[String]) { 22 | new JCommander(opts, args: _*) 23 | println("Options:") 24 | println(s" Tracking URI: ${opts.trackingUri}") 25 | println(s" token: ${opts.token}") 26 | println(s" experimentName: ${opts.experimentName}") 27 | println(s" dataPath: ${opts.dataPath}") 28 | println(s" modelPath: ${opts.modelPath}") 29 | println(s" maxDepth: ${opts.maxDepth}") 30 | println(s" maxBins: ${opts.maxBins}") 31 | println(s" runOrigin: ${opts.runOrigin}") 32 | 33 | // MLflow - create or get existing experiment 34 | val mlflowClient = MLflowUtils.createMlflowClient(opts.trackingUri, opts.token) 35 | 36 | val experimentId = MLflowUtils.getOrCreateExperimentId(mlflowClient, opts.experimentName) 37 | println("Experiment ID: "+experimentId) 38 | 39 | // Read data 40 | val dataHolder = readData(opts.dataPath) 41 | 42 | // Train model 43 | 
train(mlflowClient, experimentId, opts.modelPath, opts.maxDepth, opts.maxBins, opts.runOrigin, dataHolder) 44 | } 45 | 46 | def readData(dataPath: String) : DataHolder = { 47 | val data = spark.read.format("libsvm").load(dataPath) 48 | 49 | // Automatically identify categorical features, and index them. 50 | // Here, we treat features with > 4 distinct values as continuous. 51 | val featureIndexer = new VectorIndexer() 52 | .setInputCol("features") 53 | .setOutputCol("indexedFeatures") 54 | .setMaxCategories(4) 55 | .fit(data) 56 | 57 | // Split the data into training and test sets (30% held out for testing). 58 | val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3), seed) 59 | 60 | DataHolder(trainingData, testData, featureIndexer) 61 | } 62 | 63 | def train(mlflowClient: MlflowClient, experimentId: String, modelPath: String, maxDepth: Int, maxBins: Int, runOrigin: String, dataHolder: DataHolder) { 64 | // Create a DecisionTree model 65 | val clf = new DecisionTreeRegressor() 66 | .setLabelCol("label") 67 | .setFeaturesCol("indexedFeatures") 68 | if (maxDepth != -1) clf.setMaxDepth(maxDepth) 69 | if (maxBins != -1) clf.setMaxBins(maxBins) 70 | 71 | // MLflow - create run 72 | val sourceName = (getClass().getSimpleName()+".scala").replace("$","") 73 | val runInfo = mlflowClient.createRun(experimentId, sourceName); 74 | val runId = runInfo.getRunUuid() 75 | println(s"Run ID: $runId") 76 | println(s"runOrigin: $runOrigin") 77 | 78 | // MLflow - Log parameters 79 | mlflowClient.logParam(runId, "maxDepth",""+clf.getMaxDepth) 80 | mlflowClient.logParam(runId, "maxBins",""+clf.getMaxBins) 81 | mlflowClient.logParam(runId, "runOrigin",runOrigin) 82 | println(s"Params:") 83 | println(s" maxDepth: ${clf.getMaxDepth}") 84 | println(s" maxBins: ${clf.getMaxBins}") 85 | 86 | // Chain indexer and tree in a Pipeline. 87 | val pipeline = new Pipeline().setStages(Array(dataHolder.featureIndexer, clf)) 88 | 89 | // Train model. This also runs the indexer. 90 | val model = pipeline.fit(dataHolder.trainingData) 91 | 92 | // Make predictions. 93 | val predictions = model.transform(dataHolder.testData) 94 | 95 | // Create metrics: select (prediction, true label) and compute test error. 96 | val evaluator = new RegressionEvaluator() 97 | .setLabelCol("label") 98 | .setPredictionCol("prediction") 99 | .setMetricName("rmse") 100 | val rmse = evaluator.evaluate(predictions) 101 | println(s"Metrics:") 102 | println(s" RMSE: $rmse") 103 | println(s" isLargerBetter: ${evaluator.isLargerBetter}") 104 | 105 | // MLflow - Log metric 106 | mlflowClient.logMetric(runId, "rmse",rmse) 107 | 108 | // Select example rows to display. 
109 | println("Prediction:") 110 | predictions.select("prediction", "label", "features").show(5) 111 | 112 | // Print decision tree 113 | val treeModel = model.stages(1).asInstanceOf[DecisionTreeRegressionModel] 114 | println(s"Learned regression tree model:\n ${treeModel.toDebugString}") 115 | 116 | // MLflow - Log simple artifact 117 | val path="details.txt" 118 | new PrintWriter(path) { write("Info: "+new java.util.Date()) ; close } 119 | mlflowClient.logArtifact(runId,new File(path),"info") 120 | 121 | // MLflow - Save model in Spark ML and MLeap formats 122 | saveModelAsSparkML(mlflowClient, runId, modelPath, model) 123 | saveModelAsMLeap(mlflowClient, runId, modelPath, model, predictions) 124 | 125 | // MLflow - close run 126 | mlflowClient.setTerminated(runId, RunStatus.FINISHED, System.currentTimeMillis()) 127 | } 128 | 129 | def saveModelAsSparkML(mlflowClient: MlflowClient, runId: String, baseModelDir: String, model: PipelineModel) = { 130 | val modelPath = s"$baseModelDir/spark-model" 131 | model.write.overwrite().save(modelPath) 132 | mlflowClient.logArtifacts(runId, new File(modelPath), "spark-model") 133 | } 134 | 135 | def saveModelAsMLeap(mlflowClient: MlflowClient, runId: String, baseModelDir: String, model: PipelineModel, predictions: DataFrame) = { 136 | val modelPath = new File(s"$baseModelDir/mleap-model") 137 | modelPath.mkdir 138 | MLeapUtils.saveModel(model, predictions, "file:"+modelPath.getAbsolutePath) 139 | mlflowClient.logArtifacts(runId, modelPath, "mleap-model/mleap/model") // Make compatible with MLflow Python mlflow.mleap.log_model 140 | } 141 | 142 | object opts { 143 | @Parameter(names = Array("--trackingUri" ), description = "Tracking Server URI", required=false) 144 | var trackingUri: String = null 145 | 146 | @Parameter(names = Array("--token" ), description = "REST API token", required=false) 147 | var token: String = null 148 | 149 | @Parameter(names = Array("--dataPath" ), description = "Data path", required=true) 150 | var dataPath: String = null 151 | 152 | @Parameter(names = Array("--modelPath" ), description = "Data path", required=true) 153 | var modelPath: String = null 154 | 155 | @Parameter(names = Array("--maxDepth" ), description = "maxDepth", required=false) 156 | var maxDepth = -1 157 | 158 | @Parameter(names = Array("--maxBins" ), description = "maxBins", required=false) 159 | var maxBins = -1 160 | 161 | @Parameter(names = Array("--runOrigin" ), description = "runOrigin", required=false) 162 | var runOrigin = "None" 163 | 164 | @Parameter(names = Array("--experimentName" ), description = "experimentName", required=false) 165 | var experimentName = "scala_DecisionTree" 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /scala_spark/src/main/scala/org/andre/mlflow/examples/hello/HelloWorld.scala: -------------------------------------------------------------------------------- 1 | package org.andre.mlflow.examples.hello 2 | 3 | import java.io.{File,PrintWriter} 4 | import org.mlflow.tracking.MlflowClient 5 | import org.mlflow.tracking.creds.BasicMlflowHostCreds 6 | import org.mlflow.api.proto.Service.RunStatus 7 | import scala.collection.JavaConversions._ 8 | import org.andre.mlflow.examples.MLflowUtils 9 | 10 | object HelloWorld { 11 | def main(args: Array[String]) { 12 | 13 | // Create MLflow client 14 | val mlflowClient = MLflowUtils.createMlflowClient(args) 15 | 16 | // Create or get existing experiment 17 | val expName = "scala_HelloWorld" 18 | val expId = 
MLflowUtils.getOrCreateExperimentId(mlflowClient, expName) 19 | println("Experiment name: "+expName) 20 | println("Experiment ID: "+expId) 21 | 22 | // Create run 23 | val sourceName = (getClass().getSimpleName()+".scala").replace("$","") 24 | val runInfo = mlflowClient.createRun(expId, sourceName) 25 | val runId = runInfo.getRunUuid() 26 | println("Run ID: "+runId) 27 | 28 | // Log params and metrics 29 | mlflowClient.logParam(runId, "p1","hi") 30 | mlflowClient.logMetric(runId, "m1",0.123) 31 | 32 | // Log file artifact 33 | new PrintWriter("info.txt") { write("File artifact: "+new java.util.Date()) ; close } 34 | mlflowClient.logArtifact(runId, new File("info.txt")) 35 | 36 | // Log directory artifact 37 | val dir = new File("tmp") 38 | dir.mkdir 39 | new PrintWriter(new File(dir, "model.txt")) { write("Directory artifact: "+new java.util.Date()) ; close } 40 | mlflowClient.logArtifacts(runId, dir, "model") 41 | 42 | // Close run 43 | mlflowClient.setTerminated(runId, RunStatus.FINISHED, System.currentTimeMillis()) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /search/README.md: -------------------------------------------------------------------------------- 1 | 2 | # mlflow-spark-summit-2019 - search 3 | 4 | Synopsis: 5 | * Shows how to search for runs. 6 | * For details see https://mlflow.org/docs/latest/search-syntax.html. 7 | 8 | Summary of [search.py](search.py): 9 | * Creates an experiment `search_example` with five runs with rmse values: 0.76, 0.71, 0.77, 0.69, 0.69. 10 | * Searches for runs: `metrics.rmse >= 0.76` 11 | 12 | 13 | Run 14 | ``` 15 | python search.py 16 | ``` 17 | 18 | ``` 19 | MLflow Version: 0.9.1 20 | Tracking URI: http://localhost:5000 21 | experiment_name: search_example 22 | experiment_id: 6 23 | Adding 5 runs: 24 | metric: 0.76 run_uuid: cc2debe52ff14c6b9e87cbbe27bedc5b 25 | metric: 0.71 run_uuid: bc762d998463434e95e8ebdfa50019c0 26 | metric: 0.77 run_uuid: a9205f64570041b49d362b298195fcb6 27 | metric: 0.69 run_uuid: fe3aeb9865e7428495b4dca3b6745177 28 | metric: 0.69 run_uuid: 72d1281840334f3a934797ca196951a1 29 | Query: metrics.rmse >= 0.76 30 | Found 2 matching runs: 31 | run_uuid: a9205f64570041b49d362b298195fcb6 metrics: [] 32 | run_uuid: cc2debe52ff14c6b9e87cbbe27bedc5b metrics: [] 33 | ``` 34 | -------------------------------------------------------------------------------- /search/search.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import mlflow 3 | 4 | print("MLflow Version:", mlflow.version.VERSION) 5 | print("Tracking URI:", mlflow.tracking.get_tracking_uri()) 6 | 7 | experiment_name = "search_example" 8 | print("experiment_name:",experiment_name) 9 | mlflow.set_experiment(experiment_name) 10 | 11 | client = mlflow.tracking.MlflowClient() 12 | experiment_id = client.get_experiment_by_name(experiment_name).experiment_id 13 | print("experiment_id:",experiment_id) 14 | 15 | def create_run(metric): 16 | with mlflow.start_run() as run: 17 | print(" metric:",metric," run_uuid:",run.info.run_uuid) 18 | mlflow.log_metric("rmse", metric) 19 | 20 | def create_runs(): 21 | metrics = [0.76, 0.71, 0.77, 0.69, 0.69 ] 22 | print("Adding {} runs:".format(len(metrics))) 23 | for m in metrics: 24 | create_run(m) 25 | run_infos = client.list_run_infos(experiment_id) 26 | 27 | def delete_runs(experiment_id): 28 | run_infos = client.list_run_infos(experiment_id) 29 | for info in run_infos: 30 | client.delete_run(info.run_uuid) 31 | 
run_infos = client.list_run_infos(experiment_id)
32 | 
33 | def search(exp_ids, query):
34 |     print("Query:",query)
35 |     runs = client.search_runs(exp_ids,query)
36 |     print("Found {} matching runs:".format(len(runs)))
37 |     for run in runs:
38 |         print("  run_uuid:",run.info.run_uuid," metrics:",run.data.metrics)
39 | 
40 | if __name__ == "__main__":
41 |     delete_runs(experiment_id)
42 |     create_runs()
43 |     query = "metrics.rmse >= 0.76"
44 |     search([experiment_id], query)
45 | 
46 | 
--------------------------------------------------------------------------------
/sklearn/MLproject:
--------------------------------------------------------------------------------
1 | name: mlflow_demo_sklearn
2 | 
3 | conda_env: conda.yaml
4 | 
5 | entry_points:
6 |   main:
7 |     parameters:
8 |       experiment_name: {type: string, default: "none" }
9 |       data_path: {type: string}
10 |       alpha: float
11 |       l1_ratio: {type: float, default: 0.1}
12 |       run_origin: {type: string, default: "default" }
13 |     command: "python main.py
14 |       --experiment_name {experiment_name}
15 |       --data_path {data_path}
16 |       --alpha {alpha}
17 |       --l1_ratio {l1_ratio}
18 |       --run_origin {run_origin}"
--------------------------------------------------------------------------------
/sklearn/README.md:
--------------------------------------------------------------------------------
1 | # mlflow-spark-summit-2019 - sklearn
2 | 
3 | ## Overview
4 | * Wine Quality Elastic Net Example
5 | * This example demonstrates an end-to-end MLflow training and prediction workflow.
6 | * Saves model in pickle format
7 | * Saves plot artifacts
8 | * Shows several ways to run training - _mlflow run_, run against Databricks cluster, call egg from notebook, etc.
9 | * Shows several ways to run prediction - web server, mlflow.sklearn.load_model(), Spark UDF, etc.
10 | * Data: data/wine-quality-white.csv for training and data/wine-quality-red.csv for predictions.
11 | 
12 | ## Training
13 | 
14 | Source: [main.py](main.py) and [train.py](wine_quality/train.py).
15 | 
16 | ### Unmanaged without mlflow run
17 | 
18 | #### Command-line python
19 | 
20 | To run with the standard main function:
21 | ```
22 | python main.py --experiment_name sklearn \
23 |   --data_path data/wine-quality-white.csv \
24 |   --alpha 0.5 --l1_ratio 0.5
25 | ```
26 | 
27 | #### Jupyter notebook
28 | See [Train_Wine_Quality.ipynb](Train_Wine_Quality.ipynb).
29 | ```
30 | export MLFLOW_TRACKING_URI=http://localhost:5000
31 | jupyter notebook
32 | ```
33 | 
34 | ### Using mlflow run
35 | 
36 | These runs use the [MLproject](MLproject) file. For more details see [MLflow documentation - Running Projects](https://mlflow.org/docs/latest/projects.html#running-projects).
37 | 
38 | Note that mlflow run ignores the `set_experiment()` function, so you must specify the experiment with the `--experiment-id` argument.
39 | 
40 | **mlflow run local**
41 | ```
42 | mlflow run . -P alpha=0.01 -P l1_ratio=0.75 -P run_origin=LocalRun --experiment-id=2
43 | ```
44 | 
45 | **mlflow run github**
46 | ```
47 | mlflow run https://github.com/amesar/mlflow-fun.git#examples/scikit-learn/wine-quality \
48 |   -P alpha=0.01 -P l1_ratio=0.75 -P run_origin=GitRun \
49 |   --experiment-id=2
50 | ```
51 | 
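**mlflow run from Python**

The same project can also be launched programmatically with the [mlflow.projects](https://mlflow.org/docs/latest/python_api/mlflow.projects.html) API. A minimal sketch, not part of the repo; the experiment ID and parameter values are assumptions mirroring the CLI examples above:

```python
import mlflow

# Launch the MLproject "main" entry point locally; equivalent to the
# "mlflow run ." command above. Blocks until the run finishes.
submitted = mlflow.projects.run(
    uri=".",
    parameters={
        "alpha": 0.01,
        "l1_ratio": 0.75,
        "run_origin": "ProgrammaticRun",
        "data_path": "data/wine-quality-white.csv",
    },
    experiment_id=2,  # assumption: same experiment as the CLI examples
)
print("run_id:", submitted.run_id)
```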
52 | **mlflow run Databricks remote** - Run against Databricks.
53 | 
54 | See [Remote Execution on Databricks](https://mlflow.org/docs/latest/projects.html#remote-execution-on-databricks) and [mlflow_run_cluster.json](mlflow_run_cluster.json).
55 | 
56 | Setup.
57 | ```
58 | export MLFLOW_TRACKING_URI=databricks
59 | ```
60 | The token and tracking server URL will be picked up from your Databricks CLI ~/.databrickscfg default profile.
61 | 
62 | Now run.
63 | ```
64 | mlflow run https://github.com/amesar/mlflow-fun.git#examples/scikit-learn/wine-quality \
65 |   -P alpha=0.01 -P l1_ratio=0.75 -P run_origin=GitRun \
66 |   -P data_path=/dbfs/tmp/data/wine-quality-white.csv \
67 |   --experiment-id=2019 \
68 |   --mode databricks --cluster-spec mlflow_run_cluster.json
69 | ```
70 | 
71 | ### Databricks Cluster Runs
72 | 
73 | You can also package your code as an egg and run it with the standard Databricks REST API endpoints
74 | [jobs/runs/submit](https://docs.databricks.com/api/latest/jobs.html#runs-submit)
75 | or [jobs/run-now](https://docs.databricks.com/api/latest/jobs.html#run-now)
76 | using the [spark_python_task](https://docs.databricks.com/api/latest/jobs.html#jobssparkpythontask).
77 | 
78 | #### Setup
79 | 
80 | Build the egg.
81 | ```
82 | python setup.py bdist_egg
83 | ```
84 | 
85 | Upload the data file, main file and egg to your Databricks workspace.
86 | ```
87 | databricks fs cp main.py dbfs:/tmp/jobs/wine_quality/main.py
88 | databricks fs cp data/wine-quality-white.csv dbfs:/tmp/jobs/wine_quality/wine-quality-white.csv
89 | databricks fs cp \
90 |   dist/mlflow_wine_quality-0.0.1-py3.6.egg \
91 |   dbfs:/tmp/jobs/wine_quality/mlflow_wine_quality-0.0.1-py3.6.egg
92 | ```
93 | 
94 | 
95 | #### Run Submit
96 | 
97 | ##### Run with new cluster
98 | 
99 | Define your run in [run_submit_new_cluster.json](run_submit_new_cluster.json) and launch the run.
100 | 
101 | ```
102 | curl -X POST -H "Authorization: Bearer MY_TOKEN" \
103 |   -d @run_submit_new_cluster.json \
104 |   https://myshard.cloud.databricks.com/api/2.0/jobs/runs/submit
105 | ```
106 | 
107 | ##### Run with existing cluster
108 | 
109 | Every time you build a new egg, you need to upload it to DBFS (as described above) and restart the cluster.
110 | ```
111 | databricks clusters restart --cluster-id 1222-015510-grams64
112 | ```
113 | 
114 | Define your run in [run_submit_existing_cluster.json](run_submit_existing_cluster.json) and launch the run.
115 | ```
116 | curl -X POST -H "Authorization: Bearer MY_TOKEN" \
117 |   -d @run_submit_existing_cluster.json \
118 |   https://myshard.cloud.databricks.com/api/2.0/jobs/runs/submit
119 | ```
120 | 
121 | #### Job Run Now
122 | 
123 | ##### Run with new cluster
124 | 
125 | First create a job with the spec file [create_job_new_cluster.json](create_job_new_cluster.json).
126 | ```
127 | databricks jobs create --json-file create_job_new_cluster.json
128 | ```
129 | 
130 | Then run the job with desired parameters.
131 | ```
132 | databricks jobs run-now --job-id $JOB_ID --python-params ' [ "WineQualityExperiment", 0.3, 0.3, "/dbfs/tmp/jobs/wine_quality/wine-quality-white.csv" ] '
133 | ```
134 | 
135 | ##### Run with existing cluster
136 | First create a job with the spec file [create_job_existing_cluster.json](create_job_existing_cluster.json).
137 | ```
138 | databricks jobs create --json-file create_job_existing_cluster.json
139 | ```
140 | 
141 | Then run the job with desired parameters.
142 | ```
143 | databricks jobs run-now --job-id $JOB_ID --python-params ' [ "WineQualityExperiment", 0.3, 0.3, "/dbfs/tmp/jobs/wine_quality/wine-quality-white.csv" ] '
144 | ```
145 | 
146 | 
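#### Verify the runs in MLflow

Whichever submission path you use, you can confirm that the job actually logged a new run by querying the tracking server. A minimal sketch using only the `mlflow.tracking` client calls already used elsewhere in this repo; the experiment name is an assumption matching the examples above:

```python
import mlflow

client = mlflow.tracking.MlflowClient()
exp = client.get_experiment_by_name("WineQualityExperiment")
run_infos = client.list_run_infos(exp.experiment_id)
print("Found {} runs".format(len(run_infos)))

# Inspect one run; ordering of list_run_infos is not guaranteed,
# so in practice you may want to compare start times.
run = client.get_run(run_infos[-1].run_uuid)
print("run_uuid:", run.info.run_uuid)
for m in run.data.metrics:
    print("  {}: {}".format(m.key, m.value))
```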
147 | #### Run egg from Databricks notebook
148 | 
149 | Create a notebook with the following cell and attach it to the existing cluster described above.
150 | ```
151 | from wine_quality import Trainer
152 | data_path = "/dbfs/tmp/jobs/wine_quality/wine-quality-white.csv"
153 | trainer = Trainer("WineQualityExperiment", data_path, "from_notebook_with_egg")
154 | trainer.train(0.4, 0.4)
155 | ```
156 | 
157 | ## Predictions
158 | 
159 | You can make predictions in the following ways:
160 | 1. Use MLflow's serving web server and submit predictions via HTTP calls
161 | 2. Call mlflow.sklearn.load_model() from your own serving code and then make predictions
162 | 3. Call mlflow.pyfunc.load_pyfunc() from your own serving code and then make predictions
163 | 4. Batch prediction with Spark UDF (user-defined function)
164 | 
165 | 
166 | See MLflow documentation:
167 | * [Tutorial - Serving the Model](https://www.mlflow.org/docs/latest/tutorial.html#serving-the-model)
168 | * [Quickstart - Saving and Serving Models](https://www.mlflow.org/docs/latest/quickstart.html#saving-and-serving-models)
169 | * [mlflow.pyfunc.spark_udf](https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.spark_udf)
170 | 
171 | 
172 | ### Data for predictions
173 | [data/wine-quality-red.csv](data/wine-quality-red.csv):
174 | ```
175 | [
176 |   {
177 |     "fixed acidity": 7,
178 |     "volatile acidity": 0.27,
179 |     "citric acid": 0.36,
180 |     "residual sugar": 20.7,
181 |     "chlorides": 0.045,
182 |     "free sulfur dioxide": 45,
183 |     "total sulfur dioxide": 170,
184 |     "density": 1.001,
185 |     "pH": 3,
186 |     "sulphates": 0.45,
187 |     "alcohol": 8.8
188 |   },
189 |   . . . . .
190 | ]
191 | ```
192 | 
193 | ### 1. Serving Models from MLflow Web Server
194 | 
195 | In one window run the server.
196 | ```
197 | mlflow pyfunc serve -p 5001 -r 7e674524514846799310c41f10d6b99d -m model
198 | ```
199 | 
200 | In another window, submit a prediction.
201 | ```
202 | curl -X POST -H "Content-Type:application/json" -d @data/wine-quality-red.csv http://localhost:5001/invocations
203 | 
204 | [
205 |     5.551096337521979,
206 |     5.297727513113797,
207 |     5.427572126267637,
208 |     5.562886443251915,
209 |     5.562886443251915
210 | ]
211 | ```
212 | 
213 | ### 2. Predict with mlflow.sklearn.load_model()
214 | 
215 | ```
216 | python scikit_predict.py 7e674524514846799310c41f10d6b99d
217 | 
218 | predictions: [5.55109634 5.29772751 5.42757213 5.56288644 5.56288644]
219 | ```
220 | From [scikit_predict.py](scikit_predict.py):
221 | ```
222 | model = mlflow.sklearn.load_model("model",run_id="7e674524514846799310c41f10d6b99d")
223 | df = pd.read_csv("data/wine-quality-red.csv")
224 | predicted = model.predict(df)
225 | print("predicted:",predicted)
226 | ```
227 | 
228 | ### 3. Predict with mlflow.pyfunc.load_pyfunc()
229 | 
230 | ```
231 | python pyfunc_predict.py 7e674524514846799310c41f10d6b99d
232 | 
233 | predictions: [5.55109634 5.29772751 5.42757213 5.56288644 5.56288644]
234 | ```
235 | From [pyfunc_predict.py](pyfunc_predict.py):
236 | ```
237 | model_uri = mlflow.start_run("7e674524514846799310c41f10d6b99d").info.artifact_uri + "/model"
238 | model = mlflow.pyfunc.load_pyfunc(model_uri)
239 | df = pd.read_csv("data/wine-quality-red.csv")
240 | predicted = model.predict(df)
241 | print("predicted:",predicted)
242 | ```
243 | 
244 | ### 4. Batch prediction with Spark UDF (user-defined function)
245 | 
246 | Scroll right to see the prediction column.
247 | 248 | ``` 249 | pip install pyarrow 250 | 251 | spark-submit --master local[2] spark_udf_predict.py 7e674524514846799310c41f10d6b99d 252 | 253 | +-------+---------+-----------+-------+-------------+-------------------+----+--------------+---------+--------------------+----------------+------------------+ 254 | |alcohol|chlorides|citric acid|density|fixed acidity|free sulfur dioxide| pH|residual sugar|sulphates|total sulfur dioxide|volatile acidity| prediction| 255 | +-------+---------+-----------+-------+-------------+-------------------+----+--------------+---------+--------------------+----------------+------------------+ 256 | | 8.8| 0.045| 0.36| 1.001| 7.0| 45.0| 3.0| 20.7| 0.45| 170.0| 0.27| 5.551096337521979| 257 | | 9.5| 0.049| 0.34| 0.994| 6.3| 14.0| 3.3| 1.6| 0.49| 132.0| 0.3| 5.297727513113797| 258 | | 10.1| 0.05| 0.4| 0.9951| 8.1| 30.0|3.26| 6.9| 0.44| 97.0| 0.28| 5.427572126267637| 259 | | 9.9| 0.058| 0.32| 0.9956| 7.2| 47.0|3.19| 8.5| 0.4| 186.0| 0.23| 5.562886443251915| 260 | ``` 261 | From [spark_udf_predict.py](spark_udf_predict.py): 262 | ``` 263 | spark = SparkSession.builder.appName("ServePredictions").getOrCreate() 264 | df = spark.read.option("inferSchema",True).option("header", True).csv("data/wine-quality-red.csv") 265 | df = df.drop("quality") 266 | 267 | udf = mlflow.pyfunc.spark_udf(spark, "model", run_id="7e674524514846799310c41f10d6b99d") 268 | df2 = df.withColumn("prediction", udf(*df.columns)) 269 | df2.show(10) 270 | ``` 271 | -------------------------------------------------------------------------------- /sklearn/Train_Wine_Quality.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MLflow Train Wine Quality Notebook\n", 8 | "This is a Quick Start notebook.\n", 9 | "* It is based on [train.py](https://github.com/databricks/mlflow/blob/master/example/tutorial/train.py) from [MLflow's tutorial](https://mlflow.org/docs/latest/tutorial.html). 
\n", 10 | "* It creates runs in the experiment \"py/sk/ElasticNet/WineQuality\".\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 36, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/plain": [ 21 | "'http://localhost:5000'" 22 | ] 23 | }, 24 | "execution_count": 36, 25 | "metadata": {}, 26 | "output_type": "execute_result" 27 | } 28 | ], 29 | "source": [ 30 | "from __future__ import print_function\n", 31 | "import mlflow\n", 32 | "\n", 33 | "mlflow.set_tracking_uri(\"http://localhost:5000\")\n", 34 | "mlflow.tracking.get_tracking_uri()" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 37, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "def now():\n", 44 | " now = int(time.time()+.5)\n", 45 | " dt = time.strftime(\"%Y-%m-%d_%H:%M:%S\", time.gmtime(now))\n", 46 | " return dt" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 38, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "Run Start: 2019-03-26_02:10:09\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "print(\"Run Start:\",now())" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 39, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "experiment_name = \"py/sk/ElasticNet/WineQuality\"\n", 73 | "wine_data_path = \"./data/wine-quality-white.csv\"\n", 74 | "wine_data_url = \"https://raw.githubusercontent.com/mlflow/mlflow/master/examples/sklearn_elasticnet_wine/wine-quality.csv\"\n", 75 | "run_origin = \"jupyter\"" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 40, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "MLflow Version: 0.8.2\n", 88 | "experiment_id: 5\n", 89 | "experiment_name: py/sk/ElasticNet/WineQuality\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "import mlflow\n", 95 | "print(\"MLflow Version:\",mlflow.version.VERSION)\n", 96 | "mlflow.set_experiment(experiment_name)\n", 97 | "mlflow_client = mlflow.tracking.MlflowClient()\n", 98 | "experiment_id = mlflow_client.get_experiment_by_name(experiment_name).experiment_id\n", 99 | "print(\"experiment_id:\",experiment_id)\n", 100 | "print(\"experiment_name:\",experiment_name)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 41, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "import os\n", 110 | "import requests\n", 111 | "\n", 112 | "if not os.path.exists(wine_data_path):\n", 113 | " print(\"Downloading {} to {}\".format(wine_data_url,wine_data_path))\n", 114 | " rsp = requests.get(wine_data_url)\n", 115 | " with open(wine_data_path, 'w') as f:\n", 116 | " f.write(rsp.text)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "#### Write your ML code based on the`train.py` code\n", 124 | "This tutorial is based on the MLflow's example [train.py](https://github.com/databricks/mlflow/blob/master/example/tutorial/train.py), which uses an external [wine-quality.csv](https://github.com/databricks/mlflow/blob/master/example/tutorial/wine-quality.csv) dataset to predict wine quality." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 42, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "# The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality\n", 134 | "# P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. 
Reis.\n", 135 | "# Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.\n", 136 | "\n", 137 | "import os\n", 138 | "import warnings\n", 139 | "import sys\n", 140 | "\n", 141 | "import pandas as pd\n", 142 | "import numpy as np\n", 143 | "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n", 144 | "from sklearn.model_selection import train_test_split\n", 145 | "from sklearn.linear_model import ElasticNet\n", 146 | "\n", 147 | "import mlflow\n", 148 | "import mlflow.sklearn\n", 149 | "\n", 150 | "def eval_metrics(actual, pred):\n", 151 | " rmse = np.sqrt(mean_squared_error(actual, pred))\n", 152 | " mae = mean_absolute_error(actual, pred)\n", 153 | " r2 = r2_score(actual, pred)\n", 154 | " return rmse, mae, r2" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 43, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "from sklearn.linear_model import enet_path\n", 164 | "import matplotlib.pyplot as plt\n", 165 | "from itertools import cycle\n", 166 | "\n", 167 | "def plot_enet_descent_path(X, y, l1_ratio, plot_file):\n", 168 | " # Compute paths\n", 169 | " eps = 5e-3 # the smaller it is the longer is the path\n", 170 | "\n", 171 | " # Reference the global image variable\n", 172 | " global image\n", 173 | " \n", 174 | " print(\"Computing regularization path using the elastic net.\")\n", 175 | " alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=l1_ratio, fit_intercept=False)\n", 176 | "\n", 177 | " # Display results\n", 178 | " fig = plt.figure(1)\n", 179 | " ax = plt.gca()\n", 180 | "\n", 181 | " colors = cycle(['b', 'r', 'g', 'c', 'k'])\n", 182 | " neg_log_alphas_enet = -np.log10(alphas_enet)\n", 183 | " for coef_e, c in zip(coefs_enet, colors):\n", 184 | " l1 = plt.plot(neg_log_alphas_enet, coef_e, linestyle='--', c=c)\n", 185 | "\n", 186 | " plt.xlabel('-Log(alpha)')\n", 187 | " plt.ylabel('coefficients')\n", 188 | " title = 'ElasticNet Path by alpha for l1_ratio = ' + str(l1_ratio)\n", 189 | " plt.title(title)\n", 190 | " plt.axis('tight')\n", 191 | "\n", 192 | " image = fig\n", 193 | " fig.savefig(plot_file)\n", 194 | " plt.close(fig)\n", 195 | " return image " 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 44, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "def train(alpha, l1_ratio):\n", 205 | " warnings.filterwarnings(\"ignore\")\n", 206 | " np.random.seed(40)\n", 207 | "\n", 208 | " data = pd.read_csv(wine_data_path)\n", 209 | "\n", 210 | " # Split the data into training and test sets. 
(0.75, 0.25) split.\n", 211 | " train, test = train_test_split(data)\n", 212 | "\n", 213 | " # The predicted column is \"quality\" which is a scalar from [3, 9]\n", 214 | " train_x = train.drop([\"quality\"], axis=1)\n", 215 | " test_x = test.drop([\"quality\"], axis=1)\n", 216 | " train_y = train[[\"quality\"]]\n", 217 | " test_y = test[[\"quality\"]]\n", 218 | " with mlflow.start_run() as run:\n", 219 | " run_id = run.info.run_uuid\n", 220 | " print(\"run_id:\",run_id)\n", 221 | " print(\"run_origin:\",run_origin)\n", 222 | " clf = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)\n", 223 | " clf.fit(train_x, train_y)\n", 224 | "\n", 225 | " predicted_qualities = clf.predict(test_x)\n", 226 | " (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)\n", 227 | "\n", 228 | " print(\"Elasticnet model (alpha={}, l1_ratio={}):\".format(alpha, l1_ratio))\n", 229 | " print(\" RMSE:\",rmse)\n", 230 | " print(\" MAE:\",mae)\n", 231 | " print(\" R2:\",r2)\n", 232 | "\n", 233 | " mlflow.log_param(\"alpha\", alpha)\n", 234 | " mlflow.log_param(\"l1_ratio\", l1_ratio)\n", 235 | " mlflow.log_param(\"run_origin\", run_origin)\n", 236 | " mlflow.log_metric(\"rmse\", rmse)\n", 237 | " mlflow.log_metric(\"r2\", r2)\n", 238 | " mlflow.log_metric(\"mae\", mae)\n", 239 | "\n", 240 | " mlflow.sklearn.log_model(clf, \"model\")\n", 241 | " \n", 242 | " X = data.drop([\"quality\"], axis=1).values\n", 243 | " y = data[[\"quality\"]].values.ravel()\n", 244 | " plot_file = \"wine_quality.png\"\n", 245 | " plot_enet_descent_path(X, y, l1_ratio, plot_file)\n", 246 | " mlflow.log_artifact(plot_file)\n", 247 | " \n", 248 | " return (rmse,r2,mae)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 45, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "name": "stdout", 258 | "output_type": "stream", 259 | "text": [ 260 | "run_id: d8b189ad970e44f4978c5b19bbf6fa1e\n", 261 | "run_origin: jupyter\n", 262 | "Elasticnet model (alpha=0.1, l1_ratio=0.1):\n", 263 | " RMSE: 0.7792546522251949\n", 264 | " MAE: 0.6112547988118587\n", 265 | " R2: 0.2157063843066196\n", 266 | "Computing regularization path using the elastic net.\n" 267 | ] 268 | }, 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "(0.7792546522251949, 0.2157063843066196, 0.6112547988118587)" 273 | ] 274 | }, 275 | "execution_count": 45, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "train(0.1, 0.1)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 46, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "name": "stdout", 291 | "output_type": "stream", 292 | "text": [ 293 | "Run End: 2019-03-26_02:10:10\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "print(\"Run End:\",now())" 299 | ] 300 | } 301 | ], 302 | "metadata": { 303 | "kernelspec": { 304 | "display_name": "Python 3", 305 | "language": "python", 306 | "name": "python3" 307 | }, 308 | "language_info": { 309 | "codemirror_mode": { 310 | "name": "ipython", 311 | "version": 3 312 | }, 313 | "file_extension": ".py", 314 | "mimetype": "text/x-python", 315 | "name": "python", 316 | "nbconvert_exporter": "python", 317 | "pygments_lexer": "ipython3", 318 | "version": "3.6.8" 319 | }, 320 | "name": "Train Wine Quality", 321 | "notebookId": 1406514 322 | }, 323 | "nbformat": 4, 324 | "nbformat_minor": 1 325 | } 326 | -------------------------------------------------------------------------------- /sklearn/conda.yaml: 
-------------------------------------------------------------------------------- 1 | name: mlflow-demo-sklearn 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.6 6 | - pip: 7 | - mlflow==0.9.1 8 | - scikit-learn==0.19.1 9 | - numpy==1.14.5 10 | - scipy==1.1.0 11 | - matplotlib==2.2.2 12 | -------------------------------------------------------------------------------- /sklearn/create_job_existing_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "MLflow_ExistingCluster", 3 | "existing_cluster_id": "1222-015510-grams64", 4 | "email_notifications": { 5 | "on_start": ["myname@mycompany.com"], 6 | "on_success": ["myname@mycompany.com"], 7 | "on_failure": ["myname@mycompany.com"] 8 | }, 9 | "spark_python_task": { 10 | "python_file": "dbfs:/tmp/jobs/wine_quality/main_train_wine_quality.py" 11 | }, 12 | "timeout_seconds": 3600 13 | } 14 | -------------------------------------------------------------------------------- /sklearn/create_job_new_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "MLflow_RunNow_NewCluster", 3 | "new_cluster": { 4 | "spark_version": "5.3.x-scala2.11", 5 | "node_type_id": "i3.xlarge", 6 | "num_workers": 1 7 | }, 8 | "email_notifications": { 9 | "on_start": ["myname@mycompany.com"], 10 | "on_success": ["myname@mycompany.com"], 11 | "on_failure": ["myname@mycompany.com"] 12 | }, 13 | "libraries": [ 14 | { "pypi": { "package": "mlflow" } }, 15 | { "pypi": { "package": "cloudpickle" }}, 16 | { "egg": "dbfs:/tmp/jobs/wine_quality/mlflow_wine_quality-0.0.1-py3.6.egg" } 17 | ], 18 | "spark_python_task": { 19 | "python_file": "dbfs:/tmp/jobs/wine_quality/main_train_wine_quality.py" 20 | }, 21 | "timeout_seconds": 3600 22 | } 23 | -------------------------------------------------------------------------------- /sklearn/data/wine-quality.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "fixed acidity": 7, 4 | "volatile acidity": 0.27, 5 | "citric acid": 0.36, 6 | "residual sugar": 20.7, 7 | "chlorides": 0.045, 8 | "free sulfur dioxide": 45, 9 | "total sulfur dioxide": 170, 10 | "density": 1.001, 11 | "pH": 3, 12 | "sulphates": 0.45, 13 | "alcohol": 8.8 14 | } 15 | ] 16 | -------------------------------------------------------------------------------- /sklearn/main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from wine_quality.train import Trainer 3 | from argparse import ArgumentParser 4 | 5 | if __name__ == "__main__": 6 | parser = ArgumentParser() 7 | parser.add_argument("--experiment_name", dest="experiment_name", help="experiment_name", required=True) 8 | parser.add_argument("--data_path", dest="data_path", help="data_path", required=True) 9 | parser.add_argument("--alpha", dest="alpha", help="alpha", default=0.1, type=float ) 10 | parser.add_argument("--l1_ratio", dest="l1_ratio", help="l1_ratio", default=0.1, type=float ) 11 | parser.add_argument("--run_origin", dest="run_origin", help="run_origin", default="none") 12 | args = parser.parse_args() 13 | trainer = Trainer(args.experiment_name, args.data_path,args.run_origin) 14 | trainer.train(args.alpha, args.l1_ratio) 15 | -------------------------------------------------------------------------------- /sklearn/mlflow_run_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "spark_version": 
"5.3.x-scala2.11", 3 | "driver_node_type_id": "i3.xlarge", 4 | "node_type_id": "i3.xlarge", 5 | "num_workers": 1, 6 | "spark_env_vars": { 7 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 8 | }, 9 | "libraries": [ 10 | { "pypi": { "package": "mlflow" }} 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /sklearn/pickle_predict.py: -------------------------------------------------------------------------------- 1 | 2 | """ Serve predictions by unpickling model artifact file. """ 3 | 4 | from __future__ import print_function 5 | import sys 6 | import pickle 7 | import util 8 | 9 | if __name__ == "__main__": 10 | if len(sys.argv) < 1: 11 | println("ERROR: Expecting PICKLE_FILE PREDICTION_FILE") 12 | sys.exit(1) 13 | pickle_path = sys.argv[1] 14 | data_path = sys.argv[2] if len(sys.argv) > 2 else "data/wine-quality-red.csv" 15 | print("pickle_path:",pickle_path) 16 | print("data_path:",data_path) 17 | 18 | with open(pickle_path, 'rb') as f: 19 | model = pickle.load(f) 20 | print("model:",model) 21 | print("model type:",type(model)) 22 | 23 | df = util.read_prediction_data(data_path) 24 | predictions = model.predict(df) 25 | print("predictions:",predictions) 26 | -------------------------------------------------------------------------------- /sklearn/playbook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## sklearn playbook" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 4, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "os.environ[\"MLFLOW_TRACKING_URI\"] = \"http://localhost:5000\"" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### Train" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 5, 30 | "metadata": { 31 | "scrolled": true 32 | }, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "MLflow Version: 0.9.1\n", 39 | "MLflow Tracking URI: http://localhost:5000\n", 40 | "experiment_name: WineQualityExperiment\n", 41 | "run_origin: none\n", 42 | "data_path: data/wine-quality-white.csv\n", 43 | "experiment_id: 7\n", 44 | "run_id: 792037e4fdde40efb1c43eeef76a5f9b\n", 45 | " experiment_id: 7\n", 46 | " Parameters:\n", 47 | " alpha: 0.5\n", 48 | " l1_ratio: 0.5\n", 49 | " Metrics:\n", 50 | " RMSE: 0.82224284975954\n", 51 | " MAE: 0.6278761410160693\n", 52 | " R2: 0.12678721972772677\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "! python main.py --experiment_name WineQualityExperiment \\\n", 58 | " --data_path data/wine-quality-white.csv \\\n", 59 | " --alpha 0.5 --l1_ratio 0.5" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "2019/04/24 21:23:52 INFO mlflow.projects: === Creating conda environment mlflow-169b50d7002caef08e847b2615a41296bd46b052 ===\n", 72 | "Collecting package metadata: done\n", 73 | "Solving environment: done\n", 74 | "\n", 75 | "\n", 76 | "==> WARNING: A newer version of conda exists. 
<==\n", 77 | " current version: 4.6.2\n", 78 | " latest version: 4.6.14\n", 79 | "\n", 80 | "Please update conda by running\n", 81 | "\n", 82 | " $ conda update -n base -c defaults conda\n", 83 | "\n", 84 | "\n", 85 | "\n", 86 | "Downloading and Extracting Packages\n", 87 | "pip-19.1 | 1.8 MB | ##################################### | 100% \n", 88 | "Preparing transaction: done\n", 89 | "Verifying transaction: done\n", 90 | "Executing transaction: done\n", 91 | "Collecting mlflow==0.9.1 (from -r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 92 | " Using cached https://files.pythonhosted.org/packages/4f/8a/3713ce558aba91acf8495bf9e82961030858eb5fe3041c5c51186e89fc85/mlflow-0.9.1-py3-none-any.whl\n", 93 | "Collecting scikit-learn==0.19.1 (from -r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 2))\n", 94 | " Using cached https://files.pythonhosted.org/packages/f0/5e/1e1576587c5a9e8de6771806a4cccea8decd268c988453cf35ccbf892929/scikit_learn-0.19.1-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl\n", 95 | "Collecting numpy==1.14.5 (from -r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 3))\n", 96 | " Using cached https://files.pythonhosted.org/packages/f6/cd/b2c50b5190b66c711c23ef23c41d450297eb5a54d2033f8dcb3b8b13ac85/numpy-1.14.5-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl\n", 97 | "Collecting scipy==1.1.0 (from -r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 4))\n", 98 | " Using cached https://files.pythonhosted.org/packages/a0/b6/70bf61c1badb5fea82d4c558e05e76c2dee5e77bb072fe465d7c7a87287d/scipy-1.1.0-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl\n", 99 | "Collecting matplotlib==2.2.2 (from -r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 5))\n", 100 | " Using cached https://files.pythonhosted.org/packages/8a/d5/5337662b714c65100f3545ed3909e9478614d1ebf1f692a52981f3f5167b/matplotlib-2.2.2-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl\n", 101 | "Collecting pandas (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 102 | " Using cached https://files.pythonhosted.org/packages/2a/67/0a59cb257c72bb837575ca0ddf5f0fe2a482e98209b7a1bed8cde68ddb46/pandas-0.24.2-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl\n", 103 | "Collecting requests>=2.17.3 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 104 | " Using cached https://files.pythonhosted.org/packages/7d/e3/20f3d364d6c8e5d2353c72a67778eb189176f08e873c9900e10c0287b84b/requests-2.21.0-py2.py3-none-any.whl\n", 105 | "Collecting gunicorn (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 106 | " Using cached https://files.pythonhosted.org/packages/8c/da/b8dd8deb741bff556db53902d4706774c8e1e67265f69528c14c003644e6/gunicorn-19.9.0-py2.py3-none-any.whl\n", 107 | "Collecting gitpython>=2.1.0 (from mlflow==0.9.1->-r 
/Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 108 | " Using cached https://files.pythonhosted.org/packages/fe/e5/fafe827507644c32d6dc553a1c435cdf882e0c28918a5bab29f7fbebfb70/GitPython-2.1.11-py2.py3-none-any.whl\n", 109 | "Collecting click>=7.0 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 110 | " Using cached https://files.pythonhosted.org/packages/fa/37/45185cb5abbc30d7257104c434fe0b07e5a195a6847506c074527aa599ec/Click-7.0-py2.py3-none-any.whl\n", 111 | "Collecting python-dateutil (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 112 | " Using cached https://files.pythonhosted.org/packages/41/17/c62faccbfbd163c7f57f3844689e3a78bae1f403648a6afb1d0866d87fbb/python_dateutil-2.8.0-py2.py3-none-any.whl\n", 113 | "Collecting Flask (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 114 | " Using cached https://files.pythonhosted.org/packages/7f/e7/08578774ed4536d3242b14dacb4696386634607af824ea997202cd0edb4b/Flask-1.0.2-py2.py3-none-any.whl\n", 115 | "Collecting simplejson (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 116 | "Collecting querystring-parser (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 117 | "Collecting databricks-cli>=0.8.0 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 118 | "Collecting boto3>=1.7.12 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 119 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/3f/1b/4adde85d1cbbe8a8f2fef47276daf496328fc8d7c40d7a8d6a67b0eba45b/boto3-1.9.135-py2.py3-none-any.whl (128kB)\n", 120 | "\u001b[K |████████████████████████████████| 133kB 3.4MB/s eta 0:00:01\n", 121 | "\u001b[?25hCollecting sqlparse (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 122 | " Using cached https://files.pythonhosted.org/packages/ef/53/900f7d2a54557c6a37886585a91336520e5539e3ae2423ff1102daf4f3a7/sqlparse-0.3.0-py2.py3-none-any.whl\n", 123 | "Collecting entrypoints (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 124 | " Using cached https://files.pythonhosted.org/packages/ac/c6/44694103f8c221443ee6b0041f69e2740d89a25641e62fb4f2ee568f2f9c/entrypoints-0.3-py2.py3-none-any.whl\n", 125 | "Collecting docker>=3.6.0 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 126 | " Using cached https://files.pythonhosted.org/packages/48/68/c3afca1a5aa8d2997ec3b8ee822a4d752cf85907b321f07ea86888545152/docker-3.7.2-py2.py3-none-any.whl\n", 127 | "Collecting cloudpickle (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 128 | " Using cached https://files.pythonhosted.org/packages/6e/bc/67f13115adcce4efc5e4d7f8220fb9a50aaa2b5c7ed460b26cbb76aa76ad/cloudpickle-0.8.1-py2.py3-none-any.whl\n", 129 | "Collecting pyyaml (from mlflow==0.9.1->-r 
/Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 130 | "Collecting six>=1.10.0 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 131 | " Using cached https://files.pythonhosted.org/packages/73/fb/00a976f728d0d1fecfe898238ce23f502a721c0ac0ecfedb80e0d88c64e9/six-1.12.0-py2.py3-none-any.whl\n", 132 | "Collecting protobuf>=3.6.0 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 133 | " Using cached https://files.pythonhosted.org/packages/46/1e/c481d59760cded074d89ff51c99381708111c550ff698934cc296d27df2c/protobuf-3.7.1-cp36-cp36m-macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl\n", 134 | "Collecting mleap>=0.8.1 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 135 | "Collecting cycler>=0.10 (from matplotlib==2.2.2->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 5))\n", 136 | " Using cached https://files.pythonhosted.org/packages/f7/d2/e07d3ebb2bd7af696440ce7e754c59dd546ffe1bbe732c8ab68b9c834e61/cycler-0.10.0-py2.py3-none-any.whl\n", 137 | "Collecting pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 (from matplotlib==2.2.2->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 5))\n", 138 | " Using cached https://files.pythonhosted.org/packages/dd/d9/3ec19e966301a6e25769976999bd7bbe552016f0d32b577dc9d63d2e0c49/pyparsing-2.4.0-py2.py3-none-any.whl\n", 139 | "Collecting kiwisolver>=1.0.1 (from matplotlib==2.2.2->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 5))\n" 140 | ] 141 | }, 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | " Downloading https://files.pythonhosted.org/packages/16/e7/df58eb8868d183223692d2a62529a594f6414964a3ae93548467b146a24d/kiwisolver-1.1.0.tar.gz\n", 147 | "Collecting pytz (from matplotlib==2.2.2->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 5))\n", 148 | " Using cached https://files.pythonhosted.org/packages/3d/73/fe30c2daaaa0713420d0382b16fbb761409f532c56bdcc514bf7b6262bb6/pytz-2019.1-py2.py3-none-any.whl\n", 149 | "Collecting chardet<3.1.0,>=3.0.2 (from requests>=2.17.3->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 150 | " Using cached https://files.pythonhosted.org/packages/bc/a9/01ffebfb562e4274b6487b4bb1ddec7ca55ec7510b22e4c51f14098443b8/chardet-3.0.4-py2.py3-none-any.whl\n", 151 | "Collecting idna<2.9,>=2.5 (from requests>=2.17.3->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 152 | " Using cached https://files.pythonhosted.org/packages/14/2c/cd551d81dbe15200be1cf41cd03869a46fe7226e7450af7a6545bfc474c9/idna-2.8-py2.py3-none-any.whl\n", 153 | "Collecting urllib3<1.25,>=1.21.1 (from requests>=2.17.3->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 154 | " Using cached https://files.pythonhosted.org/packages/df/1c/59cca3abf96f991f2ec3131a4ffe72ae3d9ea1f5894abe8a9c5e3c77cfee/urllib3-1.24.2-py2.py3-none-any.whl\n", 155 | "Requirement already satisfied: certifi>=2017.4.17 in 
/Users/ander/miniconda3/envs/mlflow-169b50d7002caef08e847b2615a41296bd46b052/lib/python3.6/site-packages (from requests>=2.17.3->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1)) (2019.3.9)\n", 156 | "Collecting gitdb2>=2.0.0 (from gitpython>=2.1.0->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 157 | " Using cached https://files.pythonhosted.org/packages/da/30/a407568aa8d8f25db817cf50121a958722f3fc5f87e3a6fba1f40c0633e3/gitdb2-2.0.5-py2.py3-none-any.whl\n", 158 | "Collecting Jinja2>=2.10 (from Flask->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 159 | " Using cached https://files.pythonhosted.org/packages/1d/e7/fd8b501e7a6dfe492a433deb7b9d833d39ca74916fa8bc63dd1a4947a671/Jinja2-2.10.1-py2.py3-none-any.whl\n", 160 | "Collecting itsdangerous>=0.24 (from Flask->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 161 | " Using cached https://files.pythonhosted.org/packages/76/ae/44b03b253d6fade317f32c24d100b3b35c2239807046a4c953c7b89fa49e/itsdangerous-1.1.0-py2.py3-none-any.whl\n", 162 | "Collecting Werkzeug>=0.14 (from Flask->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 163 | " Using cached https://files.pythonhosted.org/packages/18/79/84f02539cc181cdbf5ff5a41b9f52cae870b6f632767e43ba6ac70132e92/Werkzeug-0.15.2-py2.py3-none-any.whl\n", 164 | "Collecting configparser>=0.3.5 (from databricks-cli>=0.8.0->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 165 | " Using cached https://files.pythonhosted.org/packages/ba/05/6c96328e92e625fc31445d24d75a2c92ef9ba34fc5b037fe69693c362a0d/configparser-3.7.4-py2.py3-none-any.whl\n", 166 | "Collecting tabulate>=0.7.7 (from databricks-cli>=0.8.0->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 167 | "Collecting botocore<1.13.0,>=1.12.135 (from boto3>=1.7.12->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 168 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d3/36/08db4978d59d75750ef6da9835150b901c6fb96f6d5f30d8c50eb424ed4e/botocore-1.12.135-py2.py3-none-any.whl (5.4MB)\n", 169 | "\u001b[K |████████████████████████████████| 5.4MB 3.6MB/s eta 0:00:01\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "! mlflow run . -P alpha=0.01 -P l1_ratio=0.75 -P run_origin=LocalRun \\\n", 175 | " -P data_path=data/wine-quality-white.csv \\\n", 176 | " --experiment-id=2" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "! 
mlflow run https://github.com/amesar/mlflow-fun.git#examples/scikit-learn/wine-quality \\\n", 186 | " -P alpha=0.01 -P l1_ratio=0.75 -P run_origin=GitRun \\\n", 187 | " -P data_path=data/wine-quality-white.csv \\\n", 188 | " --experiment-id=2" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "### Predict" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 5, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "MLflow Version: 0.9.1\n", 208 | "data_path: data/wine-quality-white.csv\n", 209 | "run_id: 4fc01818631840d9ae4d8ad9f86299fc\n", 210 | "model_uri: /Users/ander/work/mlflow/local_mlrun/mlruns/2/4fc01818631840d9ae4d8ad9f86299fc/artifacts/model\n", 211 | "model: ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True, l1_ratio=1.0,\n", 212 | " max_iter=1000, normalize=False, positive=False, precompute=False,\n", 213 | " random_state=42, selection='cyclic', tol=0.0001, warm_start=False)\n", 214 | "predictions: [5.56281868 5.30043479 5.70350174 ... 5.44619008 6.55740254 6.2852277 ]\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "! python pyfunc_predict.py 4fc01818631840d9ae4d8ad9f86299fc data/wine-quality-white.csv" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 6, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | "MLflow Version: 0.9.1\n", 232 | "data_path: data/wine-quality-white.csv\n", 233 | "run_id: 4fc01818631840d9ae4d8ad9f86299fc\n", 234 | "model: ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True, l1_ratio=1.0,\n", 235 | " max_iter=1000, normalize=False, positive=False, precompute=False,\n", 236 | " random_state=42, selection='cyclic', tol=0.0001, warm_start=False)\n", 237 | "predictions: [5.56281868 5.30043479 5.70350174 ... 5.44619008 6.55740254 6.2852277 ]\n" 238 | ] 239 | } 240 | ], 241 | "source": [ 242 | "! python scikit_predict.py 4fc01818631840d9ae4d8ad9f86299fc data/wine-quality-white.csv" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 9, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "19/04/25 14:06:00 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 255 | "19/04/25 14:06:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 256 | "19/04/25 14:06:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n", 257 | "path: data/wine-quality-white.csv\n", 258 | "run_id= 4fc01818631840d9ae4d8ad9f86299fc\n", 259 | "MLflow Version: 0.9.1\n", 260 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+\n", 261 | "|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density| pH|sulphates|alcohol|\n", 262 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+\n", 263 | "| 7.0| 0.27| 0.36| 20.7| 0.045| 45.0| 170.0| 1.001| 3.0| 0.45| 8.8|\n", 264 | "| 6.3| 0.3| 0.34| 1.6| 0.049| 14.0| 132.0| 0.994| 3.3| 0.49| 9.5|\n", 265 | "| 8.1| 0.28| 0.4| 6.9| 0.05| 30.0| 97.0| 0.9951|3.26| 0.44| 10.1|\n", 266 | "| 7.2| 0.23| 0.32| 8.5| 0.058| 47.0| 186.0| 0.9956|3.19| 0.4| 9.9|\n", 267 | "| 7.2| 0.23| 0.32| 8.5| 0.058| 47.0| 186.0| 0.9956|3.19| 0.4| 9.9|\n", 268 | "| 8.1| 0.28| 0.4| 6.9| 0.05| 30.0| 97.0| 0.9951|3.26| 0.44| 10.1|\n", 269 | "| 6.2| 0.32| 0.16| 7.0| 0.045| 30.0| 136.0| 0.9949|3.18| 0.47| 9.6|\n", 270 | "| 7.0| 0.27| 0.36| 20.7| 0.045| 45.0| 170.0| 1.001| 3.0| 0.45| 8.8|\n", 271 | "| 6.3| 0.3| 0.34| 1.6| 0.049| 14.0| 132.0| 0.994| 3.3| 0.49| 9.5|\n", 272 | "| 8.1| 0.22| 0.43| 1.5| 0.044| 28.0| 129.0| 0.9938|3.22| 0.45| 11.0|\n", 273 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+\n", 274 | "only showing top 10 rows\n", 275 | "\n", 276 | "/Users/ander/venvs/mlflow-venv/lib/python2.7/site-packages/pyarrow/__init__.py:152: UserWarning: pyarrow.open_stream is deprecated, please use pyarrow.ipc.open_stream\n", 277 | " warnings.warn(\"pyarrow.open_stream is deprecated, please use \"\n", 278 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+------------------+\n", 279 | "|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density| pH|sulphates|alcohol| prediction|\n", 280 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+------------------+\n", 281 | "| 7.0| 0.27| 0.36| 20.7| 0.045| 45.0| 170.0| 1.001| 3.0| 0.45| 8.8| 5.562818680029495|\n", 282 | "| 6.3| 0.3| 0.34| 1.6| 0.049| 14.0| 132.0| 0.994| 3.3| 0.49| 9.5|5.3004347872132715|\n", 283 | "| 8.1| 0.28| 0.4| 6.9| 0.05| 30.0| 97.0| 0.9951|3.26| 0.44| 10.1| 5.703501740309309|\n", 284 | "| 7.2| 0.23| 0.32| 8.5| 0.058| 47.0| 186.0| 0.9956|3.19| 0.4| 9.9| 5.742035433111079|\n", 285 | "| 7.2| 0.23| 0.32| 8.5| 0.058| 47.0| 186.0| 0.9956|3.19| 0.4| 9.9| 5.742035433111079|\n", 286 | "| 8.1| 0.28| 0.4| 6.9| 0.05| 30.0| 97.0| 0.9951|3.26| 0.44| 10.1| 5.703501740309309|\n", 287 | "| 6.2| 0.32| 0.16| 7.0| 0.045| 30.0| 136.0| 0.9949|3.18| 0.47| 9.6| 5.526523431037601|\n", 288 | "| 7.0| 0.27| 0.36| 20.7| 0.045| 45.0| 170.0| 1.001| 3.0| 0.45| 8.8| 5.562818680029495|\n", 289 | "| 6.3| 0.3| 0.34| 1.6| 0.049| 14.0| 132.0| 0.994| 3.3| 0.49| 9.5|5.3004347872132715|\n", 290 | "| 8.1| 0.22| 0.43| 1.5| 0.044| 28.0| 129.0| 0.9938|3.22| 0.45| 11.0| 5.912851142126504|\n", 291 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+------------------+\n", 292 | "only showing top 10 
rows\n", 293 | "\n", 294 | "/Users/ander/venvs/mlflow-venv/lib/python2.7/site-packages/pyarrow/__init__.py:152: UserWarning: pyarrow.open_stream is deprecated, please use pyarrow.ipc.open_stream\n", 295 | " warnings.warn(\"pyarrow.open_stream is deprecated, please use \"\n", 296 | "+------------------+\n", 297 | "| prediction|\n", 298 | "+------------------+\n", 299 | "| 5.562818680029495|\n", 300 | "|5.3004347872132715|\n", 301 | "| 5.703501740309309|\n", 302 | "| 5.742035433111079|\n", 303 | "| 5.742035433111079|\n", 304 | "| 5.703501740309309|\n", 305 | "| 5.526523431037601|\n", 306 | "| 5.562818680029495|\n", 307 | "|5.3004347872132715|\n", 308 | "| 5.912851142126504|\n", 309 | "+------------------+\n", 310 | "only showing top 10 rows\n", 311 | "\n", 312 | "/Users/ander/venvs/mlflow-venv/lib/python2.7/site-packages/pyarrow/__init__.py:152: UserWarning: pyarrow.open_stream is deprecated, please use pyarrow.ipc.open_stream\n", 313 | " warnings.warn(\"pyarrow.open_stream is deprecated, please use \"\n", 314 | "predictions: 5.5628187\n" 315 | ] 316 | } 317 | ], 318 | "source": [ 319 | "! spark-submit --master local[2] spark_udf_predict.py \\\n", 320 | " 4fc01818631840d9ae4d8ad9f86299fc data/wine-quality-white.csv" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [] 329 | } 330 | ], 331 | "metadata": { 332 | "kernelspec": { 333 | "display_name": "Python 2", 334 | "language": "python", 335 | "name": "python2" 336 | }, 337 | "language_info": { 338 | "codemirror_mode": { 339 | "name": "ipython", 340 | "version": 2 341 | }, 342 | "file_extension": ".py", 343 | "mimetype": "text/x-python", 344 | "name": "python", 345 | "nbconvert_exporter": "python", 346 | "pygments_lexer": "ipython2", 347 | "version": "2.7.14" 348 | } 349 | }, 350 | "nbformat": 4, 351 | "nbformat_minor": 2 352 | } 353 | -------------------------------------------------------------------------------- /sklearn/pyfunc_predict.py: -------------------------------------------------------------------------------- 1 | 2 | # Serve predictions with mlflow.pyfunc.load_pyfunc() 3 | 4 | from __future__ import print_function 5 | import sys 6 | import mlflow 7 | import mlflow.pyfunc 8 | import mlflow.tracking 9 | import util 10 | 11 | if __name__ == "__main__": 12 | if len(sys.argv) < 1: 13 | println("ERROR: Expecting RUN_ID PREDICTION_FILE") 14 | sys.exit(1) 15 | print("MLflow Version:", mlflow.version.VERSION) 16 | run_id = sys.argv[1] 17 | data_path = sys.argv[2] if len(sys.argv) > 2 else "data/wine-quality-red.csv" 18 | print("data_path:",data_path) 19 | print("run_id:",run_id) 20 | 21 | client = mlflow.tracking.MlflowClient() 22 | model_uri = client.get_run(run_id).info.artifact_uri + "/model" 23 | print("model_uri:",model_uri) 24 | model = mlflow.pyfunc.load_pyfunc(model_uri) 25 | print("model:",model) 26 | 27 | df = util.read_prediction_data(data_path) 28 | predictions = model.predict(df) 29 | print("predictions:",predictions) 30 | -------------------------------------------------------------------------------- /sklearn/run_submit_existing_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "MLflow_RunSubmit_ExistingCluster", 3 | "existing_cluster_id": "1222-015510-grams64", 4 | "timeout_seconds": 3600, 5 | "libraries": [ 6 | { "pypi": { "package": "mlflow" } }, 7 | { "pypi": { "package": "cloudpickle" }}, 8 | { "egg": 
"dbfs:/tmp/jobs/wine_quality/mlflow_wine_quality-0.0.1-py3.6.egg" } 9 | ], 10 | "spark_python_task": { 11 | "python_file": "dbfs:/tmp/jobs/wine_quality/main_train_wine_quality.py", 12 | "parameters": [ "/Users/john.doe@acme.com/experiments/WineQuality", 0.3, 0.3, "/dbfs/tmp/jobs/wine_quality/wine-quality.csv", "run_submit_existing_cluster_egg" ] 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /sklearn/run_submit_new_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "MLflow_RunSubmit_NewCluster", 3 | "new_cluster": { 4 | "spark_version": "5.1.x-scala2.11", 5 | "node_type_id": "i3.xlarge", 6 | "num_workers": 1 7 | }, 8 | "libraries": [ 9 | { "pypi": { "package": "mlflow" } }, 10 | { "egg": "dbfs:/tmp/jobs/wine_quality/mlflow_wine_quality-0.0.1-py3.6.egg" } 11 | ], 12 | "spark_python_task": { 13 | "python_file": "dbfs:/tmp/jobs/wine_quality/main_train_wine_quality.py", 14 | "parameters": [ "/Users/john.doe@acme.com/experiments/WineQuality", 0.3, 0.3, "/dbfs/tmp/jobs/wine_quality/wine-quality.csv", "run_submit_new_cluster_egg" ] 15 | }, 16 | "timeout_seconds": 3600 17 | } 18 | -------------------------------------------------------------------------------- /sklearn/scikit_predict.py: -------------------------------------------------------------------------------- 1 | 2 | # Serve predictions with mlflow.sklearn.load_model() 3 | 4 | from __future__ import print_function 5 | import sys 6 | import mlflow 7 | import mlflow.sklearn 8 | import util 9 | 10 | if __name__ == "__main__": 11 | if len(sys.argv) < 1: 12 | println("ERROR: Expecting RUN_ID PREDICTION_FILE") 13 | sys.exit(1) 14 | print("MLflow Version:", mlflow.version.VERSION) 15 | run_id = sys.argv[1] 16 | data_path = sys.argv[2] if len(sys.argv) > 2 else "data/wine-quality-red.csv" 17 | print("data_path:",data_path) 18 | print("run_id:",run_id) 19 | 20 | model = mlflow.sklearn.load_model("model", run_id=run_id) 21 | print("model:",model) 22 | 23 | df = util.read_prediction_data(data_path) 24 | predictions = model.predict(df) 25 | print("predictions:",predictions) 26 | -------------------------------------------------------------------------------- /sklearn/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='mlflow_wine_quality', 4 | version='0.0.1', 5 | description='mlflow_wine_quality', 6 | author='Andre', 7 | packages=['wine_quality'], 8 | zip_safe=False) 9 | -------------------------------------------------------------------------------- /sklearn/spark_udf_predict.py: -------------------------------------------------------------------------------- 1 | """ 2 | Serve predictions with Spark UDF. 
3 | """ 4 | from __future__ import print_function 5 | 6 | import sys 7 | from pyspark.sql import SparkSession 8 | import mlflow 9 | import mlflow.sklearn 10 | 11 | if __name__ == "__main__": 12 | path = sys.argv[2] if len(sys.argv) > 2 else "data/wine-quality-red.csv" 13 | run_id = sys.argv[1] 14 | print("path:",path) 15 | print("run_id=",run_id) 16 | print("MLflow Version:", mlflow.version.VERSION) 17 | 18 | spark = SparkSession.builder.appName("ServePredictions").getOrCreate() 19 | 20 | df = spark.read.option("inferSchema",True).option("header", True).csv(path) if path.endswith(".csv") \ 21 | else spark.read.option("multiLine",True).json(path) 22 | 23 | if "quality" in df.columns: 24 | df = df.drop("quality") 25 | df.show(10) 26 | 27 | udf = mlflow.pyfunc.spark_udf(spark, "model", run_id=run_id) 28 | df2 = df.withColumn("prediction", udf(*df.columns)) 29 | df2.show(10) 30 | df2.select("prediction").show(10) 31 | pred = df2.select("prediction").first()[0] 32 | print("predictions: {:,.7f}".format(pred)) 33 | 34 | -------------------------------------------------------------------------------- /sklearn/util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import pandas as pd 3 | 4 | def read_prediction_data(data_path): 5 | df = pd.read_csv(data_path) if data_path.endswith(".csv") else pd.read_json(data_path) 6 | #print("df.shape:",df.shape) 7 | #print("df.columns:",df.columns) 8 | if 'quality' in df: 9 | df = df.drop(['quality'], axis=1) 10 | return df 11 | -------------------------------------------------------------------------------- /sklearn/wine_quality/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amesar/mlflow-spark-summit-2019/3d4791f1defae41c2a6c570b379f129ed0ae59cb/sklearn/wine_quality/__init__.py -------------------------------------------------------------------------------- /sklearn/wine_quality/plot_utils.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools import cycle 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | def plot_enet_descent_path(X, y, l1_ratio, alphas_enet, coefs_enet, plot_file): 7 | fig = plt.figure(1) 8 | ax = plt.gca() 9 | 10 | colors = cycle(['b', 'r', 'g', 'c', 'k']) 11 | neg_log_alphas_enet = -np.log10(alphas_enet) 12 | for coef_e, c in zip(coefs_enet, colors): 13 | l2 = plt.plot(neg_log_alphas_enet, coef_e, linestyle='--', c=c) 14 | 15 | plt.xlabel('-Log(alpha)') 16 | plt.ylabel('coefficients') 17 | title = 'ElasticNet Path by alpha for l1_ratio = ' + str(l1_ratio) 18 | plt.title(title) 19 | plt.axis('tight') 20 | 21 | fig.savefig(plot_file) 22 | plt.close(fig) 23 | return fig 24 | -------------------------------------------------------------------------------- /sklearn/wine_quality/train.py: -------------------------------------------------------------------------------- 1 | # The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality 2 | # P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. 3 | # Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009. 
4 | 5 | from __future__ import print_function 6 | import os 7 | import sys 8 | import platform 9 | 10 | import pandas as pd 11 | import numpy as np 12 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.linear_model import ElasticNet, enet_path 15 | 16 | import mlflow 17 | import mlflow.sklearn 18 | from wine_quality import plot_utils 19 | 20 | print("MLflow Version:", mlflow.version.VERSION) 21 | print("MLflow Tracking URI:", mlflow.get_tracking_uri()) 22 | 23 | class Trainer(object): 24 | def __init__(self, experiment_name, data_path, run_origin="none"): 25 | self.experiment_name = experiment_name 26 | self.data_path = data_path 27 | self.run_origin = run_origin 28 | np.random.seed(40) 29 | 30 | print("experiment_name:",self.experiment_name) 31 | print("run_origin:",run_origin) 32 | 33 | # Read the wine-quality csv file 34 | print("data_path:",data_path) 35 | data = pd.read_csv(data_path) 36 | 37 | # Split the data into training and test sets. (0.75, 0.25) split. 38 | train, test = train_test_split(data) 39 | 40 | # The predicted column is "quality" which is a scalar from [3, 9] 41 | self.train_x = train.drop(["quality"], axis=1) 42 | self.test_x = test.drop(["quality"], axis=1) 43 | self.train_y = train[["quality"]] 44 | self.test_y = test[["quality"]] 45 | self.current_file = os.path.basename(__file__) 46 | 47 | self.X = data.drop(["quality"], axis=1).values 48 | self.y = data[["quality"]].values.ravel() 49 | 50 | # If using 'mlflow run' must use --experiment-id to set experiment since set_experiment() does not work 51 | if self.experiment_name != "none": 52 | mlflow.set_experiment(experiment_name) 53 | client = mlflow.tracking.MlflowClient() 54 | experiment_id = client.get_experiment_by_name(experiment_name).experiment_id 55 | print("experiment_id:",experiment_id) 56 | 57 | def eval_metrics(self, actual, pred): 58 | rmse = np.sqrt(mean_squared_error(actual, pred)) 59 | mae = mean_absolute_error(actual, pred) 60 | r2 = r2_score(actual, pred) 61 | return rmse, mae, r2 62 | 63 | def train(self, alpha, l1_ratio): 64 | with mlflow.start_run(source_name=self.current_file) as run: 65 | run_id = run.info.run_uuid 66 | print("run_id:",run_id) 67 | experiment_id = run.info.experiment_id 68 | print(" experiment_id:",experiment_id) 69 | clf = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42) 70 | clf.fit(self.train_x, self.train_y) 71 | 72 | predicted_qualities = clf.predict(self.test_x) 73 | (rmse, mae, r2) = self.eval_metrics(self.test_y, predicted_qualities) 74 | 75 | #print("Parameters:(alpha={}, l1_ratio={}):".format(alpha, l1_ratio)) 76 | print(" Parameters:") 77 | print(" alpha:",alpha) 78 | print(" l1_ratio:",l1_ratio) 79 | print(" Metrics:") 80 | print(" RMSE:",rmse) 81 | print(" MAE:",mae) 82 | print(" R2:",r2) 83 | 84 | mlflow.log_param("alpha", alpha) 85 | mlflow.log_param("l1_ratio", l1_ratio) 86 | 87 | mlflow.log_metric("rmse", rmse) 88 | mlflow.log_metric("r2", r2) 89 | mlflow.log_metric("mae", mae) 90 | 91 | mlflow.set_tag("data_path", self.data_path) 92 | mlflow.set_tag("exp_id", experiment_id) 93 | mlflow.set_tag("exp_name", self.experiment_name) 94 | mlflow.set_tag("run_origin", self.run_origin) 95 | mlflow.set_tag("platform", platform.system()) 96 | 97 | mlflow.sklearn.log_model(clf, "model") 98 | 99 | eps = 5e-3 # the smaller it is the longer is the path 100 | alphas_enet, coefs_enet, _ = enet_path(self.X, self.y, eps=eps, l1_ratio=l1_ratio, fit_intercept=False) 
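# enet_path computes the ElasticNet coefficients along the regularization path;
# the descent-path plot built from them below is attached to the run via mlflow.log_artifact.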
101 | plot_file = "wine_ElasticNet-paths.png" 102 | plot_utils.plot_enet_descent_path(self.X, self.y, l1_ratio, alphas_enet, coefs_enet, plot_file) 103 | mlflow.log_artifact(plot_file) 104 | 105 | return (experiment_id,run_id) 106 | --------------------------------------------------------------------------------