├── README.md
├── best_run
│   ├── README.md
│   └── best_run.py
├── data
│   ├── README.md
│   └── sample_libsvm_data.txt
├── dump
│   ├── README.md
│   ├── dump_experiment.py
│   ├── dump_run.py
│   ├── dump_utils.py
│   ├── experiment.txt
│   └── run.txt
├── hello_world
│   ├── MLproject
│   ├── README.md
│   ├── cluster.json
│   ├── conda.yaml
│   ├── hello_world.ipynb
│   ├── hello_world.py
│   └── playbook.ipynb
├── pyspark
│   ├── MLproject
│   ├── README.md
│   ├── conda.yaml
│   ├── playbook.ipynb
│   ├── predict.py
│   └── train.py
├── scala_spark
│   ├── README.md
│   ├── playbook.ipynb
│   ├── pom.xml
│   ├── run_submit_existing_cluster.json
│   ├── run_submit_new_cluster.json
│   └── src
│       └── main
│           └── scala
│               └── org
│                   └── andre
│                       └── mlflow
│                           └── examples
│                               ├── MLeapUtils.scala
│                               ├── MLflowUtils.scala
│                               ├── decisiontree
│                               │   ├── PredictDecisionTree.scala
│                               │   └── TrainDecisionTree.scala
│                               └── hello
│                                   └── HelloWorld.scala
├── search
│   ├── README.md
│   └── search.py
└── sklearn
    ├── MLproject
    ├── README.md
    ├── Train_Wine_Quality.ipynb
    ├── conda.yaml
    ├── create_job_existing_cluster.json
    ├── create_job_new_cluster.json
    ├── data
    │   ├── wine-quality-red.csv
    │   ├── wine-quality-white.csv
    │   └── wine-quality.json
    ├── main.py
    ├── mlflow_run_cluster.json
    ├── pickle_predict.py
    ├── playbook.ipynb
    ├── pyfunc_predict.py
    ├── run_submit_existing_cluster.json
    ├── run_submit_new_cluster.json
    ├── scikit_predict.py
    ├── setup.py
    ├── spark_udf_predict.py
    ├── util.py
    └── wine_quality
        ├── __init__.py
        ├── plot_utils.py
        └── train.py

/README.md:
--------------------------------------------------------------------------------
1 | # mlflow-spark-summit-2019
2 | 
3 | MLflow code for Spark Summit 2019.
4 | 
5 | Session: [Managing the Complete Machine Learning Lifecycle with MLflow](https://databricks.com/sparkaisummit/north-america/sessions-single-2019?id=183).
6 | 
7 | ## Setup
8 | ```
9 | pip install mlflow==0.9.1
10 | pip install matplotlib
11 | pip install pyarrow
12 | ```
13 | 
14 | ## MLflow Server
15 | ```
16 | virtualenv mlflow_server
17 | source mlflow_server/bin/activate
18 | mlflow server --host 0.0.0.0 --port 5000 --backend-store-uri $PWD/mlruns --default-artifact-root $PWD/mlruns
19 | ```
20 | 
21 | ## Examples
22 | Before running an experiment:
23 | ```
24 | export MLFLOW_TRACKING_URI=http://localhost:5000
25 | ```
26 | 
27 | * [hello_world](hello_world) - Hello World
28 | * [sklearn](sklearn) - Scikit-learn model
29 | * [pyspark](pyspark) - PySpark model
30 | * [scala_spark](scala_spark) - Scala Spark ML model using the Java client
31 | * [search](search) - Shows the new [MLflow 0.9.1 Search](https://mlflow.org/docs/latest/search-syntax.html) feature
32 | * [dump](dump) - Shows usage of some [mlflow.tracking](https://mlflow.org/docs/latest/python_api/mlflow.tracking.html) package methods
33 | * [best_run](best_run) - Finds the best model run
34 | 
--------------------------------------------------------------------------------
/best_run/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # mlflow-spark-summit-2019 - best_run
3 | 
4 | Finds the best run of an experiment by searching for the min or max of a metric.
5 | 
6 | Ideally we would like to execute the search on the server side for scalability reasons.
7 | Since the [search](https://www.mlflow.org/docs/latest/search-syntax.html) syntax does not support min/max, we have to perform the search logic on the client side.
8 | 
9 | Two implementations:
10 | * Slow - Finds the best run by calling get_run for each run. Optimized for space, as response payloads are small.
11 | * Fast - Finds the best run by calling search once to get data for all of an experiment's runs. Optimized for time, but the response payload will be large for experiments with many runs.
12 | 
13 | Sample run for [best_run.py](best_run.py):
14 | ```
15 | python best_run.py --experiment_id 2 --metric rmse --ascending
16 | ```
17 | ```
18 | slow best: ('3d57e49ba31843ac9ea3f4443ac4fbac', 0.7585747707504502)
19 | fast best: ('3d57e49ba31843ac9ea3f4443ac4fbac', 0.7585747707504502)
20 | ```
21 | 
--------------------------------------------------------------------------------
/best_run/best_run.py:
--------------------------------------------------------------------------------
1 | 
2 | from argparse import ArgumentParser
3 | import mlflow
4 | client = mlflow.tracking.MlflowClient()
5 | 
6 | def lt(x,y): return x < y
7 | def gt(x,y): return x > y
8 | 
9 | def calc(metric, run, best, funk):
10 |     for m in run.data.metrics:
11 |         if m.key == metric and (best is None or funk(m.value, best[1])):
12 |             best = (run.info.run_uuid, m.value)
13 |     return best
14 | 
15 | def get_best_run_slow(experiment_id, metric, ascending=False):
16 |     """
17 |     Finds the best run by calling get_run for each run.
18 |     """
19 |     funk = lt if ascending else gt
20 |     best = None
21 |     infos = client.list_run_infos(experiment_id)
22 |     for info in infos:
23 |         run = client.get_run(info.run_uuid)
24 |         best = calc(metric, run, best, funk)
25 |     return best
26 | 
27 | def get_best_run_fast(experiment_id, metric, ascending=False):
28 |     """
29 |     Finds the best run by calling search once to get data for all of an experiment's runs.
30 |     """
31 |     funk = lt if ascending else gt
32 |     best = None
33 |     runs = client.search_runs([experiment_id], "")
34 |     for run in runs:
35 |         best = calc(metric, run, best, funk)
36 |     return best
37 | 
38 | if __name__ == "__main__":
39 |     parser = ArgumentParser()
40 |     parser.add_argument("--experiment_id", dest="experiment_id", help="Experiment ID", type=str, required=True)
41 |     parser.add_argument("--metric", dest="metric", help="Metric", type=str, required=True)
42 |     parser.add_argument("--ascending", dest="ascending", help="ascending", required=False, default=False, action="store_true")
43 |     parser.add_argument("--which", dest="which", help="Which: fast|slow|both", type=str, default="both")
44 |     args = parser.parse_args()
45 | 
46 |     if args.which in ['slow','both']:
47 |         best = get_best_run_slow(args.experiment_id, args.metric, args.ascending)
48 |         print("slow best:", best)
49 |     if args.which in ['fast','both']:
50 |         best = get_best_run_fast(args.experiment_id, args.metric, args.ascending)
51 |         print("fast best:", best)
52 | 
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | 
2 | sample_libsvm_data.txt:
3 | * https://github.com/apache/spark/tree/master/data/mllib/sample_libsvm_data.txt
4 | * $SPARK_HOME/data/mllib/sample_libsvm_data.txt
--------------------------------------------------------------------------------
/dump/README.md:
--------------------------------------------------------------------------------
1 | # mlflow-spark-summit-2019 - dump
2 | 
3 | Dumps all experiment or run information recursively.
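Both scripts boil down to two `mlflow.tracking` client calls plus a recursive walk over each run's artifacts. A minimal sketch against the 0.9.x API (experiment ID 2 matches the sample dumps referenced below):
```
import mlflow
client = mlflow.tracking.MlflowClient()
for info in client.list_run_infos(2):     # one RunInfo per run of the experiment
    run = client.get_run(info.run_uuid)   # full params, metrics and tags
    print(run.info.run_uuid, [(m.key, m.value) for m in run.data.metrics])
```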
4 | 
5 | **Dump Run**
6 | 
7 | * [dump_run.py](dump_run.py)
8 | * [sample dump](run.txt)
9 | 
10 | ```
11 | python dump_run.py --run_id 2cbab69842e4412c99bfb5e15344bc42 --artifact_max_level 5
12 | ```
13 | 
14 | **Dump Experiment**
15 | 
16 | * [dump_experiment.py](dump_experiment.py)
17 | * [sample dump](experiment.txt)
18 | 
19 | ```
20 | python dump_experiment.py --experiment_id_or_name 2 --show_runs --artifact_max_level 5
21 | ```
22 | 
--------------------------------------------------------------------------------
/dump/dump_experiment.py:
--------------------------------------------------------------------------------
1 | 
2 | """
3 | Recursively dumps all information about an experiment, including all details of its runs and their params, metrics and artifacts.
4 | Note that this can be expensive. Adjust your artifact_max_level.
5 | """
6 | 
7 | from __future__ import print_function
8 | import sys
9 | from argparse import ArgumentParser
10 | import mlflow
11 | from dump_utils import *
12 | 
13 | print("MLflow Version:", mlflow.version.VERSION)
14 | 
15 | def dump_experiment(exp):
16 |     print("Experiment Details:")
17 |     for k,v in exp.__dict__.items(): print("  {}: {}".format(k[1:],v))
18 | 
19 | def get_runs(client, infos, artifact_max_level):
20 |     for info in infos:
21 |         run = client.get_run(info.run_uuid)
22 |         dump_run(run)
23 |         dump_artifacts(client, info.run_uuid, "", INDENT_INC, artifact_max_level)
24 | 
25 | def dump(exp_id_or_name, artifact_max_level, show_runs):
26 |     print("Options:")
27 |     print("  exp_id_or_name:", exp_id_or_name)
28 |     print("  artifact_max_level:", artifact_max_level)
29 |     print("  show_runs:", show_runs)
30 |     client = mlflow.tracking.MlflowClient()
31 |     if exp_id_or_name.isdigit():
32 |         exp_id = int(exp_id_or_name)
33 |     else:
34 |         print("experiment_name:", exp_id_or_name)
35 |         exp_id = client.get_experiment_by_name(exp_id_or_name).experiment_id
36 |     print("experiment_id:", exp_id)
37 |     exp = client.get_experiment(exp_id)
38 |     dump_experiment(exp)
39 |     infos = client.list_run_infos(exp_id)
40 |     print("  #runs:", len(infos))
41 |     if not show_runs:
42 |         return
43 |     get_runs(client, infos, artifact_max_level)
44 |     print("#runs:", len(infos))
45 | 
46 | if __name__ == "__main__":
47 |     parser = ArgumentParser()
48 |     parser.add_argument("--experiment_id_or_name", dest="experiment_id", help="Experiment ID or name", required=True)
49 |     parser.add_argument("--artifact_max_level", dest="artifact_max_level", help="Number of artifact levels to recurse", required=False, default=1, type=int)
50 |     parser.add_argument("--show_runs", dest="show_runs", help="Show runs", required=False, default=False, action='store_true')
51 |     args = parser.parse_args()
52 |     dump(args.experiment_id, args.artifact_max_level, args.show_runs)
--------------------------------------------------------------------------------
/dump/dump_run.py:
--------------------------------------------------------------------------------
1 | 
2 | """
3 | Recursively dumps all information about a run, including params, metrics, tags and artifacts.
4 | """ 5 | 6 | from __future__ import print_function 7 | from argparse import ArgumentParser 8 | import mlflow 9 | from dump_utils import * 10 | 11 | print("MLflow Version:", mlflow.version.VERSION) 12 | 13 | def get_runs(client, infos, artifact_max_level): 14 | for info in infos: 15 | run = client.get_run(info.run_uuid) 16 | dump_run(run) 17 | dump_artifacts(client, info.run_uuid,"",INDENT_INC,artifact_max_level) 18 | 19 | def dump(run_id, artifact_max_level): 20 | print("Options:") 21 | print(" run_id:",run_id) 22 | print(" artifact_max_level:",artifact_max_level) 23 | client = mlflow.tracking.MlflowClient() 24 | run = client.get_run(run_id) 25 | dump_run(run) 26 | dump_artifacts(client, run_id,"",INDENT_INC, artifact_max_level) 27 | 28 | if __name__ == "__main__": 29 | parser = ArgumentParser() 30 | parser.add_argument("--run_id", dest="run_id", help="Run ID", required=True) 31 | parser.add_argument("--artifact_max_level", dest="artifact_max_level", help="Number of artifact levels to recurse", required=False, default=1, type=int) 32 | args = parser.parse_args() 33 | dump(args.run_id, args.artifact_max_level) 34 | -------------------------------------------------------------------------------- /dump/dump_utils.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Dump utilities. 4 | """ 5 | 6 | from __future__ import print_function 7 | import time 8 | 9 | INDENT_INC = " " 10 | MAX_LEVEL = 1 11 | 12 | def dump_run(run): 13 | print("Run {}".format(run.info.run_uuid)) 14 | for k,v in run.info.__dict__.items(): print(" {}: {}".format(k[1:],v)) 15 | print(" Params:") 16 | for e in run.data.params: 17 | print(" {}: {}".format(e.key,e.value)) 18 | print(" Metrics:") 19 | for e in run.data.metrics: 20 | sdt = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(e.timestamp/1000)) 21 | print(" {}: {} - timestamp: {} {}".format(e.key,e.value,e.timestamp,sdt)) 22 | print(" Tags:") 23 | for e in run.data.tags: 24 | print(" {}: {}".format(e.key,e.value)) 25 | 26 | def dump_run_info(info): 27 | print("Run run_uuid {}".format(info.run_uuid)) 28 | for k,v in info.__dict__.items(): print(" {}: {}".format(k,v)) 29 | 30 | def dump_artifact(art,indent="",level=0): 31 | print("{}Artifact - level {}:".format(indent,level)) 32 | for k,v in art.__dict__.items(): print(" {}{}: {}".format(indent,k[1:],v)) 33 | 34 | def _dump_artifacts(client, run_id, path, indent, level, max_level): 35 | level += 1 36 | if level > max_level: return 37 | artifacts = client.list_artifacts(run_id,path) 38 | for art in artifacts: 39 | dump_artifact(art,indent+INDENT_INC,level) 40 | if art.is_dir: 41 | _dump_artifacts(client, run_id, art.path, indent+INDENT_INC,level,max_level) 42 | 43 | def dump_artifacts(client, run_id, path="", indent="", max_level=MAX_LEVEL): 44 | print("{}Artifacts:".format(indent)) 45 | _dump_artifacts(client, run_id, path, indent, 0, max_level) 46 | 47 | -------------------------------------------------------------------------------- /dump/experiment.txt: -------------------------------------------------------------------------------- 1 | MLflow Version: 0.9.1 2 | Options: 3 | exp_id_or_name: 2 4 | artifact_max_level: 5 5 | show_runs: True 6 | experiment_id: 2 7 | Experiment Details: 8 | experiment_id: 2 9 | name: sklearn 10 | artifact_location: /opt/mlflow-server/mlruns/2 11 | lifecycle_stage: active 12 | #runs: 1 13 | Run 65b9e6f331da452a9ec0ff57fbfc26a1 14 | run_uuid: 65b9e6f331da452a9ec0ff57fbfc26a1 15 | experiment_id: 2 16 | name: 17 | source_type: 4 18 | 
source_name: train.py 19 | entry_point_name: 20 | user_id: andre 21 | status: 3 22 | start_time: 1556210939387 23 | end_time: 1556210940662 24 | source_version: 4d756b3812204510ffea2ff8d5af15e8e9cbe06e 25 | lifecycle_stage: active 26 | artifact_uri: /opt/mlflow-server/mlruns/2/65b9e6f331da452a9ec0ff57fbfc26a1/artifacts 27 | Params: 28 | alpha: 1.0 29 | l1_ratio: 0.5 30 | Metrics: 31 | mae: 0.6481010264813273 - timestamp: 1556210939 1970-01-19 00:16:50 32 | r2: 0.04618821720476163 - timestamp: 1556210939 1970-01-19 00:16:50 33 | rmse: 0.8593526200510287 - timestamp: 1556210939 1970-01-19 00:16:50 34 | Tags: 35 | data_path: data/wine-quality-white.csv 36 | exp_id: 2 37 | exp_name: sklearn 38 | mlflow.source.git.commit: 4d756b3812204510ffea2ff8d5af15e8e9cbe06e 39 | mlflow.source.name: train.py 40 | mlflow.source.type: LOCAL 41 | platform: Darwin 42 | run_origin: demo/egg/train.sh 43 | Artifacts: 44 | Artifact - level 1: 45 | path: model 46 | is_dir: True 47 | bytes: None 48 | Artifact - level 2: 49 | path: model/MLmodel 50 | is_dir: False 51 | bytes: 344 52 | Artifact - level 2: 53 | path: model/conda.yaml 54 | is_dir: False 55 | bytes: 120 56 | Artifact - level 2: 57 | path: model/model.pkl 58 | is_dir: False 59 | bytes: 673 60 | Artifact - level 1: 61 | path: wine_ElasticNet-paths.png 62 | is_dir: False 63 | bytes: 27773 64 | #runs: 1 65 | -------------------------------------------------------------------------------- /dump/run.txt: -------------------------------------------------------------------------------- 1 | MLflow Version: 0.9.1 2 | Options: 3 | run_id: 65b9e6f331da452a9ec0ff57fbfc26a1 4 | artifact_max_level: 5 5 | Run 65b9e6f331da452a9ec0ff57fbfc26a1 6 | run_uuid: 65b9e6f331da452a9ec0ff57fbfc26a1 7 | experiment_id: 2 8 | name: 9 | source_type: 4 10 | source_name: train.py 11 | entry_point_name: 12 | user_id: andre 13 | status: 3 14 | start_time: 1556210939387 15 | end_time: 1556210940662 16 | source_version: 4d756b3812204510ffea2ff8d5af15e8e9cbe06e 17 | lifecycle_stage: active 18 | artifact_uri: /opt/mlflow-server/mlruns/2/65b9e6f331da452a9ec0ff57fbfc26a1/artifacts 19 | Params: 20 | alpha: 1.0 21 | l1_ratio: 0.5 22 | Metrics: 23 | mae: 0.6481010264813273 - timestamp: 1556210939 1970-01-19 00:16:50 24 | r2: 0.04618821720476163 - timestamp: 1556210939 1970-01-19 00:16:50 25 | rmse: 0.8593526200510287 - timestamp: 1556210939 1970-01-19 00:16:50 26 | Tags: 27 | data_path: data/wine-quality-white.csv 28 | exp_id: 2 29 | exp_name: sklearn 30 | mlflow.source.git.commit: 4d756b3812204510ffea2ff8d5af15e8e9cbe06e 31 | mlflow.source.name: train.py 32 | mlflow.source.type: LOCAL 33 | platform: Darwin 34 | run_origin: demo/egg/train.sh 35 | Artifacts: 36 | Artifact - level 1: 37 | path: model 38 | is_dir: True 39 | bytes: None 40 | Artifact - level 2: 41 | path: model/MLmodel 42 | is_dir: False 43 | bytes: 344 44 | Artifact - level 2: 45 | path: model/conda.yaml 46 | is_dir: False 47 | bytes: 120 48 | Artifact - level 2: 49 | path: model/model.pkl 50 | is_dir: False 51 | bytes: 673 52 | Artifact - level 1: 53 | path: wine_ElasticNet-paths.png 54 | is_dir: False 55 | bytes: 27773 56 | -------------------------------------------------------------------------------- /hello_world/MLproject: -------------------------------------------------------------------------------- 1 | name: mlflow_demo_hello_world 2 | 3 | conda_env: conda.yaml 4 | 5 | entry_points: 6 | main: 7 | parameters: 8 | alpha: {type: string, default: "0.01" } 9 | run_origin: {type: string, default: "None" } 10 | log_artifact: {type: 
string, default: "False" }
11 |     command: "python hello_world.py
12 |               --alpha {alpha}
13 |               --run_origin {run_origin}
14 |               --log_artifact {log_artifact}"
--------------------------------------------------------------------------------
/hello_world/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # mlflow-spark-summit-2019 - hello_world
3 | 
4 | Simple Hello World that demonstrates the different ways to run an MLflow experiment.
5 | 
6 | For details see [MLflow documentation - Running Projects](https://mlflow.org/docs/latest/projects.html#running-projects).
7 | 
8 | Synopsis of [hello_world.py](hello_world.py):
9 | * Creates the experiment hello_world if it does not exist.
10 | * Logs parameters, metrics and tags.
11 | * Batch logging of parameters, metrics and tags.
12 | * No ML training.
13 | * Optionally writes an artifact.
14 | 
15 | The different ways to run an experiment:
16 | * Unmanaged without mlflow
17 |   * Command-line python
18 |   * Jupyter notebook
19 | * Using mlflow run with [MLproject](MLproject)
20 |   * mlflow run local
21 |   * mlflow run git
22 |   * mlflow run remote
23 | 
24 | ## Setup
25 | 
26 | **External tracking server**
27 | ```
28 | export MLFLOW_TRACKING_URI=http://localhost:5000
29 | ```
30 | 
31 | **Databricks managed tracking server**
32 | ```
33 | export MLFLOW_TRACKING_URI=databricks
34 | ```
35 | The token and tracking server URL will be picked up from the default profile of your Databricks CLI ~/.databrickscfg file.
36 | 
37 | ## Running
38 | 
39 | ### Unmanaged without mlflow run
40 | #### Command-line python
41 | ```
42 | python hello_world.py
43 | ```
44 | 
45 | #### Jupyter notebook
46 | See [hello_world.ipynb](hello_world.ipynb).
47 | ```
48 | export MLFLOW_TRACKING_URI=http://localhost:5000
49 | jupyter notebook
50 | ```
51 | 
52 | ### Using mlflow run
53 | 
54 | #### mlflow run local
55 | ```
56 | mlflow run . -Palpha=.01 -Prun_origin=LocalRun -Plog_artifact=True
57 | ```
58 | You can also specify an experiment ID:
59 | ```
60 | mlflow run . --experiment-id=2019 -Palpha=.01 -Prun_origin=LocalRun -Plog_artifact=True
61 | ```
62 | 
63 | #### mlflow run git
64 | ```
65 | mlflow run https://github.com/amesar/mlflow-fun.git#examples/hello_world \
66 |   --experiment-id=2019 \
67 |   -Palpha=100 -Prun_origin=GitRun -Plog_artifact=True
68 | ```
69 | #### mlflow run Databricks remote
70 | Run against Databricks. See [Remote Execution on Databricks](https://mlflow.org/docs/latest/projects.html#remote-execution-on-databricks) and [cluster.json](cluster.json).
71 | ``` 72 | mlflow run https://github.com/amesar/mlflow-fun.git#examples/hello_world \ 73 | --experiment-id=2019 \ 74 | -Palpha=100 -Prun_origin=RemoteRun -Plog_artifact=True \ 75 | -m databricks --cluster-spec cluster.json 76 | ``` 77 | -------------------------------------------------------------------------------- /hello_world/cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "spark_version": "5.3.x-scala2.11", 3 | "driver_node_type_id": "i3.xlarge", 4 | "node_type_id": "i3.xlarge", 5 | "num_workers": 1, 6 | "spark_env_vars": { 7 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 8 | }, 9 | "libraries": [ 10 | { "pypi": { "package": "mlflow" }} 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /hello_world/conda.yaml: -------------------------------------------------------------------------------- 1 | name: mlflow-demo-hello-world 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.6 6 | - pip: 7 | - mlflow==0.9.1 8 | -------------------------------------------------------------------------------- /hello_world/hello_world.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from __future__ import print_function\n", 10 | "import mlflow\n", 11 | "from mlflow.entities import Param,Metric,RunTag" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 4, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "MLflow Version: 0.9.1\n", 24 | "Tracking URI: http://localhost:5000\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "print(\"MLflow Version:\", mlflow.version.VERSION)\n", 30 | "mlflow.set_tracking_uri(\"http://localhost:5000\")\n", 31 | "print(\"Tracking URI:\", mlflow.tracking.get_tracking_uri())" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 5, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "experiment_name: hello_world\n", 44 | "experiment_id: 1\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "experiment_name = \"hello_world\"\n", 50 | "print(\"experiment_name:\",experiment_name)\n", 51 | "mlflow.set_experiment(experiment_name)\n", 52 | "\n", 53 | "client = mlflow.tracking.MlflowClient()\n", 54 | "experiment_id = client.get_experiment_by_name(experiment_name).experiment_id\n", 55 | "print(\"experiment_id:\",experiment_id)\n", 56 | "\n", 57 | "import time\n", 58 | "now = int(time.time()+.5)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 6, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "def run(alpha, run_origin, log_artifact):\n", 68 | " with mlflow.start_run(run_name=run_origin) as run:\n", 69 | " print(\"runId:\",run.info.run_uuid)\n", 70 | " print(\"artifact_uri:\",mlflow.get_artifact_uri())\n", 71 | " print(\"alpha:\",alpha)\n", 72 | " print(\"log_artifact:\",log_artifact)\n", 73 | " print(\"run_origin:\",run_origin)\n", 74 | " mlflow.log_param(\"alpha\", alpha)\n", 75 | " mlflow.log_metric(\"rmse\", 0.789)\n", 76 | " mlflow.set_tag(\"run_origin\", run_origin)\n", 77 | " mlflow.set_tag(\"log_artifact\", log_artifact)\n", 78 | " if log_artifact:\n", 79 | " with open(\"info.txt\", \"w\") as f:\n", 80 | " f.write(\"Hi artifact\")\n", 81 | " 
mlflow.log_artifact(\"info.txt\")\n", 82 | "\n", 83 | " params = [ Param(\"p1\",\"0.1\"), Param(\"p2\",\"0.2\") ]\n", 84 | " metrics = [ Metric(\"m1\",0.1,now), Metric(\"m2\",0.2,now) ]\n", 85 | " tags = [ RunTag(\"t1\",\"hi1\"), RunTag(\"t2\",\"hi2\") ]\n", 86 | " client.log_batch(run.info.run_uuid, metrics, params, tags)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 7, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "runId: 512955d89c6a40c09bd45429be8206aa\n", 99 | "artifact_uri: /Users/ander/work/mlflow/local_mlrun/mlruns/1/512955d89c6a40c09bd45429be8206aa/artifacts\n", 100 | "alpha: 0.1\n", 101 | "log_artifact: True\n", 102 | "run_origin: jupyter\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "run(\"0.1\", \"jupyter\", True)" 108 | ] 109 | } 110 | ], 111 | "metadata": { 112 | "kernelspec": { 113 | "display_name": "Python 3", 114 | "language": "python", 115 | "name": "python3" 116 | }, 117 | "language_info": { 118 | "codemirror_mode": { 119 | "name": "ipython", 120 | "version": 3 121 | }, 122 | "file_extension": ".py", 123 | "mimetype": "text/x-python", 124 | "name": "python", 125 | "nbconvert_exporter": "python", 126 | "pygments_lexer": "ipython3", 127 | "version": "3.6.8" 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 2 132 | } 133 | -------------------------------------------------------------------------------- /hello_world/hello_world.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | import time 4 | import mlflow 5 | from mlflow.entities import Param,Metric,RunTag 6 | from argparse import ArgumentParser 7 | 8 | print("MLflow Version:", mlflow.version.VERSION) 9 | print("Tracking URI:", mlflow.tracking.get_tracking_uri()) 10 | 11 | experiment_name = "hello_world" 12 | print("experiment_name:",experiment_name) 13 | mlflow.set_experiment(experiment_name) 14 | 15 | client = mlflow.tracking.MlflowClient() 16 | experiment_id = client.get_experiment_by_name(experiment_name).experiment_id 17 | print("experiment_id:",experiment_id) 18 | 19 | now = int(time.time()+.5) 20 | 21 | def run(alpha, run_origin, log_artifact): 22 | with mlflow.start_run(run_name=run_origin) as run: 23 | print("runId:",run.info.run_uuid) 24 | print("artifact_uri:",mlflow.get_artifact_uri()) 25 | print("alpha:",alpha) 26 | print("log_artifact:",log_artifact) 27 | print("run_origin:",run_origin) 28 | mlflow.log_param("alpha", alpha) 29 | mlflow.log_metric("rmse", 0.789) 30 | mlflow.set_tag("run_origin", run_origin) 31 | mlflow.set_tag("log_artifact", log_artifact) 32 | if log_artifact: 33 | with open("info.txt", "w") as f: 34 | f.write("Hi artifact") 35 | mlflow.log_artifact("info.txt") 36 | 37 | params = [ Param("p1","0.1"), Param("p2","0.2") ] 38 | metrics = [ Metric("m1",0.1,now), Metric("m2",0.2,now) ] 39 | tags = [ RunTag("t1","hi1"), RunTag("t2","hi2") ] 40 | client.log_batch(run.info.run_uuid, metrics, params, tags) 41 | 42 | import sys 43 | if __name__ == "__main__": 44 | parser = ArgumentParser() 45 | parser.add_argument("--alpha", dest="alpha", help="alpha", default=0.1, type=float ) 46 | parser.add_argument("--run_origin", dest="run_origin", help="run_origin", default="") 47 | parser.add_argument("--log_artifact", dest="log_artifact", help="Log artifact", type=str, default="False") 48 | args = parser.parse_args() 49 | run(args.alpha,args.run_origin,args.log_artifact=="True") 50 | 
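A quick way to sanity-check a finished run is to read it back with the same `mlflow.tracking` client API that hello_world.py uses. A minimal sketch (not part of the repo; substitute a run ID printed by hello_world.py):
```
import mlflow
client = mlflow.tracking.MlflowClient()
run = client.get_run("512955d89c6a40c09bd45429be8206aa")           # a runId printed by hello_world.py
print([(p.key, p.value) for p in run.data.params])                 # alpha plus batch params p1, p2
print([(m.key, m.value) for m in run.data.metrics])                # rmse plus batch metrics m1, m2
print([a.path for a in client.list_artifacts(run.info.run_uuid)])  # info.txt when log_artifact is True
```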
-------------------------------------------------------------------------------- /hello_world/playbook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## hello_world playbook" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "os.environ[\"MLFLOW_TRACKING_URI\"] = \"http://localhost:5000\"" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "MLflow Version: 0.9.1\n", 30 | "Tracking URI: http://localhost:5000\n", 31 | "experiment_name: hello_world\n", 32 | "experiment_id: 1\n", 33 | "runId: f12c8d9e6d56450280943ec814cdb32e\n", 34 | "artifact_uri: /Users/ander/work/mlflow/local_mlrun/mlruns/1/f12c8d9e6d56450280943ec814cdb32e/artifacts\n", 35 | "alpha: 0.1\n", 36 | "log_artifact: False\n", 37 | "run_origin: \n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "! python hello_world.py" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "MLflow Version: 0.9.1\n", 55 | "Tracking URI: http://localhost:5000\n", 56 | "experiment_name: hello_world\n", 57 | "experiment_id: 1\n", 58 | "runId: 34b98f61e9f94ada81d6b8be892c8f65\n", 59 | "artifact_uri: /Users/ander/work/mlflow/local_mlrun/mlruns/1/34b98f61e9f94ada81d6b8be892c8f65/artifacts\n", 60 | "alpha: 0.1\n", 61 | "log_artifact: True\n", 62 | "run_origin: \n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "! python hello_world.py --log_artifact True" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "2019/04/25 13:42:50 INFO mlflow.projects: === Created directory /var/folders/_9/tbkxzw0116v2cp_zq4f1_1cm0000gp/T/tmpdihQqS for downloading remote URIs passed to arguments of type 'path' ===\n", 80 | "2019/04/25 13:42:50 INFO mlflow.projects: === Running command 'source activate mlflow-aacce47b0cb7984f4aead56265692d3969388f30 && python hello_world.py --alpha .01 --run_origin LocalRun --log_artifact True' in run with ID '91bdea4b8e7f47379688d98bfeb424a0' === \n", 81 | "MLflow Version: 0.9.1\n", 82 | "Tracking URI: http://localhost:5000\n", 83 | "experiment_name: hello_world\n", 84 | "experiment_id: 1\n", 85 | "runId: 91bdea4b8e7f47379688d98bfeb424a0\n", 86 | "artifact_uri: /Users/ander/work/mlflow/local_mlrun/mlruns/0/91bdea4b8e7f47379688d98bfeb424a0/artifacts\n", 87 | "alpha: 0.01\n", 88 | "log_artifact: True\n", 89 | "run_origin: LocalRun\n", 90 | "2019/04/25 13:42:51 INFO mlflow.projects: === Run (ID '91bdea4b8e7f47379688d98bfeb424a0') succeeded ===\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "! mlflow run . 
-Palpha=.01 -Prun_origin=LocalRun -Plog_artifact=True"
96 |    ]
97 |   }
98 |  ],
99 |  "metadata": {
100 |   "kernelspec": {
101 |    "display_name": "Python 2",
102 |    "language": "python",
103 |    "name": "python2"
104 |   },
105 |   "language_info": {
106 |    "codemirror_mode": {
107 |     "name": "ipython",
108 |     "version": 2
109 |    },
110 |    "file_extension": ".py",
111 |    "mimetype": "text/x-python",
112 |    "name": "python",
113 |    "nbconvert_exporter": "python",
114 |    "pygments_lexer": "ipython2",
115 |    "version": "2.7.14"
116 |   }
117 |  },
118 |  "nbformat": 4,
119 |  "nbformat_minor": 2
120 | }
--------------------------------------------------------------------------------
/pyspark/MLproject:
--------------------------------------------------------------------------------
1 | name: mlflow_demo_pyspark
2 | 
3 | conda_env: conda.yaml
4 | 
5 | entry_points:
6 |   main:
7 |     parameters:
8 |       max_depth: {type: int, default: 2 }
9 |       max_bins: {type: int, default: 32 }
10 |     command: "spark-submit --master local[2] train.py --max_depth {max_depth} --max_bins {max_bins}"
--------------------------------------------------------------------------------
/pyspark/README.md:
--------------------------------------------------------------------------------
1 | # mlflow-spark-summit-2019 - pyspark
2 | 
3 | ## Overview
4 | 
5 | * PySpark Decision Tree Classification example
6 | * Source: [train.py](train.py) and [predict.py](predict.py)
7 | * Experiment name: pyspark
8 | 
9 | ## Train
10 | 
11 | ### Unmanaged without mlflow run
12 | 
13 | To run with the standard main function:
14 | ```
15 | spark-submit --master local[2] train.py --max_depth 16 --max_bins 32
16 | ```
17 | 
18 | ### Using mlflow run
19 | 
20 | These runs use the [MLproject](MLproject) file. For more details see [MLflow documentation - Running Projects](https://mlflow.org/docs/latest/projects.html#running-projects).
21 | 
22 | Note that `mlflow run` ignores the `set_experiment()` function, so you must specify the experiment with the `--experiment-id` argument.
23 | 
24 | **mlflow run local**
25 | ```
26 | mlflow run . -P max_depth=3 -P max_bins=24 --experiment-id=2019
27 | ```
28 | 
29 | **mlflow run github**
30 | ```
31 | mlflow run https://github.com/amesar/mlflow-fun.git#examples/pyspark \
32 |   -P max_depth=3 -P max_bins=24 \
33 |   --experiment-id=2019
34 | ```
35 | 
36 | ## Predict
37 | 
38 | See [predict.py](predict.py).
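At its core, predict.py reloads the logged pipeline by run ID with the 0.9.x `mlflow.spark` API and applies it to the LIBSVM data. The essential calls, taken from the script (`spark` and `run_id` come from its setup code):
```
import mlflow.spark as mlflow_spark
data = spark.read.format("libsvm").load("../data/sample_libsvm_data.txt")
model = mlflow_spark.load_model("spark-model", run_id=run_id)  # run_id of a prior training run
predictions = model.transform(data)
```
A sample invocation and its output: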
39 | 40 | ``` 41 | run_id=7b951173284249f7a3b27746450ac7b0 42 | spark-submit --master local[2] predict.py $run_id 43 | ``` 44 | 45 | ``` 46 | Predictions 47 | root 48 | |-- label: double (nullable = true) 49 | |-- features: vector (nullable = true) 50 | |-- indexedLabel: double (nullable = false) 51 | |-- indexedFeatures: vector (nullable = true) 52 | |-- rawPrediction: vector (nullable = true) 53 | |-- probability: vector (nullable = true) 54 | |-- prediction: double (nullable = false) 55 | 56 | +----------+------------+-----------+ 57 | |prediction|indexedLabel|probability| 58 | +----------+------------+-----------+ 59 | |0.0 |1.0 |[1.0,0.0] | 60 | |1.0 |0.0 |[0.0,1.0] | 61 | |1.0 |0.0 |[0.0,1.0] | 62 | +----------+------------+-----------+ 63 | ``` 64 | 65 | -------------------------------------------------------------------------------- /pyspark/conda.yaml: -------------------------------------------------------------------------------- 1 | name: mlflow-demo-pyspark 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.6 6 | - pip: 7 | - mlflow==0.9.1 8 | -------------------------------------------------------------------------------- /pyspark/playbook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## pyspark playbook" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 23, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "mlflow, version 0.9.1\r\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "! mlflow --version" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "### Train" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 24, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "os.environ[\"MLFLOW_TRACKING_URI\"] = \"http://localhost:5000\"" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 26, 47 | "metadata": { 48 | "scrolled": false 49 | }, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "19/04/24 21:07:26 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 56 | "19/04/24 21:07:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 57 | "19/04/24 21:07:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n", 58 | "MLflow Version: 0.9.1\n", 59 | "Tracking URI: http://localhost:5000\n", 60 | "experiment_name: pyspark\n", 61 | "MLflow Version: 0.9.1\n", 62 | "experiment_id: 3\n", 63 | "run_id: 71fc995eb9154fe7bf360f1686456ea6\n", 64 | "experiment_id: 3\n", 65 | "Parameters: max_depth: 16 max_bins: 32\n", 66 | "+----------+------------+--------------------+\n", 67 | "|prediction|indexedLabel| features|\n", 68 | "+----------+------------+--------------------+\n", 69 | "| 1.0| 1.0|(692,[98,99,100,1...|\n", 70 | "| 1.0| 1.0|(692,[123,124,125...|\n", 71 | "| 1.0| 1.0|(692,[124,125,126...|\n", 72 | "| 1.0| 1.0|(692,[124,125,126...|\n", 73 | "| 1.0| 1.0|(692,[126,127,128...|\n", 74 | "+----------+------------+--------------------+\n", 75 | "only showing top 5 rows\n", 76 | "\n", 77 | "Test Error = 0.033333333333333326 \n", 78 | "DecisionTreeClassificationModel (uid=DecisionTreeClassifier_2cea66fefc86) of depth 1 with 3 nodes\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "! spark-submit --master local[2] train.py --max_depth 16 --max_bins 32" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 27, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "2019/04/24 21:07:43 INFO mlflow.projects: === Fetching project from https://github.com/amesar/mlflow-fun.git#examples/pyspark into /var/folders/_9/tbkxzw0116v2cp_zq4f1_1cm0000gp/T/tmpg7fq0zy5 ===\n", 96 | "2019/04/24 21:07:47 INFO mlflow.projects: === Created directory /var/folders/_9/tbkxzw0116v2cp_zq4f1_1cm0000gp/T/tmpji7c7udh for downloading remote URIs passed to arguments of type 'path' ===\n", 97 | "2019/04/24 21:07:47 INFO mlflow.projects: === Running command 'source activate mlflow-95125872403f1ccbea3f04eea25e874f26a00372 && spark-submit --master local[2] train.py --max_depth 3 --max_bins 24' in run with ID '6aed788a20ea4fae90c423898fbfad58' === \n", 98 | "19/04/24 21:07:48 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 99 | "19/04/24 21:07:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 100 | "19/04/24 21:07:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", 101 | "/Users/ander/miniconda3/envs/mlflow-95125872403f1ccbea3f04eea25e874f26a00372/lib/python3.6/site-packages/mlflow/utils/environment.py:26: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", 102 | " env = yaml.load(_conda_header)\n", 103 | "MLflow Version: 0.8.2\n", 104 | "Tracking URI: http://localhost:5000\n", 105 | "experiment_name: py/spark/DecisionTree\n", 106 | "INFO: 'py/spark/DecisionTree' does not exist. 
Creating a new experiment\n", 107 | "MLflow Version: 0.8.2\n", 108 | "experiment_id: 6\n", 109 | "run_id: 6aed788a20ea4fae90c423898fbfad58\n", 110 | "experiment_id: 2\n", 111 | "Parameters: max_depth: 3 max_bins: 24\n", 112 | "+----------+------------+--------------------+\n", 113 | "|prediction|indexedLabel| features|\n", 114 | "+----------+------------+--------------------+\n", 115 | "| 1.0| 1.0|(692,[121,122,123...|\n", 116 | "| 1.0| 1.0|(692,[123,124,125...|\n", 117 | "| 1.0| 1.0|(692,[124,125,126...|\n", 118 | "| 1.0| 1.0|(692,[126,127,128...|\n", 119 | "| 1.0| 1.0|(692,[126,127,128...|\n", 120 | "+----------+------------+--------------------+\n", 121 | "only showing top 5 rows\n", 122 | "\n", 123 | "Test Error = 0.0 \n", 124 | "DecisionTreeClassificationModel (uid=DecisionTreeClassifier_a089dee78712) of depth 2 with 5 nodes\n", 125 | "2019/04/24 21:08:04 INFO mlflow.projects: === Run (ID '6aed788a20ea4fae90c423898fbfad58') succeeded ===\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "! mlflow run https://github.com/amesar/mlflow-fun.git#examples/pyspark \\\n", 131 | " -P max_depth=3 -P max_bins=24 \\\n", 132 | " --experiment-id=2 \\" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 28, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "2019/04/24 21:08:06 INFO mlflow.projects: === Fetching project from https://github.com/amesar/mlflow-fun.git#examples/pyspark into /var/folders/_9/tbkxzw0116v2cp_zq4f1_1cm0000gp/T/tmpmg4jv0rv ===\n", 145 | "2019/04/24 21:08:10 INFO mlflow.projects: === Created directory /var/folders/_9/tbkxzw0116v2cp_zq4f1_1cm0000gp/T/tmpi8i4vntc for downloading remote URIs passed to arguments of type 'path' ===\n", 146 | "2019/04/24 21:08:10 INFO mlflow.projects: === Running command 'source activate mlflow-95125872403f1ccbea3f04eea25e874f26a00372 && spark-submit --master local[2] train.py --max_depth 3 --max_bins 24' in run with ID '980be9f445184ff4854701b0dfd0889d' === \n", 147 | "19/04/24 21:08:11 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 148 | "19/04/24 21:08:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 149 | "19/04/24 21:08:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", 150 | "/Users/ander/miniconda3/envs/mlflow-95125872403f1ccbea3f04eea25e874f26a00372/lib/python3.6/site-packages/mlflow/utils/environment.py:26: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. 
Please read https://msg.pyyaml.org/load for full details.\n", 151 | " env = yaml.load(_conda_header)\n", 152 | "MLflow Version: 0.8.2\n", 153 | "Tracking URI: http://localhost:5000\n", 154 | "experiment_name: py/spark/DecisionTree\n", 155 | "MLflow Version: 0.8.2\n", 156 | "experiment_id: 6\n", 157 | "run_id: 980be9f445184ff4854701b0dfd0889d\n", 158 | "experiment_id: 2\n", 159 | "Parameters: max_depth: 3 max_bins: 24\n", 160 | "+----------+------------+--------------------+\n", 161 | "|prediction|indexedLabel| features|\n", 162 | "+----------+------------+--------------------+\n", 163 | "| 1.0| 1.0|(692,[98,99,100,1...|\n", 164 | "| 1.0| 1.0|(692,[100,101,102...|\n", 165 | "| 1.0| 1.0|(692,[121,122,123...|\n", 166 | "| 1.0| 1.0|(692,[122,123,124...|\n", 167 | "| 1.0| 1.0|(692,[124,125,126...|\n", 168 | "+----------+------------+--------------------+\n", 169 | "only showing top 5 rows\n", 170 | "\n", 171 | "Test Error = 0.02941176470588236 \n", 172 | "DecisionTreeClassificationModel (uid=DecisionTreeClassifier_a5830562b284) of depth 2 with 5 nodes\n", 173 | "2019/04/24 21:08:23 INFO mlflow.projects: === Run (ID '980be9f445184ff4854701b0dfd0889d') succeeded ===\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "! mlflow run https://github.com/amesar/mlflow-fun.git#examples/pyspark \\\n", 179 | " -P max_depth=3 -P max_bins=24 \\\n", 180 | " --experiment-id=2" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "### Predict" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 3, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "19/04/25 14:01:33 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 200 | "19/04/25 14:01:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 201 | "19/04/25 14:01:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", 202 | "MLflow Version: 0.9.1\n", 203 | "Tracking URI: /Users/ander/git/andre/clean/mlflow-spark-summit-2019/pyspark/mlruns\n", 204 | "run_id: 6ca69795529e491983d217181ab2dae9\n", 205 | "data_path: ../data/sample_libsvm_data.txt\n", 206 | "Traceback (most recent call last):\n", 207 | " File \"/Users/ander/git/andre/clean/mlflow-spark-summit-2019/pyspark/predict.py\", line 20, in \n", 208 | " model = mlflow_spark.load_model(\"spark-model\", run_id=run_id)\n", 209 | " File \"/Users/ander/venvs/mlflow-venv/lib/python2.7/site-packages/mlflow/spark.py\", line 348, in load_model\n", 210 | " path = mlflow.tracking.utils._get_model_log_dir(model_name=path, run_id=run_id)\n", 211 | " File \"/Users/ander/venvs/mlflow-venv/lib/python2.7/site-packages/mlflow/tracking/utils.py\", line 279, in _get_model_log_dir\n", 212 | " run = store.get_run(run_id)\n", 213 | " File \"/Users/ander/venvs/mlflow-venv/lib/python2.7/site-packages/mlflow/store/file_store.py\", line 368, in get_run\n", 214 | " run_info = self._get_run_info(run_uuid)\n", 215 | " File \"/Users/ander/venvs/mlflow-venv/lib/python2.7/site-packages/mlflow/store/file_store.py\", line 384, in _get_run_info\n", 216 | " databricks_pb2.RESOURCE_DOES_NOT_EXIST)\n", 217 | "mlflow.exceptions.MlflowException: Run '6ca69795529e491983d217181ab2dae9' not found\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "! 
spark-submit --master local[2] predict.py 6ca69795529e491983d217181ab2dae9" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [] 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": "Python 2", 236 | "language": "python", 237 | "name": "python2" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 2 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython2", 249 | "version": "2.7.14" 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 2 254 | } 255 | -------------------------------------------------------------------------------- /pyspark/predict.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | import sys 4 | import mlflow 5 | import mlflow.spark as mlflow_spark 6 | from pyspark.sql import SparkSession 7 | 8 | print("MLflow Version:", mlflow.version.VERSION) 9 | print("Tracking URI:", mlflow.tracking.get_tracking_uri()) 10 | 11 | if __name__ == "__main__": 12 | run_id = sys.argv[1] 13 | print("run_id:",run_id) 14 | spark = SparkSession.builder.appName("Predict").getOrCreate() 15 | 16 | data_path = "../data/sample_libsvm_data.txt" 17 | print("data_path:",data_path) 18 | data = spark.read.format("libsvm").load(data_path) 19 | 20 | model = mlflow_spark.load_model("spark-model", run_id=run_id) 21 | predictions = model.transform(data) 22 | 23 | print("Prediction Dataframe") 24 | predictions.printSchema() 25 | 26 | print("Filtered Prediction Dataframe") 27 | df = predictions.select("prediction", "indexedLabel","probability").filter("prediction <> indexedLabel") 28 | df.printSchema() 29 | df.show(5,False) 30 | -------------------------------------------------------------------------------- /pyspark/train.py: -------------------------------------------------------------------------------- 1 | """ 2 | PySpark Decision Tree Classification Example. 3 | """ 4 | from __future__ import print_function 5 | 6 | import sys,os 7 | from argparse import ArgumentParser 8 | from pyspark.ml import Pipeline 9 | from pyspark.ml.classification import DecisionTreeClassifier 10 | from pyspark.ml.feature import StringIndexer, VectorIndexer 11 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator 12 | from pyspark.sql import SparkSession 13 | import mlflow 14 | from mlflow import version 15 | from mlflow import spark as mlflow_spark 16 | 17 | print("MLflow Version:", mlflow.version.VERSION) 18 | print("Tracking URI:", mlflow.tracking.get_tracking_uri()) 19 | experiment_name = "pyspark" 20 | print("experiment_name:",experiment_name) 21 | mlflow.set_experiment(experiment_name) 22 | 23 | def train(max_depth, max_bins): 24 | print("Parameters: max_depth: {} max_bins: {}".format(max_depth,max_bins)) 25 | spark = SparkSession.builder.appName("DecisionTreeClassificationExample").getOrCreate() 26 | 27 | # Load the data stored in LIBSVM format as a DataFrame. 28 | data_path = "../data/sample_libsvm_data.txt" 29 | data = spark.read.format("libsvm").load(data_path) 30 | 31 | # Index labels, adding metadata to the label column. 32 | # Fit on whole dataset to include all labels in index. 33 | labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data) 34 | 35 | # Automatically identify categorical features, and index them. 
36 |     # We specify maxCategories so features with > 4 distinct values are treated as continuous.
37 |     featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)
38 | 
39 |     # Split the data into training and test sets.
40 |     (trainingData, testData) = data.randomSplit([0.7, 0.3])
41 | 
42 |     # Train a DecisionTree model.
43 |     mlflow.log_param("max_depth", max_depth)
44 |     mlflow.log_param("max_bins", max_bins)
45 |     dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxDepth=max_depth, maxBins=max_bins)
46 | 
47 |     # Chain indexers and tree in a Pipeline.
48 |     pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
49 | 
50 |     # Train model. This also runs the indexers.
51 |     model = pipeline.fit(trainingData)
52 | 
53 |     # Make predictions.
54 |     predictions = model.transform(testData)
55 | 
56 |     # Select example rows to display.
57 |     predictions.select("prediction", "indexedLabel", "features").show(5)
58 | 
59 |     # Select (prediction, true label) and compute test error.
60 |     evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
61 |     accuracy = evaluator.evaluate(predictions)
62 |     test_error = 1.0 - accuracy
63 |     print("Test Error = {}".format(test_error))
64 | 
65 |     mlflow.log_metric("accuracy", accuracy)
66 |     mlflow.log_metric("test_error", test_error)
67 | 
68 |     treeModel = model.stages[2]
69 |     print(treeModel)
70 | 
71 |     mlflow_spark.log_model(model, "spark-model")
72 |     #mlflow.mleap.log_model(model, testData, "mleap-model") # TODO: Bombs :(
73 | 
74 |     spark.stop()
75 | 
76 | if __name__ == "__main__":
77 |     parser = ArgumentParser()
78 |     parser.add_argument("--max_depth", dest="max_depth", help="max_depth", default=2, type=int)
79 |     parser.add_argument("--max_bins", dest="max_bins", help="max_bins", default=32, type=int)
80 |     args = parser.parse_args()
81 |     current_file = os.path.basename(__file__)
82 |     print("MLflow Version:", version.VERSION)
83 | 
84 |     client = mlflow.tracking.MlflowClient()
85 |     print("experiment_id:", client.get_experiment_by_name(experiment_name).experiment_id)
86 | 
87 |     with mlflow.start_run(source_name=current_file) as run:
88 |         print("run_id:", run.info.run_uuid)
89 |         print("experiment_id:", run.info.experiment_id)
90 |         train(args.max_depth, args.max_bins)
91 | 
92 | 
--------------------------------------------------------------------------------
/scala_spark/README.md:
--------------------------------------------------------------------------------
1 | # mlflow-spark-summit-2019 - scala_spark
2 | 
3 | Scala examples using the MLflow Java client:
4 | * Hello World - Simple MLflow example with no training.
5 | * Spark ML DecisionTree - saves and predicts Spark ML and MLeap model formats.
6 | 
7 | ## Build
8 | ```
9 | mvn clean package
10 | ```
11 | 
12 | ## Hello World Sample
13 | ### Run
14 | ```
15 | spark-submit --master local[2] \
16 |   --class org.andre.mlflow.examples.hello.HelloWorld \
17 |   target/mlflow-spark-examples-1.0-SNAPSHOT.jar \
18 |   http://localhost:5000
19 | ```
20 | ```
21 | Experiment name: scala_HelloWorld
22 | Experiment ID: 3
23 | Run ID: 81cc7941adae4860899ad5449df52802
24 | ```
25 | 
26 | ### Source
27 | Source snippet from [HelloWorld.scala](src/main/scala/org/andre/mlflow/examples/hello/HelloWorld.scala).
28 | ```
29 | // Create client
30 | val trackingUri = args(0)
31 | val mlflowClient = new MlflowClient(trackingUri)
32 | 
33 | // Create or get existing experiment
34 | val expName = "scala/HelloWorld"
35 | val expId = MLflowUtils.getOrCreateExperimentId(mlflowClient, expName)
36 | println("Experiment name: "+expName)
37 | println("Experiment ID: "+expId)
38 | 
39 | // Create run
40 | val sourceName = getClass().getSimpleName()+".scala"
41 | val runInfo = mlflowClient.createRun(expId, sourceName);
42 | val runId = runInfo.getRunUuid()
43 | 
44 | // Log params and metrics
45 | mlflowClient.logParam(runId, "p1", "hi")
46 | mlflowClient.logMetric(runId, "m1", 0.123F)
47 | 
48 | // Close run
49 | mlflowClient.setTerminated(runId, RunStatus.FINISHED, System.currentTimeMillis())
50 | ```
51 | 
52 | ## Spark ML DecisionTree Sample
53 | 
54 | This sample:
55 | * Trains a model
56 | * Saves the model in Spark ML and MLeap formats
57 | * Predicts from the Spark ML and MLeap formats
58 | 
59 | ### Train
60 | 
61 | Saves the model as Spark ML and MLeap artifacts in MLflow.
62 | 
63 | 
64 | #### Source
65 | 
66 | Source snippet from [TrainDecisionTree.scala](src/main/scala/org/andre/mlflow/examples/decisiontree/TrainDecisionTree.scala).
67 | ```
68 | import org.mlflow.tracking.MlflowClient
69 | import org.mlflow.api.proto.Service.RunStatus
70 | 
71 | // Create client
72 | val mlflowClient = new MlflowClient("http://localhost:5000")
73 | 
74 | // MLflow - create or get existing experiment
75 | val expName = "scala/SimpleDecisionTree"
76 | val expId = MLflowUtils.getOrCreateExperimentId(mlflowClient, expName)
77 | 
78 | // MLflow - create run
79 | val sourceName = getClass().getSimpleName()+".scala"
80 | val runInfo = mlflowClient.createRun(expId, sourceName);
81 | val runId = runInfo.getRunUuid()
82 | 
83 | // MLflow - log parameters
84 | mlflowClient.logParam(runId, "maxDepth", ""+dt.getMaxDepth)
85 | mlflowClient.logParam(runId, "maxBins", ""+dt.getMaxBins)
86 | 
87 | . . .
88 | 
89 | // MLflow - log metric
90 | mlflowClient.logMetric(runId, "rmse", rmse.toFloat)
91 | 
92 | // MLflow - save model as artifact
93 | //pipeline.save("tmp")
94 | clf.save("tmp")
95 | mlflowClient.logArtifacts(runId, new File("tmp"), "model")
96 | 
97 | // MLflow - save model as Spark ML artifact
98 | val sparkModelPath = "out/spark_model"
99 | model.write.overwrite().save(sparkModelPath)
100 | mlflowClient.logArtifacts(runId, new File(sparkModelPath), "spark_model")
101 | 
102 | // MLflow - save model as MLeap artifact
103 | val mleapModelDir = new File("out/mleap_model")
104 | mleapModelDir.mkdir
105 | MLeapUtils.save(model, predictions, "file:"+mleapModelDir.getAbsolutePath)
106 | mlflowClient.logArtifacts(runId, mleapModelDir, "mleap_model")
107 | 
108 | // MLflow - close run
109 | mlflowClient.setTerminated(runId, RunStatus.FINISHED, System.currentTimeMillis())
110 | ```
111 | 
112 | ### Run against local Spark and local MLflow tracking server
113 | 
114 | ```
115 | spark-submit --master local[2] \
116 |   --class org.andre.mlflow.examples.decisiontree.TrainDecisionTree \
117 |   target/mlflow-spark-examples-1.0-SNAPSHOT.jar \
118 |   --trackingUri http://localhost:5000 \
119 |   --experimentName scala_DecisionTree \
120 |   --dataPath ../data/sample_libsvm_data.txt \
121 |   --modelPath model_sample --maxDepth 5 --maxBins 5
122 | ```
123 | 
124 | ### Run against local Spark and Databricks hosted tracking server
125 | 
126 | ```
127 | spark-submit --master local[2] \
128 |   --class org.andre.mlflow.examples.decisiontree.TrainDecisionTree \
129 |   target/mlflow-spark-examples-1.0-SNAPSHOT.jar \
130 |   --trackingUri https://acme.cloud.databricks.com --token MY_TOKEN \
131 |   --experimentName spark_DecisionTree \
132 |   --dataPath ../data/sample_libsvm_data.txt \
133 |   --modelPath model_sample --maxDepth 5 --maxBins 5
134 | ```
135 | 
136 | ### Run in Databricks Cluster
137 | 
138 | You can also run your jar in a Databricks cluster with the standard Databricks REST API run endpoints.
139 | See [runs submit](https://docs.databricks.com/api/latest/jobs.html#runs-submit), [run now](https://docs.databricks.com/api/latest/jobs.html#run-now) and [spark_jar_task](https://docs.databricks.com/api/latest/jobs.html#jobssparkjartask).
140 | In this example we showcase runs_submit.
141 | 
142 | #### Setup
143 | 
144 | Upload the data file and jar to your Databricks cluster.
145 | ```
146 | databricks fs cp data/sample_libsvm_data.txt \
147 |   dbfs:/tmp/jobs/spark-scala-example/sample_libsvm_data.txt
148 | 
149 | databricks fs cp target/mlflow-spark-examples-1.0-SNAPSHOT.jar \
150 |   dbfs:/tmp/jobs/spark-scala-example/mlflow-spark-examples-1.0-SNAPSHOT.jar
151 | ```
152 | 
153 | Here is a snippet from
154 | [run_submit_new_cluster.json](run_submit_new_cluster.json) or
155 | [run_submit_existing_cluster.json](run_submit_existing_cluster.json).
156 | ```
157 | "libraries": [
158 |   { "pypi": { "package": "mlflow" } },
159 |   { "jar": "dbfs:/tmp/jobs/spark-scala-example/mlflow-spark-examples-1.0-SNAPSHOT.jar" }
160 | ],
161 | "spark_jar_task": {
162 |   "main_class_name": "org.andre.mlflow.examples.decisiontree.TrainDecisionTree",
163 |   "parameters": [
164 |     "--dataPath", "dbfs:/tmp/jobs/spark-scala-example/sample_libsvm_data.txt",
165 |     "--modelPath", "/dbfs/tmp/jobs/spark-scala-example/models",
166 |     "--runOrigin", "run_submit_new_cluster.json"
167 |   ]
168 | }
169 | ```
170 | 
171 | #### Run with new cluster
172 | 
173 | Create [run_submit_new_cluster.json](run_submit_new_cluster.json) and launch the run.
174 | ```
175 | curl -X POST -H "Authorization: Bearer MY_TOKEN" \
176 |   -d @run_submit_new_cluster.json \
177 |   https://acme.cloud.databricks.com/api/2.0/jobs/runs/submit
178 | ```
179 | 
180 | #### Run with existing cluster
181 | 
182 | Every time you build a new jar, you need to upload it to DBFS (as described above) and restart the cluster.
183 | ```
184 | databricks clusters restart --cluster-id 0113-005848-about166
185 | ```
186 | 
187 | Create [run_submit_existing_cluster.json](run_submit_existing_cluster.json) and launch the run.
188 | ```
189 | curl -X POST -H "Authorization: Bearer MY_TOKEN" \
190 |   -d @run_submit_existing_cluster.json \
191 |   https://acme.cloud.databricks.com/api/2.0/jobs/runs/submit
192 | ```
193 | 
194 | #### Run jar from Databricks notebook
195 | 
196 | Create a notebook with the following cell. Attach it to the existing cluster described above.
197 | ```
198 | import org.andre.mlflow.examples.decisiontree.TrainDecisionTree
199 | val dataPath = "dbfs:/tmp/jobs/spark-scala-example/sample_libsvm_data.txt"
200 | val modelPath = "/dbfs/tmp/jobs/spark-scala-example/models"
201 | val runOrigin = "run_from_jar_Notebook"
202 | TrainDecisionTree.train(spark, dataPath, modelPath, 5, 5, runOrigin)
203 | ```
204 | 
205 | ### Predict
206 | 
207 | Predicts from the Spark ML and MLeap models.
208 | 
209 | #### Run
210 | ```
211 | spark-submit --master local[2] \
212 |   --class org.andre.mlflow.examples.decisiontree.PredictDecisionTree \
213 |   target/mlflow-spark-examples-1.0-SNAPSHOT.jar \
214 |   --trackingUri http://localhost:5000 \
215 |   --dataPath ../data/sample_libsvm_data.txt \
216 |   --runId 3e422c4736a34046a74795384741ac33
217 | ```
218 | 
219 | ```
220 | +----------+-----+--------------------+
221 | |prediction|label|            features|
222 | +----------+-----+--------------------+
223 | |       0.0|  0.0|(692,[127,128,129...|
224 | |       1.0|  1.0|(692,[158,159,160...|
225 | |       1.0|  1.0|(692,[124,125,126...|
226 | |       1.0|  1.0|(692,[152,153,154...|
227 | +----------+-----+--------------------+
228 | ```
229 | 
230 | #### Source
231 | 
232 | Source snippet from [PredictDecisionTree.scala](src/main/scala/org/andre/mlflow/examples/decisiontree/PredictDecisionTree.scala).
233 | ``` 234 | val data = spark.read.format("libsvm").load(opts.dataPath) 235 | val model = PipelineModel.load(opts.modelPath) 236 | val predictions = model.transform(data) 237 | println("Prediction:") 238 | predictions.select("prediction", "label", "features").show(10,false) 239 | ``` 240 | -------------------------------------------------------------------------------- /scala_spark/playbook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## scala_spark playbook" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 8, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "os.environ[\"MLFLOW_TRACKING_URI\"] = \"http://localhost:5000\"" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### Train" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 10, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "19/04/24 21:11:52 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 37 | "19/04/24 21:11:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 38 | "19/04/24 21:11:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", 39 | "args: [Ljava.lang.String;@27216cd\n", 40 | "Tracking URI: http://localhost:5000\n", 41 | "Experiment name: scala_HelloWorld\n", 42 | "Experiment ID: 5\n", 43 | "Run ID: 5d44fd992c94459fbcb0a1c56a75db58\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "! spark-submit --master local[2] \\\n", 49 | " --class org.andre.mlflow.examples.hello.HelloWorld \\\n", 50 | " target/mlflow-spark-examples-1.0-SNAPSHOT.jar \\\n", 51 | " http://localhost:5000" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 9, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "19/04/24 21:11:29 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 64 | "19/04/24 21:11:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 65 | "19/04/24 21:11:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n", 66 | "Options:\n", 67 | " Tracking URI: http://localhost:5000\n", 68 | " token: null\n", 69 | " experimentName: scala_DecisionTree\n", 70 | " dataPath: ../data/sample_libsvm_data.txt\n", 71 | " modelPath: model_sample\n", 72 | " maxDepth: 5\n", 73 | " maxBins: 5\n", 74 | " runOrigin: None\n", 75 | "Experiment ID: 4\n", 76 | "Run ID: 1b95d7319070445c9b497e4c51ef3d01\n", 77 | "runOrigin: None\n", 78 | "Params:\n", 79 | " maxDepth: 5\n", 80 | " maxBins: 5\n", 81 | "Metrics:\n", 82 | " RMSE: 0.2970442628930023\n", 83 | " isLargerBetter: false\n", 84 | "Prediction:\n", 85 | "+----------+-----+--------------------+\n", 86 | "|prediction|label| features|\n", 87 | "+----------+-----+--------------------+\n", 88 | "| 0.0| 0.0|(692,[100,101,102...|\n", 89 | "| 0.0| 0.0|(692,[121,122,123...|\n", 90 | "| 0.0| 0.0|(692,[124,125,126...|\n", 91 | "| 0.0| 0.0|(692,[124,125,126...|\n", 92 | "| 0.0| 0.0|(692,[124,125,126...|\n", 93 | "+----------+-----+--------------------+\n", 94 | "only showing top 5 rows\n", 95 | "\n", 96 | "Learned regression tree model:\n", 97 | " DecisionTreeRegressionModel (uid=dtr_eaa0ec226e98) of depth 2 with 5 nodes\n", 98 | " If (feature 407 <= 9.5)\n", 99 | " If (feature 243 <= 4.0)\n", 100 | " Predict: 1.0\n", 101 | " Else (feature 243 > 4.0)\n", 102 | " Predict: 0.0\n", 103 | " Else (feature 407 > 9.5)\n", 104 | " Predict: 1.0\n", 105 | "\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "! spark-submit --master local[2] \\\n", 111 | " --class org.andre.mlflow.examples.decisiontree.TrainDecisionTree \\\n", 112 | " target/mlflow-spark-examples-1.0-SNAPSHOT.jar \\\n", 113 | " --trackingUri http://localhost:5000 \\\n", 114 | " --experimentName scala_DecisionTree \\\n", 115 | " --dataPath ../data/sample_libsvm_data.txt \\\n", 116 | " --modelPath model_sample --maxDepth 5 --maxBins 5" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "### Predict" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 9, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "19/04/25 14:05:42 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 136 | "19/04/25 14:05:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 137 | "19/04/25 14:05:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n", 138 | "Options:\n", 139 | " dataPath: ../data/sample_libsvm_data.txt\n", 140 | " tracking URI: null\n", 141 | " token: null\n", 142 | " runId: 3418ce2004454821b63e2fcff64177f4\n", 143 | "MLFLOW_TRACKING_URI: http://localhost:5000\n", 144 | "==== Spark ML\n", 145 | "+----------+-----+--------------------+\n", 146 | "|prediction|label| features|\n", 147 | "+----------+-----+--------------------+\n", 148 | "| 0.0| 0.0|(692,[127,128,129...|\n", 149 | "| 1.0| 1.0|(692,[158,159,160...|\n", 150 | "| 1.0| 1.0|(692,[124,125,126...|\n", 151 | "| 1.0| 1.0|(692,[152,153,154...|\n", 152 | "| 1.0| 1.0|(692,[151,152,153...|\n", 153 | "| 0.0| 0.0|(692,[129,130,131...|\n", 154 | "| 1.0| 1.0|(692,[158,159,160...|\n", 155 | "| 1.0| 1.0|(692,[99,100,101,...|\n", 156 | "| 0.0| 0.0|(692,[154,155,156...|\n", 157 | "| 0.0| 0.0|(692,[127,128,129...|\n", 158 | "+----------+-----+--------------------+\n", 159 | "only showing top 10 rows\n", 160 | "\n", 161 | "==== MLeap\n", 162 | "+----------+-----+--------------------+\n", 163 | "|prediction|label| features|\n", 164 | "+----------+-----+--------------------+\n", 165 | "| 0.0| 0.0|(692,[127,128,129...|\n", 166 | "| 0.0| 1.0|(692,[158,159,160...|\n", 167 | "| 0.0| 1.0|(692,[124,125,126...|\n", 168 | "| 0.0| 1.0|(692,[152,153,154...|\n", 169 | "| 0.0| 1.0|(692,[151,152,153...|\n", 170 | "| 0.0| 0.0|(692,[129,130,131...|\n", 171 | "| 0.0| 1.0|(692,[158,159,160...|\n", 172 | "| 0.0| 1.0|(692,[99,100,101,...|\n", 173 | "| 0.0| 0.0|(692,[154,155,156...|\n", 174 | "| 0.0| 0.0|(692,[127,128,129...|\n", 175 | "+----------+-----+--------------------+\n", 176 | "only showing top 10 rows\n", 177 | "\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "! spark-submit --class org.andre.mlflow.examples.decisiontree.PredictDecisionTree \\\n", 183 | " --master local[2] target/mlflow-spark-examples-1.0-SNAPSHOT.jar \\\n", 184 | " --dataPath ../data/sample_libsvm_data.txt \\\n", 185 | " --runId 3418ce2004454821b63e2fcff64177f4" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [] 194 | } 195 | ], 196 | "metadata": { 197 | "kernelspec": { 198 | "display_name": "Python 2", 199 | "language": "python", 200 | "name": "python2" 201 | }, 202 | "language_info": { 203 | "codemirror_mode": { 204 | "name": "ipython", 205 | "version": 2 206 | }, 207 | "file_extension": ".py", 208 | "mimetype": "text/x-python", 209 | "name": "python", 210 | "nbconvert_exporter": "python", 211 | "pygments_lexer": "ipython2", 212 | "version": "2.7.14" 213 | } 214 | }, 215 | "nbformat": 4, 216 | "nbformat_minor": 2 217 | } 218 | -------------------------------------------------------------------------------- /scala_spark/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | org.andre 5 | mlflow-spark-examples 6 | jar 7 | 1.0-SNAPSHOT 8 | 9 | 10 | 2.11.8 11 | 2.11 12 | 2.4.0 13 | 0.9.1 14 | 0.12.0 15 | 1.72 16 | 17 | 18 | 19 | 20 | 21 | org.scala-lang 22 | scala-library 23 | ${scala.version} 24 | provided 25 | 26 | 27 | 28 | org.apache.spark 29 | spark-core_${scala.binary.version} 30 | ${spark.version} 31 | provided 32 | 33 | 34 | 35 | org.apache.spark 36 | spark-sql_${scala.binary.version} 37 | ${spark.version} 38 | provided 39 | 40 | 41 | 42 | org.apache.spark 43 | spark-mllib_${scala.binary.version} 44 | ${spark.version} 45 | provided 46 | 47 | 48 | 49 | jar 50 | org.mlflow 51 | mlflow-client 52 | ${mlflow.version} 53 | 54 | 
55 | 56 | ml.combust.mleap 57 | mleap-spark_2.11 58 | ${mleap.version} 59 | 60 | 61 | ml.combust.mleap 62 | mleap-spark-base_2.11 63 | ${mleap.version} 64 | 65 | 66 | 67 | com.beust 68 | jcommander 69 | ${jcommander.version} 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | net.alchim31.maven 79 | scala-maven-plugin 80 | 3.2.0 81 | 82 | 83 | scala-compile-first 84 | process-resources 85 | 86 | compile 87 | 88 | 89 | 90 | scala-test-compile-first 91 | process-test-resources 92 | 93 | testCompile 94 | 95 | 96 | 97 | attach-scaladocs 98 | verify 99 | 100 | doc-jar 101 | 102 | 103 | 104 | 105 | 106 | org.apache.maven.plugins 107 | maven-shade-plugin 108 | 1.7.1 109 | 110 | 111 | 112 | *:* 113 | 114 | META-INF/*.SF 115 | META-INF/*.DSA 116 | META-INF/*.RSA 117 | 118 | 119 | 120 | 121 | 122 | 123 | package 124 | 125 | shade 126 | 127 | 128 | 129 | 130 | 131 | 132 | reference.conf 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /scala_spark/run_submit_existing_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "MLflow_RunSubmit_ExistingCluster", 3 | "existing_cluster_id": "0113-005848-about166", 4 | "timeout_seconds": 3600, 5 | "spark_jar_task": { 6 | "main_class_name": "org.andre.mlflow.examples.TrainDecisionTree", 7 | "parameters": [ 8 | "--experimentName", "/Shared/experiments/demo/scala_DecisionTree", 9 | "--dataPath", "dbfs:/tmp/jobs/spark-scala-example/sample_libsvm_data.txt", 10 | "--modelPath", "/dbfs/tmp/jobs/spark-scala-example/models", 11 | "--runOrigin", "run_submit_existing_cluster.json" 12 | ] 13 | }, 14 | "libraries": [ 15 | { 16 | "jar": "dbfs:/tmp/jobs/spark-scala-example/mlflow-spark-examples-1.0-SNAPSHOT.jar", 17 | "pypi-package": "mlflow" 18 | } 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /scala_spark/run_submit_new_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "MLflow_RunSubmit_NewCluster", 3 | "new_cluster": { 4 | "spark_version": "5.3.x-scala2.11", 5 | "node_type_id": "i3.xlarge", 6 | "num_workers": 1 7 | }, 8 | "timeout_seconds": 3600, 9 | "libraries": [ 10 | { "pypi": { "package": "mlflow" } }, 11 | { "jar": "dbfs:/tmp/jobs/spark-scala-example/mlflow-spark-examples-1.0-SNAPSHOT.jar" } 12 | ], 13 | "spark_jar_task": { 14 | "main_class_name": "org.andre.mlflow.examples.TrainDecisionTree", 15 | "parameters": [ 16 | "--experimentName", "/Shared/experiments/demo/scala_DecisionTree", 17 | "--dataPath", "dbfs:/tmp/jobs/spark-scala-example/sample_libsvm_data.txt", 18 | "--modelPath", "/dbfs/tmp/jobs/spark-scala-example/models", 19 | "--runOrigin", "run_submit_new_cluster.json" 20 | ] 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /scala_spark/src/main/scala/org/andre/mlflow/examples/MLeapUtils.scala: -------------------------------------------------------------------------------- 1 | package org.andre.mlflow.examples 2 | 3 | import ml.combust.bundle.BundleFile 4 | import ml.combust.mleap.spark.SparkSupport._ 5 | import resource.managed 6 | import org.apache.spark.ml.bundle.SparkBundleContext 7 | import org.apache.spark.ml.PipelineModel 8 | import org.apache.spark.sql.DataFrame 9 | 10 | /* 11 | MLeap URI formats: 12 | file:/tmp/mleap_scala_model_export/my-model 13 | jar:file:/tmp/mleap_scala_model_export/my-model.zip 14 | */ 15 | object MLeapUtils { 16 
| 17 | def saveModel(model: PipelineModel, df: DataFrame, bundlePath: String) { 18 | val context = SparkBundleContext().withDataset(df) 19 | (for(modelFile <- managed(BundleFile(bundlePath))) yield { 20 | model.writeBundle.save(modelFile)(context) 21 | }).tried.get 22 | } 23 | 24 | def readModel(bundlePath: String) = { 25 | val obundle = (for(bundle <- managed(BundleFile(bundlePath))) yield { 26 | bundle.loadSparkBundle().get 27 | }).opt 28 | obundle match { 29 | case Some(b) => b.root 30 | case None => throw new Exception(s"Cannot find bundle: $bundlePath") 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /scala_spark/src/main/scala/org/andre/mlflow/examples/MLflowUtils.scala: -------------------------------------------------------------------------------- 1 | package org.andre.mlflow.examples 2 | 3 | import scala.collection.JavaConversions._ 4 | import org.mlflow.tracking.MlflowClient 5 | import org.mlflow.tracking.creds.BasicMlflowHostCreds 6 | 7 | object MLflowUtils { 8 | 9 | def getOrCreateExperimentId(client: MlflowClient, experimentName: String) : String = { 10 | val expOpt = client.listExperiments() find (_.getName == experimentName) 11 | expOpt match { 12 | case Some(exp) => exp.getExperimentId 13 | case None => client.createExperiment(experimentName) 14 | } 15 | } 16 | 17 | def createMlflowClient(args: Array[String]) = { 18 | println("args: "+args) 19 | if (args.length == 0) { 20 | val env = System.getenv("MLFLOW_TRACKING_URI") 21 | println(s"MLFLOW_TRACKING_URI: $env") 22 | new MlflowClient() 23 | } else { 24 | val trackingUri = args(0) 25 | println(s"Tracking URI: $trackingUri") 26 | if (args.length > 1) { 27 | new MlflowClient(new BasicMlflowHostCreds(trackingUri,args(1))) 28 | } else { 29 | new MlflowClient(trackingUri) 30 | } 31 | } 32 | } 33 | 34 | def createMlflowClient(trackingUri: String, token: String) = { 35 | if (trackingUri == null) { 36 | val env = System.getenv("MLFLOW_TRACKING_URI") 37 | println(s"MLFLOW_TRACKING_URI: $env") 38 | new MlflowClient() 39 | } else { 40 | if (token != null) { 41 | new MlflowClient(new BasicMlflowHostCreds(trackingUri, token)) 42 | } else { 43 | new MlflowClient(trackingUri) 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /scala_spark/src/main/scala/org/andre/mlflow/examples/decisiontree/PredictDecisionTree.scala: -------------------------------------------------------------------------------- 1 | package org.andre.mlflow.examples.decisiontree 2 | 3 | import com.beust.jcommander.{JCommander, Parameter} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.DataFrame 6 | import org.apache.spark.ml.PipelineModel 7 | import org.apache.spark.ml.Transformer 8 | import org.mlflow.tracking.MlflowClient 9 | import org.mlflow.tracking.creds.BasicMlflowHostCreds 10 | import org.andre.mlflow.examples.{MLflowUtils,MLeapUtils} 11 | 12 | object PredictDecisionTree { 13 | 14 | def main(args: Array[String]) { 15 | new JCommander(opts, args.toArray: _*) 16 | println("Options:") 17 | println(s" dataPath: ${opts.dataPath}") 18 | println(s" tracking URI: ${opts.trackingUri}") 19 | println(s" token: ${opts.token}") 20 | println(s" runId: ${opts.runId}") 21 | 22 | val mlflowClient = MLflowUtils.createMlflowClient(opts.trackingUri, opts.token) 23 | val spark = SparkSession.builder.appName("Predict").getOrCreate() 24 | val data = spark.read.format("libsvm").load(opts.dataPath) 25 | 26 | val runInfo = 
mlflowClient.getRun(opts.runId).getInfo 27 | val uri = runInfo.getArtifactUri 28 | predictSparkML(uri, data) 29 | predictMLeap(uri, data) 30 | } 31 | 32 | def predictSparkML(uri: String, data: DataFrame) { 33 | println("==== Spark ML") 34 | val modelPath = s"${uri}/spark-model" 35 | val model = PipelineModel.load(modelPath) 36 | showPredictions(model, data) 37 | } 38 | 39 | def predictMLeap(uri: String, data: DataFrame) { 40 | println("==== MLeap") 41 | val modelPath = s"file:${uri}/mleap-model/mleap/model" 42 | val model = MLeapUtils.readModel(modelPath) 43 | showPredictions(model, data) 44 | } 45 | 46 | def showPredictions(model: Transformer, data: DataFrame) { 47 | val predictions = model.transform(data) 48 | val df = predictions.select("prediction", "label", "features") 49 | df.show(10) 50 | } 51 | 52 | object opts { 53 | @Parameter(names = Array("--dataPath" ), description = "Data path", required=true) 54 | var dataPath: String = null 55 | 56 | @Parameter(names = Array("--trackingUri" ), description = "Tracking Server URI", required=false) 57 | var trackingUri: String = null 58 | 59 | @Parameter(names = Array("--token" ), description = "REST API token", required=false) 60 | var token: String = null 61 | 62 | @Parameter(names = Array("--runId" ), description = "runId", required=true) 63 | var runId: String = null 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /scala_spark/src/main/scala/org/andre/mlflow/examples/decisiontree/TrainDecisionTree.scala: -------------------------------------------------------------------------------- 1 | package org.andre.mlflow.examples.decisiontree 2 | 3 | // From: https://github.com/apache/spark/blob/master/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala 4 | 5 | import java.io.{File,PrintWriter} 6 | import org.apache.spark.sql.{SparkSession,DataFrame} 7 | import org.apache.spark.ml.{Pipeline,PipelineModel} 8 | import org.apache.spark.ml.evaluation.RegressionEvaluator 9 | import org.apache.spark.ml.feature.{VectorIndexer,VectorIndexerModel} 10 | import org.apache.spark.ml.regression.{DecisionTreeRegressionModel,DecisionTreeRegressor} 11 | import org.mlflow.tracking.MlflowClient 12 | import org.mlflow.api.proto.Service.RunStatus 13 | import com.beust.jcommander.{JCommander, Parameter} 14 | import org.andre.mlflow.examples.{MLflowUtils,MLeapUtils} 15 | 16 | object TrainDecisionTree { 17 | case class DataHolder(trainingData: DataFrame, testData: DataFrame, featureIndexer: VectorIndexerModel) 18 | val spark = SparkSession.builder.appName("DecisionTreeRegressionExample").getOrCreate() 19 | val seed = 2019 20 | 21 | def main(args: Array[String]) { 22 | new JCommander(opts, args: _*) 23 | println("Options:") 24 | println(s" Tracking URI: ${opts.trackingUri}") 25 | println(s" token: ${opts.token}") 26 | println(s" experimentName: ${opts.experimentName}") 27 | println(s" dataPath: ${opts.dataPath}") 28 | println(s" modelPath: ${opts.modelPath}") 29 | println(s" maxDepth: ${opts.maxDepth}") 30 | println(s" maxBins: ${opts.maxBins}") 31 | println(s" runOrigin: ${opts.runOrigin}") 32 | 33 | // MLflow - create or get existing experiment 34 | val mlflowClient = MLflowUtils.createMlflowClient(opts.trackingUri, opts.token) 35 | 36 | val experimentId = MLflowUtils.getOrCreateExperimentId(mlflowClient, opts.experimentName) 37 | println("Experiment ID: "+experimentId) 38 | 39 | // Read data 40 | val dataHolder = readData(opts.dataPath) 41 | 42 | // Train model 43 | 
train(mlflowClient, experimentId, opts.modelPath, opts.maxDepth, opts.maxBins, opts.runOrigin, dataHolder) 44 | } 45 | 46 | def readData(dataPath: String) : DataHolder = { 47 | val data = spark.read.format("libsvm").load(dataPath) 48 | 49 | // Automatically identify categorical features, and index them. 50 | // Here, we treat features with > 4 distinct values as continuous. 51 | val featureIndexer = new VectorIndexer() 52 | .setInputCol("features") 53 | .setOutputCol("indexedFeatures") 54 | .setMaxCategories(4) 55 | .fit(data) 56 | 57 | // Split the data into training and test sets (30% held out for testing). 58 | val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3), seed) 59 | 60 | DataHolder(trainingData, testData, featureIndexer) 61 | } 62 | 63 | def train(mlflowClient: MlflowClient, experimentId: String, modelPath: String, maxDepth: Int, maxBins: Int, runOrigin: String, dataHolder: DataHolder) { 64 | // Create a DecisionTree model 65 | val clf = new DecisionTreeRegressor() 66 | .setLabelCol("label") 67 | .setFeaturesCol("indexedFeatures") 68 | if (maxDepth != -1) clf.setMaxDepth(maxDepth) 69 | if (maxBins != -1) clf.setMaxBins(maxBins) 70 | 71 | // MLflow - create run 72 | val sourceName = (getClass().getSimpleName()+".scala").replace("$","") 73 | val runInfo = mlflowClient.createRun(experimentId, sourceName); 74 | val runId = runInfo.getRunUuid() 75 | println(s"Run ID: $runId") 76 | println(s"runOrigin: $runOrigin") 77 | 78 | // MLflow - Log parameters 79 | mlflowClient.logParam(runId, "maxDepth",""+clf.getMaxDepth) 80 | mlflowClient.logParam(runId, "maxBins",""+clf.getMaxBins) 81 | mlflowClient.logParam(runId, "runOrigin",runOrigin) 82 | println(s"Params:") 83 | println(s" maxDepth: ${clf.getMaxDepth}") 84 | println(s" maxBins: ${clf.getMaxBins}") 85 | 86 | // Chain indexer and tree in a Pipeline. 87 | val pipeline = new Pipeline().setStages(Array(dataHolder.featureIndexer, clf)) 88 | 89 | // Train model. This also runs the indexer. 90 | val model = pipeline.fit(dataHolder.trainingData) 91 | 92 | // Make predictions. 93 | val predictions = model.transform(dataHolder.testData) 94 | 95 | // Create metrics: select (prediction, true label) and compute test error. 96 | val evaluator = new RegressionEvaluator() 97 | .setLabelCol("label") 98 | .setPredictionCol("prediction") 99 | .setMetricName("rmse") 100 | val rmse = evaluator.evaluate(predictions) 101 | println(s"Metrics:") 102 | println(s" RMSE: $rmse") 103 | println(s" isLargerBetter: ${evaluator.isLargerBetter}") 104 | 105 | // MLflow - Log metric 106 | mlflowClient.logMetric(runId, "rmse",rmse) 107 | 108 | // Select example rows to display. 
109 | println("Prediction:") 110 | predictions.select("prediction", "label", "features").show(5) 111 | 112 | // Print decision tree 113 | val treeModel = model.stages(1).asInstanceOf[DecisionTreeRegressionModel] 114 | println(s"Learned regression tree model:\n ${treeModel.toDebugString}") 115 | 116 | // MLflow - Log simple artifact 117 | val path="details.txt" 118 | new PrintWriter(path) { write("Info: "+new java.util.Date()) ; close } 119 | mlflowClient.logArtifact(runId,new File(path),"info") 120 | 121 | // MLflow - Save model in Spark ML and MLeap formats 122 | saveModelAsSparkML(mlflowClient, runId, modelPath, model) 123 | saveModelAsMLeap(mlflowClient, runId, modelPath, model, predictions) 124 | 125 | // MLflow - close run 126 | mlflowClient.setTerminated(runId, RunStatus.FINISHED, System.currentTimeMillis()) 127 | } 128 | 129 | def saveModelAsSparkML(mlflowClient: MlflowClient, runId: String, baseModelDir: String, model: PipelineModel) = { 130 | val modelPath = s"$baseModelDir/spark-model" 131 | model.write.overwrite().save(modelPath) 132 | mlflowClient.logArtifacts(runId, new File(modelPath), "spark-model") 133 | } 134 | 135 | def saveModelAsMLeap(mlflowClient: MlflowClient, runId: String, baseModelDir: String, model: PipelineModel, predictions: DataFrame) = { 136 | val modelPath = new File(s"$baseModelDir/mleap-model") 137 | modelPath.mkdir 138 | MLeapUtils.saveModel(model, predictions, "file:"+modelPath.getAbsolutePath) 139 | mlflowClient.logArtifacts(runId, modelPath, "mleap-model/mleap/model") // Make compatible with MLflow Python mlflow.mleap.log_model 140 | } 141 | 142 | object opts { 143 | @Parameter(names = Array("--trackingUri" ), description = "Tracking Server URI", required=false) 144 | var trackingUri: String = null 145 | 146 | @Parameter(names = Array("--token" ), description = "REST API token", required=false) 147 | var token: String = null 148 | 149 | @Parameter(names = Array("--dataPath" ), description = "Data path", required=true) 150 | var dataPath: String = null 151 | 152 | @Parameter(names = Array("--modelPath" ), description = "Data path", required=true) 153 | var modelPath: String = null 154 | 155 | @Parameter(names = Array("--maxDepth" ), description = "maxDepth", required=false) 156 | var maxDepth = -1 157 | 158 | @Parameter(names = Array("--maxBins" ), description = "maxBins", required=false) 159 | var maxBins = -1 160 | 161 | @Parameter(names = Array("--runOrigin" ), description = "runOrigin", required=false) 162 | var runOrigin = "None" 163 | 164 | @Parameter(names = Array("--experimentName" ), description = "experimentName", required=false) 165 | var experimentName = "scala_DecisionTree" 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /scala_spark/src/main/scala/org/andre/mlflow/examples/hello/HelloWorld.scala: -------------------------------------------------------------------------------- 1 | package org.andre.mlflow.examples.hello 2 | 3 | import java.io.{File,PrintWriter} 4 | import org.mlflow.tracking.MlflowClient 5 | import org.mlflow.tracking.creds.BasicMlflowHostCreds 6 | import org.mlflow.api.proto.Service.RunStatus 7 | import scala.collection.JavaConversions._ 8 | import org.andre.mlflow.examples.MLflowUtils 9 | 10 | object HelloWorld { 11 | def main(args: Array[String]) { 12 | 13 | // Create MLflow client 14 | val mlflowClient = MLflowUtils.createMlflowClient(args) 15 | 16 | // Create or get existing experiment 17 | val expName = "scala_HelloWorld" 18 | val expId = 
MLflowUtils.getOrCreateExperimentId(mlflowClient, expName) 19 | println("Experiment name: "+expName) 20 | println("Experiment ID: "+expId) 21 | 22 | // Create run 23 | val sourceName = (getClass().getSimpleName()+".scala").replace("$","") 24 | val runInfo = mlflowClient.createRun(expId, sourceName) 25 | val runId = runInfo.getRunUuid() 26 | println("Run ID: "+runId) 27 | 28 | // Log params and metrics 29 | mlflowClient.logParam(runId, "p1","hi") 30 | mlflowClient.logMetric(runId, "m1",0.123) 31 | 32 | // Log file artifact 33 | new PrintWriter("info.txt") { write("File artifact: "+new java.util.Date()) ; close } 34 | mlflowClient.logArtifact(runId, new File("info.txt")) 35 | 36 | // Log directory artifact 37 | val dir = new File("tmp") 38 | dir.mkdir 39 | new PrintWriter(new File(dir, "model.txt")) { write("Directory artifact: "+new java.util.Date()) ; close } 40 | mlflowClient.logArtifacts(runId, dir, "model") 41 | 42 | // Close run 43 | mlflowClient.setTerminated(runId, RunStatus.FINISHED, System.currentTimeMillis()) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /search/README.md: -------------------------------------------------------------------------------- 1 | 2 | # mlflow-spark-summit-2019 - search 3 | 4 | Synopsis: 5 | * Shows how to search for runs. 6 | * For details see https://mlflow.org/docs/latest/search-syntax.html. 7 | 8 | Summary of [search.py](search.py): 9 | * Creates an experiment `search_example` with five runs with rmse values: 0.76, 0.71, 0.77, 0.69, 0.69. 10 | * Searches for runs: `metrics.rmse >= 0.76` 11 | 12 | 13 | Run 14 | ``` 15 | python search.py 16 | ``` 17 | 18 | ``` 19 | MLflow Version: 0.9.1 20 | Tracking URI: http://localhost:5000 21 | experiment_name: search_example 22 | experiment_id: 6 23 | Adding 5 runs: 24 | metric: 0.76 run_uuid: cc2debe52ff14c6b9e87cbbe27bedc5b 25 | metric: 0.71 run_uuid: bc762d998463434e95e8ebdfa50019c0 26 | metric: 0.77 run_uuid: a9205f64570041b49d362b298195fcb6 27 | metric: 0.69 run_uuid: fe3aeb9865e7428495b4dca3b6745177 28 | metric: 0.69 run_uuid: 72d1281840334f3a934797ca196951a1 29 | Query: metrics.rmse >= 0.76 30 | Found 2 matching runs: 31 | run_uuid: a9205f64570041b49d362b298195fcb6 metrics: [] 32 | run_uuid: cc2debe52ff14c6b9e87cbbe27bedc5b metrics: [] 33 | ``` 34 | -------------------------------------------------------------------------------- /search/search.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import mlflow 3 | 4 | print("MLflow Version:", mlflow.version.VERSION) 5 | print("Tracking URI:", mlflow.tracking.get_tracking_uri()) 6 | 7 | experiment_name = "search_example" 8 | print("experiment_name:",experiment_name) 9 | mlflow.set_experiment(experiment_name) 10 | 11 | client = mlflow.tracking.MlflowClient() 12 | experiment_id = client.get_experiment_by_name(experiment_name).experiment_id 13 | print("experiment_id:",experiment_id) 14 | 15 | def create_run(metric): 16 | with mlflow.start_run() as run: 17 | print(" metric:",metric," run_uuid:",run.info.run_uuid) 18 | mlflow.log_metric("rmse", metric) 19 | 20 | def create_runs(): 21 | metrics = [0.76, 0.71, 0.77, 0.69, 0.69 ] 22 | print("Adding {} runs:".format(len(metrics))) 23 | for m in metrics: 24 | create_run(m) 25 | run_infos = client.list_run_infos(experiment_id) 26 | 27 | def delete_runs(experiment_id): 28 | run_infos = client.list_run_infos(experiment_id) 29 | for info in run_infos: 30 | client.delete_run(info.run_uuid) 31 | 
run_infos = client.list_run_infos(experiment_id)
32 | 
33 | def search(exp_ids, query):
34 |     print("Query:",query)
35 |     runs = client.search_runs(exp_ids,query)
36 |     print("Found {} matching runs:".format(len(runs)))
37 |     for run in runs:
38 |         print("  run_uuid:",run.info.run_uuid," metrics:",run.data.metrics)
39 | 
40 | if __name__ == "__main__":
41 |     delete_runs(experiment_id)
42 |     create_runs()
43 |     query = "metrics.rmse >= 0.76"
44 |     search([experiment_id], query)
45 | 
46 | 
--------------------------------------------------------------------------------
/sklearn/MLproject:
--------------------------------------------------------------------------------
1 | name: mlflow_demo_sklearn
2 | 
3 | conda_env: conda.yaml
4 | 
5 | entry_points:
6 |   main:
7 |     parameters:
8 |       experiment_name: {type: string, default: "none" }
9 |       data_path: {type: string}
10 |       alpha: float
11 |       l1_ratio: {type: float, default: 0.1}
12 |       run_origin: {type: string, default: "default" }
13 |     command: "python main.py
14 |       --experiment_name {experiment_name}
15 |       --data_path {data_path}
16 |       --alpha {alpha}
17 |       --l1_ratio {l1_ratio}
18 |       --run_origin {run_origin}"
--------------------------------------------------------------------------------
/sklearn/README.md:
--------------------------------------------------------------------------------
1 | # mlflow-spark-summit-2019 - sklearn
2 | 
3 | ## Overview
4 | * Wine Quality Elastic Net Example
5 | * This example demonstrates an end-to-end MLflow training and prediction workflow.
6 | * Saves model in pickle format
7 | * Saves plot artifacts
8 | * Shows several ways to run training - _mlflow run_, run against Databricks cluster, call egg from notebook, etc.
9 | * Shows several ways to run prediction - web server, mlflow.sklearn.load_model(), Spark UDF, etc.
10 | * Data: data/wine-quality-white.csv for training and data/wine-quality-red.csv for predictions.
11 | 
12 | ## Training
13 | 
14 | Source: [main.py](main.py) and [train.py](wine_quality/train.py).
15 | 
16 | ### Unmanaged without mlflow run
17 | 
18 | #### Command-line python
19 | 
20 | To run with the standard main function:
21 | ```
22 | python main.py --experiment_name sklearn \
23 |   --data_path data/wine-quality-white.csv \
24 |   --alpha 0.5 --l1_ratio 0.5
25 | ```
26 | 
27 | #### Jupyter notebook
28 | See [Train_Wine_Quality.ipynb](Train_Wine_Quality.ipynb).
29 | ```
30 | export MLFLOW_TRACKING_URI=http://localhost:5000
31 | jupyter notebook
32 | ```
33 | 
34 | ### Using mlflow run
35 | 
36 | These runs use the [MLproject](MLproject) file. For more details see [MLflow documentation - Running Projects](https://mlflow.org/docs/latest/projects.html#running-projects).
37 | 
38 | Note that mlflow run ignores the `set_experiment()` function, so you must specify the experiment with the `--experiment-id` argument.
39 | 
40 | **mlflow run local**
41 | ```
42 | mlflow run . -P alpha=0.01 -P l1_ratio=0.75 -P run_origin=LocalRun --experiment-id=2
43 | ```
44 | 
45 | **mlflow run github**
46 | ```
47 | mlflow run https://github.com/amesar/mlflow-fun.git#examples/scikit-learn/wine-quality \
48 |   -P alpha=0.01 -P l1_ratio=0.75 -P run_origin=GitRun \
49 |   --experiment-id=2
50 | ```
51 | 
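**mlflow run from Python**

The same project can also be launched programmatically with the [mlflow.projects](https://mlflow.org/docs/latest/python_api/mlflow.projects.html) API. A minimal sketch, not part of the repo; the experiment ID and parameter values are assumptions mirroring the CLI examples above:

```python
import mlflow

# Launch the MLproject "main" entry point locally; equivalent to the
# "mlflow run ." command above. Blocks until the run finishes.
submitted = mlflow.projects.run(
    uri=".",
    parameters={
        "alpha": 0.01,
        "l1_ratio": 0.75,
        "run_origin": "ProgrammaticRun",
        "data_path": "data/wine-quality-white.csv",
    },
    experiment_id=2,  # assumption: same experiment as the CLI examples
)
print("run_id:", submitted.run_id)
```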
52 | **mlflow run Databricks remote** - Run against Databricks.
53 | 
54 | See [Remote Execution on Databricks](https://mlflow.org/docs/latest/projects.html#remote-execution-on-databricks) and [mlflow_run_cluster.json](mlflow_run_cluster.json).
55 | 
56 | Setup.
57 | ```
58 | export MLFLOW_TRACKING_URI=databricks
59 | ```
60 | The token and tracking server URL will be picked up from your Databricks CLI ~/.databrickscfg default profile.
61 | 
62 | Now run.
63 | ```
64 | mlflow run https://github.com/amesar/mlflow-fun.git#examples/scikit-learn/wine-quality \
65 |   -P alpha=0.01 -P l1_ratio=0.75 -P run_origin=GitRun \
66 |   -P data_path=/dbfs/tmp/data/wine-quality-white.csv \
67 |   --experiment-id=2019 \
68 |   --mode databricks --cluster-spec mlflow_run_cluster.json
69 | ```
70 | 
71 | ### Databricks Cluster Runs
72 | 
73 | You can also package your code as an egg and run it with the standard Databricks REST API endpoints
74 | [jobs/runs/submit](https://docs.databricks.com/api/latest/jobs.html#runs-submit)
75 | or [jobs/run-now](https://docs.databricks.com/api/latest/jobs.html#run-now)
76 | using the [spark_python_task](https://docs.databricks.com/api/latest/jobs.html#jobssparkpythontask).
77 | 
78 | #### Setup
79 | 
80 | Build the egg.
81 | ```
82 | python setup.py bdist_egg
83 | ```
84 | 
85 | Upload the data file, main file and egg to your Databricks workspace.
86 | ```
87 | databricks fs cp main.py dbfs:/tmp/jobs/wine_quality/main.py
88 | databricks fs cp data/wine-quality-white.csv dbfs:/tmp/jobs/wine_quality/wine-quality-white.csv
89 | databricks fs cp \
90 |   dist/mlflow_wine_quality-0.0.1-py3.6.egg \
91 |   dbfs:/tmp/jobs/wine_quality/mlflow_wine_quality-0.0.1-py3.6.egg
92 | ```
93 | 
94 | 
95 | #### Run Submit
96 | 
97 | ##### Run with new cluster
98 | 
99 | Define your run in [run_submit_new_cluster.json](run_submit_new_cluster.json) and launch the run.
100 | 
101 | ```
102 | curl -X POST -H "Authorization: Bearer MY_TOKEN" \
103 |   -d @run_submit_new_cluster.json \
104 |   https://myshard.cloud.databricks.com/api/2.0/jobs/runs/submit
105 | ```
106 | 
107 | ##### Run with existing cluster
108 | 
109 | Every time you build a new egg, you need to upload it to DBFS (as described above) and restart the cluster.
110 | ```
111 | databricks clusters restart --cluster-id 1222-015510-grams64
112 | ```
113 | 
114 | Define your run in [run_submit_existing_cluster.json](run_submit_existing_cluster.json) and launch the run.
115 | ```
116 | curl -X POST -H "Authorization: Bearer MY_TOKEN" \
117 |   -d @run_submit_existing_cluster.json \
118 |   https://myshard.cloud.databricks.com/api/2.0/jobs/runs/submit
119 | ```
120 | 
121 | #### Job Run Now
122 | 
123 | ##### Run with new cluster
124 | 
125 | First create a job with the spec file [create_job_new_cluster.json](create_job_new_cluster.json).
126 | ```
127 | databricks jobs create --json-file create_job_new_cluster.json
128 | ```
129 | 
130 | Then run the job with desired parameters.
131 | ```
132 | databricks jobs run-now --job-id $JOB_ID --python-params ' [ "WineQualityExperiment", 0.3, 0.3, "/dbfs/tmp/jobs/wine_quality/wine-quality-white.csv" ] '
133 | ```
134 | 
135 | ##### Run with existing cluster
136 | First create a job with the spec file [create_job_existing_cluster.json](create_job_existing_cluster.json).
137 | ```
138 | databricks jobs create --json-file create_job_existing_cluster.json
139 | ```
140 | 
141 | Then run the job with desired parameters.
142 | ```
143 | databricks jobs run-now --job-id $JOB_ID --python-params ' [ "WineQualityExperiment", 0.3, 0.3, "/dbfs/tmp/jobs/wine_quality/wine-quality-white.csv" ] '
144 | ```
145 | 
146 | 
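#### Verify the runs in MLflow

Whichever submission path you use, you can confirm that the job actually logged a new run by querying the tracking server. A minimal sketch using only the `mlflow.tracking` client calls already used elsewhere in this repo; the experiment name is an assumption matching the examples above:

```python
import mlflow

client = mlflow.tracking.MlflowClient()
exp = client.get_experiment_by_name("WineQualityExperiment")
run_infos = client.list_run_infos(exp.experiment_id)
print("Found {} runs".format(len(run_infos)))

# Inspect one run; ordering of list_run_infos is not guaranteed,
# so in practice you may want to compare start times.
run = client.get_run(run_infos[-1].run_uuid)
print("run_uuid:", run.info.run_uuid)
for m in run.data.metrics:
    print("  {}: {}".format(m.key, m.value))
```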
147 | #### Run egg from Databricks notebook
148 | 
149 | Create a notebook with the following cell and attach it to the existing cluster described above.
150 | ```
151 | from wine_quality import Trainer
152 | data_path = "/dbfs/tmp/jobs/wine_quality/wine-quality-white.csv"
153 | trainer = Trainer("WineQualityExperiment", data_path, "from_notebook_with_egg")
154 | trainer.train(0.4, 0.4)
155 | ```
156 | 
157 | ## Predictions
158 | 
159 | You can make predictions in the following ways:
160 | 1. Use MLflow's serving web server and submit predictions via HTTP calls
161 | 2. Call mlflow.sklearn.load_model() from your own serving code and then make predictions
162 | 3. Call mlflow.pyfunc.load_pyfunc() from your own serving code and then make predictions
163 | 4. Batch prediction with Spark UDF (user-defined function)
164 | 
165 | 
166 | See MLflow documentation:
167 | * [Tutorial - Serving the Model](https://www.mlflow.org/docs/latest/tutorial.html#serving-the-model)
168 | * [Quickstart - Saving and Serving Models](https://www.mlflow.org/docs/latest/quickstart.html#saving-and-serving-models)
169 | * [mlflow.pyfunc.spark_udf](https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.spark_udf)
170 | 
171 | 
172 | ### Data for predictions
173 | [data/wine-quality-red.csv](data/wine-quality-red.csv):
174 | ```
175 | [
176 |   {
177 |     "fixed acidity": 7,
178 |     "volatile acidity": 0.27,
179 |     "citric acid": 0.36,
180 |     "residual sugar": 20.7,
181 |     "chlorides": 0.045,
182 |     "free sulfur dioxide": 45,
183 |     "total sulfur dioxide": 170,
184 |     "density": 1.001,
185 |     "pH": 3,
186 |     "sulphates": 0.45,
187 |     "alcohol": 8.8
188 |   },
189 |   . . . . .
190 | ]
191 | ```
192 | 
193 | ### 1. Serving Models from MLflow Web Server
194 | 
195 | In one window run the server.
196 | ```
197 | mlflow pyfunc serve -p 5001 -r 7e674524514846799310c41f10d6b99d -m model
198 | ```
199 | 
200 | In another window, submit a prediction.
201 | ```
202 | curl -X POST -H "Content-Type:application/json" -d @data/wine-quality-red.csv http://localhost:5001/invocations
203 | 
204 | [
205 |     5.551096337521979,
206 |     5.297727513113797,
207 |     5.427572126267637,
208 |     5.562886443251915,
209 |     5.562886443251915
210 | ]
211 | ```
212 | 
213 | ### 2. Predict with mlflow.sklearn.load_model()
214 | 
215 | ```
216 | python scikit_predict.py 7e674524514846799310c41f10d6b99d
217 | 
218 | predictions: [5.55109634 5.29772751 5.42757213 5.56288644 5.56288644]
219 | ```
220 | From [scikit_predict.py](scikit_predict.py):
221 | ```
222 | model = mlflow.sklearn.load_model("model",run_id="7e674524514846799310c41f10d6b99d")
223 | df = pd.read_csv("data/wine-quality-red.csv")
224 | predicted = model.predict(df)
225 | print("predicted:",predicted)
226 | ```
227 | 
228 | ### 3. Predict with mlflow.pyfunc.load_pyfunc()
229 | 
230 | ```
231 | python pyfunc_predict.py 7e674524514846799310c41f10d6b99d
232 | 
233 | predictions: [5.55109634 5.29772751 5.42757213 5.56288644 5.56288644]
234 | ```
235 | From [pyfunc_predict.py](pyfunc_predict.py):
236 | ```
237 | model_uri = mlflow.start_run("7e674524514846799310c41f10d6b99d").info.artifact_uri + "/model"
238 | model = mlflow.pyfunc.load_pyfunc(model_uri)
239 | df = pd.read_csv("data/wine-quality-red.csv")
240 | predicted = model.predict(df)
241 | print("predicted:",predicted)
242 | ```
243 | 
244 | ### 4. Batch prediction with Spark UDF (user-defined function)
245 | 
246 | Scroll right to see the prediction column.
247 | 248 | ``` 249 | pip install pyarrow 250 | 251 | spark-submit --master local[2] spark_udf_predict.py 7e674524514846799310c41f10d6b99d 252 | 253 | +-------+---------+-----------+-------+-------------+-------------------+----+--------------+---------+--------------------+----------------+------------------+ 254 | |alcohol|chlorides|citric acid|density|fixed acidity|free sulfur dioxide| pH|residual sugar|sulphates|total sulfur dioxide|volatile acidity| prediction| 255 | +-------+---------+-----------+-------+-------------+-------------------+----+--------------+---------+--------------------+----------------+------------------+ 256 | | 8.8| 0.045| 0.36| 1.001| 7.0| 45.0| 3.0| 20.7| 0.45| 170.0| 0.27| 5.551096337521979| 257 | | 9.5| 0.049| 0.34| 0.994| 6.3| 14.0| 3.3| 1.6| 0.49| 132.0| 0.3| 5.297727513113797| 258 | | 10.1| 0.05| 0.4| 0.9951| 8.1| 30.0|3.26| 6.9| 0.44| 97.0| 0.28| 5.427572126267637| 259 | | 9.9| 0.058| 0.32| 0.9956| 7.2| 47.0|3.19| 8.5| 0.4| 186.0| 0.23| 5.562886443251915| 260 | ``` 261 | From [spark_udf_predict.py](spark_udf_predict.py): 262 | ``` 263 | spark = SparkSession.builder.appName("ServePredictions").getOrCreate() 264 | df = spark.read.option("inferSchema",True).option("header", True).csv("data/wine-quality-red.csv") 265 | df = df.drop("quality") 266 | 267 | udf = mlflow.pyfunc.spark_udf(spark, "model", run_id="7e674524514846799310c41f10d6b99d") 268 | df2 = df.withColumn("prediction", udf(*df.columns)) 269 | df2.show(10) 270 | ``` 271 | -------------------------------------------------------------------------------- /sklearn/Train_Wine_Quality.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MLflow Train Wine Quality Notebook\n", 8 | "This is a Quick Start notebook.\n", 9 | "* It is based on [train.py](https://github.com/databricks/mlflow/blob/master/example/tutorial/train.py) from [MLflow's tutorial](https://mlflow.org/docs/latest/tutorial.html). 
\n", 10 | "* It creates runs in the experiment \"py/sk/ElasticNet/WineQuality\".\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 36, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/plain": [ 21 | "'http://localhost:5000'" 22 | ] 23 | }, 24 | "execution_count": 36, 25 | "metadata": {}, 26 | "output_type": "execute_result" 27 | } 28 | ], 29 | "source": [ 30 | "from __future__ import print_function\n", 31 | "import mlflow\n", 32 | "\n", 33 | "mlflow.set_tracking_uri(\"http://localhost:5000\")\n", 34 | "mlflow.tracking.get_tracking_uri()" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 37, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "def now():\n", 44 | " now = int(time.time()+.5)\n", 45 | " dt = time.strftime(\"%Y-%m-%d_%H:%M:%S\", time.gmtime(now))\n", 46 | " return dt" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 38, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "Run Start: 2019-03-26_02:10:09\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "print(\"Run Start:\",now())" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 39, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "experiment_name = \"py/sk/ElasticNet/WineQuality\"\n", 73 | "wine_data_path = \"./data/wine-quality-white.csv\"\n", 74 | "wine_data_url = \"https://raw.githubusercontent.com/mlflow/mlflow/master/examples/sklearn_elasticnet_wine/wine-quality.csv\"\n", 75 | "run_origin = \"jupyter\"" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 40, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "MLflow Version: 0.8.2\n", 88 | "experiment_id: 5\n", 89 | "experiment_name: py/sk/ElasticNet/WineQuality\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "import mlflow\n", 95 | "print(\"MLflow Version:\",mlflow.version.VERSION)\n", 96 | "mlflow.set_experiment(experiment_name)\n", 97 | "mlflow_client = mlflow.tracking.MlflowClient()\n", 98 | "experiment_id = mlflow_client.get_experiment_by_name(experiment_name).experiment_id\n", 99 | "print(\"experiment_id:\",experiment_id)\n", 100 | "print(\"experiment_name:\",experiment_name)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 41, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "import os\n", 110 | "import requests\n", 111 | "\n", 112 | "if not os.path.exists(wine_data_path):\n", 113 | " print(\"Downloading {} to {}\".format(wine_data_url,wine_data_path))\n", 114 | " rsp = requests.get(wine_data_url)\n", 115 | " with open(wine_data_path, 'w') as f:\n", 116 | " f.write(rsp.text)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "#### Write your ML code based on the`train.py` code\n", 124 | "This tutorial is based on the MLflow's example [train.py](https://github.com/databricks/mlflow/blob/master/example/tutorial/train.py), which uses an external [wine-quality.csv](https://github.com/databricks/mlflow/blob/master/example/tutorial/wine-quality.csv) dataset to predict wine quality." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 42, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "# The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality\n", 134 | "# P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. 
Reis.\n", 135 | "# Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.\n", 136 | "\n", 137 | "import os\n", 138 | "import warnings\n", 139 | "import sys\n", 140 | "\n", 141 | "import pandas as pd\n", 142 | "import numpy as np\n", 143 | "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n", 144 | "from sklearn.model_selection import train_test_split\n", 145 | "from sklearn.linear_model import ElasticNet\n", 146 | "\n", 147 | "import mlflow\n", 148 | "import mlflow.sklearn\n", 149 | "\n", 150 | "def eval_metrics(actual, pred):\n", 151 | " rmse = np.sqrt(mean_squared_error(actual, pred))\n", 152 | " mae = mean_absolute_error(actual, pred)\n", 153 | " r2 = r2_score(actual, pred)\n", 154 | " return rmse, mae, r2" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 43, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "from sklearn.linear_model import enet_path\n", 164 | "import matplotlib.pyplot as plt\n", 165 | "from itertools import cycle\n", 166 | "\n", 167 | "def plot_enet_descent_path(X, y, l1_ratio, plot_file):\n", 168 | " # Compute paths\n", 169 | " eps = 5e-3 # the smaller it is the longer is the path\n", 170 | "\n", 171 | " # Reference the global image variable\n", 172 | " global image\n", 173 | " \n", 174 | " print(\"Computing regularization path using the elastic net.\")\n", 175 | " alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=l1_ratio, fit_intercept=False)\n", 176 | "\n", 177 | " # Display results\n", 178 | " fig = plt.figure(1)\n", 179 | " ax = plt.gca()\n", 180 | "\n", 181 | " colors = cycle(['b', 'r', 'g', 'c', 'k'])\n", 182 | " neg_log_alphas_enet = -np.log10(alphas_enet)\n", 183 | " for coef_e, c in zip(coefs_enet, colors):\n", 184 | " l1 = plt.plot(neg_log_alphas_enet, coef_e, linestyle='--', c=c)\n", 185 | "\n", 186 | " plt.xlabel('-Log(alpha)')\n", 187 | " plt.ylabel('coefficients')\n", 188 | " title = 'ElasticNet Path by alpha for l1_ratio = ' + str(l1_ratio)\n", 189 | " plt.title(title)\n", 190 | " plt.axis('tight')\n", 191 | "\n", 192 | " image = fig\n", 193 | " fig.savefig(plot_file)\n", 194 | " plt.close(fig)\n", 195 | " return image " 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 44, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "def train(alpha, l1_ratio):\n", 205 | " warnings.filterwarnings(\"ignore\")\n", 206 | " np.random.seed(40)\n", 207 | "\n", 208 | " data = pd.read_csv(wine_data_path)\n", 209 | "\n", 210 | " # Split the data into training and test sets. 
(0.75, 0.25) split.\n", 211 | " train, test = train_test_split(data)\n", 212 | "\n", 213 | " # The predicted column is \"quality\" which is a scalar from [3, 9]\n", 214 | " train_x = train.drop([\"quality\"], axis=1)\n", 215 | " test_x = test.drop([\"quality\"], axis=1)\n", 216 | " train_y = train[[\"quality\"]]\n", 217 | " test_y = test[[\"quality\"]]\n", 218 | " with mlflow.start_run() as run:\n", 219 | " run_id = run.info.run_uuid\n", 220 | " print(\"run_id:\",run_id)\n", 221 | " print(\"run_origin:\",run_origin)\n", 222 | " clf = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)\n", 223 | " clf.fit(train_x, train_y)\n", 224 | "\n", 225 | " predicted_qualities = clf.predict(test_x)\n", 226 | " (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)\n", 227 | "\n", 228 | " print(\"Elasticnet model (alpha={}, l1_ratio={}):\".format(alpha, l1_ratio))\n", 229 | " print(\" RMSE:\",rmse)\n", 230 | " print(\" MAE:\",mae)\n", 231 | " print(\" R2:\",r2)\n", 232 | "\n", 233 | " mlflow.log_param(\"alpha\", alpha)\n", 234 | " mlflow.log_param(\"l1_ratio\", l1_ratio)\n", 235 | " mlflow.log_param(\"run_origin\", run_origin)\n", 236 | " mlflow.log_metric(\"rmse\", rmse)\n", 237 | " mlflow.log_metric(\"r2\", r2)\n", 238 | " mlflow.log_metric(\"mae\", mae)\n", 239 | "\n", 240 | " mlflow.sklearn.log_model(clf, \"model\")\n", 241 | " \n", 242 | " X = data.drop([\"quality\"], axis=1).values\n", 243 | " y = data[[\"quality\"]].values.ravel()\n", 244 | " plot_file = \"wine_quality.png\"\n", 245 | " plot_enet_descent_path(X, y, l1_ratio, plot_file)\n", 246 | " mlflow.log_artifact(plot_file)\n", 247 | " \n", 248 | " return (rmse,r2,mae)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 45, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "name": "stdout", 258 | "output_type": "stream", 259 | "text": [ 260 | "run_id: d8b189ad970e44f4978c5b19bbf6fa1e\n", 261 | "run_origin: jupyter\n", 262 | "Elasticnet model (alpha=0.1, l1_ratio=0.1):\n", 263 | " RMSE: 0.7792546522251949\n", 264 | " MAE: 0.6112547988118587\n", 265 | " R2: 0.2157063843066196\n", 266 | "Computing regularization path using the elastic net.\n" 267 | ] 268 | }, 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "(0.7792546522251949, 0.2157063843066196, 0.6112547988118587)" 273 | ] 274 | }, 275 | "execution_count": 45, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "train(0.1, 0.1)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 46, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "name": "stdout", 291 | "output_type": "stream", 292 | "text": [ 293 | "Run End: 2019-03-26_02:10:10\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "print(\"Run End:\",now())" 299 | ] 300 | } 301 | ], 302 | "metadata": { 303 | "kernelspec": { 304 | "display_name": "Python 3", 305 | "language": "python", 306 | "name": "python3" 307 | }, 308 | "language_info": { 309 | "codemirror_mode": { 310 | "name": "ipython", 311 | "version": 3 312 | }, 313 | "file_extension": ".py", 314 | "mimetype": "text/x-python", 315 | "name": "python", 316 | "nbconvert_exporter": "python", 317 | "pygments_lexer": "ipython3", 318 | "version": "3.6.8" 319 | }, 320 | "name": "Train Wine Quality", 321 | "notebookId": 1406514 322 | }, 323 | "nbformat": 4, 324 | "nbformat_minor": 1 325 | } 326 | -------------------------------------------------------------------------------- /sklearn/conda.yaml: 
-------------------------------------------------------------------------------- 1 | name: mlflow-demo-sklearn 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.6 6 | - pip: 7 | - mlflow==0.9.1 8 | - scikit-learn==0.19.1 9 | - numpy==1.14.5 10 | - scipy==1.1.0 11 | - matplotlib==2.2.2 12 | -------------------------------------------------------------------------------- /sklearn/create_job_existing_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "MLflow_ExistingCluster", 3 | "existing_cluster_id": "1222-015510-grams64", 4 | "email_notifications": { 5 | "on_start": ["myname@mycompany.com"], 6 | "on_success": ["myname@mycompany.com"], 7 | "on_failure": ["myname@mycompany.com"] 8 | }, 9 | "spark_python_task": { 10 | "python_file": "dbfs:/tmp/jobs/wine_quality/main_train_wine_quality.py" 11 | }, 12 | "timeout_seconds": 3600 13 | } 14 | -------------------------------------------------------------------------------- /sklearn/create_job_new_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "MLflow_RunNow_NewCluster", 3 | "new_cluster": { 4 | "spark_version": "5.3.x-scala2.11", 5 | "node_type_id": "i3.xlarge", 6 | "num_workers": 1 7 | }, 8 | "email_notifications": { 9 | "on_start": ["myname@mycompany.com"], 10 | "on_success": ["myname@mycompany.com"], 11 | "on_failure": ["myname@mycompany.com"] 12 | }, 13 | "libraries": [ 14 | { "pypi": { "package": "mlflow" } }, 15 | { "pypi": { "package": "cloudpickle" }}, 16 | { "egg": "dbfs:/tmp/jobs/wine_quality/mlflow_wine_quality-0.0.1-py3.6.egg" } 17 | ], 18 | "spark_python_task": { 19 | "python_file": "dbfs:/tmp/jobs/wine_quality/main_train_wine_quality.py" 20 | }, 21 | "timeout_seconds": 3600 22 | } 23 | -------------------------------------------------------------------------------- /sklearn/data/wine-quality.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "fixed acidity": 7, 4 | "volatile acidity": 0.27, 5 | "citric acid": 0.36, 6 | "residual sugar": 20.7, 7 | "chlorides": 0.045, 8 | "free sulfur dioxide": 45, 9 | "total sulfur dioxide": 170, 10 | "density": 1.001, 11 | "pH": 3, 12 | "sulphates": 0.45, 13 | "alcohol": 8.8 14 | } 15 | ] 16 | -------------------------------------------------------------------------------- /sklearn/main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from wine_quality.train import Trainer 3 | from argparse import ArgumentParser 4 | 5 | if __name__ == "__main__": 6 | parser = ArgumentParser() 7 | parser.add_argument("--experiment_name", dest="experiment_name", help="experiment_name", required=True) 8 | parser.add_argument("--data_path", dest="data_path", help="data_path", required=True) 9 | parser.add_argument("--alpha", dest="alpha", help="alpha", default=0.1, type=float ) 10 | parser.add_argument("--l1_ratio", dest="l1_ratio", help="l1_ratio", default=0.1, type=float ) 11 | parser.add_argument("--run_origin", dest="run_origin", help="run_origin", default="none") 12 | args = parser.parse_args() 13 | trainer = Trainer(args.experiment_name, args.data_path,args.run_origin) 14 | trainer.train(args.alpha, args.l1_ratio) 15 | -------------------------------------------------------------------------------- /sklearn/mlflow_run_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "spark_version": 
"5.3.x-scala2.11", 3 | "driver_node_type_id": "i3.xlarge", 4 | "node_type_id": "i3.xlarge", 5 | "num_workers": 1, 6 | "spark_env_vars": { 7 | "PYSPARK_PYTHON": "/databricks/python3/bin/python3" 8 | }, 9 | "libraries": [ 10 | { "pypi": { "package": "mlflow" }} 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /sklearn/pickle_predict.py: -------------------------------------------------------------------------------- 1 | 2 | """ Serve predictions by unpickling model artifact file. """ 3 | 4 | from __future__ import print_function 5 | import sys 6 | import pickle 7 | import util 8 | 9 | if __name__ == "__main__": 10 | if len(sys.argv) < 1: 11 | println("ERROR: Expecting PICKLE_FILE PREDICTION_FILE") 12 | sys.exit(1) 13 | pickle_path = sys.argv[1] 14 | data_path = sys.argv[2] if len(sys.argv) > 2 else "data/wine-quality-red.csv" 15 | print("pickle_path:",pickle_path) 16 | print("data_path:",data_path) 17 | 18 | with open(pickle_path, 'rb') as f: 19 | model = pickle.load(f) 20 | print("model:",model) 21 | print("model type:",type(model)) 22 | 23 | df = util.read_prediction_data(data_path) 24 | predictions = model.predict(df) 25 | print("predictions:",predictions) 26 | -------------------------------------------------------------------------------- /sklearn/playbook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## sklearn playbook" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 4, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "os.environ[\"MLFLOW_TRACKING_URI\"] = \"http://localhost:5000\"" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### Train" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 5, 30 | "metadata": { 31 | "scrolled": true 32 | }, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "MLflow Version: 0.9.1\n", 39 | "MLflow Tracking URI: http://localhost:5000\n", 40 | "experiment_name: WineQualityExperiment\n", 41 | "run_origin: none\n", 42 | "data_path: data/wine-quality-white.csv\n", 43 | "experiment_id: 7\n", 44 | "run_id: 792037e4fdde40efb1c43eeef76a5f9b\n", 45 | " experiment_id: 7\n", 46 | " Parameters:\n", 47 | " alpha: 0.5\n", 48 | " l1_ratio: 0.5\n", 49 | " Metrics:\n", 50 | " RMSE: 0.82224284975954\n", 51 | " MAE: 0.6278761410160693\n", 52 | " R2: 0.12678721972772677\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "! python main.py --experiment_name WineQualityExperiment \\\n", 58 | " --data_path data/wine-quality-white.csv \\\n", 59 | " --alpha 0.5 --l1_ratio 0.5" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "2019/04/24 21:23:52 INFO mlflow.projects: === Creating conda environment mlflow-169b50d7002caef08e847b2615a41296bd46b052 ===\n", 72 | "Collecting package metadata: done\n", 73 | "Solving environment: done\n", 74 | "\n", 75 | "\n", 76 | "==> WARNING: A newer version of conda exists. 
<==\n", 77 | " current version: 4.6.2\n", 78 | " latest version: 4.6.14\n", 79 | "\n", 80 | "Please update conda by running\n", 81 | "\n", 82 | " $ conda update -n base -c defaults conda\n", 83 | "\n", 84 | "\n", 85 | "\n", 86 | "Downloading and Extracting Packages\n", 87 | "pip-19.1 | 1.8 MB | ##################################### | 100% \n", 88 | "Preparing transaction: done\n", 89 | "Verifying transaction: done\n", 90 | "Executing transaction: done\n", 91 | "Collecting mlflow==0.9.1 (from -r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 92 | " Using cached https://files.pythonhosted.org/packages/4f/8a/3713ce558aba91acf8495bf9e82961030858eb5fe3041c5c51186e89fc85/mlflow-0.9.1-py3-none-any.whl\n", 93 | "Collecting scikit-learn==0.19.1 (from -r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 2))\n", 94 | " Using cached https://files.pythonhosted.org/packages/f0/5e/1e1576587c5a9e8de6771806a4cccea8decd268c988453cf35ccbf892929/scikit_learn-0.19.1-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl\n", 95 | "Collecting numpy==1.14.5 (from -r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 3))\n", 96 | " Using cached https://files.pythonhosted.org/packages/f6/cd/b2c50b5190b66c711c23ef23c41d450297eb5a54d2033f8dcb3b8b13ac85/numpy-1.14.5-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl\n", 97 | "Collecting scipy==1.1.0 (from -r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 4))\n", 98 | " Using cached https://files.pythonhosted.org/packages/a0/b6/70bf61c1badb5fea82d4c558e05e76c2dee5e77bb072fe465d7c7a87287d/scipy-1.1.0-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl\n", 99 | "Collecting matplotlib==2.2.2 (from -r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 5))\n", 100 | " Using cached https://files.pythonhosted.org/packages/8a/d5/5337662b714c65100f3545ed3909e9478614d1ebf1f692a52981f3f5167b/matplotlib-2.2.2-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl\n", 101 | "Collecting pandas (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 102 | " Using cached https://files.pythonhosted.org/packages/2a/67/0a59cb257c72bb837575ca0ddf5f0fe2a482e98209b7a1bed8cde68ddb46/pandas-0.24.2-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl\n", 103 | "Collecting requests>=2.17.3 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 104 | " Using cached https://files.pythonhosted.org/packages/7d/e3/20f3d364d6c8e5d2353c72a67778eb189176f08e873c9900e10c0287b84b/requests-2.21.0-py2.py3-none-any.whl\n", 105 | "Collecting gunicorn (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 106 | " Using cached https://files.pythonhosted.org/packages/8c/da/b8dd8deb741bff556db53902d4706774c8e1e67265f69528c14c003644e6/gunicorn-19.9.0-py2.py3-none-any.whl\n", 107 | "Collecting gitpython>=2.1.0 (from mlflow==0.9.1->-r 
/Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 108 | " Using cached https://files.pythonhosted.org/packages/fe/e5/fafe827507644c32d6dc553a1c435cdf882e0c28918a5bab29f7fbebfb70/GitPython-2.1.11-py2.py3-none-any.whl\n", 109 | "Collecting click>=7.0 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 110 | " Using cached https://files.pythonhosted.org/packages/fa/37/45185cb5abbc30d7257104c434fe0b07e5a195a6847506c074527aa599ec/Click-7.0-py2.py3-none-any.whl\n", 111 | "Collecting python-dateutil (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 112 | " Using cached https://files.pythonhosted.org/packages/41/17/c62faccbfbd163c7f57f3844689e3a78bae1f403648a6afb1d0866d87fbb/python_dateutil-2.8.0-py2.py3-none-any.whl\n", 113 | "Collecting Flask (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 114 | " Using cached https://files.pythonhosted.org/packages/7f/e7/08578774ed4536d3242b14dacb4696386634607af824ea997202cd0edb4b/Flask-1.0.2-py2.py3-none-any.whl\n", 115 | "Collecting simplejson (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 116 | "Collecting querystring-parser (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 117 | "Collecting databricks-cli>=0.8.0 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 118 | "Collecting boto3>=1.7.12 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 119 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/3f/1b/4adde85d1cbbe8a8f2fef47276daf496328fc8d7c40d7a8d6a67b0eba45b/boto3-1.9.135-py2.py3-none-any.whl (128kB)\n", 120 | "\u001b[K |████████████████████████████████| 133kB 3.4MB/s eta 0:00:01\n", 121 | "\u001b[?25hCollecting sqlparse (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 122 | " Using cached https://files.pythonhosted.org/packages/ef/53/900f7d2a54557c6a37886585a91336520e5539e3ae2423ff1102daf4f3a7/sqlparse-0.3.0-py2.py3-none-any.whl\n", 123 | "Collecting entrypoints (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 124 | " Using cached https://files.pythonhosted.org/packages/ac/c6/44694103f8c221443ee6b0041f69e2740d89a25641e62fb4f2ee568f2f9c/entrypoints-0.3-py2.py3-none-any.whl\n", 125 | "Collecting docker>=3.6.0 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 126 | " Using cached https://files.pythonhosted.org/packages/48/68/c3afca1a5aa8d2997ec3b8ee822a4d752cf85907b321f07ea86888545152/docker-3.7.2-py2.py3-none-any.whl\n", 127 | "Collecting cloudpickle (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 128 | " Using cached https://files.pythonhosted.org/packages/6e/bc/67f13115adcce4efc5e4d7f8220fb9a50aaa2b5c7ed460b26cbb76aa76ad/cloudpickle-0.8.1-py2.py3-none-any.whl\n", 129 | "Collecting pyyaml (from mlflow==0.9.1->-r 
/Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 130 | "Collecting six>=1.10.0 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 131 | " Using cached https://files.pythonhosted.org/packages/73/fb/00a976f728d0d1fecfe898238ce23f502a721c0ac0ecfedb80e0d88c64e9/six-1.12.0-py2.py3-none-any.whl\n", 132 | "Collecting protobuf>=3.6.0 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 133 | " Using cached https://files.pythonhosted.org/packages/46/1e/c481d59760cded074d89ff51c99381708111c550ff698934cc296d27df2c/protobuf-3.7.1-cp36-cp36m-macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl\n", 134 | "Collecting mleap>=0.8.1 (from mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 135 | "Collecting cycler>=0.10 (from matplotlib==2.2.2->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 5))\n", 136 | " Using cached https://files.pythonhosted.org/packages/f7/d2/e07d3ebb2bd7af696440ce7e754c59dd546ffe1bbe732c8ab68b9c834e61/cycler-0.10.0-py2.py3-none-any.whl\n", 137 | "Collecting pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 (from matplotlib==2.2.2->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 5))\n", 138 | " Using cached https://files.pythonhosted.org/packages/dd/d9/3ec19e966301a6e25769976999bd7bbe552016f0d32b577dc9d63d2e0c49/pyparsing-2.4.0-py2.py3-none-any.whl\n", 139 | "Collecting kiwisolver>=1.0.1 (from matplotlib==2.2.2->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 5))\n" 140 | ] 141 | }, 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | " Downloading https://files.pythonhosted.org/packages/16/e7/df58eb8868d183223692d2a62529a594f6414964a3ae93548467b146a24d/kiwisolver-1.1.0.tar.gz\n", 147 | "Collecting pytz (from matplotlib==2.2.2->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 5))\n", 148 | " Using cached https://files.pythonhosted.org/packages/3d/73/fe30c2daaaa0713420d0382b16fbb761409f532c56bdcc514bf7b6262bb6/pytz-2019.1-py2.py3-none-any.whl\n", 149 | "Collecting chardet<3.1.0,>=3.0.2 (from requests>=2.17.3->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 150 | " Using cached https://files.pythonhosted.org/packages/bc/a9/01ffebfb562e4274b6487b4bb1ddec7ca55ec7510b22e4c51f14098443b8/chardet-3.0.4-py2.py3-none-any.whl\n", 151 | "Collecting idna<2.9,>=2.5 (from requests>=2.17.3->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 152 | " Using cached https://files.pythonhosted.org/packages/14/2c/cd551d81dbe15200be1cf41cd03869a46fe7226e7450af7a6545bfc474c9/idna-2.8-py2.py3-none-any.whl\n", 153 | "Collecting urllib3<1.25,>=1.21.1 (from requests>=2.17.3->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 154 | " Using cached https://files.pythonhosted.org/packages/df/1c/59cca3abf96f991f2ec3131a4ffe72ae3d9ea1f5894abe8a9c5e3c77cfee/urllib3-1.24.2-py2.py3-none-any.whl\n", 155 | "Requirement already satisfied: certifi>=2017.4.17 in 
/Users/ander/miniconda3/envs/mlflow-169b50d7002caef08e847b2615a41296bd46b052/lib/python3.6/site-packages (from requests>=2.17.3->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1)) (2019.3.9)\n", 156 | "Collecting gitdb2>=2.0.0 (from gitpython>=2.1.0->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 157 | " Using cached https://files.pythonhosted.org/packages/da/30/a407568aa8d8f25db817cf50121a958722f3fc5f87e3a6fba1f40c0633e3/gitdb2-2.0.5-py2.py3-none-any.whl\n", 158 | "Collecting Jinja2>=2.10 (from Flask->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 159 | " Using cached https://files.pythonhosted.org/packages/1d/e7/fd8b501e7a6dfe492a433deb7b9d833d39ca74916fa8bc63dd1a4947a671/Jinja2-2.10.1-py2.py3-none-any.whl\n", 160 | "Collecting itsdangerous>=0.24 (from Flask->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 161 | " Using cached https://files.pythonhosted.org/packages/76/ae/44b03b253d6fade317f32c24d100b3b35c2239807046a4c953c7b89fa49e/itsdangerous-1.1.0-py2.py3-none-any.whl\n", 162 | "Collecting Werkzeug>=0.14 (from Flask->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 163 | " Using cached https://files.pythonhosted.org/packages/18/79/84f02539cc181cdbf5ff5a41b9f52cae870b6f632767e43ba6ac70132e92/Werkzeug-0.15.2-py2.py3-none-any.whl\n", 164 | "Collecting configparser>=0.3.5 (from databricks-cli>=0.8.0->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 165 | " Using cached https://files.pythonhosted.org/packages/ba/05/6c96328e92e625fc31445d24d75a2c92ef9ba34fc5b037fe69693c362a0d/configparser-3.7.4-py2.py3-none-any.whl\n", 166 | "Collecting tabulate>=0.7.7 (from databricks-cli>=0.8.0->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 167 | "Collecting botocore<1.13.0,>=1.12.135 (from boto3>=1.7.12->mlflow==0.9.1->-r /Users/ander/git/andre/mlflow-spark-summit-2019/sklearn/condaenv.k9fcah73.requirements.txt (line 1))\n", 168 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d3/36/08db4978d59d75750ef6da9835150b901c6fb96f6d5f30d8c50eb424ed4e/botocore-1.12.135-py2.py3-none-any.whl (5.4MB)\n", 169 | "\u001b[K |████████████████████████████████| 5.4MB 3.6MB/s eta 0:00:01\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "! mlflow run . -P alpha=0.01 -P l1_ratio=0.75 -P run_origin=LocalRun \\\n", 175 | " -P data_path=data/wine-quality-white.csv \\\n", 176 | " --experiment-id=2" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "! 
mlflow run https://github.com/amesar/mlflow-fun.git#examples/scikit-learn/wine-quality \\\n", 186 | " -P alpha=0.01 -P l1_ratio=0.75 -P run_origin=GitRun \\\n", 187 | " -P data_path=data/wine-quality-white.csv \\\n", 188 | " --experiment-id=2" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "### Predict" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 5, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "MLflow Version: 0.9.1\n", 208 | "data_path: data/wine-quality-white.csv\n", 209 | "run_id: 4fc01818631840d9ae4d8ad9f86299fc\n", 210 | "model_uri: /Users/ander/work/mlflow/local_mlrun/mlruns/2/4fc01818631840d9ae4d8ad9f86299fc/artifacts/model\n", 211 | "model: ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True, l1_ratio=1.0,\n", 212 | " max_iter=1000, normalize=False, positive=False, precompute=False,\n", 213 | " random_state=42, selection='cyclic', tol=0.0001, warm_start=False)\n", 214 | "predictions: [5.56281868 5.30043479 5.70350174 ... 5.44619008 6.55740254 6.2852277 ]\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "! python pyfunc_predict.py 4fc01818631840d9ae4d8ad9f86299fc data/wine-quality-white.csv" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 6, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | "MLflow Version: 0.9.1\n", 232 | "data_path: data/wine-quality-white.csv\n", 233 | "run_id: 4fc01818631840d9ae4d8ad9f86299fc\n", 234 | "model: ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True, l1_ratio=1.0,\n", 235 | " max_iter=1000, normalize=False, positive=False, precompute=False,\n", 236 | " random_state=42, selection='cyclic', tol=0.0001, warm_start=False)\n", 237 | "predictions: [5.56281868 5.30043479 5.70350174 ... 5.44619008 6.55740254 6.2852277 ]\n" 238 | ] 239 | } 240 | ], 241 | "source": [ 242 | "! python scikit_predict.py 4fc01818631840d9ae4d8ad9f86299fc data/wine-quality-white.csv" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 9, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "19/04/25 14:06:00 WARN Utils: Your hostname, C02VD1RGHTDD resolves to a loopback address: 127.0.0.1; using 10.64.185.74 instead (on interface en0)\n", 255 | "19/04/25 14:06:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 256 | "19/04/25 14:06:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n", 257 | "path: data/wine-quality-white.csv\n", 258 | "run_id= 4fc01818631840d9ae4d8ad9f86299fc\n", 259 | "MLflow Version: 0.9.1\n", 260 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+\n", 261 | "|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density| pH|sulphates|alcohol|\n", 262 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+\n", 263 | "| 7.0| 0.27| 0.36| 20.7| 0.045| 45.0| 170.0| 1.001| 3.0| 0.45| 8.8|\n", 264 | "| 6.3| 0.3| 0.34| 1.6| 0.049| 14.0| 132.0| 0.994| 3.3| 0.49| 9.5|\n", 265 | "| 8.1| 0.28| 0.4| 6.9| 0.05| 30.0| 97.0| 0.9951|3.26| 0.44| 10.1|\n", 266 | "| 7.2| 0.23| 0.32| 8.5| 0.058| 47.0| 186.0| 0.9956|3.19| 0.4| 9.9|\n", 267 | "| 7.2| 0.23| 0.32| 8.5| 0.058| 47.0| 186.0| 0.9956|3.19| 0.4| 9.9|\n", 268 | "| 8.1| 0.28| 0.4| 6.9| 0.05| 30.0| 97.0| 0.9951|3.26| 0.44| 10.1|\n", 269 | "| 6.2| 0.32| 0.16| 7.0| 0.045| 30.0| 136.0| 0.9949|3.18| 0.47| 9.6|\n", 270 | "| 7.0| 0.27| 0.36| 20.7| 0.045| 45.0| 170.0| 1.001| 3.0| 0.45| 8.8|\n", 271 | "| 6.3| 0.3| 0.34| 1.6| 0.049| 14.0| 132.0| 0.994| 3.3| 0.49| 9.5|\n", 272 | "| 8.1| 0.22| 0.43| 1.5| 0.044| 28.0| 129.0| 0.9938|3.22| 0.45| 11.0|\n", 273 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+\n", 274 | "only showing top 10 rows\n", 275 | "\n", 276 | "/Users/ander/venvs/mlflow-venv/lib/python2.7/site-packages/pyarrow/__init__.py:152: UserWarning: pyarrow.open_stream is deprecated, please use pyarrow.ipc.open_stream\n", 277 | " warnings.warn(\"pyarrow.open_stream is deprecated, please use \"\n", 278 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+------------------+\n", 279 | "|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density| pH|sulphates|alcohol| prediction|\n", 280 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+------------------+\n", 281 | "| 7.0| 0.27| 0.36| 20.7| 0.045| 45.0| 170.0| 1.001| 3.0| 0.45| 8.8| 5.562818680029495|\n", 282 | "| 6.3| 0.3| 0.34| 1.6| 0.049| 14.0| 132.0| 0.994| 3.3| 0.49| 9.5|5.3004347872132715|\n", 283 | "| 8.1| 0.28| 0.4| 6.9| 0.05| 30.0| 97.0| 0.9951|3.26| 0.44| 10.1| 5.703501740309309|\n", 284 | "| 7.2| 0.23| 0.32| 8.5| 0.058| 47.0| 186.0| 0.9956|3.19| 0.4| 9.9| 5.742035433111079|\n", 285 | "| 7.2| 0.23| 0.32| 8.5| 0.058| 47.0| 186.0| 0.9956|3.19| 0.4| 9.9| 5.742035433111079|\n", 286 | "| 8.1| 0.28| 0.4| 6.9| 0.05| 30.0| 97.0| 0.9951|3.26| 0.44| 10.1| 5.703501740309309|\n", 287 | "| 6.2| 0.32| 0.16| 7.0| 0.045| 30.0| 136.0| 0.9949|3.18| 0.47| 9.6| 5.526523431037601|\n", 288 | "| 7.0| 0.27| 0.36| 20.7| 0.045| 45.0| 170.0| 1.001| 3.0| 0.45| 8.8| 5.562818680029495|\n", 289 | "| 6.3| 0.3| 0.34| 1.6| 0.049| 14.0| 132.0| 0.994| 3.3| 0.49| 9.5|5.3004347872132715|\n", 290 | "| 8.1| 0.22| 0.43| 1.5| 0.044| 28.0| 129.0| 0.9938|3.22| 0.45| 11.0| 5.912851142126504|\n", 291 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+------------------+\n", 292 | "only showing top 10 
rows\n", 293 | "\n", 294 | "/Users/ander/venvs/mlflow-venv/lib/python2.7/site-packages/pyarrow/__init__.py:152: UserWarning: pyarrow.open_stream is deprecated, please use pyarrow.ipc.open_stream\n", 295 | " warnings.warn(\"pyarrow.open_stream is deprecated, please use \"\n", 296 | "+------------------+\n", 297 | "| prediction|\n", 298 | "+------------------+\n", 299 | "| 5.562818680029495|\n", 300 | "|5.3004347872132715|\n", 301 | "| 5.703501740309309|\n", 302 | "| 5.742035433111079|\n", 303 | "| 5.742035433111079|\n", 304 | "| 5.703501740309309|\n", 305 | "| 5.526523431037601|\n", 306 | "| 5.562818680029495|\n", 307 | "|5.3004347872132715|\n", 308 | "| 5.912851142126504|\n", 309 | "+------------------+\n", 310 | "only showing top 10 rows\n", 311 | "\n", 312 | "/Users/ander/venvs/mlflow-venv/lib/python2.7/site-packages/pyarrow/__init__.py:152: UserWarning: pyarrow.open_stream is deprecated, please use pyarrow.ipc.open_stream\n", 313 | " warnings.warn(\"pyarrow.open_stream is deprecated, please use \"\n", 314 | "predictions: 5.5628187\n" 315 | ] 316 | } 317 | ], 318 | "source": [ 319 | "! spark-submit --master local[2] spark_udf_predict.py \\\n", 320 | " 4fc01818631840d9ae4d8ad9f86299fc data/wine-quality-white.csv" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [] 329 | } 330 | ], 331 | "metadata": { 332 | "kernelspec": { 333 | "display_name": "Python 2", 334 | "language": "python", 335 | "name": "python2" 336 | }, 337 | "language_info": { 338 | "codemirror_mode": { 339 | "name": "ipython", 340 | "version": 2 341 | }, 342 | "file_extension": ".py", 343 | "mimetype": "text/x-python", 344 | "name": "python", 345 | "nbconvert_exporter": "python", 346 | "pygments_lexer": "ipython2", 347 | "version": "2.7.14" 348 | } 349 | }, 350 | "nbformat": 4, 351 | "nbformat_minor": 2 352 | } 353 | -------------------------------------------------------------------------------- /sklearn/pyfunc_predict.py: -------------------------------------------------------------------------------- 1 | 2 | # Serve predictions with mlflow.pyfunc.load_pyfunc() 3 | 4 | from __future__ import print_function 5 | import sys 6 | import mlflow 7 | import mlflow.pyfunc 8 | import mlflow.tracking 9 | import util 10 | 11 | if __name__ == "__main__": 12 | if len(sys.argv) < 1: 13 | println("ERROR: Expecting RUN_ID PREDICTION_FILE") 14 | sys.exit(1) 15 | print("MLflow Version:", mlflow.version.VERSION) 16 | run_id = sys.argv[1] 17 | data_path = sys.argv[2] if len(sys.argv) > 2 else "data/wine-quality-red.csv" 18 | print("data_path:",data_path) 19 | print("run_id:",run_id) 20 | 21 | client = mlflow.tracking.MlflowClient() 22 | model_uri = client.get_run(run_id).info.artifact_uri + "/model" 23 | print("model_uri:",model_uri) 24 | model = mlflow.pyfunc.load_pyfunc(model_uri) 25 | print("model:",model) 26 | 27 | df = util.read_prediction_data(data_path) 28 | predictions = model.predict(df) 29 | print("predictions:",predictions) 30 | -------------------------------------------------------------------------------- /sklearn/run_submit_existing_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "MLflow_RunSubmit_ExistingCluster", 3 | "existing_cluster_id": "1222-015510-grams64", 4 | "timeout_seconds": 3600, 5 | "libraries": [ 6 | { "pypi": { "package": "mlflow" } }, 7 | { "pypi": { "package": "cloudpickle" }}, 8 | { "egg": 
"dbfs:/tmp/jobs/wine_quality/mlflow_wine_quality-0.0.1-py3.6.egg" } 9 | ], 10 | "spark_python_task": { 11 | "python_file": "dbfs:/tmp/jobs/wine_quality/main_train_wine_quality.py", 12 | "parameters": [ "/Users/john.doe@acme.com/experiments/WineQuality", 0.3, 0.3, "/dbfs/tmp/jobs/wine_quality/wine-quality.csv", "run_submit_existing_cluster_egg" ] 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /sklearn/run_submit_new_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "run_name": "MLflow_RunSubmit_NewCluster", 3 | "new_cluster": { 4 | "spark_version": "5.1.x-scala2.11", 5 | "node_type_id": "i3.xlarge", 6 | "num_workers": 1 7 | }, 8 | "libraries": [ 9 | { "pypi": { "package": "mlflow" } }, 10 | { "egg": "dbfs:/tmp/jobs/wine_quality/mlflow_wine_quality-0.0.1-py3.6.egg" } 11 | ], 12 | "spark_python_task": { 13 | "python_file": "dbfs:/tmp/jobs/wine_quality/main_train_wine_quality.py", 14 | "parameters": [ "/Users/john.doe@acme.com/experiments/WineQuality", 0.3, 0.3, "/dbfs/tmp/jobs/wine_quality/wine-quality.csv", "run_submit_new_cluster_egg" ] 15 | }, 16 | "timeout_seconds": 3600 17 | } 18 | -------------------------------------------------------------------------------- /sklearn/scikit_predict.py: -------------------------------------------------------------------------------- 1 | 2 | # Serve predictions with mlflow.sklearn.load_model() 3 | 4 | from __future__ import print_function 5 | import sys 6 | import mlflow 7 | import mlflow.sklearn 8 | import util 9 | 10 | if __name__ == "__main__": 11 | if len(sys.argv) < 1: 12 | println("ERROR: Expecting RUN_ID PREDICTION_FILE") 13 | sys.exit(1) 14 | print("MLflow Version:", mlflow.version.VERSION) 15 | run_id = sys.argv[1] 16 | data_path = sys.argv[2] if len(sys.argv) > 2 else "data/wine-quality-red.csv" 17 | print("data_path:",data_path) 18 | print("run_id:",run_id) 19 | 20 | model = mlflow.sklearn.load_model("model", run_id=run_id) 21 | print("model:",model) 22 | 23 | df = util.read_prediction_data(data_path) 24 | predictions = model.predict(df) 25 | print("predictions:",predictions) 26 | -------------------------------------------------------------------------------- /sklearn/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='mlflow_wine_quality', 4 | version='0.0.1', 5 | description='mlflow_wine_quality', 6 | author='Andre', 7 | packages=['wine_quality'], 8 | zip_safe=False) 9 | -------------------------------------------------------------------------------- /sklearn/spark_udf_predict.py: -------------------------------------------------------------------------------- 1 | """ 2 | Serve predictions with Spark UDF. 
3 | """ 4 | from __future__ import print_function 5 | 6 | import sys 7 | from pyspark.sql import SparkSession 8 | import mlflow 9 | import mlflow.sklearn 10 | 11 | if __name__ == "__main__": 12 | path = sys.argv[2] if len(sys.argv) > 2 else "data/wine-quality-red.csv" 13 | run_id = sys.argv[1] 14 | print("path:",path) 15 | print("run_id=",run_id) 16 | print("MLflow Version:", mlflow.version.VERSION) 17 | 18 | spark = SparkSession.builder.appName("ServePredictions").getOrCreate() 19 | 20 | df = spark.read.option("inferSchema",True).option("header", True).csv(path) if path.endswith(".csv") \ 21 | else spark.read.option("multiLine",True).json(path) 22 | 23 | if "quality" in df.columns: 24 | df = df.drop("quality") 25 | df.show(10) 26 | 27 | udf = mlflow.pyfunc.spark_udf(spark, "model", run_id=run_id) 28 | df2 = df.withColumn("prediction", udf(*df.columns)) 29 | df2.show(10) 30 | df2.select("prediction").show(10) 31 | pred = df2.select("prediction").first()[0] 32 | print("predictions: {:,.7f}".format(pred)) 33 | 34 | -------------------------------------------------------------------------------- /sklearn/util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import pandas as pd 3 | 4 | def read_prediction_data(data_path): 5 | df = pd.read_csv(data_path) if data_path.endswith(".csv") else pd.read_json(data_path) 6 | #print("df.shape:",df.shape) 7 | #print("df.columns:",df.columns) 8 | if 'quality' in df: 9 | df = df.drop(['quality'], axis=1) 10 | return df 11 | -------------------------------------------------------------------------------- /sklearn/wine_quality/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amesar/mlflow-spark-summit-2019/3d4791f1defae41c2a6c570b379f129ed0ae59cb/sklearn/wine_quality/__init__.py -------------------------------------------------------------------------------- /sklearn/wine_quality/plot_utils.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools import cycle 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | def plot_enet_descent_path(X, y, l1_ratio, alphas_enet, coefs_enet, plot_file): 7 | fig = plt.figure(1) 8 | ax = plt.gca() 9 | 10 | colors = cycle(['b', 'r', 'g', 'c', 'k']) 11 | neg_log_alphas_enet = -np.log10(alphas_enet) 12 | for coef_e, c in zip(coefs_enet, colors): 13 | l2 = plt.plot(neg_log_alphas_enet, coef_e, linestyle='--', c=c) 14 | 15 | plt.xlabel('-Log(alpha)') 16 | plt.ylabel('coefficients') 17 | title = 'ElasticNet Path by alpha for l1_ratio = ' + str(l1_ratio) 18 | plt.title(title) 19 | plt.axis('tight') 20 | 21 | fig.savefig(plot_file) 22 | plt.close(fig) 23 | return fig 24 | -------------------------------------------------------------------------------- /sklearn/wine_quality/train.py: -------------------------------------------------------------------------------- 1 | # The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality 2 | # P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. 3 | # Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009. 
4 | 5 | from __future__ import print_function 6 | import os 7 | import sys 8 | import platform 9 | 10 | import pandas as pd 11 | import numpy as np 12 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.linear_model import ElasticNet, enet_path 15 | 16 | import mlflow 17 | import mlflow.sklearn 18 | from wine_quality import plot_utils 19 | 20 | print("MLflow Version:", mlflow.version.VERSION) 21 | print("MLflow Tracking URI:", mlflow.get_tracking_uri()) 22 | 23 | class Trainer(object): 24 | def __init__(self, experiment_name, data_path, run_origin="none"): 25 | self.experiment_name = experiment_name 26 | self.data_path = data_path 27 | self.run_origin = run_origin 28 | np.random.seed(40) 29 | 30 | print("experiment_name:",self.experiment_name) 31 | print("run_origin:",run_origin) 32 | 33 | # Read the wine-quality csv file 34 | print("data_path:",data_path) 35 | data = pd.read_csv(data_path) 36 | 37 | # Split the data into training and test sets. (0.75, 0.25) split. 38 | train, test = train_test_split(data) 39 | 40 | # The predicted column is "quality" which is a scalar from [3, 9] 41 | self.train_x = train.drop(["quality"], axis=1) 42 | self.test_x = test.drop(["quality"], axis=1) 43 | self.train_y = train[["quality"]] 44 | self.test_y = test[["quality"]] 45 | self.current_file = os.path.basename(__file__) 46 | 47 | self.X = data.drop(["quality"], axis=1).values 48 | self.y = data[["quality"]].values.ravel() 49 | 50 | # If using 'mlflow run' must use --experiment-id to set experiment since set_experiment() does not work 51 | if self.experiment_name != "none": 52 | mlflow.set_experiment(experiment_name) 53 | client = mlflow.tracking.MlflowClient() 54 | experiment_id = client.get_experiment_by_name(experiment_name).experiment_id 55 | print("experiment_id:",experiment_id) 56 | 57 | def eval_metrics(self, actual, pred): 58 | rmse = np.sqrt(mean_squared_error(actual, pred)) 59 | mae = mean_absolute_error(actual, pred) 60 | r2 = r2_score(actual, pred) 61 | return rmse, mae, r2 62 | 63 | def train(self, alpha, l1_ratio): 64 | with mlflow.start_run(source_name=self.current_file) as run: 65 | run_id = run.info.run_uuid 66 | print("run_id:",run_id) 67 | experiment_id = run.info.experiment_id 68 | print(" experiment_id:",experiment_id) 69 | clf = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42) 70 | clf.fit(self.train_x, self.train_y) 71 | 72 | predicted_qualities = clf.predict(self.test_x) 73 | (rmse, mae, r2) = self.eval_metrics(self.test_y, predicted_qualities) 74 | 75 | #print("Parameters:(alpha={}, l1_ratio={}):".format(alpha, l1_ratio)) 76 | print(" Parameters:") 77 | print(" alpha:",alpha) 78 | print(" l1_ratio:",l1_ratio) 79 | print(" Metrics:") 80 | print(" RMSE:",rmse) 81 | print(" MAE:",mae) 82 | print(" R2:",r2) 83 | 84 | mlflow.log_param("alpha", alpha) 85 | mlflow.log_param("l1_ratio", l1_ratio) 86 | 87 | mlflow.log_metric("rmse", rmse) 88 | mlflow.log_metric("r2", r2) 89 | mlflow.log_metric("mae", mae) 90 | 91 | mlflow.set_tag("data_path", self.data_path) 92 | mlflow.set_tag("exp_id", experiment_id) 93 | mlflow.set_tag("exp_name", self.experiment_name) 94 | mlflow.set_tag("run_origin", self.run_origin) 95 | mlflow.set_tag("platform", platform.system()) 96 | 97 | mlflow.sklearn.log_model(clf, "model") 98 | 99 | eps = 5e-3 # the smaller it is the longer is the path 100 | alphas_enet, coefs_enet, _ = enet_path(self.X, self.y, eps=eps, l1_ratio=l1_ratio, fit_intercept=False) 
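# enet_path computes the ElasticNet coefficients along the regularization path;
# the descent-path plot built from them below is attached to the run via mlflow.log_artifact.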
101 | plot_file = "wine_ElasticNet-paths.png" 102 | plot_utils.plot_enet_descent_path(self.X, self.y, l1_ratio, alphas_enet, coefs_enet, plot_file) 103 | mlflow.log_artifact(plot_file) 104 | 105 | return (experiment_id,run_id) 106 | --------------------------------------------------------------------------------