├── src ├── __init__.py ├── tests │ ├── train │ │ └── test_train.py │ ├── evaluate │ │ └── test_evaluate.py │ ├── pytest.ini │ ├── __pycache__ │ │ └── test_main.cpython-38-pytest-6.0.1.pyc │ ├── test_main.py │ ├── test_data │ │ └── testWeatherAUS_processed.csv │ ├── preprocess │ │ ├── test_data │ │ │ └── testWeatherAUS.csv │ │ └── test_preprocess.py │ └── model │ │ └── test_model.py ├── __pycache__ │ └── model.cpython-37.pyc ├── scripts │ ├── Scripts │ │ ├── requirements.txt │ │ ├── README.md │ │ ├── git_release_pipeline.sh │ │ ├── init_project.sh │ │ └── std_check.sh │ └── Pipelines │ │ ├── git_release_pipeline.py │ │ ├── README.md │ │ ├── model_update_deployment_pipeline.py │ │ ├── model_update_pipeline.py │ │ ├── model_redeploy_pipeline.py │ │ ├── model_deploy_pipeline.py │ │ ├── model_deployed_validate_pipeline.py │ │ ├── model_train_autoAI.py │ │ └── openscale.py ├── evaluate.py ├── train.py ├── preprocess_data.py └── model.py ├── .infra ├── .gitignore ├── .terraform │ ├── .gitignore │ ├── output.tf │ └── init_infra.tf └── datapak_manage.py ├── .pylintrc ├── models └── .gitignore ├── .dvc ├── .gitignore └── config ├── results ├── .gitignore └── metrics.json ├── data ├── .gitignore └── weatherAUS.csv.dvc ├── credentials.yaml.gpg ├── credentials_example.yaml ├── requirements.txt ├── metadata.yaml ├── .pre-commit-config.yaml ├── .github └── workflows │ ├── test_on_push.yaml │ ├── deploy_on_release.yaml │ └── train_evaluate.yaml ├── .gitignore ├── dvc.yaml ├── dvc.lock └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.infra/.gitignore: -------------------------------------------------------------------------------- 1 | configs -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [default] 2 | ignored-modules -------------------------------------------------------------------------------- /models/.gitignore: -------------------------------------------------------------------------------- 1 | /model.joblib 2 | -------------------------------------------------------------------------------- /.dvc/.gitignore: -------------------------------------------------------------------------------- 1 | /config.local 2 | /tmp 3 | /cache 4 | -------------------------------------------------------------------------------- /results/.gitignore: -------------------------------------------------------------------------------- 1 | /precision_recall_curve.png 2 | /roc_curve.png 3 | -------------------------------------------------------------------------------- /src/tests/train/test_train.py: -------------------------------------------------------------------------------- 1 | def test_train(): 2 | assert 1 + 1 == 2 3 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | /weatherAUS.csv 2 | /weatherAUS_processed.csv 3 | /features.csv 4 | -------------------------------------------------------------------------------- /src/tests/evaluate/test_evaluate.py: -------------------------------------------------------------------------------- 1 | def test_evaluate(): 2 | assert 1 + 1 == 2 3 | -------------------------------------------------------------------------------- /src/tests/pytest.ini: 
-------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | error 4 | ignore::UserWarning 5 | -------------------------------------------------------------------------------- /credentials.yaml.gpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlops-guide/dvc-gitactions/HEAD/credentials.yaml.gpg -------------------------------------------------------------------------------- /data/weatherAUS.csv.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: a65cf8b8719b1a65db4f361eeec18457 3 | size: 14094055 4 | path: weatherAUS.csv 5 | -------------------------------------------------------------------------------- /src/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlops-guide/dvc-gitactions/HEAD/src/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /results/metrics.json: -------------------------------------------------------------------------------- 1 | {"accuracy": 0.849730029073792, "recall": 0.9460718094560967, "precision": 0.8718998787799365, "f1": 0.9074727635415069} -------------------------------------------------------------------------------- /src/scripts/Scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | sklearn 2 | pandas 3 | seaborn 4 | matplotlib 5 | yaml 6 | joblib 7 | numpy 8 | importlib 9 | json 10 | -------------------------------------------------------------------------------- /src/scripts/Scripts/README.md: -------------------------------------------------------------------------------- 1 | # Project_init 2 | Script to create standardized structure for ml projects 3 | 4 | `` 5 | $ ./init_project.sh my-model v1 6 | `` 7 | -------------------------------------------------------------------------------- /.infra/.terraform/.gitignore: -------------------------------------------------------------------------------- 1 | #.terraform folder 2 | .terraform/ 3 | .terraform.lock.hcl 4 | output.json 5 | terraform.tfstate 6 | terraform.tfstate.backup 7 | infra_state.json -------------------------------------------------------------------------------- /credentials_example.yaml: -------------------------------------------------------------------------------- 1 | # Rename this file to credentials.yaml to be able to run the scripts 2 | 3 | url: "https://us-south.ml.cloud.ibm.com" 4 | apikey: "" 5 | space_id: "" 6 | -------------------------------------------------------------------------------- /src/tests/__pycache__/test_main.cpython-38-pytest-6.0.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlops-guide/dvc-gitactions/HEAD/src/tests/__pycache__/test_main.cpython-38-pytest-6.0.1.pyc -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn>=0.23 2 | pandas 3 | seaborn 4 | matplotlib 5 | joblib 6 | numpy 7 | ibm_watson_machine_learning 8 | pyyaml 9 | pytest 10 | pytest-dependency 11 | pre-commit 12 | -------------------------------------------------------------------------------- /.dvc/config: -------------------------------------------------------------------------------- 1 | 
[core] 2 | remote = remote-storage 3 | ['remote "remote-storage"'] 4 | url = s3://wine-bucket-test/rain_australia/ 5 | endpointurl = https://s3.us-south.cloud-object-storage.appdomain.cloud 6 | -------------------------------------------------------------------------------- /.infra/.terraform/output.tf: -------------------------------------------------------------------------------- 1 | output "cos_crn" { 2 | value = ibm_resource_instance.cos.crn 3 | } 4 | 5 | output "wml_name" { 6 | value = ibm_resource_instance.wml.name 7 | } 8 | 9 | output "wml_crn" { 10 | value = ibm_resource_instance.wml 11 | } 12 | -------------------------------------------------------------------------------- /metadata.yaml: -------------------------------------------------------------------------------- 1 | author: guipleite 2 | datetime_creted: 29/03/2021_13:46:23:802394723 3 | deployment_uid: e02e481d-4e56-470f-baa9-ae84a583c0a8 4 | model_type: scikit-learn_0.23 5 | model_uid: f29e4cfc-3aab-458a-b703-fabc265f43a3 6 | project_name: Rain_aus 7 | project_version: v0.3 8 | -------------------------------------------------------------------------------- /src/tests/test_main.py: -------------------------------------------------------------------------------- 1 | # General tests that comprehend general aspects of the code 2 | 3 | import pytest 4 | 5 | 6 | def capital_case(x): 7 | return x.capitalize() 8 | 9 | 10 | def test_capital_case(): 11 | assert capital_case("semaphore") == "Semaphore" 12 | -------------------------------------------------------------------------------- /src/tests/test_data/testWeatherAUS_processed.csv: -------------------------------------------------------------------------------- 1 | MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,WindGustDir_W,WindGustDir_WNW,WindDir9am_NNW,WindDir9am_W,WindDir3pm_WNW,WindDir3pm_WSW,RainTomorrow 2 | 13.4,22.9,0.6,44,20,24,71,22,1007.7,1007.1,16.9,21.8,0,1,0,0,1,1,0,0 3 | 7.4,25.1,0.0,44,4,22,44,25,1010.6,1007.8,17.2,24.3,0,0,1,1,0,0,1,0 4 | -------------------------------------------------------------------------------- /src/tests/preprocess/test_data/testWeatherAUS.csv: -------------------------------------------------------------------------------- 1 | Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow 2 | 2008-12-01,Albury,13.4,22.9,0.6,NA,NA,W,44,W,WNW,20,24,71,22,1007.7,1007.1,8,NA,16.9,21.8,No,No 3 | 2008-12-02,Albury,7.4,25.1,0,NA,NA,WNW,44,NNW,WSW,4,22,44,25,1010.6,1007.8,NA,NA,17.2,24.3,No,No -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | repos: 3 | - 4 | repo: https://github.com/ambv/black 5 | rev: 20.8b1 6 | hooks: 7 | - 8 | id: black 9 | language_version: python3 10 | 11 | - repo: local 12 | hooks: 13 | - id: python-tests 14 | name: pytests 15 | entry: pytest src/tests 16 | language: python 17 | additional_dependencies: [pre-commit, pytest, pandas, sklearn, matplotlib] 18 | always_run: true 19 | pass_filenames: false 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /src/scripts/Scripts/git_release_pipeline.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if ! python3 ./src/scripts/Pipelines/git_release_pipeline.py ./ 4 | then 5 | echo " Model has already been deployed, updating it" 6 | python3 ./src/scripts/Pipelines/model_update_pipeline.py ./models/model.joblib ./ ./credentials.yaml 7 | python3 ./src/scripts/Pipelines/model_update_deployment_pipeline.py ./ ./credentials.yaml 8 | else 9 | echo " Deploying model for the first time" 10 | python3 ./src/scripts/Pipelines/model_deploy_pipeline.py ./models/model.joblib ./ ./credentials.yaml 11 | fi 12 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/git_release_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import yaml 4 | 5 | """ 6 | Usage: 7 | python3 git_release_pipeline.py ../path/to/project/ 8 | 9 | """ 10 | 11 | PROJ_PATH = os.path.abspath(sys.argv[1]) 12 | META_PATH = PROJ_PATH + "/metadata.yaml" 13 | 14 | with open(META_PATH) as stream: 15 | try: 16 | metadata = yaml.safe_load(stream) 17 | except yaml.YAMLError as exc: 18 | print(exc) 19 | 20 | if "deployment_uid" in metadata.keys(): 21 | sys.exit(1) 22 | 23 | else: 24 | sys.exit(0) 25 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/README.md: -------------------------------------------------------------------------------- 1 | python3 model_train_pipeline.py ../../Test_Project/my-model_v1/src/breast_cancer.csv ../../Test_Project/my-model_v1/ 10 ./pickle_model 2 | 3 | 4 | python3 model_deploy_pipeline.py ./pickle_model ../../Test_Project/my-model_v1/ ../../credentials.yaml 5 | 6 | 7 | python3 model_deployed_validate_pipeline.py ../../Test_Project/my-model_v1/src/breast_cancer.csv ../../credentials.yaml ../../Test_Project/my-model_v1/ 8 | 9 | 10 | python3 model_update_pipeline.py ./pickle_model ../../credentials.yaml ../../Test_Project/my-model_v1/ 11 | 12 | 13 | python3 model_redeploy_pipeline.py ../../credentials.yaml 14 | 15 | 16 | python3 model_redeploy_pipeline.py ../../credentials.yaml 17 | -------------------------------------------------------------------------------- /src/scripts/Scripts/init_project.sh: -------------------------------------------------------------------------------- 1 | MODEL=$1 2 | VERSION=$2 3 | PROJECT_NAME="$1_$2" 4 | echo "Creating $MODEL $VERSION" 5 | mkdir -p -m 777 $PROJECT_NAME 6 | mkdir -p -m 777 $PROJECT_NAME/src 7 | touch $PROJECT_NAME/src/__init__.py 8 | touch $PROJECT_NAME/src/model.py 9 | mkdir -p -m 777 $PROJECT_NAME/notebooks 10 | mkdir -p -m 777 $PROJECT_NAME/tests 11 | echo "Created by: "$USERNAME 12 | now=$(date +%x_%H:%M:%S:%N) 13 | echo "At: "$now 14 | 15 | cat << EOF >./$PROJECT_NAME/metadata.yaml 16 | project_name: $MODEL 17 | project_version: $VERSION 18 | model_type: scikit-learn_0.23 19 | author: $USERNAME 20 | datetime_creted: $now 21 | EOF 22 | 23 | cp ./requirements.txt ./$PROJECT_NAME 24 | cp -avr ./scripts ./$PROJECT_NAME/src/ 25 | -------------------------------------------------------------------------------- /.github/workflows/test_on_push.yaml: -------------------------------------------------------------------------------- 1 | name: Python Package and Test 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.6] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up 
Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install pytest black 23 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 24 | - name: Test with pytest 25 | run: | 26 | pytest 27 | - name: Python Black 28 | run: | 29 | black . --check 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *.pyc 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | myenv/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ -------------------------------------------------------------------------------- /dvc.yaml: -------------------------------------------------------------------------------- 1 | stages: 2 | std_check: 3 | cmd: src/scripts/Scripts/std_check.sh ./ 4 | preprocess: 5 | cmd: python3 ./src/preprocess_data.py ./data/weatherAUS.csv 6 | deps: 7 | - ./src/preprocess_data.py 8 | - data/weatherAUS.csv 9 | outs: 10 | - ./data/weatherAUS_processed.csv 11 | - ./data/features.csv 12 | train: 13 | cmd: python3 ./src/train.py ./data/weatherAUS_processed.csv ./src/model.py 200 14 | deps: 15 | - ./data/weatherAUS_processed.csv 16 | - ./src/model.py 17 | - ./src/train.py 18 | outs: 19 | - ./models/model.joblib 20 | evaluate: 21 | cmd: python3 ./src/evaluate.py ./data/weatherAUS_processed.csv ./src/model.py ./models/model.joblib 22 | deps: 23 | - ./data/weatherAUS_processed.csv 24 | - ./models/model.joblib 25 | - ./src/evaluate.py 26 | - ./src/model.py 27 | outs: 28 | - ./results/precision_recall_curve.png 29 | - ./results/roc_curve.png 30 | metrics: 31 | - ./results/metrics.json: 32 | cache: false 33 | -------------------------------------------------------------------------------- /src/evaluate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import importlib.util 3 | import pickle 4 | import os 5 | import json 6 | 7 | # from sklearn.externals import joblib 8 | import joblib 9 | 10 | DATA_PATH = os.path.abspath(sys.argv[1]) 11 | # PROJ_PATH = os.path.abspath(sys.argv[2]) 12 | # MODEL_PATH = PROJ_PATH+"/src/model.py" 13 | MODEL_PATH = sys.argv[2] 14 | PICKLE_PATH = sys.argv[3] 15 | 16 | 17 | sys.path.insert(1, MODEL_PATH) 18 | 19 | 20 | def module_from_file(module_name, file_path): 21 | spec = importlib.util.spec_from_file_location(module_name, file_path) 22 | module = importlib.util.module_from_spec(spec) 23 | spec.loader.exec_module(module) 24 | return module 25 | 26 | 27 | model 
= module_from_file("model", MODEL_PATH) 28 | 29 | # with open(PICKLE_PATH, "rb") as file: 30 | # pipeline = pickle.load(file) 31 | pipeline = joblib.load(PICKLE_PATH) 32 | log_eval = model.evaluate(DATA_PATH, pipeline, "./results") 33 | 34 | with open("./results/metrics.json", "w") as outfile: 35 | json.dump(log_eval["metrics"], outfile) 36 | -------------------------------------------------------------------------------- /.infra/.terraform/init_infra.tf: -------------------------------------------------------------------------------- 1 | #### AUTH && PLUGIN 2 | 3 | terraform { 4 | required_providers { 5 | ibm = { 6 | source = "IBM-Cloud/ibm" 7 | version = "~> 1.12.0" 8 | } 9 | } 10 | } 11 | 12 | 13 | provider "ibm" {} 14 | 15 | #### RESOURCE GROUP 16 | 17 | data "ibm_resource_group" "group" { 18 | name = "fpe_insper" 19 | } 20 | 21 | #### Machine learning service 22 | resource "ibm_resource_instance" "wml" { 23 | name = "TESTE_TERRAFORM" 24 | service = "pm-20" 25 | plan = "lite" 26 | location = "us-south" 27 | resource_group_id = data.ibm_resource_group.group.id 28 | tags = ["TESTE", "TERRAFORM"] 29 | 30 | } 31 | 32 | #### Object storage 33 | 34 | resource "ibm_resource_instance" "cos" { 35 | name = "TESTE_COS" 36 | service = "cloud-object-storage" 37 | plan = "standard" 38 | location = "global" 39 | resource_group_id = data.ibm_resource_group.group.id 40 | tags = ["TERRAFORM", "TEST"] 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import importlib.util 3 | import pickle 4 | import os 5 | import json 6 | import joblib 7 | 8 | # import sklearn.external.joblib as extjoblib 9 | 10 | DATA_PATH = os.path.abspath(sys.argv[1]) 11 | # PROJ_PATH = os.path.abspath(sys.argv[2]) 12 | MODEL_PATH = sys.argv[2] 13 | PARAM = int(sys.argv[3]) 14 | 15 | sys.path.insert(1, MODEL_PATH) 16 | 17 | 18 | def module_from_file(module_name, file_path): 19 | spec = importlib.util.spec_from_file_location(module_name, file_path) 20 | module = importlib.util.module_from_spec(spec) 21 | spec.loader.exec_module(module) 22 | return module 23 | 24 | 25 | model = module_from_file("model", MODEL_PATH) 26 | 27 | if __name__ == "__main__": 28 | 29 | pipeline, log_train = model.train(DATA_PATH, PARAM) 30 | 31 | # if sys.argv[4]: 32 | # with open("./models/model.pkl", "wb") as file: 33 | # pickle.dump(pipeline[0], file) 34 | joblib.dump(pipeline, "./models/model.joblib") 35 | 36 | # log_eval = model.evaluate(DATA_PATH, pipeline, "./results") 37 | 38 | # with open("./results/metrics.json", "w") as outfile: 39 | # json.dump(log_eval["metrics"], outfile) 40 | -------------------------------------------------------------------------------- /.github/workflows/deploy_on_release.yaml: -------------------------------------------------------------------------------- 1 | name: model-deploy-on-release 2 | on: 3 | release: 4 | types: 5 | - 'created' 6 | 7 | jobs: 8 | run: 9 | runs-on: [ubuntu-latest] 10 | container: docker://dvcorg/cml-py3:latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: 'Deploy/Update on new release' 14 | shell: bash 15 | 16 | env: 17 | repo_token: ${{ secrets.GITHUB_TOKEN }} 18 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 19 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 20 | CRED_SECRET: ${{ secrets.IBM_CREDENTIALS_PASS }} 21 | 22 | run: | 23 | # Install requirements 24 | pip install -r requirements.txt 25 | 26 | # 
Pull data & run-cache from S3 and reproduce pipeline 27 | dvc pull --run-cache 28 | dvc repro 29 | 30 | # Decrypt credentials file 31 | gpg --quiet --batch --yes --decrypt --passphrase="$CRED_SECRET" --output credentials.yaml credentials.yaml.gpg 32 | 33 | # Check if there is a deployment already; if so, update it, otherwise deploy it for the first time 34 | ./src/scripts/Scripts/git_release_pipeline.sh 35 | 36 | -------------------------------------------------------------------------------- /.github/workflows/train_evaluate.yaml: -------------------------------------------------------------------------------- 1 | name: model-training-evaluate 2 | on: [push] 3 | jobs: 4 | run: 5 | runs-on: [ubuntu-latest] 6 | container: docker://dvcorg/cml-py3:latest 7 | steps: 8 | - uses: actions/checkout@v2 9 | - name: 'Train and Evaluate model' 10 | shell: bash 11 | env: 12 | repo_token: ${{ secrets.GITHUB_TOKEN }} 13 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 14 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 15 | run: | 16 | # Install requirements 17 | pip install -r requirements.txt 18 | 19 | # Pull data & run-cache from S3 and reproduce pipeline 20 | dvc pull --run-cache 21 | dvc repro 22 | 23 | # Report metrics 24 | echo "## Metrics" >> report.md 25 | git fetch --prune 26 | dvc metrics diff master --show-md >> report.md 27 | 28 | # Publish ROC and precision-recall plots 29 | echo -e "## Plots\n### ROC Curve" >> report.md 30 | cml-publish ./results/roc_curve.png --md >> report.md 31 | echo -e "\n### Precision and Recall Curve" >> report.md 32 | cml-publish ./results/precision_recall_curve.png --md >> report.md 33 | cml-send-comment report.md 34 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/model_update_deployment_pipeline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import yaml 4 | from ibm_watson_machine_learning import APIClient 5 | 6 | """ 7 | Usage: 8 | python3 model_update_deployment_pipeline.py ../path/to/project/ ../credentials.yaml 9 | 10 | """ 11 | 12 | PROJ_PATH = os.path.abspath(sys.argv[1]) 13 | CRED_PATH = os.path.abspath(sys.argv[2]) 14 | META_PATH = PROJ_PATH + "/metadata.yaml" 15 | 16 | with open(CRED_PATH) as stream: 17 | try: 18 | credentials = yaml.safe_load(stream) 19 | except yaml.YAMLError as exc: 20 | print(exc) 21 | 22 | with open(META_PATH) as stream: 23 | try: 24 | metadata = yaml.safe_load(stream) 25 | except yaml.YAMLError as exc: 26 | print(exc) 27 | 28 | wml_credentials = {"url": credentials["url"], "apikey": credentials["apikey"]} 29 | 30 | client = APIClient(wml_credentials) 31 | client.spaces.list() 32 | 33 | SPACE_ID = credentials["space_id"] 34 | 35 | if "deployment_uid" in metadata.keys(): 36 | MODEL_GUID = metadata["model_uid"] 37 | DEPLOYMENT_UID = metadata["deployment_uid"] 38 | print("\nExtracting DEPLOYMENT UID and MODEL GUID from metadata file\n") 39 | 40 | else: 41 | MODEL_GUID = input("MODEL GUID: ") 42 | DEPLOYMENT_UID = input("DEPLOYMENT UID: ") 43 | 44 | client.set.default_space(SPACE_ID) 45 | 46 | change_meta = {client.deployments.ConfigurationMetaNames.ASSET: {"id": MODEL_GUID}} 47 | 48 | print("Updating the deployment below: ") 49 | print(client.deployments.get_details(DEPLOYMENT_UID)) 50 | 51 | client.deployments.update(DEPLOYMENT_UID, change_meta) 52 | -------------------------------------------------------------------------------- /src/scripts/Scripts/std_check.sh: 
-------------------------------------------------------------------------------- 1 | PROJ_PATH=$1 2 | RED='\033[0;31m' 3 | GRE='\033[0;32m' 4 | NC='\033[0m' 5 | ER=0 6 | a=1 7 | 8 | if !([ -d "$PROJ_PATH" ]) 9 | then 10 | ER=$a 11 | echo -e "${RED}Error: Directory $PROJ_PATH does not exist.${NC}" 12 | else 13 | echo "Directory $PROJ_PATH exists." 14 | fi 15 | 16 | if !([ -f "$PROJ_PATH/metadata.yaml" ]) 17 | then 18 | ER=$a 19 | echo -e " ${RED}Error: File $PROJ_PATH/metadata.yaml does not exist.${NC}" 20 | else 21 | echo " File $PROJ_PATH/metadata.yaml exists." 22 | fi 23 | 24 | if !([ -d "$PROJ_PATH/src" ]) 25 | then 26 | ER=$a 27 | echo -e "${RED}Error: Directory $PROJ_PATH/src does not exist.${NC}" 28 | else 29 | echo "Directory $PROJ_PATH/src exists." 30 | fi 31 | 32 | if !([ -f "$PROJ_PATH/src/model.py" ]) 33 | then 34 | ER=$a 35 | echo -e " ${RED}Error: File $PROJ_PATH/src/model.py does not exist.${NC}" 36 | else 37 | echo " File $PROJ_PATH/src/model.py exists." 38 | fi 39 | if !([ -f "$PROJ_PATH/src/__init__.py" ]) 40 | then 41 | ER=$a 42 | echo -e " ${RED}Error: File $PROJ_PATH/src/__init__.py does not exist.${NC}" 43 | else 44 | echo " File $PROJ_PATH/src/__init__.py exists." 45 | fi 46 | 47 | 48 | 49 | if !([ -d "$PROJ_PATH/notebooks" ]) 50 | then 51 | ER=$a 52 | echo -e "${RED}Error: Directory $PROJ_PATH/notebooks does not exist.${NC}" 53 | else 54 | echo "Directory $PROJ_PATH/notebooks exists." 55 | fi 56 | 57 | printf "\n" 58 | if [ $ER == 1 ] 59 | then 60 | echo -e "${RED}Error: Project structure has been changed, please fix it \n${NC}" 61 | exit 0 62 | else 63 | echo -e "${GRE}Project structure is ok \n${NC}" 64 | 65 | fi 66 | 67 | 68 | if ! black ./$PROJ_PATH --check; then 69 | echo -e "${RED}Please run the command 'black' to format your files" 70 | exit 0 71 | else 72 | echo -e "${GRE}Files formatted, moving forward" 73 | fi 74 | 75 | 76 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/model_update_pipeline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import yaml 4 | import joblib 5 | from ibm_watson_machine_learning import APIClient 6 | 7 | """ 8 | Usage: 9 | python3 model_update_pipeline.py ./pickle_model ../path/to/project/ ../credentials.yaml 10 | 11 | """ 12 | 13 | MODEL_PATH = os.path.abspath(sys.argv[1]) 14 | PROJ_PATH = os.path.abspath(sys.argv[2]) 15 | CRED_PATH = os.path.abspath(sys.argv[3]) 16 | META_PATH = PROJ_PATH + "/metadata.yaml" 17 | 18 | with open(CRED_PATH) as stream: 19 | try: 20 | credentials = yaml.safe_load(stream) 21 | except yaml.YAMLError as exc: 22 | print(exc) 23 | 24 | with open(META_PATH) as stream: 25 | try: 26 | metadata = yaml.safe_load(stream) 27 | except yaml.YAMLError as exc: 28 | print(exc) 29 | 30 | with open(MODEL_PATH, "rb") as file: 31 | # pickle_model = pickle.load(file) 32 | pipeline = joblib.load(file) 33 | 34 | wml_credentials = {"url": credentials["url"], "apikey": credentials["apikey"]} 35 | 36 | client = APIClient(wml_credentials) 37 | client.spaces.list() 38 | 39 | SPACE_ID = credentials["space_id"] 40 | 41 | if "model_uid" in metadata.keys(): 42 | MODEL_GUID = metadata["model_uid"] 43 | print("\nExtracting MODEL GUID from metadata file\n") 44 | 45 | else: 46 | MODEL_GUID = input("MODEL GUID: ") 47 | 48 | client.set.default_space(SPACE_ID) 49 | 50 | print("\nCreating new version") 51 | 52 | published_model = client.repository.update_model( 53 | model_uid=MODEL_GUID, 54 | 
update_model=pipeline, 55 | updated_meta_props={ 56 | client.repository.ModelMetaNames.NAME: metadata["project_name"] 57 | + "_" 58 | + metadata["project_version"] 59 | }, 60 | ) 61 | 62 | new_model_revision = client.repository.create_model_revision(MODEL_GUID) 63 | 64 | rev_id = new_model_revision["metadata"].get("rev") 65 | print("\nversion", rev_id) 66 | 67 | client.repository.list_models_revisions(MODEL_GUID) 68 | -------------------------------------------------------------------------------- /src/tests/model/test_model.py: -------------------------------------------------------------------------------- 1 | # PyTest file for model.py 2 | 3 | import sys 4 | import os 5 | import pytest 6 | import pandas as pd 7 | 8 | # Parent Folder 9 | sys.path.append( 10 | os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) 11 | ) 12 | 13 | # Model Python file 14 | from model import get_variables 15 | 16 | FILE_NAME = "testWeatherAUS" 17 | PROCESSED_DATA_PATH = ( 18 | os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 19 | + "/test_data/" 20 | + FILE_NAME 21 | + "_processed.csv" 22 | ) 23 | 24 | 25 | @pytest.mark.parametrize( 26 | "expected_X,expected_y", 27 | [ 28 | ( 29 | { 30 | "MinTemp": {0: 13.4, 1: 7.4}, 31 | "MaxTemp": {0: 22.9, 1: 25.1}, 32 | "Rainfall": {0: 0.6, 1: 0.0}, 33 | "WindGustSpeed": {0: 44, 1: 44}, 34 | "WindSpeed9am": {0: 20, 1: 4}, 35 | "WindSpeed3pm": {0: 24, 1: 22}, 36 | "Humidity9am": {0: 71, 1: 44}, 37 | "Humidity3pm": {0: 22, 1: 25}, 38 | "Pressure9am": {0: 1007.7, 1: 1010.6}, 39 | "Pressure3pm": {0: 1007.1, 1: 1007.8}, 40 | "Temp9am": {0: 16.9, 1: 17.2}, 41 | "Temp3pm": {0: 21.8, 1: 24.3}, 42 | "RainToday": {0: 0, 1: 0}, 43 | "WindGustDir_W": {0: 1, 1: 0}, 44 | "WindGustDir_WNW": {0: 0, 1: 1}, 45 | "WindDir9am_NNW": {0: 0, 1: 1}, 46 | "WindDir9am_W": {0: 1, 1: 0}, 47 | "WindDir3pm_WNW": {0: 1, 1: 0}, 48 | "WindDir3pm_WSW": {0: 0, 1: 1}, 49 | }, 50 | [0, 0], 51 | ) 52 | ], 53 | ) 54 | def test_get_variables(expected_X, expected_y): 55 | 56 | # Open CSV as DF 57 | data = pd.read_csv(PROCESSED_DATA_PATH) 58 | 59 | # Run Function 60 | X, y = get_variables(data, "RainTomorrow") 61 | 62 | assert (X.to_dict(), y.to_list()) == (expected_X, expected_y) 63 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/model_redeploy_pipeline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import yaml 5 | from ibm_watson_machine_learning import APIClient 6 | 7 | """ 8 | Usage: 9 | python3 model_reploy_pipeline.py ../path/to/project/ ../credentials.yaml 10 | 11 | """ 12 | 13 | PROJ_PATH = os.path.abspath(sys.argv[1]) 14 | CRED_PATH = os.path.abspath(sys.argv[2]) 15 | META_PATH = PROJ_PATH + "/metadata.yaml" 16 | 17 | with open(CRED_PATH) as stream: 18 | try: 19 | credentials = yaml.safe_load(stream) 20 | except yaml.YAMLError as exc: 21 | print(exc) 22 | 23 | with open(META_PATH) as stream: 24 | try: 25 | metadata = yaml.safe_load(stream) 26 | except yaml.YAMLError as exc: 27 | print(exc) 28 | 29 | wml_credentials = {"url": credentials["url"], "apikey": credentials["apikey"]} 30 | 31 | client = APIClient(wml_credentials) 32 | client.spaces.list() 33 | 34 | SPACE_ID = credentials["space_id"] 35 | 36 | if "deployment_uid" in metadata.keys(): 37 | MODEL_GUID = metadata["model_uid"] 38 | DEPLOYMENT_UID = metadata["deployment_uid"] 39 | print("\nExtracting DEPLOYMENT UID and MODEL GUID from metadata file\n") 40 | 41 | else: 42 | 
MODEL_GUID = input("MODEL GUID: ") 43 | DEPLOYMENT_UID = input("DEPLOYMENT UID: ") 44 | 45 | client.set.default_space(SPACE_ID) 46 | 47 | client.repository.list_models_revisions(MODEL_GUID) 48 | 49 | MODEL_VERSION = input("MODEL VERSION: ") 50 | 51 | meta = { 52 | client.deployments.ConfigurationMetaNames.ASSET: { 53 | "id": MODEL_GUID, 54 | "rev": MODEL_VERSION, 55 | } 56 | } 57 | updated_deployment = client.deployments.update( 58 | deployment_uid=DEPLOYMENT_UID, changes=meta 59 | ) 60 | 61 | status = None 62 | while status not in ["ready", "failed"]: 63 | print(".", end=" ") 64 | time.sleep(2) 65 | deployment_details = client.deployments.get_details(DEPLOYMENT_UID) 66 | status = deployment_details["entity"]["status"].get("state") 67 | 68 | print("\nDeployment update finished with status: ", status) 69 | # print(deployment_details) 70 | -------------------------------------------------------------------------------- /dvc.lock: -------------------------------------------------------------------------------- 1 | schema: '2.0' 2 | stages: 3 | preprocess: 4 | cmd: python3 ./src/preprocess_data.py ./data/weatherAUS.csv 5 | deps: 6 | - path: ./src/preprocess_data.py 7 | md5: b5e571f866aa8993ad3bb844594e112e 8 | size: 1909 9 | - path: data/weatherAUS.csv 10 | md5: a65cf8b8719b1a65db4f361eeec18457 11 | size: 14094055 12 | outs: 13 | - path: ./data/features.csv 14 | md5: 49c2fbca9e0ae3101ae5bb56d6a4521a 15 | size: 19266775 16 | - path: ./data/weatherAUS_processed.csv 17 | md5: 59e89e62fb8f9face4901630d1de3e16 18 | size: 19507550 19 | train: 20 | cmd: python3 ./src/train.py ./data/weatherAUS_processed.csv ./src/model.py 200 21 | deps: 22 | - path: ./data/weatherAUS_processed.csv 23 | md5: 59e89e62fb8f9face4901630d1de3e16 24 | size: 19507550 25 | - path: ./src/model.py 26 | md5: 895596132410cf7e581953ecbdc9b44d 27 | size: 4485 28 | - path: ./src/train.py 29 | md5: 1b5c6c1786d40c9505b2261f11a3b274 30 | size: 1002 31 | outs: 32 | - path: ./models/model.joblib 33 | md5: 8cf64091db28e29b327baf946a796f27 34 | size: 3275 35 | evaluate: 36 | cmd: python3 ./src/evaluate.py ./data/weatherAUS_processed.csv ./src/model.py 37 | ./models/model.joblib 38 | deps: 39 | - path: ./data/weatherAUS_processed.csv 40 | md5: 59e89e62fb8f9face4901630d1de3e16 41 | size: 19507550 42 | - path: ./models/model.joblib 43 | md5: 8cf64091db28e29b327baf946a796f27 44 | size: 3275 45 | - path: ./src/evaluate.py 46 | md5: 7e466368d793d09316fc1e078111a9de 47 | size: 882 48 | - path: ./src/model.py 49 | md5: 895596132410cf7e581953ecbdc9b44d 50 | size: 4485 51 | outs: 52 | - path: ./results/metrics.json 53 | md5: 17cacf1c4e374794927b5bc143016e23 54 | size: 120 55 | - path: ./results/precision_recall_curve.png 56 | md5: bf5e1f1911560127be04aae88977b7a4 57 | size: 17045 58 | - path: ./results/roc_curve.png 59 | md5: 77346f3a6fb9f23410af073ac1670898 60 | size: 19933 61 | std_check: 62 | cmd: src/scripts/Scripts/std_check.sh ./ 63 | -------------------------------------------------------------------------------- /src/preprocess_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | from sklearn import preprocessing 5 | 6 | 7 | def count_nulls_by_line(df): 8 | return df.isnull().sum().sort_values(ascending=False) 9 | 10 | 11 | def null_percent_by_line(df): 12 | return (df.isnull().sum() / df.isnull().count()).sort_values(ascending=False) 13 | 14 | 15 | def preprocess_data(DATA_PATH): 16 | df = pd.read_csv(DATA_PATH) 17 | 18 | zeros_cnt = 
count_nulls_by_line(df) 19 | # df.isnull().sum().sort_values(ascending=False) 20 | percent_zeros = null_percent_by_line(df) 21 | # (df.isnull().sum() / df.isnull().count()).sort_values(ascending=False) 22 | 23 | missing_data = pd.concat( 24 | [zeros_cnt, percent_zeros], axis=1, keys=["Total", "Percent"] 25 | ) 26 | 27 | dropList = list(missing_data[missing_data["Percent"] > 0.15].index) 28 | 29 | df.drop(dropList, axis=1, inplace=True) 30 | df.drop(["Date"], axis=1, inplace=True) 31 | df.drop(["Location"], axis=1, inplace=True) 32 | 33 | ohe = pd.get_dummies(data=df, columns=["WindGustDir", "WindDir9am", "WindDir3pm"]) 34 | 35 | ohe["RainToday"] = df["RainToday"].astype(str) 36 | ohe["RainTomorrow"] = df["RainTomorrow"].astype(str) 37 | 38 | lb = preprocessing.LabelBinarizer() 39 | 40 | ohe["RainToday"] = lb.fit_transform(ohe["RainToday"]) 41 | ohe["RainTomorrow"] = lb.fit_transform(ohe["RainTomorrow"]) 42 | ohe = ohe.dropna() 43 | precessed_df = ohe 44 | 45 | y = ohe["RainTomorrow"] 46 | X = ohe.drop(["RainTomorrow"], axis=1) 47 | 48 | cols = precessed_df.columns.tolist() 49 | cols.remove("RainTomorrow") 50 | cols.append("RainTomorrow") 51 | precessed_df = precessed_df[cols] 52 | 53 | cols = precessed_df.columns.tolist() 54 | 55 | features_df = precessed_df.drop(["RainTomorrow"], axis=1) 56 | features_df.to_csv("./data/features.csv", index=False) 57 | 58 | precessed_df.to_csv(DATA_PATH[:-4] + "_processed.csv", index=False) 59 | 60 | 61 | if __name__ == "__main__": 62 | DATA_PATH = os.path.abspath(sys.argv[1]) 63 | preprocess_data(DATA_PATH) 64 | print("Saved to {}".format(DATA_PATH[:-4] + "_processed.csv")) 65 | -------------------------------------------------------------------------------- /src/tests/preprocess/test_preprocess.py: -------------------------------------------------------------------------------- 1 | # PyTest file for all preprocessing of data 2 | 3 | import io 4 | import builtins 5 | import pytest 6 | import pandas as pd 7 | import sys 8 | import os 9 | 10 | # Parent Folder 11 | sys.path.append( 12 | os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) 13 | ) 14 | 15 | # Preprocess Python file 16 | import preprocess_data 17 | 18 | FILE_NAME = "testWeatherAUS" 19 | DATA_PATH = ( 20 | os.path.dirname(os.path.realpath(__file__)) + "/test_data/" + FILE_NAME + ".csv" 21 | ) 22 | PROCESSED_DATA_PATH = ( 23 | os.path.dirname(os.path.realpath(__file__)) 24 | + "/test_data/" 25 | + FILE_NAME 26 | + "_processed.csv" 27 | ) 28 | 29 | 30 | def test_count_nulls_by_line(): 31 | # Tests function that counts number of nulls by line on a dataframe 32 | data = pd.DataFrame([[0, 2], [0, 1], [6, None]]) 33 | assert preprocess_data.count_nulls_by_line(data).to_list() == [1, 0] 34 | 35 | 36 | def test_null_percent(): 37 | # Tests function that gets the percentage of nulls by line on a dataframe 38 | data = pd.DataFrame([[0, 2], [1, None]]) 39 | assert preprocess_data.null_percent_by_line(data).to_list() == [0.5, 0] 40 | 41 | 42 | # @pytest.mark.dependency() 43 | # def test_preprocess(): 44 | # # Checks if running the preprocess function returns an error 45 | # preprocess_data.preprocess_data(DATA_PATH) 46 | 47 | 48 | # @pytest.mark.dependency(depends=["test_preprocess"]) 49 | # def test_processed_file_created(): 50 | # # Checks if the processed file was created during test_preprocess() and is accessible 51 | # f = open(PROCESSED_DATA_PATH) 52 | 53 | 54 | # @pytest.mark.dependency(depends=["test_processed_file_created"]) 55 | # def test_processed_file_format(): 56 | # # 
Checks if the processed file is in the correct format (.csv) and can be transformed in dataframe 57 | # try: 58 | # pd.read_csv(PROCESSED_DATA_PATH) 59 | # except: 60 | # raise RuntimeError("Unable to open " + PROCESSED_DATA_PATH + " as dataframe") 61 | 62 | 63 | @pytest.fixture(scope="session", autouse=True) 64 | def cleanup(request): 65 | # Runs tests then cleans up the processed file 66 | yield 67 | try: 68 | os.remove(PROCESSED_DATA_PATH) 69 | except: 70 | pass 71 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/model_deploy_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pickle 4 | import yaml 5 | import joblib 6 | from ibm_watson_machine_learning import APIClient 7 | 8 | """ 9 | Usage: 10 | python3 model_deploy_pipeline.py ./pickle_model ../path/to/project/ ../credentials.yaml 11 | 12 | """ 13 | 14 | MODEL_PATH = os.path.abspath(sys.argv[1]) 15 | PROJ_PATH = os.path.abspath(sys.argv[2]) 16 | CRED_PATH = os.path.abspath(sys.argv[3]) 17 | META_PATH = PROJ_PATH + "/metadata.yaml" 18 | 19 | 20 | with open(CRED_PATH) as stream: 21 | try: 22 | credentials = yaml.safe_load(stream) 23 | except yaml.YAMLError as exc: 24 | print(exc) 25 | 26 | 27 | with open(META_PATH) as stream: 28 | try: 29 | metadata = yaml.safe_load(stream) 30 | except yaml.YAMLError as exc: 31 | print(exc) 32 | 33 | with open(MODEL_PATH, "rb") as file: 34 | # pickle_model = pickle.load(file) 35 | pipeline = joblib.load(file) 36 | 37 | wml_credentials = {"url": credentials["url"], "apikey": credentials["apikey"]} 38 | 39 | client = APIClient(wml_credentials) 40 | client.spaces.list() 41 | 42 | MODEL_NAME = metadata["project_name"] + "_" + metadata["project_version"] 43 | DEPLOY_NAME = MODEL_NAME + "-Deployment" 44 | MODEL = pipeline 45 | SPACE_ID = credentials["space_id"] 46 | 47 | client.set.default_space(SPACE_ID) 48 | 49 | model_props = { 50 | client.repository.ModelMetaNames.NAME: MODEL_NAME, 51 | client.repository.ModelMetaNames.TYPE: metadata["model_type"], 52 | client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: client.software_specifications.get_id_by_name( 53 | "default_py3.7" 54 | ), 55 | } 56 | 57 | model_details = client.repository.store_model(model=MODEL, meta_props=model_props) 58 | model_uid = client.repository.get_model_uid(model_details) 59 | 60 | deployment_props = { 61 | client.deployments.ConfigurationMetaNames.NAME: DEPLOY_NAME, 62 | client.deployments.ConfigurationMetaNames.ONLINE: {}, 63 | } 64 | 65 | deployment = client.deployments.create( 66 | artifact_uid=model_uid, meta_props=deployment_props 67 | ) 68 | 69 | deployment_uid = client.deployments.get_uid(deployment) 70 | 71 | metadata["model_uid"] = model_uid 72 | metadata["deployment_uid"] = deployment_uid 73 | 74 | f = open(META_PATH, "w+") 75 | yaml.dump(metadata, f, allow_unicode=True) 76 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/model_deployed_validate_pipeline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import yaml 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | from sklearn.metrics import confusion_matrix, accuracy_score 7 | from sklearn.model_selection import cross_val_score 8 | from ibm_watson_machine_learning import APIClient 9 | from sklearn.model_selection import train_test_split 10 | 11 | """ 12 | Usage: 13 | python3 model_deployed_validate_pipeline.py 
../../ ../../credentials.yaml path/to/project/ 14 | 15 | """ 16 | 17 | DATA_PATH = os.path.abspath(sys.argv[1]) 18 | CRED_PATH = os.path.abspath(sys.argv[2]) 19 | PROJ_PATH = os.path.abspath(sys.argv[3]) 20 | META_PATH = PROJ_PATH + "/metadata.yaml" 21 | 22 | 23 | def main(): 24 | with open(CRED_PATH) as stream: 25 | try: 26 | credentials = yaml.safe_load(stream) 27 | except yaml.YAMLError as exc: 28 | print(exc) 29 | 30 | with open(META_PATH) as stream: 31 | try: 32 | metadata = yaml.safe_load(stream) 33 | except yaml.YAMLError as exc: 34 | print(exc) 35 | 36 | data = pd.read_csv(DATA_PATH) 37 | 38 | X = data.iloc[:, :-1] 39 | y = data[data.columns[-1]] 40 | X_train, X_test, y_train, y_test = train_test_split( 41 | X, y, test_size=0.3, random_state=0 42 | ) 43 | 44 | wml_credentials = {"url": credentials["url"], "apikey": credentials["apikey"]} 45 | 46 | client = APIClient(wml_credentials) 47 | client.spaces.list() 48 | 49 | SPACE_ID = credentials["space_id"] 50 | 51 | if "deployment_uid" in metadata.keys(): 52 | DEPLOYMENT_UID = metadata["deployment_uid"] 53 | print("\nExtracting DEPLOYMENT UID from metadata file\n") 54 | 55 | else: 56 | DEPLOYMENT_UID = input("DEPLOYMENT UID: ") 57 | 58 | client.set.default_space(SPACE_ID) 59 | 60 | payload = { 61 | "input_data": [ 62 | { 63 | "fields": X.columns.to_numpy().tolist(), 64 | "values": X_test.to_numpy().tolist(), 65 | } 66 | ] 67 | } 68 | result = client.deployments.score(DEPLOYMENT_UID, payload) 69 | 70 | pred_values = np.squeeze(result["predictions"][0]["values"]) 71 | y_pred_values = [i[0] for i in pred_values] 72 | 73 | def comb_eval(y, y_pred): 74 | cm = confusion_matrix(y, y_pred) 75 | acc = accuracy_score(y, y_pred) 76 | 77 | return {"cm": cm, "acc": acc} 78 | 79 | eval = comb_eval(y_test, y_pred_values) 80 | print(eval) 81 | 82 | return eval 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🧬 DVC CI/CD MLOps Pipeline 2 | MLOps pipeline with DVC and CML using Github Actions and IBM Cloud 3 | 4 | 5 | [![model-deploy-on-release](https://github.com/MLOPsStudyGroup/dvc-gitactions/actions/workflows/deploy_on_release.yaml/badge.svg)](https://github.com/MLOPsStudyGroup/dvc-gitactions/actions/workflows/deploy_on_release.yaml) 6 | [![Python Package and Test](https://github.com/MLOPsStudyGroup/dvc-gitactions/actions/workflows/test_on_push.yaml/badge.svg)](https://github.com/MLOPsStudyGroup/dvc-gitactions/actions/workflows/test_on_push.yaml) 7 | 8 | [Video Demo](https://www.youtube.com/watch?v=URpGaE-FA5U) 9 | 10 | [Documentation and Implementation Guide](https://mlops-guide.github.io) 11 | 12 | ## 🔰 Milestones 13 | - [X] Data Versioning: DVC 14 | - [X] Machine Learning Pipeline: DVC Pipeline (preprocess, train, evaluate) 15 | - [X] CI/CD: Unit testing with Pytest, pre-commit and Github Actions 16 | - [X] CML: Continuous Machine Learning and Github Actions 17 | - [X] Deploy on release: Github Actions and IBM Watson 18 | - [X] Monitoring: OpenScale 19 | - [X] Infrastructure-as-a-code: Terraform script 20 | 21 | ## 📋 Requirements 22 | 23 | * DVC 24 | * Python3 and pip 25 | * Access to IBM Cloud Object Storage 26 | 27 | ## 🏃🏻 Running Project 28 | 29 | ### 🔑 Setup IBM Bucket Credentials 30 | 31 | #### MacOS 32 | Setup your credentials on ```~/.aws/credentials``` and ```~/.aws/config```. 
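A minimal ```~/.aws/config``` can look like the sketch below; the profile name and region value are only illustrative, since the storage endpoint itself is configured in ```.dvc/config```:

```
[default]
region = us-south
```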
DVC works with IBM Cloud Object Storage through the S3 protocol, as you can see in other parts of this repository. 33 | 34 | 35 | ~/.aws/credentials 36 | 37 | ```credentials 38 | [default] 39 | aws_access_key_id = {{Key ID}} 40 | aws_secret_access_key = {{Access Key}} 41 | ``` 42 | 43 | 44 | ### ✅ Pre-commit Tests 45 | 46 | To activate pre-commit testing you need ```pre-commit``` 47 | 48 | Installing pre-commit with pip 49 | ``` 50 | pip install pre-commit 51 | ``` 52 | 53 | Installing pre-commit on your local repository. Keep in mind this creates a Git hook. 54 | ``` 55 | pre-commit install 56 | ``` 57 | 58 | Now every time you make a commit, it will run the tests defined in ```.pre-commit-config.yaml``` before allowing your commit. 59 | 60 | **Example** 61 | ``` 62 | $ git commit -m "Example commit" 63 | 64 | black....................................................................Passed 65 | pytest-check.............................................................Passed 66 | ``` 67 | 68 | 69 | ### ⚗️ Using DVC 70 | 71 | Download data from the DVC remote (analogous to ```git pull```) 72 | ``` 73 | dvc pull 74 | ``` 75 | 76 | Reproduce the pipeline using DVC 77 | ``` 78 | dvc repro 79 | ``` 80 | 81 | 82 | ### ⚙️ DVC Pipelines 83 | 84 | 85 | ✂️ Preprocessing pipeline 86 | ``` 87 | dvc run -n preprocess -d ./src/preprocess_data.py -d data/weatherAUS.csv \ 88 | -o ./data/weatherAUS_processed.csv -o ./data/features.csv \ 89 | python3 ./src/preprocess_data.py ./data/weatherAUS.csv 90 | ``` 91 | 92 | 93 | 📘 Training pipeline 94 | ``` 95 | dvc run -n train -d ./src/train.py -d ./data/weatherAUS_processed.csv \ 96 | -d ./src/model.py \ 97 | -o ./models/model.joblib \ 98 | python3 ./src/train.py ./data/weatherAUS_processed.csv ./src/model.py 200 99 | ``` 100 | 101 | 102 | 📊 Evaluate pipeline 103 | ``` 104 | dvc run -n evaluate -d ./src/evaluate.py -d ./data/weatherAUS_processed.csv \ 105 | -d ./src/model.py -d ./models/model.joblib -o ./results/metrics.json \ 106 | -o ./results/precision_recall_curve.png -o ./results/roc_curve.png \ 107 | python3 ./src/evaluate.py ./data/weatherAUS_processed.csv ./src/model.py ./models/model.joblib 108 | ``` 109 | 110 | ### 🐙 GitHub Actions 111 | 🔐 IBM Credentials 112 | 113 | 114 | Fill the ```credentials_example.yaml``` file and rename it to ```credentials.yaml``` to be able to run the scripts that require IBM keys. ⚠️ Never upload this file to GitHub! 115 | 116 | To use GitHub Actions to deploy your model, you'll need to encrypt this file. To do that, run the command below and choose a strong password. 
117 | 118 | ``` 119 | gpg --symmetric --cipher-algo AES256 credentials.yaml 120 | ``` 121 | Now in the GitHub page for the repository, go to ```Settings->Secrets``` and add the keys to the following secrets: 122 | 123 | ``` 124 | AWS_ACCESS_KEY_ID (Bucket Credential) 125 | AWS_SECRET_ACCESS_KEY (Bucket Credential) 126 | IBM_CREDENTIALS_PASS (password for the encrypted file) 127 | ``` 128 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/model_train_autoAI.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import yaml 4 | import pandas as pd 5 | 6 | # from ibm_watson_machine_learning.helpers import DataConnection, S3Connection, S3Location 7 | from ibm_watson_machine_learning.experiment import AutoAI 8 | from ibm_watson_machine_learning.autoai.helpers.connections import ( 9 | S3Connection, 10 | S3Location, 11 | DataConnection, 12 | ) 13 | 14 | DATA_PATH = os.path.abspath(sys.argv[1]) 15 | CRED_PATH = os.path.abspath(sys.argv[2]) 16 | PROJ_PATH = os.path.abspath(sys.argv[3]) 17 | META_PATH = PROJ_PATH + "/metadata.yaml" 18 | 19 | 20 | with open(CRED_PATH) as stream: 21 | try: 22 | credentials = yaml.safe_load(stream) 23 | except yaml.YAMLError as exc: 24 | print(exc) 25 | 26 | with open(META_PATH) as stream: 27 | try: 28 | metadata = yaml.safe_load(stream) 29 | except yaml.YAMLError as exc: 30 | print(exc) 31 | 32 | wml_credentials = {"url": credentials["url"], "apikey": credentials["apikey"]} 33 | 34 | SPACE_ID = credentials["space_id"] 35 | 36 | data = pd.read_csv(DATA_PATH) 37 | 38 | X = data.iloc[:, :-1] 39 | y = data[data.columns[-1]] 40 | cols = data.columns.tolist() 41 | TARGET = cols[-1] 42 | AUTOAI_ENDPOINT = "auto_ml/a4243da5-a8b0-4e6a-8273-13c161f7e117/wml_data/d8ab3fe3-17a2-4474-b702-9c8309586a40" 43 | 44 | experiment = AutoAI( 45 | wml_credentials=wml_credentials, 46 | # project_id=credentials['project_id'], 47 | space_id=credentials["space_id"], 48 | ) 49 | 50 | 51 | pipeline_optimizer = experiment.optimizer( 52 | name=metadata["project_name"], 53 | desc="", 54 | prediction_type=AutoAI.PredictionType.BINARY, 55 | prediction_column=TARGET, 56 | scoring=AutoAI.Metrics.ACCURACY_SCORE, 57 | test_size=0.2, 58 | max_num_daub_ensembles=1, 59 | train_sample_rows_test_size=1.0, 60 | daub_include_only_estimators=[ 61 | AutoAI.ClassificationAlgorithms.XGB, 62 | AutoAI.ClassificationAlgorithms.LGBM, 63 | ], 64 | cognito_transform_names=[AutoAI.Transformers.SUM, AutoAI.Transformers.MAX], 65 | ) 66 | 67 | 68 | # note: this DataConnection will be used as a reference where to find your training dataset 69 | training_data_connection = DataConnection( 70 | connection=S3Connection( 71 | endpoint_url="url of the COS endpoint", 72 | access_key_id="COS access key id", 73 | secret_access_key="COS secret acces key", 74 | ), 75 | location=S3Location( 76 | bucket="bucket_name", # note: COS bucket name where training dataset is located 77 | path="my_path", # note: path within bucket where your training dataset is located 78 | ), 79 | ) 80 | 81 | # note: this DataConnection will be used as a reference where to save all of the AutoAI experiment results 82 | results_connection = DataConnection( 83 | connection=S3Connection( 84 | endpoint_url="url of the COS endpoint", 85 | access_key_id="COS access key id", 86 | secret_access_key="COS secret acces key", 87 | ), 88 | # note: bucket name and path could be different or the same as specified in the training_data_connection 89 | 
location=S3Location(bucket="bucket_name", path="my_path"), 90 | ) 91 | 92 | # training_data_connection = [DataConnection( 93 | # connection=S3Connection( 94 | # api_key=credentials['s3_apikey'], 95 | # auth_endpoint='https://iam.bluemix.net/oidc/token/', 96 | # endpoint_url='https://s3-api.us-geo.objectstorage.softlayer.net' 97 | # ), 98 | # location=S3Location( 99 | # bucket=credentials['s3_bucket'], 100 | # path=DATA_PATH 101 | # )) 102 | # ] 103 | # results_connection = DataConnection( 104 | # connection=S3Connection( 105 | # api_key=credentials['s3_apikey'], 106 | # auth_endpoint='https://iam.bluemix.net/oidc/token/', 107 | # endpoint_url='https://s3-api.us-geo.objectstorage.softlayer.net' 108 | # ), 109 | # location=S3Location( 110 | # bucket=credentials['s3_bucket'], 111 | # path=AUTOAI_ENDPOINT+'/data/automl', 112 | # model_location=AUTOAI_ENDPOINT+'/data/automl/cognito_output/Pipeline1/model.pickle', 113 | # training_status=AUTOAI_ENDPOINT+'/training-status.json' 114 | # )) 115 | 116 | fit_details = pipeline_optimizer.fit( 117 | training_data_reference=[training_data_connection], 118 | training_results_reference=results_connection, 119 | background_mode=True, 120 | ) 121 | 122 | 123 | status = pipeline_optimizer.get_run_status() 124 | print(status) 125 | 126 | run_details = pipeline_optimizer.get_run_details() 127 | 128 | results = pipeline_optimizer.summary() 129 | print(results) 130 | -------------------------------------------------------------------------------- /.infra/datapak_manage.py: -------------------------------------------------------------------------------- 1 | """ 2 | DataPak deployment space manage script 3 | 4 | """ 5 | 6 | import os 7 | import sys 8 | from pprint import pprint 9 | import json 10 | from ibm_watson_machine_learning import APIClient 11 | 12 | TERRAFORM_OUTPUT = ".terraform/terraform.tfstate" 13 | 14 | 15 | def authentication(): 16 | 17 | if os.getenv("IBMCLOUD_API_KEY"): 18 | 19 | wml_credentials = { 20 | "url": "https://us-south.ml.cloud.ibm.com", 21 | "apikey": os.environ.get("IBMCLOUD_API_KEY"), 22 | } 23 | client = APIClient(wml_credentials) # Connect to IBM cloud 24 | 25 | return client 26 | 27 | raise Exception("API_KEY environment variable not defined") 28 | 29 | 30 | def terraform_output(terraform_path=TERRAFORM_OUTPUT): 31 | 32 | output = dict(json.load(open(terraform_path)))["outputs"] 33 | 34 | cos_crn = output["cos_crn"]["value"] 35 | wml_crn = output["wml_crn"]["value"]["crn"] 36 | wml_name = output["wml_crn"]["value"]["resource_name"] 37 | 38 | state = {"cos_crn": cos_crn, "wml_name": wml_name, "wml_crn": wml_crn} 39 | return state 40 | 41 | 42 | def create_deployment_space( 43 | client, cos_crn, wml_name, wml_crn, space_name="default", description="" 44 | ): 45 | 46 | metadata = { 47 | client.spaces.ConfigurationMetaNames.NAME: space_name, ## Project info 48 | client.spaces.ConfigurationMetaNames.DESCRIPTION: description, 49 | client.spaces.ConfigurationMetaNames.STORAGE: { 50 | "type": "bmcos_object_storage", 51 | "resource_crn": cos_crn, 52 | }, 53 | client.spaces.ConfigurationMetaNames.COMPUTE: { ## Project compute instance (WML) 54 | "name": wml_name, 55 | "crn": wml_crn, 56 | }, 57 | } 58 | 59 | space_details = client.spaces.store(meta_props=metadata) # Create a space 60 | return space_details 61 | 62 | 63 | def update_deployment_space(client, new_name, space_id): 64 | 65 | metadata = {client.spaces.ConfigurationMetaNames.NAME: new_name} 66 | 67 | space_details = client.spaces.update(space_id, changes=metadata) 68 | return 
space_details 69 | 70 | 71 | def delete_deployment_space(client, space_id): 72 | 73 | client.spaces.delete(space_id) 74 | 75 | 76 | def list_deployment_space(client): 77 | spaces = client.spaces.list() 78 | print(spaces) 79 | 80 | 81 | def describe_deployment_space(client, space_id): 82 | info = client.spaces.get_details(space_id) 83 | pprint(info) 84 | 85 | 86 | def help(): 87 | 88 | print( 89 | """ 90 | datapak_config.py [options] 91 | 92 | create 93 | update 94 | delete 95 | list 96 | describe 97 | """ 98 | ) 99 | 100 | 101 | if __name__ == "__main__": 102 | 103 | client = authentication() 104 | 105 | args = sys.argv[1:] 106 | 107 | if len(args) >= 1: 108 | action = args[0] 109 | 110 | if action == "create": 111 | 112 | infos = terraform_output() 113 | if len(args) == 2: 114 | space_name = args[1] 115 | space = create_deployment_space( 116 | client, 117 | infos["cos_crn"], 118 | infos["wml_name"], 119 | infos["wml_crn"], 120 | space_name, 121 | ) 122 | 123 | elif len(args) > 2: 124 | space_name = args[1] 125 | description = args[2] 126 | space = create_deployment_space( 127 | client, 128 | infos["cos_crn"], 129 | infos["wml_name"], 130 | infos["wml_crn"], 131 | space_name, 132 | description, 133 | ) 134 | 135 | pprint(space) 136 | 137 | elif action == "update": 138 | 139 | try: 140 | new_name = args[1] 141 | space_id = args[2] 142 | except: 143 | raise Exception("Missing arguments") 144 | 145 | space = update_deployment_space(client, new_name, space_id) 146 | pprint(space) 147 | 148 | elif action == "delete": 149 | try: 150 | space_id = args[1] 151 | except: 152 | raise Exception("Missing space_id") 153 | 154 | delete_deployment_space(client, space_id) 155 | 156 | elif action == "list": 157 | list_deployment_space(client) 158 | 159 | elif action == "describe": 160 | 161 | try: 162 | space_id = args[1] 163 | except: 164 | raise Exception("Missing space_id") 165 | 166 | describe_deployment_space(client, space_id) 167 | 168 | else: 169 | help() 170 | 171 | else: 172 | help() 173 | -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sklearn 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.metrics import accuracy_score, f1_score 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.ensemble import RandomForestClassifier 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.model_selection import cross_val_score 11 | from sklearn.metrics import confusion_matrix 12 | from sklearn.metrics import accuracy_score 13 | from sklearn.metrics import precision_score, recall_score, f1_score 14 | from sklearn.metrics import precision_recall_curve 15 | from sklearn.metrics import roc_curve 16 | from sklearn.metrics import roc_auc_score 17 | import matplotlib.pyplot as plt 18 | 19 | 20 | def get_variables(data, column): 21 | # Seperating the dependant and independant variables 22 | y = data[column] 23 | X = data.drop([column], axis=1) 24 | 25 | return X, y 26 | 27 | 28 | def train(data, num_estimators, isDataFrame=False): 29 | 30 | if not isDataFrame: 31 | data = pd.read_csv(data) 32 | 33 | # Seperating the dependant and independant variables 34 | # y = data["RainTomorrow"] 35 | # X = data.drop(["RainTomorrow"], axis=1) 36 | 37 | X, y = get_variables(data, "RainTomorrow") 38 | 39 | X_train, 
X_test, y_train, y_test = train_test_split(
40 |         X, y, test_size=0.3, random_state=0
41 |     )
42 | 
43 |     pipe = Pipeline(
44 |         [
45 |             ("scaler", StandardScaler()),
46 |             (
47 |                 "RFC",
48 |                 RandomForestClassifier(
49 |                     criterion="gini",
50 |                     max_depth=10,
51 |                     max_features="sqrt",  # "auto" was a deprecated alias for "sqrt" and is removed in newer scikit-learn
52 |                     n_estimators=num_estimators,
53 |                 ),
54 |             ),
55 |         ]
56 |     )
57 | 
58 |     training_logs = pipe.fit(X_train, y_train)
59 | 
60 |     logs = {"training_logs": training_logs}
61 | 
62 |     return pipe, logs
63 | 
64 | 
65 | def evaluate(data, pipeline, OUTPUT_PATH, isDataFrame=False):
66 | 
67 |     pipe = pipeline
68 | 
69 |     if not isDataFrame:
70 |         data = pd.read_csv(data)
71 | 
72 |     y = data["RainTomorrow"]
73 |     X = data.drop(["RainTomorrow"], axis=1)
74 | 
75 |     X_train, X_test, y_train, y_test = train_test_split(
76 |         X, y, test_size=0.3, random_state=0
77 |     )
78 | 
79 |     # metrics
80 |     def comb_eval(y, y_pred):
81 |         acc = accuracy_score(y, y_pred)
82 |         recall = recall_score(y, y_pred)
83 |         precision = precision_score(y, y_pred)
84 |         f1 = f1_score(y, y_pred)
85 | 
86 |         return {"accuracy": acc, "recall": recall, "precision": precision, "f1": f1}
87 | 
88 |     # y_pred_train = pipe.predict(X_train)
89 |     # train_result = comb_eval(y_train, y_pred_train)
90 | 
91 |     y_pred_test = pipe.predict(X_test)
92 |     test_result = comb_eval(y_test, y_pred_test)
93 | 
94 |     # cvs = cross_val_score(pipe, X, y, cv=3)
95 | 
96 |     # roc curve
97 |     # y_pred = pipe.predict(X_test)
98 | 
99 |     dummy_probs = [0 for _ in range(len(y_test))]
100 |     model_probs = pipe.predict_proba(X_test)
101 |     model_probs = model_probs[:, 1]
102 | 
103 |     # model_auc = roc_auc_score(y_test, model_probs)
104 | 
105 |     dummy_fpr, dummy_tpr, _ = roc_curve(y_test, dummy_probs)
106 |     model_fpr, model_tpr, _ = roc_curve(y_test, model_probs)
107 | 
108 |     # precision_recall_curve
109 |     y_scores = pipe.predict_proba(X_test)[:, 1]
110 |     precisions, recalls, thresholds = precision_recall_curve(y_test, y_scores)
111 | 
112 |     logs = {
113 |         "metrics": test_result,
114 |         "roc_curve": {
115 |             "model_tpr": model_tpr,
116 |             "model_fpr": model_fpr,
117 |             "dummy_tpr": dummy_tpr,
118 |             "dummy_fpr": dummy_fpr,
119 |         },
120 |         "precision_recall_curve": {
121 |             "precisions": precisions,
122 |             "recalls": recalls,
123 |             "thresholds": thresholds,
124 |         },
125 |     }
126 | 
127 |     # roc curve
128 |     # plot the roc curve for the model
129 |     plt.plot(
130 |         logs["roc_curve"]["dummy_fpr"],
131 |         logs["roc_curve"]["dummy_tpr"],
132 |         linestyle="--",
133 |         label="Dummy Classifier",
134 |     )
135 |     plt.plot(
136 |         logs["roc_curve"]["model_fpr"],
137 |         logs["roc_curve"]["model_tpr"],
138 |         marker=".",
139 |         label="RFC",
140 |     )
141 |     # axis labels
142 |     plt.xlabel("False Positive Rate")
143 |     plt.ylabel("True Positive Rate")
144 |     # show the legend
145 |     plt.legend()
146 |     out_path = OUTPUT_PATH + "/roc_curve.png"
147 |     plt.savefig(out_path, dpi=80)
148 |     plt.cla()
149 | 
150 |     def plot_prc(precisions, recalls, thresholds):
151 |         plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
152 |         plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
153 |         plt.xlabel("Thresholds")
154 |         plt.legend(loc="center left")
155 |         plt.ylim([0, 1])
156 |         out_path = OUTPUT_PATH + "/precision_recall_curve.png"
157 |         plt.savefig(out_path, dpi=80)
158 | 
159 |     plot_prc(
160 |         logs["precision_recall_curve"]["precisions"],
161 |         logs["precision_recall_curve"]["recalls"],
162 |         logs["precision_recall_curve"]["thresholds"],
163 |     )
164 | 
165 |     return logs
166 | 
--------------------------------------------------------------------------------
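The train() and evaluate() functions in src/model.py above are the entry points the rest of the project builds on (src/train.py and src/evaluate.py presumably wrap them for the pipeline stages). A minimal usage sketch follows; the CSV path, the models/ and results/ output locations, and num_estimators=100 are illustrative assumptions, not taken from the repository's own pipeline code:

import json
import joblib
from src.model import train, evaluate

# Train the RandomForest pipeline on the preprocessed dataset (path assumed).
pipe, train_logs = train("data/weatherAUS_processed.csv", num_estimators=100)

# Persist the fitted pipeline (output path assumed).
joblib.dump(pipe, "models/model.joblib")

# Evaluate and write the metrics next to the ROC / precision-recall plots.
eval_logs = evaluate("data/weatherAUS_processed.csv", pipe, "results")
with open("results/metrics.json", "w") as fp:
    json.dump(eval_logs["metrics"], fp)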
/src/scripts/Pipelines/openscale.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import yaml
4 | from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
5 | from ibm_ai_openscale import APIClient
6 | from ibm_ai_openscale.engines import *
7 | from ibm_ai_openscale.utils import *
8 | from ibm_ai_openscale.supporting_classes import PayloadRecord, Feature
9 | from ibm_ai_openscale.supporting_classes.enums import *
10 | import requests
11 | from ibm_ai_openscale.utils import get_instance_guid
12 | import ibm_watson_machine_learning
13 | import json
14 | import pandas as pd
15 | from sklearn.model_selection import train_test_split
16 | import numpy as np
17 | from sklearn.metrics import (
18 |     confusion_matrix,
19 |     accuracy_score,
20 |     precision_score,
21 |     recall_score,
22 |     f1_score,
23 | )
24 | from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
25 | from ibm_watson_openscale import *
26 | from ibm_watson_openscale.supporting_classes.enums import *
27 | from ibm_watson_openscale.supporting_classes.payload_record import PayloadRecord
28 | import ibm_watson_openscale
29 | 
30 | 
31 | with open("../credentials.yaml") as stream:
32 |     try:
33 |         credentials = yaml.safe_load(stream)
34 |     except yaml.YAMLError as exc:
35 |         print(exc)
36 | 
37 | 
38 | with open("../metadata.yaml") as stream:
39 |     try:
40 |         metadata = yaml.safe_load(stream)
41 |     except yaml.YAMLError as exc:
42 |         print(exc)
43 | 
44 | 
45 | service_credentials = {
46 |     "apikey": credentials["apikey"],
47 |     "url": "https://api.aiopenscale.cloud.ibm.com",
48 | }
49 | 
50 | DEPLOYMENT_UID = metadata["deployment_uid"]
51 | MODEL_UID = metadata["model_uid"]
52 | MODEL_NAME = metadata["project_name"] + "_" + metadata["project_version"]
53 | SPACE_ID = credentials["space_id"]
54 | WOS_GUID = get_instance_guid(api_key=service_credentials["apikey"])
55 | WOS_CREDENTIALS = {
56 |     "instance_guid": WOS_GUID,
57 |     "apikey": service_credentials["apikey"],
58 |     "url": "https://api.aiopenscale.cloud.ibm.com",
59 | }
60 | 
61 | if WOS_GUID is None:
62 |     print("Watson OpenScale GUID NOT FOUND")
63 | else:
64 |     print(WOS_GUID)
65 | 
66 | ai_client = APIClient(aios_credentials=WOS_CREDENTIALS)
67 | print(ai_client.version)
68 | 
69 | wml_credentials = {"url": credentials["url"], "apikey": credentials["apikey"]}
70 | 
71 | wml_client = ibm_watson_machine_learning.APIClient(wml_credentials)
72 | 
73 | wml_credentials = {
74 |     "url": credentials["url"],
75 |     "apikey": credentials["apikey"],
76 |     "instance_id": "wml_local",
77 | }
78 | 
79 | wml_client.set.default_space(SPACE_ID)  # deployment space id from credentials.yaml instead of a hard-coded UUID
80 | 
81 | authenticator = IAMAuthenticator(apikey=credentials["apikey"])
82 | wos_client = ibm_watson_openscale.APIClient(
83 |     authenticator=authenticator, service_url="https://api.aiopenscale.cloud.ibm.com"
84 | )
85 | 
86 | 
87 | KEEP_MY_INTERNAL_POSTGRES = True
88 | DB_CREDENTIALS = None
89 | try:
90 |     data_mart_details = ai_client.data_mart.get_details()
91 |     if (
92 |         "internal_database" in data_mart_details
93 |         and data_mart_details["internal_database"]
94 |     ):
95 |         if KEEP_MY_INTERNAL_POSTGRES:
96 |             print("Using existing internal datamart.")
97 |         else:
98 |             if DB_CREDENTIALS is None:
99 |                 print(
100 |                     "No postgres credentials supplied.
Using existing internal datamart" 101 | ) 102 | else: 103 | print("Switching to external datamart") 104 | ai_client.data_mart.delete(force=True) 105 | ai_client.data_mart.setup(db_credentials=DB_CREDENTIALS) 106 | else: 107 | print("Using existing external datamart") 108 | except: 109 | if DB_CREDENTIALS is None: 110 | print("Setting up internal datamart") 111 | ai_client.data_mart.setup(internal_db=True) 112 | else: 113 | print("Setting up external datamart") 114 | try: 115 | ai_client.data_mart.setup(db_credentials=DB_CREDENTIALS) 116 | except: 117 | print("Setup failed, trying Db2 setup") 118 | ai_client.data_mart.setup( 119 | db_credentials=DB_CREDENTIALS, schema=DB_CREDENTIALS["username"] 120 | ) 121 | data_mart_details = ai_client.data_mart.get_details() 122 | 123 | binding_uid = ai_client.data_mart.bindings.add( 124 | "Rain Aus", WatsonMachineLearningInstance(wml_credentials) 125 | ) 126 | 127 | bindings_details = ai_client.data_mart.bindings.get_details() 128 | 129 | if binding_uid is None: 130 | binding_uid = [ 131 | binding["metadata"]["guid"] 132 | for binding in bindings_details["service_bindings"] 133 | if binding["entity"]["name"] == "WML Cloud Instance" 134 | ][0] 135 | ai_client.data_mart.bindings.list() 136 | 137 | ai_client.data_mart.bindings.list_assets(binding_uid=binding_uid) 138 | 139 | subscriptions_uids = ai_client.data_mart.subscriptions.get_uids() 140 | # for subscription in subscriptions_uids: 141 | # sub_name = ai_client.data_mart.subscriptions.get_details(subscription)['entity']['asset']['name'] 142 | # if sub_name == MODEL_NAME: 143 | # ai_client.data_mart.subscriptions.delete(subscription) 144 | # print('Deleted existing subscription for', MODEL_NAME) 145 | 146 | # subscription = ai_client.data_mart.subscriptions.add(WatsonMachineLearningAsset( 147 | # MODEL_UID, 148 | # problem_type=ProblemType.BINARY_CLASSIFICATION, 149 | # input_data_type=InputDataType.STRUCTURED, 150 | # label_column='RainTomorrow', 151 | # prediction_column='predictedLabel', 152 | # probability_column='probability', 153 | # transaction_id_column='transaction_id', 154 | # feature_columns = ["Humidity3pm", "Humidity9am", "MaxTemp", "MinTemp", "Pressure3pm", "Pressure9am", "RainToday", "Rainfall", "Temp3pm", "Temp9am", "WindDir3pm_E", "WindDir3pm_ENE", "WindDir3pm_ESE", "WindDir3pm_N", "WindDir3pm_NE", "WindDir3pm_NNE", "WindDir3pm_NNW", "WindDir3pm_NW", "WindDir3pm_S", "WindDir3pm_SE", "WindDir3pm_SSE", "WindDir3pm_SSW", "WindDir3pm_SW", "WindDir3pm_W", "WindDir3pm_WNW", "WindDir3pm_WSW", "WindDir9am_E", "WindDir9am_ENE", "WindDir9am_ESE", "WindDir9am_N", "WindDir9am_NE", "WindDir9am_NNE", "WindDir9am_NNW", "WindDir9am_NW", "WindDir9am_S", "WindDir9am_SE", "WindDir9am_SSE", "WindDir9am_SSW", "WindDir9am_SW", "WindDir9am_W", "WindDir9am_WNW", "WindDir9am_WSW", "WindGustDir_E", "WindGustDir_ENE", "WindGustDir_ESE", "WindGustDir_N", "WindGustDir_NE", "WindGustDir_NNE", "WindGustDir_NNW", "WindGustDir_NW", "WindGustDir_S", "WindGustDir_SE", "WindGustDir_SSE", "WindGustDir_SSW", "WindGustDir_SW", "WindGustDir_W", "WindGustDir_WNW", "WindGustDir_WSW", "WindGustSpeed", "WindSpeed3pm", "WindSpeed9am"], 155 | # categorical_columns = ["RainToday", "WindDir3pm_E", "WindDir3pm_ENE", "WindDir3pm_ESE", "WindDir3pm_N", "WindDir3pm_NE", "WindDir3pm_NNE", "WindDir3pm_NNW", "WindDir3pm_NW", "WindDir3pm_S", "WindDir3pm_SE", "WindDir3pm_SSE", "WindDir3pm_SSW", "WindDir3pm_SW", "WindDir3pm_W", "WindDir3pm_WNW", "WindDir3pm_WSW", "WindDir9am_E", "WindDir9am_ENE", "WindDir9am_ESE", "WindDir9am_N", 
"WindDir9am_NE", "WindDir9am_NNE", "WindDir9am_NNW", "WindDir9am_NW", "WindDir9am_S", "WindDir9am_SE", "WindDir9am_SSE", "WindDir9am_SSW", "WindDir9am_SW", "WindDir9am_W", "WindDir9am_WNW", "WindDir9am_WSW", "WindGustDir_E", "WindGustDir_ENE", "WindGustDir_ESE", "WindGustDir_N", "WindGustDir_NE", "WindGustDir_NNE", "WindGustDir_NNW", "WindGustDir_NW", "WindGustDir_S", "WindGustDir_SE", "WindGustDir_SSE", "WindGustDir_SSW", "WindGustDir_SW", "WindGustDir_W", "WindGustDir_WNW", "WindGustDir_WSW", "WindGustSpeed", "WindSpeed3pm", "WindSpeed9am"] 156 | # )) 157 | 158 | subscription = None 159 | 160 | if subscription is None: 161 | print("Subscription already exists; get the existing one") 162 | subscriptions_uids = ai_client.data_mart.subscriptions.get_uids() 163 | 164 | for sub in subscriptions_uids: 165 | if ( 166 | ai_client.data_mart.subscriptions.get_details(sub)["entity"]["asset"][ 167 | "name" 168 | ] 169 | == MODEL_NAME 170 | ): 171 | subscription = ai_client.data_mart.subscriptions.get(sub) 172 | 173 | 174 | for deployment in wml_client.deployments.get_details()["resources"]: 175 | if DEPLOYMENT_UID in deployment["metadata"]["id"]: 176 | 177 | scoring_endpoint = deployment["entity"]["status"]["online_url"]["url"] 178 | 179 | print(scoring_endpoint) 180 | 181 | 182 | data = pd.read_csv("../data/weatherAUS_processed.csv") 183 | 184 | X = data.iloc[:, :-1] 185 | y = data[data.columns[-1]] 186 | X_train, X_test, y_train, y_test = train_test_split( 187 | X, y, test_size=0.01, random_state=1337 188 | ) 189 | 190 | 191 | # Payload Logging DAtASET 192 | 193 | payload_data_set_id = ( 194 | wos_client.data_sets.list( 195 | type=DataSetTypes.PAYLOAD_LOGGING, 196 | target_target_id=subscription_id, 197 | target_target_type=TargetTypes.SUBSCRIPTION, 198 | ) 199 | .result.data_sets[0] 200 | .metadata.id 201 | ) 202 | print("Payload data set id:", payload_data_set_id) 203 | 204 | 205 | payload_scoring = { 206 | "input_data": [ 207 | {"fields": X.columns.to_numpy().tolist(), "values": X_test.to_numpy().tolist()} 208 | ] 209 | } 210 | 211 | scoring_response = wml_client.deployments.score(DEPLOYMENT_UID, payload_scoring) 212 | 213 | print("Logging") 214 | records = [ 215 | PayloadRecord(request=payload_scoring, response=scoring_response, response_time=72) 216 | ] 217 | store_record_info = wos_client.data_sets.store_records(payload_data_set_id, records) 218 | 219 | 220 | # Feedback Logging 221 | 222 | feedback_dataset = wos_client.data_sets.list( 223 | type=DataSetTypes.FEEDBACK, 224 | target_target_id=subscription_id, 225 | target_target_type=TargetTypes.SUBSCRIPTION, 226 | ).result 227 | 228 | feedback_dataset_id = feedback_dataset.data_sets[0].metadata.id 229 | if feedback_dataset_id is None: 230 | print("Feedback data set not found. 
Please check quality monitor status.") 231 | sys.exit(1) 232 | 233 | data = X_test.to_dict("records") 234 | 235 | wos_client.data_sets.store_records( 236 | feedback_dataset_id, 237 | request_body=data, 238 | background_mode=False, 239 | header=True, 240 | delimiter=",", 241 | csv_max_line_length=1000, 242 | ) 243 | 244 | print(wos_client.data_sets.get_records_count(data_set_id=feedback_dataset_id)) 245 | 246 | 247 | #### 248 | 249 | 250 | from ibm_watson_openscale.supporting_classes.enums import * 251 | 252 | print("\nData marts: ") 253 | datams = wos_client.data_marts.list().result.data_marts 254 | for d in datams: 255 | print(d.metadata.id) 256 | datamart_id = d.metadata.id 257 | 258 | print("\nService providers: ") 259 | services = wos_client.service_providers.list().result.service_providers 260 | for service in services: 261 | print(service.metadata.id + " / Name: " + service.entity.name) 262 | service_id = service.metadata.id 263 | 264 | # wos_client.subscriptions.show() 265 | # wos_client.data_sets.show() 266 | 267 | print("\nSubscriptions: ") 268 | subscriptions = wos_client.subscriptions.list( 269 | data_mart_id=datamart_id, service_provider_id=service_id 270 | ).result.subscriptions 271 | for s in subscriptions: 272 | print(s.metadata.id + " " + s.entity.asset.name) 273 | subscription_id = s.metadata.id 274 | 275 | print("\n") 276 | 277 | payload_data_set_id = ( 278 | wos_client.data_sets.list( 279 | type=DataSetTypes.PAYLOAD_LOGGING, 280 | target_target_id=subscription_id, 281 | target_target_type=TargetTypes.SUBSCRIPTION, 282 | ) 283 | .result.data_sets[0] 284 | .metadata.id 285 | ) 286 | print("Payload data set id:", payload_data_set_id) 287 | 288 | 289 | pl_records_count = wos_client.data_sets.get_records_count(payload_data_set_id) 290 | print("Number of records in the payload logging table: {}".format(pl_records_count)) 291 | if pl_records_count == 0: 292 | raise Exception("Payload logging did not happen!") 293 | 294 | 295 | # Create Monitor 296 | 297 | target = ibm_watson_openscale.base_classes.watson_open_scale_v2.Target( 298 | target_type=TargetTypes.SUBSCRIPTION, target_id=subscription.uid 299 | ) 300 | parameters = {"min_feedback_data_size": 200} 301 | thresholds = [{"metric_id": "area_under_roc", "type": "lower_limit", "value": 0.75}] 302 | wos_client.monitor_instances.create( 303 | data_mart_id=datamart_id, 304 | background_mode=False, 305 | monitor_definition_id=wos_client.monitor_definitions.MONITORS.QUALITY.ID, 306 | target=target, 307 | parameters=parameters, 308 | thresholds=thresholds, 309 | ) 310 | 311 | monitor_instances_info = wos_client.monitor_instances.show(data_mart_id=datamart_id) 312 | 313 | 314 | # wos_client.monitor_instances.delete( 315 | # background_mode=False, 316 | # monitor_instance_id='94e582d5-c244-4533-9697-c16046c5fc40' 317 | # ) 318 | 319 | monitor_instance_run_info = wos_client.monitor_instances.run( 320 | background_mode=False, monitor_instance_id="5ddff093-25fa-44f8-abae-fd29659fd0d0" 321 | ) 322 | --------------------------------------------------------------------------------
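A final note on the quality monitor configured at the end of openscale.py: the run call targets a hard-coded monitor_instance_id. A sketch of the same two calls is shown below, reusing the wos_client, datamart_id and target objects created earlier in the script and taking the id from the create() response instead; the .result.metadata.id access is an assumption based on IBM's published OpenScale sample notebooks rather than on this repository:

# Illustrative only; reuses wos_client, datamart_id and target from openscale.py above.
quality_monitor = wos_client.monitor_instances.create(
    data_mart_id=datamart_id,
    background_mode=False,
    monitor_definition_id=wos_client.monitor_definitions.MONITORS.QUALITY.ID,
    target=target,
    parameters={"min_feedback_data_size": 200},
    thresholds=[{"metric_id": "area_under_roc", "type": "lower_limit", "value": 0.75}],
).result  # assumed to hold the created monitor instance's details

run_info = wos_client.monitor_instances.run(
    background_mode=False,
    monitor_instance_id=quality_monitor.metadata.id,  # instead of a hard-coded UUID
)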