├── src ├── __init__.py ├── tests │ ├── train │ │ └── test_train.py │ ├── evaluate │ │ └── test_evaluate.py │ ├── pytest.ini │ ├── __pycache__ │ │ └── test_main.cpython-38-pytest-6.0.1.pyc │ ├── test_main.py │ ├── test_data │ │ └── testWeatherAUS_processed.csv │ ├── preprocess │ │ ├── test_data │ │ │ └── testWeatherAUS.csv │ │ └── test_preprocess.py │ └── model │ │ └── test_model.py ├── __pycache__ │ └── model.cpython-37.pyc ├── scripts │ ├── Scripts │ │ ├── requirements.txt │ │ ├── README.md │ │ ├── git_release_pipeline.sh │ │ ├── init_project.sh │ │ └── std_check.sh │ └── Pipelines │ │ ├── git_release_pipeline.py │ │ ├── README.md │ │ ├── model_update_deployment_pipeline.py │ │ ├── model_update_pipeline.py │ │ ├── model_redeploy_pipeline.py │ │ ├── model_deploy_pipeline.py │ │ ├── model_deployed_validate_pipeline.py │ │ ├── model_train_autoAI.py │ │ └── openscale.py ├── evaluate.py ├── train.py ├── preprocess_data.py └── model.py ├── .infra ├── .gitignore ├── .terraform │ ├── .gitignore │ ├── output.tf │ └── init_infra.tf └── datapak_manage.py ├── .pylintrc ├── models └── .gitignore ├── .dvc ├── .gitignore └── config ├── results ├── .gitignore └── metrics.json ├── data ├── .gitignore └── weatherAUS.csv.dvc ├── credentials.yaml.gpg ├── credentials_example.yaml ├── requirements.txt ├── metadata.yaml ├── .pre-commit-config.yaml ├── .github └── workflows │ ├── test_on_push.yaml │ ├── deploy_on_release.yaml │ └── train_evaluate.yaml ├── .gitignore ├── dvc.yaml ├── dvc.lock └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.infra/.gitignore: -------------------------------------------------------------------------------- 1 | configs -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [default] 2 | ignored-modules -------------------------------------------------------------------------------- /models/.gitignore: -------------------------------------------------------------------------------- 1 | /model.joblib 2 | -------------------------------------------------------------------------------- /.dvc/.gitignore: -------------------------------------------------------------------------------- 1 | /config.local 2 | /tmp 3 | /cache 4 | -------------------------------------------------------------------------------- /results/.gitignore: -------------------------------------------------------------------------------- 1 | /precision_recall_curve.png 2 | /roc_curve.png 3 | -------------------------------------------------------------------------------- /src/tests/train/test_train.py: -------------------------------------------------------------------------------- 1 | def test_train(): 2 | assert 1 + 1 == 2 3 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | /weatherAUS.csv 2 | /weatherAUS_processed.csv 3 | /features.csv 4 | -------------------------------------------------------------------------------- /src/tests/evaluate/test_evaluate.py: -------------------------------------------------------------------------------- 1 | def test_evaluate(): 2 | assert 1 + 1 == 2 3 | -------------------------------------------------------------------------------- /src/tests/pytest.ini: 
-------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | error 4 | ignore::UserWarning 5 | -------------------------------------------------------------------------------- /credentials.yaml.gpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlops-guide/dvc-gitactions/HEAD/credentials.yaml.gpg -------------------------------------------------------------------------------- /data/weatherAUS.csv.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: a65cf8b8719b1a65db4f361eeec18457 3 | size: 14094055 4 | path: weatherAUS.csv 5 | -------------------------------------------------------------------------------- /src/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlops-guide/dvc-gitactions/HEAD/src/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /results/metrics.json: -------------------------------------------------------------------------------- 1 | {"accuracy": 0.849730029073792, "recall": 0.9460718094560967, "precision": 0.8718998787799365, "f1": 0.9074727635415069} -------------------------------------------------------------------------------- /src/scripts/Scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | sklearn 2 | pandas 3 | seaborn 4 | matplotlib 5 | yaml 6 | joblib 7 | numpy 8 | importlib 9 | json 10 | -------------------------------------------------------------------------------- /src/scripts/Scripts/README.md: -------------------------------------------------------------------------------- 1 | # Project_init 2 | Script to create standardized structure for ml projects 3 | 4 | `` 5 | $ ./init_project.sh my-model v1 6 | `` 7 | -------------------------------------------------------------------------------- /.infra/.terraform/.gitignore: -------------------------------------------------------------------------------- 1 | #.terraform folder 2 | .terraform/ 3 | .terraform.lock.hcl 4 | output.json 5 | terraform.tfstate 6 | terraform.tfstate.backup 7 | infra_state.json -------------------------------------------------------------------------------- /credentials_example.yaml: -------------------------------------------------------------------------------- 1 | # Rename this file to credentials.yaml to be able to run the scripts 2 | 3 | url: "https://us-south.ml.cloud.ibm.com" 4 | apikey: "" 5 | space_id: "" 6 | -------------------------------------------------------------------------------- /src/tests/__pycache__/test_main.cpython-38-pytest-6.0.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlops-guide/dvc-gitactions/HEAD/src/tests/__pycache__/test_main.cpython-38-pytest-6.0.1.pyc -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn>=0.23 2 | pandas 3 | seaborn 4 | matplotlib 5 | joblib 6 | numpy 7 | ibm_watson_machine_learning 8 | pyyaml 9 | pytest 10 | pytest-dependency 11 | pre-commit 12 | -------------------------------------------------------------------------------- /.dvc/config: -------------------------------------------------------------------------------- 1 | 
[core] 2 | remote = remote-storage 3 | ['remote "remote-storage"'] 4 | url = s3://wine-bucket-test/rain_australia/ 5 | endpointurl = https://s3.us-south.cloud-object-storage.appdomain.cloud 6 | -------------------------------------------------------------------------------- /.infra/.terraform/output.tf: -------------------------------------------------------------------------------- 1 | output "cos_crn" { 2 | value = ibm_resource_instance.cos.crn 3 | } 4 | 5 | output "wml_name" { 6 | value = ibm_resource_instance.wml.name 7 | } 8 | 9 | output "wml_crn" { 10 | value = ibm_resource_instance.wml 11 | } 12 | -------------------------------------------------------------------------------- /metadata.yaml: -------------------------------------------------------------------------------- 1 | author: guipleite 2 | datetime_creted: 29/03/2021_13:46:23:802394723 3 | deployment_uid: e02e481d-4e56-470f-baa9-ae84a583c0a8 4 | model_type: scikit-learn_0.23 5 | model_uid: f29e4cfc-3aab-458a-b703-fabc265f43a3 6 | project_name: Rain_aus 7 | project_version: v0.3 8 | -------------------------------------------------------------------------------- /src/tests/test_main.py: -------------------------------------------------------------------------------- 1 | # General tests that comprehend general aspects of the code 2 | 3 | import pytest 4 | 5 | 6 | def capital_case(x): 7 | return x.capitalize() 8 | 9 | 10 | def test_capital_case(): 11 | assert capital_case("semaphore") == "Semaphore" 12 | -------------------------------------------------------------------------------- /src/tests/test_data/testWeatherAUS_processed.csv: -------------------------------------------------------------------------------- 1 | MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,WindGustDir_W,WindGustDir_WNW,WindDir9am_NNW,WindDir9am_W,WindDir3pm_WNW,WindDir3pm_WSW,RainTomorrow 2 | 13.4,22.9,0.6,44,20,24,71,22,1007.7,1007.1,16.9,21.8,0,1,0,0,1,1,0,0 3 | 7.4,25.1,0.0,44,4,22,44,25,1010.6,1007.8,17.2,24.3,0,0,1,1,0,0,1,0 4 | -------------------------------------------------------------------------------- /src/tests/preprocess/test_data/testWeatherAUS.csv: -------------------------------------------------------------------------------- 1 | Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow 2 | 2008-12-01,Albury,13.4,22.9,0.6,NA,NA,W,44,W,WNW,20,24,71,22,1007.7,1007.1,8,NA,16.9,21.8,No,No 3 | 2008-12-02,Albury,7.4,25.1,0,NA,NA,WNW,44,NNW,WSW,4,22,44,25,1010.6,1007.8,NA,NA,17.2,24.3,No,No -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | repos: 3 | - 4 | repo: https://github.com/ambv/black 5 | rev: 20.8b1 6 | hooks: 7 | - 8 | id: black 9 | language_version: python3 10 | 11 | - repo: local 12 | hooks: 13 | - id: python-tests 14 | name: pytests 15 | entry: pytest src/tests 16 | language: python 17 | additional_dependencies: [pre-commit, pytest, pandas, sklearn, matplotlib] 18 | always_run: true 19 | pass_filenames: false 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /src/scripts/Scripts/git_release_pipeline.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if ! python3 ./src/scripts/Pipelines/git_release_pipeline.py ./ 4 | then 5 | echo " Model has already been deployed, updating it" 6 | python3 ./src/scripts/Pipelines/model_update_pipeline.py ./models/model.joblib ./ ./credentials.yaml 7 | python3 ./src/scripts/Pipelines/model_update_deployment_pipeline.py ./ ./credentials.yaml 8 | else 9 | echo " Deploying model for the first time" 10 | python3 ./src/scripts/Pipelines/model_deploy_pipeline.py ./models/model.joblib ./ ./credentials.yaml 11 | fi 12 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/git_release_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import yaml 4 | 5 | """ 6 | Usage: 7 | python3 git_release_pipeline.py ../path/to/project/ 8 | 9 | """ 10 | 11 | PROJ_PATH = os.path.abspath(sys.argv[1]) 12 | META_PATH = PROJ_PATH + "/metadata.yaml" 13 | 14 | with open(META_PATH) as stream: 15 | try: 16 | metadata = yaml.safe_load(stream) 17 | except yaml.YAMLError as exc: 18 | print(exc) 19 | 20 | if "deployment_uid" in metadata.keys(): 21 | sys.exit(1) 22 | 23 | else: 24 | sys.exit(0) 25 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/README.md: -------------------------------------------------------------------------------- 1 | python3 model_train_pipeline.py ../../Test_Project/my-model_v1/src/breast_cancer.csv ../../Test_Project/my-model_v1/ 10 ./pickle_model 2 | 3 | 4 | python3 model_deploy_pipeline.py ./pickle_model ../../Test_Project/my-model_v1/ ../../credentials.yaml 5 | 6 | 7 | python3 model_deployed_validate_pipeline.py ../../Test_Project/my-model_v1/src/breast_cancer.csv ../../credentials.yaml ../../Test_Project/my-model_v1/ 8 | 9 | 10 | python3 model_update_pipeline.py ./pickle_model ../../credentials.yaml ../../Test_Project/my-model_v1/ 11 | 12 | 13 | python3 model_redeploy_pipeline.py ../../credentials.yaml 14 | 15 | 16 | python3 model_redeploy_pipeline.py ../../credentials.yaml 17 | -------------------------------------------------------------------------------- /src/scripts/Scripts/init_project.sh: -------------------------------------------------------------------------------- 1 | MODEL=$1 2 | VERSION=$2 3 | PROJECT_NAME="$1_$2" 4 | echo "Creating $MODEL $VERSION" 5 | mkdir -p -m 777 $PROJECT_NAME 6 | mkdir -p -m 777 $PROJECT_NAME/src 7 | touch $PROJECT_NAME/src/__init__.py 8 | touch $PROJECT_NAME/src/model.py 9 | mkdir -p -m 777 $PROJECT_NAME/notebooks 10 | mkdir -p -m 777 $PROJECT_NAME/tests 11 | echo "Created by: "$USERNAME 12 | now=$(date +%x_%H:%M:%S:%N) 13 | echo "At: "$now 14 | 15 | cat << EOF >./$PROJECT_NAME/metadata.yaml 16 | project_name: $MODEL 17 | project_version: $VERSION 18 | model_type: scikit-learn_0.23 19 | author: $USERNAME 20 | datetime_creted: $now 21 | EOF 22 | 23 | cp ./requirements.txt ./$PROJECT_NAME 24 | cp -avr ./scripts ./$PROJECT_NAME/src/ 25 | -------------------------------------------------------------------------------- /.github/workflows/test_on_push.yaml: -------------------------------------------------------------------------------- 1 | name: Python Package and Test 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.6] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up 
Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install pytest black 23 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 24 | - name: Test with pytest 25 | run: | 26 | pytest 27 | - name: Python Black 28 | run: | 29 | black . --check 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *.pyc 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | myenv/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ -------------------------------------------------------------------------------- /dvc.yaml: -------------------------------------------------------------------------------- 1 | stages: 2 | std_check: 3 | cmd: src/scripts/Scripts/std_check.sh ./ 4 | preprocess: 5 | cmd: python3 ./src/preprocess_data.py ./data/weatherAUS.csv 6 | deps: 7 | - ./src/preprocess_data.py 8 | - data/weatherAUS.csv 9 | outs: 10 | - ./data/weatherAUS_processed.csv 11 | - ./data/features.csv 12 | train: 13 | cmd: python3 ./src/train.py ./data/weatherAUS_processed.csv ./src/model.py 200 14 | deps: 15 | - ./data/weatherAUS_processed.csv 16 | - ./src/model.py 17 | - ./src/train.py 18 | outs: 19 | - ./models/model.joblib 20 | evaluate: 21 | cmd: python3 ./src/evaluate.py ./data/weatherAUS_processed.csv ./src/model.py ./models/model.joblib 22 | deps: 23 | - ./data/weatherAUS_processed.csv 24 | - ./models/model.joblib 25 | - ./src/evaluate.py 26 | - ./src/model.py 27 | outs: 28 | - ./results/precision_recall_curve.png 29 | - ./results/roc_curve.png 30 | metrics: 31 | - ./results/metrics.json: 32 | cache: false 33 | -------------------------------------------------------------------------------- /src/evaluate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import importlib.util 3 | import pickle 4 | import os 5 | import json 6 | 7 | # from sklearn.externals import joblib 8 | import joblib 9 | 10 | DATA_PATH = os.path.abspath(sys.argv[1]) 11 | # PROJ_PATH = os.path.abspath(sys.argv[2]) 12 | # MODEL_PATH = PROJ_PATH+"/src/model.py" 13 | MODEL_PATH = sys.argv[2] 14 | PICKLE_PATH = sys.argv[3] 15 | 16 | 17 | sys.path.insert(1, MODEL_PATH) 18 | 19 | 20 | def module_from_file(module_name, file_path): 21 | spec = importlib.util.spec_from_file_location(module_name, file_path) 22 | module = importlib.util.module_from_spec(spec) 23 | spec.loader.exec_module(module) 24 | return module 25 | 26 | 27 | model 
= module_from_file("model", MODEL_PATH) 28 | 29 | # with open(PICKLE_PATH, "rb") as file: 30 | # pipeline = pickle.load(file) 31 | pipeline = joblib.load(PICKLE_PATH) 32 | log_eval = model.evaluate(DATA_PATH, pipeline, "./results") 33 | 34 | with open("./results/metrics.json", "w") as outfile: 35 | json.dump(log_eval["metrics"], outfile) 36 | -------------------------------------------------------------------------------- /.infra/.terraform/init_infra.tf: -------------------------------------------------------------------------------- 1 | #### AUTH && PLUGIN 2 | 3 | terraform { 4 | required_providers { 5 | ibm = { 6 | source = "IBM-Cloud/ibm" 7 | version = "~> 1.12.0" 8 | } 9 | } 10 | } 11 | 12 | 13 | provider "ibm" {} 14 | 15 | #### RESOURCE GROUP 16 | 17 | data "ibm_resource_group" "group" { 18 | name = "fpe_insper" 19 | } 20 | 21 | #### Machine learning service 22 | resource "ibm_resource_instance" "wml" { 23 | name = "TESTE_TERRAFORM" 24 | service = "pm-20" 25 | plan = "lite" 26 | location = "us-south" 27 | resource_group_id = data.ibm_resource_group.group.id 28 | tags = ["TESTE", "TERRAFORM"] 29 | 30 | } 31 | 32 | #### Object storage 33 | 34 | resource "ibm_resource_instance" "cos" { 35 | name = "TESTE_COS" 36 | service = "cloud-object-storage" 37 | plan = "standard" 38 | location = "global" 39 | resource_group_id = data.ibm_resource_group.group.id 40 | tags = ["TERRAFORM", "TEST"] 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import importlib.util 3 | import pickle 4 | import os 5 | import json 6 | import joblib 7 | 8 | # import sklearn.external.joblib as extjoblib 9 | 10 | DATA_PATH = os.path.abspath(sys.argv[1]) 11 | # PROJ_PATH = os.path.abspath(sys.argv[2]) 12 | MODEL_PATH = sys.argv[2] 13 | PARAM = int(sys.argv[3]) 14 | 15 | sys.path.insert(1, MODEL_PATH) 16 | 17 | 18 | def module_from_file(module_name, file_path): 19 | spec = importlib.util.spec_from_file_location(module_name, file_path) 20 | module = importlib.util.module_from_spec(spec) 21 | spec.loader.exec_module(module) 22 | return module 23 | 24 | 25 | model = module_from_file("model", MODEL_PATH) 26 | 27 | if __name__ == "__main__": 28 | 29 | pipeline, log_train = model.train(DATA_PATH, PARAM) 30 | 31 | # if sys.argv[4]: 32 | # with open("./models/model.pkl", "wb") as file: 33 | # pickle.dump(pipeline[0], file) 34 | joblib.dump(pipeline, "./models/model.joblib") 35 | 36 | # log_eval = model.evaluate(DATA_PATH, pipeline, "./results") 37 | 38 | # with open("./results/metrics.json", "w") as outfile: 39 | # json.dump(log_eval["metrics"], outfile) 40 | -------------------------------------------------------------------------------- /.github/workflows/deploy_on_release.yaml: -------------------------------------------------------------------------------- 1 | name: model-deploy-on-release 2 | on: 3 | release: 4 | types: 5 | - 'created' 6 | 7 | jobs: 8 | run: 9 | runs-on: [ubuntu-latest] 10 | container: docker://dvcorg/cml-py3:latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: 'Deploy/Update on new release' 14 | shell: bash 15 | 16 | env: 17 | repo_token: ${{ secrets.GITHUB_TOKEN }} 18 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 19 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 20 | CRED_SECRET: ${{ secrets.IBM_CREDENTIALS_PASS }} 21 | 22 | run: | 23 | # Install requirements 24 | pip install -r requirements.txt 25 | 26 | # 
Pull data & run-cache from S3 and reproduce pipeline 27 | dvc pull --run-cache 28 | dvc repro 29 | 30 | # Decrypt credentials file 31 | gpg --quiet --batch --yes --decrypt --passphrase="$CRED_SECRET" --output credentials.yaml credentials.yaml.gpg 32 | 33 | # Check if there is a deployment already; if so, update it, otherwise deploy it for the first time 34 | ./src/scripts/Scripts/git_release_pipeline.sh 35 | 36 | -------------------------------------------------------------------------------- /.github/workflows/train_evaluate.yaml: -------------------------------------------------------------------------------- 1 | name: model-training-evaluate 2 | on: [push] 3 | jobs: 4 | run: 5 | runs-on: [ubuntu-latest] 6 | container: docker://dvcorg/cml-py3:latest 7 | steps: 8 | - uses: actions/checkout@v2 9 | - name: 'Train and Evaluate model' 10 | shell: bash 11 | env: 12 | repo_token: ${{ secrets.GITHUB_TOKEN }} 13 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 14 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 15 | run: | 16 | # Install requirements 17 | pip install -r requirements.txt 18 | 19 | # Pull data & run-cache from S3 and reproduce pipeline 20 | dvc pull --run-cache 21 | dvc repro 22 | 23 | # Report metrics 24 | echo "## Metrics" >> report.md 25 | git fetch --prune 26 | dvc metrics diff master --show-md >> report.md 27 | 28 | # Publish ROC and precision-recall plots 29 | echo -e "## Plots\n### ROC Curve" >> report.md 30 | cml-publish ./results/roc_curve.png --md >> report.md 31 | echo -e "\n### Precision and Recall Curve" >> report.md 32 | cml-publish ./results/precision_recall_curve.png --md >> report.md 33 | cml-send-comment report.md 34 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/model_update_deployment_pipeline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import yaml 4 | from ibm_watson_machine_learning import APIClient 5 | 6 | """ 7 | Usage: 8 | python3 model_update_deployment_pipeline.py ../path/to/project/ ../credentials.yaml 9 | 10 | """ 11 | 12 | PROJ_PATH = os.path.abspath(sys.argv[1]) 13 | CRED_PATH = os.path.abspath(sys.argv[2]) 14 | META_PATH = PROJ_PATH + "/metadata.yaml" 15 | 16 | with open(CRED_PATH) as stream: 17 | try: 18 | credentials = yaml.safe_load(stream) 19 | except yaml.YAMLError as exc: 20 | print(exc) 21 | 22 | with open(META_PATH) as stream: 23 | try: 24 | metadata = yaml.safe_load(stream) 25 | except yaml.YAMLError as exc: 26 | print(exc) 27 | 28 | wml_credentials = {"url": credentials["url"], "apikey": credentials["apikey"]} 29 | 30 | client = APIClient(wml_credentials) 31 | client.spaces.list() 32 | 33 | SPACE_ID = credentials["space_id"] 34 | 35 | if "deployment_uid" in metadata.keys(): 36 | MODEL_GUID = metadata["model_uid"] 37 | DEPLOYMENT_UID = metadata["deployment_uid"] 38 | print("\nExtracting DEPLOYMENT UID and MODEL GUID from metadata file\n") 39 | 40 | else: 41 | MODEL_GUID = input("MODEL GUID: ") 42 | DEPLOYMENT_UID = input("DEPLOYMENT UID: ") 43 | 44 | client.set.default_space(SPACE_ID) 45 | 46 | change_meta = {client.deployments.ConfigurationMetaNames.ASSET: {"id": MODEL_GUID}} 47 | 48 | print("Updating the deployment below: ") 49 | print(client.deployments.get_details(DEPLOYMENT_UID)) 50 | 51 | client.deployments.update(DEPLOYMENT_UID, change_meta) 52 | -------------------------------------------------------------------------------- /src/scripts/Scripts/std_check.sh: 
-------------------------------------------------------------------------------- 1 | PROJ_PATH=$1 2 | RED='\033[0;31m' 3 | GRE='\033[0;32m' 4 | NC='\033[0m' 5 | ER=0 6 | a=1 7 | 8 | if !([ -d "$PROJ_PATH" ]) 9 | then 10 | ER=$a 11 | echo -e "${RED}Error: Directory $PROJ_PATH does not exist.${NC}" 12 | else 13 | echo "Directory $PROJ_PATH exists." 14 | fi 15 | 16 | if !([ -f "$PROJ_PATH/metadata.yaml" ]) 17 | then 18 | ER=$a 19 | echo -e " ${RED}Error: File $PROJ_PATH/metadata.yaml does not exist.${NC}" 20 | else 21 | echo " File $PROJ_PATH/metadata.yaml exists." 22 | fi 23 | 24 | if !([ -d "$PROJ_PATH/src" ]) 25 | then 26 | ER=$a 27 | echo -e "${RED}Error: Directory $PROJ_PATH/src does not exist.${NC}" 28 | else 29 | echo "Directory $PROJ_PATH/src exists." 30 | fi 31 | 32 | if !([ -f "$PROJ_PATH/src/model.py" ]) 33 | then 34 | ER=$a 35 | echo -e " ${RED}Error: File $PROJ_PATH/src/model.py does not exist.${NC}" 36 | else 37 | echo " File $PROJ_PATH/src/model.py exists." 38 | fi 39 | if !([ -f "$PROJ_PATH/src/__init__.py" ]) 40 | then 41 | ER=$a 42 | echo -e " ${RED}Error: File $PROJ_PATH/src/__init__.py does not exist.${NC}" 43 | else 44 | echo " File $PROJ_PATH/src/__init__.py exists." 45 | fi 46 | 47 | 48 | 49 | if !([ -d "$PROJ_PATH/notebooks" ]) 50 | then 51 | ER=$a 52 | echo -e "${RED}Error: Directory $PROJ_PATH/notebooks does not exist.${NC}" 53 | else 54 | echo "Directory $PROJ_PATH/notebooks exists." 55 | fi 56 | 57 | printf "\n" 58 | if [ $ER == 1 ] 59 | then 60 | echo -e "${RED}Error: Project structure has been changed, please fix it \n${NC}" 61 | exit 0 62 | else 63 | echo -e "${GRE}Project structure is ok \n${NC}" 64 | 65 | fi 66 | 67 | 68 | if ! black ./$PROJ_PATH --check; then 69 | echo -e "${RED}Please run the command 'black' to format your files" 70 | exit 0 71 | else 72 | echo -e "${GRE}Files formatted, moving forward" 73 | fi 74 | 75 | 76 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/model_update_pipeline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import yaml 4 | import joblib 5 | from ibm_watson_machine_learning import APIClient 6 | 7 | """ 8 | Usage: 9 | python3 model_update_pipeline.py ./pickle_model ../path/to/project/ ../credentials.yaml 10 | 11 | """ 12 | 13 | MODEL_PATH = os.path.abspath(sys.argv[1]) 14 | PROJ_PATH = os.path.abspath(sys.argv[2]) 15 | CRED_PATH = os.path.abspath(sys.argv[3]) 16 | META_PATH = PROJ_PATH + "/metadata.yaml" 17 | 18 | with open(CRED_PATH) as stream: 19 | try: 20 | credentials = yaml.safe_load(stream) 21 | except yaml.YAMLError as exc: 22 | print(exc) 23 | 24 | with open(META_PATH) as stream: 25 | try: 26 | metadata = yaml.safe_load(stream) 27 | except yaml.YAMLError as exc: 28 | print(exc) 29 | 30 | with open(MODEL_PATH, "rb") as file: 31 | # pickle_model = pickle.load(file) 32 | pipeline = joblib.load(file) 33 | 34 | wml_credentials = {"url": credentials["url"], "apikey": credentials["apikey"]} 35 | 36 | client = APIClient(wml_credentials) 37 | client.spaces.list() 38 | 39 | SPACE_ID = credentials["space_id"] 40 | 41 | if "model_uid" in metadata.keys(): 42 | MODEL_GUID = metadata["model_uid"] 43 | print("\nExtracting MODEL GUID from metadata file\n") 44 | 45 | else: 46 | MODEL_GUID = input("MODEL GUID: ") 47 | 48 | client.set.default_space(SPACE_ID) 49 | 50 | print("\nCreating new version") 51 | 52 | published_model = client.repository.update_model( 53 | model_uid=MODEL_GUID, 54 | 
update_model=pipeline, 55 | updated_meta_props={ 56 | client.repository.ModelMetaNames.NAME: metadata["project_name"] 57 | + "_" 58 | + metadata["project_version"] 59 | }, 60 | ) 61 | 62 | new_model_revision = client.repository.create_model_revision(MODEL_GUID) 63 | 64 | rev_id = new_model_revision["metadata"].get("rev") 65 | print("\nversion", rev_id) 66 | 67 | client.repository.list_models_revisions(MODEL_GUID) 68 | -------------------------------------------------------------------------------- /src/tests/model/test_model.py: -------------------------------------------------------------------------------- 1 | # PyTest file for model.py 2 | 3 | import sys 4 | import os 5 | import pytest 6 | import pandas as pd 7 | 8 | # Parent Folder 9 | sys.path.append( 10 | os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) 11 | ) 12 | 13 | # Model Python file 14 | from model import get_variables 15 | 16 | FILE_NAME = "testWeatherAUS" 17 | PROCESSED_DATA_PATH = ( 18 | os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 19 | + "/test_data/" 20 | + FILE_NAME 21 | + "_processed.csv" 22 | ) 23 | 24 | 25 | @pytest.mark.parametrize( 26 | "expected_X,expected_y", 27 | [ 28 | ( 29 | { 30 | "MinTemp": {0: 13.4, 1: 7.4}, 31 | "MaxTemp": {0: 22.9, 1: 25.1}, 32 | "Rainfall": {0: 0.6, 1: 0.0}, 33 | "WindGustSpeed": {0: 44, 1: 44}, 34 | "WindSpeed9am": {0: 20, 1: 4}, 35 | "WindSpeed3pm": {0: 24, 1: 22}, 36 | "Humidity9am": {0: 71, 1: 44}, 37 | "Humidity3pm": {0: 22, 1: 25}, 38 | "Pressure9am": {0: 1007.7, 1: 1010.6}, 39 | "Pressure3pm": {0: 1007.1, 1: 1007.8}, 40 | "Temp9am": {0: 16.9, 1: 17.2}, 41 | "Temp3pm": {0: 21.8, 1: 24.3}, 42 | "RainToday": {0: 0, 1: 0}, 43 | "WindGustDir_W": {0: 1, 1: 0}, 44 | "WindGustDir_WNW": {0: 0, 1: 1}, 45 | "WindDir9am_NNW": {0: 0, 1: 1}, 46 | "WindDir9am_W": {0: 1, 1: 0}, 47 | "WindDir3pm_WNW": {0: 1, 1: 0}, 48 | "WindDir3pm_WSW": {0: 0, 1: 1}, 49 | }, 50 | [0, 0], 51 | ) 52 | ], 53 | ) 54 | def test_get_variables(expected_X, expected_y): 55 | 56 | # Open CSV as DF 57 | data = pd.read_csv(PROCESSED_DATA_PATH) 58 | 59 | # Run Function 60 | X, y = get_variables(data, "RainTomorrow") 61 | 62 | assert (X.to_dict(), y.to_list()) == (expected_X, expected_y) 63 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/model_redeploy_pipeline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import yaml 5 | from ibm_watson_machine_learning import APIClient 6 | 7 | """ 8 | Usage: 9 | python3 model_reploy_pipeline.py ../path/to/project/ ../credentials.yaml 10 | 11 | """ 12 | 13 | PROJ_PATH = os.path.abspath(sys.argv[1]) 14 | CRED_PATH = os.path.abspath(sys.argv[2]) 15 | META_PATH = PROJ_PATH + "/metadata.yaml" 16 | 17 | with open(CRED_PATH) as stream: 18 | try: 19 | credentials = yaml.safe_load(stream) 20 | except yaml.YAMLError as exc: 21 | print(exc) 22 | 23 | with open(META_PATH) as stream: 24 | try: 25 | metadata = yaml.safe_load(stream) 26 | except yaml.YAMLError as exc: 27 | print(exc) 28 | 29 | wml_credentials = {"url": credentials["url"], "apikey": credentials["apikey"]} 30 | 31 | client = APIClient(wml_credentials) 32 | client.spaces.list() 33 | 34 | SPACE_ID = credentials["space_id"] 35 | 36 | if "deployment_uid" in metadata.keys(): 37 | MODEL_GUID = metadata["model_uid"] 38 | DEPLOYMENT_UID = metadata["deployment_uid"] 39 | print("\nExtracting DEPLOYMENT UID and MODEL GUID from metadata file\n") 40 | 41 | else: 42 | 
MODEL_GUID = input("MODEL GUID: ") 43 | DEPLOYMENT_UID = input("DEPLOYMENT UID: ") 44 | 45 | client.set.default_space(SPACE_ID) 46 | 47 | client.repository.list_models_revisions(MODEL_GUID) 48 | 49 | MODEL_VERSION = input("MODEL VERSION: ") 50 | 51 | meta = { 52 | client.deployments.ConfigurationMetaNames.ASSET: { 53 | "id": MODEL_GUID, 54 | "rev": MODEL_VERSION, 55 | } 56 | } 57 | updated_deployment = client.deployments.update( 58 | deployment_uid=DEPLOYMENT_UID, changes=meta 59 | ) 60 | 61 | status = None 62 | while status not in ["ready", "failed"]: 63 | print(".", end=" ") 64 | time.sleep(2) 65 | deployment_details = client.deployments.get_details(DEPLOYMENT_UID) 66 | status = deployment_details["entity"]["status"].get("state") 67 | 68 | print("\nDeployment update finished with status: ", status) 69 | # print(deployment_details) 70 | -------------------------------------------------------------------------------- /dvc.lock: -------------------------------------------------------------------------------- 1 | schema: '2.0' 2 | stages: 3 | preprocess: 4 | cmd: python3 ./src/preprocess_data.py ./data/weatherAUS.csv 5 | deps: 6 | - path: ./src/preprocess_data.py 7 | md5: b5e571f866aa8993ad3bb844594e112e 8 | size: 1909 9 | - path: data/weatherAUS.csv 10 | md5: a65cf8b8719b1a65db4f361eeec18457 11 | size: 14094055 12 | outs: 13 | - path: ./data/features.csv 14 | md5: 49c2fbca9e0ae3101ae5bb56d6a4521a 15 | size: 19266775 16 | - path: ./data/weatherAUS_processed.csv 17 | md5: 59e89e62fb8f9face4901630d1de3e16 18 | size: 19507550 19 | train: 20 | cmd: python3 ./src/train.py ./data/weatherAUS_processed.csv ./src/model.py 200 21 | deps: 22 | - path: ./data/weatherAUS_processed.csv 23 | md5: 59e89e62fb8f9face4901630d1de3e16 24 | size: 19507550 25 | - path: ./src/model.py 26 | md5: 895596132410cf7e581953ecbdc9b44d 27 | size: 4485 28 | - path: ./src/train.py 29 | md5: 1b5c6c1786d40c9505b2261f11a3b274 30 | size: 1002 31 | outs: 32 | - path: ./models/model.joblib 33 | md5: 8cf64091db28e29b327baf946a796f27 34 | size: 3275 35 | evaluate: 36 | cmd: python3 ./src/evaluate.py ./data/weatherAUS_processed.csv ./src/model.py 37 | ./models/model.joblib 38 | deps: 39 | - path: ./data/weatherAUS_processed.csv 40 | md5: 59e89e62fb8f9face4901630d1de3e16 41 | size: 19507550 42 | - path: ./models/model.joblib 43 | md5: 8cf64091db28e29b327baf946a796f27 44 | size: 3275 45 | - path: ./src/evaluate.py 46 | md5: 7e466368d793d09316fc1e078111a9de 47 | size: 882 48 | - path: ./src/model.py 49 | md5: 895596132410cf7e581953ecbdc9b44d 50 | size: 4485 51 | outs: 52 | - path: ./results/metrics.json 53 | md5: 17cacf1c4e374794927b5bc143016e23 54 | size: 120 55 | - path: ./results/precision_recall_curve.png 56 | md5: bf5e1f1911560127be04aae88977b7a4 57 | size: 17045 58 | - path: ./results/roc_curve.png 59 | md5: 77346f3a6fb9f23410af073ac1670898 60 | size: 19933 61 | std_check: 62 | cmd: src/scripts/Scripts/std_check.sh ./ 63 | -------------------------------------------------------------------------------- /src/preprocess_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | from sklearn import preprocessing 5 | 6 | 7 | def count_nulls_by_line(df): 8 | return df.isnull().sum().sort_values(ascending=False) 9 | 10 | 11 | def null_percent_by_line(df): 12 | return (df.isnull().sum() / df.isnull().count()).sort_values(ascending=False) 13 | 14 | 15 | def preprocess_data(DATA_PATH): 16 | df = pd.read_csv(DATA_PATH) 17 | 18 | zeros_cnt = 
count_nulls_by_line(df) 19 | # df.isnull().sum().sort_values(ascending=False) 20 | percent_zeros = null_percent_by_line(df) 21 | # (df.isnull().sum() / df.isnull().count()).sort_values(ascending=False) 22 | 23 | missing_data = pd.concat( 24 | [zeros_cnt, percent_zeros], axis=1, keys=["Total", "Percent"] 25 | ) 26 | 27 | dropList = list(missing_data[missing_data["Percent"] > 0.15].index) 28 | 29 | df.drop(dropList, axis=1, inplace=True) 30 | df.drop(["Date"], axis=1, inplace=True) 31 | df.drop(["Location"], axis=1, inplace=True) 32 | 33 | ohe = pd.get_dummies(data=df, columns=["WindGustDir", "WindDir9am", "WindDir3pm"]) 34 | 35 | ohe["RainToday"] = df["RainToday"].astype(str) 36 | ohe["RainTomorrow"] = df["RainTomorrow"].astype(str) 37 | 38 | lb = preprocessing.LabelBinarizer() 39 | 40 | ohe["RainToday"] = lb.fit_transform(ohe["RainToday"]) 41 | ohe["RainTomorrow"] = lb.fit_transform(ohe["RainTomorrow"]) 42 | ohe = ohe.dropna() 43 | precessed_df = ohe 44 | 45 | y = ohe["RainTomorrow"] 46 | X = ohe.drop(["RainTomorrow"], axis=1) 47 | 48 | cols = precessed_df.columns.tolist() 49 | cols.remove("RainTomorrow") 50 | cols.append("RainTomorrow") 51 | precessed_df = precessed_df[cols] 52 | 53 | cols = precessed_df.columns.tolist() 54 | 55 | features_df = precessed_df.drop(["RainTomorrow"], axis=1) 56 | features_df.to_csv("./data/features.csv", index=False) 57 | 58 | precessed_df.to_csv(DATA_PATH[:-4] + "_processed.csv", index=False) 59 | 60 | 61 | if __name__ == "__main__": 62 | DATA_PATH = os.path.abspath(sys.argv[1]) 63 | preprocess_data(DATA_PATH) 64 | print("Saved to {}".format(DATA_PATH[:-4] + "_processed.csv")) 65 | -------------------------------------------------------------------------------- /src/tests/preprocess/test_preprocess.py: -------------------------------------------------------------------------------- 1 | # PyTest file for all preprocessing of data 2 | 3 | import io 4 | import builtins 5 | import pytest 6 | import pandas as pd 7 | import sys 8 | import os 9 | 10 | # Parent Folder 11 | sys.path.append( 12 | os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) 13 | ) 14 | 15 | # Preprocess Python file 16 | import preprocess_data 17 | 18 | FILE_NAME = "testWeatherAUS" 19 | DATA_PATH = ( 20 | os.path.dirname(os.path.realpath(__file__)) + "/test_data/" + FILE_NAME + ".csv" 21 | ) 22 | PROCESSED_DATA_PATH = ( 23 | os.path.dirname(os.path.realpath(__file__)) 24 | + "/test_data/" 25 | + FILE_NAME 26 | + "_processed.csv" 27 | ) 28 | 29 | 30 | def test_count_nulls_by_line(): 31 | # Tests function that counts number of nulls by line on a dataframe 32 | data = pd.DataFrame([[0, 2], [0, 1], [6, None]]) 33 | assert preprocess_data.count_nulls_by_line(data).to_list() == [1, 0] 34 | 35 | 36 | def test_null_percent(): 37 | # Tests function that gets the percentage of nulls by line on a dataframe 38 | data = pd.DataFrame([[0, 2], [1, None]]) 39 | assert preprocess_data.null_percent_by_line(data).to_list() == [0.5, 0] 40 | 41 | 42 | # @pytest.mark.dependency() 43 | # def test_preprocess(): 44 | # # Checks if running the preprocess function returns an error 45 | # preprocess_data.preprocess_data(DATA_PATH) 46 | 47 | 48 | # @pytest.mark.dependency(depends=["test_preprocess"]) 49 | # def test_processed_file_created(): 50 | # # Checks if the processed file was created during test_preprocess() and is accessible 51 | # f = open(PROCESSED_DATA_PATH) 52 | 53 | 54 | # @pytest.mark.dependency(depends=["test_processed_file_created"]) 55 | # def test_processed_file_format(): 56 | # # 
Checks if the processed file is in the correct format (.csv) and can be transformed in dataframe 57 | # try: 58 | # pd.read_csv(PROCESSED_DATA_PATH) 59 | # except: 60 | # raise RuntimeError("Unable to open " + PROCESSED_DATA_PATH + " as dataframe") 61 | 62 | 63 | @pytest.fixture(scope="session", autouse=True) 64 | def cleanup(request): 65 | # Runs tests then cleans up the processed file 66 | yield 67 | try: 68 | os.remove(PROCESSED_DATA_PATH) 69 | except: 70 | pass 71 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/model_deploy_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pickle 4 | import yaml 5 | import joblib 6 | from ibm_watson_machine_learning import APIClient 7 | 8 | """ 9 | Usage: 10 | python3 model_deploy_pipeline.py ./pickle_model ../path/to/project/ ../credentials.yaml 11 | 12 | """ 13 | 14 | MODEL_PATH = os.path.abspath(sys.argv[1]) 15 | PROJ_PATH = os.path.abspath(sys.argv[2]) 16 | CRED_PATH = os.path.abspath(sys.argv[3]) 17 | META_PATH = PROJ_PATH + "/metadata.yaml" 18 | 19 | 20 | with open(CRED_PATH) as stream: 21 | try: 22 | credentials = yaml.safe_load(stream) 23 | except yaml.YAMLError as exc: 24 | print(exc) 25 | 26 | 27 | with open(META_PATH) as stream: 28 | try: 29 | metadata = yaml.safe_load(stream) 30 | except yaml.YAMLError as exc: 31 | print(exc) 32 | 33 | with open(MODEL_PATH, "rb") as file: 34 | # pickle_model = pickle.load(file) 35 | pipeline = joblib.load(file) 36 | 37 | wml_credentials = {"url": credentials["url"], "apikey": credentials["apikey"]} 38 | 39 | client = APIClient(wml_credentials) 40 | client.spaces.list() 41 | 42 | MODEL_NAME = metadata["project_name"] + "_" + metadata["project_version"] 43 | DEPLOY_NAME = MODEL_NAME + "-Deployment" 44 | MODEL = pipeline 45 | SPACE_ID = credentials["space_id"] 46 | 47 | client.set.default_space(SPACE_ID) 48 | 49 | model_props = { 50 | client.repository.ModelMetaNames.NAME: MODEL_NAME, 51 | client.repository.ModelMetaNames.TYPE: metadata["model_type"], 52 | client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: client.software_specifications.get_id_by_name( 53 | "default_py3.7" 54 | ), 55 | } 56 | 57 | model_details = client.repository.store_model(model=MODEL, meta_props=model_props) 58 | model_uid = client.repository.get_model_uid(model_details) 59 | 60 | deployment_props = { 61 | client.deployments.ConfigurationMetaNames.NAME: DEPLOY_NAME, 62 | client.deployments.ConfigurationMetaNames.ONLINE: {}, 63 | } 64 | 65 | deployment = client.deployments.create( 66 | artifact_uid=model_uid, meta_props=deployment_props 67 | ) 68 | 69 | deployment_uid = client.deployments.get_uid(deployment) 70 | 71 | metadata["model_uid"] = model_uid 72 | metadata["deployment_uid"] = deployment_uid 73 | 74 | f = open(META_PATH, "w+") 75 | yaml.dump(metadata, f, allow_unicode=True) 76 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/model_deployed_validate_pipeline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import yaml 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | from sklearn.metrics import confusion_matrix, accuracy_score 7 | from sklearn.model_selection import cross_val_score 8 | from ibm_watson_machine_learning import APIClient 9 | from sklearn.model_selection import train_test_split 10 | 11 | """ 12 | Usage: 13 | python3 model_deployed_validate_pipeline.py 
../../ ../../credentials.yaml path/to/project/ 14 | 15 | """ 16 | 17 | DATA_PATH = os.path.abspath(sys.argv[1]) 18 | CRED_PATH = os.path.abspath(sys.argv[2]) 19 | PROJ_PATH = os.path.abspath(sys.argv[3]) 20 | META_PATH = PROJ_PATH + "/metadata.yaml" 21 | 22 | 23 | def main(): 24 | with open(CRED_PATH) as stream: 25 | try: 26 | credentials = yaml.safe_load(stream) 27 | except yaml.YAMLError as exc: 28 | print(exc) 29 | 30 | with open(META_PATH) as stream: 31 | try: 32 | metadata = yaml.safe_load(stream) 33 | except yaml.YAMLError as exc: 34 | print(exc) 35 | 36 | data = pd.read_csv(DATA_PATH) 37 | 38 | X = data.iloc[:, :-1] 39 | y = data[data.columns[-1]] 40 | X_train, X_test, y_train, y_test = train_test_split( 41 | X, y, test_size=0.3, random_state=0 42 | ) 43 | 44 | wml_credentials = {"url": credentials["url"], "apikey": credentials["apikey"]} 45 | 46 | client = APIClient(wml_credentials) 47 | client.spaces.list() 48 | 49 | SPACE_ID = credentials["space_id"] 50 | 51 | if "deployment_uid" in metadata.keys(): 52 | DEPLOYMENT_UID = metadata["deployment_uid"] 53 | print("\nExtracting DEPLOYMENT UID from metadata file\n") 54 | 55 | else: 56 | DEPLOYMENT_UID = input("DEPLOYMENT UID: ") 57 | 58 | client.set.default_space(SPACE_ID) 59 | 60 | payload = { 61 | "input_data": [ 62 | { 63 | "fields": X.columns.to_numpy().tolist(), 64 | "values": X_test.to_numpy().tolist(), 65 | } 66 | ] 67 | } 68 | result = client.deployments.score(DEPLOYMENT_UID, payload) 69 | 70 | pred_values = np.squeeze(result["predictions"][0]["values"]) 71 | y_pred_values = [i[0] for i in pred_values] 72 | 73 | def comb_eval(y, y_pred): 74 | cm = confusion_matrix(y, y_pred) 75 | acc = accuracy_score(y, y_pred) 76 | 77 | return {"cm": cm, "acc": acc} 78 | 79 | eval = comb_eval(y_test, y_pred_values) 80 | print(eval) 81 | 82 | return eval 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🧬 DVC CI/CD MLOps Pipeline 2 | MLOps pipeline with DVC and CML using Github Actions and IBM Cloud 3 | 4 | 5 | [![model-deploy-on-release](https://github.com/MLOPsStudyGroup/dvc-gitactions/actions/workflows/deploy_on_release.yaml/badge.svg)](https://github.com/MLOPsStudyGroup/dvc-gitactions/actions/workflows/deploy_on_release.yaml) 6 | [![Python Package and Test](https://github.com/MLOPsStudyGroup/dvc-gitactions/actions/workflows/test_on_push.yaml/badge.svg)](https://github.com/MLOPsStudyGroup/dvc-gitactions/actions/workflows/test_on_push.yaml) 7 | 8 | [Video Demo](https://www.youtube.com/watch?v=URpGaE-FA5U) 9 | 10 | [Documentation and Implementation Guide](https://mlops-guide.github.io) 11 | 12 | ## 🔰 Milestones 13 | - [X] Data Versioning: DVC 14 | - [X] Machine Learning Pipeline: DVC Pipeline (preprocess, train, evaluate) 15 | - [X] CI/CD: Unit testing with Pytest, pre-commit and Github Actions 16 | - [X] CML: Continuous Machine Learning and Github Actions 17 | - [X] Deploy on release: Github Actions and IBM Watson 18 | - [X] Monitoring: OpenScale 19 | - [X] Infrastructure-as-a-code: Terraform script 20 | 21 | ## 📋 Requirements 22 | 23 | * DVC 24 | * Python3 and pip 25 | * Access to IBM Cloud Object Storage 26 | 27 | ## 🏃🏻 Running Project 28 | 29 | ### 🔑 Setup IBM Bucket Credentials 30 | 31 | #### MacOS 32 | Setup your credentials on ```~/.aws/credentials``` and ```~/.aws/config```. 
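A minimal ```~/.aws/config``` can look like the sketch below; the profile name and region value are only illustrative, since the storage endpoint itself is configured in ```.dvc/config```:

```
[default]
region = us-south
```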
DVC works with IBM Cloud Object Storage through the S3 protocol, as you can see in other parts of this repository. 33 | 34 | 35 | ~/.aws/credentials 36 | 37 | ```credentials 38 | [default] 39 | aws_access_key_id = {{Key ID}} 40 | aws_secret_access_key = {{Access Key}} 41 | ``` 42 | 43 | 44 | ### ✅ Pre-commit Tests 45 | 46 | To activate pre-commit testing you need ```pre-commit``` 47 | 48 | Installing pre-commit with pip 49 | ``` 50 | pip install pre-commit 51 | ``` 52 | 53 | Installing pre-commit on your local repository. Keep in mind this creates a Git hook. 54 | ``` 55 | pre-commit install 56 | ``` 57 | 58 | Now every time you make a commit, it will run the tests defined in ```.pre-commit-config.yaml``` before allowing your commit. 59 | 60 | **Example** 61 | ``` 62 | $ git commit -m "Example commit" 63 | 64 | black....................................................................Passed 65 | pytest-check.............................................................Passed 66 | ``` 67 | 68 | 69 | ### ⚗️ Using DVC 70 | 71 | Download data from the DVC remote (analogous to ```git pull```) 72 | ``` 73 | dvc pull 74 | ``` 75 | 76 | Reproduce the pipeline using DVC 77 | ``` 78 | dvc repro 79 | ``` 80 | 81 | 82 | ### ⚙️ DVC Pipelines 83 | 84 | 85 | ✂️ Preprocessing pipeline 86 | ``` 87 | dvc run -n preprocess -d ./src/preprocess_data.py -d data/weatherAUS.csv \ 88 | -o ./data/weatherAUS_processed.csv -o ./data/features.csv \ 89 | python3 ./src/preprocess_data.py ./data/weatherAUS.csv 90 | ``` 91 | 92 | 93 | 📘 Training pipeline 94 | ``` 95 | dvc run -n train -d ./src/train.py -d ./data/weatherAUS_processed.csv \ 96 | -d ./src/model.py \ 97 | -o ./models/model.joblib \ 98 | python3 ./src/train.py ./data/weatherAUS_processed.csv ./src/model.py 200 99 | ``` 100 | 101 | 102 | 📊 Evaluate pipeline 103 | ``` 104 | dvc run -n evaluate -d ./src/evaluate.py -d ./data/weatherAUS_processed.csv \ 105 | -d ./src/model.py -d ./models/model.joblib -o ./results/metrics.json \ 106 | -o ./results/precision_recall_curve.png -o ./results/roc_curve.png \ 107 | python3 ./src/evaluate.py ./data/weatherAUS_processed.csv ./src/model.py ./models/model.joblib 108 | ``` 109 | 110 | ### 🐙 GitHub Actions 111 | 🔐 IBM Credentials 112 | 113 | 114 | Fill the ```credentials_example.yaml``` file and rename it to ```credentials.yaml``` to be able to run the scripts that require IBM keys. ⚠️ Never upload this file to GitHub! 115 | 116 | To use GitHub Actions to deploy your model, you'll need to encrypt this file. To do that, run the command below and choose a strong password. 
117 | 118 | ``` 119 | gpg --symmetric --cipher-algo AES256 credentials.yaml 120 | ``` 121 | Now in the GitHub page for the repository, go to ```Settings->Secrets``` and add the keys to the following secrets: 122 | 123 | ``` 124 | AWS_ACCESS_KEY_ID (Bucket Credential) 125 | AWS_SECRET_ACCESS_KEY (Bucket Credential) 126 | IBM_CREDENTIALS_PASS (password for the encrypted file) 127 | ``` 128 | -------------------------------------------------------------------------------- /src/scripts/Pipelines/model_train_autoAI.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import yaml 4 | import pandas as pd 5 | 6 | # from ibm_watson_machine_learning.helpers import DataConnection, S3Connection, S3Location 7 | from ibm_watson_machine_learning.experiment import AutoAI 8 | from ibm_watson_machine_learning.autoai.helpers.connections import ( 9 | S3Connection, 10 | S3Location, 11 | DataConnection, 12 | ) 13 | 14 | DATA_PATH = os.path.abspath(sys.argv[1]) 15 | CRED_PATH = os.path.abspath(sys.argv[2]) 16 | PROJ_PATH = os.path.abspath(sys.argv[3]) 17 | META_PATH = PROJ_PATH + "/metadata.yaml" 18 | 19 | 20 | with open(CRED_PATH) as stream: 21 | try: 22 | credentials = yaml.safe_load(stream) 23 | except yaml.YAMLError as exc: 24 | print(exc) 25 | 26 | with open(META_PATH) as stream: 27 | try: 28 | metadata = yaml.safe_load(stream) 29 | except yaml.YAMLError as exc: 30 | print(exc) 31 | 32 | wml_credentials = {"url": credentials["url"], "apikey": credentials["apikey"]} 33 | 34 | SPACE_ID = credentials["space_id"] 35 | 36 | data = pd.read_csv(DATA_PATH) 37 | 38 | X = data.iloc[:, :-1] 39 | y = data[data.columns[-1]] 40 | cols = data.columns.tolist() 41 | TARGET = cols[-1] 42 | AUTOAI_ENDPOINT = "auto_ml/a4243da5-a8b0-4e6a-8273-13c161f7e117/wml_data/d8ab3fe3-17a2-4474-b702-9c8309586a40" 43 | 44 | experiment = AutoAI( 45 | wml_credentials=wml_credentials, 46 | # project_id=credentials['project_id'], 47 | space_id=credentials["space_id"], 48 | ) 49 | 50 | 51 | pipeline_optimizer = experiment.optimizer( 52 | name=metadata["project_name"], 53 | desc="", 54 | prediction_type=AutoAI.PredictionType.BINARY, 55 | prediction_column=TARGET, 56 | scoring=AutoAI.Metrics.ACCURACY_SCORE, 57 | test_size=0.2, 58 | max_num_daub_ensembles=1, 59 | train_sample_rows_test_size=1.0, 60 | daub_include_only_estimators=[ 61 | AutoAI.ClassificationAlgorithms.XGB, 62 | AutoAI.ClassificationAlgorithms.LGBM, 63 | ], 64 | cognito_transform_names=[AutoAI.Transformers.SUM, AutoAI.Transformers.MAX], 65 | ) 66 | 67 | 68 | # note: this DataConnection will be used as a reference where to find your training dataset 69 | training_data_connection = DataConnection( 70 | connection=S3Connection( 71 | endpoint_url="url of the COS endpoint", 72 | access_key_id="COS access key id", 73 | secret_access_key="COS secret acces key", 74 | ), 75 | location=S3Location( 76 | bucket="bucket_name", # note: COS bucket name where training dataset is located 77 | path="my_path", # note: path within bucket where your training dataset is located 78 | ), 79 | ) 80 | 81 | # note: this DataConnection will be used as a reference where to save all of the AutoAI experiment results 82 | results_connection = DataConnection( 83 | connection=S3Connection( 84 | endpoint_url="url of the COS endpoint", 85 | access_key_id="COS access key id", 86 | secret_access_key="COS secret acces key", 87 | ), 88 | # note: bucket name and path could be different or the same as specified in the training_data_connection 89 | 
location=S3Location(bucket="bucket_name", path="my_path"), 90 | ) 91 | 92 | # training_data_connection = [DataConnection( 93 | # connection=S3Connection( 94 | # api_key=credentials['s3_apikey'], 95 | # auth_endpoint='https://iam.bluemix.net/oidc/token/', 96 | # endpoint_url='https://s3-api.us-geo.objectstorage.softlayer.net' 97 | # ), 98 | # location=S3Location( 99 | # bucket=credentials['s3_bucket'], 100 | # path=DATA_PATH 101 | # )) 102 | # ] 103 | # results_connection = DataConnection( 104 | # connection=S3Connection( 105 | # api_key=credentials['s3_apikey'], 106 | # auth_endpoint='https://iam.bluemix.net/oidc/token/', 107 | # endpoint_url='https://s3-api.us-geo.objectstorage.softlayer.net' 108 | # ), 109 | # location=S3Location( 110 | # bucket=credentials['s3_bucket'], 111 | # path=AUTOAI_ENDPOINT+'/data/automl', 112 | # model_location=AUTOAI_ENDPOINT+'/data/automl/cognito_output/Pipeline1/model.pickle', 113 | # training_status=AUTOAI_ENDPOINT+'/training-status.json' 114 | # )) 115 | 116 | fit_details = pipeline_optimizer.fit( 117 | training_data_reference=[training_data_connection], 118 | training_results_reference=results_connection, 119 | background_mode=True, 120 | ) 121 | 122 | 123 | status = pipeline_optimizer.get_run_status() 124 | print(status) 125 | 126 | run_details = pipeline_optimizer.get_run_details() 127 | 128 | results = pipeline_optimizer.summary() 129 | print(results) 130 | -------------------------------------------------------------------------------- /.infra/datapak_manage.py: -------------------------------------------------------------------------------- 1 | """ 2 | DataPak deployment space manage script 3 | 4 | """ 5 | 6 | import os 7 | import sys 8 | from pprint import pprint 9 | import json 10 | from ibm_watson_machine_learning import APIClient 11 | 12 | TERRAFORM_OUTPUT = ".terraform/terraform.tfstate" 13 | 14 | 15 | def authentication(): 16 | 17 | if os.getenv("IBMCLOUD_API_KEY"): 18 | 19 | wml_credentials = { 20 | "url": "https://us-south.ml.cloud.ibm.com", 21 | "apikey": os.environ.get("IBMCLOUD_API_KEY"), 22 | } 23 | client = APIClient(wml_credentials) # Connect to IBM cloud 24 | 25 | return client 26 | 27 | raise Exception("API_KEY environment variable not defined") 28 | 29 | 30 | def terraform_output(terraform_path=TERRAFORM_OUTPUT): 31 | 32 | output = dict(json.load(open(terraform_path)))["outputs"] 33 | 34 | cos_crn = output["cos_crn"]["value"] 35 | wml_crn = output["wml_crn"]["value"]["crn"] 36 | wml_name = output["wml_crn"]["value"]["resource_name"] 37 | 38 | state = {"cos_crn": cos_crn, "wml_name": wml_name, "wml_crn": wml_crn} 39 | return state 40 | 41 | 42 | def create_deployment_space( 43 | client, cos_crn, wml_name, wml_crn, space_name="default", description="" 44 | ): 45 | 46 | metadata = { 47 | client.spaces.ConfigurationMetaNames.NAME: space_name, ## Project info 48 | client.spaces.ConfigurationMetaNames.DESCRIPTION: description, 49 | client.spaces.ConfigurationMetaNames.STORAGE: { 50 | "type": "bmcos_object_storage", 51 | "resource_crn": cos_crn, 52 | }, 53 | client.spaces.ConfigurationMetaNames.COMPUTE: { ## Project compute instance (WML) 54 | "name": wml_name, 55 | "crn": wml_crn, 56 | }, 57 | } 58 | 59 | space_details = client.spaces.store(meta_props=metadata) # Create a space 60 | return space_details 61 | 62 | 63 | def update_deployment_space(client, new_name, space_id): 64 | 65 | metadata = {client.spaces.ConfigurationMetaNames.NAME: new_name} 66 | 67 | space_details = client.spaces.update(space_id, changes=metadata) 68 | return 
space_details 69 | 70 | 71 | def delete_deployment_space(client, space_id): 72 | 73 | client.spaces.delete(space_id) 74 | 75 | 76 | def list_deployment_space(client): 77 | spaces = client.spaces.list() 78 | print(spaces) 79 | 80 | 81 | def describe_deployment_space(client, space_id): 82 | info = client.spaces.get_details(space_id) 83 | pprint(info) 84 | 85 | 86 | def help(): 87 | 88 | print( 89 | """ 90 | datapak_config.py [options] 91 | 92 | create 93 | update 94 | delete 95 | list 96 | describe 97 | """ 98 | ) 99 | 100 | 101 | if __name__ == "__main__": 102 | 103 | client = authentication() 104 | 105 | args = sys.argv[1:] 106 | 107 | if len(args) >= 1: 108 | action = args[0] 109 | 110 | if action == "create": 111 | 112 | infos = terraform_output() 113 | if len(args) == 2: 114 | space_name = args[1] 115 | space = create_deployment_space( 116 | client, 117 | infos["cos_crn"], 118 | infos["wml_name"], 119 | infos["wml_crn"], 120 | space_name, 121 | ) 122 | 123 | elif len(args) > 2: 124 | space_name = args[1] 125 | description = args[2] 126 | space = create_deployment_space( 127 | client, 128 | infos["cos_crn"], 129 | infos["wml_name"], 130 | infos["wml_crn"], 131 | space_name, 132 | description, 133 | ) 134 | 135 | pprint(space) 136 | 137 | elif action == "update": 138 | 139 | try: 140 | new_name = args[1] 141 | space_id = args[2] 142 | except: 143 | raise Exception("Missing arguments") 144 | 145 | space = update_deployment_space(client, new_name, space_id) 146 | pprint(space) 147 | 148 | elif action == "delete": 149 | try: 150 | space_id = args[1] 151 | except: 152 | raise Exception("Missing space_id") 153 | 154 | delete_deployment_space(client, space_id) 155 | 156 | elif action == "list": 157 | list_deployment_space(client) 158 | 159 | elif action == "describe": 160 | 161 | try: 162 | space_id = args[1] 163 | except: 164 | raise Exception("Missing space_id") 165 | 166 | describe_deployment_space(client, space_id) 167 | 168 | else: 169 | help() 170 | 171 | else: 172 | help() 173 | -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sklearn 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.model_selection import cross_val_score 5 | from sklearn.metrics import accuracy_score, f1_score 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.ensemble import RandomForestClassifier 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.model_selection import cross_val_score 11 | from sklearn.metrics import confusion_matrix 12 | from sklearn.metrics import accuracy_score 13 | from sklearn.metrics import precision_score, recall_score, f1_score 14 | from sklearn.metrics import precision_recall_curve 15 | from sklearn.metrics import roc_curve 16 | from sklearn.metrics import roc_auc_score 17 | import matplotlib.pyplot as plt 18 | 19 | 20 | def get_variables(data, column): 21 | # Seperating the dependant and independant variables 22 | y = data[column] 23 | X = data.drop([column], axis=1) 24 | 25 | return X, y 26 | 27 | 28 | def train(data, num_estimators, isDataFrame=False): 29 | 30 | if not isDataFrame: 31 | data = pd.read_csv(data) 32 | 33 | # Seperating the dependant and independant variables 34 | # y = data["RainTomorrow"] 35 | # X = data.drop(["RainTomorrow"], axis=1) 36 | 37 | X, y = get_variables(data, "RainTomorrow") 38 | 39 | X_train, 
X_test, y_train, y_test = train_test_split(
40 |         X, y, test_size=0.3, random_state=0
41 |     )
42 | 
43 |     pipe = Pipeline(
44 |         [
45 |             ("scaler", StandardScaler()),
46 |             (
47 |                 "RFC",
48 |                 RandomForestClassifier(
49 |                     criterion="gini",
50 |                     max_depth=10,
51 |                     max_features="sqrt",  # "auto" was a deprecated alias for "sqrt" and is removed in newer scikit-learn
52 |                     n_estimators=num_estimators,
53 |                 ),
54 |             ),
55 |         ]
56 |     )
57 | 
58 |     training_logs = pipe.fit(X_train, y_train)
59 | 
60 |     logs = {"training_logs": training_logs}
61 | 
62 |     return pipe, logs
63 | 
64 | 
65 | def evaluate(data, pipeline, OUTPUT_PATH, isDataFrame=False):
66 | 
67 |     pipe = pipeline
68 | 
69 |     if not isDataFrame:
70 |         data = pd.read_csv(data)
71 | 
72 |     y = data["RainTomorrow"]
73 |     X = data.drop(["RainTomorrow"], axis=1)
74 | 
75 |     X_train, X_test, y_train, y_test = train_test_split(
76 |         X, y, test_size=0.3, random_state=0
77 |     )
78 | 
79 |     # metrics
80 |     def comb_eval(y, y_pred):
81 |         acc = accuracy_score(y, y_pred)
82 |         recall = recall_score(y, y_pred)
83 |         precision = precision_score(y, y_pred)
84 |         f1 = f1_score(y, y_pred)
85 | 
86 |         return {"accuracy": acc, "recall": recall, "precision": precision, "f1": f1}
87 | 
88 |     # y_pred_train = pipe.predict(X_train)
89 |     # train_result = comb_eval(y_train, y_pred_train)
90 | 
91 |     y_pred_test = pipe.predict(X_test)
92 |     test_result = comb_eval(y_test, y_pred_test)
93 | 
94 |     # cvs = cross_val_score(pipe, X, y, cv=3)
95 | 
96 |     # roc curve
97 |     # y_pred = pipe.predict(X_test)
98 | 
99 |     dummy_probs = [0 for _ in range(len(y_test))]
100 |     model_probs = pipe.predict_proba(X_test)
101 |     model_probs = model_probs[:, 1]
102 | 
103 |     # model_auc = roc_auc_score(y_test, model_probs)
104 | 
105 |     dummy_fpr, dummy_tpr, _ = roc_curve(y_test, dummy_probs)
106 |     model_fpr, model_tpr, _ = roc_curve(y_test, model_probs)
107 | 
108 |     # precision_recall_curve
109 |     y_scores = pipe.predict_proba(X_test)[:, 1]
110 |     precisions, recalls, thresholds = precision_recall_curve(y_test, y_scores)
111 | 
112 |     logs = {
113 |         "metrics": test_result,
114 |         "roc_curve": {
115 |             "model_tpr": model_tpr,
116 |             "model_fpr": model_fpr,
117 |             "dummy_tpr": dummy_tpr,
118 |             "dummy_fpr": dummy_fpr,
119 |         },
120 |         "precision_recall_curve": {
121 |             "precisions": precisions,
122 |             "recalls": recalls,
123 |             "thresholds": thresholds,
124 |         },
125 |     }
126 | 
127 |     # roc curve
128 |     # plot the roc curve for the model
129 |     plt.plot(
130 |         logs["roc_curve"]["dummy_fpr"],
131 |         logs["roc_curve"]["dummy_tpr"],
132 |         linestyle="--",
133 |         label="Dummy Classifier",
134 |     )
135 |     plt.plot(
136 |         logs["roc_curve"]["model_fpr"],
137 |         logs["roc_curve"]["model_tpr"],
138 |         marker=".",
139 |         label="RFC",
140 |     )
141 |     # axis labels
142 |     plt.xlabel("False Positive Rate")
143 |     plt.ylabel("True Positive Rate")
144 |     # show the legend
145 |     plt.legend()
146 |     out_path = OUTPUT_PATH + "/roc_curve.png"
147 |     plt.savefig(out_path, dpi=80)
148 |     plt.cla()
149 | 
150 |     def plot_prc(precisions, recalls, thresholds):
151 |         plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
152 |         plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
153 |         plt.xlabel("Thresholds")
154 |         plt.legend(loc="center left")
155 |         plt.ylim([0, 1])
156 |         out_path = OUTPUT_PATH + "/precision_recall_curve.png"
157 |         plt.savefig(out_path, dpi=80)
158 | 
159 |     plot_prc(
160 |         logs["precision_recall_curve"]["precisions"],
161 |         logs["precision_recall_curve"]["recalls"],
162 |         logs["precision_recall_curve"]["thresholds"],
163 |     )
164 | 
165 |     return logs
166 | 
--------------------------------------------------------------------------------
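The train() and evaluate() functions in src/model.py above are the entry points the rest of the project builds on (src/train.py and src/evaluate.py presumably wrap them for the pipeline stages). A minimal usage sketch follows; the CSV path, the models/ and results/ output locations, and num_estimators=100 are illustrative assumptions, not taken from the repository's own pipeline code:

import json
import joblib
from src.model import train, evaluate

# Train the RandomForest pipeline on the preprocessed dataset (path assumed).
pipe, train_logs = train("data/weatherAUS_processed.csv", num_estimators=100)

# Persist the fitted pipeline (output path assumed).
joblib.dump(pipe, "models/model.joblib")

# Evaluate and write the metrics next to the ROC / precision-recall plots.
eval_logs = evaluate("data/weatherAUS_processed.csv", pipe, "results")
with open("results/metrics.json", "w") as fp:
    json.dump(eval_logs["metrics"], fp)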
/src/scripts/Pipelines/openscale.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import yaml
4 | from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
5 | from ibm_ai_openscale import APIClient
6 | from ibm_ai_openscale.engines import *
7 | from ibm_ai_openscale.utils import *
8 | from ibm_ai_openscale.supporting_classes import PayloadRecord, Feature
9 | from ibm_ai_openscale.supporting_classes.enums import *
10 | import requests
11 | from ibm_ai_openscale.utils import get_instance_guid
12 | import ibm_watson_machine_learning
13 | import json
14 | import pandas as pd
15 | from sklearn.model_selection import train_test_split
16 | import numpy as np
17 | from sklearn.metrics import (
18 |     confusion_matrix,
19 |     accuracy_score,
20 |     precision_score,
21 |     recall_score,
22 |     f1_score,
23 | )
24 | from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
25 | from ibm_watson_openscale import *
26 | from ibm_watson_openscale.supporting_classes.enums import *
27 | from ibm_watson_openscale.supporting_classes.payload_record import PayloadRecord
28 | import ibm_watson_openscale
29 | 
30 | 
31 | with open("../credentials.yaml") as stream:
32 |     try:
33 |         credentials = yaml.safe_load(stream)
34 |     except yaml.YAMLError as exc:
35 |         print(exc)
36 | 
37 | 
38 | with open("../metadata.yaml") as stream:
39 |     try:
40 |         metadata = yaml.safe_load(stream)
41 |     except yaml.YAMLError as exc:
42 |         print(exc)
43 | 
44 | 
45 | service_credentials = {
46 |     "apikey": credentials["apikey"],
47 |     "url": "https://api.aiopenscale.cloud.ibm.com",
48 | }
49 | 
50 | DEPLOYMENT_UID = metadata["deployment_uid"]
51 | MODEL_UID = metadata["model_uid"]
52 | MODEL_NAME = metadata["project_name"] + "_" + metadata["project_version"]
53 | SPACE_ID = credentials["space_id"]
54 | WOS_GUID = get_instance_guid(api_key=service_credentials["apikey"])
55 | WOS_CREDENTIALS = {
56 |     "instance_guid": WOS_GUID,
57 |     "apikey": service_credentials["apikey"],
58 |     "url": "https://api.aiopenscale.cloud.ibm.com",
59 | }
60 | 
61 | if WOS_GUID is None:
62 |     print("Watson OpenScale GUID NOT FOUND")
63 | else:
64 |     print(WOS_GUID)
65 | 
66 | ai_client = APIClient(aios_credentials=WOS_CREDENTIALS)
67 | print(ai_client.version)
68 | 
69 | wml_credentials = {"url": credentials["url"], "apikey": credentials["apikey"]}
70 | 
71 | wml_client = ibm_watson_machine_learning.APIClient(wml_credentials)
72 | 
73 | wml_credentials = {
74 |     "url": credentials["url"],
75 |     "apikey": credentials["apikey"],
76 |     "instance_id": "wml_local",
77 | }
78 | 
79 | wml_client.set.default_space(SPACE_ID)  # deployment space id from credentials.yaml instead of a hard-coded UUID
80 | 
81 | authenticator = IAMAuthenticator(apikey=credentials["apikey"])
82 | wos_client = ibm_watson_openscale.APIClient(
83 |     authenticator=authenticator, service_url="https://api.aiopenscale.cloud.ibm.com"
84 | )
85 | 
86 | 
87 | KEEP_MY_INTERNAL_POSTGRES = True
88 | DB_CREDENTIALS = None
89 | try:
90 |     data_mart_details = ai_client.data_mart.get_details()
91 |     if (
92 |         "internal_database" in data_mart_details
93 |         and data_mart_details["internal_database"]
94 |     ):
95 |         if KEEP_MY_INTERNAL_POSTGRES:
96 |             print("Using existing internal datamart.")
97 |         else:
98 |             if DB_CREDENTIALS is None:
99 |                 print(
100 |                     "No postgres credentials supplied.
Using existing internal datamart" 101 | ) 102 | else: 103 | print("Switching to external datamart") 104 | ai_client.data_mart.delete(force=True) 105 | ai_client.data_mart.setup(db_credentials=DB_CREDENTIALS) 106 | else: 107 | print("Using existing external datamart") 108 | except: 109 | if DB_CREDENTIALS is None: 110 | print("Setting up internal datamart") 111 | ai_client.data_mart.setup(internal_db=True) 112 | else: 113 | print("Setting up external datamart") 114 | try: 115 | ai_client.data_mart.setup(db_credentials=DB_CREDENTIALS) 116 | except: 117 | print("Setup failed, trying Db2 setup") 118 | ai_client.data_mart.setup( 119 | db_credentials=DB_CREDENTIALS, schema=DB_CREDENTIALS["username"] 120 | ) 121 | data_mart_details = ai_client.data_mart.get_details() 122 | 123 | binding_uid = ai_client.data_mart.bindings.add( 124 | "Rain Aus", WatsonMachineLearningInstance(wml_credentials) 125 | ) 126 | 127 | bindings_details = ai_client.data_mart.bindings.get_details() 128 | 129 | if binding_uid is None: 130 | binding_uid = [ 131 | binding["metadata"]["guid"] 132 | for binding in bindings_details["service_bindings"] 133 | if binding["entity"]["name"] == "WML Cloud Instance" 134 | ][0] 135 | ai_client.data_mart.bindings.list() 136 | 137 | ai_client.data_mart.bindings.list_assets(binding_uid=binding_uid) 138 | 139 | subscriptions_uids = ai_client.data_mart.subscriptions.get_uids() 140 | # for subscription in subscriptions_uids: 141 | # sub_name = ai_client.data_mart.subscriptions.get_details(subscription)['entity']['asset']['name'] 142 | # if sub_name == MODEL_NAME: 143 | # ai_client.data_mart.subscriptions.delete(subscription) 144 | # print('Deleted existing subscription for', MODEL_NAME) 145 | 146 | # subscription = ai_client.data_mart.subscriptions.add(WatsonMachineLearningAsset( 147 | # MODEL_UID, 148 | # problem_type=ProblemType.BINARY_CLASSIFICATION, 149 | # input_data_type=InputDataType.STRUCTURED, 150 | # label_column='RainTomorrow', 151 | # prediction_column='predictedLabel', 152 | # probability_column='probability', 153 | # transaction_id_column='transaction_id', 154 | # feature_columns = ["Humidity3pm", "Humidity9am", "MaxTemp", "MinTemp", "Pressure3pm", "Pressure9am", "RainToday", "Rainfall", "Temp3pm", "Temp9am", "WindDir3pm_E", "WindDir3pm_ENE", "WindDir3pm_ESE", "WindDir3pm_N", "WindDir3pm_NE", "WindDir3pm_NNE", "WindDir3pm_NNW", "WindDir3pm_NW", "WindDir3pm_S", "WindDir3pm_SE", "WindDir3pm_SSE", "WindDir3pm_SSW", "WindDir3pm_SW", "WindDir3pm_W", "WindDir3pm_WNW", "WindDir3pm_WSW", "WindDir9am_E", "WindDir9am_ENE", "WindDir9am_ESE", "WindDir9am_N", "WindDir9am_NE", "WindDir9am_NNE", "WindDir9am_NNW", "WindDir9am_NW", "WindDir9am_S", "WindDir9am_SE", "WindDir9am_SSE", "WindDir9am_SSW", "WindDir9am_SW", "WindDir9am_W", "WindDir9am_WNW", "WindDir9am_WSW", "WindGustDir_E", "WindGustDir_ENE", "WindGustDir_ESE", "WindGustDir_N", "WindGustDir_NE", "WindGustDir_NNE", "WindGustDir_NNW", "WindGustDir_NW", "WindGustDir_S", "WindGustDir_SE", "WindGustDir_SSE", "WindGustDir_SSW", "WindGustDir_SW", "WindGustDir_W", "WindGustDir_WNW", "WindGustDir_WSW", "WindGustSpeed", "WindSpeed3pm", "WindSpeed9am"], 155 | # categorical_columns = ["RainToday", "WindDir3pm_E", "WindDir3pm_ENE", "WindDir3pm_ESE", "WindDir3pm_N", "WindDir3pm_NE", "WindDir3pm_NNE", "WindDir3pm_NNW", "WindDir3pm_NW", "WindDir3pm_S", "WindDir3pm_SE", "WindDir3pm_SSE", "WindDir3pm_SSW", "WindDir3pm_SW", "WindDir3pm_W", "WindDir3pm_WNW", "WindDir3pm_WSW", "WindDir9am_E", "WindDir9am_ENE", "WindDir9am_ESE", "WindDir9am_N", 
"WindDir9am_NE", "WindDir9am_NNE", "WindDir9am_NNW", "WindDir9am_NW", "WindDir9am_S", "WindDir9am_SE", "WindDir9am_SSE", "WindDir9am_SSW", "WindDir9am_SW", "WindDir9am_W", "WindDir9am_WNW", "WindDir9am_WSW", "WindGustDir_E", "WindGustDir_ENE", "WindGustDir_ESE", "WindGustDir_N", "WindGustDir_NE", "WindGustDir_NNE", "WindGustDir_NNW", "WindGustDir_NW", "WindGustDir_S", "WindGustDir_SE", "WindGustDir_SSE", "WindGustDir_SSW", "WindGustDir_SW", "WindGustDir_W", "WindGustDir_WNW", "WindGustDir_WSW", "WindGustSpeed", "WindSpeed3pm", "WindSpeed9am"] 156 | # )) 157 | 158 | subscription = None 159 | 160 | if subscription is None: 161 | print("Subscription already exists; get the existing one") 162 | subscriptions_uids = ai_client.data_mart.subscriptions.get_uids() 163 | 164 | for sub in subscriptions_uids: 165 | if ( 166 | ai_client.data_mart.subscriptions.get_details(sub)["entity"]["asset"][ 167 | "name" 168 | ] 169 | == MODEL_NAME 170 | ): 171 | subscription = ai_client.data_mart.subscriptions.get(sub) 172 | 173 | 174 | for deployment in wml_client.deployments.get_details()["resources"]: 175 | if DEPLOYMENT_UID in deployment["metadata"]["id"]: 176 | 177 | scoring_endpoint = deployment["entity"]["status"]["online_url"]["url"] 178 | 179 | print(scoring_endpoint) 180 | 181 | 182 | data = pd.read_csv("../data/weatherAUS_processed.csv") 183 | 184 | X = data.iloc[:, :-1] 185 | y = data[data.columns[-1]] 186 | X_train, X_test, y_train, y_test = train_test_split( 187 | X, y, test_size=0.01, random_state=1337 188 | ) 189 | 190 | 191 | # Payload Logging DAtASET 192 | 193 | payload_data_set_id = ( 194 | wos_client.data_sets.list( 195 | type=DataSetTypes.PAYLOAD_LOGGING, 196 | target_target_id=subscription_id, 197 | target_target_type=TargetTypes.SUBSCRIPTION, 198 | ) 199 | .result.data_sets[0] 200 | .metadata.id 201 | ) 202 | print("Payload data set id:", payload_data_set_id) 203 | 204 | 205 | payload_scoring = { 206 | "input_data": [ 207 | {"fields": X.columns.to_numpy().tolist(), "values": X_test.to_numpy().tolist()} 208 | ] 209 | } 210 | 211 | scoring_response = wml_client.deployments.score(DEPLOYMENT_UID, payload_scoring) 212 | 213 | print("Logging") 214 | records = [ 215 | PayloadRecord(request=payload_scoring, response=scoring_response, response_time=72) 216 | ] 217 | store_record_info = wos_client.data_sets.store_records(payload_data_set_id, records) 218 | 219 | 220 | # Feedback Logging 221 | 222 | feedback_dataset = wos_client.data_sets.list( 223 | type=DataSetTypes.FEEDBACK, 224 | target_target_id=subscription_id, 225 | target_target_type=TargetTypes.SUBSCRIPTION, 226 | ).result 227 | 228 | feedback_dataset_id = feedback_dataset.data_sets[0].metadata.id 229 | if feedback_dataset_id is None: 230 | print("Feedback data set not found. 
Please check quality monitor status.") 231 | sys.exit(1) 232 | 233 | data = X_test.to_dict("records") 234 | 235 | wos_client.data_sets.store_records( 236 | feedback_dataset_id, 237 | request_body=data, 238 | background_mode=False, 239 | header=True, 240 | delimiter=",", 241 | csv_max_line_length=1000, 242 | ) 243 | 244 | print(wos_client.data_sets.get_records_count(data_set_id=feedback_dataset_id)) 245 | 246 | 247 | #### 248 | 249 | 250 | from ibm_watson_openscale.supporting_classes.enums import * 251 | 252 | print("\nData marts: ") 253 | datams = wos_client.data_marts.list().result.data_marts 254 | for d in datams: 255 | print(d.metadata.id) 256 | datamart_id = d.metadata.id 257 | 258 | print("\nService providers: ") 259 | services = wos_client.service_providers.list().result.service_providers 260 | for service in services: 261 | print(service.metadata.id + " / Name: " + service.entity.name) 262 | service_id = service.metadata.id 263 | 264 | # wos_client.subscriptions.show() 265 | # wos_client.data_sets.show() 266 | 267 | print("\nSubscriptions: ") 268 | subscriptions = wos_client.subscriptions.list( 269 | data_mart_id=datamart_id, service_provider_id=service_id 270 | ).result.subscriptions 271 | for s in subscriptions: 272 | print(s.metadata.id + " " + s.entity.asset.name) 273 | subscription_id = s.metadata.id 274 | 275 | print("\n") 276 | 277 | payload_data_set_id = ( 278 | wos_client.data_sets.list( 279 | type=DataSetTypes.PAYLOAD_LOGGING, 280 | target_target_id=subscription_id, 281 | target_target_type=TargetTypes.SUBSCRIPTION, 282 | ) 283 | .result.data_sets[0] 284 | .metadata.id 285 | ) 286 | print("Payload data set id:", payload_data_set_id) 287 | 288 | 289 | pl_records_count = wos_client.data_sets.get_records_count(payload_data_set_id) 290 | print("Number of records in the payload logging table: {}".format(pl_records_count)) 291 | if pl_records_count == 0: 292 | raise Exception("Payload logging did not happen!") 293 | 294 | 295 | # Create Monitor 296 | 297 | target = ibm_watson_openscale.base_classes.watson_open_scale_v2.Target( 298 | target_type=TargetTypes.SUBSCRIPTION, target_id=subscription.uid 299 | ) 300 | parameters = {"min_feedback_data_size": 200} 301 | thresholds = [{"metric_id": "area_under_roc", "type": "lower_limit", "value": 0.75}] 302 | wos_client.monitor_instances.create( 303 | data_mart_id=datamart_id, 304 | background_mode=False, 305 | monitor_definition_id=wos_client.monitor_definitions.MONITORS.QUALITY.ID, 306 | target=target, 307 | parameters=parameters, 308 | thresholds=thresholds, 309 | ) 310 | 311 | monitor_instances_info = wos_client.monitor_instances.show(data_mart_id=datamart_id) 312 | 313 | 314 | # wos_client.monitor_instances.delete( 315 | # background_mode=False, 316 | # monitor_instance_id='94e582d5-c244-4533-9697-c16046c5fc40' 317 | # ) 318 | 319 | monitor_instance_run_info = wos_client.monitor_instances.run( 320 | background_mode=False, monitor_instance_id="5ddff093-25fa-44f8-abae-fd29659fd0d0" 321 | ) 322 | --------------------------------------------------------------------------------
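A final note on the quality monitor configured at the end of openscale.py: the run call targets a hard-coded monitor_instance_id. A sketch of the same two calls is shown below, reusing the wos_client, datamart_id and target objects created earlier in the script and taking the id from the create() response instead; the .result.metadata.id access is an assumption based on IBM's published OpenScale sample notebooks rather than on this repository:

# Illustrative only; reuses wos_client, datamart_id and target from openscale.py above.
quality_monitor = wos_client.monitor_instances.create(
    data_mart_id=datamart_id,
    background_mode=False,
    monitor_definition_id=wos_client.monitor_definitions.MONITORS.QUALITY.ID,
    target=target,
    parameters={"min_feedback_data_size": 200},
    thresholds=[{"metric_id": "area_under_roc", "type": "lower_limit", "value": 0.75}],
).result  # assumed to hold the created monitor instance's details

run_info = wos_client.monitor_instances.run(
    background_mode=False,
    monitor_instance_id=quality_monitor.metadata.id,  # instead of a hard-coded UUID
)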