├── scripts ├── __init__.py ├── repo_metrics │ ├── __init__.py │ ├── config_template.py │ ├── README.md │ └── track_metrics.py ├── config.json ├── databricks_install.sh ├── prepare_databricks_for_o16n.sh └── generate_conda_file.py ├── notebooks ├── scripts │ ├── __init__.py │ ├── config.json │ ├── reco_full.yaml │ ├── databricks_install.sh │ ├── prepare_databricks_for_o16n.sh │ └── generate_conda_file.py ├── reco_utils │ ├── azureml │ │ ├── __init__.py │ │ ├── wide_deep.py │ │ └── svd_training.py │ ├── common │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── spark_utils.py │ │ ├── general_utils.py │ │ ├── notebook_utils.py │ │ ├── python_utils.py │ │ ├── timer.py │ │ ├── gpu_utils.py │ │ └── notebook_memory_management.py │ ├── dataset │ │ ├── __init__.py │ │ ├── url_utils.py │ │ ├── pandas_df_utils.py │ │ └── cosmos_cli.py │ ├── evaluation │ │ ├── __init__.py │ │ └── parameter_sweep.py │ ├── recommender │ │ ├── __init__.py │ │ ├── rbm │ │ │ └── __init__.py │ │ ├── deeprec │ │ │ ├── __init__.py │ │ │ ├── IO │ │ │ │ └── __init__.py │ │ │ └── models │ │ │ │ └── __init__.py │ │ ├── fastai │ │ │ ├── __init__.py │ │ │ └── fastai_utils.py │ │ ├── ncf │ │ │ └── __init__.py │ │ ├── surprise │ │ │ ├── __init__.py │ │ │ └── surprise_utils.py │ │ ├── wide_deep │ │ │ └── __init__.py │ │ ├── vowpal_wabbit │ │ │ └── __init__.py │ │ └── sar │ │ │ └── __init__.py │ ├── __init__.py │ └── README.md └── README.md ├── reco_utils ├── azureml │ ├── __init__.py │ ├── azureml_utils.py │ ├── aks_utils.py │ ├── wide_deep.py │ └── svd_training.py ├── common │ ├── __init__.py │ ├── constants.py │ ├── notebook_utils.py │ ├── general_utils.py │ ├── spark_utils.py │ ├── timer.py │ ├── gpu_utils.py │ ├── python_utils.py │ └── notebook_memory_management.py ├── dataset │ ├── __init__.py │ ├── download_utils.py │ └── cosmos_cli.py ├── evaluation │ ├── __init__.py │ └── parameter_sweep.py ├── recommender │ ├── __init__.py │ ├── rbm │ │ └── __init__.py │ ├── deeprec │ │ ├── __init__.py │ │ ├── IO │ │ │ └── __init__.py │ │ └── models │ │ │ └── __init__.py │ ├── fastai │ │ ├── __init__.py │ │ └── fastai_utils.py │ ├── lightgbm │ │ └── __init__.py │ ├── ncf │ │ └── __init__.py │ ├── surprise │ │ ├── __init__.py │ │ └── surprise_utils.py │ ├── wide_deep │ │ └── __init__.py │ ├── vowpal_wabbit │ │ └── __init__.py │ └── sar │ │ └── __init__.py ├── __init__.py ├── README.md └── nni │ └── nni_utils.py ├── tests ├── ci │ ├── requirements.txt │ ├── config.json │ ├── runpytest.py │ ├── install_requirements.sh │ ├── Master-CPU-pipeline.yml │ ├── pytest.yml │ └── submitpytest.py └── unit │ ├── test_general_utils.py │ ├── test_dataset.py │ ├── test_gpu_utils.py │ ├── test_sweep.py │ ├── test_timer.py │ ├── test_notebook_utils.py │ ├── test_notebook_utils.ipynb │ ├── test_notebooks_pyspark.py │ ├── test_notebooks_python.py │ ├── test_notebooks_gpu.py │ ├── test_deeprec_model.py │ ├── test_python_utils.py │ ├── test_pandas_df_utils.py │ ├── test_deeprec_utils.py │ ├── test_vowpal_wabbit.py │ ├── test_wide_deep_utils.py │ ├── test_surprise_utils.py │ ├── test_ncf_dataset.py │ ├── test_sparse.py │ ├── test_tf_utils.py │ ├── test_rbm.py │ └── test_ncf_singlenode.py ├── azure-pipelines.yml ├── LICENSE ├── README.md └── SECURITY.md /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reco_utils/azureml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reco_utils/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reco_utils/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reco_utils/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/repo_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reco_utils/recommender/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reco_utils/recommender/rbm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/reco_utils/azureml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/reco_utils/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/reco_utils/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/reco_utils/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reco_utils/recommender/fastai/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reco_utils/recommender/lightgbm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reco_utils/recommender/ncf/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /reco_utils/recommender/surprise/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/reco_utils/recommender/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/reco_utils/recommender/rbm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/IO/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reco_utils/recommender/wide_deep/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/reco_utils/recommender/deeprec/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/reco_utils/recommender/fastai/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/reco_utils/recommender/ncf/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /notebooks/reco_utils/recommender/surprise/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reco_utils/recommender/deeprec/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reco_utils/recommender/vowpal_wabbit/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /notebooks/reco_utils/recommender/deeprec/IO/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/reco_utils/recommender/wide_deep/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/reco_utils/recommender/deeprec/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/reco_utils/recommender/vowpal_wabbit/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/ci/requirements.txt: -------------------------------------------------------------------------------- 1 | scipy==1.0.0 2 | scikit-learn==0.19.1 3 | numpy==1.14.5 4 | pandas==0.23.1 5 | pytest==4.3.0 -------------------------------------------------------------------------------- /tests/ci/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "subscription_id": "<>", 3 | "resource_group": "recommender", 4 | "workspace_name": "RecoWS", 5 | "location": "eastus" 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/scripts/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "subscription_id": <>, 3 | "resource_group": "recommender", 4 | "workspace_name": "addWS", 5 | "location": "southcentralus" 6 | } -------------------------------------------------------------------------------- /scripts/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "subscription_id": "15ae9cb6-95c1-483d-a0e3-b1a1a3b06324", 3 | "resource_group": "recommender", 4 | "workspace_name": "RecoWS", 5 | "location": "southcentralus" 6 | } -------------------------------------------------------------------------------- /reco_utils/__init__.py: -------------------------------------------------------------------------------- 1 | __title__ = "Microsoft Recommenders" 2 | __version__ = "2019.02" 3 | __author__ = "RecoDev Team at Microsoft" 4 | __license__ = "MIT" 5 | __copyright__ = "Copyright 2018-present Microsoft Corporation" 6 | 7 | # Version synonym 8 | VERSION = __version__ 9 | -------------------------------------------------------------------------------- /notebooks/reco_utils/__init__.py: -------------------------------------------------------------------------------- 1 | __title__ = "Microsoft Recommenders" 2 | __version__ = "2019.02" 3 | __author__ = "RecoDev Team at Microsoft" 4 | __license__ = "MIT" 5 | __copyright__ = "Copyright 2018-present Microsoft Corporation" 6 | 7 | # Version synonym 8 | VERSION = __version__ 9 | -------------------------------------------------------------------------------- /notebooks/reco_utils/common/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | # Default column names 5 | DEFAULT_USER_COL = "userID" 6 | DEFAULT_ITEM_COL = "itemID" 7 | DEFAULT_RATING_COL = "rating" 8 | DEFAULT_TIMESTAMP_COL = "timestamp" 9 | PREDICTION_COL = "prediction" 10 | DEFAULT_PREDICTION_COL = PREDICTION_COL 11 | 12 | # Filtering variables 13 | DEFAULT_K = 10 14 | DEFAULT_THRESHOLD = 10 15 | -------------------------------------------------------------------------------- /tests/unit/test_general_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | from reco_utils.common.general_utils import invert_dictionary, get_number_processors 6 | 7 | 8 | def test_invert_dictionary(): 9 | d = {"a": 1, "b": 2} 10 | d_inv = invert_dictionary(d) 11 | assert d_inv == {1: "a", 2: "b"} 12 | 13 | 14 | def test_get_number_processors(): 15 | assert get_number_processors() >= 4 16 | -------------------------------------------------------------------------------- /scripts/repo_metrics/config_template.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | # Github token 5 | # More info: https://help.github.com/articles/creating-a-personal-access-token-for-the-command-line/ 6 | GITHUB_TOKEN = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" 7 | 8 | # CosmosDB Mongo API 9 | CONNECTION_STRING = "mongodb://XXXXXXXXXXXXXXXXXXXXXXXXX.documents.azure.com:10255/?ssl=true&replicaSet=globaldb" 10 | DATABASE = "reco_stats" 11 | COLLECTION_GITHUB_STATS = "github_stats" 12 | COLLECTION_EVENTS = "events" 13 | 14 | -------------------------------------------------------------------------------- /tests/ci/runpytest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import subprocess 5 | import os 6 | 7 | from azureml.core import Run 8 | print('before run.get_context') 9 | run = Run.get_context() 10 | print('before subprocess.run') 11 | 12 | subprocess.run(["pytest", "tests/unit", 13 | "-m", "not notebooks and not spark and not gpu", 14 | "--junitxml=reports/test-unit.xml"]) 15 | 16 | print("os.listdir files", os.listdir(".")) 17 | # set up reports 18 | name_of_upload = "reports" 19 | path_on_disk = "reports" 20 | run.upload_folder(name_of_upload, path_on_disk) 21 | -------------------------------------------------------------------------------- /reco_utils/common/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | # Default column names 5 | DEFAULT_USER_COL = "userID" 6 | DEFAULT_ITEM_COL = "itemID" 7 | DEFAULT_RATING_COL = "rating" 8 | DEFAULT_LABEL_COL = "label" 9 | DEFAULT_TIMESTAMP_COL = "timestamp" 10 | DEFAULT_PREDICTION_COL = "prediction" 11 | COL_DICT = { 12 | "col_user": DEFAULT_USER_COL, 13 | "col_item": DEFAULT_ITEM_COL, 14 | "col_rating": DEFAULT_RATING_COL, 15 | "col_prediction": DEFAULT_PREDICTION_COL 16 | } 17 | 18 | # Filtering variables 19 | DEFAULT_K = 10 20 | DEFAULT_THRESHOLD = 10 21 | 22 | # Other 23 | SEED = 42 24 | -------------------------------------------------------------------------------- /tests/unit/test_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | import sys 6 | import pytest 7 | from reco_utils.dataset.url_utils import maybe_download 8 | 9 | 10 | def test_maybe_download(): 11 | file_url = "https://raw.githubusercontent.com/Microsoft/Recommenders/master/LICENSE" 12 | filepath = "license.txt" 13 | assert not os.path.exists(filepath) 14 | filepath = maybe_download(file_url, "license.txt", expected_bytes=1162) 15 | assert os.path.exists(filepath) 16 | os.remove(filepath) 17 | with pytest.raises(IOError): 18 | filepath = maybe_download(file_url, "license.txt", expected_bytes=0) 19 | -------------------------------------------------------------------------------- /reco_utils/recommender/sar/__init__.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | # Time since epoch in seconds 4 | EPOCH = datetime.datetime.utcfromtimestamp(0) 5 | # Default value for time decay parameter in SAR 6 | TIME_DECAY_COEFFICIENT = 30 7 | # Switch to trigger groupby in TimeDecay calculation 8 | TIMEDECAY_FORMULA = False 9 | # cooccurrence matrix threshold 10 | THRESHOLD = 1 11 | # Current time 12 | # TIME_NOW = (datetime.datetime.now() - EPOCH).total_seconds() 13 | TIME_NOW = None 14 | # Default names for functions which change the item-item cooccurrence matrix 15 | SIM_COOCCUR = "cooccurrence" 16 | SIM_JACCARD = "jaccard" 17 | SIM_LIFT = "lift" 18 | 19 | INDEXED_ITEMS = "indexedItems" 20 | INDEXED_USERS = "indexedUsers" 21 | 22 | -------------------------------------------------------------------------------- /notebooks/reco_utils/recommender/sar/__init__.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | # Time since epoch in seconds 4 | EPOCH = datetime.datetime.utcfromtimestamp(0) 5 | # Default value for time decay parameter in SAR 6 | TIME_DECAY_COEFFICIENT = 30 7 | # Switch to trigger groupby in TimeDecay calculation 8 | TIMEDECAY_FORMULA = False 9 | # cooccurrence matrix threshold 10 | THRESHOLD = 1 11 | # Current time 12 | # TIME_NOW = (datetime.datetime.now() - EPOCH).total_seconds() 13 | TIME_NOW = None 14 | # Default names for functions which change the item-item cooccurrence matrix 15 | SIM_COOCCUR = "cooccurrence" 16 | SIM_JACCARD = "jaccard" 17 | SIM_LIFT = "lift" 18 | 19 | INDEXED_ITEMS = "indexedItems" 20 | INDEXED_USERS = "indexedUsers" 21 | 22 | -------------------------------------------------------------------------------- /tests/unit/test_gpu_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import sys 5 | import pytest 6 | from reco_utils.common.gpu_utils import get_number_gpus, clear_memory_all_gpus, get_cuda_version, get_cudnn_version 7 | 8 | 9 | @pytest.mark.gpu 10 | def test_get_number_gpus(): 11 | assert get_number_gpus() >= 1 12 | 13 | 14 | @pytest.mark.gpu 15 | @pytest.mark.skip(reason="TODO: Implement this") 16 | def test_clear_memory_all_gpus(): 17 | pass 18 | 19 | 20 | @pytest.mark.gpu 21 | @pytest.mark.skipif(sys.platform == 'win32', reason="Not implemented on Windows") 22 | def test_get_cuda_version(): 23 | assert get_cuda_version() > "9.0.0" 24 | 25 | 26 | @pytest.mark.gpu 27 | def test_get_cudnn_version(): 28 | assert get_cudnn_version() > "7.0.0" -------------------------------------------------------------------------------- /notebooks/reco_utils/common/spark_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | try: 5 | from pyspark.sql import SparkSession 6 | except ImportError: 7 | pass # skip this import if we are in pure python environment 8 | 9 | 10 | def start_or_get_spark(app_name="Sample", url="local[*]", memory="10G"): 11 | """Start Spark if not started 12 | 13 | Args: 14 | app_name (str): Set name of the application 15 | url (str): URL for spark master. 16 | memory (str): Size of memory for spark driver. 17 | 18 | Returns: 19 | obj: Spark context. 20 | """ 21 | spark = ( 22 | SparkSession.builder.appName(app_name) 23 | .master(url) 24 | .config("spark.driver.memory", memory) 25 | .getOrCreate() 26 | ) 27 | 28 | return spark 29 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Run your notebooks as-is on AzureML Service 2 | 3 | This folder demonstrates how to build, train and test notebooks from our [Recommendation Project](http://github.com/Microsoft/Recommenders) project so you can make your own Recommendation system. 4 | 5 | We use MLOps to manually or automatically trigger builds due to Github PRs and changes. The control plane is in DevOps and AzureML Service provides numerous capabilities to track your assets when running Jupyter notebooks local or in the cloud. 6 | 7 | ## AzureML improves your MLOps experience! 8 | 9 | ### Build Definitions 10 | 11 | [Run Recommender Notebooks](https://dev.azure.com/emcmanu/NotebookPipelineDemo/_build?definitionId=15) 12 | 13 | [Validate Notebook Changes](https://dev.azure.com/emcmanu/NotebookPipelineDemo/_apps/hub/ms.vss-ciworkflow.build-ci-hub?_a=edit-build-definition&id=14) 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # Validate notebooks in repo 2 | 3 | trigger: 4 | - master 5 | 6 | pool: 7 | vmImage: 'Ubuntu 16.04' 8 | 9 | steps: 10 | - task: UsePythonVersion@0 11 | displayName: 'Use Python 3.6' 12 | inputs: 13 | versionSpec: 3.6 14 | 15 | - task: RunNotebook@0 16 | inputs: 17 | azureSubscription: 'emcmanu_test' 18 | targetType: 'custom' 19 | computeTarget: 'gpucluster' 20 | pathFilter: 'notebooks/*.ipynb' 21 | condaDependencies: 'scripts/reco_full.yaml' 22 | commonFiles: 'notebooks/reco_utils' 23 | dockerBaseImage: 'mcr.microsoft.com/azureml/base-gpu:intelmpi2018.3-cuda9.0-cudnn7-ubuntu16.04' 24 | 25 | - task: PublishBuildArtifacts@1 26 | displayName: 'Publish Artifact: devops-for-ai' 27 | inputs: 28 | ArtifactName: 'devops-for-ai' 29 | publishLocation: 'container' 30 | pathtoPublish: '$(Build.ArtifactStagingDirectory)' 31 | TargetPath: '$(Build.ArtifactStagingDirectory)' -------------------------------------------------------------------------------- /tests/unit/test_sweep.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | from reco_utils.evaluation.parameter_sweep import generate_param_grid 5 | 6 | 7 | @pytest.fixture(scope="module") 8 | def parameter_dictionary(): 9 | params = { 10 | "param1": [1, 2, 3], 11 | "param2": [4, 5, 6], 12 | "param3": 1 13 | } 14 | 15 | return params 16 | 17 | 18 | def test_param_sweep(parameter_dictionary): 19 | params_grid = generate_param_grid(parameter_dictionary) 20 | 21 | assert params_grid == [ 22 | {'param1': 1, 'param2': 4, 'param3': 1}, {'param1': 1, 'param2': 5, 'param3': 1}, 23 | {'param1': 1, 'param2': 6, 'param3': 1}, {'param1': 2, 'param2': 4, 'param3': 1}, 24 | {'param1': 2, 'param2': 5, 'param3': 1}, {'param1': 2, 'param2': 6, 'param3': 1}, 25 | {'param1': 3, 'param2': 4, 'param3': 1}, {'param1': 3, 'param2': 5, 'param3': 1}, 26 | {'param1': 3, 'param2': 6, 'param3': 1} 27 | ] 28 | -------------------------------------------------------------------------------- /notebooks/reco_utils/common/general_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | 6 | 7 | def invert_dictionary(dictionary): 8 | """Invert a dictionary 9 | NOTE: If the dictionary has unique keys and unique values, the invertion would be perfect. However, if there are 10 | repeated values, the invertion can take different keys 11 | 12 | Args: 13 | dictionary (dict): A dictionary 14 | 15 | Returns: 16 | dict: inverted dictionary 17 | """ 18 | return {v: k for k, v in dictionary.items()} 19 | 20 | 21 | def get_number_processors(): 22 | """Get the number of processors in a CPU. 23 | 24 | Returns: 25 | int: Number of processors. 26 | """ 27 | try: 28 | num = os.cpu_count() 29 | except Exception: 30 | import multiprocessing # force exception in case mutiprocessing is not installed 31 | 32 | num = multiprocessing.cpu_count() 33 | return num 34 | -------------------------------------------------------------------------------- /reco_utils/common/notebook_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | 6 | 7 | def is_jupyter(): 8 | """Check if the module is running on Jupyter notebook/console 9 | 10 | Returns: 11 | bool: True if the module is running on Jupyter notebook or Jupyter console, 12 | False otherwise. 13 | """ 14 | try: 15 | shell_name = get_ipython().__class__.__name__ 16 | if shell_name == 'ZMQInteractiveShell': 17 | return True 18 | else: 19 | return False 20 | except NameError: 21 | return False 22 | 23 | 24 | def is_databricks(): 25 | """Check if the module is running on Databricks 26 | 27 | Returns: 28 | bool: True if the module is running on Databricks notebook, 29 | False otherwise. 30 | """ 31 | try: 32 | if os.path.realpath(".") == "/databricks/driver": 33 | return True 34 | else: 35 | return False 36 | except NameError: 37 | return False 38 | -------------------------------------------------------------------------------- /notebooks/reco_utils/common/notebook_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | 6 | 7 | def is_jupyter(): 8 | """Check if the module is running on Jupyter notebook/console 9 | 10 | Returns: 11 | bool: True if the module is running on Jupyter notebook or Jupyter console, 12 | False otherwise. 13 | """ 14 | try: 15 | shell_name = get_ipython().__class__.__name__ 16 | if shell_name == 'ZMQInteractiveShell': 17 | return True 18 | else: 19 | return False 20 | except NameError: 21 | return False 22 | 23 | 24 | def is_databricks(): 25 | """Check if the module is running on Databricks 26 | 27 | Returns: 28 | bool: True if the module is running on Databricks notebook, 29 | False otherwise. 30 | """ 31 | try: 32 | if os.path.realpath(".") == "/databricks/driver": 33 | return True 34 | else: 35 | return False 36 | except NameError: 37 | return False 38 | -------------------------------------------------------------------------------- /tests/unit/test_timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | 5 | import pytest 6 | import time 7 | from reco_utils.common.timer import Timer 8 | 9 | 10 | TOL = 0.01 11 | 12 | 13 | @pytest.fixture(scope="function") 14 | def t(): 15 | return Timer() 16 | 17 | 18 | def test_no_time(t): 19 | assert t.interval == 0 20 | assert t.running == False 21 | 22 | 23 | def test_stop_before_start(t): 24 | with pytest.raises(ValueError): 25 | t.stop() 26 | 27 | 28 | def test_interval_before_stop(t): 29 | t.start() 30 | with pytest.raises(ValueError): 31 | t.interval 32 | 33 | 34 | def test_timer(t): 35 | t.start() 36 | assert t.running == True 37 | time.sleep(1) 38 | t.stop() 39 | assert t.running == False 40 | assert t.interval == pytest.approx(1, abs=TOL) 41 | with Timer() as t2: 42 | assert t2.running == True 43 | time.sleep(1) 44 | assert t2.interval == pytest.approx(1, abs=TOL) 45 | assert t2.running == False 46 | 47 | 48 | def test_timer_format(t): 49 | assert str(t) == "0:00:00" 50 | assert str(t.interval) == "0" 51 | -------------------------------------------------------------------------------- /tests/unit/test_notebook_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | import os 4 | import pytest 5 | import papermill as pm 6 | from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME 7 | from reco_utils.common.notebook_utils import is_jupyter, is_databricks 8 | 9 | 10 | @pytest.mark.notebooks 11 | def test_is_jupyter(): 12 | # Test on the terminal 13 | assert is_jupyter() is False 14 | assert is_databricks() is False 15 | 16 | # Test on Jupyter notebook 17 | path = os.path.join("tests", "unit", "test_notebook_utils.ipynb") 18 | pm.execute_notebook( 19 | path, 20 | OUTPUT_NOTEBOOK, 21 | kernel_name=KERNEL_NAME, 22 | ) 23 | nb = pm.read_notebook(OUTPUT_NOTEBOOK) 24 | df = nb.dataframe 25 | result_is_jupyter = df.loc[df["name"] == "is_jupyter", "value"].values[0] 26 | assert result_is_jupyter is True 27 | result_is_databricks = df.loc[df["name"] == "is_databricks", "value"].values[0] 28 | assert result_is_databricks is False 29 | 30 | # @pytest.mark.notebooks 31 | # def test_is_databricks(): 32 | # TODO Currently, we cannot pytest modules on Databricks 33 | -------------------------------------------------------------------------------- /notebooks/reco_utils/dataset/url_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | from urllib.request import urlretrieve 6 | import logging 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | def maybe_download(url, filename, work_directory=".", expected_bytes=None): 12 | """Download a file if it is not already downloaded. 13 | 14 | Args: 15 | filename (str): File name. 16 | work_directory (str): Working directory. 17 | url (str): URL of the file to download. 18 | expected_bytes (int): Expected file size in bytes. 19 | 20 | Returns: 21 | str: File path of the file downloaded. 22 | """ 23 | filepath = os.path.join(work_directory, filename) 24 | if not os.path.exists(filepath): 25 | filepath, _ = urlretrieve(url, filepath) 26 | else: 27 | log.debug("File {} already downloaded".format(filepath)) 28 | if expected_bytes is not None: 29 | statinfo = os.stat(filepath) 30 | if statinfo.st_size != expected_bytes: 31 | os.remove(filepath) 32 | raise IOError("Failed to verify {}".format(filepath)) 33 | 34 | return filepath 35 | -------------------------------------------------------------------------------- /reco_utils/common/general_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | import psutil 6 | 7 | 8 | def invert_dictionary(dictionary): 9 | """Invert a dictionary 10 | NOTE: If the dictionary has unique keys and unique values, the inversion would be perfect. However, if there are 11 | repeated values, the inversion can take different keys 12 | 13 | Args: 14 | dictionary (dict): A dictionary 15 | 16 | Returns: 17 | dict: inverted dictionary 18 | """ 19 | return {v: k for k, v in dictionary.items()} 20 | 21 | 22 | def get_physical_memory(): 23 | """Get the physical memory in GBs. 24 | 25 | Returns: 26 | float: Physical memory in GBs. 27 | """ 28 | return psutil.virtual_memory()[0] / 1073741824 29 | 30 | 31 | def get_number_processors(): 32 | """Get the number of processors in a CPU. 33 | 34 | Returns: 35 | int: Number of processors. 36 | """ 37 | try: 38 | num = os.cpu_count() 39 | except Exception: 40 | import multiprocessing # force exception in case multiprocessing is not installed 41 | 42 | num = multiprocessing.cpu_count() 43 | return num 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /notebooks/scripts/reco_full.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # To create the conda environment: 3 | # $ conda env create -f reco_full.yaml 4 | # 5 | # To update the conda environment: 6 | # $ conda env update -f reco_full.yaml 7 | # 8 | # To register the conda environment in Jupyter: 9 | # $ conda activate reco_full 10 | # $ python -m ipykernel install --user --name reco_full --display-name "Python (reco_full)" 11 | # 12 | name: reco_full 13 | channels: 14 | - defaults 15 | - conda-forge 16 | - pytorch 17 | - fastai 18 | dependencies: 19 | - scipy>=1.0.0 20 | - mock==2.0.0 21 | - scikit-surprise>=1.0.6 22 | - fastparquet>=0.1.6 23 | - scikit-learn==0.19.1 24 | - pyspark==2.3.1 25 | - tensorflow-gpu==1.12.0 26 | - seaborn>=0.8.1 27 | - matplotlib>=2.2.2 28 | - pandas>=0.23.4 29 | - pytorch>=1.0.0 30 | - ipykernel>=4.6.1 31 | - jupyter>=1.0.0 32 | - gitpython>=2.1.8 33 | - dask>=0.17.1 34 | - numpy>=1.13.3 35 | - python==3.6.8 36 | - pymongo>=3.6.1 37 | - pytest>=3.6.4 38 | - pyarrow>=0.8.0 39 | - numba>=0.38.1 40 | - pip: 41 | - azureml-sdk[notebooks,contrib] 42 | - black>=18.6b4 43 | - dataclasses>=0.6 44 | - azure-storage>=0.36.0 45 | - hyperopt==0.1.1 46 | - nvidia-ml-py3>=7.352.0 47 | - pydocumentdb>=2.3.3 48 | - papermill>=0.15.0 49 | - fastai==1.0.46 50 | - idna==2.7 51 | - memory-profiler>=0.54.0 52 | -------------------------------------------------------------------------------- /scripts/repo_metrics/README.md: -------------------------------------------------------------------------------- 1 | # Repository Metrics 2 | 3 | [![Build status](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_apis/build/status/Recommenders/Recommenders%20repo%20stats)](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_build/latest?definitionId=5206) 4 | 5 | We developed a script that allows us to track the metrics of the Recommenders repo. Some of the metrics we can track are listed here: 6 | 7 | * Number of stars 8 | * Number of forks 9 | * Number of clones 10 | * Number of views 11 | * Number of lines of code 12 | 13 | To see the full list of metrics, see [git_stats.py](scripts/repo_metrics/git_stats.py) 14 | 15 | The first step is to set up the credentials, copy the configuration file and fill up the credentials of GitHub and CosmosDB: 16 | 17 | cp scripts/repo_metrics/config_template.py scripts/repo_metrics/config.py 18 | 19 | To track the current state of the repository and save it to CosmosDB: 20 | 21 | python scripts/repo_metrics/track_metrics.py --github_repo "https://github.com/Microsoft/Recommenders" --save_to_database 22 | 23 | To track an event related to this repository and save it to CosmosDB: 24 | 25 | python scripts/repo_metrics/track_metrics.py --event "Today we did our first blog of the project" --event_date 2018-12-01 --save_to_database 26 | 27 | -------------------------------------------------------------------------------- /reco_utils/evaluation/parameter_sweep.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | # 4 | # Utility functions for parameter sweep. 5 | 6 | from itertools import product 7 | 8 | 9 | def generate_param_grid(params): 10 | """Generator of parameter grids 11 | Generate parameter lists from a parameter dictionary in the form of 12 | { 13 | "param1": [value1, value2], 14 | "param2": [value1, value2] 15 | } 16 | 17 | to 18 | 19 | [ 20 | {"param1": value1, "param2": value1}, 21 | {"param1": value2, "param2": value1}, 22 | {"param1": value1, "param2": value2}, 23 | {"param1": value2, "param2": value2} 24 | ] 25 | 26 | Args: 27 | param_dict (dict): dictionary of parameters and values (in a list). 28 | 29 | Return: 30 | list: A list of parameter dictionary string that can be fed directly into 31 | model builder as keyword arguments. 32 | """ 33 | param_new = {} 34 | param_fixed = {} 35 | 36 | for key, value in params.items(): 37 | if isinstance(value, list): 38 | param_new[key] = value 39 | else: 40 | param_fixed[key] = value 41 | 42 | items = sorted(param_new.items()) 43 | keys, values = zip(*items) 44 | 45 | params_exp = [] 46 | for v in product(*values): 47 | param_exp = dict(zip(keys, v)) 48 | param_exp.update(param_fixed) 49 | params_exp.append(param_exp) 50 | 51 | return params_exp 52 | 53 | -------------------------------------------------------------------------------- /notebooks/reco_utils/evaluation/parameter_sweep.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | # 4 | # Utility functions for parameter sweep. 5 | 6 | from itertools import product 7 | 8 | 9 | def generate_param_grid(params): 10 | """Generator of parameter grids 11 | Generate parameter lists from a paramater dictionary in the form of 12 | { 13 | "param1": [value1, value2], 14 | "param2": [value1, value2] 15 | } 16 | 17 | to 18 | 19 | [ 20 | {"param1": value1, "param2": value1}, 21 | {"param1": value2, "param2": value1}, 22 | {"param1": value1, "param2": value2}, 23 | {"param1": value2, "param2": value2} 24 | ] 25 | 26 | Args: 27 | param_dict (dict): dictionary of parameters and values (in a list). 28 | 29 | Return: 30 | list: A list of parameter dictionary string that can be fed directly into 31 | model builder as keyword arguments. 32 | """ 33 | param_new = {} 34 | param_fixed = {} 35 | 36 | for key, value in params.items(): 37 | if isinstance(value, list): 38 | param_new[key] = value 39 | else: 40 | param_fixed[key] = value 41 | 42 | items = sorted(param_new.items()) 43 | keys, values = zip(*items) 44 | 45 | params_exp = [] 46 | for v in product(*values): 47 | param_exp = dict(zip(keys, v)) 48 | param_exp.update(param_fixed) 49 | params_exp.append(param_exp) 50 | 51 | return params_exp 52 | 53 | -------------------------------------------------------------------------------- /reco_utils/common/spark_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | import sys 6 | 7 | 8 | try: 9 | from pyspark.sql import SparkSession 10 | except ImportError: 11 | pass # skip this import if we are in pure python environment 12 | 13 | 14 | def start_or_get_spark( 15 | app_name="Sample", 16 | url="local[*]", 17 | memory="10G", 18 | packages=None, 19 | jars=None, 20 | repository=None 21 | ): 22 | """Start Spark if not started 23 | 24 | Args: 25 | app_name (str): Set name of the application 26 | url (str): URL for spark master 27 | memory (str): Size of memory for spark driver 28 | packages (list): list of packages to install 29 | jars (list): list of jar files to add 30 | repository (str): The maven repository 31 | 32 | Returns: 33 | obj: Spark context. 34 | """ 35 | 36 | submit_args = '' 37 | if packages is not None: 38 | submit_args = '--packages {} '.format(','.join(packages)) 39 | if jars is not None: 40 | submit_args += '--jars {} '.format(','.join(jars)) 41 | if repository is not None: 42 | submit_args += "--repositories {}".format(repository) 43 | if submit_args: 44 | os.environ['PYSPARK_SUBMIT_ARGS'] = '{} pyspark-shell'.format(submit_args) 45 | 46 | spark = ( 47 | SparkSession.builder.appName(app_name) 48 | .master(url) 49 | .config("spark.driver.memory", memory) 50 | .getOrCreate() 51 | ) 52 | 53 | return spark 54 | -------------------------------------------------------------------------------- /tests/unit/test_notebook_utils.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# This is a test notebook for reco_utils.common.notebook_utils module" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# set the environment path to find Recommenders\n", 19 | "import sys\n", 20 | "sys.path.append(\"../../\")\n", 21 | "\n", 22 | "import papermill as pm\n", 23 | "from reco_utils.common.notebook_utils import is_jupyter, is_databricks" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "pm.record(\"is_jupyter\", is_jupyter())\n", 33 | "pm.record(\"is_databricks\", is_databricks())" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [] 42 | } 43 | ], 44 | "metadata": { 45 | "celltoolbar": "Tags", 46 | "kernelspec": { 47 | "display_name": "Python 3", 48 | "language": "python", 49 | "name": "python3" 50 | }, 51 | "language_info": { 52 | "codemirror_mode": { 53 | "name": "ipython", 54 | "version": 3 55 | }, 56 | "file_extension": ".py", 57 | "mimetype": "text/x-python", 58 | "name": "python", 59 | "nbconvert_exporter": "python", 60 | "pygments_lexer": "ipython3", 61 | "version": "3.6.0" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 1 66 | } 67 | -------------------------------------------------------------------------------- /tests/unit/test_notebooks_pyspark.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | import papermill as pm 6 | from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME 7 | 8 | 9 | @pytest.mark.notebooks 10 | @pytest.mark.spark 11 | def test_als_pyspark_runs(notebooks): 12 | notebook_path = notebooks["als_pyspark"] 13 | pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) 14 | 15 | 16 | @pytest.mark.notebooks 17 | @pytest.mark.spark 18 | def test_data_split_runs(notebooks): 19 | notebook_path = notebooks["data_split"] 20 | pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) 21 | 22 | 23 | @pytest.mark.notebooks 24 | @pytest.mark.spark 25 | def test_als_deep_dive_runs(notebooks): 26 | notebook_path = notebooks["als_deep_dive"] 27 | pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) 28 | 29 | 30 | @pytest.mark.notebooks 31 | @pytest.mark.spark 32 | def test_evaluation_runs(notebooks): 33 | notebook_path = notebooks["evaluation"] 34 | pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) 35 | 36 | 37 | 38 | @pytest.mark.notebooks 39 | @pytest.mark.spark 40 | def test_spark_tuning(notebooks): 41 | notebook_path = notebooks["spark_tuning"] 42 | pm.execute_notebook( 43 | notebook_path, 44 | OUTPUT_NOTEBOOK, 45 | kernel_name=KERNEL_NAME, 46 | parameters=dict( 47 | NUMBER_CORES="*", 48 | NUMBER_ITERATIONS=3, 49 | RANK=[5, 5], 50 | REG=[0.1, 0.01] 51 | ) 52 | ) 53 | 54 | -------------------------------------------------------------------------------- /tests/ci/install_requirements.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (C) Microsoft Corporation. All rights reserved.​ 4 | # ​ 5 | # Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, 6 | # royalty-free right to use, copy, and modify the software code provided by us 7 | # ('Software Code'). You may not sublicense the Software Code or any use of it 8 | # (except to your affiliates and to vendors to perform work on your behalf) 9 | # through distribution, network access, service agreement, lease, rental, or 10 | # otherwise. This license does not purport to express any claim of ownership over 11 | # data you may have shared with Microsoft in the creation of the Software Code. 12 | # Unless applicable law gives you more rights, Microsoft reserves all other 13 | # rights not expressly granted herein, whether by implication, estoppel or 14 | # otherwise. ​ 15 | # ​ 16 | # THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS 17 | # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | # MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 22 | # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 23 | # IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | 27 | 28 | python --version 29 | pip install azure-cli==2.0.46 30 | pip install --upgrade azureml-sdk[cli] 31 | pip install -r requirements.txt -------------------------------------------------------------------------------- /notebooks/reco_utils/common/python_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def exponential_decay(value, max_val, half_life): 5 | """Compute decay factor for a given value based on an exponential decay 6 | Values greater than max_val will be set to 1 7 | Args: 8 | value (numeric): value to calculate decay factor 9 | max_val (numeric): value at which decay factor will be 1 10 | half_life (numeric): value at which decay factor will be 0.5 11 | Returns: 12 | float: decay factor 13 | """ 14 | 15 | return np.minimum(1.0, np.power(0.5, (max_val - value) / half_life)) 16 | 17 | 18 | def jaccard(cooccurrence): 19 | """Helper method to calculate the Jaccard similarity of a matrix of co-occurrences 20 | Args: 21 | cooccurrence (np.array): the symmetric matrix of co-occurrences of items 22 | Returns: 23 | np.array: The matrix of Jaccard similarities between any two items 24 | """ 25 | 26 | diag = cooccurrence.diagonal() 27 | diag_rows = np.expand_dims(diag, axis=0) 28 | diag_cols = np.expand_dims(diag, axis=1) 29 | 30 | with np.errstate(invalid="ignore", divide="ignore"): 31 | result = cooccurrence / (diag_rows + diag_cols - cooccurrence) 32 | 33 | return np.array(result) 34 | 35 | 36 | def lift(cooccurrence): 37 | """Helper method to calculate the Lift of a matrix of co-occurrences 38 | Args: 39 | cooccurrence (np.array): the symmetric matrix of co-occurrences of items 40 | Returns: 41 | np.array: The matrix of Lifts between any two items 42 | """ 43 | 44 | diag = cooccurrence.diagonal() 45 | diag_rows = np.expand_dims(diag, axis=0) 46 | diag_cols = np.expand_dims(diag, axis=1) 47 | 48 | with np.errstate(invalid="ignore", divide="ignore"): 49 | result = cooccurrence / (diag_rows * diag_cols) 50 | 51 | return np.array(result) 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Run your notebooks as-is on AzureML Service with MLOps extensions 2 | 3 | 4 | We use MLOps to manually or automatically trigger builds due to Github PRs and changes. The control plane is in DevOps and AzureML Service provides numerous capabilities to track your assets when running Jupyter notebooks local or in the cloud. 5 | 6 | ## AzureML improves your MLOps experience! 7 | 8 | ### Build Definitions 9 | 10 | [Run Recommender Notebooks](https://dev.azure.com/emcmanu/NotebookPipelineDemo/_build?definitionId=15) 11 | 12 | [Validate Notebook Changes](https://dev.azure.com/emcmanu/NotebookPipelineDemo/_apps/hub/ms.vss-ciworkflow.build-ci-hub?_a=edit-build-definition&id=14) 13 | 14 | This folder demonstrates how to build, train and test notebooks from our [Recommendation Project](http://github.com/Microsoft/Recommenders) project so you can make your own Recommendation system. 15 | 16 | # Contributing 17 | 18 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 19 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 20 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 21 | 22 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 23 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 24 | provided by the bot. You will only need to do this once across all repos using our CLA. 25 | 26 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 27 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 28 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /tests/unit/test_notebooks_python.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | import papermill as pm 6 | from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME 7 | 8 | 9 | @pytest.mark.notebooks 10 | def test_template_runs(notebooks): 11 | notebook_path = notebooks["template"] 12 | pm.execute_notebook( 13 | notebook_path, 14 | OUTPUT_NOTEBOOK, 15 | parameters=dict(PM_VERSION=pm.__version__), 16 | kernel_name=KERNEL_NAME, 17 | ) 18 | nb = pm.read_notebook(OUTPUT_NOTEBOOK) 19 | df = nb.dataframe 20 | assert df.shape[0] == 2 21 | check_version = df.loc[df["name"] == "checked_version", "value"].values[0] 22 | assert check_version is True 23 | 24 | 25 | @pytest.mark.notebooks 26 | def test_sar_single_node_runs(notebooks): 27 | notebook_path = notebooks["sar_single_node"] 28 | pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) 29 | 30 | 31 | @pytest.mark.notebooks 32 | def test_sar_deep_dive_runs(notebooks): 33 | notebook_path = notebooks["sar_deep_dive"] 34 | pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) 35 | 36 | 37 | @pytest.mark.notebooks 38 | def test_baseline_deep_dive_runs(notebooks): 39 | notebook_path = notebooks["baseline_deep_dive"] 40 | pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) 41 | 42 | 43 | @pytest.mark.notebooks 44 | def test_surprise_deep_dive_runs(notebooks): 45 | notebook_path = notebooks["surprise_svd_deep_dive"] 46 | pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) 47 | 48 | 49 | @pytest.mark.notebooks 50 | def test_vw_deep_dive_runs(notebooks): 51 | notebook_path = notebooks["vowpal_wabbit_deep_dive"] 52 | pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) 53 | -------------------------------------------------------------------------------- /reco_utils/common/timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | from timeit import default_timer 5 | from datetime import timedelta 6 | 7 | 8 | class Timer(object): 9 | """Timer class. 10 | Original code: https://github.com/miguelgfierro/codebase 11 | 12 | Examples: 13 | >>> import time 14 | >>> t = Timer() 15 | >>> t.start() 16 | >>> time.sleep(1) 17 | >>> t.stop() 18 | >>> t.interval < 1 19 | True 20 | >>> with Timer() as t: 21 | ... time.sleep(1) 22 | >>> t.interval < 1 23 | True 24 | >>> "Time elapsed {}".format(t) #doctest: +ELLIPSIS 25 | 'Time elapsed 1...' 26 | """ 27 | 28 | def __init__(self): 29 | self._timer = default_timer 30 | self._interval = 0 31 | self.running = False 32 | 33 | def __enter__(self): 34 | self.start() 35 | return self 36 | 37 | def __exit__(self, *args): 38 | self.stop() 39 | 40 | def __str__(self): 41 | return "{:0.4f}".format(self.interval) 42 | 43 | def start(self): 44 | """Start the timer.""" 45 | self.init = self._timer() 46 | self.running = True 47 | 48 | def stop(self): 49 | """Stop the timer. Calculate the interval in seconds.""" 50 | self.end = self._timer() 51 | try: 52 | self._interval = self.end - self.init 53 | self.running = False 54 | except AttributeError: 55 | raise ValueError( 56 | "Timer has not been initialized: use start() or the contextual form with Timer() as t:" 57 | ) 58 | 59 | @property 60 | def interval(self): 61 | if self.running: 62 | raise ValueError("Timer has not been stopped, please use stop().") 63 | else: 64 | return self._interval 65 | 66 | -------------------------------------------------------------------------------- /notebooks/reco_utils/common/timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | from timeit import default_timer 5 | from datetime import timedelta 6 | 7 | 8 | class Timer(object): 9 | """Timer class. 10 | Original code: https://github.com/miguelgfierro/codebase 11 | 12 | Examples: 13 | >>> import time 14 | >>> t = Timer() 15 | >>> t.start() 16 | >>> time.sleep(1) 17 | >>> t.stop() 18 | >>> t.interval < 1 19 | True 20 | >>> with Timer() as t: 21 | ... time.sleep(1) 22 | >>> t.interval < 1 23 | True 24 | >>> "Time elapsed {}".format(t) #doctest: +ELLIPSIS 25 | 'Time elapsed 0:00:...' 26 | """ 27 | 28 | def __init__(self): 29 | self._timer = default_timer 30 | self._interval = 0 31 | self.running = False 32 | 33 | def __enter__(self): 34 | self.start() 35 | return self 36 | 37 | def __exit__(self, *args): 38 | self.stop() 39 | 40 | def __str__(self): 41 | return str(timedelta(seconds=self._interval)) 42 | 43 | def start(self): 44 | """Start the timer.""" 45 | self.init = self._timer() 46 | self.running = True 47 | 48 | def stop(self): 49 | """Stop the timer. Calculate the interval in seconds.""" 50 | self.end = self._timer() 51 | try: 52 | self._interval = self.end - self.init 53 | self.running = False 54 | except AttributeError: 55 | raise ValueError( 56 | "Timer has not been initialized: use start() or the contextual form with Timer() as t:" 57 | ) 58 | 59 | @property 60 | def interval(self): 61 | if self.running: 62 | raise ValueError("Timer has not been stopped, please use stop().") 63 | else: 64 | return self._interval 65 | -------------------------------------------------------------------------------- /tests/unit/test_notebooks_gpu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import shutil 5 | import pytest 6 | from reco_utils.common.gpu_utils import get_number_gpus 7 | from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME 8 | import papermill as pm 9 | 10 | 11 | @pytest.mark.notebooks 12 | @pytest.mark.gpu 13 | def test_gpu_vm(): 14 | assert get_number_gpus() >= 1 15 | 16 | 17 | @pytest.mark.notebooks 18 | @pytest.mark.gpu 19 | def test_fastai(notebooks): 20 | notebook_path = notebooks["fastai"] 21 | pm.execute_notebook( 22 | notebook_path, 23 | OUTPUT_NOTEBOOK, 24 | kernel_name=KERNEL_NAME, 25 | parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE="100k", EPOCHS=1), 26 | ) 27 | 28 | 29 | @pytest.mark.notebooks 30 | @pytest.mark.gpu 31 | def test_ncf(notebooks): 32 | notebook_path = notebooks["ncf"] 33 | pm.execute_notebook( 34 | notebook_path, 35 | OUTPUT_NOTEBOOK, 36 | kernel_name=KERNEL_NAME, 37 | parameters=dict( 38 | TOP_K=10, MOVIELENS_DATA_SIZE="100k", EPOCHS=1, BATCH_SIZE=1024 39 | ), 40 | ) 41 | 42 | 43 | @pytest.mark.notebooks 44 | @pytest.mark.gpu 45 | def test_ncf_deep_dive(notebooks): 46 | notebook_path = notebooks["ncf_deep_dive"] 47 | pm.execute_notebook( 48 | notebook_path, 49 | OUTPUT_NOTEBOOK, 50 | kernel_name=KERNEL_NAME, 51 | parameters=dict( 52 | TOP_K=10, MOVIELENS_DATA_SIZE="100k", EPOCHS=1, BATCH_SIZE=2048 53 | ), 54 | ) 55 | 56 | 57 | @pytest.mark.notebooks 58 | @pytest.mark.gpu 59 | def test_wide_deep(notebooks): 60 | notebook_path = notebooks["wide_deep"] 61 | 62 | MODEL_DIR = 'model_checkpoints' 63 | params = { 64 | 'MOVIELENS_DATA_SIZE': '100k', 65 | 'EPOCHS': 1, 66 | 'EVALUATE_WHILE_TRAINING': False, 67 | 'MODEL_DIR': MODEL_DIR, 68 | 'EXPORT_DIR_BASE': MODEL_DIR, 69 | 'RATING_METRICS': ['rmse', 'mae'], 70 | 'RANKING_METRICS': ['ndcg_at_k', 'precision_at_k'], 71 | } 72 | 73 | pm.execute_notebook( 74 | notebook_path, 75 | OUTPUT_NOTEBOOK, 76 | kernel_name=KERNEL_NAME, 77 | parameters=params, 78 | ) 79 | 80 | shutil.rmtree(MODEL_DIR, ignore_errors=True) 81 | -------------------------------------------------------------------------------- /tests/unit/test_deeprec_model.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from reco_utils.recommender.deeprec.deeprec_utils import prepare_hparams, download_deeprec_resources 4 | from reco_utils.recommender.deeprec.models.xDeepFM import XDeepFMModel 5 | from reco_utils.recommender.deeprec.models.dkn import DKN 6 | from reco_utils.recommender.deeprec.IO.iterator import FFMTextIterator 7 | from reco_utils.recommender.deeprec.IO.dkn_iterator import DKNTextIterator 8 | 9 | 10 | @pytest.fixture 11 | def resource_path(): 12 | return os.path.dirname(os.path.realpath(__file__)) 13 | 14 | 15 | @pytest.mark.gpu 16 | @pytest.mark.deeprec 17 | def test_xdeepfm_component_definition(resource_path): 18 | data_path = os.path.join(resource_path, "..", "resources", "deeprec", "xdeepfm") 19 | yaml_file = os.path.join(data_path, "xDeepFM.yaml") 20 | 21 | if not os.path.exists(yaml_file): 22 | download_deeprec_resources( 23 | "https://recodatasets.blob.core.windows.net/deeprec/", 24 | data_path, 25 | "xdeepfmresources.zip", 26 | ) 27 | 28 | hparams = prepare_hparams(yaml_file) 29 | model = XDeepFMModel(hparams, FFMTextIterator) 30 | 31 | assert model.logit is not None 32 | assert model.update is not None 33 | assert model.iterator is not None 34 | 35 | 36 | @pytest.mark.gpu 37 | @pytest.mark.deeprec 38 | def test_dkn_component_definition(resource_path): 39 | data_path = os.path.join(resource_path, "..", "resources", "deeprec", "dkn") 40 | yaml_file = os.path.join(data_path, "dkn.yaml") 41 | wordEmb_file = os.path.join(data_path, "word_embeddings_100.npy") 42 | entityEmb_file = os.path.join(data_path, "TransE_entity2vec_100.npy") 43 | 44 | if not os.path.exists(yaml_file): 45 | download_deeprec_resources( 46 | "https://recodatasets.blob.core.windows.net/deeprec/", 47 | data_path, 48 | "dknresources.zip", 49 | ) 50 | 51 | hparams = prepare_hparams( 52 | yaml_file, 53 | wordEmb_file=wordEmb_file, 54 | entityEmb_file=entityEmb_file, 55 | epochs=5, 56 | learning_rate=0.0001, 57 | ) 58 | assert hparams is not None 59 | model = DKN(hparams, DKNTextIterator) 60 | 61 | assert model.logit is not None 62 | assert model.update is not None 63 | assert model.iterator is not None 64 | -------------------------------------------------------------------------------- /notebooks/reco_utils/dataset/pandas_df_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pandas as pd 5 | 6 | from reco_utils.common.constants import ( 7 | DEFAULT_USER_COL, 8 | DEFAULT_ITEM_COL, 9 | DEFAULT_RATING_COL, 10 | ) 11 | 12 | 13 | def user_item_pairs( 14 | user_df, 15 | item_df, 16 | user_col=DEFAULT_USER_COL, 17 | item_col=DEFAULT_ITEM_COL, 18 | user_item_filter_df=None, 19 | shuffle=True, 20 | ): 21 | """Get all pairs of users and items data. 22 | 23 | Args: 24 | user_df (pd.DataFrame): User data containing unique user ids and maybe their features. 25 | item_df (pd.DataFrame): Item data containing unique item ids and maybe their features. 26 | user_col (str): User id column name. 27 | item_col (str): Item id column name. 28 | user_item_filter_df (pd.DataFrame): User-item pairs to be used as a filter. 29 | shuffle (bool): If True, shuffles the result. 30 | 31 | Returns: 32 | pd.DataFrame: All pairs of user-item from user_df and item_df, excepting the pairs in user_item_filter_df 33 | """ 34 | 35 | # Get all user-item pairs 36 | user_df["key"] = 1 37 | item_df["key"] = 1 38 | users_items = user_df.merge(item_df, on="key") 39 | 40 | user_df.drop("key", axis=1, inplace=True) 41 | item_df.drop("key", axis=1, inplace=True) 42 | users_items.drop("key", axis=1, inplace=True) 43 | 44 | # Filter 45 | if user_item_filter_df is not None: 46 | users_items = filter_by(users_items, user_item_filter_df, [user_col, item_col]) 47 | 48 | if shuffle: 49 | users_items = users_items.sample(frac=1).reset_index(drop=True) 50 | 51 | return users_items 52 | 53 | 54 | def filter_by(df, filter_by_df, filter_by_cols): 55 | """From the input DataFrame (df), remove the records whose target column (filter_by_cols) values are 56 | exist in the filter-by DataFrame (filter_by_df) 57 | 58 | Args: 59 | df (pd.DataFrame): Source dataframe. 60 | filter_by_df (pd.DataFrame): Filter dataframe. 61 | filter_by_cols (iterable of str): Filter columns. 62 | 63 | Returns: 64 | pd.DataFrame: Dataframe filtered by filter_by_df on filter_by_cols 65 | """ 66 | 67 | return df.loc[ 68 | ~df.set_index(filter_by_cols).index.isin( 69 | filter_by_df.set_index(filter_by_cols).index 70 | ) 71 | ] 72 | -------------------------------------------------------------------------------- /reco_utils/azureml/azureml_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | 6 | from azureml.core import Workspace 7 | 8 | 9 | def get_or_create_workspace( 10 | config_path=None, 11 | subscription_id=None, 12 | resource_group=None, 13 | workspace_name=None, 14 | workspace_region=None, 15 | ): 16 | """Get or create AzureML Workspace this will save the config to the path specified for later use 17 | 18 | Args: 19 | config_path (str): optional directory to look for / store config.json file (defaults to current directory) 20 | subscription_id (str): subscription id 21 | resource_group (str): resource group 22 | workspace_name (str): workspace name 23 | workspace_region (str): region 24 | 25 | Returns: 26 | Workspace 27 | """ 28 | 29 | # use environment variables if needed 30 | if subscription_id is None: 31 | subscription_id = os.getenv("SUBSCRIPTION_ID") 32 | if resource_group is None: 33 | resource_group = os.getenv("RESOURCE_GROUP") 34 | if workspace_name is None: 35 | workspace_name = os.getenv("WORKSPACE_NAME") 36 | if workspace_region is None: 37 | workspace_region = os.getenv("WORKSPACE_REGION") 38 | 39 | # define fallback options in order to try 40 | options = [ 41 | ( 42 | Workspace, 43 | dict( 44 | subscription_id=subscription_id, 45 | resource_group=resource_group, 46 | workspace_name=workspace_name, 47 | ), 48 | ), 49 | (Workspace.from_config, dict(path=config_path)), 50 | ( 51 | Workspace.create, 52 | dict( 53 | subscription_id=subscription_id, 54 | resource_group=resource_group, 55 | name=workspace_name, 56 | location=workspace_region, 57 | create_resource_group=True, 58 | exist_ok=True, 59 | ), 60 | ), 61 | ] 62 | 63 | for function, kwargs in options: 64 | try: 65 | ws = function(**kwargs) 66 | break 67 | except Exception: 68 | continue 69 | else: 70 | raise ValueError( 71 | "Failed to get or create AzureML Workspace with the configuration information provided" 72 | ) 73 | 74 | ws.write_config(path=config_path) 75 | return ws 76 | -------------------------------------------------------------------------------- /tests/ci/Master-CPU-pipeline.yml: -------------------------------------------------------------------------------- 1 | # Master-CPU-pipeline.yml 2 | # Starter pipeline 3 | # Start with a minimal pipeline that you can customize to build and deploy your code. 4 | # Add steps that build, run tests, deploy, and more: 5 | # https://aka.ms/yaml 6 | # 7 | # use variable group name 8 | variables: 9 | - group: AzureKeyVaultSecrets 10 | 11 | #trigger: 12 | #- azure-pipelines-bz 13 | # - master 14 | 15 | #pr: 16 | #- staging 17 | 18 | pool: 19 | vmImage: 'ubuntu-16.04' 20 | 21 | steps: 22 | 23 | - task: UsePythonVersion@0 24 | inputs: 25 | versionSpec: '3.6' 26 | architecture: 'x64' 27 | displayName: 'Use Python 3.6' 28 | 29 | - script: | 30 | az login --service-principal -u $(ClientID) -p $(ClientSecret) --tenant $(TenantID) 31 | 32 | displayName: 'Login to Azure' 33 | 34 | - script: | 35 | sed -i 's#"subscription_id": "<>"#"subscription_id": "$(SubscriptionID)"#g' ./tests/ci/config.json 36 | echo my subscription is $(SubscriptionID) 37 | cat ./tests/ci/config.json 38 | displayName: 'replace subscription value' 39 | 40 | - script: 41 | sed -i 's#"tests/unit_or_smoke_int"#"tests/unit"#g' ./tests/ci/runpytest.py 42 | displayName: 'replace unit or smoke or int' 43 | 44 | - script: 45 | sed -i 's#"not notebooks and not spark and not gpu"#"not notebooks and not spark and not gpu"#g' ./tests/ci/runpytest.py 46 | displayName: 'notebooks and spark and gpu new' 47 | 48 | - bash: | 49 | echo "##vso[task.prependpath]/data/anaconda/bin" 50 | displayName: Add Conda to PATH 51 | 52 | - script: 'pip install azureml-sdk' 53 | displayName: 'install azureml-sdk' 54 | continueOnError: true 55 | 56 | - script: 57 | python scripts/generate_conda_file.py 58 | displayName: ' generate_conda_file.py' 59 | 60 | - script: | 61 | chmod +x scripts/*.py 62 | ls -al scripts 63 | chmod +x tests/ci/*.py 64 | ls -al tests/ci 65 | pwd 66 | ls -al 67 | displayName: 'ls' 68 | 69 | - script: | 70 | python --version 71 | pip install azure-cli==2.0.46 72 | pip install --upgrade azureml-sdk[cli] 73 | # pip install -r tests/ci/requirements.txt 74 | displayName: 'install cli' 75 | 76 | - script: 77 | python tests/ci/submitpytest.py 78 | displayName: 'standalone pytest test persistent' 79 | 80 | - task: PublishTestResults@2 81 | displayName: 'Publish Test Results **/test-*.xml' 82 | inputs: 83 | testResultsFiles: '**/test-*.xml' 84 | failTaskOnFailedTests: true 85 | condition: succeededOrFailed() 86 | -------------------------------------------------------------------------------- /tests/ci/pytest.yml: -------------------------------------------------------------------------------- 1 | # pytest.yml 2 | # Starter pipeline 3 | # Start with a minimal pipeline that you can customize to build and deploy your code. 4 | # Add steps that build, run tests, deploy, and more: 5 | # https://aka.ms/yaml 6 | # 7 | # use variable group name 8 | variables: 9 | - group: AzureKeyVaultSecrets 10 | 11 | #trigger: 12 | #- azure-pipelines-bz 13 | # - master 14 | 15 | #pr: 16 | #- staging 17 | 18 | pool: 19 | vmImage: 'ubuntu-16.04' 20 | 21 | steps: 22 | 23 | - task: UsePythonVersion@0 24 | inputs: 25 | versionSpec: '3.6' 26 | architecture: 'x64' 27 | displayName: 'Use Python 3.6' 28 | 29 | - script: | 30 | az login --service-principal -u $(ClientID) -p $(ClientSecret) --tenant $(TenantID) 31 | 32 | displayName: 'Login to Azure' 33 | 34 | - script: | 35 | pwd 36 | ls ./tests/ci 37 | sed -i 's#"subscription_id": "<>"#"subscription_id": "$(SubscriptionID)"#g' ./tests/ci/config.json 38 | echo my subscription is $(SubscriptionID) 39 | cat ./tests/ci/config.json 40 | displayName: 'replace subscription value' 41 | 42 | - script: 43 | sed -i 's#"tests/unit_or_smoke_int"#"tests/unit"#g' ./tests/ci/runpytest.py 44 | displayName: 'replace unit or smoke or int' 45 | 46 | - script: 47 | sed -i 's#"not notebooks and not spark and not gpu"#"not notebooks and not spark and not gpu"#g' ./tests/ci/runpytest.py 48 | displayName: 'notebooks and spark and gpu new' 49 | 50 | - bash: | 51 | echo "##vso[task.prependpath]/data/anaconda/bin" 52 | displayName: Add Conda to PATH 53 | 54 | - script: 'pip install azureml-sdk' 55 | displayName: 'install azureml-sdk' 56 | continueOnError: true 57 | 58 | - script: 59 | python scripts/generate_conda_file.py 60 | displayName: ' generate_conda_file.py' 61 | 62 | - script: | 63 | chmod +x scripts/*.py 64 | ls -al scripts 65 | chmod +x tests/ci/*.py 66 | ls -al tests/ci 67 | pwd 68 | ls -al 69 | displayName: 'ls' 70 | 71 | - script: | 72 | python --version 73 | pip install azure-cli==2.0.46 74 | pip install --upgrade azureml-sdk[cli] 75 | # pip install -r tests/ci/requirements.txt 76 | displayName: 'install cli' 77 | 78 | - script: 79 | python tests/ci/submitpytest.py 80 | displayName: 'standalone pytest test persistent' 81 | 82 | - task: PublishTestResults@2 83 | displayName: 'Publish Test Results **/test-*.xml' 84 | inputs: 85 | testResultsFiles: '**/test-*.xml' 86 | failTaskOnFailedTests: true 87 | condition: succeededOrFailed() -------------------------------------------------------------------------------- /tests/unit/test_python_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | """ 5 | Test common python utils 6 | """ 7 | import numpy as np 8 | import pytest 9 | 10 | from reco_utils.common.python_utils import ( 11 | exponential_decay, 12 | jaccard, 13 | lift 14 | ) 15 | 16 | TOL = 0.0001 17 | 18 | 19 | @pytest.fixture 20 | def target_matrices(scope="module"): 21 | J1 = np.array([[1.0, 0.0, 0.5], 22 | [0.0, 1.0, 0.33333], 23 | [0.5, 0.33333, 1.0]]) 24 | J2 = np.array([[1.0, 0.0, 0.0, 0.2], 25 | [0.0, 1.0, 0.0, 0.0], 26 | [0.0, 0.0, 1.0, 0.5], 27 | [0.2, 0.0, 0.5, 1.0]]) 28 | L1 = np.array([[1.0, 0.0, 0.5], 29 | [0.0, 0.5, 0.25], 30 | [0.5, 0.25, 0.5]]) 31 | L2 = np.array([[0.5, 0.0, 0.0, 0.125], 32 | [0.0, 0.33333, 0.0, 0.0], 33 | [0.0, 0.0, 0.5, 0.25], 34 | [0.125, 0.0, 0.25, 0.25]]) 35 | return { 36 | "jaccard1": pytest.approx(J1, TOL), 37 | "jaccard2": pytest.approx(J2, TOL), 38 | "lift1": pytest.approx(L1, TOL), 39 | "lift2": pytest.approx(L2, TOL) 40 | } 41 | 42 | 43 | @pytest.fixture(scope="module") 44 | def python_data(): 45 | cooccurrence1 = np.array([[1.0, 0.0, 1.0], 46 | [0.0, 2.0, 1.0], 47 | [1.0, 1.0, 2.0]]) 48 | cooccurrence2 = np.array([[2.0, 0.0, 0.0, 1.0], 49 | [0.0, 3.0, 0.0, 0.0], 50 | [0.0, 0.0, 2.0, 2.0], 51 | [1.0, 0.0, 2.0, 4.0]]) 52 | return cooccurrence1, cooccurrence2 53 | 54 | 55 | def test_python_jaccard(python_data, target_matrices): 56 | cooccurrence1, cooccurrence2 = python_data 57 | J1 = jaccard(cooccurrence1) 58 | assert type(J1) == np.ndarray 59 | assert J1 == target_matrices["jaccard1"] 60 | 61 | J2 = jaccard(cooccurrence2) 62 | assert type(J2) == np.ndarray 63 | assert J2 == target_matrices["jaccard2"] 64 | 65 | 66 | def test_python_lift(python_data, target_matrices): 67 | cooccurrence1, cooccurrence2 = python_data 68 | L1 = lift(cooccurrence1) 69 | assert type(L1) == np.ndarray 70 | assert L1 == target_matrices["lift1"] 71 | 72 | L2 = lift(cooccurrence2) 73 | assert type(L2) == np.ndarray 74 | assert L2 == target_matrices["lift2"] 75 | 76 | 77 | def test_exponential_decay(): 78 | values = np.array([1, 2, 3, 4, 5, 6]) 79 | expected = np.array([0.25, 0.35355339, 0.5, 0.70710678, 1., 1.]) 80 | actual = exponential_decay(value=values, max_val=5, half_life=2) 81 | assert np.allclose(actual, expected, atol=TOL) 82 | -------------------------------------------------------------------------------- /notebooks/reco_utils/README.md: -------------------------------------------------------------------------------- 1 | # Recommender Utilities 2 | 3 | This module (reco_utils) contains functions to simplify common tasks used when developing and evaluating recommender systems. A short description of the sub-modules is provided below. For more details about what functions are available and how to use them, please review the doc-strings provided with the code. 4 | 5 | ## Sub-Modules 6 | 7 | ### [Common](./common) 8 | This submodule contains high-level utilities for defining constants used in most algorithms as well as helper functions for managing aspects of different frameworks: gpu, spark, jupyter notebook. 9 | 10 | ### [Dataset](./dataset) 11 | Dataset includes helper functions for interacting with Azure Cosmos databases, pulling different sizes of the Movielens dataset and formatting them appropriately as well as utilities for splitting data for training / testing. 12 | 13 | #### Data Loading 14 | The movielens module will allow you to load a dataframe in pandas or spark formats from the Movielens dataset, with sizes of 100k, 1M, 10M, or 20M to test algorithms and evaluate performance benchmarks. 15 | ```python 16 | df = movielens.load_pandas_df(size="100k") 17 | ``` 18 | 19 | #### Splitting Techniques: 20 | Currently three methods are available for splitting datasets. All of them support splitting by user or item and filtering out minimal samples (for instance users that have not rated enough item, or items that have not been rated by enough users). 21 | - Random: this is the basic approach where entries are randomly assigned to each group based on the ratio desired 22 | - Chronological: this uses provided timestamps to order the data and selects a cut-off time that will split the desired ratio of data to train before that time and test after that time 23 | - Stratified: this is similar to random sampling, but the splits are stratified, for example if the datasets are split by user, the splitting approach will attempt to maintain the same set of items used in both training and test splits. The converse is true if splitting by item. 24 | 25 | ### [Evaluation](./evaluation) 26 | The evaluation submodule includes functionality for performing hyperparameter sweeps as well as calculating common recommender metrics directly in python or in a Spark environment using pyspark. 27 | 28 | Currently available metrics include: 29 | - Root Mean Squared Error 30 | - Mean Absolute Error 31 | - R2 32 | - Explained Variance 33 | - Precision at K 34 | - Recall at K 35 | - Normalized Discounted Cumulative Gain at K 36 | - Mean Average Precision at K 37 | 38 | ### [Recommender](./recommender) 39 | The recommender submodule contains implementations of various algorithms that can be used in addition to external packages to evaluate and develop new recommender system approaches. 40 | Currently the Simple Adaptive Recommender (SAR) algorithm is implemented in python for running on a single node. 41 | -------------------------------------------------------------------------------- /reco_utils/azureml/aks_utils.py: -------------------------------------------------------------------------------- 1 | from math import ceil, floor 2 | import logging 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | def qps_to_replicas(target_qps, processing_time, max_qp_replica=1, target_utilization=0.7): 7 | """Provide a rough estimate of the number of replicas to support a given load (queries per second) 8 | 9 | Args: 10 | target_qps (int): target queries per second that you want to support 11 | processing_time (float): the estimated amount of time (in seconds) your service call takes 12 | max_qp_replica (int): maximum number of concurrent queries per replica 13 | target_utilization (float): proportion of CPU utilization you think is ideal 14 | 15 | Returns: 16 | replicas: Number of estimated replicas required to support a target number of queries per second 17 | """ 18 | concurrent_queries = target_qps * processing_time / target_utilization 19 | replicas = ceil(concurrent_queries / max_qp_replica) 20 | logger.info('Approximately {} replicas are estimated to support {} queries per second.'.format(replicas, target_qps)) 21 | return replicas 22 | 23 | def replicas_to_qps(num_replicas, processing_time, max_qp_replica=1, target_utilization=0.7): 24 | """Provide a rough estimate of the queries per second supported by a number of replicas 25 | 26 | Args: 27 | num_replicas (int): number of replicas 28 | processing_time (float): the estimated amount of time (in seconds) your service call takes 29 | max_qp_replica (int): maximum number of concurrent queries per replica 30 | target_utilization (float): proportion of CPU utilization you think is ideal 31 | 32 | Returns: 33 | qps: queries per second supported by the number of replicas 34 | """ 35 | qps = floor(num_replicas*max_qp_replica*target_utilization/processing_time) 36 | logger.info('Approximately {} queries per second are supported by {} replicas.'.format(qps, num_replicas)) 37 | return qps 38 | 39 | 40 | def total_cores_to_replicas(n_cores, cpu_cores_per_replica=0.1, overhead=0.1): 41 | """Provide a rough estimate of the number of replicas supported by a particular number of cores. 42 | 43 | Args: 44 | n_cores (int): Total number of cores within an AKS cluster that you want to use 45 | cpu_cores_per_replica (float): Cores assigned to each replica. This can be fractional and corresponds to the 46 | cpu_cores argument passed to AksWebservice.deploy_configuration() configuration 47 | overhead (float): Amount of overhead (as a proportion) 48 | 49 | Returns: 50 | replicas: Total number of replicas supported by n_cores 51 | """ 52 | replicas = floor((1 - overhead)*n_cores/(cpu_cores_per_replica)) 53 | logger.info('Approximately {} replicas are supported by {} cores.'.format(replicas, n_cores)) 54 | return replicas -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /reco_utils/recommender/fastai/fastai_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import fastai 8 | from fastprogress import force_console_behavior 9 | import fastprogress 10 | 11 | from reco_utils.common import constants as cc 12 | 13 | 14 | def cartesian_product(*arrays): 15 | """Compute the Cartesian product in fastai algo. This is a helper function. 16 | 17 | Args: 18 | arrays (tuple of np.array): Input arrays 19 | 20 | Returns: 21 | np.array: product 22 | 23 | """ 24 | la = len(arrays) 25 | dtype = np.result_type(*arrays) 26 | arr = np.empty([len(a) for a in arrays] + [la], dtype=dtype) 27 | for i, a in enumerate(np.ix_(*arrays)): 28 | arr[..., i] = a 29 | return arr.reshape(-1, la) 30 | 31 | 32 | def score( 33 | learner, 34 | test_df, 35 | user_col=cc.DEFAULT_USER_COL, 36 | item_col=cc.DEFAULT_ITEM_COL, 37 | prediction_col=cc.DEFAULT_PREDICTION_COL, 38 | top_k=None, 39 | ): 40 | """Score all users+items provided and reduce to top_k items per user if top_k>0 41 | 42 | Args: 43 | learner (obj): Model. 44 | test_df (pd.DataFrame): Test dataframe. 45 | user_col (str): User column name. 46 | item_col (str): Item column name. 47 | prediction_col (str): Prediction column name. 48 | top_k (int): Number of top items to recommend. 49 | 50 | Returns: 51 | pd.DataFrame: Result of recommendation 52 | """ 53 | # replace values not known to the model with NaN 54 | total_users, total_items = learner.data.train_ds.x.classes.values() 55 | test_df.loc[~test_df[user_col].isin(total_users), user_col] = np.nan 56 | test_df.loc[~test_df[item_col].isin(total_items), item_col] = np.nan 57 | 58 | # map ids to embedding ids 59 | u = learner.get_idx(test_df[user_col], is_item=False) 60 | m = learner.get_idx(test_df[item_col], is_item=True) 61 | 62 | # score the pytorch model 63 | pred = learner.model.forward(u, m) 64 | scores = pd.DataFrame( 65 | {user_col: test_df[user_col], item_col: test_df[item_col], prediction_col: pred} 66 | ) 67 | scores = scores.sort_values([user_col, prediction_col], ascending=[True, False]) 68 | if top_k is not None: 69 | top_scores = scores.groupby(user_col).head(top_k).reset_index(drop=True) 70 | else: 71 | top_scores = scores 72 | return top_scores 73 | 74 | 75 | def hide_fastai_progress_bar(): 76 | """Hide fastai progress bar""" 77 | fastprogress.fastprogress.NO_BAR = True 78 | fastprogress.fastprogress.WRITER_FN = str 79 | master_bar, progress_bar = force_console_behavior() 80 | fastai.basic_train.master_bar, fastai.basic_train.progress_bar = ( 81 | master_bar, 82 | progress_bar, 83 | ) 84 | 85 | -------------------------------------------------------------------------------- /notebooks/reco_utils/recommender/fastai/fastai_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import fastai 8 | from fastprogress import force_console_behavior 9 | import fastprogress 10 | 11 | from reco_utils.common import constants as cc 12 | 13 | 14 | def cartesian_product(*arrays): 15 | """Compute the cartesian product in fastai algo. This is a helper function. 16 | 17 | Args: 18 | arrays (tuple of np.array): Input arrays 19 | 20 | Returns: 21 | np.array: product 22 | 23 | """ 24 | la = len(arrays) 25 | dtype = np.result_type(*arrays) 26 | arr = np.empty([len(a) for a in arrays] + [la], dtype=dtype) 27 | for i, a in enumerate(np.ix_(*arrays)): 28 | arr[..., i] = a 29 | return arr.reshape(-1, la) 30 | 31 | 32 | def score( 33 | learner, 34 | test_df, 35 | user_col=cc.DEFAULT_USER_COL, 36 | item_col=cc.DEFAULT_ITEM_COL, 37 | prediction_col=cc.DEFAULT_PREDICTION_COL, 38 | top_k=None, 39 | ): 40 | """Score all users+items provided and reduce to top_k items per user if top_k>0 41 | 42 | Args: 43 | learner (obj): Model. 44 | test_df (pd.DataFrame): Test dataframe. 45 | user_col (str): User column name. 46 | item_col (str): Item column name. 47 | prediction_col (str): Prediction column name. 48 | top_k (int): Number of top items to recommend. 49 | 50 | Returns: 51 | pd.DataFrame: Result of recommendation 52 | """ 53 | # replace values not known to the model with NaN 54 | total_users, total_items = learner.data.train_ds.x.classes.values() 55 | test_df.loc[~test_df[user_col].isin(total_users), user_col] = np.nan 56 | test_df.loc[~test_df[item_col].isin(total_items), item_col] = np.nan 57 | 58 | # map ids to embedding ids 59 | u = learner.get_idx(test_df[user_col], is_item=False) 60 | m = learner.get_idx(test_df[item_col], is_item=True) 61 | 62 | # score the pytorch model 63 | pred = learner.model.forward(u, m) 64 | scores = pd.DataFrame( 65 | {user_col: test_df[user_col], item_col: test_df[item_col], prediction_col: pred} 66 | ) 67 | scores = scores.sort_values([user_col, prediction_col], ascending=[True, False]) 68 | if top_k is not None: 69 | top_scores = scores.groupby(user_col).head(top_k).reset_index(drop=True) 70 | else: 71 | top_scores = scores 72 | return top_scores 73 | 74 | 75 | def hide_fastai_progress_bar(): 76 | """Hide fastai progress bar""" 77 | fastprogress.fastprogress.NO_BAR = True 78 | fastprogress.fastprogress.WRITER_FN = str 79 | master_bar, progress_bar = force_console_behavior() 80 | fastai.basic_train.master_bar, fastai.basic_train.progress_bar = ( 81 | master_bar, 82 | progress_bar, 83 | ) 84 | 85 | -------------------------------------------------------------------------------- /reco_utils/README.md: -------------------------------------------------------------------------------- 1 | # Recommender Utilities 2 | 3 | This module (reco_utils) contains functions to simplify common tasks used when developing and evaluating recommender systems. A short description of the sub-modules is provided below. For more details about what functions are available and how to use them, please review the doc-strings provided with the code. 4 | 5 | ## Sub-Modules 6 | 7 | ### [Common](./common) 8 | This submodule contains high-level utilities for defining constants used in most algorithms as well as helper functions for managing aspects of different frameworks: gpu, spark, jupyter notebook. 9 | 10 | ### [Dataset](./dataset) 11 | Dataset includes helper functions for interacting with Azure Cosmos databases, pulling different sizes of the MovieLens dataset and formatting them appropriately as well as utilities for splitting data for training / testing. 12 | 13 | #### Data Loading 14 | The movielens module will allow you to load a dataframe in pandas or spark formats from the MovieLens dataset, with sizes of 100k, 1M, 10M, or 20M to test algorithms and evaluate performance benchmarks. 15 | ```python 16 | df = movielens.load_pandas_df(size="100k") 17 | ``` 18 | 19 | #### Splitting Techniques: 20 | Currently three methods are available for splitting datasets. All of them support splitting by user or item and filtering out minimal samples (for instance users that have not rated enough item, or items that have not been rated by enough users). 21 | - Random: this is the basic approach where entries are randomly assigned to each group based on the ratio desired 22 | - Chronological: this uses provided timestamps to order the data and selects a cut-off time that will split the desired ratio of data to train before that time and test after that time 23 | - Stratified: this is similar to random sampling, but the splits are stratified, for example if the datasets are split by user, the splitting approach will attempt to maintain the same set of items used in both training and test splits. The converse is true if splitting by item. 24 | 25 | ### [Evaluation](./evaluation) 26 | The evaluation submodule includes functionality for performing hyperparameter sweeps as well as calculating common recommender metrics directly in python or in a Spark environment using pyspark. 27 | 28 | Currently available metrics include: 29 | - Root Mean Squared Error 30 | - Mean Absolute Error 31 | - R2 32 | - Explained Variance 33 | - Precision at K 34 | - Recall at K 35 | - Normalized Discounted Cumulative Gain at K 36 | - Mean Average Precision at K 37 | - Area Under Curve 38 | - Logistic Loss 39 | 40 | ### [Recommender](./recommender) 41 | The recommender submodule contains implementations of various algorithms that can be used in addition to external packages to evaluate and develop new recommender system approaches. 42 | Currently the Simple Adaptive Recommender (SAR) algorithm is implemented in python for running on a single node. 43 | -------------------------------------------------------------------------------- /reco_utils/dataset/download_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | from urllib.request import urlretrieve 6 | import logging 7 | from contextlib import contextmanager 8 | from tempfile import TemporaryDirectory 9 | from tqdm import tqdm 10 | 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | class TqdmUpTo(tqdm): 16 | """Wrapper class for the progress bar tqdm to get `update_to(n)` functionality""" 17 | 18 | def update_to(self, b=1, bsize=1, tsize=None): 19 | """A progress bar showing how much is left to finish the operation 20 | 21 | Args: 22 | b (int): Number of blocks transferred so far. 23 | bsize (int): Size of each block (in tqdm units). 24 | tsize (int): Total size (in tqdm units). 25 | """ 26 | if tsize is not None: 27 | self.total = tsize 28 | self.update(b * bsize - self.n) # will also set self.n = b * bsize 29 | 30 | 31 | def maybe_download(url, filename=None, work_directory=".", expected_bytes=None): 32 | """Download a file if it is not already downloaded. 33 | 34 | Args: 35 | filename (str): File name. 36 | work_directory (str): Working directory. 37 | url (str): URL of the file to download. 38 | expected_bytes (int): Expected file size in bytes. 39 | 40 | Returns: 41 | str: File path of the file downloaded. 42 | """ 43 | if filename is None: 44 | filename = url.split("/")[-1] 45 | filepath = os.path.join(work_directory, filename) 46 | if not os.path.exists(filepath): 47 | with TqdmUpTo(unit="B", unit_scale=True) as t: 48 | filepath, _ = urlretrieve(url, filepath, reporthook=t.update_to) 49 | else: 50 | log.debug("File {} already downloaded".format(filepath)) 51 | if expected_bytes is not None: 52 | statinfo = os.stat(filepath) 53 | if statinfo.st_size != expected_bytes: 54 | os.remove(filepath) 55 | raise IOError("Failed to verify {}".format(filepath)) 56 | 57 | return filepath 58 | 59 | 60 | @contextmanager 61 | def download_path(path=None): 62 | """Return a path to download data. If `path=None`, then it yields a temporal path that is eventually deleted, 63 | otherwise the real path of the input. 64 | 65 | Args: 66 | path (str): Path to download data. 67 | 68 | Returns: 69 | str: Real path where the data is stored. 70 | 71 | Examples: 72 | >>> with download_path() as path: 73 | >>> ... maybe_download(url="http://example.com/file.zip", work_directory=path) 74 | 75 | """ 76 | if path is None: 77 | tmp_dir = TemporaryDirectory() 78 | try: 79 | yield tmp_dir.name 80 | finally: 81 | tmp_dir.cleanup() 82 | else: 83 | path = os.path.realpath(path) 84 | yield path 85 | 86 | -------------------------------------------------------------------------------- /reco_utils/dataset/cosmos_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | import pydocumentdb.errors as errors 4 | 5 | 6 | def find_collection(client, dbid, id): 7 | """Find whether or not a CosmosDB collection exists. 8 | Args: 9 | client (obj): A pydocumentdb client object. 10 | dbid (str): Database ID. 11 | id (str): Collection ID. 12 | Returns: 13 | bool: True if the collection exists, False otherwise. 14 | """ 15 | database_link = "dbs/" + dbid 16 | collections = list( 17 | client.QueryCollections( 18 | database_link, 19 | { 20 | "query": "SELECT * FROM r WHERE r.id=@id", 21 | "parameters": [{"name": "@id", "value": id}], 22 | }, 23 | ) 24 | ) 25 | if len(collections) > 0: 26 | return True 27 | else: 28 | return False 29 | 30 | 31 | def read_collection(client, dbid, id): 32 | """Read a CosmosDB collection. 33 | Args: 34 | client (obj): A pydocumentdb client object. 35 | dbid (str): Database ID. 36 | id (str): Collection ID. 37 | Returns: 38 | obj: A collection. 39 | """ 40 | try: 41 | database_link = "dbs/" + dbid 42 | collection_link = database_link + "/colls/{0}".format(id) 43 | collection = client.ReadCollection(collection_link) 44 | return collection 45 | except errors.DocumentDBError as e: 46 | if e.status_code == 404: 47 | print("A collection with id '{0}' does not exist".format(id)) 48 | else: 49 | raise errors.HTTPFailure(e.status_code) 50 | 51 | 52 | def read_database(client, id): 53 | """Read a CosmosDB database. 54 | Args: 55 | client (obj): A pydocumentdb client object. 56 | id (str): Database ID. 57 | Returns: 58 | obj: A database. 59 | """ 60 | try: 61 | database_link = "dbs/" + id 62 | database = client.ReadDatabase(database_link) 63 | return database 64 | except errors.DocumentDBError as e: 65 | if e.status_code == 404: 66 | print("A database with id '{0}' does not exist".format(id)) 67 | else: 68 | raise errors.HTTPFailure(e.status_code) 69 | 70 | 71 | def find_database(client, id): 72 | """Find whether or not a CosmosDB database exists. 73 | Args: 74 | client (obj): A pydocumentdb client object. 75 | id (str): Database ID. 76 | Returns: 77 | bool: True if the database exists, False otherwise. 78 | """ 79 | databases = list( 80 | client.QueryDatabases( 81 | { 82 | "query": "SELECT * FROM r WHERE r.id=@id", 83 | "parameters": [{"name": "@id", "value": id}], 84 | } 85 | ) 86 | ) 87 | if len(databases) > 0: 88 | return True 89 | else: 90 | return False 91 | 92 | -------------------------------------------------------------------------------- /notebooks/reco_utils/dataset/cosmos_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | import pydocumentdb.errors as errors 4 | 5 | 6 | def find_collection(client, dbid, id): 7 | """Find whether or not a CosmosDB collection exists. 8 | Args: 9 | client (obj): A pydocumentdb client object. 10 | dbid (str): Database ID. 11 | id (str): Collection ID. 12 | Returns: 13 | bool: True if the collection exists, False otherwise. 14 | """ 15 | database_link = "dbs/" + dbid 16 | collections = list( 17 | client.QueryCollections( 18 | database_link, 19 | { 20 | "query": "SELECT * FROM r WHERE r.id=@id", 21 | "parameters": [{"name": "@id", "value": id}], 22 | }, 23 | ) 24 | ) 25 | if len(collections) > 0: 26 | return True 27 | else: 28 | return False 29 | 30 | 31 | def read_collection(client, dbid, id): 32 | """Read a CosmosDB collection. 33 | Args: 34 | client (obj): A pydocumentdb client object. 35 | dbid (str): Database ID. 36 | id (str): Collection ID. 37 | Returns: 38 | obj: A collection. 39 | """ 40 | try: 41 | database_link = "dbs/" + dbid 42 | collection_link = database_link + "/colls/{0}".format(id) 43 | collection = client.ReadCollection(collection_link) 44 | return collection 45 | except errors.DocumentDBError as e: 46 | if e.status_code == 404: 47 | print("A collection with id '{0}' does not exist".format(id)) 48 | else: 49 | raise errors.HTTPFailure(e.status_code) 50 | 51 | 52 | def read_database(client, id): 53 | """Read a CosmosDB database. 54 | Args: 55 | client (obj): A pydocumentdb client object. 56 | id (str): Database ID. 57 | Returns: 58 | obj: A database. 59 | """ 60 | try: 61 | database_link = "dbs/" + id 62 | database = client.ReadDatabase(database_link) 63 | return database 64 | except errors.DocumentDBError as e: 65 | if e.status_code == 404: 66 | print("A database with id '{0}' does not exist".format(id)) 67 | else: 68 | raise errors.HTTPFailure(e.status_code) 69 | 70 | 71 | def find_database(client, id): 72 | """Find whether or not a CosmosDB database exists. 73 | Args: 74 | client (obj): A pydocumentdb client object. 75 | id (str): Database ID. 76 | Returns: 77 | bool: True if the database exists, False otherwise. 78 | """ 79 | databases = list( 80 | client.QueryDatabases( 81 | { 82 | "query": "SELECT * FROM r WHERE r.id=@id", 83 | "parameters": [{"name": "@id", "value": id}], 84 | } 85 | ) 86 | ) 87 | if len(databases) > 0: 88 | return True 89 | else: 90 | return False 91 | 92 | -------------------------------------------------------------------------------- /scripts/databricks_install.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | # --------------------------------------------------------- 4 | # This script installs Recommenders into Databricks 5 | 6 | DATABRICKS_CLI=$(which databricks) 7 | if ! [ -x "$DATABRICKS_CLI" ]; then 8 | echo "No databricks-cli found!! Please see the SETUP.md file for installation prerequisites." 9 | exit 1 10 | fi 11 | 12 | CLUSTER_ID=$1 13 | if [ -z $CLUSTER_ID ]; then 14 | echo "Please provide the target cluster id: 'databricks_install.sh '." 15 | echo "Cluster id can be found by running 'databricks clusters list'" 16 | echo "which returns a list of ." 17 | exit 1 18 | fi 19 | 20 | CLUSTER_EXIST=false 21 | while IFS=' ' read -ra ARR; do 22 | if [ ${ARR[0]} = $CLUSTER_ID ]; then 23 | CLUSTER_EXIST=true 24 | 25 | STATUS=${ARR[2]} 26 | STATUS=${STATUS//[^a-zA-Z]/} 27 | if [ $STATUS = RUNNING ]; then 28 | echo 29 | echo "Preparing Recommenders library file (egg)..." 30 | zip -r -q Recommenders.egg ./reco_utils -i \*.py 31 | 32 | echo 33 | echo "Uploading to databricks..." 34 | dbfs cp --overwrite Recommenders.egg dbfs:/FileStore/jars/Recommenders.egg 35 | 36 | echo 37 | echo "Installing the library onto databricks cluster $CLUSTER_ID..." 38 | databricks libraries install --cluster-id $CLUSTER_ID --egg dbfs:/FileStore/jars/Recommenders.egg 39 | 40 | echo 41 | echo "Done! Installation status checking..." 42 | databricks libraries cluster-status --cluster-id $CLUSTER_ID 43 | 44 | echo 45 | echo "Restarting the cluster to activate the library..." 46 | databricks clusters restart --cluster-id $CLUSTER_ID 47 | 48 | echo "This will take few seconds. Please check the result from Databricks workspace." 49 | echo "Alternatively, run 'databricks clusters list' to check the restart status and" 50 | echo "run 'databricks libraries cluster-status --cluster-id $CLUSTER_ID' to check the installation status." 51 | 52 | rm Recommenders.egg 53 | exit 0 54 | else 55 | echo "Cluster $CLUSTER_ID found, but it is not running. Status=${STATUS}" 56 | echo "You can start the cluster with 'databricks clusters start --cluster-id $CLUSTER_ID'." 57 | echo "Then, check the cluster status by using 'databricks clusters list' and" 58 | echo "re-try installation once the status turns into RUNNING." 59 | exit 1 60 | fi 61 | fi 62 | done < <(databricks clusters list) 63 | 64 | if ! [ $CLUSTER_EXIST = true ]; then 65 | echo "Cannot find the target cluster $CLUSTER_ID. Please check if you entered the valid id." 66 | echo "Cluster id can be found by running 'databricks clusters list'" 67 | echo "which returns a list of ." 68 | exit 1 69 | fi 70 | 71 | -------------------------------------------------------------------------------- /notebooks/scripts/databricks_install.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | # --------------------------------------------------------- 4 | # This script installs Recommenders into Databricks 5 | 6 | DATABRICKS_CLI=$(which databricks) 7 | if ! [ -x "$DATABRICKS_CLI" ]; then 8 | echo "No databricks-cli found!! Please see the SETUP.md file for installation prerequisites." 9 | exit 1 10 | fi 11 | 12 | CLUSTER_ID=$1 13 | if [ -z $CLUSTER_ID ]; then 14 | echo "Please provide the target cluster id: 'databricks_install.sh '." 15 | echo "Cluster id can be found by running 'databricks clusters list'" 16 | echo "which returns a list of ." 17 | exit 1 18 | fi 19 | 20 | CLUSTER_EXIST=false 21 | while IFS=' ' read -ra ARR; do 22 | if [ ${ARR[0]} = $CLUSTER_ID ]; then 23 | CLUSTER_EXIST=true 24 | 25 | STATUS=${ARR[2]} 26 | STATUS=${STATUS//[^a-zA-Z]/} 27 | if [ $STATUS = RUNNING ]; then 28 | echo 29 | echo "Preparing Recommenders library file (egg)..." 30 | zip -r -q Recommenders.egg ./reco_utils -i \*.py 31 | 32 | echo 33 | echo "Uploading to databricks..." 34 | dbfs cp --overwrite Recommenders.egg dbfs:/FileStore/jars/Recommenders.egg 35 | 36 | echo 37 | echo "Installing the library onto databricks cluster $CLUSTER_ID..." 38 | databricks libraries install --cluster-id $CLUSTER_ID --egg dbfs:/FileStore/jars/Recommenders.egg 39 | 40 | echo 41 | echo "Done! Installation status checking..." 42 | databricks libraries cluster-status --cluster-id $CLUSTER_ID 43 | 44 | echo 45 | echo "Restarting the cluster to activate the library..." 46 | databricks clusters restart --cluster-id $CLUSTER_ID 47 | 48 | echo "This will take few seconds. Please check the result from Databricks workspace." 49 | echo "Alternatively, run 'databricks clusters list' to check the restart status and" 50 | echo "run 'databricks libraries cluster-status --cluster-id $CLUSTER_ID' to check the installation status." 51 | 52 | rm Recommenders.egg 53 | exit 0 54 | else 55 | echo "Cluster $CLUSTER_ID found, but it is not running. Status=${STATUS}" 56 | echo "You can start the cluster with 'databricks clusters start --cluster-id $CLUSTER_ID'." 57 | echo "Then, check the cluster status by using 'databricks clusters list' and" 58 | echo "re-try installation once the status turns into RUNNING." 59 | exit 1 60 | fi 61 | fi 62 | done < <(databricks clusters list) 63 | 64 | if ! [ $CLUSTER_EXIST = true ]; then 65 | echo "Cannot find the target cluster $CLUSTER_ID. Please check if you entered the valid id." 66 | echo "Cluster id can be found by running 'databricks clusters list'" 67 | echo "which returns a list of ." 68 | exit 1 69 | fi 70 | 71 | -------------------------------------------------------------------------------- /tests/unit/test_pandas_df_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | import pandas as pd 6 | from reco_utils.dataset.pandas_df_utils import ( 7 | user_item_pairs, 8 | filter_by 9 | ) 10 | 11 | 12 | @pytest.fixture(scope="module") 13 | def user_item_dataset(): 14 | """Get users and items dataframe""" 15 | user_df = pd.DataFrame({ 16 | 'user_id': [1, 2, 3, 4, 5], 17 | 'user_age': [23, 24, 25, 26, 27] 18 | }) 19 | 20 | item_df = pd.DataFrame({ 21 | 'item_id': [6, 7, 8], 22 | 'item_feat': [[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]] 23 | }) 24 | 25 | return user_df, item_df 26 | 27 | 28 | def test_user_item_pairs(user_item_dataset): 29 | user_df, item_df = user_item_dataset 30 | 31 | user_item = user_item_pairs( 32 | user_df=user_df, 33 | item_df=item_df, 34 | user_col='user_id', 35 | item_col='item_id', 36 | shuffle=False 37 | ) 38 | # Validate cross-join 39 | assert len(user_df) * len(item_df) == len(user_item) 40 | assert user_item.loc[(user_item['user_id'] == 3) & (user_item['item_id'] == 7)].values.tolist()[0]\ 41 | == [3, 25, 7, [0.2, 0.2]] 42 | 43 | # Check if result is deterministic 44 | assert user_item.iloc[0].values.tolist() == [1, 23, 6, [0.1, 0.1]] 45 | 46 | # Check shuffle 47 | user_item_shuffled = user_item_pairs( 48 | user_df=user_df, 49 | item_df=item_df, 50 | user_col='user_id', 51 | item_col='item_id', 52 | shuffle=True 53 | ) 54 | # Check shuffled result is still valid 55 | assert len(user_df) * len(item_df) == len(user_item_shuffled) 56 | row = user_item.loc[(user_item['user_id'] == 2) & (user_item['item_id'] == 6)] 57 | assert row['user_age'].iloc[0] == 24 58 | assert row['item_feat'].iloc[0] == [0.1, 0.1] 59 | # Check shuffled result is different from not-shuffled dataframe 60 | assert [*user_item_shuffled['user_id'].values] != [*user_item['user_id'].values] 61 | 62 | # Check filter 63 | seen_df = pd.DataFrame({ 64 | 'user_id': [1, 9, 3, 5, 5, 1], 65 | 'item_id': [1, 6, 7, 6, 8, 9] 66 | }) 67 | user_item_filtered = user_item_pairs( 68 | user_df=user_df, 69 | item_df=item_df, 70 | user_col='user_id', 71 | item_col='item_id', 72 | user_item_filter_df=seen_df, 73 | shuffle=False 74 | ) 75 | # Check filtered out number 76 | assert len(user_item_filtered) == len(user_item) - 3 77 | # Check filtered out record 78 | assert len(user_item_filtered.loc[(user_item['user_id'] == 3) & (user_item['item_id'] == 7)]) == 0 79 | 80 | 81 | def test_filter_by(): 82 | user_df = pd.DataFrame({ 83 | 'user_id': [1, 9, 3, 5, 5, 1], 84 | 'item_id': [1, 6, 7, 6, 8, 9] 85 | }) 86 | 87 | seen_df = pd.DataFrame({ 88 | 'user_id': [1, 2, 4], 89 | }) 90 | 91 | filtered_df = filter_by(user_df, seen_df, ['user_id']) 92 | 93 | # Check filtered out number 94 | assert len(filtered_df) == len(user_df) - 2 95 | # Check filtered out record 96 | assert len(filtered_df.loc[(user_df['user_id'] == 1)]) == 0 97 | -------------------------------------------------------------------------------- /tests/unit/test_deeprec_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import tensorflow as tf 4 | from reco_utils.recommender.deeprec.deeprec_utils import ( 5 | prepare_hparams, 6 | download_deeprec_resources, 7 | load_yaml, 8 | ) 9 | from reco_utils.recommender.deeprec.IO.iterator import FFMTextIterator 10 | from reco_utils.recommender.deeprec.IO.dkn_iterator import DKNTextIterator 11 | 12 | 13 | @pytest.fixture 14 | def resource_path(): 15 | return os.path.dirname(os.path.realpath(__file__)) 16 | 17 | 18 | @pytest.mark.parametrize( 19 | "must_exist_attributes", ["FEATURE_COUNT", "data_format", "dim"] 20 | ) 21 | @pytest.mark.gpu 22 | @pytest.mark.deeprec 23 | def test_prepare_hparams(must_exist_attributes, resource_path): 24 | data_path = os.path.join(resource_path, "..", "resources", "deeprec", "xdeepfm") 25 | yaml_file = os.path.join(data_path, "xDeepFM.yaml") 26 | if not os.path.exists(yaml_file): 27 | download_deeprec_resources( 28 | "https://recodatasets.blob.core.windows.net/deeprec/", 29 | data_path, 30 | "xdeepfmresources.zip", 31 | ) 32 | hparams = prepare_hparams(yaml_file) 33 | assert hasattr(hparams, must_exist_attributes) 34 | 35 | 36 | @pytest.mark.gpu 37 | @pytest.mark.deeprec 38 | def test_load_yaml_file(resource_path): 39 | data_path = os.path.join(resource_path, "..", "resources", "deeprec", "xdeepfm") 40 | yaml_file = os.path.join(data_path, "xDeepFM.yaml") 41 | 42 | if not os.path.exists(yaml_file): 43 | download_deeprec_resources( 44 | "https://recodatasets.blob.core.windows.net/deeprec/", 45 | data_path, 46 | "xdeepfmresources.zip", 47 | ) 48 | 49 | config = load_yaml(yaml_file) 50 | assert config is not None 51 | 52 | 53 | @pytest.mark.gpu 54 | @pytest.mark.deeprec 55 | def test_FFM_iterator(resource_path): 56 | data_path = os.path.join(resource_path, "..", "resources", "deeprec", "xdeepfm") 57 | yaml_file = os.path.join(data_path, "xDeepFM.yaml") 58 | data_file = os.path.join(data_path, "sample_FFM_data.txt") 59 | 60 | if not os.path.exists(yaml_file): 61 | download_deeprec_resources( 62 | "https://recodatasets.blob.core.windows.net/deeprec/", 63 | data_path, 64 | "xdeepfmresources.zip", 65 | ) 66 | 67 | hparams = prepare_hparams(yaml_file) 68 | iterator = FFMTextIterator(hparams, tf.Graph()) 69 | assert iterator is not None 70 | for res in iterator.load_data_from_file(data_file): 71 | assert isinstance(res, dict) 72 | 73 | 74 | @pytest.mark.gpu 75 | @pytest.mark.deeprec 76 | def test_DKN_iterator(resource_path): 77 | data_path = os.path.join(resource_path, "..", "resources", "deeprec", "dkn") 78 | data_file = os.path.join(data_path, "final_test_with_entity.txt") 79 | yaml_file = os.path.join(data_path, "dkn.yaml") 80 | if not os.path.exists(yaml_file): 81 | download_deeprec_resources( 82 | "https://recodatasets.blob.core.windows.net/deeprec/", 83 | data_path, 84 | "dknresources.zip", 85 | ) 86 | 87 | hparams = prepare_hparams(yaml_file, wordEmb_file="", entityEmb_file="") 88 | iterator = DKNTextIterator(hparams, tf.Graph()) 89 | assert iterator is not None 90 | for res in iterator.load_data_from_file(data_file): 91 | assert isinstance(res, dict) 92 | -------------------------------------------------------------------------------- /reco_utils/common/gpu_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import sys 5 | import os 6 | import glob 7 | from numba import cuda 8 | from numba.cuda.cudadrv.error import CudaSupportError 9 | 10 | 11 | DEFAULT_CUDA_PATH_LINUX = "/usr/local/cuda/version.txt" 12 | 13 | 14 | def get_number_gpus(): 15 | """Get the number of GPUs in the system. 16 | 17 | Returns: 18 | int: Number of GPUs. 19 | """ 20 | try: 21 | return len(cuda.gpus) 22 | except CudaSupportError: 23 | return 0 24 | 25 | 26 | def clear_memory_all_gpus(): 27 | """Clear memory of all GPUs.""" 28 | try: 29 | for gpu in cuda.gpus: 30 | with gpu: 31 | cuda.current_context().deallocations.clear() 32 | except CudaSupportError: 33 | print("No CUDA available") 34 | 35 | 36 | def get_cuda_version(unix_path=DEFAULT_CUDA_PATH_LINUX): 37 | """Get CUDA version 38 | 39 | Args: 40 | unix_path (str): Path to CUDA version file in Linux/Mac. 41 | 42 | Returns: 43 | str: Version of the library. 44 | """ 45 | if sys.platform == "win32": 46 | raise NotImplementedError("Implement this!") 47 | elif sys.platform in ["linux", "darwin"]: 48 | if os.path.isfile(unix_path): 49 | with open(unix_path, "r") as f: 50 | data = f.read().replace("\n", "") 51 | return data 52 | else: 53 | return "No CUDA in this machine" 54 | else: 55 | raise ValueError("Not in Windows, Linux or Mac") 56 | 57 | 58 | def get_cudnn_version(): 59 | """Get the CuDNN version 60 | 61 | Returns: 62 | str: Version of the library. 63 | 64 | """ 65 | 66 | def find_cudnn_in_headers(candidates): 67 | for c in candidates: 68 | file = glob.glob(c) 69 | if file: 70 | break 71 | if file: 72 | with open(file[0], "r") as f: 73 | version = "" 74 | for line in f: 75 | if "#define CUDNN_MAJOR" in line: 76 | version = line.split()[-1] 77 | if "#define CUDNN_MINOR" in line: 78 | version += "." + line.split()[-1] 79 | if "#define CUDNN_PATCHLEVEL" in line: 80 | version += "." + line.split()[-1] 81 | if version: 82 | return version 83 | else: 84 | return "Cannot find CUDNN version" 85 | else: 86 | return "No CUDNN in this machine" 87 | 88 | if sys.platform == "win32": 89 | candidates = ["C:\\NVIDIA\\cuda\\include\\cudnn.h", 90 | "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\include\\cudnn.h"] 91 | elif sys.platform == "linux": 92 | candidates = [ 93 | "/usr/include/x86_64-linux-gnu/cudnn_v*.h", 94 | "/usr/local/cuda/include/cudnn.h", 95 | "/usr/include/cudnn.h", 96 | ] 97 | elif sys.platform == "darwin": 98 | candidates = ["/usr/local/cuda/include/cudnn.h", "/usr/include/cudnn.h"] 99 | else: 100 | raise ValueError("Not in Windows, Linux or Mac") 101 | return find_cudnn_in_headers(candidates) 102 | 103 | -------------------------------------------------------------------------------- /notebooks/reco_utils/common/gpu_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import sys 5 | import os 6 | import glob 7 | from numba import cuda 8 | from numba.cuda.cudadrv.error import CudaSupportError 9 | 10 | 11 | DEFAULT_CUDA_PATH_LINUX = "/usr/local/cuda/version.txt" 12 | 13 | 14 | def get_number_gpus(): 15 | """Get the number of GPUs in the system. 16 | 17 | Returns: 18 | int: Number of GPUs. 19 | """ 20 | try: 21 | return len(cuda.gpus) 22 | except CudaSupportError: 23 | return 0 24 | 25 | 26 | def clear_memory_all_gpus(): 27 | """Clear memory of all GPUs.""" 28 | try: 29 | for gpu in cuda.gpus: 30 | with gpu: 31 | cuda.current_context().deallocations.clear() 32 | except CudaSupportError: 33 | print("No CUDA available") 34 | 35 | 36 | def get_cuda_version(unix_path=DEFAULT_CUDA_PATH_LINUX): 37 | """Get CUDA version 38 | 39 | Args: 40 | unix_path (str): Path to CUDA version file in Linux/Mac. 41 | 42 | Returns: 43 | str: Version of the library. 44 | """ 45 | if sys.platform == "win32": 46 | raise NotImplementedError("Implement this!") 47 | elif sys.platform == "linux" or sys.platform == "darwin": 48 | if os.path.isfile(unix_path): 49 | with open(unix_path, "r") as f: 50 | data = f.read().replace("\n", "") 51 | return data 52 | else: 53 | return "No CUDA in this machine" 54 | else: 55 | raise ValueError("Not in Windows, Linux or Mac") 56 | 57 | 58 | def get_cudnn_version(): 59 | """Get the CuDNN version 60 | 61 | Returns: 62 | str: Version of the library. 63 | """ 64 | 65 | def find_cudnn_in_headers(candiates): 66 | for c in candidates: 67 | file = glob.glob(c) 68 | if file: 69 | break 70 | if file: 71 | with open(file[0], "r") as f: 72 | version = "" 73 | for line in f: 74 | if "#define CUDNN_MAJOR" in line: 75 | version = line.split()[-1] 76 | if "#define CUDNN_MINOR" in line: 77 | version += "." + line.split()[-1] 78 | if "#define CUDNN_PATCHLEVEL" in line: 79 | version += "." + line.split()[-1] 80 | if version: 81 | return version 82 | else: 83 | return "Cannot find CUDNN version" 84 | else: 85 | return "No CUDNN in this machine" 86 | 87 | if sys.platform == "win32": 88 | candidates = ["C:\\NVIDIA\\cuda\\include\\cudnn.h", 89 | "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v[0-99]\\include\\cudnn.h"] 90 | elif sys.platform == "linux": 91 | candidates = [ 92 | "/usr/include/x86_64-linux-gnu/cudnn_v[0-99].h", 93 | "/usr/local/cuda/include/cudnn.h", 94 | "/usr/include/cudnn.h", 95 | ] 96 | elif sys.platform == "darwin": 97 | candidates = ["/usr/local/cuda/include/cudnn.h", "/usr/include/cudnn.h"] 98 | else: 99 | raise ValueError("Not in Windows, Linux or Mac") 100 | return find_cudnn_in_headers(candidates) 101 | -------------------------------------------------------------------------------- /reco_utils/common/python_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import logging 5 | 6 | import numpy as np 7 | from scipy import sparse 8 | 9 | 10 | logger = logging.getLogger() 11 | 12 | 13 | def exponential_decay(value, max_val, half_life): 14 | """Compute decay factor for a given value based on an exponential decay 15 | Values greater than max_val will be set to 1 16 | Args: 17 | value (numeric): value to calculate decay factor 18 | max_val (numeric): value at which decay factor will be 1 19 | half_life (numeric): value at which decay factor will be 0.5 20 | Returns: 21 | float: decay factor 22 | """ 23 | 24 | return np.minimum(1.0, np.power(0.5, (max_val - value) / half_life)) 25 | 26 | 27 | def jaccard(cooccurrence): 28 | """Helper method to calculate the Jaccard similarity of a matrix of co-occurrences 29 | Args: 30 | cooccurrence (np.array): the symmetric matrix of co-occurrences of items 31 | Returns: 32 | np.array: The matrix of Jaccard similarities between any two items 33 | """ 34 | 35 | diag = cooccurrence.diagonal() 36 | diag_rows = np.expand_dims(diag, axis=0) 37 | diag_cols = np.expand_dims(diag, axis=1) 38 | 39 | with np.errstate(invalid="ignore", divide="ignore"): 40 | result = cooccurrence / (diag_rows + diag_cols - cooccurrence) 41 | 42 | return np.array(result) 43 | 44 | 45 | def lift(cooccurrence): 46 | """Helper method to calculate the Lift of a matrix of co-occurrences 47 | Args: 48 | cooccurrence (np.array): the symmetric matrix of co-occurrences of items 49 | Returns: 50 | np.array: The matrix of Lifts between any two items 51 | """ 52 | 53 | diag = cooccurrence.diagonal() 54 | diag_rows = np.expand_dims(diag, axis=0) 55 | diag_cols = np.expand_dims(diag, axis=1) 56 | 57 | with np.errstate(invalid="ignore", divide="ignore"): 58 | result = cooccurrence / (diag_rows * diag_cols) 59 | 60 | return np.array(result) 61 | 62 | 63 | def get_top_k_scored_items(scores, top_k, sort_top_k=False): 64 | """Extract top K items from a matrix of scores for each user-item pair, optionally sort results per user 65 | 66 | Args: 67 | scores (np.array): score matrix (users x items) 68 | top_k (int): number of top items to recommend 69 | sort_top_k (bool): flag to sort top k results 70 | 71 | Returns: 72 | np.array, np.array: indices into score matrix for each users top items, scores corresponding to top items 73 | """ 74 | 75 | # ensure we're working with a dense ndarray 76 | if isinstance(scores, sparse.spmatrix): 77 | scores = scores.todense() 78 | 79 | if scores.shape[1] < top_k: 80 | logger.warning( 81 | "Number of items is less than top_k, limiting top_k to number of items" 82 | ) 83 | k = min(top_k, scores.shape[1]) 84 | 85 | test_user_idx = np.arange(scores.shape[0])[:, None] 86 | 87 | # get top K items and scores 88 | # this determines the un-ordered top-k item indices for each user 89 | top_items = np.argpartition(scores, -k, axis=1)[:, -k:] 90 | top_scores = scores[test_user_idx, top_items] 91 | 92 | if sort_top_k: 93 | sort_ind = np.argsort(-top_scores) 94 | top_items = top_items[test_user_idx, sort_ind] 95 | top_scores = top_scores[test_user_idx, sort_ind] 96 | 97 | return np.array(top_items), np.array(top_scores) 98 | -------------------------------------------------------------------------------- /tests/unit/test_vowpal_wabbit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | import pytest 6 | from unittest import mock 7 | 8 | 9 | import pandas as pd 10 | 11 | from reco_utils.recommender.vowpal_wabbit.vw import VW 12 | 13 | 14 | @pytest.fixture(scope="module") 15 | def df(): 16 | return pd.DataFrame( 17 | dict(user=[1, 3, 2], item=[8, 7, 7], rating=[1, 5, 3], timestamp=[1, 2, 3]) 18 | ) 19 | 20 | 21 | @pytest.fixture(scope="function") 22 | def model(): 23 | model = VW(col_user="user", col_item="item", col_prediction="prediction", q="ui") 24 | yield model 25 | del model 26 | 27 | 28 | def test_vw_init_del(): 29 | model = VW() 30 | tempdir = model.tempdir.name 31 | assert os.path.exists(tempdir) 32 | 33 | del model 34 | assert not os.path.exists(tempdir) 35 | 36 | 37 | def test_to_vw_cmd(): 38 | expected = [ 39 | "vw", 40 | "-l", 41 | "0.1", 42 | "--l1", 43 | "0.2", 44 | "--loss_function", 45 | "logistic", 46 | "--holdout_off", 47 | "--rank", 48 | "3", 49 | "-t", 50 | ] 51 | params = dict( 52 | l=0.1, 53 | l1=0.2, 54 | loss_function="logistic", 55 | holdout_off=True, 56 | quiet=False, 57 | rank=3, 58 | t=True, 59 | ) 60 | assert VW.to_vw_cmd(params=params) == expected 61 | 62 | 63 | def test_parse_train_cmd(model): 64 | expected = [ 65 | "vw", 66 | "--loss_function", 67 | "logistic", 68 | "--oaa", 69 | "5", 70 | "-f", 71 | model.model_file, 72 | "-d", 73 | model.train_file, 74 | ] 75 | params = dict(loss_function="logistic", oaa=5, f="test", d="data", quiet=False) 76 | assert model.parse_train_params(params=params) == expected 77 | 78 | 79 | def test_parse_test_cmd(model): 80 | expected = [ 81 | "vw", 82 | "--loss_function", 83 | "logistic", 84 | "-d", 85 | model.test_file, 86 | "--quiet", 87 | "-i", 88 | model.model_file, 89 | "-p", 90 | model.prediction_file, 91 | "-t", 92 | ] 93 | params = dict( 94 | loss_function="logistic", i="test", oaa=5, d="data", test_only=True, quiet=True 95 | ) 96 | assert model.parse_test_params(params=params) == expected 97 | 98 | 99 | def test_to_vw_file(model, df): 100 | expected = ["1 0|user 1 |item 8", "5 1|user 3 |item 7", "3 2|user 2 |item 7"] 101 | model.to_vw_file(df, train=True) 102 | with open(model.train_file, "r") as f: 103 | assert f.read().splitlines() == expected 104 | del model 105 | 106 | 107 | def test_fit_and_predict(model, df): 108 | # generate fake predictions 109 | with open(model.prediction_file, "w") as f: 110 | f.writelines(["1 0\n", "3 1\n", "5 2\n"]) 111 | 112 | # patch subprocess call to vw 113 | with mock.patch("reco_utils.recommender.vowpal_wabbit.vw.run") as mock_run: 114 | model.fit(df) 115 | result = model.predict(df) 116 | 117 | expected = dict( 118 | user=dict(enumerate([1, 3, 2])), 119 | item=dict(enumerate([8, 7, 7])), 120 | rating=dict(enumerate([1, 5, 3])), 121 | timestamp=dict(enumerate([1, 2, 3])), 122 | prediction=dict(enumerate([1, 3, 5])), 123 | ) 124 | 125 | assert result.to_dict() == expected 126 | -------------------------------------------------------------------------------- /reco_utils/common/notebook_memory_management.py: -------------------------------------------------------------------------------- 1 | # Original code: https://raw.githubusercontent.com/miguelgfierro/codebase/master/python/system/notebook_memory_management.py 2 | # 3 | # Profile memory usage envelope of IPython commands and report interactively. 4 | # Usage (inside a python notebook): 5 | # from notebook_memory_management import start_watching_memory, stop_watching_memory 6 | # To start profile: 7 | # start_watching_memory() 8 | # To stop profile: 9 | # stop_watching_memory() 10 | # 11 | # Based on: https://github.com/ianozsvald/ipython_memory_usage 12 | # 13 | 14 | from __future__ import division # 1/2 == 0.5, as in Py3 15 | from __future__ import absolute_import # avoid hiding global modules with locals 16 | from __future__ import print_function # force use of print("hello") 17 | from __future__ import ( 18 | unicode_literals 19 | ) # force unadorned strings "" to be Unicode without prepending u"" 20 | import time 21 | import memory_profiler 22 | from IPython import get_ipython 23 | import psutil 24 | import warnings 25 | 26 | 27 | # keep a global accounting for the last known memory usage 28 | # which is the reference point for the memory delta calculation 29 | previous_call_memory_usage = memory_profiler.memory_usage()[0] 30 | t1 = time.time() # will be set to current time later 31 | keep_watching = True 32 | watching_memory = True 33 | try: 34 | input_cells = get_ipython().user_ns["In"] 35 | except: 36 | warnings.warn("Not running on notebook") 37 | 38 | 39 | def start_watching_memory(): 40 | """Register memory profiling tools to IPython instance.""" 41 | global watching_memory 42 | watching_memory = True 43 | ip = get_ipython() 44 | ip.events.register("post_run_cell", watch_memory) 45 | ip.events.register("pre_run_cell", pre_run_cell) 46 | 47 | 48 | def stop_watching_memory(): 49 | """Unregister memory profiling tools from IPython instance.""" 50 | global watching_memory 51 | watching_memory = False 52 | ip = get_ipython() 53 | try: 54 | ip.events.unregister("post_run_cell", watch_memory) 55 | except ValueError: 56 | print("ERROR: problem when unregistering") 57 | pass 58 | try: 59 | ip.events.unregister("pre_run_cell", pre_run_cell) 60 | except ValueError: 61 | print("ERROR: problem when unregistering") 62 | pass 63 | 64 | 65 | def watch_memory(): 66 | # bring in the global memory usage value from the previous iteration 67 | global previous_call_memory_usage, keep_watching, watching_memory, input_cells 68 | new_memory_usage = memory_profiler.memory_usage()[0] 69 | memory_delta = new_memory_usage - previous_call_memory_usage 70 | keep_watching = False 71 | total_memory = psutil.virtual_memory()[0] / 1024 / 1024 # in Mb 72 | # calculate time delta using global t1 (from the pre-run event) and current time 73 | time_delta_secs = time.time() - t1 74 | num_commands = len(input_cells) - 1 75 | cmd = "In [{}]".format(num_commands) 76 | # convert the results into a pretty string 77 | output_template = ( 78 | "{cmd} used {memory_delta:0.4f} Mb RAM in " 79 | "{time_delta:0.2f}s, total RAM usage " 80 | "{memory_usage:0.2f} Mb, total RAM " 81 | "memory {total_memory:0.2f} Mb" 82 | ) 83 | output = output_template.format( 84 | time_delta=time_delta_secs, 85 | cmd=cmd, 86 | memory_delta=memory_delta, 87 | memory_usage=new_memory_usage, 88 | total_memory=total_memory, 89 | ) 90 | if watching_memory: 91 | print(str(output)) 92 | previous_call_memory_usage = new_memory_usage 93 | 94 | 95 | def pre_run_cell(): 96 | """Capture current time before we execute the current command""" 97 | global t1 98 | t1 = time.time() 99 | 100 | -------------------------------------------------------------------------------- /notebooks/reco_utils/common/notebook_memory_management.py: -------------------------------------------------------------------------------- 1 | # Original code: https://raw.githubusercontent.com/miguelgfierro/codebase/master/python/system/notebook_memory_management.py 2 | # 3 | # Profile memory usage envelope of IPython commands and report interactively. 4 | # Usage (inside a python notebook): 5 | # from notebook_memory_management import start_watching_memory, stop_watching_memory 6 | # To start profile: 7 | # start_watching_memory() 8 | # To stop profile: 9 | # stop_watching_memory() 10 | # 11 | # Based on: https://github.com/ianozsvald/ipython_memory_usage 12 | # 13 | 14 | from __future__ import division # 1/2 == 0.5, as in Py3 15 | from __future__ import absolute_import # avoid hiding global modules with locals 16 | from __future__ import print_function # force use of print("hello") 17 | from __future__ import ( 18 | unicode_literals 19 | ) # force unadorned strings "" to be unicode without prepending u"" 20 | import time 21 | import memory_profiler 22 | from IPython import get_ipython 23 | import psutil 24 | import warnings 25 | 26 | 27 | # keep a global accounting for the last known memory usage 28 | # which is the reference point for the memory delta calculation 29 | previous_call_memory_usage = memory_profiler.memory_usage()[0] 30 | t1 = time.time() # will be set to current time later 31 | keep_watching = True 32 | watching_memory = True 33 | try: 34 | input_cells = get_ipython().user_ns["In"] 35 | except: 36 | warnings.warn("Not running on notebook") 37 | 38 | 39 | def start_watching_memory(): 40 | """Register memory profiling tools to IPython instance.""" 41 | global watching_memory 42 | watching_memory = True 43 | ip = get_ipython() 44 | ip.events.register("post_run_cell", watch_memory) 45 | ip.events.register("pre_run_cell", pre_run_cell) 46 | 47 | 48 | def stop_watching_memory(): 49 | """Unregister memory profiling tools from IPython instance.""" 50 | global watching_memory 51 | watching_memory = False 52 | ip = get_ipython() 53 | try: 54 | ip.events.unregister("post_run_cell", watch_memory) 55 | except ValueError: 56 | print("ERROR: problem when unregistering") 57 | pass 58 | try: 59 | ip.events.unregister("pre_run_cell", pre_run_cell) 60 | except ValueError: 61 | print("ERROR: problem when unregistering") 62 | pass 63 | 64 | 65 | def watch_memory(): 66 | # bring in the global memory usage value from the previous iteration 67 | global previous_call_memory_usage, keep_watching, watching_memory, input_cells 68 | new_memory_usage = memory_profiler.memory_usage()[0] 69 | memory_delta = new_memory_usage - previous_call_memory_usage 70 | keep_watching = False 71 | total_memory = psutil.virtual_memory()[0] / 1024 / 1024 # in Mb 72 | # calculate time delta using global t1 (from the pre-run event) and current time 73 | time_delta_secs = time.time() - t1 74 | num_commands = len(input_cells) - 1 75 | cmd = "In [{}]".format(num_commands) 76 | # convert the results into a pretty string 77 | output_template = ( 78 | "{cmd} used {memory_delta:0.4f} Mb RAM in " 79 | "{time_delta:0.2f}s, total RAM usage " 80 | "{memory_usage:0.2f} Mb, total RAM " 81 | "memory {total_memory:0.2f} Mb" 82 | ) 83 | output = output_template.format( 84 | time_delta=time_delta_secs, 85 | cmd=cmd, 86 | memory_delta=memory_delta, 87 | memory_usage=new_memory_usage, 88 | total_memory=total_memory, 89 | ) 90 | if watching_memory: 91 | print(str(output)) 92 | previous_call_memory_usage = new_memory_usage 93 | 94 | 95 | def pre_run_cell(): 96 | """Capture current time before we execute the current command""" 97 | global t1 98 | t1 = time.time() 99 | 100 | -------------------------------------------------------------------------------- /tests/unit/test_wide_deep_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | import shutil 6 | 7 | import pandas as pd 8 | import tensorflow as tf 9 | 10 | from reco_utils.common.tf_utils import ( 11 | pandas_input_fn, 12 | MODEL_DIR 13 | ) 14 | from reco_utils.recommender.wide_deep.wide_deep_utils import ( 15 | build_model, 16 | build_feature_columns, 17 | ) 18 | from reco_utils.common.constants import ( 19 | DEFAULT_USER_COL, 20 | DEFAULT_ITEM_COL, 21 | DEFAULT_RATING_COL 22 | ) 23 | 24 | ITEM_FEAT_COL = 'itemFeat' 25 | 26 | 27 | @pytest.fixture(scope='module') 28 | def pd_df(): 29 | df = pd.DataFrame( 30 | { 31 | DEFAULT_USER_COL: [1, 1, 1, 2, 2, 2], 32 | DEFAULT_ITEM_COL: [1, 2, 3, 1, 4, 5], 33 | ITEM_FEAT_COL: [[1, 1, 1], [2, 2, 2], [3, 3, 3], [1, 1, 1], [4, 4, 4], [5, 5, 5]], 34 | DEFAULT_RATING_COL: [5, 4, 3, 5, 5, 3], 35 | } 36 | ) 37 | users = df.drop_duplicates(DEFAULT_USER_COL)[DEFAULT_USER_COL].values 38 | items = df.drop_duplicates(DEFAULT_ITEM_COL)[DEFAULT_ITEM_COL].values 39 | return df, users, items 40 | 41 | 42 | @pytest.mark.gpu 43 | def test_build_feature_columns(pd_df): 44 | data, users, items = pd_df 45 | 46 | # Test if wide column has one crossed column 47 | wide_columns, _ = build_feature_columns(users, items, model_type='wide') 48 | assert len(wide_columns) == 1 49 | 50 | # Test if deep columns have user and item columns 51 | _, deep_columns = build_feature_columns(users, items, model_type='deep') 52 | assert len(deep_columns) == 2 53 | 54 | # Test if wide and deep columns have correct columns 55 | wide_columns, deep_columns = build_feature_columns(users, items, model_type='wide_deep') 56 | assert len(wide_columns) == 1 57 | assert len(deep_columns) == 2 58 | 59 | 60 | @pytest.mark.gpu 61 | def test_build_model(pd_df): 62 | data, users, items = pd_df 63 | 64 | # Test wide model 65 | wide_columns, deep_columns = build_feature_columns(users, items, model_type='wide') 66 | model = build_model('wide_'+MODEL_DIR, wide_columns=wide_columns) 67 | assert isinstance(model, tf.estimator.LinearRegressor) 68 | model = build_model('wide_'+MODEL_DIR, wide_columns=wide_columns, deep_columns=deep_columns) 69 | assert isinstance(model, tf.estimator.LinearRegressor) 70 | 71 | # Test if model train works 72 | model.train( 73 | input_fn=pandas_input_fn(df=data, y_col=DEFAULT_RATING_COL, batch_size=1, num_epochs=10, shuffle=True) 74 | ) 75 | shutil.rmtree('wide_' + MODEL_DIR, ignore_errors=True) 76 | 77 | # Test deep model 78 | wide_columns, deep_columns = build_feature_columns(users, items, model_type='deep') 79 | model = build_model('deep_'+MODEL_DIR, deep_columns=deep_columns) 80 | assert isinstance(model, tf.estimator.DNNRegressor) 81 | model = build_model('deep_'+MODEL_DIR, wide_columns=wide_columns, deep_columns=deep_columns) 82 | assert isinstance(model, tf.estimator.DNNRegressor) 83 | 84 | # Test if model train works 85 | model.train( 86 | input_fn=pandas_input_fn(df=data, y_col=DEFAULT_RATING_COL, batch_size=1, num_epochs=10, shuffle=True) 87 | ) 88 | shutil.rmtree('deep_' + MODEL_DIR, ignore_errors=True) 89 | 90 | # Test wide_deep model 91 | wide_columns, deep_columns = build_feature_columns(users, items, model_type='wide_deep') 92 | model = build_model('wide_deep_'+MODEL_DIR, wide_columns=wide_columns, deep_columns=deep_columns) 93 | assert isinstance(model, tf.estimator.DNNLinearCombinedRegressor) 94 | 95 | # Test if model train works 96 | model.train( 97 | input_fn=pandas_input_fn(df=data, y_col=DEFAULT_RATING_COL, batch_size=1, num_epochs=10, shuffle=True) 98 | ) 99 | shutil.rmtree('wide_deep_'+MODEL_DIR, ignore_errors=True) 100 | -------------------------------------------------------------------------------- /tests/unit/test_surprise_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | """ 5 | Test utils for Surprise algos 6 | """ 7 | import pandas as pd 8 | import pytest 9 | 10 | import surprise 11 | 12 | from reco_utils.recommender.surprise.surprise_utils import ( 13 | compute_rating_predictions, 14 | compute_ranking_predictions 15 | ) 16 | from tests.unit.test_python_evaluation import python_data 17 | 18 | TOL = 0.001 19 | 20 | 21 | def test_compute_rating_predictions(python_data): 22 | rating_true, _, _ = python_data(binary_rating=False) 23 | svd = surprise.SVD() 24 | train_set = surprise.Dataset.load_from_df(rating_true, reader=surprise.Reader()).build_full_trainset() 25 | svd.fit(train_set) 26 | 27 | preds = compute_rating_predictions(svd, rating_true) 28 | assert set(preds.columns) == {'userID', 'itemID', 'prediction'} 29 | assert preds['userID'].dtypes == rating_true['userID'].dtypes 30 | assert preds['itemID'].dtypes == rating_true['itemID'].dtypes 31 | user = rating_true.iloc[0]['userID'] 32 | item = rating_true.iloc[0]['itemID'] 33 | assert preds[(preds['userID'] == user) & (preds['itemID'] == item)]['prediction'].values == \ 34 | pytest.approx(svd.predict(user, item).est, rel=TOL) 35 | 36 | preds = compute_rating_predictions(svd, rating_true.rename(columns={'userID': 'uid', 'itemID': 'iid'}), 37 | usercol='uid', itemcol='iid', predcol='pred') 38 | assert set(preds.columns) == {'uid', 'iid', 'pred'} 39 | assert preds['uid'].dtypes == rating_true['userID'].dtypes 40 | assert preds['iid'].dtypes == rating_true['itemID'].dtypes 41 | user = rating_true.iloc[1]['userID'] 42 | item = rating_true.iloc[1]['itemID'] 43 | assert preds[(preds['uid'] == user) & (preds['iid'] == item)]['pred'].values == \ 44 | pytest.approx(svd.predict(user, item).est, rel=TOL) 45 | 46 | 47 | def test_compute_ranking_predictions(python_data): 48 | rating_true, _, _ = python_data(binary_rating=False) 49 | n_users = len(rating_true['userID'].unique()) 50 | n_items = len(rating_true['itemID'].unique()) 51 | svd = surprise.SVD() 52 | train_set = surprise.Dataset.load_from_df(rating_true, reader=surprise.Reader()).build_full_trainset() 53 | svd.fit(train_set) 54 | 55 | preds = compute_ranking_predictions(svd, rating_true) 56 | assert set(preds.columns) == {'userID', 'itemID', 'prediction'} 57 | assert preds['userID'].dtypes == rating_true['userID'].dtypes 58 | assert preds['itemID'].dtypes == rating_true['itemID'].dtypes 59 | user = preds.iloc[0]['userID'] 60 | item = preds.iloc[0]['itemID'] 61 | assert preds[(preds['userID'] == user) & (preds['itemID'] == item)]['prediction'].values == \ 62 | pytest.approx(svd.predict(user, item).est, rel=TOL) 63 | # Test default recommend_seen=False 64 | assert pd.merge(rating_true, preds, on=['userID', 'itemID']).shape[0] == 0 65 | assert preds.shape[0] == (n_users * n_items - rating_true.shape[0]) 66 | 67 | preds = compute_ranking_predictions(svd, 68 | rating_true.rename(columns={'userID': 'uid', 'itemID': 'iid', 'rating': 'r'}), 69 | usercol='uid', itemcol='iid', predcol='pred', recommend_seen=True) 70 | assert set(preds.columns) == {'uid', 'iid', 'pred'} 71 | assert preds['uid'].dtypes == rating_true['userID'].dtypes 72 | assert preds['iid'].dtypes == rating_true['itemID'].dtypes 73 | user = preds.iloc[1]['uid'] 74 | item = preds.iloc[1]['iid'] 75 | assert preds[(preds['uid'] == user) & (preds['iid'] == item)]['pred'].values == \ 76 | pytest.approx(svd.predict(user, item).est, rel=TOL) 77 | # Test recommend_seen=True 78 | assert pd.merge(rating_true, preds, left_on=['userID', 'itemID'], right_on=['uid', 'iid']).shape[0] == \ 79 | rating_true.shape[0] 80 | assert preds.shape[0] == n_users * n_items 81 | -------------------------------------------------------------------------------- /scripts/prepare_databricks_for_o16n.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | # --------------------------------------------------------- 4 | # This script installs appropriate external libraries onto 5 | # a databricks cluster for operationalization. 6 | 7 | DATABRICKS_CLI=$(which databricks) 8 | if ! [ -x "$DATABRICKS_CLI" ]; then 9 | echo "No databricks-cli found!! Please see the SETUP.md file for installation prerequisites." 10 | exit 1 11 | fi 12 | 13 | CLUSTER_ID=$1 14 | if [ -z $CLUSTER_ID ]; then 15 | echo "Please provide the target cluster id: 'prepare_databricks_for_016n.sh '." 16 | echo "Cluster id can be found by running 'databricks clusters list'" 17 | echo "which returns a list of ." 18 | exit 1 19 | fi 20 | 21 | ## for spark version >=2.3.0 22 | COSMOSDB_CONNECTOR_URL="https://search.maven.org/remotecontent?filepath=com/microsoft/azure/azure-cosmosdb-spark_2.3.0_2.11/1.2.2/azure-cosmosdb-spark_2.3.0_2.11-1.2.2-uber.jar" 23 | COSMOSDB_CONNECTOR_BASENAME=$(basename $COSMOSDB_CONNECTOR_URL) 24 | 25 | CLUSTER_EXIST=false 26 | PYPI_LIBRARIES=( "azure-cli==2.0.56" "azureml-sdk[databricks]==1.0.8" "pydocumentdb==2.3.3" ) 27 | while IFS=' ' read -ra ARR; do 28 | if [ ${ARR[0]} = $CLUSTER_ID ]; then 29 | CLUSTER_EXIST=true 30 | 31 | STATUS=${ARR[2]} 32 | STATUS=${STATUS//[^a-zA-Z]/} 33 | if [ $STATUS = RUNNING ]; then 34 | ## install each of the pypi libraries 35 | for lib in "${PYPI_LIBRARIES[@]}" 36 | do 37 | echo 38 | echo "Adding $lib" 39 | echo 40 | databricks libraries install --cluster-id $CLUSTER_ID --pypi-package $lib 41 | done 42 | 43 | ## get spark-cosmosdb connector: 44 | echo 45 | echo "downloading cosmosdb connector jar file" 46 | echo 47 | curl -O $COSMOSDB_CONNECTOR_URL 48 | 49 | ## uplaod the jar to dbfs 50 | echo 51 | echo "Uploading to dbfs" 52 | echo 53 | dbfs cp --overwrite ${COSMOSDB_CONNECTOR_BASENAME} dbfs:/FileStore/jars/${COSMOSDB_CONNECTOR_BASENAME} 54 | 55 | # isntall from dbfs 56 | echo 57 | echo "Adding ${COSMOSDB_CONNECTOR_BASENAME} as library" 58 | echo 59 | databricks libraries install --cluster-id $CLUSTER_ID --jar dbfs:/FileStore/jars/${COSMOSDB_CONNECTOR_BASENAME} 60 | 61 | ## Check installation status 62 | echo 63 | echo "Done! Installation status checking..." 64 | databricks libraries cluster-status --cluster-id $CLUSTER_ID 65 | 66 | echo 67 | echo "Restarting the cluster to activate the library..." 68 | databricks clusters restart --cluster-id $CLUSTER_ID 69 | 70 | echo "This will take few seconds. Please check the result from Databricks workspace." 71 | echo "Alternatively, run 'databricks clusters list' to check the restart status and" 72 | echo "run 'databricks libraries cluster-status --cluster-id $CLUSTER_ID' to check the installation status." 73 | 74 | exit 0 75 | else 76 | echo "Cluster $CLUSTER_ID found, but it is not running. Status=${STATUS}" 77 | echo "You can start the cluster with 'databricks clusters start --cluster-id $CLUSTER_ID'." 78 | echo "Then, check the cluster status by using 'databricks clusters list' and" 79 | echo "re-try installation once the status turns into RUNNING." 80 | exit 1 81 | fi 82 | fi 83 | done < <(databricks clusters list) 84 | 85 | if ! [ $CLUSTER_EXIST = true ]; then 86 | echo "Cannot find the target cluster $CLUSTER_ID. Please check if you entered the valid id." 87 | echo "Cluster id can be found by running 'databricks clusters list'" 88 | echo "which returns a list of ." 89 | exit 1 90 | fi 91 | 92 | -------------------------------------------------------------------------------- /notebooks/scripts/prepare_databricks_for_o16n.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | # --------------------------------------------------------- 4 | # This script installs appropriate external libraries onto 5 | # a databricks cluster for operationalization. 6 | 7 | DATABRICKS_CLI=$(which databricks) 8 | if ! [ -x "$DATABRICKS_CLI" ]; then 9 | echo "No databricks-cli found!! Please see the SETUP.md file for installation prerequisites." 10 | exit 1 11 | fi 12 | 13 | CLUSTER_ID=$1 14 | if [ -z $CLUSTER_ID ]; then 15 | echo "Please provide the target cluster id: 'prepare_databricks_for_016n.sh '." 16 | echo "Cluster id can be found by running 'databricks clusters list'" 17 | echo "which returns a list of ." 18 | exit 1 19 | fi 20 | 21 | ## for spark version >=2.3.0 22 | COSMOSDB_CONNECTOR_URL="https://search.maven.org/remotecontent?filepath=com/microsoft/azure/azure-cosmosdb-spark_2.3.0_2.11/1.2.2/azure-cosmosdb-spark_2.3.0_2.11-1.2.2-uber.jar" 23 | COSMOSDB_CONNECTOR_BASENAME=$(basename $COSMOSDB_CONNECTOR_URL) 24 | 25 | CLUSTER_EXIST=false 26 | PYPI_LIBRARIES=( "azure-cli==2.0.56" "azureml-sdk[databricks]==1.0.8" "pydocumentdb==2.3.3" ) 27 | while IFS=' ' read -ra ARR; do 28 | if [ ${ARR[0]} = $CLUSTER_ID ]; then 29 | CLUSTER_EXIST=true 30 | 31 | STATUS=${ARR[2]} 32 | STATUS=${STATUS//[^a-zA-Z]/} 33 | if [ $STATUS = RUNNING ]; then 34 | ## install each of the pypi libraries 35 | for lib in "${PYPI_LIBRARIES[@]}" 36 | do 37 | echo 38 | echo "Adding $lib" 39 | echo 40 | databricks libraries install --cluster-id $CLUSTER_ID --pypi-package $lib 41 | done 42 | 43 | ## get spark-cosmosdb connector: 44 | echo 45 | echo "downloading cosmosdb connector jar file" 46 | echo 47 | curl -O $COSMOSDB_CONNECTOR_URL 48 | 49 | ## uplaod the jar to dbfs 50 | echo 51 | echo "Uploading to dbfs" 52 | echo 53 | dbfs cp --overwrite ${COSMOSDB_CONNECTOR_BASENAME} dbfs:/FileStore/jars/${COSMOSDB_CONNECTOR_BASENAME} 54 | 55 | # isntall from dbfs 56 | echo 57 | echo "Adding ${COSMOSDB_CONNECTOR_BASENAME} as library" 58 | echo 59 | databricks libraries install --cluster-id $CLUSTER_ID --jar dbfs:/FileStore/jars/${COSMOSDB_CONNECTOR_BASENAME} 60 | 61 | ## Check installation status 62 | echo 63 | echo "Done! Installation status checking..." 64 | databricks libraries cluster-status --cluster-id $CLUSTER_ID 65 | 66 | echo 67 | echo "Restarting the cluster to activate the library..." 68 | databricks clusters restart --cluster-id $CLUSTER_ID 69 | 70 | echo "This will take few seconds. Please check the result from Databricks workspace." 71 | echo "Alternatively, run 'databricks clusters list' to check the restart status and" 72 | echo "run 'databricks libraries cluster-status --cluster-id $CLUSTER_ID' to check the installation status." 73 | 74 | exit 0 75 | else 76 | echo "Cluster $CLUSTER_ID found, but it is not running. Status=${STATUS}" 77 | echo "You can start the cluster with 'databricks clusters start --cluster-id $CLUSTER_ID'." 78 | echo "Then, check the cluster status by using 'databricks clusters list' and" 79 | echo "re-try installation once the status turns into RUNNING." 80 | exit 1 81 | fi 82 | fi 83 | done < <(databricks clusters list) 84 | 85 | if ! [ $CLUSTER_EXIST = true ]; then 86 | echo "Cannot find the target cluster $CLUSTER_ID. Please check if you entered the valid id." 87 | echo "Cluster id can be found by running 'databricks clusters list'" 88 | echo "which returns a list of ." 89 | exit 1 90 | fi 91 | 92 | -------------------------------------------------------------------------------- /reco_utils/recommender/surprise/surprise_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from reco_utils.common.constants import DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_PREDICTION_COL 9 | from reco_utils.common.general_utils import invert_dictionary 10 | 11 | 12 | def surprise_trainset_to_df( 13 | trainset, col_user="uid", col_item="iid", col_rating="rating" 14 | ): 15 | """Converts a surprise.Trainset object to pd.DataFrame 16 | More info: https://surprise.readthedocs.io/en/stable/trainset.html 17 | 18 | Args: 19 | trainset (obj): A surprise.Trainset object. 20 | col_user (str): User column name. 21 | col_item (str): Item column name. 22 | col_rating (str): Rating column name. 23 | 24 | Returns: 25 | pd.DataFrame: A dataframe. The user and item columns are strings and the rating columns are floats. 26 | """ 27 | df = pd.DataFrame(trainset.all_ratings(), columns=[col_user, col_item, col_rating]) 28 | map_user = ( 29 | trainset._inner2raw_id_users 30 | if trainset._inner2raw_id_users is not None 31 | else invert_dictionary(trainset._raw2inner_id_users) 32 | ) 33 | map_item = ( 34 | trainset._inner2raw_id_items 35 | if trainset._inner2raw_id_items is not None 36 | else invert_dictionary(trainset._raw2inner_id_items) 37 | ) 38 | df[col_user] = df[col_user].map(map_user) 39 | df[col_item] = df[col_item].map(map_item) 40 | return df 41 | 42 | 43 | def compute_rating_predictions(algo, data, usercol=DEFAULT_USER_COL, itemcol=DEFAULT_ITEM_COL, predcol=DEFAULT_PREDICTION_COL): 44 | """Computes predictions of an algorithm from Surprise on the data. Can be used for computing rating metrics like RMSE. 45 | 46 | Args: 47 | algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise 48 | data (pd.DataFrame): the data on which to predict 49 | usercol (str): name of the user column 50 | itemcol (str): name of the item column 51 | 52 | Returns: 53 | pd.DataFrame: dataframe with usercol, itemcol, predcol 54 | """ 55 | predictions = [algo.predict(getattr(row, usercol), getattr(row, itemcol)) for row in data.itertuples()] 56 | predictions = pd.DataFrame(predictions) 57 | predictions = predictions.rename(index=str, columns={'uid': usercol, 'iid': itemcol, 'est': predcol}) 58 | return predictions.drop(['details', 'r_ui'], axis='columns') 59 | 60 | 61 | def compute_ranking_predictions(algo, data, usercol=DEFAULT_USER_COL, itemcol=DEFAULT_ITEM_COL, 62 | predcol=DEFAULT_PREDICTION_COL, recommend_seen=False): 63 | """Computes predictions of an algorithm from Surprise on all users and items in data. can be used for computing 64 | ranking metrics like NDCG. 65 | 66 | Args: 67 | algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise 68 | data (pd.DataFrame): the data from which to get the users and items 69 | usercol (str): name of the user column 70 | itemcol (str): name of the item column 71 | recommend_seen (bool): flag to include (user, item) pairs that appear in data 72 | 73 | Returns: 74 | pd.DataFrame: dataframe with usercol, itemcol, predcol 75 | """ 76 | preds_lst = [] 77 | for user in data[usercol].unique(): 78 | for item in data[itemcol].unique(): 79 | preds_lst.append([user, item, algo.predict(user, item).est]) 80 | 81 | all_predictions = pd.DataFrame(data=preds_lst, columns=[usercol, itemcol, predcol]) 82 | 83 | if recommend_seen: 84 | return all_predictions 85 | else: 86 | tempdf = pd.concat([data[[usercol, itemcol]], 87 | pd.DataFrame(data=np.ones(data.shape[0]), columns=['dummycol'], index=data.index)], 88 | axis=1) 89 | merged = pd.merge(tempdf, all_predictions, on=[usercol, itemcol], how="outer") 90 | return merged[merged['dummycol'].isnull()].drop('dummycol', axis=1) 91 | 92 | -------------------------------------------------------------------------------- /notebooks/reco_utils/recommender/surprise/surprise_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from reco_utils.common.constants import DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_PREDICTION_COL 9 | from reco_utils.common.general_utils import invert_dictionary 10 | 11 | 12 | def surprise_trainset_to_df( 13 | trainset, col_user="uid", col_item="iid", col_rating="rating" 14 | ): 15 | """Converts a surprise.Trainset object to pd.DataFrame 16 | More info: https://surprise.readthedocs.io/en/stable/trainset.html 17 | 18 | Args: 19 | trainset (obj): A surprise.Trainset object. 20 | col_user (str): User column name. 21 | col_item (str): Item column name. 22 | col_rating (str): Rating column name. 23 | 24 | Returns: 25 | pd.DataFrame: A dataframe. The user and item columns are strings and the rating columns are floats. 26 | """ 27 | df = pd.DataFrame(trainset.all_ratings(), columns=[col_user, col_item, col_rating]) 28 | map_user = ( 29 | trainset._inner2raw_id_users 30 | if trainset._inner2raw_id_users is not None 31 | else invert_dictionary(trainset._raw2inner_id_users) 32 | ) 33 | map_item = ( 34 | trainset._inner2raw_id_items 35 | if trainset._inner2raw_id_items is not None 36 | else invert_dictionary(trainset._raw2inner_id_items) 37 | ) 38 | df[col_user] = df[col_user].map(map_user) 39 | df[col_item] = df[col_item].map(map_item) 40 | return df 41 | 42 | 43 | def compute_rating_predictions(algo, data, usercol=DEFAULT_USER_COL, itemcol=DEFAULT_ITEM_COL, predcol=DEFAULT_PREDICTION_COL): 44 | """Computes predictions of an algorithm from Surprise on the data. Can be used for computing rating metrics like RMSE. 45 | 46 | Args: 47 | algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise 48 | data (pd.DataFrame): the data on which to predict 49 | usercol (str): name of the user column 50 | itemcol (str): name of the item column 51 | 52 | Returns: 53 | pd.DataFrame: dataframe with usercol, itemcol, predcol 54 | """ 55 | predictions = [algo.predict(getattr(row, usercol), getattr(row, itemcol)) for row in data.itertuples()] 56 | predictions = pd.DataFrame(predictions) 57 | predictions = predictions.rename(index=str, columns={'uid': usercol, 'iid': itemcol, 'est': predcol}) 58 | return predictions.drop(['details', 'r_ui'], axis='columns') 59 | 60 | 61 | def compute_ranking_predictions(algo, data, usercol=DEFAULT_USER_COL, itemcol=DEFAULT_ITEM_COL, 62 | predcol=DEFAULT_PREDICTION_COL, recommend_seen=False): 63 | """Computes predictions of an algorithm from Surprise on all users and items in data. can be used for computing 64 | ranking metrics like NDCG. 65 | 66 | Args: 67 | algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise 68 | data (pd.DataFrame): the data from which to get the users and items 69 | usercol (str): name of the user column 70 | itemcol (str): name of the item column 71 | recommend_seen (bool): flag to include (user, item) pairs that appear in data 72 | 73 | Returns: 74 | pd.DataFrame: dataframe with usercol, itemcol, predcol 75 | """ 76 | preds_lst = [] 77 | for user in data[usercol].unique(): 78 | for item in data[itemcol].unique(): 79 | preds_lst.append([user, item, algo.predict(user, item).est]) 80 | 81 | all_predictions = pd.DataFrame(data=preds_lst, columns=[usercol, itemcol, predcol]) 82 | 83 | if recommend_seen: 84 | return all_predictions 85 | else: 86 | tempdf = pd.concat([data[[usercol, itemcol]], 87 | pd.DataFrame(data=np.ones(data.shape[0]), columns=['dummycol'], index=data.index)], 88 | axis=1) 89 | merged = pd.merge(tempdf, all_predictions, on=[usercol, itemcol], how="outer") 90 | return merged[merged['dummycol'].isnull()].drop('dummycol', axis=1) 91 | 92 | -------------------------------------------------------------------------------- /reco_utils/nni/nni_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import requests 4 | import time 5 | 6 | NNI_REST_ENDPOINT = 'http://localhost:8080/api/v1/nni' 7 | NNI_STATUS_URL = NNI_REST_ENDPOINT + '/check-status' 8 | NNI_TRIAL_JOBS_URL = NNI_REST_ENDPOINT + '/trial-jobs' 9 | WAITING_TIME = 20 10 | MAX_RETRIES = 5 11 | 12 | 13 | def get_experiment_status(status_url): 14 | """ 15 | Helper method. Gets the experiment status from the REST endpoint 16 | 17 | Args: 18 | status_url (str): URL for the REST endpoint 19 | 20 | Returns: 21 | str: status of the experiment 22 | """ 23 | nni_status = requests.get(status_url).json() 24 | return nni_status['status'] 25 | 26 | 27 | def check_experiment_status(wait=WAITING_TIME, max_retries=MAX_RETRIES): 28 | """ Checks the status of the current experiment on the NNI REST endpoint 29 | Waits until the tuning has completed 30 | 31 | Args: 32 | wait (numeric) : time to wait in seconds 33 | max_retries (int): max number of retries 34 | """ 35 | i = 0 36 | while i < max_retries: 37 | status = get_experiment_status(NNI_STATUS_URL) 38 | if status in ['DONE', 'TUNER_NO_MORE_TRIAL']: 39 | break 40 | elif status not in ['RUNNING', 'NO_MORE_TRIAL']: 41 | raise RuntimeError("NNI experiment failed to complete with status {}".format(status)) 42 | time.sleep(wait) 43 | i += 1 44 | if i == max_retries: 45 | raise TimeoutError("check_experiment_status() timed out") 46 | 47 | 48 | def check_stopped(wait=WAITING_TIME, max_retries=MAX_RETRIES): 49 | """ 50 | Checks that there is no NNI experiment active (the URL is not accessible) 51 | This method should be called after 'nnictl stop' for verification 52 | 53 | Args: 54 | wait (numeric) : time to wait in seconds 55 | max_retries (int): max number of retries 56 | """ 57 | i = 0 58 | while i < max_retries: 59 | try: 60 | get_experiment_status(NNI_STATUS_URL) 61 | except: 62 | break 63 | time.sleep(wait) 64 | i += 1 65 | if i == max_retries: 66 | raise TimeoutError("check_stopped() timed out") 67 | 68 | 69 | def check_metrics_written(wait=WAITING_TIME, max_retries=MAX_RETRIES): 70 | """ 71 | Waits until the metrics have been written to the trial logs 72 | """ 73 | i = 0 74 | while i < max_retries: 75 | all_trials = requests.get(NNI_TRIAL_JOBS_URL).json() 76 | if all(['finalMetricData' in trial for trial in all_trials]): 77 | break 78 | time.sleep(wait) 79 | i += 1 80 | if i == max_retries: 81 | raise TimeoutError("check_metrics_written() timed out") 82 | 83 | 84 | def get_trials(optimize_mode): 85 | """ Obtain information about the trials of the current experiment via the REST endpoint 86 | 87 | Args: 88 | optimize_mode (str): One of 'minimize', 'maximize'. Determines how to obtain the best default metric. 89 | 90 | Returns: 91 | list: Trials info, list of (metrics, log path) 92 | dict: Metrics for the best choice of hyperparameters 93 | dict: Best hyperparameters 94 | str: Log path for the best trial 95 | """ 96 | 97 | if optimize_mode not in ['minimize', 'maximize']: 98 | raise ValueError("optimize_mode should equal either 'minimize' or 'maximize'") 99 | all_trials = requests.get(NNI_TRIAL_JOBS_URL).json() 100 | trials = [(eval(trial['finalMetricData'][0]['data']), trial['logPath'].split(':')[-1]) for trial in all_trials] 101 | sorted_trials = sorted(trials, key=lambda x: x[0]['default'], reverse=(optimize_mode == 'maximize')) 102 | best_trial_path = sorted_trials[0][1] 103 | # Read the metrics from the trial directory in order to get the name of the default metric 104 | with open(os.path.join(best_trial_path, "metrics.json"), "r") as fp: 105 | best_metrics = json.load(fp) 106 | with open(os.path.join(best_trial_path, "parameter.cfg"), "r") as fp: 107 | best_params = json.load(fp) 108 | return trials, best_metrics, best_params, best_trial_path 109 | -------------------------------------------------------------------------------- /tests/unit/test_ncf_dataset.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | from itertools import product 5 | import pytest 6 | 7 | from reco_utils.common.constants import ( 8 | DEFAULT_USER_COL, 9 | DEFAULT_ITEM_COL, 10 | DEFAULT_RATING_COL, 11 | DEFAULT_TIMESTAMP_COL, 12 | ) 13 | from reco_utils.recommender.ncf.dataset import Dataset 14 | from tests.ncf_common import python_dataset_ncf, test_specs_ncf 15 | 16 | 17 | N_NEG = 5 18 | N_NEG_TEST = 10 19 | BATCH_SIZE = 32 20 | 21 | 22 | def test_data_preprocessing(python_dataset_ncf): 23 | train, test = python_dataset_ncf 24 | data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST) 25 | 26 | # shape 27 | assert len(data.train) == len(train) 28 | assert len(data.test) == len(test) 29 | 30 | # index correctness for id2user, user2id, id2item, item2id 31 | for data_row, row in zip(data.train.iterrows(), train.iterrows()): 32 | assert data_row[1][DEFAULT_USER_COL] == data.user2id[row[1][DEFAULT_USER_COL]] 33 | assert row[1][DEFAULT_USER_COL] == data.id2user[data_row[1][DEFAULT_USER_COL]] 34 | assert data_row[1][DEFAULT_ITEM_COL] == data.item2id[row[1][DEFAULT_ITEM_COL]] 35 | assert row[1][DEFAULT_ITEM_COL] == data.id2item[data_row[1][DEFAULT_ITEM_COL]] 36 | 37 | for data_row, row in zip(data.test.iterrows(), test.iterrows()): 38 | assert data_row[1][DEFAULT_USER_COL] == data.user2id[row[1][DEFAULT_USER_COL]] 39 | assert row[1][DEFAULT_USER_COL] == data.id2user[data_row[1][DEFAULT_USER_COL]] 40 | assert data_row[1][DEFAULT_ITEM_COL] == data.item2id[row[1][DEFAULT_ITEM_COL]] 41 | assert row[1][DEFAULT_ITEM_COL] == data.id2item[data_row[1][DEFAULT_ITEM_COL]] 42 | 43 | 44 | def test_train_loader(python_dataset_ncf): 45 | train, test = python_dataset_ncf 46 | data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST) 47 | 48 | # collect positvie user-item dict 49 | positive_pool = {} 50 | for u in train[DEFAULT_USER_COL].unique(): 51 | positive_pool[u] = set(train[train[DEFAULT_USER_COL] == u][DEFAULT_ITEM_COL]) 52 | 53 | # without negative sampling 54 | for batch in data.train_loader(batch_size=BATCH_SIZE, shuffle=False): 55 | user, item, labels = batch 56 | #shape 57 | assert len(user) == BATCH_SIZE 58 | assert len(item) == BATCH_SIZE 59 | assert len(labels) == BATCH_SIZE 60 | assert max(labels) == min(labels) 61 | 62 | # right labels 63 | for u, i, is_pos in zip(user, item, labels): 64 | if is_pos: 65 | assert i in positive_pool[u] 66 | else: 67 | assert i not in positive_pool[u] 68 | 69 | data.negative_sampling() 70 | label_list = [] 71 | batches = [] 72 | for idx, batch in enumerate(data.train_loader(batch_size=1)): 73 | user, item, labels = batch 74 | assert len(user) == 1 75 | assert len(item) == 1 76 | assert len(labels) == 1 77 | 78 | # right labels 79 | for u, i, is_pos in zip(user, item, labels): 80 | if is_pos: 81 | assert i in positive_pool[u] 82 | else: 83 | assert i not in positive_pool[u] 84 | 85 | label_list.append(is_pos) 86 | 87 | # neagtive smapling 88 | assert len(label_list) == (N_NEG + 1) * sum(label_list) 89 | 90 | 91 | def test_test_loader(python_dataset_ncf): 92 | train, test = python_dataset_ncf 93 | data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST) 94 | 95 | # positive user-item dict, noting that the pool is train+test 96 | positive_pool = {} 97 | df = train.append(test) 98 | for u in df[DEFAULT_USER_COL].unique(): 99 | positive_pool[u] = set(df[df[DEFAULT_USER_COL] == u][DEFAULT_ITEM_COL]) 100 | 101 | for batch in data.test_loader(): 102 | user, item, labels = batch 103 | # shape 104 | assert len(user) == N_NEG_TEST + 1 105 | assert len(item) == N_NEG_TEST + 1 106 | assert len(labels) == N_NEG_TEST + 1 107 | 108 | label_list = [] 109 | 110 | for u, i, is_pos in zip(user, item, labels): 111 | if is_pos: 112 | assert i in positive_pool[u] 113 | else: 114 | assert i not in positive_pool[u] 115 | 116 | label_list.append(is_pos) 117 | 118 | # leave-one-out 119 | assert sum(label_list) == 1 120 | # right labels 121 | assert len(label_list) == (N_NEG_TEST + 1) * sum(label_list) 122 | -------------------------------------------------------------------------------- /tests/unit/test_sparse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import pytest 7 | 8 | from reco_utils.dataset.sparse import AffinityMatrix 9 | from reco_utils.common.constants import ( 10 | DEFAULT_USER_COL, 11 | DEFAULT_ITEM_COL, 12 | DEFAULT_RATING_COL, 13 | DEFAULT_TIMESTAMP_COL, 14 | ) 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def test_specs(): 19 | return {"number_of_items": 50, "number_of_users": 20, "seed": 123} 20 | 21 | 22 | # generate a syntetic dataset 23 | @pytest.fixture(scope="module") 24 | def python_dataset(test_specs): 25 | 26 | """Get Python labels""" 27 | 28 | def random_date_generator(start_date, range_in_days): 29 | """Helper function to generate random timestamps. 30 | 31 | Reference: https://stackoverflow.com/questions/41006182/generate-random-dates-within-a 32 | -range-in-numpy 33 | 34 | """ 35 | 36 | days_to_add = np.arange(0, range_in_days) 37 | random_dates = [] 38 | 39 | for i in range(range_in_days): 40 | random_date = np.datetime64(start_date) + np.random.choice(days_to_add) 41 | random_dates.append(random_date) 42 | 43 | return random_dates 44 | 45 | # fix the the random seed 46 | np.random.seed(test_specs["seed"]) 47 | 48 | # generates the user/item affinity matrix. Ratings are from 1 to 5, with 0s denoting unrated items 49 | X = np.random.randint( 50 | low=0, 51 | high=6, 52 | size=(test_specs["number_of_users"], test_specs["number_of_items"]), 53 | ) 54 | 55 | # In the main code, input data are passed as pandas dataframe. Below we generate such df from the above matrix 56 | userids = [] 57 | 58 | for i in range(1, test_specs["number_of_users"] + 1): 59 | userids.extend([i] * test_specs["number_of_items"]) 60 | 61 | itemids = [i for i in range(1, test_specs["number_of_items"] + 1)] * test_specs[ 62 | "number_of_users" 63 | ] 64 | ratings = np.reshape(X, -1) 65 | 66 | # create dataframe 67 | results = pd.DataFrame.from_dict( 68 | { 69 | DEFAULT_USER_COL: userids, 70 | DEFAULT_ITEM_COL: itemids, 71 | DEFAULT_RATING_COL: ratings, 72 | DEFAULT_TIMESTAMP_COL: random_date_generator( 73 | "2018-01-01", 74 | test_specs["number_of_users"] * test_specs["number_of_items"], 75 | ), 76 | } 77 | ) 78 | 79 | # here we eliminate the missing ratings to obtain a standard form of the df as that of real data. 80 | results = results[results.rating != 0] 81 | 82 | return results 83 | 84 | 85 | def test_df_to_sparse(test_specs, python_dataset): 86 | # initialize the splitter 87 | header = { 88 | "col_user": DEFAULT_USER_COL, 89 | "col_item": DEFAULT_ITEM_COL, 90 | "col_rating": DEFAULT_RATING_COL, 91 | } 92 | 93 | # instantiate the affinity matrix 94 | am = AffinityMatrix(DF=python_dataset, **header) 95 | 96 | # obtain the sparse matrix representation of the input dataframe 97 | X = am.gen_affinity_matrix() 98 | 99 | # check that the generated matrix has the correct dimensions 100 | assert (X.shape[0] == python_dataset.userID.unique().shape[0]) & ( 101 | X.shape[1] == python_dataset.itemID.unique().shape[0] 102 | ) 103 | 104 | 105 | def test_sparse_to_df(test_specs, python_dataset): 106 | # initialize the splitter 107 | header = { 108 | "col_user": DEFAULT_USER_COL, 109 | "col_item": DEFAULT_ITEM_COL, 110 | "col_rating": DEFAULT_RATING_COL, 111 | } 112 | 113 | # instantiate the the affinity matrix 114 | am = AffinityMatrix(DF=python_dataset, **header) 115 | 116 | # generate the sparse matrix representation 117 | X = am.gen_affinity_matrix() 118 | 119 | # use the inverse function to generate a pandas df from a sparse matrix ordered by userID 120 | DF = am.map_back_sparse(X, kind="ratings") 121 | 122 | # tests: check that the two dataframes have the same elements in the same positions. 123 | assert ( 124 | DF.userID.values.all() 125 | == python_dataset.sort_values(by=["userID"]).userID.values.all() 126 | ) 127 | 128 | assert ( 129 | DF.itemID.values.all() 130 | == python_dataset.sort_values(by=["userID"]).itemID.values.all() 131 | ) 132 | 133 | assert ( 134 | DF.rating.values.all() 135 | == python_dataset.sort_values(by=["userID"]).rating.values.all() 136 | ) 137 | -------------------------------------------------------------------------------- /tests/ci/submitpytest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | import json 6 | import azureml.core 7 | from azureml.core.authentication import AzureCliAuthentication 8 | from azureml.core import Workspace 9 | from azureml.core import Experiment 10 | from azureml.core.runconfig import RunConfiguration 11 | from azureml.core.conda_dependencies import CondaDependencies 12 | from azureml.core.runconfig import DEFAULT_CPU_IMAGE 13 | # uncomment if using gpu 14 | from azureml.core.runconfig import DEFAULT_GPU_IMAGE 15 | from azureml.core.script_run_config import ScriptRunConfig 16 | 17 | 18 | from azureml.core.compute import ComputeTarget, AmlCompute 19 | from azureml.core.compute_target import ComputeTargetException 20 | 21 | 22 | with open("tests/ci/config.json") as f: 23 | config = json.load(f) 24 | 25 | workspace_name= config["workspace_name"] 26 | resource_group = config["resource_group"] 27 | subscription_id = config["subscription_id"] 28 | location = config["location"] 29 | 30 | print(" WS name ", workspace_name) 31 | print("subscription_id ", subscription_id) 32 | print("location",location) 33 | 34 | cli_auth = AzureCliAuthentication() 35 | print("cliauth") 36 | 37 | try: 38 | print("Trying to get ws") 39 | ws = Workspace.get( 40 | name=workspace_name, 41 | subscription_id=subscription_id, 42 | resource_group=resource_group, 43 | auth=cli_auth 44 | ) 45 | 46 | except Exception: 47 | # this call might take a minute or two. 48 | print("Creating new workspace") 49 | ws = Workspace.create( 50 | name=ws, 51 | subscription_id=subscription_id, 52 | resource_group=resource_group, 53 | # create_resource_group=True, 54 | location=location, 55 | auth=cli_auth 56 | ) 57 | 58 | # Choose a name for your CPU cluster 59 | cpu_cluster_name = "persistentcpu" 60 | #cpu_cluster_name = "cpucluster" 61 | print("cpu_cluster_name",cpu_cluster_name) 62 | # Verify that cluster does not exist already 63 | # https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets 64 | 65 | try: 66 | cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name) 67 | print('Found existing cluster, use it.') 68 | except ComputeTargetException: 69 | print("create cluster") 70 | compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', 71 | max_nodes=4) 72 | cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config) 73 | 74 | cpu_cluster.wait_for_completion(show_output=True) 75 | 76 | from azureml.core.runconfig import RunConfiguration 77 | from azureml.core.conda_dependencies import CondaDependencies 78 | from azureml.core.runconfig import DEFAULT_CPU_IMAGE 79 | 80 | # Create a new runconfig object 81 | run_amlcompute = RunConfiguration() 82 | 83 | # Use the cpu_cluster you created above. 84 | run_amlcompute.target = cpu_cluster 85 | 86 | # Enable Docker 87 | run_amlcompute.environment.docker.enabled = True 88 | 89 | # Set Docker base image to the default CPU-based image 90 | run_amlcompute.environment.docker.base_image = DEFAULT_CPU_IMAGE 91 | 92 | # Use conda_dependencies.yml to create a conda environment in the Docker image for execution 93 | run_amlcompute.environment.python.user_managed_dependencies = False 94 | 95 | # Auto-prepare the Docker image when used for execution (if it is not already prepared) 96 | run_amlcompute.auto_prepare_environment = True 97 | 98 | # Specify CondaDependencies obj, add necessary packages 99 | 100 | run_amlcompute.environment.python.conda_dependencies = CondaDependencies( 101 | conda_dependencies_file_path='./reco_base.yaml') 102 | 103 | from azureml.core import Experiment 104 | experiment_name = 'PersistentAML' 105 | 106 | experiment = Experiment(workspace=ws, name=experiment_name) 107 | project_folder = "." 108 | script_run_config = ScriptRunConfig(source_directory=project_folder, 109 | script='./tests/ci/runpytest.py', 110 | run_config=run_amlcompute) 111 | 112 | print('before submit') 113 | run = experiment.submit(script_run_config) 114 | print('after submit') 115 | run.wait_for_completion(show_output=True, wait_post_processing=True) 116 | 117 | # go to azure portal to see log in azure ws and look for experiment name and 118 | # look for individual run 119 | print('files', run.get_file_names()) 120 | run.download_files(prefix='reports') 121 | run.tag('persistentaml tag') 122 | 123 | -------------------------------------------------------------------------------- /tests/unit/test_tf_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | import shutil 6 | 7 | import pandas as pd 8 | import tensorflow as tf 9 | 10 | from reco_utils.common.tf_utils import ( 11 | pandas_input_fn, 12 | build_optimizer, 13 | evaluation_log_hook, 14 | Logger, 15 | MODEL_DIR 16 | ) 17 | from reco_utils.recommender.wide_deep.wide_deep_utils import ( 18 | build_model, 19 | build_feature_columns, 20 | ) 21 | from reco_utils.common.constants import ( 22 | DEFAULT_USER_COL, 23 | DEFAULT_ITEM_COL, 24 | DEFAULT_RATING_COL 25 | ) 26 | from reco_utils.evaluation.python_evaluation import rmse 27 | 28 | ITEM_FEAT_COL = 'itemFeat' 29 | 30 | 31 | @pytest.fixture(scope='module') 32 | def pd_df(): 33 | df = pd.DataFrame( 34 | { 35 | DEFAULT_USER_COL: [1, 1, 1, 2, 2, 2], 36 | DEFAULT_ITEM_COL: [1, 2, 3, 1, 4, 5], 37 | ITEM_FEAT_COL: [[1, 1, 1], [2, 2, 2], [3, 3, 3], [1, 1, 1], [4, 4, 4], [5, 5, 5]], 38 | DEFAULT_RATING_COL: [5, 4, 3, 5, 5, 3], 39 | } 40 | ) 41 | users = df.drop_duplicates(DEFAULT_USER_COL)[DEFAULT_USER_COL].values 42 | items = df.drop_duplicates(DEFAULT_ITEM_COL)[DEFAULT_ITEM_COL].values 43 | return df, users, items 44 | 45 | 46 | @pytest.mark.gpu 47 | def test_pandas_input_fn(pd_df): 48 | df, _, _ = pd_df 49 | 50 | input_fn = pandas_input_fn(df) 51 | sample = input_fn() 52 | 53 | # check the input function returns all the columns 54 | assert len(df.columns) == len(sample) 55 | for k, v in sample.items(): 56 | assert k in df.columns.values 57 | # check if a list feature column converted correctly 58 | if len(v.shape) == 2: 59 | assert v.shape[1] == len(df[k][0]) 60 | 61 | input_fn_with_label = pandas_input_fn(df, y_col=DEFAULT_RATING_COL) 62 | X, y = input_fn_with_label() 63 | features = df.copy() 64 | features.pop(DEFAULT_RATING_COL) 65 | assert len(X) == len(features.columns) 66 | 67 | 68 | @pytest.mark.gpu 69 | def test_build_optimizer(): 70 | adadelta = build_optimizer('Adadelta') 71 | assert isinstance(adadelta, tf.train.AdadeltaOptimizer) 72 | 73 | adagrad = build_optimizer('Adagrad') 74 | assert isinstance(adagrad, tf.train.AdagradOptimizer) 75 | 76 | adam = build_optimizer('Adam') 77 | assert isinstance(adam, tf.train.AdamOptimizer) 78 | 79 | ftrl = build_optimizer('Ftrl', **{'l1_regularization_strength': 0.001}) 80 | assert isinstance(ftrl, tf.train.FtrlOptimizer) 81 | 82 | momentum = build_optimizer('Momentum', **{'momentum': 0.5}) 83 | assert isinstance(momentum, tf.train.MomentumOptimizer) 84 | 85 | rmsprop = build_optimizer('RMSProp') 86 | assert isinstance(rmsprop, tf.train.RMSPropOptimizer) 87 | 88 | sgd = build_optimizer('SGD') 89 | assert isinstance(sgd, tf.train.GradientDescentOptimizer) 90 | 91 | 92 | @pytest.mark.gpu 93 | def test_evaluation_log_hook(pd_df): 94 | data, users, items = pd_df 95 | 96 | # Run hook 10 times 97 | hook_frequency = 10 98 | train_steps = 101 99 | 100 | _, deep_columns = build_feature_columns(users, items, model_type='deep') 101 | 102 | model = build_model( 103 | 'deep_'+MODEL_DIR, deep_columns=deep_columns, save_checkpoints_steps=train_steps//hook_frequency 104 | ) 105 | 106 | class EvaluationLogger(Logger): 107 | def __init__(self): 108 | self.eval_log = {} 109 | 110 | def log(self, metric, value): 111 | if metric not in self.eval_log: 112 | self.eval_log[metric] = [] 113 | self.eval_log[metric].append(value) 114 | 115 | def get_log(self): 116 | return self.eval_log 117 | 118 | evaluation_logger = EvaluationLogger() 119 | 120 | hooks = [ 121 | evaluation_log_hook( 122 | model, 123 | logger=evaluation_logger, 124 | true_df=data, 125 | y_col=DEFAULT_RATING_COL, 126 | eval_df=data.drop(DEFAULT_RATING_COL, axis=1), 127 | every_n_iter=train_steps//hook_frequency, 128 | model_dir='deep_'+MODEL_DIR, 129 | eval_fns=[rmse], 130 | ) 131 | ] 132 | model.train( 133 | input_fn=pandas_input_fn(df=data, y_col=DEFAULT_RATING_COL, batch_size=1, num_epochs=None, shuffle=True), 134 | hooks=hooks, 135 | steps=train_steps 136 | ) 137 | shutil.rmtree('deep_' + MODEL_DIR, ignore_errors=True) 138 | 139 | # Check if hook logged the given metric 140 | assert rmse.__name__ in evaluation_logger.get_log() 141 | assert len(evaluation_logger.get_log()[rmse.__name__]) == hook_frequency 142 | -------------------------------------------------------------------------------- /tests/unit/test_rbm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | import numpy as np 6 | from reco_utils.recommender.rbm.rbm import RBM 7 | from tests.rbm_common import test_specs, affinity_matrix 8 | 9 | 10 | @pytest.fixture(scope="module") 11 | def init_rbm(): 12 | return { 13 | "n_hidden": 100, 14 | "epochs": 10, 15 | "minibatch": 50, 16 | "keep_prob": 0.8, 17 | "learning_rate": 0.002, 18 | "init_stdv": 0.01, 19 | "sampling_protocol": [30, 50, 80, 90, 100], 20 | "display": 20, 21 | } 22 | 23 | 24 | @pytest.mark.gpu 25 | def test_class_init(init_rbm): 26 | model = RBM( 27 | hidden_units=init_rbm["n_hidden"], 28 | training_epoch=init_rbm["epochs"], 29 | minibatch_size=init_rbm["minibatch"], 30 | keep_prob=init_rbm["keep_prob"], 31 | learning_rate=init_rbm["learning_rate"], 32 | init_stdv=init_rbm["init_stdv"], 33 | sampling_protocol=init_rbm["sampling_protocol"], 34 | display_epoch=init_rbm["display"], 35 | ) 36 | 37 | # number of hidden units 38 | assert model.Nhidden == init_rbm["n_hidden"] 39 | # number of training epochs 40 | assert model.epochs == init_rbm["epochs"] + 1 41 | # minibatch size 42 | assert model.minibatch == init_rbm["minibatch"] 43 | # keep probability for dropout regulrization 44 | assert model.keep == init_rbm["keep_prob"] 45 | # learning rate 46 | assert model.learning_rate == init_rbm["learning_rate"] 47 | # standard deviation used to initialize the weight matrix from a normal distribution 48 | assert model.stdv == init_rbm["init_stdv"] 49 | # sampling protocol used to increase the number of steps in Gibbs sampling 50 | assert model.sampling_protocol == init_rbm["sampling_protocol"] 51 | # number of epochs after which the rmse is displayed 52 | assert model.display == init_rbm["display"] 53 | 54 | 55 | @pytest.mark.gpu 56 | def test_train_param_init(init_rbm, affinity_matrix): 57 | # obtain the train/test set matrices 58 | Xtr, Xtst = affinity_matrix 59 | 60 | # initialize the model 61 | model = RBM( 62 | hidden_units=init_rbm["n_hidden"], 63 | training_epoch=init_rbm["epochs"], 64 | minibatch_size=init_rbm["minibatch"], 65 | ) 66 | # fit the model to the data 67 | model.fit(Xtr, Xtst) 68 | 69 | # visible units placeholder (tensor) 70 | model.vu.shape[1] == Xtr.shape[1] 71 | # weight matrix 72 | assert model.w.shape == [Xtr.shape[1], init_rbm["n_hidden"]] 73 | # bias, visible units 74 | assert model.bv.shape == [1, Xtr.shape[1]] 75 | # bias, hidden units 76 | assert model.bh.shape == [1, init_rbm["n_hidden"]] 77 | 78 | 79 | @pytest.mark.gpu 80 | def test_sampling_funct(init_rbm, affinity_matrix): 81 | # obtain the train/test set matrices 82 | Xtr, Xtst = affinity_matrix 83 | 84 | # initialize the model 85 | model = RBM( 86 | hidden_units=init_rbm["n_hidden"], 87 | training_epoch=init_rbm["epochs"], 88 | minibatch_size=init_rbm["minibatch"], 89 | ) 90 | 91 | def check_sampled_values(sampled, s): 92 | """ 93 | Check if the elements of the sampled units are in {0,s} 94 | """ 95 | a = [] 96 | 97 | for i in range(0, s + 1): 98 | l = sampled == i 99 | a.append(l) 100 | 101 | return sum(a) 102 | 103 | r = Xtr.max() # obtain the rating scale 104 | 105 | # fit the model to the data 106 | model.fit(Xtr, Xtst) 107 | 108 | # evaluate the activation probabilities of the hidden units and their sampled values 109 | phv, h = model.sess.run(model.sample_hidden_units(model.v)) 110 | 111 | # check the dimensions of the two matrices 112 | assert phv.shape == (Xtr.shape[0], 100) 113 | assert h.shape == (Xtr.shape[0], 100) 114 | 115 | # check that the activation probabilities are in [0,1] 116 | assert (phv <= 1).all() & (phv >= 0).all() 117 | 118 | # check that the sampled value of the hidden units is either 1 or 0 119 | assert check_sampled_values(h, 1).all() 120 | 121 | # evaluate the activation probabilities of the visible units and their sampled values 122 | pvh, v_sampled = model.sess.run(model.sample_visible_units(h)) 123 | 124 | assert pvh.shape == (Xtr.shape[0], Xtr.shape[1], r) 125 | assert v_sampled.shape == Xtr.shape 126 | 127 | # check that the multinomial distribution is normalized over the r classes for all users/items 128 | assert np.sum(pvh, axis=2) == pytest.approx(np.ones(Xtr.shape)) 129 | 130 | # check that the sampled values of the visible units is in [0,r] 131 | assert check_sampled_values(v_sampled, r).all() 132 | -------------------------------------------------------------------------------- /notebooks/reco_utils/azureml/wide_deep.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | """ 4 | AzureML Hyperdrive entry script for wide-deep model 5 | """ 6 | import argparse 7 | import os 8 | import shutil 9 | 10 | import papermill as pm 11 | import tensorflow as tf 12 | print("TensorFlow version:", tf.VERSION) 13 | 14 | try: 15 | from azureml.core import Run 16 | run = Run.get_context() 17 | except ImportError: 18 | run = None 19 | 20 | from reco_utils.common.constants import ( 21 | DEFAULT_USER_COL, 22 | DEFAULT_ITEM_COL, 23 | DEFAULT_RATING_COL 24 | ) 25 | 26 | 27 | NOTEBOOK_NAME = os.path.join( 28 | "notebooks", 29 | "00_quick_start", 30 | "wide_deep_movielens.ipynb" 31 | ) 32 | OUTPUT_NOTEBOOK = "wide_deep.ipynb" 33 | 34 | 35 | def _log(metric, value): 36 | if run is not None: 37 | if isinstance(value, list) and len(value) > 0 and isinstance(value[0], (int, float)) 38 | run.log_list(metric, value) 39 | else: 40 | run.log(metric, value) 41 | print(metric, "=", value) 42 | 43 | 44 | # Parse arguments passed by Hyperdrive 45 | parser = argparse.ArgumentParser() 46 | 47 | parser.add_argument('--top-k', type=int, dest='TOP_K', help="Top k recommendation", default=10) 48 | # Data path 49 | parser.add_argument('--datastore', type=str, dest='DATA_DIR', help="Datastore path") 50 | parser.add_argument('--train-datapath', type=str, dest='TRAIN_PICKLE_PATH') 51 | parser.add_argument('--test-datapath', type=str, dest='TEST_PICKLE_PATH') 52 | parser.add_argument('--model-dir', type=str, dest='MODEL_DIR', default='model_checkpoints') 53 | # Data column names 54 | parser.add_argument('--user-col', type=str, dest='USER_COL', default=DEFAULT_USER_COL) 55 | parser.add_argument('--item-col', type=str, dest='ITEM_COL', default=DEFAULT_ITEM_COL) 56 | parser.add_argument('--rating-col', type=str, dest='RATING_COL', default=DEFAULT_RATING_COL) 57 | parser.add_argument('--item-feat-col', type=str, dest='ITEM_FEAT_COL') # Optional 58 | parser.add_argument('--ranking-metrics', type=str, nargs='*', dest='RANKING_METRICS', default=['ndcg_at_k']) 59 | parser.add_argument('--rating-metrics', type=str, nargs='*', dest='RATING_METRICS', default=['rmse']) 60 | # Model type: either 'wide', 'deep', or 'wide_deep' 61 | parser.add_argument('--model-type', type=str, dest='MODEL_TYPE', default='wide_deep') 62 | # Wide model params 63 | parser.add_argument('--linear-optimizer', type=str, dest='LINEAR_OPTIMIZER', default='Ftrl') 64 | parser.add_argument('--linear-optimizer-lr', type=float, dest='LINEAR_OPTIMIZER_LR', default=0.01) 65 | parser.add_argument('--linear-l1-reg', type=float, dest='LINEAR_L1_REG', default=0.0) 66 | parser.add_argument('--linear-momentum', type=float, dest='LINEAR_MOMENTUM', default=0.9) 67 | # Deep model params 68 | parser.add_argument('--dnn-optimizer', type=str, dest='DNN_OPTIMIZER', default='Adagrad') 69 | parser.add_argument('--dnn-optimizer-lr', type=float, dest='DNN_OPTIMIZER_LR', default=0.01) 70 | parser.add_argument('--dnn-l1-reg', type=float, dest='DNN_L1_REG', default=0.0) 71 | parser.add_argument('--dnn-momentum', type=float, dest='DNN_MOMENTUM', default=0.9) 72 | parser.add_argument('--dnn-hidden-layer-1', type=int, dest='DNN_HIDDEN_LAYER_1', default=0) 73 | parser.add_argument('--dnn-hidden-layer-2', type=int, dest='DNN_HIDDEN_LAYER_2', default=0) 74 | parser.add_argument('--dnn-hidden-layer-3', type=int, dest='DNN_HIDDEN_LAYER_3', default=128) 75 | parser.add_argument('--dnn-hidden-layer-4', type=int, dest='DNN_HIDDEN_LAYER_4', default=128) 76 | parser.add_argument('--dnn-user-embedding-dim', type=int, dest='DNN_USER_DIM', default=8) 77 | parser.add_argument('--dnn-item-embedding-dim', type=int, dest='DNN_ITEM_DIM', default=8) 78 | parser.add_argument('--dnn-batch-norm', type=int, dest='DNN_BATCH_NORM', default=1) 79 | parser.add_argument('--dnn-dropout', type=float, dest='DNN_DROPOUT', default=0.0) 80 | # Training parameters 81 | parser.add_argument('--epochs', type=int, dest='EPOCHS', default=50) 82 | parser.add_argument('--batch-size', type=int, dest='BATCH_SIZE', default=128) 83 | parser.add_argument('--evaluate-while-training', dest='EVALUATE_WHILE_TRAINING', action='store_true') 84 | 85 | 86 | args = parser.parse_args() 87 | 88 | params = vars(args) 89 | 90 | if params['TOP_K'] <= 0: 91 | raise ValueError("Top K should be larger than 0") 92 | 93 | if params['MODEL_TYPE'] not in {'wide', 'deep', 'wide_deep'}: 94 | raise ValueError("Model type should be either 'wide', 'deep', or 'wide_deep'") 95 | 96 | if params['DATA_DIR'] is None: 97 | raise ValueError("Datastore path should be given") 98 | 99 | print("Args:") 100 | for k, v in params.items(): 101 | _log(k, v) 102 | 103 | 104 | print("Run", NOTEBOOK_NAME) 105 | 106 | pm.execute_notebook( 107 | NOTEBOOK_NAME, 108 | OUTPUT_NOTEBOOK, 109 | parameters=params, 110 | kernel_name='python3' 111 | ) 112 | nb = pm.read_notebook(OUTPUT_NOTEBOOK) 113 | 114 | for m, v in nb.data.items(): 115 | _log(m, v) 116 | 117 | # clean-up 118 | os.remove(OUTPUT_NOTEBOOK) 119 | shutil.rmtree(params['MODEL_DIR'], ignore_errors=True) 120 | -------------------------------------------------------------------------------- /reco_utils/azureml/wide_deep.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | """ 4 | AzureML Hyperdrive entry script for wide-deep model 5 | """ 6 | import argparse 7 | import os 8 | import shutil 9 | 10 | import papermill as pm 11 | import tensorflow as tf 12 | print("TensorFlow version:", tf.VERSION) 13 | 14 | try: 15 | from azureml.core import Run 16 | run = Run.get_context() 17 | except ImportError: 18 | run = None 19 | 20 | from reco_utils.common.constants import ( 21 | DEFAULT_USER_COL, 22 | DEFAULT_ITEM_COL, 23 | DEFAULT_RATING_COL 24 | ) 25 | 26 | 27 | NOTEBOOK_NAME = os.path.join( 28 | "notebooks", 29 | "00_quick_start", 30 | "wide_deep_movielens.ipynb" 31 | ) 32 | OUTPUT_NOTEBOOK = "wide_deep.ipynb" 33 | 34 | 35 | def _log(metric, value): 36 | """AzureML log wrapper. 37 | 38 | Record list of int or float as a list metrics so that we can plot it from AzureML workspace portal. 39 | Otherwise, record as a single value of the metric. 40 | """ 41 | if run is not None: 42 | if isinstance(value, list) and len(value) > 0 and isinstance(value[0], (int, float)): 43 | run.log_list(metric, value) 44 | else: 45 | # Force cast to str since run.log will raise an error if the value is iterable. 46 | run.log(metric, str(value)) 47 | print(metric, "=", value) 48 | 49 | 50 | # Parse arguments passed by Hyperdrive 51 | parser = argparse.ArgumentParser() 52 | 53 | parser.add_argument('--top-k', type=int, dest='TOP_K', help="Top k recommendation", default=10) 54 | # Data path 55 | parser.add_argument('--datastore', type=str, dest='DATA_DIR', help="Datastore path") 56 | parser.add_argument('--train-datapath', type=str, dest='TRAIN_PICKLE_PATH') 57 | parser.add_argument('--test-datapath', type=str, dest='TEST_PICKLE_PATH') 58 | parser.add_argument('--model-dir', type=str, dest='MODEL_DIR', default='model_checkpoints') 59 | # Data column names 60 | parser.add_argument('--user-col', type=str, dest='USER_COL', default=DEFAULT_USER_COL) 61 | parser.add_argument('--item-col', type=str, dest='ITEM_COL', default=DEFAULT_ITEM_COL) 62 | parser.add_argument('--rating-col', type=str, dest='RATING_COL', default=DEFAULT_RATING_COL) 63 | parser.add_argument('--item-feat-col', type=str, dest='ITEM_FEAT_COL') # Optional 64 | parser.add_argument('--ranking-metrics', type=str, nargs='*', dest='RANKING_METRICS', default=['ndcg_at_k']) 65 | parser.add_argument('--rating-metrics', type=str, nargs='*', dest='RATING_METRICS', default=['rmse']) 66 | # Model type: either 'wide', 'deep', or 'wide_deep' 67 | parser.add_argument('--model-type', type=str, dest='MODEL_TYPE', default='wide_deep') 68 | # Wide model params 69 | parser.add_argument('--linear-optimizer', type=str, dest='LINEAR_OPTIMIZER', default='Ftrl') 70 | parser.add_argument('--linear-optimizer-lr', type=float, dest='LINEAR_OPTIMIZER_LR', default=0.01) 71 | parser.add_argument('--linear-l1-reg', type=float, dest='LINEAR_L1_REG', default=0.0) 72 | parser.add_argument('--linear-momentum', type=float, dest='LINEAR_MOMENTUM', default=0.9) 73 | # Deep model params 74 | parser.add_argument('--dnn-optimizer', type=str, dest='DNN_OPTIMIZER', default='Adagrad') 75 | parser.add_argument('--dnn-optimizer-lr', type=float, dest='DNN_OPTIMIZER_LR', default=0.01) 76 | parser.add_argument('--dnn-l1-reg', type=float, dest='DNN_L1_REG', default=0.0) 77 | parser.add_argument('--dnn-momentum', type=float, dest='DNN_MOMENTUM', default=0.9) 78 | parser.add_argument('--dnn-hidden-layer-1', type=int, dest='DNN_HIDDEN_LAYER_1', default=0) 79 | parser.add_argument('--dnn-hidden-layer-2', type=int, dest='DNN_HIDDEN_LAYER_2', default=0) 80 | parser.add_argument('--dnn-hidden-layer-3', type=int, dest='DNN_HIDDEN_LAYER_3', default=128) 81 | parser.add_argument('--dnn-hidden-layer-4', type=int, dest='DNN_HIDDEN_LAYER_4', default=128) 82 | parser.add_argument('--dnn-user-embedding-dim', type=int, dest='DNN_USER_DIM', default=8) 83 | parser.add_argument('--dnn-item-embedding-dim', type=int, dest='DNN_ITEM_DIM', default=8) 84 | parser.add_argument('--dnn-batch-norm', type=int, dest='DNN_BATCH_NORM', default=1) 85 | parser.add_argument('--dnn-dropout', type=float, dest='DNN_DROPOUT', default=0.0) 86 | # Training parameters 87 | parser.add_argument('--epochs', type=int, dest='EPOCHS', default=50) 88 | parser.add_argument('--batch-size', type=int, dest='BATCH_SIZE', default=128) 89 | parser.add_argument('--evaluate-while-training', dest='EVALUATE_WHILE_TRAINING', action='store_true') 90 | 91 | 92 | args = parser.parse_args() 93 | 94 | params = vars(args) 95 | 96 | if params['TOP_K'] <= 0: 97 | raise ValueError("Top K should be larger than 0") 98 | 99 | if params['MODEL_TYPE'] not in {'wide', 'deep', 'wide_deep'}: 100 | raise ValueError("Model type should be either 'wide', 'deep', or 'wide_deep'") 101 | 102 | if params['DATA_DIR'] is None: 103 | raise ValueError("Datastore path should be given") 104 | 105 | print("Args:") 106 | for k, v in params.items(): 107 | _log(k, v) 108 | 109 | 110 | print("Run", NOTEBOOK_NAME) 111 | 112 | pm.execute_notebook( 113 | NOTEBOOK_NAME, 114 | OUTPUT_NOTEBOOK, 115 | parameters=params, 116 | kernel_name='python3' 117 | ) 118 | nb = pm.read_notebook(OUTPUT_NOTEBOOK) 119 | 120 | for m, v in nb.data.items(): 121 | _log(m, v) 122 | 123 | # clean-up 124 | os.remove(OUTPUT_NOTEBOOK) 125 | shutil.rmtree(params['MODEL_DIR'], ignore_errors=True) 126 | -------------------------------------------------------------------------------- /reco_utils/azureml/svd_training.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | import sys 4 | 5 | sys.path.append("../../") 6 | 7 | import argparse 8 | import os 9 | import pandas as pd 10 | import surprise 11 | 12 | try: 13 | from azureml.core import Run 14 | 15 | HAS_AML = True 16 | run = Run.get_context() 17 | except ModuleNotFoundError: 18 | HAS_AML = False 19 | 20 | from reco_utils.evaluation.python_evaluation import * 21 | from reco_utils.recommender.surprise.surprise_utils import compute_rating_predictions, compute_ranking_predictions 22 | 23 | 24 | def svd_training(args): 25 | """ 26 | Train Surprise SVD using the given hyper-parameters 27 | """ 28 | print("Start training...") 29 | train_data = pd.read_pickle(path=os.path.join(args.datastore, args.train_datapath)) 30 | validation_data = pd.read_pickle(path=os.path.join(args.datastore, args.validation_datapath)) 31 | 32 | svd = surprise.SVD(random_state=args.random_state, n_epochs=args.epochs, verbose=args.verbose, biased=args.biased, 33 | n_factors=args.n_factors, init_mean=args.init_mean, init_std_dev=args.init_std_dev, 34 | lr_all=args.lr_all, reg_all=args.reg_all, lr_bu=args.lr_bu, lr_bi=args.lr_bi, lr_pu=args.lr_pu, 35 | lr_qi=args.lr_qi, reg_bu=args.reg_bu, reg_bi=args.reg_bi, reg_pu=args.reg_pu, 36 | reg_qi=args.reg_qi) 37 | 38 | train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(args.surprise_reader)) \ 39 | .build_full_trainset() 40 | svd.fit(train_set) 41 | 42 | print("Evaluating...") 43 | 44 | rating_metrics = args.rating_metrics 45 | if len(rating_metrics) > 0: 46 | predictions = compute_rating_predictions(svd, validation_data, usercol=args.usercol, itemcol=args.itemcol) 47 | for metric in rating_metrics: 48 | result = eval(metric)(validation_data, predictions) 49 | print(metric, result) 50 | if HAS_AML: 51 | run.log(metric, result) 52 | 53 | ranking_metrics = args.ranking_metrics 54 | if len(ranking_metrics) > 0: 55 | all_predictions = compute_ranking_predictions(svd, train_data, usercol=args.usercol, itemcol=args.itemcol, 56 | recommend_seen=args.recommend_seen) 57 | k = args.k 58 | for metric in ranking_metrics: 59 | result = eval(metric)(validation_data, all_predictions, col_prediction='prediction', k=k) 60 | print("{}@{}".format(metric, k), result) 61 | if HAS_AML: 62 | run.log(metric, result) 63 | 64 | if len(ranking_metrics) == 0 and len(rating_metrics) == 0: 65 | raise ValueError("No metrics were specified.") 66 | 67 | return svd 68 | 69 | 70 | def main(): 71 | parser = argparse.ArgumentParser() 72 | # Data path 73 | parser.add_argument('--datastore', type=str, dest='datastore', help="Datastore path") 74 | parser.add_argument('--train-datapath', type=str, dest='train_datapath') 75 | parser.add_argument('--validation-datapath', type=str, dest='validation_datapath') 76 | parser.add_argument('--output_dir', type=str, help='output directory') 77 | parser.add_argument('--surprise-reader', type=str, dest='surprise_reader') 78 | parser.add_argument('--usercol', type=str, dest='usercol', default='userID') 79 | parser.add_argument('--itemcol', type=str, dest='itemcol', default='itemID') 80 | # Metrics 81 | parser.add_argument('--rating-metrics', type=str, nargs='*', dest='rating_metrics', default=[]) 82 | parser.add_argument('--ranking-metrics', type=str, nargs='*', dest='ranking_metrics', default=[]) 83 | parser.add_argument('--k', type=int, dest='k', default=None) 84 | parser.add_argument('--recommend-seen', dest='recommend_seen', action='store_true') 85 | # Training parameters 86 | parser.add_argument('--random-state', type=int, dest='random_state', default=0) 87 | parser.add_argument('--verbose', dest='verbose', action='store_true') 88 | parser.add_argument('--epochs', type=int, dest='epochs', default=30) 89 | parser.add_argument('--biased', dest='biased', action='store_true') 90 | # Hyperparameters to be tuned 91 | parser.add_argument('--n_factors', type=int, dest='n_factors', default=100) 92 | parser.add_argument('--init_mean', type=float, dest='init_mean', default=0.0) 93 | parser.add_argument('--init_std_dev', type=float, dest='init_std_dev', default=0.1) 94 | parser.add_argument('--lr_all', type=float, dest='lr_all', default=0.005) 95 | parser.add_argument('--reg_all', type=float, dest='reg_all', default=0.02) 96 | parser.add_argument('--lr_bu', type=float, dest='lr_bu', default=None) 97 | parser.add_argument('--lr_bi', type=float, dest='lr_bi', default=None) 98 | parser.add_argument('--lr_pu', type=float, dest='lr_pu', default=None) 99 | parser.add_argument('--lr_qi', type=float, dest='lr_qi', default=None) 100 | parser.add_argument('--reg_bu', type=float, dest='reg_bu', default=None) 101 | parser.add_argument('--reg_bi', type=float, dest='reg_bi', default=None) 102 | parser.add_argument('--reg_pu', type=float, dest='reg_pu', default=None) 103 | parser.add_argument('--reg_qi', type=float, dest='reg_qi', default=None) 104 | 105 | args = parser.parse_args() 106 | 107 | print("Args:", str(vars(args)), sep='\n') 108 | 109 | if HAS_AML: 110 | run.log('Number of epochs', args.epochs) 111 | 112 | svd = svd_training(args) 113 | # Save SVD model to the output directory for later use 114 | os.makedirs(args.output_dir, exist_ok=True) 115 | surprise.dump.dump(os.path.join(args.output_dir, 'model.dump'), algo=svd) 116 | 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /notebooks/reco_utils/azureml/svd_training.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | import sys 4 | 5 | sys.path.append("../../") 6 | 7 | import argparse 8 | import os 9 | import pandas as pd 10 | import surprise 11 | 12 | try: 13 | from azureml.core import Run 14 | 15 | HAS_AML = True 16 | run = Run.get_context() 17 | except ModuleNotFoundError: 18 | HAS_AML = False 19 | 20 | from reco_utils.evaluation.python_evaluation import * 21 | from reco_utils.recommender.surprise.surprise_utils import compute_rating_predictions, compute_ranking_predictions 22 | 23 | 24 | def svd_training(args): 25 | """ 26 | Train Surprise SVD using the given hyper-parameters 27 | """ 28 | print("Start training...") 29 | train_data = pd.read_pickle(path=os.path.join(args.datastore, args.train_datapath)) 30 | validation_data = pd.read_pickle(path=os.path.join(args.datastore, args.validation_datapath)) 31 | 32 | svd = surprise.SVD(random_state=args.random_state, n_epochs=args.epochs, verbose=args.verbose, biased=args.biased, 33 | n_factors=args.n_factors, init_mean=args.init_mean, init_std_dev=args.init_std_dev, 34 | lr_all=args.lr_all, reg_all=args.reg_all, lr_bu=args.lr_bu, lr_bi=args.lr_bi, lr_pu=args.lr_pu, 35 | lr_qi=args.lr_qi, reg_bu=args.reg_bu, reg_bi=args.reg_bi, reg_pu=args.reg_pu, 36 | reg_qi=args.reg_qi) 37 | 38 | train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(args.surprise_reader)) \ 39 | .build_full_trainset() 40 | svd.fit(train_set) 41 | 42 | print("Evaluating...") 43 | 44 | rating_metrics = args.rating_metrics 45 | if len(rating_metrics) > 0: 46 | predictions = compute_rating_predictions(svd, validation_data, usercol=args.usercol, itemcol=args.itemcol) 47 | for metric in rating_metrics: 48 | result = eval(metric)(validation_data, predictions) 49 | print(metric, result) 50 | if HAS_AML: 51 | run.log(metric, result) 52 | 53 | ranking_metrics = args.ranking_metrics 54 | if len(ranking_metrics) > 0: 55 | all_predictions = compute_ranking_predictions(svd, train_data, usercol=args.usercol, itemcol=args.itemcol, 56 | recommend_seen=args.recommend_seen) 57 | k = args.k 58 | for metric in ranking_metrics: 59 | result = eval(metric)(validation_data, all_predictions, col_prediction='prediction', k=k) 60 | print("{}@{}".format(metric, k), result) 61 | if HAS_AML: 62 | run.log(metric, result) 63 | 64 | if len(ranking_metrics) == 0 and len(rating_metrics) == 0: 65 | raise ValueError("No metrics were specified.") 66 | 67 | return svd 68 | 69 | 70 | def main(): 71 | parser = argparse.ArgumentParser() 72 | # Data path 73 | parser.add_argument('--datastore', type=str, dest='datastore', help="Datastore path") 74 | parser.add_argument('--train-datapath', type=str, dest='train_datapath') 75 | parser.add_argument('--validation-datapath', type=str, dest='validation_datapath') 76 | parser.add_argument('--output_dir', type=str, help='output directory') 77 | parser.add_argument('--surprise-reader', type=str, dest='surprise_reader') 78 | parser.add_argument('--usercol', type=str, dest='usercol', default='userID') 79 | parser.add_argument('--itemcol', type=str, dest='itemcol', default='itemID') 80 | # Metrics 81 | parser.add_argument('--rating-metrics', type=str, nargs='*', dest='rating_metrics', default=[]) 82 | parser.add_argument('--ranking-metrics', type=str, nargs='*', dest='ranking_metrics', default=[]) 83 | parser.add_argument('--k', type=int, dest='k', default=None) 84 | parser.add_argument('--recommend-seen', dest='recommend_seen', action='store_true') 85 | # Training parameters 86 | parser.add_argument('--random-state', type=int, dest='random_state', default=0) 87 | parser.add_argument('--verbose', dest='verbose', action='store_true') 88 | parser.add_argument('--epochs', type=int, dest='epochs', default=30) 89 | parser.add_argument('--biased', dest='biased', action='store_true') 90 | # Hyperparameters to be tuned 91 | parser.add_argument('--n_factors', type=int, dest='n_factors', default=100) 92 | parser.add_argument('--init_mean', type=float, dest='init_mean', default=0.0) 93 | parser.add_argument('--init_std_dev', type=float, dest='init_std_dev', default=0.1) 94 | parser.add_argument('--lr_all', type=float, dest='lr_all', default=0.005) 95 | parser.add_argument('--reg_all', type=float, dest='reg_all', default=0.02) 96 | parser.add_argument('--lr_bu', type=float, dest='lr_bu', default=None) 97 | parser.add_argument('--lr_bi', type=float, dest='lr_bi', default=None) 98 | parser.add_argument('--lr_pu', type=float, dest='lr_pu', default=None) 99 | parser.add_argument('--lr_qi', type=float, dest='lr_qi', default=None) 100 | parser.add_argument('--reg_bu', type=float, dest='reg_bu', default=None) 101 | parser.add_argument('--reg_bi', type=float, dest='reg_bi', default=None) 102 | parser.add_argument('--reg_pu', type=float, dest='reg_pu', default=None) 103 | parser.add_argument('--reg_qi', type=float, dest='reg_qi', default=None) 104 | 105 | args = parser.parse_args() 106 | 107 | print("Args:", str(vars(args)), sep='\n') 108 | 109 | if HAS_AML: 110 | run.log('Number of epochs', args.epochs) 111 | 112 | svd = svd_training(args) 113 | # Save SVD model to the output directory for later use 114 | os.makedirs(args.output_dir, exist_ok=True) 115 | surprise.dump.dump(os.path.join(args.output_dir, 'model.dump'), algo=svd) 116 | 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /scripts/repo_metrics/track_metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import sys 5 | import os 6 | 7 | # Need to append a full path instead of relative path. 8 | # This seems to be an issue from Azure DevOps command line task. 9 | # NOTE this does not affect running directly in the shell. 10 | sys.path.append(os.getcwd()) 11 | import argparse 12 | import traceback 13 | import logging 14 | from dateutil.parser import isoparse 15 | from pymongo import MongoClient 16 | from datetime import datetime 17 | from scripts.repo_metrics.git_stats import Github 18 | from scripts.repo_metrics.config import ( 19 | GITHUB_TOKEN, 20 | CONNECTION_STRING, 21 | DATABASE, 22 | COLLECTION_GITHUB_STATS, 23 | COLLECTION_EVENTS, 24 | ) 25 | 26 | format_str = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)s]: %(message)s" 27 | format_time = "%Y-%m-%d %H:%M:%S" 28 | logging.basicConfig(level=logging.INFO, format=format_str, datefmt=format_time) 29 | log = logging.getLogger() 30 | 31 | 32 | def parse_args(): 33 | """Argument parser. 34 | Returns: 35 | obj: Parser. 36 | """ 37 | parser = argparse.ArgumentParser( 38 | description="Metrics Tracker", 39 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 40 | ) 41 | parser.add_argument("--github_repo", type=str, help="GitHub repository") 42 | parser.add_argument( 43 | "--event", 44 | type=str, 45 | help="Input a general event that can be saved to the database", 46 | ) 47 | parser.add_argument( 48 | "--save_to_database", 49 | action="store_true", 50 | help="Whether or not to save the information to the database", 51 | ) 52 | parser.add_argument( 53 | "--event_date", 54 | default=datetime.now().isoformat(), 55 | type=isoparse, 56 | help="Date for an event (format: YYYY-MM-DD)", 57 | ) 58 | return parser.parse_args() 59 | 60 | 61 | def connect(uri="mongodb://localhost"): 62 | """Mongo connector. 63 | Args: 64 | uri (str): Connection string. 65 | Returns: 66 | obj: Mongo client. 67 | """ 68 | client = MongoClient(uri, serverSelectionTimeoutMS=1000) 69 | 70 | # Send a query to the server to see if the connection is working. 71 | try: 72 | client.server_info() 73 | except Exception: 74 | raise 75 | return client 76 | 77 | 78 | def event_as_dict(event, date): 79 | """Encodes an string event input as a dictionary with the date. 80 | Args: 81 | event (str): Details of a event. 82 | date (datetime): Date of the event. 83 | Returns: 84 | dict: Dictionary with the event and the date. 85 | """ 86 | return {"date": date.strftime("%b %d %Y %H:%M:%S"), "event": event} 87 | 88 | 89 | def github_stats_as_dict(github): 90 | """Encodes Github statistics as a dictionary with the date. 91 | Args: 92 | obj: Github object. 93 | Returns: 94 | dict: Dictionary with Github details and the date. 95 | """ 96 | return { 97 | "date": datetime.now().strftime("%b %d %Y %H:%M:%S"), 98 | "stars": github.stars, 99 | "forks": github.forks, 100 | "watchers": github.watchers, 101 | "subscribers": github.subscribers, 102 | "open_issues": github.open_issues, 103 | "open_pull_requests": github.open_pull_requests, 104 | "unique_views": github.number_unique_views, 105 | "total_views": github.number_total_views, 106 | "details_views": github.views, 107 | "unique_clones": github.number_unique_clones, 108 | "total_clones": github.number_total_clones, 109 | "details_clones": github.clones, 110 | "last_year_commit_frequency": github.last_year_commit_frequency, 111 | "details_referrers": github.top_ten_referrers, 112 | "total_referrers": github.number_total_referrers, 113 | "unique_referrers": github.number_unique_referrers, 114 | "details_content": github.top_ten_content, 115 | "repo_size": github.repo_size, 116 | "commits": github.number_commits, 117 | "contributors": github.number_contributors, 118 | "branches": github.number_branches, 119 | "tags": github.number_tags, 120 | "total_lines": github.number_total_lines, 121 | "added_lines": github.number_added_lines, 122 | "deleted_lines": github.number_deleted_lines, 123 | } 124 | 125 | 126 | def tracker(args): 127 | """Main function to track metrics. 128 | Args: 129 | args (obj): Parsed arguments. 130 | """ 131 | if args.github_repo: 132 | # if there is an env variable, overwrite it 133 | token = os.environ.get("GITHUB_TOKEN", GITHUB_TOKEN) 134 | g = Github(token, args.github_repo) 135 | git_doc = github_stats_as_dict(g) 136 | log.info("GitHub stats -- {}".format(git_doc)) 137 | g.clean() 138 | 139 | if args.event: 140 | event_doc = event_as_dict(args.event, args.event_date) 141 | log.info("Event -- {}".format(event_doc)) 142 | 143 | if args.save_to_database: 144 | # if there is an env variable, overwrite it 145 | connection = token = os.environ.get("CONNECTION_STRING", CONNECTION_STRING) 146 | cli = connect(connection) 147 | db = cli[DATABASE] 148 | if args.github_repo: 149 | db[COLLECTION_GITHUB_STATS].insert_one(git_doc) 150 | if args.event: 151 | db[COLLECTION_EVENTS].insert_one(event_doc) 152 | 153 | 154 | if __name__ == "__main__": 155 | log.info("Starting routine") 156 | args = parse_args() 157 | try: 158 | log.info("Arguments: {}".format(args)) 159 | tracker(args) 160 | except Exception as e: 161 | trace = traceback.format_exc() 162 | log.error("Traceback: {}".format(trace)) 163 | log.error("Exception: {}".format(e)) 164 | finally: 165 | log.info("Routine finished") 166 | -------------------------------------------------------------------------------- /scripts/generate_conda_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT License. 5 | 6 | # This script creates yaml files to build conda environments 7 | # For generating a conda file for running only python code: 8 | # $ python generate_conda_file.py 9 | # For generating a conda file for running python gpu: 10 | # $ python generate_conda_file.py --gpu 11 | # For generating a conda file for running pyspark: 12 | # $ python generate_conda_file.py --pyspark 13 | # For generating a conda file for running python gpu and pyspark: 14 | # $ python generate_conda_file.py --gpu --pyspark 15 | # For generating a conda file for running python gpu and pyspark with a particular version: 16 | # $ python generate_conda_file.py --gpu --pyspark-version 2.4.0 17 | 18 | import argparse 19 | import textwrap 20 | 21 | 22 | HELP_MSG = """ 23 | To create the conda environment: 24 | $ conda env create -f {conda_env}.yaml 25 | 26 | To update the conda environment: 27 | $ conda env update -f {conda_env}.yaml 28 | 29 | To register the conda environment in Jupyter: 30 | $ conda activate {conda_env} 31 | $ python -m ipykernel install --user --name {conda_env} --display-name "Python ({conda_env})" 32 | """ 33 | 34 | CHANNELS = [ "defaults", "conda-forge", "pytorch", "fastai"] 35 | 36 | CONDA_BASE = { 37 | "mock": "mock==2.0.0", 38 | "dask": "dask>=0.17.1", 39 | "fastparquet": "fastparquet>=0.1.6", 40 | "gitpython": "gitpython>=2.1.8", 41 | "ipykernel": "ipykernel>=4.6.1", 42 | "jupyter": "jupyter>=1.0.0", 43 | "matplotlib": "matplotlib>=2.2.2", 44 | "numpy": "numpy>=1.13.3", 45 | "pandas": "pandas>=0.23.4", 46 | "pymongo": "pymongo>=3.6.1", 47 | "python": "python==3.6.8", 48 | "pytest": "pytest>=3.6.4", 49 | "pytorch": "pytorch-cpu>=1.0.0", 50 | "seaborn": "seaborn>=0.8.1", 51 | "scikit-learn": "scikit-learn==0.19.1", 52 | "scipy": "scipy>=1.0.0", 53 | "scikit-surprise": "scikit-surprise>=1.0.6", 54 | "tensorflow": "tensorflow==1.12.0", 55 | } 56 | 57 | CONDA_PYSPARK = {"pyarrow": "pyarrow>=0.8.0", "pyspark": "pyspark==2.3.1"} 58 | 59 | CONDA_GPU = {"numba": "numba>=0.38.1", "pytorch": "pytorch>=1.0.0", "tensorflow": "tensorflow-gpu==1.12.0"} 60 | 61 | PIP_BASE = { 62 | "azureml-sdk[notebooks,contrib]": "azureml-sdk[notebooks,contrib]", 63 | "azure-storage": "azure-storage>=0.36.0", 64 | "black": "black>=18.6b4", 65 | "dataclasses": "dataclasses>=0.6", 66 | "hyperopt": "hyperopt==0.1.1", 67 | "idna": "idna==2.7", 68 | "memory-profiler": "memory-profiler>=0.54.0", 69 | "nvidia-ml-py3": "nvidia-ml-py3>=7.352.0", 70 | "papermill": "papermill==0.18.2", 71 | "pydocumentdb": "pydocumentdb>=2.3.3", 72 | "fastai": "fastai==1.0.46", 73 | } 74 | 75 | PIP_PYSPARK = {} 76 | PIP_GPU = {} 77 | 78 | 79 | if __name__ == "__main__": 80 | parser = argparse.ArgumentParser( 81 | description=textwrap.dedent( 82 | """ 83 | This script generates a conda file for different environments. 84 | Plain python is the default, but flags can be used to support PySpark and GPU functionality""" 85 | ), 86 | epilog=HELP_MSG, 87 | formatter_class=argparse.RawDescriptionHelpFormatter, 88 | ) 89 | parser.add_argument("--name", help="specify name of conda environment") 90 | parser.add_argument( 91 | "--gpu", action="store_true", help="include packages for GPU support" 92 | ) 93 | parser.add_argument( 94 | "--pyspark", action="store_true", help="include packages for PySpark support" 95 | ) 96 | parser.add_argument( 97 | "--pyspark-version", help="provide specific version of PySpark to use" 98 | ) 99 | args = parser.parse_args() 100 | 101 | # check pyspark version 102 | if args.pyspark_version is not None: 103 | args.pyspark = True 104 | pyspark_version_info = args.pyspark_version.split(".") 105 | if len(pyspark_version_info) != 3 or any( 106 | [not x.isdigit() for x in pyspark_version_info] 107 | ): 108 | raise TypeError( 109 | "PySpark version input must be valid numeric format (e.g. --pyspark-version=2.3.1)" 110 | ) 111 | else: 112 | args.pyspark_version = "2.3.1" 113 | 114 | # set name for environment and output yaml file 115 | conda_env = "reco_base" 116 | if args.gpu and args.pyspark: 117 | conda_env = "reco_full" 118 | elif args.gpu: 119 | conda_env = "reco_gpu" 120 | elif args.pyspark: 121 | conda_env = "reco_pyspark" 122 | 123 | # overwrite environment name with user input 124 | if args.name is not None: 125 | conda_env = args.name 126 | 127 | # update conda and pip packages based on flags provided 128 | conda_packages = CONDA_BASE 129 | pip_packages = PIP_BASE 130 | if args.pyspark: 131 | conda_packages.update(CONDA_PYSPARK) 132 | conda_packages["pyspark"] = "pyspark=={}".format(args.pyspark_version) 133 | pip_packages.update(PIP_PYSPARK) 134 | if args.gpu: 135 | conda_packages.update(CONDA_GPU) 136 | pip_packages.update(PIP_GPU) 137 | 138 | # write out yaml file 139 | conda_file = "{}.yaml".format(conda_env) 140 | with open(conda_file, "w") as f: 141 | for line in HELP_MSG.format(conda_env=conda_env).split("\n"): 142 | f.write("# {}\n".format(line)) 143 | f.write("name: {}\n".format(conda_env)) 144 | f.write("channels:\n") 145 | for channel in CHANNELS: 146 | f.write("- {}\n".format(channel)) 147 | f.write("dependencies:\n") 148 | for conda_package in conda_packages.values(): 149 | f.write("- {}\n".format(conda_package)) 150 | f.write("- pip:\n") 151 | for pip_package in pip_packages.values(): 152 | f.write(" - {}\n".format(pip_package)) 153 | 154 | print("Generated conda file: {}".format(conda_file)) 155 | print(HELP_MSG.format(conda_env=conda_env)) 156 | -------------------------------------------------------------------------------- /notebooks/scripts/generate_conda_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT License. 5 | 6 | # This script creates yaml files to build conda environments 7 | # For generating a conda file for running only python code: 8 | # $ python generate_conda_file.py 9 | # For generating a conda file for running python gpu: 10 | # $ python generate_conda_file.py --gpu 11 | # For generating a conda file for running pyspark: 12 | # $ python generate_conda_file.py --pyspark 13 | # For generating a conda file for running python gpu and pyspark: 14 | # $ python generate_conda_file.py --gpu --pyspark 15 | # For generating a conda file for running python gpu and pyspark with a particular version: 16 | # $ python generate_conda_file.py --gpu --pyspark-version 2.4.0 17 | 18 | import argparse 19 | import textwrap 20 | 21 | 22 | HELP_MSG = """ 23 | To create the conda environment: 24 | $ conda env create -f {conda_env}.yaml 25 | 26 | To update the conda environment: 27 | $ conda env update -f {conda_env}.yaml 28 | 29 | To register the conda environment in Jupyter: 30 | $ conda activate {conda_env} 31 | $ python -m ipykernel install --user --name {conda_env} --display-name "Python ({conda_env})" 32 | """ 33 | 34 | CHANNELS = [ "defaults", "conda-forge", "pytorch", "fastai"] 35 | 36 | CONDA_BASE = { 37 | "mock": "mock==2.0.0", 38 | "dask": "dask>=0.17.1", 39 | "fastparquet": "fastparquet>=0.1.6", 40 | "gitpython": "gitpython>=2.1.8", 41 | "ipykernel": "ipykernel>=4.6.1", 42 | "jupyter": "jupyter>=1.0.0", 43 | "matplotlib": "matplotlib>=2.2.2", 44 | "numpy": "numpy>=1.13.3", 45 | "pandas": "pandas>=0.23.4", 46 | "pymongo": "pymongo>=3.6.1", 47 | "python": "python==3.6.8", 48 | "pytest": "pytest>=3.6.4", 49 | "pytorch": "pytorch-cpu>=1.0.0", 50 | "seaborn": "seaborn>=0.8.1", 51 | "scikit-learn": "scikit-learn==0.19.1", 52 | "scipy": "scipy>=1.0.0", 53 | "scikit-surprise": "scikit-surprise>=1.0.6", 54 | "tensorflow": "tensorflow==1.12.0", 55 | } 56 | 57 | CONDA_PYSPARK = {"pyarrow": "pyarrow>=0.8.0", "pyspark": "pyspark==2.3.1"} 58 | 59 | CONDA_GPU = {"numba": "numba>=0.38.1", "pytorch": "pytorch>=1.0.0", "tensorflow": "tensorflow-gpu==1.12.0"} 60 | 61 | PIP_BASE = { 62 | "azureml-sdk[notebooks,contrib]": "azureml-sdk[notebooks,contrib]", 63 | "azure-storage": "azure-storage>=0.36.0", 64 | "black": "black>=18.6b4", 65 | "dataclasses": "dataclasses>=0.6", 66 | "hyperopt": "hyperopt==0.1.1", 67 | "idna": "idna==2.7", 68 | "memory-profiler": "memory-profiler>=0.54.0", 69 | "nvidia-ml-py3": "nvidia-ml-py3>=7.352.0", 70 | "papermill": "papermill>=0.15.0", 71 | "pydocumentdb": "pydocumentdb>=2.3.3", 72 | "fastai": "fastai==1.0.46", 73 | } 74 | 75 | PIP_PYSPARK = {} 76 | PIP_GPU = {} 77 | 78 | 79 | if __name__ == "__main__": 80 | parser = argparse.ArgumentParser( 81 | description=textwrap.dedent( 82 | """ 83 | This script generates a conda file for different environments. 84 | Plain python is the default, but flags can be used to support PySpark and GPU functionality""" 85 | ), 86 | epilog=HELP_MSG, 87 | formatter_class=argparse.RawDescriptionHelpFormatter, 88 | ) 89 | parser.add_argument("--name", help="specify name of conda environment") 90 | parser.add_argument( 91 | "--gpu", action="store_true", help="include packages for GPU support" 92 | ) 93 | parser.add_argument( 94 | "--pyspark", action="store_true", help="include packages for PySpark support" 95 | ) 96 | parser.add_argument( 97 | "--pyspark-version", help="provide specific version of PySpark to use" 98 | ) 99 | args = parser.parse_args() 100 | 101 | # check pyspark version 102 | if args.pyspark_version is not None: 103 | args.pyspark = True 104 | pyspark_version_info = args.pyspark_version.split(".") 105 | if len(pyspark_version_info) != 3 or any( 106 | [not x.isdigit() for x in pyspark_version_info] 107 | ): 108 | raise TypeError( 109 | "PySpark version input must be valid numeric format (e.g. --pyspark-version=2.3.1)" 110 | ) 111 | else: 112 | args.pyspark_version = "2.3.1" 113 | 114 | # set name for environment and output yaml file 115 | conda_env = "reco_base" 116 | if args.gpu and args.pyspark: 117 | conda_env = "reco_full" 118 | elif args.gpu: 119 | conda_env = "reco_gpu" 120 | elif args.pyspark: 121 | conda_env = "reco_pyspark" 122 | 123 | # overwrite environment name with user input 124 | if args.name is not None: 125 | conda_env = args.name 126 | 127 | # update conda and pip packages based on flags provided 128 | conda_packages = CONDA_BASE 129 | pip_packages = PIP_BASE 130 | if args.pyspark: 131 | conda_packages.update(CONDA_PYSPARK) 132 | conda_packages["pyspark"] = "pyspark=={}".format(args.pyspark_version) 133 | pip_packages.update(PIP_PYSPARK) 134 | if args.gpu: 135 | conda_packages.update(CONDA_GPU) 136 | pip_packages.update(PIP_GPU) 137 | 138 | # write out yaml file 139 | conda_file = "{}.yaml".format(conda_env) 140 | with open(conda_file, "w") as f: 141 | for line in HELP_MSG.format(conda_env=conda_env).split("\n"): 142 | f.write("# {}\n".format(line)) 143 | f.write("name: {}\n".format(conda_env)) 144 | f.write("channels:\n") 145 | for channel in CHANNELS: 146 | f.write("- {}\n".format(channel)) 147 | f.write("dependencies:\n") 148 | for conda_package in conda_packages.values(): 149 | f.write("- {}\n".format(conda_package)) 150 | f.write("- pip:\n") 151 | for pip_package in pip_packages.values(): 152 | f.write(" - {}\n".format(pip_package)) 153 | 154 | print("Generated conda file: {}".format(conda_file)) 155 | print(HELP_MSG.format(conda_env=conda_env)) 156 | -------------------------------------------------------------------------------- /tests/unit/test_ncf_singlenode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | import itertools 6 | import numpy as np 7 | import pandas as pd 8 | import os 9 | import shutil 10 | from reco_utils.recommender.ncf.ncf_singlenode import NCF 11 | from reco_utils.recommender.ncf.dataset import Dataset 12 | from reco_utils.common.constants import ( 13 | DEFAULT_USER_COL, 14 | DEFAULT_ITEM_COL, 15 | DEFAULT_RATING_COL, 16 | DEFAULT_TIMESTAMP_COL, 17 | ) 18 | from tests.ncf_common import python_dataset_ncf, test_specs_ncf 19 | 20 | 21 | N_NEG = 5 22 | N_NEG_TEST = 10 23 | 24 | 25 | @pytest.mark.gpu 26 | @pytest.mark.parametrize( 27 | "model_type, n_users, n_items", [("NeuMF", 1, 1), ("GMF", 10, 10), ("MLP", 4, 8)] 28 | ) 29 | def test_init(model_type, n_users, n_items): 30 | model = NCF(n_users=n_users, n_items=n_items, model_type=model_type) 31 | # model type 32 | assert model.model_type == model_type.lower() 33 | # number of users in dataset 34 | assert model.n_users == n_users 35 | # number of items in dataset 36 | assert model.n_items == n_items 37 | # dimension of gmf user embedding 38 | assert model.embedding_gmf_P.shape == [n_users, model.n_factors] 39 | # dimension of gmf item embedding 40 | assert model.embedding_gmf_Q.shape == [n_items, model.n_factors] 41 | # dimension of mlp user embedding 42 | assert model.embedding_mlp_P.shape == [n_users, model.n_factors] 43 | # dimension of mlp item embedding 44 | assert model.embedding_mlp_Q.shape == [n_items, model.n_factors] 45 | 46 | # TODO: more parameters 47 | 48 | 49 | @pytest.mark.gpu 50 | @pytest.mark.parametrize( 51 | "model_type, n_users, n_items", [("NeuMF", 5, 5), ("GMF", 5, 5), ("MLP", 5, 5)] 52 | ) 53 | def test_regular_save_load(model_type, n_users, n_items): 54 | ckpt = ".%s" % model_type 55 | if os.path.exists(ckpt): 56 | shutil.rmtree(ckpt) 57 | 58 | model = NCF(n_users=n_users, n_items=n_items, model_type=model_type) 59 | model.save(ckpt) 60 | if model.model_type == "neumf": 61 | P = model.sess.run(model.embedding_gmf_P) 62 | Q = model.sess.run(model.embedding_mlp_Q) 63 | elif model.model_type == "gmf": 64 | P = model.sess.run(model.embedding_gmf_P) 65 | Q = model.sess.run(model.embedding_gmf_Q) 66 | elif model.model_type == "mlp": 67 | P = model.sess.run(model.embedding_mlp_P) 68 | Q = model.sess.run(model.embedding_mlp_Q) 69 | 70 | del model 71 | model = NCF(n_users=n_users, n_items=n_items, model_type=model_type) 72 | 73 | if model.model_type == "neumf": 74 | model.load(neumf_dir=ckpt) 75 | P_ = model.sess.run(model.embedding_gmf_P) 76 | Q_ = model.sess.run(model.embedding_mlp_Q) 77 | elif model.model_type == "gmf": 78 | model.load(gmf_dir=ckpt) 79 | P_ = model.sess.run(model.embedding_gmf_P) 80 | Q_ = model.sess.run(model.embedding_gmf_Q) 81 | elif model.model_type == "mlp": 82 | model.load(mlp_dir=ckpt) 83 | P_ = model.sess.run(model.embedding_mlp_P) 84 | Q_ = model.sess.run(model.embedding_mlp_Q) 85 | 86 | # test load function 87 | assert np.array_equal(P, P_) 88 | assert np.array_equal(Q, Q_) 89 | 90 | if os.path.exists(ckpt): 91 | shutil.rmtree(ckpt) 92 | 93 | @pytest.mark.gpu 94 | @pytest.mark.parametrize( 95 | "n_users, n_items", [(5, 5), (4, 8)] 96 | ) 97 | def test_neumf_save_load(n_users, n_items): 98 | model_type = "gmf" 99 | ckpt_gmf = ".%s" % model_type 100 | if os.path.exists(ckpt_gmf): 101 | shutil.rmtree(ckpt_gmf) 102 | model = NCF(n_users=n_users, n_items=n_items, model_type=model_type) 103 | model.save(ckpt_gmf) 104 | P_gmf = model.sess.run(model.embedding_gmf_P) 105 | Q_gmf = model.sess.run(model.embedding_gmf_Q) 106 | del model 107 | 108 | model_type = "mlp" 109 | ckpt_mlp = ".%s" % model_type 110 | if os.path.exists(ckpt_mlp): 111 | shutil.rmtree(ckpt_mlp) 112 | model = NCF(n_users=n_users, n_items=n_items, model_type=model_type) 113 | model.save(".%s" % model_type) 114 | P_mlp = model.sess.run(model.embedding_mlp_P) 115 | Q_mlp = model.sess.run(model.embedding_mlp_Q) 116 | del model 117 | 118 | model_type = "neumf" 119 | model = NCF(n_users=n_users, n_items=n_items, model_type=model_type) 120 | model.load(gmf_dir=ckpt_gmf, mlp_dir=ckpt_mlp) 121 | 122 | P_gmf_ = model.sess.run(model.embedding_gmf_P) 123 | Q_gmf_ = model.sess.run(model.embedding_gmf_Q) 124 | 125 | P_mlp_ = model.sess.run(model.embedding_mlp_P) 126 | Q_mlp_ = model.sess.run(model.embedding_mlp_Q) 127 | 128 | assert np.array_equal(P_gmf, P_gmf_) 129 | assert np.array_equal(Q_gmf, Q_gmf_) 130 | assert np.array_equal(Q_mlp, Q_mlp_) 131 | assert np.array_equal(Q_mlp, Q_mlp_) 132 | 133 | if os.path.exists(ckpt_gmf): 134 | shutil.rmtree(ckpt_gmf) 135 | if os.path.exists(ckpt_mlp): 136 | shutil.rmtree(ckpt_mlp) 137 | 138 | # TODO: test loading fc-concat 139 | 140 | @pytest.mark.gpu 141 | @pytest.mark.parametrize( 142 | "model_type", ["NeuMF", "GMF", "MLP"] 143 | ) 144 | def test_fit(python_dataset_ncf, model_type): 145 | train, test = python_dataset_ncf 146 | data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST) 147 | model = NCF(n_users=data.n_users, n_items=data.n_items, model_type=model_type) 148 | model.fit(data) 149 | 150 | @pytest.mark.gpu 151 | @pytest.mark.parametrize( 152 | "model_type", ["NeuMF", "GMF", "MLP"] 153 | ) 154 | def test_predict(python_dataset_ncf, model_type): 155 | # test data format 156 | train, test = python_dataset_ncf 157 | data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST) 158 | model = NCF(n_users=data.n_users, n_items=data.n_items, model_type=model_type) 159 | model.fit(data) 160 | 161 | test_users, test_items = list(test[DEFAULT_USER_COL]), list(test[DEFAULT_ITEM_COL]) 162 | 163 | assert type(model.predict(test_users[0], test_items[0])) == float 164 | 165 | res = model.predict(test_users, test_items, is_list=True) 166 | 167 | assert type(res) == list 168 | assert len(res) == len(test) 169 | 170 | 171 | 172 | --------------------------------------------------------------------------------