├── scripts
    ├── __init__.py
    ├── repo_metrics
    │   ├── __init__.py
    │   ├── config_template.py
    │   ├── README.md
    │   └── track_metrics.py
    ├── config.json
    ├── databricks_install.sh
    ├── prepare_databricks_for_o16n.sh
    └── generate_conda_file.py
├── notebooks
    ├── scripts
    │   ├── __init__.py
    │   ├── config.json
    │   ├── reco_full.yaml
    │   ├── databricks_install.sh
    │   ├── prepare_databricks_for_o16n.sh
    │   └── generate_conda_file.py
    ├── reco_utils
    │   ├── azureml
    │   │   ├── __init__.py
    │   │   ├── wide_deep.py
    │   │   └── svd_training.py
    │   ├── common
    │   │   ├── __init__.py
    │   │   ├── constants.py
    │   │   ├── spark_utils.py
    │   │   ├── general_utils.py
    │   │   ├── notebook_utils.py
    │   │   ├── python_utils.py
    │   │   ├── timer.py
    │   │   ├── gpu_utils.py
    │   │   └── notebook_memory_management.py
    │   ├── dataset
    │   │   ├── __init__.py
    │   │   ├── url_utils.py
    │   │   ├── pandas_df_utils.py
    │   │   └── cosmos_cli.py
    │   ├── evaluation
    │   │   ├── __init__.py
    │   │   └── parameter_sweep.py
    │   ├── recommender
    │   │   ├── __init__.py
    │   │   ├── rbm
    │   │   │   └── __init__.py
    │   │   ├── deeprec
    │   │   │   ├── __init__.py
    │   │   │   ├── IO
    │   │   │   │   └── __init__.py
    │   │   │   └── models
    │   │   │   │   └── __init__.py
    │   │   ├── fastai
    │   │   │   ├── __init__.py
    │   │   │   └── fastai_utils.py
    │   │   ├── ncf
    │   │   │   └── __init__.py
    │   │   ├── surprise
    │   │   │   ├── __init__.py
    │   │   │   └── surprise_utils.py
    │   │   ├── wide_deep
    │   │   │   └── __init__.py
    │   │   ├── vowpal_wabbit
    │   │   │   └── __init__.py
    │   │   └── sar
    │   │   │   └── __init__.py
    │   ├── __init__.py
    │   └── README.md
    └── README.md
├── reco_utils
    ├── azureml
    │   ├── __init__.py
    │   ├── azureml_utils.py
    │   ├── aks_utils.py
    │   ├── wide_deep.py
    │   └── svd_training.py
    ├── common
    │   ├── __init__.py
    │   ├── constants.py
    │   ├── notebook_utils.py
    │   ├── general_utils.py
    │   ├── spark_utils.py
    │   ├── timer.py
    │   ├── gpu_utils.py
    │   ├── python_utils.py
    │   └── notebook_memory_management.py
    ├── dataset
    │   ├── __init__.py
    │   ├── download_utils.py
    │   └── cosmos_cli.py
    ├── evaluation
    │   ├── __init__.py
    │   └── parameter_sweep.py
    ├── recommender
    │   ├── __init__.py
    │   ├── rbm
    │   │   └── __init__.py
    │   ├── deeprec
    │   │   ├── __init__.py
    │   │   ├── IO
    │   │   │   └── __init__.py
    │   │   └── models
    │   │   │   └── __init__.py
    │   ├── fastai
    │   │   ├── __init__.py
    │   │   └── fastai_utils.py
    │   ├── lightgbm
    │   │   └── __init__.py
    │   ├── ncf
    │   │   └── __init__.py
    │   ├── surprise
    │   │   ├── __init__.py
    │   │   └── surprise_utils.py
    │   ├── wide_deep
    │   │   └── __init__.py
    │   ├── vowpal_wabbit
    │   │   └── __init__.py
    │   └── sar
    │   │   └── __init__.py
    ├── __init__.py
    ├── README.md
    └── nni
    │   └── nni_utils.py
├── tests
    ├── ci
    │   ├── requirements.txt
    │   ├── config.json
    │   ├── runpytest.py
    │   ├── install_requirements.sh
    │   ├── Master-CPU-pipeline.yml
    │   ├── pytest.yml
    │   └── submitpytest.py
    └── unit
    │   ├── test_general_utils.py
    │   ├── test_dataset.py
    │   ├── test_gpu_utils.py
    │   ├── test_sweep.py
    │   ├── test_timer.py
    │   ├── test_notebook_utils.py
    │   ├── test_notebook_utils.ipynb
    │   ├── test_notebooks_pyspark.py
    │   ├── test_notebooks_python.py
    │   ├── test_notebooks_gpu.py
    │   ├── test_deeprec_model.py
    │   ├── test_python_utils.py
    │   ├── test_pandas_df_utils.py
    │   ├── test_deeprec_utils.py
    │   ├── test_vowpal_wabbit.py
    │   ├── test_wide_deep_utils.py
    │   ├── test_surprise_utils.py
    │   ├── test_ncf_dataset.py
    │   ├── test_sparse.py
    │   ├── test_tf_utils.py
    │   ├── test_rbm.py
    │   └── test_ncf_singlenode.py
├── azure-pipelines.yml
├── LICENSE
├── README.md
└── SECURITY.md


/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/notebooks/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reco_utils/azureml/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reco_utils/common/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reco_utils/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reco_utils/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/scripts/repo_metrics/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reco_utils/recommender/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reco_utils/recommender/rbm/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/azureml/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/common/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reco_utils/recommender/deeprec/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reco_utils/recommender/fastai/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reco_utils/recommender/lightgbm/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reco_utils/recommender/ncf/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/reco_utils/recommender/surprise/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/recommender/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/recommender/rbm/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reco_utils/recommender/deeprec/IO/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reco_utils/recommender/wide_deep/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/recommender/deeprec/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/recommender/fastai/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/recommender/ncf/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/recommender/surprise/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reco_utils/recommender/deeprec/models/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reco_utils/recommender/vowpal_wabbit/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/recommender/deeprec/IO/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/recommender/wide_deep/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/recommender/deeprec/models/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/recommender/vowpal_wabbit/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/tests/ci/requirements.txt:
--------------------------------------------------------------------------------
1 | scipy==1.0.0
2 | scikit-learn==0.19.1
3 | numpy==1.14.5
4 | pandas==0.23.1
5 | pytest==4.3.0


--------------------------------------------------------------------------------
/tests/ci/config.json:
--------------------------------------------------------------------------------
1 | {
2 |     "subscription_id": "<>",
3 |     "resource_group": "recommender",
4 |     "workspace_name": "RecoWS",
5 |     "location": "eastus"
6 | }
7 | 


--------------------------------------------------------------------------------
/notebooks/scripts/config.json:
--------------------------------------------------------------------------------
1 | {
2 |     "subscription_id": <>,
3 |     "resource_group": "recommender",
4 |     "workspace_name": "addWS",
5 |     "location": "southcentralus"
6 | }


--------------------------------------------------------------------------------
/scripts/config.json:
--------------------------------------------------------------------------------
1 | {
2 |     "subscription_id": "15ae9cb6-95c1-483d-a0e3-b1a1a3b06324",
3 |     "resource_group": "recommender",
4 |     "workspace_name": "RecoWS",
5 |     "location": "southcentralus"
6 | }


--------------------------------------------------------------------------------
/reco_utils/__init__.py:
--------------------------------------------------------------------------------
1 | __title__ = "Microsoft Recommenders"
2 | __version__ = "2019.02"
3 | __author__ = "RecoDev Team at Microsoft"
4 | __license__ = "MIT"
5 | __copyright__ = "Copyright 2018-present Microsoft Corporation"
6 | 
7 | # Version synonym
8 | VERSION = __version__
9 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/__init__.py:
--------------------------------------------------------------------------------
1 | __title__ = "Microsoft Recommenders"
2 | __version__ = "2019.02"
3 | __author__ = "RecoDev Team at Microsoft"
4 | __license__ = "MIT"
5 | __copyright__ = "Copyright 2018-present Microsoft Corporation"
6 | 
7 | # Version synonym
8 | VERSION = __version__
9 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/common/constants.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | # Default column names
 5 | DEFAULT_USER_COL = "userID"
 6 | DEFAULT_ITEM_COL = "itemID"
 7 | DEFAULT_RATING_COL = "rating"
 8 | DEFAULT_TIMESTAMP_COL = "timestamp"
 9 | PREDICTION_COL = "prediction"
10 | DEFAULT_PREDICTION_COL = PREDICTION_COL
11 | 
12 | # Filtering variables
13 | DEFAULT_K = 10
14 | DEFAULT_THRESHOLD = 10
15 | 


--------------------------------------------------------------------------------
/tests/unit/test_general_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import pytest
 5 | from reco_utils.common.general_utils import invert_dictionary, get_number_processors
 6 | 
 7 | 
 8 | def test_invert_dictionary():
 9 |     d = {"a": 1, "b": 2}
10 |     d_inv = invert_dictionary(d)
11 |     assert d_inv == {1: "a", 2: "b"}
12 | 
13 | 
14 | def test_get_number_processors():
15 |     assert get_number_processors() >= 4
16 | 


--------------------------------------------------------------------------------
/scripts/repo_metrics/config_template.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | # Github token
 5 | # More info: https://help.github.com/articles/creating-a-personal-access-token-for-the-command-line/
 6 | GITHUB_TOKEN = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
 7 | 
 8 | # CosmosDB Mongo API
 9 | CONNECTION_STRING = "mongodb://XXXXXXXXXXXXXXXXXXXXXXXXX.documents.azure.com:10255/?ssl=true&replicaSet=globaldb"
10 | DATABASE = "reco_stats"
11 | COLLECTION_GITHUB_STATS = "github_stats"
12 | COLLECTION_EVENTS = "events"
13 | 
14 | 


--------------------------------------------------------------------------------
/tests/ci/runpytest.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import subprocess
 5 | import os
 6 | 
 7 | from azureml.core import Run
 8 | print('before run.get_context')
 9 | run = Run.get_context()
10 | print('before subprocess.run')
11 | 
12 | subprocess.run(["pytest", "tests/unit",
13 |                 "-m", "not notebooks and not spark and not gpu",
14 |                 "--junitxml=reports/test-unit.xml"])
15 | 
16 | print("os.listdir files", os.listdir("."))
17 | # set up reports
18 | name_of_upload = "reports"
19 | path_on_disk = "reports"
20 | run.upload_folder(name_of_upload, path_on_disk)
21 | 


--------------------------------------------------------------------------------
/reco_utils/common/constants.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | # Default column names
 5 | DEFAULT_USER_COL = "userID"
 6 | DEFAULT_ITEM_COL = "itemID"
 7 | DEFAULT_RATING_COL = "rating"
 8 | DEFAULT_LABEL_COL = "label"
 9 | DEFAULT_TIMESTAMP_COL = "timestamp"
10 | DEFAULT_PREDICTION_COL = "prediction"
11 | COL_DICT = {
12 |     "col_user": DEFAULT_USER_COL, 
13 |     "col_item": DEFAULT_ITEM_COL, 
14 |     "col_rating": DEFAULT_RATING_COL, 
15 |     "col_prediction": DEFAULT_PREDICTION_COL
16 | }
17 | 
18 | # Filtering variables
19 | DEFAULT_K = 10
20 | DEFAULT_THRESHOLD = 10
21 | 
22 | # Other
23 | SEED = 42
24 | 


--------------------------------------------------------------------------------
/tests/unit/test_dataset.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import os
 5 | import sys
 6 | import pytest
 7 | from reco_utils.dataset.url_utils import maybe_download
 8 | 
 9 | 
10 | def test_maybe_download():
11 |     file_url = "https://raw.githubusercontent.com/Microsoft/Recommenders/master/LICENSE"
12 |     filepath = "license.txt"
13 |     assert not os.path.exists(filepath)
14 |     filepath = maybe_download(file_url, "license.txt", expected_bytes=1162)
15 |     assert os.path.exists(filepath)
16 |     os.remove(filepath)
17 |     with pytest.raises(IOError):
18 |         filepath = maybe_download(file_url, "license.txt", expected_bytes=0)
19 | 


--------------------------------------------------------------------------------
/reco_utils/recommender/sar/__init__.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | 
 3 | # Time since epoch in seconds
 4 | EPOCH = datetime.datetime.utcfromtimestamp(0)
 5 | # Default value for time decay parameter in SAR
 6 | TIME_DECAY_COEFFICIENT = 30
 7 | # Switch to trigger groupby in TimeDecay calculation
 8 | TIMEDECAY_FORMULA = False
 9 | # cooccurrence matrix threshold
10 | THRESHOLD = 1
11 | # Current time
12 | # TIME_NOW = (datetime.datetime.now() - EPOCH).total_seconds()
13 | TIME_NOW = None
14 | # Default names for functions which change the item-item cooccurrence matrix
15 | SIM_COOCCUR = "cooccurrence"
16 | SIM_JACCARD = "jaccard"
17 | SIM_LIFT = "lift"
18 | 
19 | INDEXED_ITEMS = "indexedItems"
20 | INDEXED_USERS = "indexedUsers"
21 | 
22 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/recommender/sar/__init__.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | 
 3 | # Time since epoch in seconds
 4 | EPOCH = datetime.datetime.utcfromtimestamp(0)
 5 | # Default value for time decay parameter in SAR
 6 | TIME_DECAY_COEFFICIENT = 30
 7 | # Switch to trigger groupby in TimeDecay calculation
 8 | TIMEDECAY_FORMULA = False
 9 | # cooccurrence matrix threshold
10 | THRESHOLD = 1
11 | # Current time
12 | # TIME_NOW = (datetime.datetime.now() - EPOCH).total_seconds()
13 | TIME_NOW = None
14 | # Default names for functions which change the item-item cooccurrence matrix
15 | SIM_COOCCUR = "cooccurrence"
16 | SIM_JACCARD = "jaccard"
17 | SIM_LIFT = "lift"
18 | 
19 | INDEXED_ITEMS = "indexedItems"
20 | INDEXED_USERS = "indexedUsers"
21 | 
22 | 


--------------------------------------------------------------------------------
/tests/unit/test_gpu_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import sys
 5 | import pytest
 6 | from reco_utils.common.gpu_utils import get_number_gpus, clear_memory_all_gpus, get_cuda_version, get_cudnn_version
 7 | 
 8 | 
 9 | @pytest.mark.gpu
10 | def test_get_number_gpus():
11 |     assert get_number_gpus() >= 1
12 | 
13 | 
14 | @pytest.mark.gpu
15 | @pytest.mark.skip(reason="TODO: Implement this")
16 | def test_clear_memory_all_gpus():
17 |     pass
18 | 
19 | 
20 | @pytest.mark.gpu
21 | @pytest.mark.skipif(sys.platform == 'win32', reason="Not implemented on Windows")
22 | def test_get_cuda_version():
23 |     assert get_cuda_version() > "9.0.0"
24 | 
25 | 
26 | @pytest.mark.gpu
27 | def test_get_cudnn_version():
28 |     assert get_cudnn_version() > "7.0.0" 


--------------------------------------------------------------------------------
/notebooks/reco_utils/common/spark_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | try:
 5 |     from pyspark.sql import SparkSession
 6 | except ImportError:
 7 |     pass  # skip this import if we are in pure python environment
 8 | 
 9 | 
10 | def start_or_get_spark(app_name="Sample", url="local[*]", memory="10G"):
11 |     """Start Spark if not started
12 | 
13 |     Args:
14 |         app_name (str): Set name of the application
15 |         url (str): URL for spark master.
16 |         memory (str): Size of memory for spark driver.
17 |         
18 |     Returns:
19 |         obj: Spark context.
20 |     """
21 |     spark = (
22 |         SparkSession.builder.appName(app_name)
23 |         .master(url)
24 |         .config("spark.driver.memory", memory)
25 |         .getOrCreate()
26 |     )
27 | 
28 |     return spark
29 | 


--------------------------------------------------------------------------------
/notebooks/README.md:
--------------------------------------------------------------------------------
 1 | # Run your notebooks as-is on AzureML Service
 2 | 
 3 | This folder demonstrates how to build, train and test notebooks from our [Recommendation Project](http://github.com/Microsoft/Recommenders) project so you can make your own Recommendation system. 
 4 | 
 5 | We use MLOps to manually or automatically trigger builds due to Github PRs and changes.  The control plane is in DevOps and AzureML Service provides numerous capabilities to track your assets when running Jupyter notebooks local or in the cloud. 
 6 | 
 7 | ## AzureML improves your MLOps experience!
 8 | 
 9 | ### Build Definitions
10 | 
11 | [Run Recommender Notebooks](https://dev.azure.com/emcmanu/NotebookPipelineDemo/_build?definitionId=15)
12 | 
13 | [Validate Notebook Changes](https://dev.azure.com/emcmanu/NotebookPipelineDemo/_apps/hub/ms.vss-ciworkflow.build-ci-hub?_a=edit-build-definition&id=14)
14 | 
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
 1 | # Validate notebooks in repo
 2 | 
 3 | trigger:
 4 | - master
 5 | 
 6 | pool:
 7 |   vmImage: 'Ubuntu 16.04'
 8 | 
 9 | steps:
10 | - task: UsePythonVersion@0
11 |   displayName: 'Use Python 3.6'
12 |   inputs:
13 |     versionSpec: 3.6
14 | 
15 | - task: RunNotebook@0
16 |   inputs:
17 |     azureSubscription: 'emcmanu_test'
18 |     targetType: 'custom'
19 |     computeTarget: 'gpucluster'
20 |     pathFilter: 'notebooks/*.ipynb'
21 |     condaDependencies: 'scripts/reco_full.yaml'
22 |     commonFiles: 'notebooks/reco_utils'
23 |     dockerBaseImage: 'mcr.microsoft.com/azureml/base-gpu:intelmpi2018.3-cuda9.0-cudnn7-ubuntu16.04'
24 | 
25 | - task: PublishBuildArtifacts@1
26 |   displayName: 'Publish Artifact: devops-for-ai'
27 |   inputs:
28 |     ArtifactName: 'devops-for-ai'
29 |     publishLocation: 'container'
30 |     pathtoPublish: '$(Build.ArtifactStagingDirectory)' 
31 |     TargetPath: '$(Build.ArtifactStagingDirectory)'


--------------------------------------------------------------------------------
/tests/unit/test_sweep.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import pytest
 3 | 
 4 | from reco_utils.evaluation.parameter_sweep import generate_param_grid
 5 | 
 6 | 
 7 | @pytest.fixture(scope="module")
 8 | def parameter_dictionary():
 9 |     params = {
10 |         "param1": [1, 2, 3],
11 |         "param2": [4, 5, 6],
12 |         "param3": 1
13 |     }
14 | 
15 |     return params
16 | 
17 | 
18 | def test_param_sweep(parameter_dictionary):
19 |     params_grid = generate_param_grid(parameter_dictionary)
20 | 
21 |     assert params_grid == [
22 |         {'param1': 1, 'param2': 4, 'param3': 1}, {'param1': 1, 'param2': 5, 'param3': 1},
23 |         {'param1': 1, 'param2': 6, 'param3': 1}, {'param1': 2, 'param2': 4, 'param3': 1},
24 |         {'param1': 2, 'param2': 5, 'param3': 1}, {'param1': 2, 'param2': 6, 'param3': 1},
25 |         {'param1': 3, 'param2': 4, 'param3': 1}, {'param1': 3, 'param2': 5, 'param3': 1},
26 |         {'param1': 3, 'param2': 6, 'param3': 1}
27 |     ]
28 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/common/general_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import os
 5 | 
 6 | 
 7 | def invert_dictionary(dictionary):
 8 |     """Invert a dictionary
 9 |     NOTE: If the dictionary has unique keys and unique values, the invertion would be perfect. However, if there are
10 |     repeated values, the invertion can take different keys
11 |     
12 |     Args: 
13 |         dictionary (dict): A dictionary
14 |     
15 |     Returns:
16 |         dict: inverted dictionary
17 |     """
18 |     return {v: k for k, v in dictionary.items()}
19 | 
20 | 
21 | def get_number_processors():
22 |     """Get the number of processors in a CPU.
23 |     
24 |     Returns:
25 |         int: Number of processors.
26 |     """
27 |     try:
28 |         num = os.cpu_count()
29 |     except Exception:
30 |         import multiprocessing  # force exception in case mutiprocessing is not installed
31 | 
32 |         num = multiprocessing.cpu_count()
33 |     return num
34 | 


--------------------------------------------------------------------------------
/reco_utils/common/notebook_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import os
 5 | 
 6 | 
 7 | def is_jupyter():
 8 |     """Check if the module is running on Jupyter notebook/console
 9 | 
10 |     Returns:
11 |         bool: True if the module is running on Jupyter notebook or Jupyter console,
12 |         False otherwise.
13 |     """
14 |     try:
15 |         shell_name = get_ipython().__class__.__name__
16 |         if shell_name == 'ZMQInteractiveShell':
17 |             return True
18 |         else:
19 |             return False
20 |     except NameError:
21 |         return False
22 | 
23 | 
24 | def is_databricks():
25 |     """Check if the module is running on Databricks
26 | 
27 |     Returns:
28 |         bool: True if the module is running on Databricks notebook,
29 |         False otherwise.
30 |     """
31 |     try:
32 |         if os.path.realpath(".") == "/databricks/driver":
33 |             return True
34 |         else:
35 |             return False
36 |     except NameError:
37 |         return False
38 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/common/notebook_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import os
 5 | 
 6 | 
 7 | def is_jupyter():
 8 |     """Check if the module is running on Jupyter notebook/console
 9 | 
10 |     Returns:
11 |         bool: True if the module is running on Jupyter notebook or Jupyter console,
12 |         False otherwise.
13 |     """
14 |     try:
15 |         shell_name = get_ipython().__class__.__name__
16 |         if shell_name == 'ZMQInteractiveShell':
17 |             return True
18 |         else:
19 |             return False
20 |     except NameError:
21 |         return False
22 | 
23 | 
24 | def is_databricks():
25 |     """Check if the module is running on Databricks
26 | 
27 |     Returns:
28 |         bool: True if the module is running on Databricks notebook,
29 |         False otherwise.
30 |     """
31 |     try:
32 |         if os.path.realpath(".") == "/databricks/driver":
33 |             return True
34 |         else:
35 |             return False
36 |     except NameError:
37 |         return False
38 | 


--------------------------------------------------------------------------------
/tests/unit/test_timer.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | 
 5 | import pytest
 6 | import time
 7 | from reco_utils.common.timer import Timer
 8 | 
 9 | 
10 | TOL = 0.01
11 | 
12 | 
13 | @pytest.fixture(scope="function")
14 | def t():
15 |     return Timer()
16 | 
17 | 
18 | def test_no_time(t):
19 |     assert t.interval == 0
20 |     assert t.running == False
21 | 
22 | 
23 | def test_stop_before_start(t):
24 |     with pytest.raises(ValueError):
25 |         t.stop()
26 | 
27 | 
28 | def test_interval_before_stop(t):
29 |     t.start()
30 |     with pytest.raises(ValueError):
31 |         t.interval
32 | 
33 | 
34 | def test_timer(t):
35 |     t.start()
36 |     assert t.running == True
37 |     time.sleep(1)
38 |     t.stop()
39 |     assert t.running == False
40 |     assert t.interval == pytest.approx(1, abs=TOL)
41 |     with Timer() as t2:
42 |         assert t2.running == True
43 |         time.sleep(1)
44 |     assert t2.interval == pytest.approx(1, abs=TOL)
45 |     assert t2.running == False
46 | 
47 | 
48 | def test_timer_format(t):
49 |     assert str(t) == "0:00:00"
50 |     assert str(t.interval) == "0"
51 | 


--------------------------------------------------------------------------------
/tests/unit/test_notebook_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | import os
 4 | import pytest
 5 | import papermill as pm
 6 | from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
 7 | from reco_utils.common.notebook_utils import is_jupyter, is_databricks
 8 | 
 9 | 
10 | @pytest.mark.notebooks
11 | def test_is_jupyter():
12 |     # Test on the terminal
13 |     assert is_jupyter() is False
14 |     assert is_databricks() is False
15 | 
16 |     # Test on Jupyter notebook
17 |     path = os.path.join("tests", "unit", "test_notebook_utils.ipynb")
18 |     pm.execute_notebook(
19 |         path,
20 |         OUTPUT_NOTEBOOK,
21 |         kernel_name=KERNEL_NAME,
22 |     )
23 |     nb = pm.read_notebook(OUTPUT_NOTEBOOK)
24 |     df = nb.dataframe
25 |     result_is_jupyter = df.loc[df["name"] == "is_jupyter", "value"].values[0]
26 |     assert result_is_jupyter is True
27 |     result_is_databricks = df.loc[df["name"] == "is_databricks", "value"].values[0]
28 |     assert result_is_databricks is False
29 | 
30 | # @pytest.mark.notebooks
31 | # def test_is_databricks():
32 | #     TODO Currently, we cannot pytest modules on Databricks
33 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/dataset/url_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import os
 5 | from urllib.request import urlretrieve
 6 | import logging
 7 | 
 8 | log = logging.getLogger(__name__)
 9 | 
10 | 
11 | def maybe_download(url, filename, work_directory=".", expected_bytes=None):
12 |     """Download a file if it is not already downloaded.
13 |     
14 |     Args:
15 |         filename (str): File name.
16 |         work_directory (str): Working directory.
17 |         url (str): URL of the file to download.
18 |         expected_bytes (int): Expected file size in bytes.
19 | 
20 |     Returns:
21 |         str: File path of the file downloaded.
22 |     """
23 |     filepath = os.path.join(work_directory, filename)
24 |     if not os.path.exists(filepath):
25 |         filepath, _ = urlretrieve(url, filepath)
26 |     else:
27 |         log.debug("File {} already downloaded".format(filepath))
28 |     if expected_bytes is not None:
29 |         statinfo = os.stat(filepath)
30 |         if statinfo.st_size != expected_bytes:
31 |             os.remove(filepath)
32 |             raise IOError("Failed to verify {}".format(filepath))
33 | 
34 |     return filepath
35 | 


--------------------------------------------------------------------------------
/reco_utils/common/general_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import os
 5 | import psutil
 6 | 
 7 | 
 8 | def invert_dictionary(dictionary):
 9 |     """Invert a dictionary
10 |     NOTE: If the dictionary has unique keys and unique values, the inversion would be perfect. However, if there are
11 |     repeated values, the inversion can take different keys
12 | 
13 |     Args:
14 |         dictionary (dict): A dictionary
15 | 
16 |     Returns:
17 |         dict: inverted dictionary
18 |     """
19 |     return {v: k for k, v in dictionary.items()}
20 | 
21 | 
22 | def get_physical_memory():
23 |     """Get the physical memory in GBs.
24 | 
25 |     Returns:
26 |         float: Physical memory in GBs.
27 |     """
28 |     return psutil.virtual_memory()[0] / 1073741824
29 | 
30 | 
31 | def get_number_processors():
32 |     """Get the number of processors in a CPU.
33 | 
34 |     Returns:
35 |         int: Number of processors.
36 |     """
37 |     try:
38 |         num = os.cpu_count()
39 |     except Exception:
40 |         import multiprocessing  # force exception in case multiprocessing is not installed
41 | 
42 |         num = multiprocessing.cpu_count()
43 |     return num
44 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation. All rights reserved.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE
22 | 


--------------------------------------------------------------------------------
/notebooks/scripts/reco_full.yaml:
--------------------------------------------------------------------------------
 1 | # 
 2 | # To create the conda environment:
 3 | # $ conda env create -f reco_full.yaml
 4 | # 
 5 | # To update the conda environment:
 6 | # $ conda env update -f reco_full.yaml
 7 | # 
 8 | # To register the conda environment in Jupyter:
 9 | # $ conda activate reco_full
10 | # $ python -m ipykernel install --user --name reco_full --display-name "Python (reco_full)"
11 | # 
12 | name: reco_full
13 | channels:
14 | - defaults
15 | - conda-forge
16 | - pytorch
17 | - fastai
18 | dependencies:
19 | - scipy>=1.0.0
20 | - mock==2.0.0
21 | - scikit-surprise>=1.0.6
22 | - fastparquet>=0.1.6
23 | - scikit-learn==0.19.1
24 | - pyspark==2.3.1
25 | - tensorflow-gpu==1.12.0
26 | - seaborn>=0.8.1
27 | - matplotlib>=2.2.2
28 | - pandas>=0.23.4
29 | - pytorch>=1.0.0
30 | - ipykernel>=4.6.1
31 | - jupyter>=1.0.0
32 | - gitpython>=2.1.8
33 | - dask>=0.17.1
34 | - numpy>=1.13.3
35 | - python==3.6.8
36 | - pymongo>=3.6.1
37 | - pytest>=3.6.4
38 | - pyarrow>=0.8.0
39 | - numba>=0.38.1
40 | - pip:
41 |   - azureml-sdk[notebooks,contrib]
42 |   - black>=18.6b4
43 |   - dataclasses>=0.6
44 |   - azure-storage>=0.36.0
45 |   - hyperopt==0.1.1
46 |   - nvidia-ml-py3>=7.352.0
47 |   - pydocumentdb>=2.3.3
48 |   - papermill>=0.15.0
49 |   - fastai==1.0.46
50 |   - idna==2.7
51 |   - memory-profiler>=0.54.0
52 | 


--------------------------------------------------------------------------------
/scripts/repo_metrics/README.md:
--------------------------------------------------------------------------------
 1 | # Repository Metrics
 2 | 
 3 | [![Build status](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_apis/build/status/Recommenders/Recommenders%20repo%20stats)](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_build/latest?definitionId=5206)
 4 | 
 5 | We developed a script that allows us to track the metrics of the Recommenders repo. Some of the metrics we can track are listed here:
 6 | 
 7 | * Number of stars
 8 | * Number of forks
 9 | * Number of clones
10 | * Number of views
11 | * Number of lines of code
12 | 
13 | To see the full list of metrics, see [git_stats.py](scripts/repo_metrics/git_stats.py)
14 | 
15 | The first step is to set up the credentials, copy the configuration file and fill up the credentials of GitHub and CosmosDB:
16 | 
17 |     cp scripts/repo_metrics/config_template.py scripts/repo_metrics/config.py
18 | 
19 | To track the current state of the repository and save it to CosmosDB:
20 | 
21 |     python scripts/repo_metrics/track_metrics.py --github_repo "https://github.com/Microsoft/Recommenders" --save_to_database
22 | 
23 | To track an event related to this repository and save it to CosmosDB:
24 | 
25 |     python scripts/repo_metrics/track_metrics.py --event "Today we did our first blog of the project" --event_date 2018-12-01 --save_to_database
26 | 
27 | 


--------------------------------------------------------------------------------
/reco_utils/evaluation/parameter_sweep.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | #
 4 | # Utility functions for parameter sweep.
 5 | 
 6 | from itertools import product
 7 | 
 8 | 
 9 | def generate_param_grid(params):
10 |     """Generator of parameter grids
11 |     Generate parameter lists from a parameter dictionary in the form of
12 |     {
13 |         "param1": [value1, value2],
14 |         "param2": [value1, value2]
15 |     }
16 | 
17 |     to
18 | 
19 |     [
20 |         {"param1": value1, "param2": value1},
21 |         {"param1": value2, "param2": value1},
22 |         {"param1": value1, "param2": value2},
23 |         {"param1": value2, "param2": value2}
24 |     ]
25 | 
26 |     Args:
27 |         param_dict (dict): dictionary of parameters and values (in a list).
28 | 
29 |     Return:
30 |         list: A list of parameter dictionary string that can be fed directly into
31 |         model builder as keyword arguments.
32 |     """
33 |     param_new = {}
34 |     param_fixed = {}
35 | 
36 |     for key, value in params.items():
37 |         if isinstance(value, list):
38 |             param_new[key] = value
39 |         else:
40 |             param_fixed[key] = value
41 | 
42 |     items = sorted(param_new.items())
43 |     keys, values = zip(*items)
44 | 
45 |     params_exp = []
46 |     for v in product(*values):
47 |         param_exp = dict(zip(keys, v))
48 |         param_exp.update(param_fixed)
49 |         params_exp.append(param_exp)
50 | 
51 |     return params_exp
52 | 
53 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/evaluation/parameter_sweep.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | #
 4 | # Utility functions for parameter sweep.
 5 | 
 6 | from itertools import product
 7 | 
 8 | 
 9 | def generate_param_grid(params):
10 |     """Generator of parameter grids
11 |     Generate parameter lists from a paramater dictionary in the form of
12 |     {
13 |         "param1": [value1, value2],
14 |         "param2": [value1, value2]
15 |     }
16 | 
17 |     to
18 | 
19 |     [
20 |         {"param1": value1, "param2": value1},
21 |         {"param1": value2, "param2": value1},
22 |         {"param1": value1, "param2": value2},
23 |         {"param1": value2, "param2": value2}
24 |     ]
25 | 
26 |     Args:
27 |         param_dict (dict): dictionary of parameters and values (in a list).
28 | 
29 |     Return:
30 |         list: A list of parameter dictionary string that can be fed directly into
31 |         model builder as keyword arguments.
32 |     """
33 |     param_new = {}
34 |     param_fixed = {}
35 | 
36 |     for key, value in params.items():
37 |         if isinstance(value, list):
38 |             param_new[key] = value
39 |         else:
40 |             param_fixed[key] = value
41 | 
42 |     items = sorted(param_new.items())
43 |     keys, values = zip(*items)
44 | 
45 |     params_exp = []
46 |     for v in product(*values):
47 |         param_exp = dict(zip(keys, v))
48 |         param_exp.update(param_fixed)
49 |         params_exp.append(param_exp)
50 | 
51 |     return params_exp
52 | 
53 | 


--------------------------------------------------------------------------------
/reco_utils/common/spark_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import os
 5 | import sys
 6 | 
 7 | 
 8 | try:
 9 |     from pyspark.sql import SparkSession
10 | except ImportError:
11 |     pass  # skip this import if we are in pure python environment
12 | 
13 | 
14 | def start_or_get_spark(
15 |     app_name="Sample", 
16 |     url="local[*]", 
17 |     memory="10G", 
18 |     packages=None, 
19 |     jars=None, 
20 |     repository=None
21 |     ):
22 |     """Start Spark if not started
23 | 
24 |     Args:
25 |         app_name (str): Set name of the application
26 |         url (str): URL for spark master
27 |         memory (str): Size of memory for spark driver
28 |         packages (list): list of packages to install
29 |         jars (list): list of jar files to add
30 |         repository (str): The maven repository
31 | 
32 |     Returns:
33 |         obj: Spark context.
34 |     """
35 | 
36 |     submit_args = ''
37 |     if packages is not None:
38 |         submit_args = '--packages {} '.format(','.join(packages))
39 |     if jars is not None:
40 |         submit_args += '--jars {} '.format(','.join(jars))
41 |     if repository is not None:
42 |         submit_args += "--repositories {}".format(repository)
43 |     if submit_args:
44 |         os.environ['PYSPARK_SUBMIT_ARGS'] = '{} pyspark-shell'.format(submit_args)
45 | 
46 |     spark = (
47 |         SparkSession.builder.appName(app_name)
48 |         .master(url)
49 |         .config("spark.driver.memory", memory)
50 |         .getOrCreate()
51 |     )
52 | 
53 |     return spark
54 | 


--------------------------------------------------------------------------------
/tests/unit/test_notebook_utils.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {
 6 |     "collapsed": true
 7 |    },
 8 |    "source": [
 9 |     "# This is a test notebook for reco_utils.common.notebook_utils module"
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "code",
14 |    "execution_count": null,
15 |    "metadata": {},
16 |    "outputs": [],
17 |    "source": [
18 |     "# set the environment path to find Recommenders\n",
19 |     "import sys\n",
20 |     "sys.path.append(\"../../\")\n",
21 |     "\n",
22 |     "import papermill as pm\n",
23 |     "from reco_utils.common.notebook_utils import is_jupyter, is_databricks"
24 |    ]
25 |   },
26 |   {
27 |    "cell_type": "code",
28 |    "execution_count": null,
29 |    "metadata": {},
30 |    "outputs": [],
31 |    "source": [
32 |     "pm.record(\"is_jupyter\", is_jupyter())\n",
33 |     "pm.record(\"is_databricks\", is_databricks())"
34 |    ]
35 |   },
36 |   {
37 |    "cell_type": "code",
38 |    "execution_count": null,
39 |    "metadata": {},
40 |    "outputs": [],
41 |    "source": []
42 |   }
43 |  ],
44 |  "metadata": {
45 |   "celltoolbar": "Tags",
46 |   "kernelspec": {
47 |    "display_name": "Python 3",
48 |    "language": "python",
49 |    "name": "python3"
50 |   },
51 |   "language_info": {
52 |    "codemirror_mode": {
53 |     "name": "ipython",
54 |     "version": 3
55 |    },
56 |    "file_extension": ".py",
57 |    "mimetype": "text/x-python",
58 |    "name": "python",
59 |    "nbconvert_exporter": "python",
60 |    "pygments_lexer": "ipython3",
61 |    "version": "3.6.0"
62 |   }
63 |  },
64 |  "nbformat": 4,
65 |  "nbformat_minor": 1
66 | }
67 | 


--------------------------------------------------------------------------------
/tests/unit/test_notebooks_pyspark.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import pytest
 5 | import papermill as pm
 6 | from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
 7 | 
 8 | 
 9 | @pytest.mark.notebooks
10 | @pytest.mark.spark
11 | def test_als_pyspark_runs(notebooks):
12 |     notebook_path = notebooks["als_pyspark"]
13 |     pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
14 | 
15 | 
16 | @pytest.mark.notebooks
17 | @pytest.mark.spark
18 | def test_data_split_runs(notebooks):
19 |     notebook_path = notebooks["data_split"]
20 |     pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
21 | 
22 | 
23 | @pytest.mark.notebooks
24 | @pytest.mark.spark
25 | def test_als_deep_dive_runs(notebooks):
26 |     notebook_path = notebooks["als_deep_dive"]
27 |     pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
28 | 
29 | 
30 | @pytest.mark.notebooks
31 | @pytest.mark.spark
32 | def test_evaluation_runs(notebooks):
33 |     notebook_path = notebooks["evaluation"]
34 |     pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
35 | 
36 | 
37 | 
38 | @pytest.mark.notebooks
39 | @pytest.mark.spark
40 | def test_spark_tuning(notebooks):
41 |     notebook_path = notebooks["spark_tuning"]
42 |     pm.execute_notebook(
43 |         notebook_path,
44 |         OUTPUT_NOTEBOOK,
45 |         kernel_name=KERNEL_NAME,
46 |         parameters=dict(
47 |             NUMBER_CORES="*",
48 |             NUMBER_ITERATIONS=3,
49 |             RANK=[5, 5],
50 |             REG=[0.1, 0.01]
51 |         )
52 |     )
53 | 
54 | 


--------------------------------------------------------------------------------
/tests/ci/install_requirements.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (C) Microsoft Corporation. All rights reserved.​
 4 | #  ​
 5 | # Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual,
 6 | # royalty-free right to use, copy, and modify the software code provided by us
 7 | # ('Software Code'). You may not sublicense the Software Code or any use of it
 8 | # (except to your affiliates and to vendors to perform work on your behalf)
 9 | # through distribution, network access, service agreement, lease, rental, or
10 | # otherwise. This license does not purport to express any claim of ownership over
11 | # data you may have shared with Microsoft in the creation of the Software Code.
12 | # Unless applicable law gives you more rights, Microsoft reserves all other
13 | # rights not expressly granted herein, whether by implication, estoppel or
14 | # otherwise. ​
15 | #  ​
16 | # THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS
17 | # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 | # MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 | # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
23 | # IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 | # ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
25 | # POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | 
28 | python --version
29 | pip install azure-cli==2.0.46
30 | pip install --upgrade azureml-sdk[cli]
31 | pip install -r requirements.txt


--------------------------------------------------------------------------------
/notebooks/reco_utils/common/python_utils.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | 
 4 | def exponential_decay(value, max_val, half_life):
 5 |     """Compute decay factor for a given value based on an exponential decay
 6 |     Values greater than max_val will be set to 1
 7 |     Args:
 8 |         value (numeric): value to calculate decay factor
 9 |         max_val (numeric): value at which decay factor will be 1
10 |         half_life (numeric): value at which decay factor will be 0.5
11 |     Returns:
12 |         float: decay factor
13 |     """
14 | 
15 |     return np.minimum(1.0, np.power(0.5, (max_val - value) / half_life))
16 | 
17 | 
18 | def jaccard(cooccurrence):
19 |     """Helper method to calculate the Jaccard similarity of a matrix of co-occurrences
20 |     Args:
21 |         cooccurrence (np.array): the symmetric matrix of co-occurrences of items
22 |     Returns:
23 |         np.array: The matrix of Jaccard similarities between any two items
24 |     """
25 | 
26 |     diag = cooccurrence.diagonal()
27 |     diag_rows = np.expand_dims(diag, axis=0)
28 |     diag_cols = np.expand_dims(diag, axis=1)
29 | 
30 |     with np.errstate(invalid="ignore", divide="ignore"):
31 |         result = cooccurrence / (diag_rows + diag_cols - cooccurrence)
32 | 
33 |     return np.array(result)
34 | 
35 | 
36 | def lift(cooccurrence):
37 |     """Helper method to calculate the Lift of a matrix of co-occurrences
38 |     Args:
39 |         cooccurrence (np.array): the symmetric matrix of co-occurrences of items
40 |     Returns:
41 |         np.array: The matrix of Lifts between any two items
42 |     """
43 | 
44 |     diag = cooccurrence.diagonal()
45 |     diag_rows = np.expand_dims(diag, axis=0)
46 |     diag_cols = np.expand_dims(diag, axis=1)
47 | 
48 |     with np.errstate(invalid="ignore", divide="ignore"):
49 |         result = cooccurrence / (diag_rows * diag_cols)
50 | 
51 |     return np.array(result)
52 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Run your notebooks as-is on AzureML Service with MLOps extensions
 2 | 
 3 | 
 4 | We use MLOps to manually or automatically trigger builds due to Github PRs and changes.  The control plane is in DevOps and AzureML Service provides numerous capabilities to track your assets when running Jupyter notebooks local or in the cloud. 
 5 | 
 6 | ## AzureML improves your MLOps experience!
 7 | 
 8 | ### Build Definitions
 9 | 
10 | [Run Recommender Notebooks](https://dev.azure.com/emcmanu/NotebookPipelineDemo/_build?definitionId=15)
11 | 
12 | [Validate Notebook Changes](https://dev.azure.com/emcmanu/NotebookPipelineDemo/_apps/hub/ms.vss-ciworkflow.build-ci-hub?_a=edit-build-definition&id=14)
13 | 
14 | This folder demonstrates how to build, train and test notebooks from our [Recommendation Project](http://github.com/Microsoft/Recommenders) project so you can make your own Recommendation system. 
15 | 
16 | # Contributing
17 | 
18 | This project welcomes contributions and suggestions.  Most contributions require you to agree to a
19 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
20 | the rights to use your contribution. For details, visit https://cla.microsoft.com.
21 | 
22 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
23 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
24 | provided by the bot. You will only need to do this once across all repos using our CLA.
25 | 
26 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
27 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
28 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
29 | 
30 | 
31 | 
32 | 
33 | 


--------------------------------------------------------------------------------
/tests/unit/test_notebooks_python.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import pytest
 5 | import papermill as pm
 6 | from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
 7 | 
 8 | 
 9 | @pytest.mark.notebooks
10 | def test_template_runs(notebooks):
11 |     notebook_path = notebooks["template"]
12 |     pm.execute_notebook(
13 |         notebook_path,
14 |         OUTPUT_NOTEBOOK,
15 |         parameters=dict(PM_VERSION=pm.__version__),
16 |         kernel_name=KERNEL_NAME,
17 |     )
18 |     nb = pm.read_notebook(OUTPUT_NOTEBOOK)
19 |     df = nb.dataframe
20 |     assert df.shape[0] == 2
21 |     check_version = df.loc[df["name"] == "checked_version", "value"].values[0]
22 |     assert check_version is True
23 | 
24 | 
25 | @pytest.mark.notebooks
26 | def test_sar_single_node_runs(notebooks):
27 |     notebook_path = notebooks["sar_single_node"]
28 |     pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
29 | 
30 | 
31 | @pytest.mark.notebooks
32 | def test_sar_deep_dive_runs(notebooks):
33 |     notebook_path = notebooks["sar_deep_dive"]
34 |     pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
35 | 
36 | 
37 | @pytest.mark.notebooks
38 | def test_baseline_deep_dive_runs(notebooks):
39 |     notebook_path = notebooks["baseline_deep_dive"]
40 |     pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
41 | 
42 | 
43 | @pytest.mark.notebooks
44 | def test_surprise_deep_dive_runs(notebooks):
45 |     notebook_path = notebooks["surprise_svd_deep_dive"]
46 |     pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
47 |     
48 | 
49 | @pytest.mark.notebooks
50 | def test_vw_deep_dive_runs(notebooks):
51 |     notebook_path = notebooks["vowpal_wabbit_deep_dive"]
52 |     pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
53 | 


--------------------------------------------------------------------------------
/reco_utils/common/timer.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | from timeit import default_timer
 5 | from datetime import timedelta
 6 | 
 7 | 
 8 | class Timer(object):
 9 |     """Timer class.
10 |     Original code: https://github.com/miguelgfierro/codebase
11 |     
12 |     Examples:
13 |         >>> import time
14 |         >>> t = Timer()
15 |         >>> t.start()
16 |         >>> time.sleep(1)
17 |         >>> t.stop()
18 |         >>> t.interval < 1
19 |         True
20 |         >>> with Timer() as t:
21 |         ...   time.sleep(1)
22 |         >>> t.interval < 1
23 |         True
24 |         >>> "Time elapsed {}".format(t) #doctest: +ELLIPSIS
25 |         'Time elapsed 1...'
26 |     """
27 | 
28 |     def __init__(self):
29 |         self._timer = default_timer
30 |         self._interval = 0
31 |         self.running = False
32 | 
33 |     def __enter__(self):
34 |         self.start()
35 |         return self
36 | 
37 |     def __exit__(self, *args):
38 |         self.stop()
39 | 
40 |     def __str__(self):
41 |         return "{:0.4f}".format(self.interval)
42 | 
43 |     def start(self):
44 |         """Start the timer."""
45 |         self.init = self._timer()
46 |         self.running = True
47 | 
48 |     def stop(self):
49 |         """Stop the timer. Calculate the interval in seconds."""
50 |         self.end = self._timer()
51 |         try:
52 |             self._interval = self.end - self.init
53 |             self.running = False
54 |         except AttributeError:
55 |             raise ValueError(
56 |                 "Timer has not been initialized: use start() or the contextual form with Timer() as t:"
57 |             )
58 | 
59 |     @property
60 |     def interval(self):
61 |         if self.running:
62 |             raise ValueError("Timer has not been stopped, please use stop().")
63 |         else:
64 |             return self._interval
65 | 
66 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/common/timer.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | from timeit import default_timer
 5 | from datetime import timedelta
 6 | 
 7 | 
 8 | class Timer(object):
 9 |     """Timer class.
10 |     Original code: https://github.com/miguelgfierro/codebase
11 |     
12 |     Examples:
13 |         >>> import time
14 |         >>> t = Timer()
15 |         >>> t.start()
16 |         >>> time.sleep(1)
17 |         >>> t.stop()
18 |         >>> t.interval < 1
19 |         True
20 |         >>> with Timer() as t:
21 |         ...   time.sleep(1)
22 |         >>> t.interval < 1
23 |         True
24 |         >>> "Time elapsed {}".format(t) #doctest: +ELLIPSIS
25 |         'Time elapsed 0:00:...'
26 |     """
27 | 
28 |     def __init__(self):
29 |         self._timer = default_timer
30 |         self._interval = 0
31 |         self.running = False
32 | 
33 |     def __enter__(self):
34 |         self.start()
35 |         return self
36 | 
37 |     def __exit__(self, *args):
38 |         self.stop()
39 | 
40 |     def __str__(self):
41 |         return str(timedelta(seconds=self._interval))
42 | 
43 |     def start(self):
44 |         """Start the timer."""
45 |         self.init = self._timer()
46 |         self.running = True
47 | 
48 |     def stop(self):
49 |         """Stop the timer. Calculate the interval in seconds."""
50 |         self.end = self._timer()
51 |         try:
52 |             self._interval = self.end - self.init
53 |             self.running = False
54 |         except AttributeError:
55 |             raise ValueError(
56 |                 "Timer has not been initialized: use start() or the contextual form with Timer() as t:"
57 |             )
58 | 
59 |     @property
60 |     def interval(self):
61 |         if self.running:
62 |             raise ValueError("Timer has not been stopped, please use stop().")
63 |         else:
64 |             return self._interval
65 | 


--------------------------------------------------------------------------------
/tests/unit/test_notebooks_gpu.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import shutil
 5 | import pytest
 6 | from reco_utils.common.gpu_utils import get_number_gpus
 7 | from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
 8 | import papermill as pm
 9 | 
10 | 
11 | @pytest.mark.notebooks
12 | @pytest.mark.gpu
13 | def test_gpu_vm():
14 |     assert get_number_gpus() >= 1
15 | 
16 | 
17 | @pytest.mark.notebooks
18 | @pytest.mark.gpu
19 | def test_fastai(notebooks):
20 |     notebook_path = notebooks["fastai"]
21 |     pm.execute_notebook(
22 |         notebook_path,
23 |         OUTPUT_NOTEBOOK,
24 |         kernel_name=KERNEL_NAME,
25 |         parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE="100k", EPOCHS=1),
26 |     )
27 | 
28 | 
29 | @pytest.mark.notebooks
30 | @pytest.mark.gpu
31 | def test_ncf(notebooks):
32 |     notebook_path = notebooks["ncf"]
33 |     pm.execute_notebook(
34 |         notebook_path,
35 |         OUTPUT_NOTEBOOK,
36 |         kernel_name=KERNEL_NAME,
37 |         parameters=dict(
38 |             TOP_K=10, MOVIELENS_DATA_SIZE="100k", EPOCHS=1, BATCH_SIZE=1024
39 |         ),
40 |     )
41 | 
42 | 
43 | @pytest.mark.notebooks
44 | @pytest.mark.gpu
45 | def test_ncf_deep_dive(notebooks):
46 |     notebook_path = notebooks["ncf_deep_dive"]
47 |     pm.execute_notebook(
48 |         notebook_path,
49 |         OUTPUT_NOTEBOOK,
50 |         kernel_name=KERNEL_NAME,
51 |         parameters=dict(
52 |             TOP_K=10, MOVIELENS_DATA_SIZE="100k", EPOCHS=1, BATCH_SIZE=2048
53 |         ),
54 |     )
55 | 
56 | 
57 | @pytest.mark.notebooks
58 | @pytest.mark.gpu
59 | def test_wide_deep(notebooks):
60 |     notebook_path = notebooks["wide_deep"]
61 | 
62 |     MODEL_DIR = 'model_checkpoints'
63 |     params = {
64 |         'MOVIELENS_DATA_SIZE': '100k',
65 |         'EPOCHS': 1,
66 |         'EVALUATE_WHILE_TRAINING': False,
67 |         'MODEL_DIR': MODEL_DIR,
68 |         'EXPORT_DIR_BASE': MODEL_DIR,
69 |         'RATING_METRICS': ['rmse', 'mae'],
70 |         'RANKING_METRICS': ['ndcg_at_k', 'precision_at_k'],
71 |     }
72 | 
73 |     pm.execute_notebook(
74 |         notebook_path,
75 |         OUTPUT_NOTEBOOK,
76 |         kernel_name=KERNEL_NAME,
77 |         parameters=params,
78 |     )
79 | 
80 |     shutil.rmtree(MODEL_DIR, ignore_errors=True)
81 | 


--------------------------------------------------------------------------------
/tests/unit/test_deeprec_model.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import os
 3 | from reco_utils.recommender.deeprec.deeprec_utils import prepare_hparams, download_deeprec_resources
 4 | from reco_utils.recommender.deeprec.models.xDeepFM import XDeepFMModel
 5 | from reco_utils.recommender.deeprec.models.dkn import DKN
 6 | from reco_utils.recommender.deeprec.IO.iterator import FFMTextIterator
 7 | from reco_utils.recommender.deeprec.IO.dkn_iterator import DKNTextIterator
 8 | 
 9 | 
10 | @pytest.fixture
11 | def resource_path():
12 |     return os.path.dirname(os.path.realpath(__file__))
13 | 
14 | 
15 | @pytest.mark.gpu
16 | @pytest.mark.deeprec
17 | def test_xdeepfm_component_definition(resource_path):
18 |     data_path = os.path.join(resource_path, "..", "resources", "deeprec", "xdeepfm")
19 |     yaml_file = os.path.join(data_path, "xDeepFM.yaml")
20 | 
21 |     if not os.path.exists(yaml_file):
22 |         download_deeprec_resources(
23 |             "https://recodatasets.blob.core.windows.net/deeprec/",
24 |             data_path,
25 |             "xdeepfmresources.zip",
26 |         )
27 | 
28 |     hparams = prepare_hparams(yaml_file)
29 |     model = XDeepFMModel(hparams, FFMTextIterator)
30 | 
31 |     assert model.logit is not None
32 |     assert model.update is not None
33 |     assert model.iterator is not None
34 | 
35 | 
36 | @pytest.mark.gpu
37 | @pytest.mark.deeprec
38 | def test_dkn_component_definition(resource_path):
39 |     data_path = os.path.join(resource_path, "..", "resources", "deeprec", "dkn")
40 |     yaml_file = os.path.join(data_path, "dkn.yaml")
41 |     wordEmb_file = os.path.join(data_path, "word_embeddings_100.npy")
42 |     entityEmb_file = os.path.join(data_path, "TransE_entity2vec_100.npy")
43 | 
44 |     if not os.path.exists(yaml_file):
45 |         download_deeprec_resources(
46 |             "https://recodatasets.blob.core.windows.net/deeprec/",
47 |             data_path,
48 |             "dknresources.zip",
49 |         )
50 | 
51 |     hparams = prepare_hparams(
52 |         yaml_file,
53 |         wordEmb_file=wordEmb_file,
54 |         entityEmb_file=entityEmb_file,
55 |         epochs=5,
56 |         learning_rate=0.0001,
57 |     )
58 |     assert hparams is not None
59 |     model = DKN(hparams, DKNTextIterator)
60 | 
61 |     assert model.logit is not None
62 |     assert model.update is not None
63 |     assert model.iterator is not None
64 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/dataset/pandas_df_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import pandas as pd
 5 | 
 6 | from reco_utils.common.constants import (
 7 |     DEFAULT_USER_COL,
 8 |     DEFAULT_ITEM_COL,
 9 |     DEFAULT_RATING_COL,
10 | )
11 | 
12 | 
13 | def user_item_pairs(
14 |     user_df,
15 |     item_df,
16 |     user_col=DEFAULT_USER_COL,
17 |     item_col=DEFAULT_ITEM_COL,
18 |     user_item_filter_df=None,
19 |     shuffle=True,
20 | ):
21 |     """Get all pairs of users and items data.
22 | 
23 |     Args:
24 |         user_df (pd.DataFrame): User data containing unique user ids and maybe their features.
25 |         item_df (pd.DataFrame): Item data containing unique item ids and maybe their features.
26 |         user_col (str): User id column name.
27 |         item_col (str): Item id column name.
28 |         user_item_filter_df (pd.DataFrame): User-item pairs to be used as a filter.
29 |         shuffle (bool): If True, shuffles the result.
30 | 
31 |     Returns:
32 |         pd.DataFrame: All pairs of user-item from user_df and item_df, excepting the pairs in user_item_filter_df
33 |     """
34 | 
35 |     # Get all user-item pairs
36 |     user_df["key"] = 1
37 |     item_df["key"] = 1
38 |     users_items = user_df.merge(item_df, on="key")
39 | 
40 |     user_df.drop("key", axis=1, inplace=True)
41 |     item_df.drop("key", axis=1, inplace=True)
42 |     users_items.drop("key", axis=1, inplace=True)
43 | 
44 |     # Filter
45 |     if user_item_filter_df is not None:
46 |         users_items = filter_by(users_items, user_item_filter_df, [user_col, item_col])
47 | 
48 |     if shuffle:
49 |         users_items = users_items.sample(frac=1).reset_index(drop=True)
50 | 
51 |     return users_items
52 | 
53 | 
54 | def filter_by(df, filter_by_df, filter_by_cols):
55 |     """From the input DataFrame (df), remove the records whose target column (filter_by_cols) values are
56 |     exist in the filter-by DataFrame (filter_by_df)
57 | 
58 |     Args:
59 |         df (pd.DataFrame): Source dataframe.
60 |         filter_by_df (pd.DataFrame): Filter dataframe.
61 |         filter_by_cols (iterable of str): Filter columns.
62 | 
63 |     Returns:
64 |         pd.DataFrame: Dataframe filtered by filter_by_df on filter_by_cols
65 |     """
66 | 
67 |     return df.loc[
68 |         ~df.set_index(filter_by_cols).index.isin(
69 |             filter_by_df.set_index(filter_by_cols).index
70 |         )
71 |     ]
72 | 


--------------------------------------------------------------------------------
/reco_utils/azureml/azureml_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import os
 5 | 
 6 | from azureml.core import Workspace
 7 | 
 8 | 
 9 | def get_or_create_workspace(
10 |     config_path=None,
11 |     subscription_id=None,
12 |     resource_group=None,
13 |     workspace_name=None,
14 |     workspace_region=None,
15 | ):
16 |     """Get or create AzureML Workspace this will save the config to the path specified for later use
17 | 
18 |     Args:
19 |         config_path (str): optional directory to look for / store config.json file (defaults to current directory)
20 |         subscription_id (str): subscription id
21 |         resource_group (str): resource group
22 |         workspace_name (str): workspace name
23 |         workspace_region (str): region
24 | 
25 |     Returns:
26 |         Workspace
27 |     """
28 | 
29 |     # use environment variables if needed
30 |     if subscription_id is None:
31 |         subscription_id = os.getenv("SUBSCRIPTION_ID")
32 |     if resource_group is None:
33 |         resource_group = os.getenv("RESOURCE_GROUP")
34 |     if workspace_name is None:
35 |         workspace_name = os.getenv("WORKSPACE_NAME")
36 |     if workspace_region is None:
37 |         workspace_region = os.getenv("WORKSPACE_REGION")
38 | 
39 |     # define fallback options in order to try
40 |     options = [
41 |         (
42 |             Workspace,
43 |             dict(
44 |                 subscription_id=subscription_id,
45 |                 resource_group=resource_group,
46 |                 workspace_name=workspace_name,
47 |             ),
48 |         ),
49 |         (Workspace.from_config, dict(path=config_path)),
50 |         (
51 |             Workspace.create,
52 |             dict(
53 |                 subscription_id=subscription_id,
54 |                 resource_group=resource_group,
55 |                 name=workspace_name,
56 |                 location=workspace_region,
57 |                 create_resource_group=True,
58 |                 exist_ok=True,
59 |             ),
60 |         ),
61 |     ]
62 | 
63 |     for function, kwargs in options:
64 |         try:
65 |             ws = function(**kwargs)
66 |             break
67 |         except Exception:
68 |             continue
69 |     else:
70 |         raise ValueError(
71 |             "Failed to get or create AzureML Workspace with the configuration information provided"
72 |         )
73 | 
74 |     ws.write_config(path=config_path)
75 |     return ws
76 | 


--------------------------------------------------------------------------------
/tests/ci/Master-CPU-pipeline.yml:
--------------------------------------------------------------------------------
 1 | # Master-CPU-pipeline.yml
 2 | # Starter pipeline
 3 | # Start with a minimal pipeline that you can customize to build and deploy your code.
 4 | # Add steps that build, run tests, deploy, and more:
 5 | # https://aka.ms/yaml
 6 | #
 7 | # use variable group name
 8 | variables:
 9 | - group: AzureKeyVaultSecrets
10 | 
11 | #trigger:
12 | #- azure-pipelines-bz
13 | #  - master
14 | 
15 | #pr:
16 | #- staging 
17 |   
18 | pool:
19 |  vmImage: 'ubuntu-16.04'
20 | 
21 | steps:
22 | 
23 | - task: UsePythonVersion@0
24 |   inputs:
25 |     versionSpec: '3.6'
26 |     architecture: 'x64'
27 |   displayName: 'Use Python 3.6'
28 | 
29 | - script: | 
30 |    az login --service-principal -u $(ClientID) -p $(ClientSecret) --tenant $(TenantID)
31 |    
32 |   displayName: 'Login to Azure'
33 | 
34 | - script: |
35 |    sed -i 's#"subscription_id": "<>"#"subscription_id": "$(SubscriptionID)"#g' ./tests/ci/config.json
36 |    echo my subscription is $(SubscriptionID)
37 |    cat ./tests/ci/config.json
38 |   displayName: 'replace subscription value'
39 | 
40 | - script: 
41 |    sed -i 's#"tests/unit_or_smoke_int"#"tests/unit"#g' ./tests/ci/runpytest.py
42 |   displayName: 'replace unit or smoke or int'
43 | 
44 | - script:
45 |     sed -i 's#"not notebooks and not spark and not gpu"#"not notebooks and not spark and not gpu"#g' ./tests/ci/runpytest.py
46 |   displayName: 'notebooks and spark and gpu new'
47 | 
48 | - bash: |
49 |     echo "##vso[task.prependpath]/data/anaconda/bin"
50 |   displayName: Add Conda to PATH
51 | 
52 | - script: 'pip install azureml-sdk'
53 |   displayName: 'install azureml-sdk'
54 |   continueOnError: true
55 |   
56 | - script: 
57 |      python scripts/generate_conda_file.py
58 |   displayName: ' generate_conda_file.py'
59 | 
60 | - script: |
61 |      chmod +x scripts/*.py
62 |      ls -al scripts
63 |      chmod +x tests/ci/*.py
64 |      ls -al tests/ci
65 |      pwd
66 |      ls -al
67 |   displayName: 'ls'
68 | 
69 | - script: |
70 |      python --version
71 |      pip install azure-cli==2.0.46
72 |      pip install --upgrade azureml-sdk[cli]
73 | #     pip install -r tests/ci/requirements.txt
74 |   displayName: 'install cli'
75 | 
76 | - script: 
77 |       python tests/ci/submitpytest.py
78 |   displayName: 'standalone pytest test persistent'
79 | 
80 | - task: PublishTestResults@2
81 |   displayName: 'Publish Test Results **/test-*.xml'
82 |   inputs:
83 |     testResultsFiles: '**/test-*.xml'
84 |     failTaskOnFailedTests: true
85 |   condition: succeededOrFailed()
86 | 


--------------------------------------------------------------------------------
/tests/ci/pytest.yml:
--------------------------------------------------------------------------------
 1 | # pytest.yml
 2 | # Starter pipeline
 3 | # Start with a minimal pipeline that you can customize to build and deploy your code.
 4 | # Add steps that build, run tests, deploy, and more:
 5 | # https://aka.ms/yaml
 6 | #
 7 | # use variable group name
 8 | variables:
 9 | - group: AzureKeyVaultSecrets
10 | 
11 | #trigger:
12 | #- azure-pipelines-bz
13 | #  - master
14 | 
15 | #pr:
16 | #- staging 
17 |   
18 | pool:
19 |  vmImage: 'ubuntu-16.04'
20 | 
21 | steps:
22 | 
23 | - task: UsePythonVersion@0
24 |   inputs:
25 |     versionSpec: '3.6'
26 |     architecture: 'x64'
27 |   displayName: 'Use Python 3.6'
28 | 
29 | - script: | 
30 |    az login --service-principal -u $(ClientID) -p $(ClientSecret) --tenant $(TenantID)
31 |    
32 |   displayName: 'Login to Azure'
33 | 
34 | - script: |
35 |    pwd
36 |    ls ./tests/ci
37 |    sed -i 's#"subscription_id": "<>"#"subscription_id": "$(SubscriptionID)"#g' ./tests/ci/config.json
38 |    echo my subscription is $(SubscriptionID)
39 |    cat ./tests/ci/config.json
40 |   displayName: 'replace subscription value'
41 | 
42 | - script: 
43 |    sed -i 's#"tests/unit_or_smoke_int"#"tests/unit"#g' ./tests/ci/runpytest.py
44 |   displayName: 'replace unit or smoke or int'
45 | 
46 | - script:
47 |     sed -i 's#"not notebooks and not spark and not gpu"#"not notebooks and not spark and not gpu"#g' ./tests/ci/runpytest.py
48 |   displayName: 'notebooks and spark and gpu new'
49 | 
50 | - bash: |
51 |     echo "##vso[task.prependpath]/data/anaconda/bin"
52 |   displayName: Add Conda to PATH
53 | 
54 | - script: 'pip install azureml-sdk'
55 |   displayName: 'install azureml-sdk'
56 |   continueOnError: true
57 |   
58 | - script: 
59 |      python scripts/generate_conda_file.py
60 |   displayName: ' generate_conda_file.py'
61 | 
62 | - script: |
63 |      chmod +x scripts/*.py
64 |      ls -al scripts
65 |      chmod +x tests/ci/*.py
66 |      ls -al tests/ci
67 |      pwd
68 |      ls -al
69 |   displayName: 'ls'
70 | 
71 | - script: |
72 |      python --version
73 |      pip install azure-cli==2.0.46
74 |      pip install --upgrade azureml-sdk[cli]
75 | #     pip install -r tests/ci/requirements.txt
76 |   displayName: 'install cli'
77 | 
78 | - script: 
79 |       python tests/ci/submitpytest.py
80 |   displayName: 'standalone pytest test persistent'
81 | 
82 | - task: PublishTestResults@2
83 |   displayName: 'Publish Test Results **/test-*.xml'
84 |   inputs:
85 |     testResultsFiles: '**/test-*.xml'
86 |     failTaskOnFailedTests: true
87 |   condition: succeededOrFailed()


--------------------------------------------------------------------------------
/tests/unit/test_python_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | """
 5 | Test common python utils
 6 | """
 7 | import numpy as np
 8 | import pytest
 9 | 
10 | from reco_utils.common.python_utils import (
11 |     exponential_decay,
12 |     jaccard,
13 |     lift
14 | )
15 | 
16 | TOL = 0.0001
17 | 
18 | 
19 | @pytest.fixture
20 | def target_matrices(scope="module"):
21 |     J1 = np.array([[1.0, 0.0, 0.5],
22 |                    [0.0, 1.0, 0.33333],
23 |                    [0.5, 0.33333, 1.0]])
24 |     J2 = np.array([[1.0, 0.0, 0.0, 0.2],
25 |                    [0.0, 1.0, 0.0, 0.0],
26 |                    [0.0, 0.0, 1.0, 0.5],
27 |                    [0.2, 0.0, 0.5, 1.0]])
28 |     L1 = np.array([[1.0, 0.0, 0.5],
29 |                    [0.0, 0.5, 0.25],
30 |                    [0.5, 0.25, 0.5]])
31 |     L2 = np.array([[0.5, 0.0, 0.0, 0.125],
32 |                    [0.0, 0.33333, 0.0, 0.0],
33 |                    [0.0, 0.0, 0.5, 0.25],
34 |                    [0.125, 0.0, 0.25, 0.25]])
35 |     return {
36 |         "jaccard1": pytest.approx(J1, TOL),
37 |         "jaccard2": pytest.approx(J2, TOL),
38 |         "lift1": pytest.approx(L1, TOL),
39 |         "lift2": pytest.approx(L2, TOL)
40 |     }
41 | 
42 | 
43 | @pytest.fixture(scope="module")
44 | def python_data():
45 |     cooccurrence1 = np.array([[1.0, 0.0, 1.0],
46 |                               [0.0, 2.0, 1.0],
47 |                               [1.0, 1.0, 2.0]])
48 |     cooccurrence2 = np.array([[2.0, 0.0, 0.0, 1.0],
49 |                               [0.0, 3.0, 0.0, 0.0],
50 |                               [0.0, 0.0, 2.0, 2.0],
51 |                               [1.0, 0.0, 2.0, 4.0]])
52 |     return cooccurrence1, cooccurrence2
53 | 
54 | 
55 | def test_python_jaccard(python_data, target_matrices):
56 |     cooccurrence1, cooccurrence2 = python_data
57 |     J1 = jaccard(cooccurrence1)
58 |     assert type(J1) == np.ndarray
59 |     assert J1 == target_matrices["jaccard1"]
60 | 
61 |     J2 = jaccard(cooccurrence2)
62 |     assert type(J2) == np.ndarray
63 |     assert J2 == target_matrices["jaccard2"]
64 | 
65 | 
66 | def test_python_lift(python_data, target_matrices):
67 |     cooccurrence1, cooccurrence2 = python_data
68 |     L1 = lift(cooccurrence1)
69 |     assert type(L1) == np.ndarray
70 |     assert L1 == target_matrices["lift1"]
71 | 
72 |     L2 = lift(cooccurrence2)
73 |     assert type(L2) == np.ndarray
74 |     assert L2 == target_matrices["lift2"]
75 | 
76 | 
77 | def test_exponential_decay():
78 |     values = np.array([1, 2, 3, 4, 5, 6])
79 |     expected = np.array([0.25, 0.35355339, 0.5, 0.70710678, 1., 1.])
80 |     actual = exponential_decay(value=values, max_val=5, half_life=2)
81 |     assert np.allclose(actual, expected, atol=TOL)
82 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/README.md:
--------------------------------------------------------------------------------
 1 | # Recommender Utilities
 2 | 
 3 | This module (reco_utils) contains functions to simplify common tasks used when developing and evaluating recommender systems. A short description of the sub-modules is provided below. For more details about what functions are available and how to use them, please review the doc-strings provided with the code.
 4 | 
 5 | ## Sub-Modules
 6 | 
 7 | ### [Common](./common)
 8 | This submodule contains high-level utilities for defining constants used in most algorithms as well as helper functions for managing aspects of different frameworks: gpu, spark, jupyter notebook.
 9 | 
10 | ### [Dataset](./dataset)
11 | Dataset includes helper functions for interacting with Azure Cosmos databases, pulling different sizes of the Movielens dataset and formatting them appropriately as well as utilities for splitting data for training / testing.
12 | 
13 | #### Data Loading
14 | The movielens module will allow you to load a dataframe in pandas or spark formats from the Movielens dataset, with sizes of 100k, 1M, 10M, or 20M to test algorithms and evaluate performance benchmarks.
15 | ```python
16 | df = movielens.load_pandas_df(size="100k")
17 | ```
18 | 
19 | #### Splitting Techniques:
20 | Currently three methods are available for splitting datasets. All of them support splitting by user or item and filtering out minimal samples (for instance users that have not rated enough item, or items that have not been rated by enough users).
21 | - Random: this is the basic approach where entries are randomly assigned to each group based on the ratio desired
22 | - Chronological: this uses provided timestamps to order the data and selects a cut-off time that will split the desired ratio of data to train before that time and test after that time
23 | - Stratified: this is similar to random sampling, but the splits are stratified, for example if the datasets are split by user, the splitting approach will attempt to maintain the same set of items used in both training and test splits. The converse is true if splitting by item.
24 | 
25 | ### [Evaluation](./evaluation)
26 | The evaluation submodule includes functionality for performing hyperparameter sweeps as well as calculating common recommender metrics directly in python or in a Spark environment using pyspark.
27 | 
28 | Currently available metrics include:
29 | - Root Mean Squared Error
30 | - Mean Absolute Error
31 | - R<sup>2</sup>
32 | - Explained Variance
33 | - Precision at K
34 | - Recall at K
35 | - Normalized Discounted Cumulative Gain at K
36 | - Mean Average Precision at K
37 | 
38 | ### [Recommender](./recommender)
39 | The recommender submodule contains implementations of various algorithms that can be used in addition to external packages to evaluate and develop new recommender system approaches.
40 | Currently the Simple Adaptive Recommender (SAR) algorithm is implemented in python for running on a single node.
41 | 


--------------------------------------------------------------------------------
/reco_utils/azureml/aks_utils.py:
--------------------------------------------------------------------------------
 1 | from math import ceil, floor
 2 | import logging
 3 | 
 4 | logger = logging.getLogger(__name__)
 5 | 
 6 | def qps_to_replicas(target_qps, processing_time, max_qp_replica=1, target_utilization=0.7):
 7 |     """Provide a rough estimate of the number of replicas to support a given load (queries per second)
 8 |     
 9 |     Args:
10 |         target_qps (int): target queries per second that you want to support
11 |         processing_time (float): the estimated amount of time (in seconds) your service call takes
12 |         max_qp_replica (int): maximum number of concurrent queries per replica
13 |         target_utilization (float): proportion of CPU utilization you think is ideal
14 | 
15 |     Returns:
16 |         replicas: Number of estimated replicas required to support a target number of queries per second
17 |     """
18 |     concurrent_queries = target_qps * processing_time / target_utilization
19 |     replicas = ceil(concurrent_queries / max_qp_replica)
20 |     logger.info('Approximately {} replicas are estimated to support {} queries per second.'.format(replicas, target_qps))
21 |     return replicas
22 |     
23 | def replicas_to_qps(num_replicas, processing_time, max_qp_replica=1, target_utilization=0.7):
24 |     """Provide a rough estimate of the queries per second supported by a number of replicas
25 |     
26 |     Args:
27 |         num_replicas (int): number of replicas 
28 |         processing_time (float): the estimated amount of time (in seconds) your service call takes
29 |         max_qp_replica (int): maximum number of concurrent queries per replica
30 |         target_utilization (float): proportion of CPU utilization you think is ideal
31 | 
32 |     Returns:
33 |         qps: queries per second supported by the number of replicas
34 |     """
35 |     qps = floor(num_replicas*max_qp_replica*target_utilization/processing_time)
36 |     logger.info('Approximately {} queries per second are supported by {} replicas.'.format(qps, num_replicas))
37 |     return qps
38 | 
39 | 
40 | def total_cores_to_replicas(n_cores, cpu_cores_per_replica=0.1, overhead=0.1):
41 |     """Provide a rough estimate of the number of replicas supported by a particular number of cores.
42 | 
43 |     Args:
44 |         n_cores (int): Total number of cores within an AKS cluster that you want to use
45 |         cpu_cores_per_replica (float): Cores assigned to each replica. This can be fractional and corresponds to the 
46 |             cpu_cores argument passed to AksWebservice.deploy_configuration() configuration
47 |         overhead (float): Amount of overhead (as a proportion)
48 | 
49 |     Returns:
50 |         replicas: Total number of replicas supported by n_cores
51 |     """
52 |     replicas = floor((1 - overhead)*n_cores/(cpu_cores_per_replica))
53 |     logger.info('Approximately {} replicas are supported by {} cores.'.format(replicas, n_cores))
54 |     return replicas


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V0.0.7 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
 6 | 
 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
 8 | 
 9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 |   * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 |   * Full paths of source file(s) related to the manifestation of the issue
23 |   * The location of the affected source code (tag/branch/commit or direct URL)
24 |   * Any special configuration required to reproduce the issue
25 |   * Step-by-step instructions to reproduce the issue
26 |   * Proof-of-concept or exploit code (if possible)
27 |   * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 | 
41 | <!-- END MICROSOFT SECURITY.MD BLOCK -->
42 | 


--------------------------------------------------------------------------------
/reco_utils/recommender/fastai/fastai_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import fastai
 8 | from fastprogress import force_console_behavior
 9 | import fastprogress
10 | 
11 | from reco_utils.common import constants as cc
12 | 
13 | 
14 | def cartesian_product(*arrays):
15 |     """Compute the Cartesian product in fastai algo. This is a helper function.
16 | 
17 |     Args:
18 |         arrays (tuple of np.array): Input arrays
19 | 
20 |     Returns:
21 |         np.array: product
22 | 
23 |     """
24 |     la = len(arrays)
25 |     dtype = np.result_type(*arrays)
26 |     arr = np.empty([len(a) for a in arrays] + [la], dtype=dtype)
27 |     for i, a in enumerate(np.ix_(*arrays)):
28 |         arr[..., i] = a
29 |     return arr.reshape(-1, la)
30 | 
31 | 
32 | def score(
33 |     learner,
34 |     test_df,
35 |     user_col=cc.DEFAULT_USER_COL,
36 |     item_col=cc.DEFAULT_ITEM_COL,
37 |     prediction_col=cc.DEFAULT_PREDICTION_COL,
38 |     top_k=None,
39 | ):
40 |     """Score all users+items provided and reduce to top_k items per user if top_k>0
41 |     
42 |     Args:
43 |         learner (obj): Model.
44 |         test_df (pd.DataFrame): Test dataframe.
45 |         user_col (str): User column name.
46 |         item_col (str): Item column name.
47 |         prediction_col (str): Prediction column name.
48 |         top_k (int): Number of top items to recommend.
49 | 
50 |     Returns:
51 |         pd.DataFrame: Result of recommendation 
52 |     """
53 |     # replace values not known to the model with NaN
54 |     total_users, total_items = learner.data.train_ds.x.classes.values()
55 |     test_df.loc[~test_df[user_col].isin(total_users), user_col] = np.nan
56 |     test_df.loc[~test_df[item_col].isin(total_items), item_col] = np.nan
57 | 
58 |     # map ids to embedding ids
59 |     u = learner.get_idx(test_df[user_col], is_item=False)
60 |     m = learner.get_idx(test_df[item_col], is_item=True)
61 | 
62 |     # score the pytorch model
63 |     pred = learner.model.forward(u, m)
64 |     scores = pd.DataFrame(
65 |         {user_col: test_df[user_col], item_col: test_df[item_col], prediction_col: pred}
66 |     )
67 |     scores = scores.sort_values([user_col, prediction_col], ascending=[True, False])
68 |     if top_k is not None:
69 |         top_scores = scores.groupby(user_col).head(top_k).reset_index(drop=True)
70 |     else:
71 |         top_scores = scores
72 |     return top_scores
73 | 
74 | 
75 | def hide_fastai_progress_bar():
76 |     """Hide fastai progress bar"""
77 |     fastprogress.fastprogress.NO_BAR = True
78 |     fastprogress.fastprogress.WRITER_FN = str
79 |     master_bar, progress_bar = force_console_behavior()
80 |     fastai.basic_train.master_bar, fastai.basic_train.progress_bar = (
81 |         master_bar,
82 |         progress_bar,
83 |     )
84 | 
85 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/recommender/fastai/fastai_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | import fastai
 8 | from fastprogress import force_console_behavior
 9 | import fastprogress
10 | 
11 | from reco_utils.common import constants as cc
12 | 
13 | 
14 | def cartesian_product(*arrays):
15 |     """Compute the cartesian product in fastai algo. This is a helper function.
16 | 
17 |     Args:
18 |         arrays (tuple of np.array): Input arrays
19 | 
20 |     Returns:
21 |         np.array: product
22 | 
23 |     """
24 |     la = len(arrays)
25 |     dtype = np.result_type(*arrays)
26 |     arr = np.empty([len(a) for a in arrays] + [la], dtype=dtype)
27 |     for i, a in enumerate(np.ix_(*arrays)):
28 |         arr[..., i] = a
29 |     return arr.reshape(-1, la)
30 | 
31 | 
32 | def score(
33 |     learner,
34 |     test_df,
35 |     user_col=cc.DEFAULT_USER_COL,
36 |     item_col=cc.DEFAULT_ITEM_COL,
37 |     prediction_col=cc.DEFAULT_PREDICTION_COL,
38 |     top_k=None,
39 | ):
40 |     """Score all users+items provided and reduce to top_k items per user if top_k>0
41 |     
42 |     Args:
43 |         learner (obj): Model.
44 |         test_df (pd.DataFrame): Test dataframe.
45 |         user_col (str): User column name.
46 |         item_col (str): Item column name.
47 |         prediction_col (str): Prediction column name.
48 |         top_k (int): Number of top items to recommend.
49 | 
50 |     Returns:
51 |         pd.DataFrame: Result of recommendation 
52 |     """
53 |     # replace values not known to the model with NaN
54 |     total_users, total_items = learner.data.train_ds.x.classes.values()
55 |     test_df.loc[~test_df[user_col].isin(total_users), user_col] = np.nan
56 |     test_df.loc[~test_df[item_col].isin(total_items), item_col] = np.nan
57 | 
58 |     # map ids to embedding ids
59 |     u = learner.get_idx(test_df[user_col], is_item=False)
60 |     m = learner.get_idx(test_df[item_col], is_item=True)
61 | 
62 |     # score the pytorch model
63 |     pred = learner.model.forward(u, m)
64 |     scores = pd.DataFrame(
65 |         {user_col: test_df[user_col], item_col: test_df[item_col], prediction_col: pred}
66 |     )
67 |     scores = scores.sort_values([user_col, prediction_col], ascending=[True, False])
68 |     if top_k is not None:
69 |         top_scores = scores.groupby(user_col).head(top_k).reset_index(drop=True)
70 |     else:
71 |         top_scores = scores
72 |     return top_scores
73 | 
74 | 
75 | def hide_fastai_progress_bar():
76 |     """Hide fastai progress bar"""
77 |     fastprogress.fastprogress.NO_BAR = True
78 |     fastprogress.fastprogress.WRITER_FN = str
79 |     master_bar, progress_bar = force_console_behavior()
80 |     fastai.basic_train.master_bar, fastai.basic_train.progress_bar = (
81 |         master_bar,
82 |         progress_bar,
83 |     )
84 | 
85 | 


--------------------------------------------------------------------------------
/reco_utils/README.md:
--------------------------------------------------------------------------------
 1 | # Recommender Utilities
 2 | 
 3 | This module (reco_utils) contains functions to simplify common tasks used when developing and evaluating recommender systems. A short description of the sub-modules is provided below. For more details about what functions are available and how to use them, please review the doc-strings provided with the code.
 4 | 
 5 | ## Sub-Modules
 6 | 
 7 | ### [Common](./common)
 8 | This submodule contains high-level utilities for defining constants used in most algorithms as well as helper functions for managing aspects of different frameworks: gpu, spark, jupyter notebook.
 9 | 
10 | ### [Dataset](./dataset)
11 | Dataset includes helper functions for interacting with Azure Cosmos databases, pulling different sizes of the MovieLens dataset and formatting them appropriately as well as utilities for splitting data for training / testing.
12 | 
13 | #### Data Loading
14 | The movielens module will allow you to load a dataframe in pandas or spark formats from the MovieLens dataset, with sizes of 100k, 1M, 10M, or 20M to test algorithms and evaluate performance benchmarks.
15 | ```python
16 | df = movielens.load_pandas_df(size="100k")
17 | ```
18 | 
19 | #### Splitting Techniques:
20 | Currently three methods are available for splitting datasets. All of them support splitting by user or item and filtering out minimal samples (for instance users that have not rated enough item, or items that have not been rated by enough users).
21 | - Random: this is the basic approach where entries are randomly assigned to each group based on the ratio desired
22 | - Chronological: this uses provided timestamps to order the data and selects a cut-off time that will split the desired ratio of data to train before that time and test after that time
23 | - Stratified: this is similar to random sampling, but the splits are stratified, for example if the datasets are split by user, the splitting approach will attempt to maintain the same set of items used in both training and test splits. The converse is true if splitting by item.
24 | 
25 | ### [Evaluation](./evaluation)
26 | The evaluation submodule includes functionality for performing hyperparameter sweeps as well as calculating common recommender metrics directly in python or in a Spark environment using pyspark.
27 | 
28 | Currently available metrics include:
29 | - Root Mean Squared Error
30 | - Mean Absolute Error
31 | - R<sup>2</sup>
32 | - Explained Variance
33 | - Precision at K
34 | - Recall at K
35 | - Normalized Discounted Cumulative Gain at K
36 | - Mean Average Precision at K
37 | - Area Under Curve
38 | - Logistic Loss
39 | 
40 | ### [Recommender](./recommender)
41 | The recommender submodule contains implementations of various algorithms that can be used in addition to external packages to evaluate and develop new recommender system approaches.
42 | Currently the Simple Adaptive Recommender (SAR) algorithm is implemented in python for running on a single node.
43 | 


--------------------------------------------------------------------------------
/reco_utils/dataset/download_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import os
 5 | from urllib.request import urlretrieve
 6 | import logging
 7 | from contextlib import contextmanager
 8 | from tempfile import TemporaryDirectory
 9 | from tqdm import tqdm
10 | 
11 | 
12 | log = logging.getLogger(__name__)
13 | 
14 | 
15 | class TqdmUpTo(tqdm):
16 |     """Wrapper class for the progress bar tqdm to get `update_to(n)` functionality"""
17 | 
18 |     def update_to(self, b=1, bsize=1, tsize=None):
19 |         """A progress bar showing how much is left to finish the operation
20 |         
21 |         Args:
22 |             b (int): Number of blocks transferred so far.
23 |             bsize (int): Size of each block (in tqdm units).
24 |             tsize (int): Total size (in tqdm units). 
25 |         """
26 |         if tsize is not None:
27 |             self.total = tsize
28 |         self.update(b * bsize - self.n)  # will also set self.n = b * bsize
29 | 
30 | 
31 | def maybe_download(url, filename=None, work_directory=".", expected_bytes=None):
32 |     """Download a file if it is not already downloaded.
33 |     
34 |     Args:
35 |         filename (str): File name.
36 |         work_directory (str): Working directory.
37 |         url (str): URL of the file to download.
38 |         expected_bytes (int): Expected file size in bytes.
39 | 
40 |     Returns:
41 |         str: File path of the file downloaded.
42 |     """
43 |     if filename is None:
44 |         filename = url.split("/")[-1]
45 |     filepath = os.path.join(work_directory, filename)
46 |     if not os.path.exists(filepath):
47 |         with TqdmUpTo(unit="B", unit_scale=True) as t:
48 |             filepath, _ = urlretrieve(url, filepath, reporthook=t.update_to)
49 |     else:
50 |         log.debug("File {} already downloaded".format(filepath))
51 |     if expected_bytes is not None:
52 |         statinfo = os.stat(filepath)
53 |         if statinfo.st_size != expected_bytes:
54 |             os.remove(filepath)
55 |             raise IOError("Failed to verify {}".format(filepath))
56 | 
57 |     return filepath
58 | 
59 | 
60 | @contextmanager
61 | def download_path(path=None):
62 |     """Return a path to download data. If `path=None`, then it yields a temporal path that is eventually deleted, 
63 |     otherwise the real path of the input. 
64 | 
65 |     Args:
66 |         path (str): Path to download data.
67 | 
68 |     Returns:
69 |         str: Real path where the data is stored.
70 | 
71 |     Examples:
72 |         >>> with download_path() as path:
73 |         >>> ... maybe_download(url="http://example.com/file.zip", work_directory=path)
74 | 
75 |     """
76 |     if path is None:
77 |         tmp_dir = TemporaryDirectory()
78 |         try:
79 |             yield tmp_dir.name
80 |         finally:
81 |             tmp_dir.cleanup()
82 |     else:
83 |         path = os.path.realpath(path)
84 |         yield path
85 | 
86 |     


--------------------------------------------------------------------------------
/reco_utils/dataset/cosmos_cli.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | import pydocumentdb.errors as errors
 4 | 
 5 | 
 6 | def find_collection(client, dbid, id):
 7 |     """Find whether or not a CosmosDB collection exists.
 8 |     Args:
 9 |         client (obj): A pydocumentdb client object.
10 |         dbid (str): Database ID.
11 |         id (str): Collection ID.
12 |     Returns:
13 |         bool: True if the collection exists, False otherwise.
14 |     """
15 |     database_link = "dbs/" + dbid
16 |     collections = list(
17 |         client.QueryCollections(
18 |             database_link,
19 |             {
20 |                 "query": "SELECT * FROM r WHERE r.id=@id",
21 |                 "parameters": [{"name": "@id", "value": id}],
22 |             },
23 |         )
24 |     )
25 |     if len(collections) > 0:
26 |         return True
27 |     else:
28 |         return False
29 | 
30 | 
31 | def read_collection(client, dbid, id):
32 |     """Read a CosmosDB collection.
33 |     Args:
34 |         client (obj): A pydocumentdb client object.
35 |         dbid (str): Database ID.
36 |         id (str): Collection ID.
37 |     Returns:
38 |         obj: A collection.
39 |     """
40 |     try:
41 |         database_link = "dbs/" + dbid
42 |         collection_link = database_link + "/colls/{0}".format(id)
43 |         collection = client.ReadCollection(collection_link)
44 |         return collection
45 |     except errors.DocumentDBError as e:
46 |         if e.status_code == 404:
47 |             print("A collection with id '{0}' does not exist".format(id))
48 |         else:
49 |             raise errors.HTTPFailure(e.status_code)
50 | 
51 | 
52 | def read_database(client, id):
53 |     """Read a CosmosDB database.
54 |     Args:
55 |         client (obj): A pydocumentdb client object.
56 |         id (str): Database ID.
57 |     Returns:
58 |         obj: A database.
59 |     """
60 |     try:
61 |         database_link = "dbs/" + id
62 |         database = client.ReadDatabase(database_link)
63 |         return database
64 |     except errors.DocumentDBError as e:
65 |         if e.status_code == 404:
66 |             print("A database with id '{0}' does not exist".format(id))
67 |         else:
68 |             raise errors.HTTPFailure(e.status_code)
69 | 
70 | 
71 | def find_database(client, id):
72 |     """Find whether or not a CosmosDB database exists.
73 |     Args:
74 |         client (obj): A pydocumentdb client object.
75 |         id (str): Database ID.
76 |     Returns:
77 |         bool: True if the database exists, False otherwise.
78 |     """
79 |     databases = list(
80 |         client.QueryDatabases(
81 |             {
82 |                 "query": "SELECT * FROM r WHERE r.id=@id",
83 |                 "parameters": [{"name": "@id", "value": id}],
84 |             }
85 |         )
86 |     )
87 |     if len(databases) > 0:
88 |         return True
89 |     else:
90 |         return False
91 | 
92 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/dataset/cosmos_cli.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | import pydocumentdb.errors as errors
 4 | 
 5 | 
 6 | def find_collection(client, dbid, id):
 7 |     """Find whether or not a CosmosDB collection exists.
 8 |     Args:
 9 |         client (obj): A pydocumentdb client object.
10 |         dbid (str): Database ID.
11 |         id (str): Collection ID.
12 |     Returns:
13 |         bool: True if the collection exists, False otherwise.
14 |     """
15 |     database_link = "dbs/" + dbid
16 |     collections = list(
17 |         client.QueryCollections(
18 |             database_link,
19 |             {
20 |                 "query": "SELECT * FROM r WHERE r.id=@id",
21 |                 "parameters": [{"name": "@id", "value": id}],
22 |             },
23 |         )
24 |     )
25 |     if len(collections) > 0:
26 |         return True
27 |     else:
28 |         return False
29 | 
30 | 
31 | def read_collection(client, dbid, id):
32 |     """Read a CosmosDB collection.
33 |     Args:
34 |         client (obj): A pydocumentdb client object.
35 |         dbid (str): Database ID.
36 |         id (str): Collection ID.
37 |     Returns:
38 |         obj: A collection.
39 |     """
40 |     try:
41 |         database_link = "dbs/" + dbid
42 |         collection_link = database_link + "/colls/{0}".format(id)
43 |         collection = client.ReadCollection(collection_link)
44 |         return collection
45 |     except errors.DocumentDBError as e:
46 |         if e.status_code == 404:
47 |             print("A collection with id '{0}' does not exist".format(id))
48 |         else:
49 |             raise errors.HTTPFailure(e.status_code)
50 | 
51 | 
52 | def read_database(client, id):
53 |     """Read a CosmosDB database.
54 |     Args:
55 |         client (obj): A pydocumentdb client object.
56 |         id (str): Database ID.
57 |     Returns:
58 |         obj: A database.
59 |     """
60 |     try:
61 |         database_link = "dbs/" + id
62 |         database = client.ReadDatabase(database_link)
63 |         return database
64 |     except errors.DocumentDBError as e:
65 |         if e.status_code == 404:
66 |             print("A database with id '{0}' does not exist".format(id))
67 |         else:
68 |             raise errors.HTTPFailure(e.status_code)
69 | 
70 | 
71 | def find_database(client, id):
72 |     """Find whether or not a CosmosDB database exists.
73 |     Args:
74 |         client (obj): A pydocumentdb client object.
75 |         id (str): Database ID.
76 |     Returns:
77 |         bool: True if the database exists, False otherwise.
78 |     """
79 |     databases = list(
80 |         client.QueryDatabases(
81 |             {
82 |                 "query": "SELECT * FROM r WHERE r.id=@id",
83 |                 "parameters": [{"name": "@id", "value": id}],
84 |             }
85 |         )
86 |     )
87 |     if len(databases) > 0:
88 |         return True
89 |     else:
90 |         return False
91 | 
92 | 


--------------------------------------------------------------------------------
/scripts/databricks_install.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | # ---------------------------------------------------------
 4 | # This script installs Recommenders into Databricks
 5 | 
 6 | DATABRICKS_CLI=$(which databricks)
 7 | if ! [ -x "$DATABRICKS_CLI" ]; then
 8 |     echo "No databricks-cli found!! Please see the SETUP.md file for installation prerequisites."
 9 |     exit 1
10 | fi
11 | 
12 | CLUSTER_ID=$1
13 | if [ -z $CLUSTER_ID ]; then
14 |     echo "Please provide the target cluster id: 'databricks_install.sh <CLUSTER_ID>'."
15 |     echo "Cluster id can be found by running 'databricks clusters list'"
16 |     echo "which returns a list of <CLUSTER_ID> <CLUSTER_NAME> <STATUS>."
17 |     exit 1
18 | fi
19 | 
20 | CLUSTER_EXIST=false
21 | while IFS=' ' read -ra ARR; do
22 |     if [ ${ARR[0]} = $CLUSTER_ID ]; then
23 |         CLUSTER_EXIST=true
24 | 
25 |         STATUS=${ARR[2]}
26 |         STATUS=${STATUS//[^a-zA-Z]/}
27 |         if [ $STATUS = RUNNING ]; then
28 |             echo
29 |             echo "Preparing Recommenders library file (egg)..."
30 |             zip -r -q Recommenders.egg ./reco_utils -i \*.py
31 | 
32 |             echo
33 |             echo "Uploading to databricks..."
34 |             dbfs cp --overwrite Recommenders.egg dbfs:/FileStore/jars/Recommenders.egg
35 | 
36 |             echo
37 |             echo "Installing the library onto databricks cluster $CLUSTER_ID..."
38 |             databricks libraries install --cluster-id $CLUSTER_ID --egg dbfs:/FileStore/jars/Recommenders.egg
39 | 
40 |             echo
41 |             echo "Done! Installation status checking..."
42 |             databricks libraries cluster-status --cluster-id $CLUSTER_ID
43 | 
44 |             echo
45 |             echo "Restarting the cluster to activate the library..."
46 |             databricks clusters restart --cluster-id $CLUSTER_ID
47 | 
48 |             echo "This will take few seconds. Please check the result from Databricks workspace."
49 |             echo "Alternatively, run 'databricks clusters list' to check the restart status and"
50 |             echo "run 'databricks libraries cluster-status --cluster-id $CLUSTER_ID' to check the installation status."
51 | 
52 |             rm Recommenders.egg
53 |             exit 0
54 |         else
55 |             echo "Cluster $CLUSTER_ID found, but it is not running. Status=${STATUS}"
56 |             echo "You can start the cluster with 'databricks clusters start --cluster-id $CLUSTER_ID'."
57 |             echo "Then, check the cluster status by using 'databricks clusters list' and"
58 |             echo "re-try installation once the status turns into RUNNING."
59 |             exit 1
60 |         fi
61 |     fi
62 | done < <(databricks clusters list)
63 | 
64 | if ! [ $CLUSTER_EXIST = true ]; then
65 |     echo "Cannot find the target cluster $CLUSTER_ID. Please check if you entered the valid id."
66 |     echo "Cluster id can be found by running 'databricks clusters list'"
67 |     echo "which returns a list of <CLUSTER_ID> <CLUSTER_NAME> <STATUS>."
68 |     exit 1
69 | fi
70 | 
71 | 


--------------------------------------------------------------------------------
/notebooks/scripts/databricks_install.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | # ---------------------------------------------------------
 4 | # This script installs Recommenders into Databricks
 5 | 
 6 | DATABRICKS_CLI=$(which databricks)
 7 | if ! [ -x "$DATABRICKS_CLI" ]; then
 8 |     echo "No databricks-cli found!! Please see the SETUP.md file for installation prerequisites."
 9 |     exit 1
10 | fi
11 | 
12 | CLUSTER_ID=$1
13 | if [ -z $CLUSTER_ID ]; then
14 |     echo "Please provide the target cluster id: 'databricks_install.sh <CLUSTER_ID>'."
15 |     echo "Cluster id can be found by running 'databricks clusters list'"
16 |     echo "which returns a list of <CLUSTER_ID> <CLUSTER_NAME> <STATUS>."
17 |     exit 1
18 | fi
19 | 
20 | CLUSTER_EXIST=false
21 | while IFS=' ' read -ra ARR; do
22 |     if [ ${ARR[0]} = $CLUSTER_ID ]; then
23 |         CLUSTER_EXIST=true
24 | 
25 |         STATUS=${ARR[2]}
26 |         STATUS=${STATUS//[^a-zA-Z]/}
27 |         if [ $STATUS = RUNNING ]; then
28 |             echo
29 |             echo "Preparing Recommenders library file (egg)..."
30 |             zip -r -q Recommenders.egg ./reco_utils -i \*.py
31 | 
32 |             echo
33 |             echo "Uploading to databricks..."
34 |             dbfs cp --overwrite Recommenders.egg dbfs:/FileStore/jars/Recommenders.egg
35 | 
36 |             echo
37 |             echo "Installing the library onto databricks cluster $CLUSTER_ID..."
38 |             databricks libraries install --cluster-id $CLUSTER_ID --egg dbfs:/FileStore/jars/Recommenders.egg
39 | 
40 |             echo
41 |             echo "Done! Installation status checking..."
42 |             databricks libraries cluster-status --cluster-id $CLUSTER_ID
43 | 
44 |             echo
45 |             echo "Restarting the cluster to activate the library..."
46 |             databricks clusters restart --cluster-id $CLUSTER_ID
47 | 
48 |             echo "This will take few seconds. Please check the result from Databricks workspace."
49 |             echo "Alternatively, run 'databricks clusters list' to check the restart status and"
50 |             echo "run 'databricks libraries cluster-status --cluster-id $CLUSTER_ID' to check the installation status."
51 | 
52 |             rm Recommenders.egg
53 |             exit 0
54 |         else
55 |             echo "Cluster $CLUSTER_ID found, but it is not running. Status=${STATUS}"
56 |             echo "You can start the cluster with 'databricks clusters start --cluster-id $CLUSTER_ID'."
57 |             echo "Then, check the cluster status by using 'databricks clusters list' and"
58 |             echo "re-try installation once the status turns into RUNNING."
59 |             exit 1
60 |         fi
61 |     fi
62 | done < <(databricks clusters list)
63 | 
64 | if ! [ $CLUSTER_EXIST = true ]; then
65 |     echo "Cannot find the target cluster $CLUSTER_ID. Please check if you entered the valid id."
66 |     echo "Cluster id can be found by running 'databricks clusters list'"
67 |     echo "which returns a list of <CLUSTER_ID> <CLUSTER_NAME> <STATUS>."
68 |     exit 1
69 | fi
70 | 
71 | 


--------------------------------------------------------------------------------
/tests/unit/test_pandas_df_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import pytest
 5 | import pandas as pd
 6 | from reco_utils.dataset.pandas_df_utils import (
 7 |     user_item_pairs,
 8 |     filter_by
 9 | )
10 | 
11 | 
12 | @pytest.fixture(scope="module")
13 | def user_item_dataset():
14 |     """Get users and items dataframe"""
15 |     user_df = pd.DataFrame({
16 |         'user_id': [1, 2, 3, 4, 5],
17 |         'user_age': [23, 24, 25, 26, 27]
18 |     })
19 | 
20 |     item_df = pd.DataFrame({
21 |         'item_id': [6, 7, 8],
22 |         'item_feat': [[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]]
23 |     })
24 | 
25 |     return user_df, item_df
26 | 
27 | 
28 | def test_user_item_pairs(user_item_dataset):
29 |     user_df, item_df = user_item_dataset
30 | 
31 |     user_item = user_item_pairs(
32 |         user_df=user_df,
33 |         item_df=item_df,
34 |         user_col='user_id',
35 |         item_col='item_id',
36 |         shuffle=False
37 |     )
38 |     # Validate cross-join
39 |     assert len(user_df) * len(item_df) == len(user_item)
40 |     assert user_item.loc[(user_item['user_id'] == 3) & (user_item['item_id'] == 7)].values.tolist()[0]\
41 |         == [3, 25, 7, [0.2, 0.2]]
42 | 
43 |     # Check if result is deterministic
44 |     assert user_item.iloc[0].values.tolist() == [1, 23, 6, [0.1, 0.1]]
45 | 
46 |     # Check shuffle
47 |     user_item_shuffled = user_item_pairs(
48 |         user_df=user_df,
49 |         item_df=item_df,
50 |         user_col='user_id',
51 |         item_col='item_id',
52 |         shuffle=True
53 |     )
54 |     # Check shuffled result is still valid
55 |     assert len(user_df) * len(item_df) == len(user_item_shuffled)
56 |     row = user_item.loc[(user_item['user_id'] == 2) & (user_item['item_id'] == 6)]
57 |     assert row['user_age'].iloc[0] == 24
58 |     assert row['item_feat'].iloc[0] == [0.1, 0.1]
59 |     # Check shuffled result is different from not-shuffled dataframe
60 |     assert [*user_item_shuffled['user_id'].values] != [*user_item['user_id'].values]
61 | 
62 |     # Check filter
63 |     seen_df = pd.DataFrame({
64 |         'user_id': [1, 9, 3, 5, 5, 1],
65 |         'item_id': [1, 6, 7, 6, 8, 9]
66 |     })
67 |     user_item_filtered = user_item_pairs(
68 |         user_df=user_df,
69 |         item_df=item_df,
70 |         user_col='user_id',
71 |         item_col='item_id',
72 |         user_item_filter_df=seen_df,
73 |         shuffle=False
74 |     )
75 |     # Check filtered out number
76 |     assert len(user_item_filtered) == len(user_item) - 3
77 |     # Check filtered out record
78 |     assert len(user_item_filtered.loc[(user_item['user_id'] == 3) & (user_item['item_id'] == 7)]) == 0
79 | 
80 | 
81 | def test_filter_by():
82 |     user_df = pd.DataFrame({
83 |         'user_id': [1, 9, 3, 5, 5, 1],
84 |         'item_id': [1, 6, 7, 6, 8, 9]
85 |     })
86 | 
87 |     seen_df = pd.DataFrame({
88 |         'user_id': [1, 2, 4],
89 |     })
90 | 
91 |     filtered_df = filter_by(user_df, seen_df, ['user_id'])
92 | 
93 |     # Check filtered out number
94 |     assert len(filtered_df) == len(user_df) - 2
95 |     # Check filtered out record
96 |     assert len(filtered_df.loc[(user_df['user_id'] == 1)]) == 0
97 | 


--------------------------------------------------------------------------------
/tests/unit/test_deeprec_utils.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import os
 3 | import tensorflow as tf
 4 | from reco_utils.recommender.deeprec.deeprec_utils import (
 5 |     prepare_hparams,
 6 |     download_deeprec_resources,
 7 |     load_yaml,
 8 | )
 9 | from reco_utils.recommender.deeprec.IO.iterator import FFMTextIterator
10 | from reco_utils.recommender.deeprec.IO.dkn_iterator import DKNTextIterator
11 | 
12 | 
13 | @pytest.fixture
14 | def resource_path():
15 |     return os.path.dirname(os.path.realpath(__file__))
16 | 
17 | 
18 | @pytest.mark.parametrize(
19 |     "must_exist_attributes", ["FEATURE_COUNT", "data_format", "dim"]
20 | )
21 | @pytest.mark.gpu
22 | @pytest.mark.deeprec
23 | def test_prepare_hparams(must_exist_attributes, resource_path):
24 |     data_path = os.path.join(resource_path, "..", "resources", "deeprec", "xdeepfm")
25 |     yaml_file = os.path.join(data_path, "xDeepFM.yaml")
26 |     if not os.path.exists(yaml_file):
27 |         download_deeprec_resources(
28 |             "https://recodatasets.blob.core.windows.net/deeprec/",
29 |             data_path,
30 |             "xdeepfmresources.zip",
31 |         )
32 |     hparams = prepare_hparams(yaml_file)
33 |     assert hasattr(hparams, must_exist_attributes)
34 | 
35 | 
36 | @pytest.mark.gpu
37 | @pytest.mark.deeprec
38 | def test_load_yaml_file(resource_path):
39 |     data_path = os.path.join(resource_path, "..", "resources", "deeprec", "xdeepfm")
40 |     yaml_file = os.path.join(data_path, "xDeepFM.yaml")
41 | 
42 |     if not os.path.exists(yaml_file):
43 |         download_deeprec_resources(
44 |             "https://recodatasets.blob.core.windows.net/deeprec/",
45 |             data_path,
46 |             "xdeepfmresources.zip",
47 |         )
48 | 
49 |     config = load_yaml(yaml_file)
50 |     assert config is not None
51 | 
52 | 
53 | @pytest.mark.gpu
54 | @pytest.mark.deeprec
55 | def test_FFM_iterator(resource_path):
56 |     data_path = os.path.join(resource_path, "..", "resources", "deeprec", "xdeepfm")
57 |     yaml_file = os.path.join(data_path, "xDeepFM.yaml")
58 |     data_file = os.path.join(data_path, "sample_FFM_data.txt")
59 | 
60 |     if not os.path.exists(yaml_file):
61 |         download_deeprec_resources(
62 |             "https://recodatasets.blob.core.windows.net/deeprec/",
63 |             data_path,
64 |             "xdeepfmresources.zip",
65 |         )
66 | 
67 |     hparams = prepare_hparams(yaml_file)
68 |     iterator = FFMTextIterator(hparams, tf.Graph())
69 |     assert iterator is not None
70 |     for res in iterator.load_data_from_file(data_file):
71 |         assert isinstance(res, dict)
72 | 
73 | 
74 | @pytest.mark.gpu
75 | @pytest.mark.deeprec
76 | def test_DKN_iterator(resource_path):
77 |     data_path = os.path.join(resource_path, "..", "resources", "deeprec", "dkn")
78 |     data_file = os.path.join(data_path, "final_test_with_entity.txt")
79 |     yaml_file = os.path.join(data_path, "dkn.yaml")
80 |     if not os.path.exists(yaml_file):
81 |         download_deeprec_resources(
82 |             "https://recodatasets.blob.core.windows.net/deeprec/",
83 |             data_path,
84 |             "dknresources.zip",
85 |         )
86 | 
87 |     hparams = prepare_hparams(yaml_file, wordEmb_file="", entityEmb_file="")
88 |     iterator = DKNTextIterator(hparams, tf.Graph())
89 |     assert iterator is not None
90 |     for res in iterator.load_data_from_file(data_file):
91 |         assert isinstance(res, dict)
92 | 


--------------------------------------------------------------------------------
/reco_utils/common/gpu_utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Microsoft Corporation. All rights reserved.
  2 | # Licensed under the MIT License.
  3 | 
  4 | import sys
  5 | import os
  6 | import glob
  7 | from numba import cuda
  8 | from numba.cuda.cudadrv.error import CudaSupportError
  9 | 
 10 | 
 11 | DEFAULT_CUDA_PATH_LINUX = "/usr/local/cuda/version.txt"
 12 | 
 13 | 
 14 | def get_number_gpus():
 15 |     """Get the number of GPUs in the system.
 16 |     
 17 |     Returns:
 18 |         int: Number of GPUs.
 19 |     """
 20 |     try:
 21 |         return len(cuda.gpus)
 22 |     except CudaSupportError:
 23 |         return 0
 24 | 
 25 | 
 26 | def clear_memory_all_gpus():
 27 |     """Clear memory of all GPUs."""
 28 |     try:
 29 |         for gpu in cuda.gpus:
 30 |             with gpu:
 31 |                 cuda.current_context().deallocations.clear()
 32 |     except CudaSupportError:
 33 |         print("No CUDA available")
 34 | 
 35 | 
 36 | def get_cuda_version(unix_path=DEFAULT_CUDA_PATH_LINUX):
 37 |     """Get CUDA version
 38 |     
 39 |     Args:
 40 |         unix_path (str): Path to CUDA version file in Linux/Mac.
 41 | 
 42 |     Returns:
 43 |         str: Version of the library.
 44 |     """
 45 |     if sys.platform == "win32":
 46 |         raise NotImplementedError("Implement this!")
 47 |     elif sys.platform in ["linux", "darwin"]:
 48 |         if os.path.isfile(unix_path):
 49 |             with open(unix_path, "r") as f:
 50 |                 data = f.read().replace("\n", "")
 51 |             return data
 52 |         else:
 53 |             return "No CUDA in this machine"
 54 |     else:
 55 |         raise ValueError("Not in Windows, Linux or Mac")
 56 | 
 57 | 
 58 | def get_cudnn_version():
 59 |     """Get the CuDNN version
 60 |     
 61 |     Returns:
 62 |         str: Version of the library.
 63 | 
 64 |     """
 65 | 
 66 |     def find_cudnn_in_headers(candidates):
 67 |         for c in candidates:
 68 |             file = glob.glob(c)
 69 |             if file:
 70 |                 break
 71 |         if file:
 72 |             with open(file[0], "r") as f:
 73 |                 version = ""
 74 |                 for line in f:
 75 |                     if "#define CUDNN_MAJOR" in line:
 76 |                         version = line.split()[-1]
 77 |                     if "#define CUDNN_MINOR" in line:
 78 |                         version += "." + line.split()[-1]
 79 |                     if "#define CUDNN_PATCHLEVEL" in line:
 80 |                         version += "." + line.split()[-1]
 81 |             if version:
 82 |                 return version
 83 |             else:
 84 |                 return "Cannot find CUDNN version"
 85 |         else:
 86 |             return "No CUDNN in this machine"
 87 | 
 88 |     if sys.platform == "win32":
 89 |         candidates = ["C:\\NVIDIA\\cuda\\include\\cudnn.h",
 90 |                      "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\include\\cudnn.h"]
 91 |     elif sys.platform == "linux":
 92 |         candidates = [
 93 |             "/usr/include/x86_64-linux-gnu/cudnn_v*.h",
 94 |             "/usr/local/cuda/include/cudnn.h",
 95 |             "/usr/include/cudnn.h",
 96 |         ]
 97 |     elif sys.platform == "darwin":
 98 |         candidates = ["/usr/local/cuda/include/cudnn.h", "/usr/include/cudnn.h"]
 99 |     else:
100 |         raise ValueError("Not in Windows, Linux or Mac")
101 |     return find_cudnn_in_headers(candidates)
102 | 
103 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/common/gpu_utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Microsoft Corporation. All rights reserved.
  2 | # Licensed under the MIT License.
  3 | 
  4 | import sys
  5 | import os
  6 | import glob
  7 | from numba import cuda
  8 | from numba.cuda.cudadrv.error import CudaSupportError
  9 | 
 10 | 
 11 | DEFAULT_CUDA_PATH_LINUX = "/usr/local/cuda/version.txt"
 12 | 
 13 | 
 14 | def get_number_gpus():
 15 |     """Get the number of GPUs in the system.
 16 |     
 17 |     Returns:
 18 |         int: Number of GPUs.
 19 |     """
 20 |     try:
 21 |         return len(cuda.gpus)
 22 |     except CudaSupportError:
 23 |         return 0
 24 | 
 25 | 
 26 | def clear_memory_all_gpus():
 27 |     """Clear memory of all GPUs."""
 28 |     try:
 29 |         for gpu in cuda.gpus:
 30 |             with gpu:
 31 |                 cuda.current_context().deallocations.clear()
 32 |     except CudaSupportError:
 33 |         print("No CUDA available")
 34 | 
 35 | 
 36 | def get_cuda_version(unix_path=DEFAULT_CUDA_PATH_LINUX):
 37 |     """Get CUDA version
 38 |     
 39 |     Args:
 40 |         unix_path (str): Path to CUDA version file in Linux/Mac.
 41 | 
 42 |     Returns:
 43 |         str: Version of the library.
 44 |     """
 45 |     if sys.platform == "win32":
 46 |         raise NotImplementedError("Implement this!")
 47 |     elif sys.platform == "linux" or sys.platform == "darwin":
 48 |         if os.path.isfile(unix_path):
 49 |             with open(unix_path, "r") as f:
 50 |                 data = f.read().replace("\n", "")
 51 |             return data
 52 |         else:
 53 |             return "No CUDA in this machine"
 54 |     else:
 55 |         raise ValueError("Not in Windows, Linux or Mac")
 56 | 
 57 | 
 58 | def get_cudnn_version():
 59 |     """Get the CuDNN version
 60 |     
 61 |     Returns:
 62 |         str: Version of the library.
 63 |     """
 64 | 
 65 |     def find_cudnn_in_headers(candiates):
 66 |         for c in candidates:
 67 |             file = glob.glob(c)
 68 |             if file:
 69 |                 break
 70 |         if file:
 71 |             with open(file[0], "r") as f:
 72 |                 version = ""
 73 |                 for line in f:
 74 |                     if "#define CUDNN_MAJOR" in line:
 75 |                         version = line.split()[-1]
 76 |                     if "#define CUDNN_MINOR" in line:
 77 |                         version += "." + line.split()[-1]
 78 |                     if "#define CUDNN_PATCHLEVEL" in line:
 79 |                         version += "." + line.split()[-1]
 80 |             if version:
 81 |                 return version
 82 |             else:
 83 |                 return "Cannot find CUDNN version"
 84 |         else:
 85 |             return "No CUDNN in this machine"
 86 | 
 87 |     if sys.platform == "win32":
 88 |         candidates = ["C:\\NVIDIA\\cuda\\include\\cudnn.h",
 89 |                      "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v[0-99]\\include\\cudnn.h"]
 90 |     elif sys.platform == "linux":
 91 |         candidates = [
 92 |             "/usr/include/x86_64-linux-gnu/cudnn_v[0-99].h",
 93 |             "/usr/local/cuda/include/cudnn.h",
 94 |             "/usr/include/cudnn.h",
 95 |         ]
 96 |     elif sys.platform == "darwin":
 97 |         candidates = ["/usr/local/cuda/include/cudnn.h", "/usr/include/cudnn.h"]
 98 |     else:
 99 |         raise ValueError("Not in Windows, Linux or Mac")
100 |     return find_cudnn_in_headers(candidates)
101 | 


--------------------------------------------------------------------------------
/reco_utils/common/python_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import logging
 5 | 
 6 | import numpy as np
 7 | from scipy import sparse
 8 | 
 9 | 
10 | logger = logging.getLogger()
11 | 
12 | 
13 | def exponential_decay(value, max_val, half_life):
14 |     """Compute decay factor for a given value based on an exponential decay
15 |     Values greater than max_val will be set to 1
16 |     Args:
17 |         value (numeric): value to calculate decay factor
18 |         max_val (numeric): value at which decay factor will be 1
19 |         half_life (numeric): value at which decay factor will be 0.5
20 |     Returns:
21 |         float: decay factor
22 |     """
23 | 
24 |     return np.minimum(1.0, np.power(0.5, (max_val - value) / half_life))
25 | 
26 | 
27 | def jaccard(cooccurrence):
28 |     """Helper method to calculate the Jaccard similarity of a matrix of co-occurrences
29 |     Args:
30 |         cooccurrence (np.array): the symmetric matrix of co-occurrences of items
31 |     Returns:
32 |         np.array: The matrix of Jaccard similarities between any two items
33 |     """
34 | 
35 |     diag = cooccurrence.diagonal()
36 |     diag_rows = np.expand_dims(diag, axis=0)
37 |     diag_cols = np.expand_dims(diag, axis=1)
38 | 
39 |     with np.errstate(invalid="ignore", divide="ignore"):
40 |         result = cooccurrence / (diag_rows + diag_cols - cooccurrence)
41 | 
42 |     return np.array(result)
43 | 
44 | 
45 | def lift(cooccurrence):
46 |     """Helper method to calculate the Lift of a matrix of co-occurrences
47 |     Args:
48 |         cooccurrence (np.array): the symmetric matrix of co-occurrences of items
49 |     Returns:
50 |         np.array: The matrix of Lifts between any two items
51 |     """
52 | 
53 |     diag = cooccurrence.diagonal()
54 |     diag_rows = np.expand_dims(diag, axis=0)
55 |     diag_cols = np.expand_dims(diag, axis=1)
56 | 
57 |     with np.errstate(invalid="ignore", divide="ignore"):
58 |         result = cooccurrence / (diag_rows * diag_cols)
59 | 
60 |     return np.array(result)
61 | 
62 | 
63 | def get_top_k_scored_items(scores, top_k, sort_top_k=False):
64 |     """Extract top K items from a matrix of scores for each user-item pair, optionally sort results per user
65 | 
66 |     Args:
67 |         scores (np.array): score matrix (users x items)
68 |         top_k (int): number of top items to recommend
69 |         sort_top_k (bool): flag to sort top k results
70 | 
71 |     Returns:
72 |         np.array, np.array: indices into score matrix for each users top items, scores corresponding to top items
73 |     """
74 | 
75 |     # ensure we're working with a dense ndarray
76 |     if isinstance(scores, sparse.spmatrix):
77 |         scores = scores.todense()
78 | 
79 |     if scores.shape[1] < top_k:
80 |         logger.warning(
81 |             "Number of items is less than top_k, limiting top_k to number of items"
82 |         )
83 |     k = min(top_k, scores.shape[1])
84 | 
85 |     test_user_idx = np.arange(scores.shape[0])[:, None]
86 | 
87 |     # get top K items and scores
88 |     # this determines the un-ordered top-k item indices for each user
89 |     top_items = np.argpartition(scores, -k, axis=1)[:, -k:]
90 |     top_scores = scores[test_user_idx, top_items]
91 | 
92 |     if sort_top_k:
93 |         sort_ind = np.argsort(-top_scores)
94 |         top_items = top_items[test_user_idx, sort_ind]
95 |         top_scores = top_scores[test_user_idx, sort_ind]
96 | 
97 |     return np.array(top_items), np.array(top_scores)
98 | 


--------------------------------------------------------------------------------
/tests/unit/test_vowpal_wabbit.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Microsoft Corporation. All rights reserved.
  2 | # Licensed under the MIT License.
  3 | 
  4 | import os
  5 | import pytest
  6 | from unittest import mock
  7 | 
  8 | 
  9 | import pandas as pd
 10 | 
 11 | from reco_utils.recommender.vowpal_wabbit.vw import VW
 12 | 
 13 | 
 14 | @pytest.fixture(scope="module")
 15 | def df():
 16 |     return pd.DataFrame(
 17 |         dict(user=[1, 3, 2], item=[8, 7, 7], rating=[1, 5, 3], timestamp=[1, 2, 3])
 18 |     )
 19 | 
 20 | 
 21 | @pytest.fixture(scope="function")
 22 | def model():
 23 |     model = VW(col_user="user", col_item="item", col_prediction="prediction", q="ui")
 24 |     yield model
 25 |     del model
 26 | 
 27 | 
 28 | def test_vw_init_del():
 29 |     model = VW()
 30 |     tempdir = model.tempdir.name
 31 |     assert os.path.exists(tempdir)
 32 | 
 33 |     del model
 34 |     assert not os.path.exists(tempdir)
 35 | 
 36 | 
 37 | def test_to_vw_cmd():
 38 |     expected = [
 39 |         "vw",
 40 |         "-l",
 41 |         "0.1",
 42 |         "--l1",
 43 |         "0.2",
 44 |         "--loss_function",
 45 |         "logistic",
 46 |         "--holdout_off",
 47 |         "--rank",
 48 |         "3",
 49 |         "-t",
 50 |     ]
 51 |     params = dict(
 52 |         l=0.1,
 53 |         l1=0.2,
 54 |         loss_function="logistic",
 55 |         holdout_off=True,
 56 |         quiet=False,
 57 |         rank=3,
 58 |         t=True,
 59 |     )
 60 |     assert VW.to_vw_cmd(params=params) == expected
 61 | 
 62 | 
 63 | def test_parse_train_cmd(model):
 64 |     expected = [
 65 |         "vw",
 66 |         "--loss_function",
 67 |         "logistic",
 68 |         "--oaa",
 69 |         "5",
 70 |         "-f",
 71 |         model.model_file,
 72 |         "-d",
 73 |         model.train_file,
 74 |     ]
 75 |     params = dict(loss_function="logistic", oaa=5, f="test", d="data", quiet=False)
 76 |     assert model.parse_train_params(params=params) == expected
 77 | 
 78 | 
 79 | def test_parse_test_cmd(model):
 80 |     expected = [
 81 |         "vw",
 82 |         "--loss_function",
 83 |         "logistic",
 84 |         "-d",
 85 |         model.test_file,
 86 |         "--quiet",
 87 |         "-i",
 88 |         model.model_file,
 89 |         "-p",
 90 |         model.prediction_file,
 91 |         "-t",
 92 |     ]
 93 |     params = dict(
 94 |         loss_function="logistic", i="test", oaa=5, d="data", test_only=True, quiet=True
 95 |     )
 96 |     assert model.parse_test_params(params=params) == expected
 97 | 
 98 | 
 99 | def test_to_vw_file(model, df):
100 |     expected = ["1 0|user 1 |item 8", "5 1|user 3 |item 7", "3 2|user 2 |item 7"]
101 |     model.to_vw_file(df, train=True)
102 |     with open(model.train_file, "r") as f:
103 |         assert f.read().splitlines() == expected
104 |     del model
105 | 
106 | 
107 | def test_fit_and_predict(model, df):
108 |     # generate fake predictions
109 |     with open(model.prediction_file, "w") as f:
110 |         f.writelines(["1 0\n", "3 1\n", "5 2\n"])
111 | 
112 |     # patch subprocess call to vw
113 |     with mock.patch("reco_utils.recommender.vowpal_wabbit.vw.run") as mock_run:
114 |         model.fit(df)
115 |         result = model.predict(df)
116 | 
117 |     expected = dict(
118 |         user=dict(enumerate([1, 3, 2])),
119 |         item=dict(enumerate([8, 7, 7])),
120 |         rating=dict(enumerate([1, 5, 3])),
121 |         timestamp=dict(enumerate([1, 2, 3])),
122 |         prediction=dict(enumerate([1, 3, 5])),
123 |     )
124 | 
125 |     assert result.to_dict() == expected
126 | 


--------------------------------------------------------------------------------
/reco_utils/common/notebook_memory_management.py:
--------------------------------------------------------------------------------
  1 | # Original code: https://raw.githubusercontent.com/miguelgfierro/codebase/master/python/system/notebook_memory_management.py
  2 | #
  3 | # Profile memory usage envelope of IPython commands and report interactively.
  4 | # Usage (inside a python notebook):
  5 | #   from notebook_memory_management import start_watching_memory, stop_watching_memory
  6 | # To start profile:
  7 | #   start_watching_memory()
  8 | # To stop profile:
  9 | #   stop_watching_memory()
 10 | #
 11 | # Based on: https://github.com/ianozsvald/ipython_memory_usage
 12 | #
 13 | 
 14 | from __future__ import division  # 1/2 == 0.5, as in Py3
 15 | from __future__ import absolute_import  # avoid hiding global modules with locals
 16 | from __future__ import print_function  # force use of print("hello")
 17 | from __future__ import (
 18 |     unicode_literals
 19 | )  # force unadorned strings "" to be Unicode without prepending u""
 20 | import time
 21 | import memory_profiler
 22 | from IPython import get_ipython
 23 | import psutil
 24 | import warnings
 25 | 
 26 | 
 27 | # keep a global accounting for the last known memory usage
 28 | # which is the reference point for the memory delta calculation
 29 | previous_call_memory_usage = memory_profiler.memory_usage()[0]
 30 | t1 = time.time()  # will be set to current time later
 31 | keep_watching = True
 32 | watching_memory = True
 33 | try:
 34 |     input_cells = get_ipython().user_ns["In"]
 35 | except:
 36 |     warnings.warn("Not running on notebook")
 37 | 
 38 | 
 39 | def start_watching_memory():
 40 |     """Register memory profiling tools to IPython instance."""
 41 |     global watching_memory
 42 |     watching_memory = True
 43 |     ip = get_ipython()
 44 |     ip.events.register("post_run_cell", watch_memory)
 45 |     ip.events.register("pre_run_cell", pre_run_cell)
 46 | 
 47 | 
 48 | def stop_watching_memory():
 49 |     """Unregister memory profiling tools from IPython instance."""
 50 |     global watching_memory
 51 |     watching_memory = False
 52 |     ip = get_ipython()
 53 |     try:
 54 |         ip.events.unregister("post_run_cell", watch_memory)
 55 |     except ValueError:
 56 |         print("ERROR: problem when unregistering")
 57 |         pass
 58 |     try:
 59 |         ip.events.unregister("pre_run_cell", pre_run_cell)
 60 |     except ValueError:
 61 |         print("ERROR: problem when unregistering")
 62 |         pass
 63 | 
 64 | 
 65 | def watch_memory():
 66 |     # bring in the global memory usage value from the previous iteration
 67 |     global previous_call_memory_usage, keep_watching, watching_memory, input_cells
 68 |     new_memory_usage = memory_profiler.memory_usage()[0]
 69 |     memory_delta = new_memory_usage - previous_call_memory_usage
 70 |     keep_watching = False
 71 |     total_memory = psutil.virtual_memory()[0] / 1024 / 1024  # in Mb
 72 |     # calculate time delta using global t1 (from the pre-run event) and current time
 73 |     time_delta_secs = time.time() - t1
 74 |     num_commands = len(input_cells) - 1
 75 |     cmd = "In [{}]".format(num_commands)
 76 |     # convert the results into a pretty string
 77 |     output_template = (
 78 |         "{cmd} used {memory_delta:0.4f} Mb RAM in "
 79 |         "{time_delta:0.2f}s, total RAM usage "
 80 |         "{memory_usage:0.2f} Mb, total RAM "
 81 |         "memory {total_memory:0.2f} Mb"
 82 |     )
 83 |     output = output_template.format(
 84 |         time_delta=time_delta_secs,
 85 |         cmd=cmd,
 86 |         memory_delta=memory_delta,
 87 |         memory_usage=new_memory_usage,
 88 |         total_memory=total_memory,
 89 |     )
 90 |     if watching_memory:
 91 |         print(str(output))
 92 |     previous_call_memory_usage = new_memory_usage
 93 | 
 94 | 
 95 | def pre_run_cell():
 96 |     """Capture current time before we execute the current command"""
 97 |     global t1
 98 |     t1 = time.time()
 99 | 
100 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/common/notebook_memory_management.py:
--------------------------------------------------------------------------------
  1 | # Original code: https://raw.githubusercontent.com/miguelgfierro/codebase/master/python/system/notebook_memory_management.py
  2 | #
  3 | # Profile memory usage envelope of IPython commands and report interactively.
  4 | # Usage (inside a python notebook):
  5 | #   from notebook_memory_management import start_watching_memory, stop_watching_memory
  6 | # To start profile:
  7 | #   start_watching_memory()
  8 | # To stop profile:
  9 | #   stop_watching_memory()
 10 | #
 11 | # Based on: https://github.com/ianozsvald/ipython_memory_usage
 12 | #
 13 | 
 14 | from __future__ import division  # 1/2 == 0.5, as in Py3
 15 | from __future__ import absolute_import  # avoid hiding global modules with locals
 16 | from __future__ import print_function  # force use of print("hello")
 17 | from __future__ import (
 18 |     unicode_literals
 19 | )  # force unadorned strings "" to be unicode without prepending u""
 20 | import time
 21 | import memory_profiler
 22 | from IPython import get_ipython
 23 | import psutil
 24 | import warnings
 25 | 
 26 | 
 27 | # keep a global accounting for the last known memory usage
 28 | # which is the reference point for the memory delta calculation
 29 | previous_call_memory_usage = memory_profiler.memory_usage()[0]
 30 | t1 = time.time()  # will be set to current time later
 31 | keep_watching = True
 32 | watching_memory = True
 33 | try:
 34 |     input_cells = get_ipython().user_ns["In"]
 35 | except:
 36 |     warnings.warn("Not running on notebook")
 37 | 
 38 | 
 39 | def start_watching_memory():
 40 |     """Register memory profiling tools to IPython instance."""
 41 |     global watching_memory
 42 |     watching_memory = True
 43 |     ip = get_ipython()
 44 |     ip.events.register("post_run_cell", watch_memory)
 45 |     ip.events.register("pre_run_cell", pre_run_cell)
 46 | 
 47 | 
 48 | def stop_watching_memory():
 49 |     """Unregister memory profiling tools from IPython instance."""
 50 |     global watching_memory
 51 |     watching_memory = False
 52 |     ip = get_ipython()
 53 |     try:
 54 |         ip.events.unregister("post_run_cell", watch_memory)
 55 |     except ValueError:
 56 |         print("ERROR: problem when unregistering")
 57 |         pass
 58 |     try:
 59 |         ip.events.unregister("pre_run_cell", pre_run_cell)
 60 |     except ValueError:
 61 |         print("ERROR: problem when unregistering")
 62 |         pass
 63 | 
 64 | 
 65 | def watch_memory():
 66 |     # bring in the global memory usage value from the previous iteration
 67 |     global previous_call_memory_usage, keep_watching, watching_memory, input_cells
 68 |     new_memory_usage = memory_profiler.memory_usage()[0]
 69 |     memory_delta = new_memory_usage - previous_call_memory_usage
 70 |     keep_watching = False
 71 |     total_memory = psutil.virtual_memory()[0] / 1024 / 1024  # in Mb
 72 |     # calculate time delta using global t1 (from the pre-run event) and current time
 73 |     time_delta_secs = time.time() - t1
 74 |     num_commands = len(input_cells) - 1
 75 |     cmd = "In [{}]".format(num_commands)
 76 |     # convert the results into a pretty string
 77 |     output_template = (
 78 |         "{cmd} used {memory_delta:0.4f} Mb RAM in "
 79 |         "{time_delta:0.2f}s, total RAM usage "
 80 |         "{memory_usage:0.2f} Mb, total RAM "
 81 |         "memory {total_memory:0.2f} Mb"
 82 |     )
 83 |     output = output_template.format(
 84 |         time_delta=time_delta_secs,
 85 |         cmd=cmd,
 86 |         memory_delta=memory_delta,
 87 |         memory_usage=new_memory_usage,
 88 |         total_memory=total_memory,
 89 |     )
 90 |     if watching_memory:
 91 |         print(str(output))
 92 |     previous_call_memory_usage = new_memory_usage
 93 | 
 94 | 
 95 | def pre_run_cell():
 96 |     """Capture current time before we execute the current command"""
 97 |     global t1
 98 |     t1 = time.time()
 99 | 
100 | 


--------------------------------------------------------------------------------
/tests/unit/test_wide_deep_utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Microsoft Corporation. All rights reserved.
  2 | # Licensed under the MIT License.
  3 | 
  4 | import pytest
  5 | import shutil
  6 | 
  7 | import pandas as pd
  8 | import tensorflow as tf
  9 | 
 10 | from reco_utils.common.tf_utils import (
 11 |     pandas_input_fn,
 12 |     MODEL_DIR
 13 | )
 14 | from reco_utils.recommender.wide_deep.wide_deep_utils import (
 15 |     build_model,
 16 |     build_feature_columns,
 17 | )
 18 | from reco_utils.common.constants import (
 19 |     DEFAULT_USER_COL,
 20 |     DEFAULT_ITEM_COL,
 21 |     DEFAULT_RATING_COL
 22 | )
 23 | 
 24 | ITEM_FEAT_COL = 'itemFeat'
 25 | 
 26 | 
 27 | @pytest.fixture(scope='module')
 28 | def pd_df():
 29 |     df = pd.DataFrame(
 30 |         {
 31 |             DEFAULT_USER_COL: [1, 1, 1, 2, 2, 2],
 32 |             DEFAULT_ITEM_COL: [1, 2, 3, 1, 4, 5],
 33 |             ITEM_FEAT_COL: [[1, 1, 1], [2, 2, 2], [3, 3, 3], [1, 1, 1], [4, 4, 4], [5, 5, 5]],
 34 |             DEFAULT_RATING_COL: [5, 4, 3, 5, 5, 3],
 35 |         }
 36 |     )
 37 |     users = df.drop_duplicates(DEFAULT_USER_COL)[DEFAULT_USER_COL].values
 38 |     items = df.drop_duplicates(DEFAULT_ITEM_COL)[DEFAULT_ITEM_COL].values
 39 |     return df, users, items
 40 | 
 41 | 
 42 | @pytest.mark.gpu
 43 | def test_build_feature_columns(pd_df):
 44 |     data, users, items = pd_df
 45 | 
 46 |     # Test if wide column has one crossed column
 47 |     wide_columns, _ = build_feature_columns(users, items, model_type='wide')
 48 |     assert len(wide_columns) == 1
 49 | 
 50 |     # Test if deep columns have user and item columns
 51 |     _, deep_columns = build_feature_columns(users, items, model_type='deep')
 52 |     assert len(deep_columns) == 2
 53 | 
 54 |     # Test if wide and deep columns have correct columns
 55 |     wide_columns, deep_columns = build_feature_columns(users, items, model_type='wide_deep')
 56 |     assert len(wide_columns) == 1
 57 |     assert len(deep_columns) == 2
 58 | 
 59 | 
 60 | @pytest.mark.gpu
 61 | def test_build_model(pd_df):
 62 |     data, users, items = pd_df
 63 | 
 64 |     # Test wide model
 65 |     wide_columns, deep_columns = build_feature_columns(users, items, model_type='wide')
 66 |     model = build_model('wide_'+MODEL_DIR, wide_columns=wide_columns)
 67 |     assert isinstance(model, tf.estimator.LinearRegressor)
 68 |     model = build_model('wide_'+MODEL_DIR, wide_columns=wide_columns, deep_columns=deep_columns)
 69 |     assert isinstance(model, tf.estimator.LinearRegressor)
 70 | 
 71 |     # Test if model train works
 72 |     model.train(
 73 |         input_fn=pandas_input_fn(df=data, y_col=DEFAULT_RATING_COL, batch_size=1, num_epochs=10, shuffle=True)
 74 |     )
 75 |     shutil.rmtree('wide_' + MODEL_DIR, ignore_errors=True)
 76 | 
 77 |     # Test deep model
 78 |     wide_columns, deep_columns = build_feature_columns(users, items, model_type='deep')
 79 |     model = build_model('deep_'+MODEL_DIR, deep_columns=deep_columns)
 80 |     assert isinstance(model, tf.estimator.DNNRegressor)
 81 |     model = build_model('deep_'+MODEL_DIR, wide_columns=wide_columns, deep_columns=deep_columns)
 82 |     assert isinstance(model, tf.estimator.DNNRegressor)
 83 | 
 84 |     # Test if model train works
 85 |     model.train(
 86 |         input_fn=pandas_input_fn(df=data, y_col=DEFAULT_RATING_COL, batch_size=1, num_epochs=10, shuffle=True)
 87 |     )
 88 |     shutil.rmtree('deep_' + MODEL_DIR, ignore_errors=True)
 89 | 
 90 |     # Test wide_deep model
 91 |     wide_columns, deep_columns = build_feature_columns(users, items, model_type='wide_deep')
 92 |     model = build_model('wide_deep_'+MODEL_DIR, wide_columns=wide_columns, deep_columns=deep_columns)
 93 |     assert isinstance(model, tf.estimator.DNNLinearCombinedRegressor)
 94 | 
 95 |     # Test if model train works
 96 |     model.train(
 97 |         input_fn=pandas_input_fn(df=data, y_col=DEFAULT_RATING_COL, batch_size=1, num_epochs=10, shuffle=True)
 98 |     )
 99 |     shutil.rmtree('wide_deep_'+MODEL_DIR, ignore_errors=True)
100 | 


--------------------------------------------------------------------------------
/tests/unit/test_surprise_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | """
 5 | Test utils for Surprise algos
 6 | """
 7 | import pandas as pd
 8 | import pytest
 9 | 
10 | import surprise
11 | 
12 | from reco_utils.recommender.surprise.surprise_utils import (
13 |     compute_rating_predictions,
14 |     compute_ranking_predictions
15 | )
16 | from tests.unit.test_python_evaluation import python_data
17 | 
18 | TOL = 0.001
19 | 
20 | 
21 | def test_compute_rating_predictions(python_data):
22 |     rating_true, _, _ = python_data(binary_rating=False)
23 |     svd = surprise.SVD()
24 |     train_set = surprise.Dataset.load_from_df(rating_true, reader=surprise.Reader()).build_full_trainset()
25 |     svd.fit(train_set)
26 | 
27 |     preds = compute_rating_predictions(svd, rating_true)
28 |     assert set(preds.columns) == {'userID', 'itemID', 'prediction'}
29 |     assert preds['userID'].dtypes == rating_true['userID'].dtypes
30 |     assert preds['itemID'].dtypes == rating_true['itemID'].dtypes
31 |     user = rating_true.iloc[0]['userID']
32 |     item = rating_true.iloc[0]['itemID']
33 |     assert preds[(preds['userID'] == user) & (preds['itemID'] == item)]['prediction'].values == \
34 |            pytest.approx(svd.predict(user, item).est, rel=TOL)
35 | 
36 |     preds = compute_rating_predictions(svd, rating_true.rename(columns={'userID': 'uid', 'itemID': 'iid'}),
37 |                                        usercol='uid', itemcol='iid', predcol='pred')
38 |     assert set(preds.columns) == {'uid', 'iid', 'pred'}
39 |     assert preds['uid'].dtypes == rating_true['userID'].dtypes
40 |     assert preds['iid'].dtypes == rating_true['itemID'].dtypes
41 |     user = rating_true.iloc[1]['userID']
42 |     item = rating_true.iloc[1]['itemID']
43 |     assert preds[(preds['uid'] == user) & (preds['iid'] == item)]['pred'].values == \
44 |            pytest.approx(svd.predict(user, item).est, rel=TOL)
45 | 
46 | 
47 | def test_compute_ranking_predictions(python_data):
48 |     rating_true, _, _ = python_data(binary_rating=False)
49 |     n_users = len(rating_true['userID'].unique())
50 |     n_items = len(rating_true['itemID'].unique())
51 |     svd = surprise.SVD()
52 |     train_set = surprise.Dataset.load_from_df(rating_true, reader=surprise.Reader()).build_full_trainset()
53 |     svd.fit(train_set)
54 | 
55 |     preds = compute_ranking_predictions(svd, rating_true)
56 |     assert set(preds.columns) == {'userID', 'itemID', 'prediction'}
57 |     assert preds['userID'].dtypes == rating_true['userID'].dtypes
58 |     assert preds['itemID'].dtypes == rating_true['itemID'].dtypes
59 |     user = preds.iloc[0]['userID']
60 |     item = preds.iloc[0]['itemID']
61 |     assert preds[(preds['userID'] == user) & (preds['itemID'] == item)]['prediction'].values == \
62 |            pytest.approx(svd.predict(user, item).est, rel=TOL)
63 |     # Test default recommend_seen=False
64 |     assert pd.merge(rating_true, preds, on=['userID', 'itemID']).shape[0] == 0
65 |     assert preds.shape[0] == (n_users * n_items - rating_true.shape[0])
66 | 
67 |     preds = compute_ranking_predictions(svd,
68 |                                         rating_true.rename(columns={'userID': 'uid', 'itemID': 'iid', 'rating': 'r'}),
69 |                                         usercol='uid', itemcol='iid', predcol='pred', recommend_seen=True)
70 |     assert set(preds.columns) == {'uid', 'iid', 'pred'}
71 |     assert preds['uid'].dtypes == rating_true['userID'].dtypes
72 |     assert preds['iid'].dtypes == rating_true['itemID'].dtypes
73 |     user = preds.iloc[1]['uid']
74 |     item = preds.iloc[1]['iid']
75 |     assert preds[(preds['uid'] == user) & (preds['iid'] == item)]['pred'].values == \
76 |            pytest.approx(svd.predict(user, item).est, rel=TOL)
77 |     # Test recommend_seen=True
78 |     assert pd.merge(rating_true, preds, left_on=['userID', 'itemID'], right_on=['uid', 'iid']).shape[0] == \
79 |            rating_true.shape[0]
80 |     assert preds.shape[0] == n_users * n_items
81 | 


--------------------------------------------------------------------------------
/scripts/prepare_databricks_for_o16n.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | # ---------------------------------------------------------
 4 | # This script installs appropriate external libraries onto
 5 | # a databricks cluster for operationalization.
 6 | 
 7 | DATABRICKS_CLI=$(which databricks)
 8 | if ! [ -x "$DATABRICKS_CLI" ]; then
 9 |     echo "No databricks-cli found!! Please see the SETUP.md file for installation prerequisites."
10 |     exit 1
11 | fi
12 | 
13 | CLUSTER_ID=$1
14 | if [ -z $CLUSTER_ID ]; then
15 |     echo "Please provide the target cluster id: 'prepare_databricks_for_016n.sh <CLUSTER_ID>'."
16 |     echo "Cluster id can be found by running 'databricks clusters list'"
17 |     echo "which returns a list of <CLUSTER_ID> <CLUSTER_NAME> <STATUS>."
18 |     exit 1
19 | fi
20 | 
21 | ## for spark version >=2.3.0
22 | COSMOSDB_CONNECTOR_URL="https://search.maven.org/remotecontent?filepath=com/microsoft/azure/azure-cosmosdb-spark_2.3.0_2.11/1.2.2/azure-cosmosdb-spark_2.3.0_2.11-1.2.2-uber.jar"
23 | COSMOSDB_CONNECTOR_BASENAME=$(basename $COSMOSDB_CONNECTOR_URL)
24 | 
25 | CLUSTER_EXIST=false
26 | PYPI_LIBRARIES=( "azure-cli==2.0.56" "azureml-sdk[databricks]==1.0.8" "pydocumentdb==2.3.3" )
27 | while IFS=' ' read -ra ARR; do
28 |     if [ ${ARR[0]} = $CLUSTER_ID ]; then
29 |         CLUSTER_EXIST=true
30 | 
31 |         STATUS=${ARR[2]}
32 |         STATUS=${STATUS//[^a-zA-Z]/}
33 |         if [ $STATUS = RUNNING ]; then
34 |             ## install each of the pypi libraries
35 |             for lib in "${PYPI_LIBRARIES[@]}"
36 |             do
37 |                 echo
38 |                 echo "Adding $lib"
39 |                 echo
40 |                 databricks libraries install --cluster-id $CLUSTER_ID --pypi-package $lib
41 |             done
42 | 
43 |             ## get spark-cosmosdb connector:
44 |             echo
45 |             echo "downloading cosmosdb connector jar file"
46 |             echo
47 |             curl -O $COSMOSDB_CONNECTOR_URL
48 |             
49 |             ## uplaod the jar to dbfs
50 |             echo
51 |             echo "Uploading to dbfs"
52 |             echo
53 |             dbfs cp --overwrite ${COSMOSDB_CONNECTOR_BASENAME} dbfs:/FileStore/jars/${COSMOSDB_CONNECTOR_BASENAME}
54 | 
55 |             # isntall from dbfs
56 |             echo
57 |             echo "Adding ${COSMOSDB_CONNECTOR_BASENAME} as library"
58 |             echo
59 |             databricks libraries install --cluster-id $CLUSTER_ID --jar dbfs:/FileStore/jars/${COSMOSDB_CONNECTOR_BASENAME}
60 | 
61 |             ## Check installation status
62 |             echo
63 |             echo "Done! Installation status checking..."
64 |             databricks libraries cluster-status --cluster-id $CLUSTER_ID
65 | 
66 |             echo
67 |             echo "Restarting the cluster to activate the library..."
68 |             databricks clusters restart --cluster-id $CLUSTER_ID
69 | 
70 |             echo "This will take few seconds. Please check the result from Databricks workspace."
71 |             echo "Alternatively, run 'databricks clusters list' to check the restart status and"
72 |             echo "run 'databricks libraries cluster-status --cluster-id $CLUSTER_ID' to check the installation status."
73 | 
74 |             exit 0
75 |         else
76 |             echo "Cluster $CLUSTER_ID found, but it is not running. Status=${STATUS}"
77 |             echo "You can start the cluster with 'databricks clusters start --cluster-id $CLUSTER_ID'."
78 |             echo "Then, check the cluster status by using 'databricks clusters list' and"
79 |             echo "re-try installation once the status turns into RUNNING."
80 |             exit 1
81 |         fi
82 |     fi
83 | done < <(databricks clusters list)
84 | 
85 | if ! [ $CLUSTER_EXIST = true ]; then
86 |     echo "Cannot find the target cluster $CLUSTER_ID. Please check if you entered the valid id."
87 |     echo "Cluster id can be found by running 'databricks clusters list'"
88 |     echo "which returns a list of <CLUSTER_ID> <CLUSTER_NAME> <STATUS>."
89 |     exit 1
90 | fi
91 | 
92 | 


--------------------------------------------------------------------------------
/notebooks/scripts/prepare_databricks_for_o16n.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | # ---------------------------------------------------------
 4 | # This script installs appropriate external libraries onto
 5 | # a databricks cluster for operationalization.
 6 | 
 7 | DATABRICKS_CLI=$(which databricks)
 8 | if ! [ -x "$DATABRICKS_CLI" ]; then
 9 |     echo "No databricks-cli found!! Please see the SETUP.md file for installation prerequisites."
10 |     exit 1
11 | fi
12 | 
13 | CLUSTER_ID=$1
14 | if [ -z $CLUSTER_ID ]; then
15 |     echo "Please provide the target cluster id: 'prepare_databricks_for_016n.sh <CLUSTER_ID>'."
16 |     echo "Cluster id can be found by running 'databricks clusters list'"
17 |     echo "which returns a list of <CLUSTER_ID> <CLUSTER_NAME> <STATUS>."
18 |     exit 1
19 | fi
20 | 
21 | ## for spark version >=2.3.0
22 | COSMOSDB_CONNECTOR_URL="https://search.maven.org/remotecontent?filepath=com/microsoft/azure/azure-cosmosdb-spark_2.3.0_2.11/1.2.2/azure-cosmosdb-spark_2.3.0_2.11-1.2.2-uber.jar"
23 | COSMOSDB_CONNECTOR_BASENAME=$(basename $COSMOSDB_CONNECTOR_URL)
24 | 
25 | CLUSTER_EXIST=false
26 | PYPI_LIBRARIES=( "azure-cli==2.0.56" "azureml-sdk[databricks]==1.0.8" "pydocumentdb==2.3.3" )
27 | while IFS=' ' read -ra ARR; do
28 |     if [ ${ARR[0]} = $CLUSTER_ID ]; then
29 |         CLUSTER_EXIST=true
30 | 
31 |         STATUS=${ARR[2]}
32 |         STATUS=${STATUS//[^a-zA-Z]/}
33 |         if [ $STATUS = RUNNING ]; then
34 |             ## install each of the pypi libraries
35 |             for lib in "${PYPI_LIBRARIES[@]}"
36 |             do
37 |                 echo
38 |                 echo "Adding $lib"
39 |                 echo
40 |                 databricks libraries install --cluster-id $CLUSTER_ID --pypi-package $lib
41 |             done
42 | 
43 |             ## get spark-cosmosdb connector:
44 |             echo
45 |             echo "downloading cosmosdb connector jar file"
46 |             echo
47 |             curl -O $COSMOSDB_CONNECTOR_URL
48 |             
49 |             ## uplaod the jar to dbfs
50 |             echo
51 |             echo "Uploading to dbfs"
52 |             echo
53 |             dbfs cp --overwrite ${COSMOSDB_CONNECTOR_BASENAME} dbfs:/FileStore/jars/${COSMOSDB_CONNECTOR_BASENAME}
54 | 
55 |             # isntall from dbfs
56 |             echo
57 |             echo "Adding ${COSMOSDB_CONNECTOR_BASENAME} as library"
58 |             echo
59 |             databricks libraries install --cluster-id $CLUSTER_ID --jar dbfs:/FileStore/jars/${COSMOSDB_CONNECTOR_BASENAME}
60 | 
61 |             ## Check installation status
62 |             echo
63 |             echo "Done! Installation status checking..."
64 |             databricks libraries cluster-status --cluster-id $CLUSTER_ID
65 | 
66 |             echo
67 |             echo "Restarting the cluster to activate the library..."
68 |             databricks clusters restart --cluster-id $CLUSTER_ID
69 | 
70 |             echo "This will take few seconds. Please check the result from Databricks workspace."
71 |             echo "Alternatively, run 'databricks clusters list' to check the restart status and"
72 |             echo "run 'databricks libraries cluster-status --cluster-id $CLUSTER_ID' to check the installation status."
73 | 
74 |             exit 0
75 |         else
76 |             echo "Cluster $CLUSTER_ID found, but it is not running. Status=${STATUS}"
77 |             echo "You can start the cluster with 'databricks clusters start --cluster-id $CLUSTER_ID'."
78 |             echo "Then, check the cluster status by using 'databricks clusters list' and"
79 |             echo "re-try installation once the status turns into RUNNING."
80 |             exit 1
81 |         fi
82 |     fi
83 | done < <(databricks clusters list)
84 | 
85 | if ! [ $CLUSTER_EXIST = true ]; then
86 |     echo "Cannot find the target cluster $CLUSTER_ID. Please check if you entered the valid id."
87 |     echo "Cluster id can be found by running 'databricks clusters list'"
88 |     echo "which returns a list of <CLUSTER_ID> <CLUSTER_NAME> <STATUS>."
89 |     exit 1
90 | fi
91 | 
92 | 


--------------------------------------------------------------------------------
/reco_utils/recommender/surprise/surprise_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import pandas as pd
 5 | import numpy as np
 6 | import pandas as pd
 7 | 
 8 | from reco_utils.common.constants import DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_PREDICTION_COL
 9 | from reco_utils.common.general_utils import invert_dictionary
10 | 
11 | 
12 | def surprise_trainset_to_df(
13 |     trainset, col_user="uid", col_item="iid", col_rating="rating"
14 | ):
15 |     """Converts a surprise.Trainset object to pd.DataFrame
16 |     More info: https://surprise.readthedocs.io/en/stable/trainset.html
17 | 
18 |     Args:
19 |         trainset (obj): A surprise.Trainset object.
20 |         col_user (str): User column name.
21 |         col_item (str): Item column name.
22 |         col_rating (str): Rating column name.
23 |     
24 |     Returns:
25 |         pd.DataFrame: A dataframe. The user and item columns are strings and the rating columns are floats.
26 |     """
27 |     df = pd.DataFrame(trainset.all_ratings(), columns=[col_user, col_item, col_rating])
28 |     map_user = (
29 |         trainset._inner2raw_id_users
30 |         if trainset._inner2raw_id_users is not None
31 |         else invert_dictionary(trainset._raw2inner_id_users)
32 |     )
33 |     map_item = (
34 |         trainset._inner2raw_id_items
35 |         if trainset._inner2raw_id_items is not None
36 |         else invert_dictionary(trainset._raw2inner_id_items)
37 |     )
38 |     df[col_user] = df[col_user].map(map_user)
39 |     df[col_item] = df[col_item].map(map_item)
40 |     return df
41 | 
42 | 
43 | def compute_rating_predictions(algo, data, usercol=DEFAULT_USER_COL, itemcol=DEFAULT_ITEM_COL, predcol=DEFAULT_PREDICTION_COL):
44 |     """Computes predictions of an algorithm from Surprise on the data. Can be used for computing rating metrics like RMSE.
45 |     
46 |     Args:
47 |         algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise
48 |         data (pd.DataFrame): the data on which to predict
49 |         usercol (str): name of the user column
50 |         itemcol (str): name of the item column
51 |     
52 |     Returns:
53 |         pd.DataFrame: dataframe with usercol, itemcol, predcol
54 |     """
55 |     predictions = [algo.predict(getattr(row, usercol), getattr(row, itemcol)) for row in data.itertuples()]
56 |     predictions = pd.DataFrame(predictions)
57 |     predictions = predictions.rename(index=str, columns={'uid': usercol, 'iid': itemcol, 'est': predcol})
58 |     return predictions.drop(['details', 'r_ui'], axis='columns')
59 | 
60 | 
61 | def compute_ranking_predictions(algo, data, usercol=DEFAULT_USER_COL, itemcol=DEFAULT_ITEM_COL,
62 |                             predcol=DEFAULT_PREDICTION_COL, recommend_seen=False):
63 |     """Computes predictions of an algorithm from Surprise on all users and items in data. can be used for computing
64 |     ranking metrics like NDCG.
65 |     
66 |     Args:
67 |         algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise
68 |         data (pd.DataFrame): the data from which to get the users and items
69 |         usercol (str): name of the user column
70 |         itemcol (str): name of the item column
71 |         recommend_seen (bool): flag to include (user, item) pairs that appear in data
72 |     
73 |     Returns:
74 |         pd.DataFrame: dataframe with usercol, itemcol, predcol
75 |     """
76 |     preds_lst = []
77 |     for user in data[usercol].unique():
78 |         for item in data[itemcol].unique():
79 |             preds_lst.append([user, item, algo.predict(user, item).est])
80 | 
81 |     all_predictions = pd.DataFrame(data=preds_lst, columns=[usercol, itemcol, predcol])
82 | 
83 |     if recommend_seen:
84 |         return all_predictions
85 |     else:
86 |         tempdf = pd.concat([data[[usercol, itemcol]],
87 |                             pd.DataFrame(data=np.ones(data.shape[0]), columns=['dummycol'], index=data.index)],
88 |                             axis=1)
89 |         merged = pd.merge(tempdf, all_predictions, on=[usercol, itemcol], how="outer")
90 |         return merged[merged['dummycol'].isnull()].drop('dummycol', axis=1)
91 | 
92 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/recommender/surprise/surprise_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation. All rights reserved.
 2 | # Licensed under the MIT License.
 3 | 
 4 | import pandas as pd
 5 | import numpy as np
 6 | import pandas as pd
 7 | 
 8 | from reco_utils.common.constants import DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_PREDICTION_COL
 9 | from reco_utils.common.general_utils import invert_dictionary
10 | 
11 | 
12 | def surprise_trainset_to_df(
13 |     trainset, col_user="uid", col_item="iid", col_rating="rating"
14 | ):
15 |     """Converts a surprise.Trainset object to pd.DataFrame
16 |     More info: https://surprise.readthedocs.io/en/stable/trainset.html
17 | 
18 |     Args:
19 |         trainset (obj): A surprise.Trainset object.
20 |         col_user (str): User column name.
21 |         col_item (str): Item column name.
22 |         col_rating (str): Rating column name.
23 |     
24 |     Returns:
25 |         pd.DataFrame: A dataframe. The user and item columns are strings and the rating columns are floats.
26 |     """
27 |     df = pd.DataFrame(trainset.all_ratings(), columns=[col_user, col_item, col_rating])
28 |     map_user = (
29 |         trainset._inner2raw_id_users
30 |         if trainset._inner2raw_id_users is not None
31 |         else invert_dictionary(trainset._raw2inner_id_users)
32 |     )
33 |     map_item = (
34 |         trainset._inner2raw_id_items
35 |         if trainset._inner2raw_id_items is not None
36 |         else invert_dictionary(trainset._raw2inner_id_items)
37 |     )
38 |     df[col_user] = df[col_user].map(map_user)
39 |     df[col_item] = df[col_item].map(map_item)
40 |     return df
41 | 
42 | 
43 | def compute_rating_predictions(algo, data, usercol=DEFAULT_USER_COL, itemcol=DEFAULT_ITEM_COL, predcol=DEFAULT_PREDICTION_COL):
44 |     """Computes predictions of an algorithm from Surprise on the data. Can be used for computing rating metrics like RMSE.
45 |     
46 |     Args:
47 |         algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise
48 |         data (pd.DataFrame): the data on which to predict
49 |         usercol (str): name of the user column
50 |         itemcol (str): name of the item column
51 |     
52 |     Returns:
53 |         pd.DataFrame: dataframe with usercol, itemcol, predcol
54 |     """
55 |     predictions = [algo.predict(getattr(row, usercol), getattr(row, itemcol)) for row in data.itertuples()]
56 |     predictions = pd.DataFrame(predictions)
57 |     predictions = predictions.rename(index=str, columns={'uid': usercol, 'iid': itemcol, 'est': predcol})
58 |     return predictions.drop(['details', 'r_ui'], axis='columns')
59 | 
60 | 
61 | def compute_ranking_predictions(algo, data, usercol=DEFAULT_USER_COL, itemcol=DEFAULT_ITEM_COL,
62 |                             predcol=DEFAULT_PREDICTION_COL, recommend_seen=False):
63 |     """Computes predictions of an algorithm from Surprise on all users and items in data. can be used for computing
64 |     ranking metrics like NDCG.
65 |     
66 |     Args:
67 |         algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise
68 |         data (pd.DataFrame): the data from which to get the users and items
69 |         usercol (str): name of the user column
70 |         itemcol (str): name of the item column
71 |         recommend_seen (bool): flag to include (user, item) pairs that appear in data
72 |     
73 |     Returns:
74 |         pd.DataFrame: dataframe with usercol, itemcol, predcol
75 |     """
76 |     preds_lst = []
77 |     for user in data[usercol].unique():
78 |         for item in data[itemcol].unique():
79 |             preds_lst.append([user, item, algo.predict(user, item).est])
80 | 
81 |     all_predictions = pd.DataFrame(data=preds_lst, columns=[usercol, itemcol, predcol])
82 | 
83 |     if recommend_seen:
84 |         return all_predictions
85 |     else:
86 |         tempdf = pd.concat([data[[usercol, itemcol]],
87 |                             pd.DataFrame(data=np.ones(data.shape[0]), columns=['dummycol'], index=data.index)],
88 |                             axis=1)
89 |         merged = pd.merge(tempdf, all_predictions, on=[usercol, itemcol], how="outer")
90 |         return merged[merged['dummycol'].isnull()].drop('dummycol', axis=1)
91 | 
92 | 


--------------------------------------------------------------------------------
/reco_utils/nni/nni_utils.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import requests
  4 | import time
  5 | 
  6 | NNI_REST_ENDPOINT = 'http://localhost:8080/api/v1/nni'
  7 | NNI_STATUS_URL = NNI_REST_ENDPOINT + '/check-status'
  8 | NNI_TRIAL_JOBS_URL = NNI_REST_ENDPOINT + '/trial-jobs'
  9 | WAITING_TIME = 20
 10 | MAX_RETRIES = 5
 11 | 
 12 | 
 13 | def get_experiment_status(status_url):
 14 |     """
 15 |     Helper method. Gets the experiment status from the REST endpoint
 16 | 
 17 |     Args:
 18 |         status_url (str): URL for the REST endpoint
 19 | 
 20 |     Returns:
 21 |         str: status of the experiment
 22 |     """
 23 |     nni_status = requests.get(status_url).json()
 24 |     return nni_status['status']
 25 | 
 26 | 
 27 | def check_experiment_status(wait=WAITING_TIME, max_retries=MAX_RETRIES):
 28 |     """ Checks the status of the current experiment on the NNI REST endpoint
 29 |     Waits until the tuning has completed
 30 | 
 31 |     Args:
 32 |         wait (numeric) : time to wait in seconds
 33 |         max_retries (int): max number of retries
 34 |     """
 35 |     i = 0
 36 |     while i < max_retries:
 37 |         status = get_experiment_status(NNI_STATUS_URL)
 38 |         if status in ['DONE', 'TUNER_NO_MORE_TRIAL']:
 39 |             break
 40 |         elif status not in ['RUNNING', 'NO_MORE_TRIAL']:
 41 |             raise RuntimeError("NNI experiment failed to complete with status {}".format(status))
 42 |         time.sleep(wait)
 43 |         i += 1
 44 |     if i == max_retries:
 45 |         raise TimeoutError("check_experiment_status() timed out")
 46 | 
 47 | 
 48 | def check_stopped(wait=WAITING_TIME, max_retries=MAX_RETRIES):
 49 |     """
 50 |     Checks that there is no NNI experiment active (the URL is not accessible)
 51 |     This method should be called after 'nnictl stop' for verification
 52 | 
 53 |     Args:
 54 |         wait (numeric) : time to wait in seconds
 55 |         max_retries (int): max number of retries
 56 |     """
 57 |     i = 0
 58 |     while i < max_retries:
 59 |         try:
 60 |             get_experiment_status(NNI_STATUS_URL)
 61 |         except:
 62 |             break
 63 |         time.sleep(wait)
 64 |         i += 1
 65 |     if i == max_retries:
 66 |         raise TimeoutError("check_stopped() timed out")
 67 | 
 68 | 
 69 | def check_metrics_written(wait=WAITING_TIME, max_retries=MAX_RETRIES):
 70 |     """
 71 |     Waits until the metrics have been written to the trial logs
 72 |     """
 73 |     i = 0
 74 |     while i < max_retries:
 75 |         all_trials = requests.get(NNI_TRIAL_JOBS_URL).json()
 76 |         if all(['finalMetricData' in trial for trial in all_trials]):
 77 |             break
 78 |         time.sleep(wait)
 79 |         i += 1
 80 |     if i == max_retries:
 81 |         raise TimeoutError("check_metrics_written() timed out")
 82 | 
 83 | 
 84 | def get_trials(optimize_mode):
 85 |     """    Obtain information about the trials of the current experiment via the REST endpoint
 86 | 
 87 |     Args:
 88 |         optimize_mode (str): One of 'minimize', 'maximize'. Determines how to obtain the best default metric.
 89 | 
 90 |     Returns:
 91 |          list: Trials info, list of (metrics, log path)
 92 |          dict: Metrics for the best choice of hyperparameters
 93 |          dict: Best hyperparameters
 94 |          str: Log path for the best trial
 95 |     """
 96 | 
 97 |     if optimize_mode not in ['minimize', 'maximize']:
 98 |         raise ValueError("optimize_mode should equal either 'minimize' or 'maximize'")
 99 |     all_trials = requests.get(NNI_TRIAL_JOBS_URL).json()
100 |     trials = [(eval(trial['finalMetricData'][0]['data']), trial['logPath'].split(':')[-1]) for trial in all_trials]
101 |     sorted_trials = sorted(trials, key=lambda x: x[0]['default'], reverse=(optimize_mode == 'maximize'))
102 |     best_trial_path = sorted_trials[0][1]
103 |     # Read the metrics from the trial directory in order to get the name of the default metric
104 |     with open(os.path.join(best_trial_path, "metrics.json"), "r") as fp:
105 |         best_metrics = json.load(fp)
106 |     with open(os.path.join(best_trial_path, "parameter.cfg"), "r") as fp:
107 |         best_params = json.load(fp)
108 |     return trials, best_metrics, best_params, best_trial_path
109 | 


--------------------------------------------------------------------------------
/tests/unit/test_ncf_dataset.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import pandas as pd
  3 | import numpy as np
  4 | from itertools import product
  5 | import pytest
  6 | 
  7 | from reco_utils.common.constants import (
  8 |     DEFAULT_USER_COL,
  9 |     DEFAULT_ITEM_COL,
 10 |     DEFAULT_RATING_COL,
 11 |     DEFAULT_TIMESTAMP_COL,
 12 | )
 13 | from reco_utils.recommender.ncf.dataset import Dataset
 14 | from tests.ncf_common import python_dataset_ncf, test_specs_ncf
 15 | 
 16 | 
 17 | N_NEG = 5
 18 | N_NEG_TEST = 10
 19 | BATCH_SIZE = 32
 20 | 
 21 | 
 22 | def test_data_preprocessing(python_dataset_ncf):
 23 |     train, test = python_dataset_ncf
 24 |     data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST)
 25 |     
 26 |     # shape
 27 |     assert len(data.train) == len(train)
 28 |     assert len(data.test) == len(test)
 29 | 
 30 |     # index correctness for id2user, user2id, id2item, item2id
 31 |     for data_row, row in zip(data.train.iterrows(), train.iterrows()):
 32 |         assert data_row[1][DEFAULT_USER_COL] == data.user2id[row[1][DEFAULT_USER_COL]]
 33 |         assert row[1][DEFAULT_USER_COL] == data.id2user[data_row[1][DEFAULT_USER_COL]]
 34 |         assert data_row[1][DEFAULT_ITEM_COL] == data.item2id[row[1][DEFAULT_ITEM_COL]]
 35 |         assert row[1][DEFAULT_ITEM_COL] == data.id2item[data_row[1][DEFAULT_ITEM_COL]]
 36 | 
 37 |     for data_row, row in zip(data.test.iterrows(), test.iterrows()):
 38 |         assert data_row[1][DEFAULT_USER_COL] == data.user2id[row[1][DEFAULT_USER_COL]]
 39 |         assert row[1][DEFAULT_USER_COL] == data.id2user[data_row[1][DEFAULT_USER_COL]]
 40 |         assert data_row[1][DEFAULT_ITEM_COL] == data.item2id[row[1][DEFAULT_ITEM_COL]]
 41 |         assert row[1][DEFAULT_ITEM_COL] == data.id2item[data_row[1][DEFAULT_ITEM_COL]]
 42 | 
 43 | 
 44 | def test_train_loader(python_dataset_ncf):
 45 |     train, test = python_dataset_ncf
 46 |     data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST)
 47 | 
 48 |     # collect positvie user-item dict
 49 |     positive_pool = {}
 50 |     for u in train[DEFAULT_USER_COL].unique():
 51 |         positive_pool[u] = set(train[train[DEFAULT_USER_COL] == u][DEFAULT_ITEM_COL])
 52 | 
 53 |     # without negative sampling
 54 |     for batch in data.train_loader(batch_size=BATCH_SIZE, shuffle=False):
 55 |         user, item, labels = batch
 56 |         #shape
 57 |         assert len(user) == BATCH_SIZE
 58 |         assert len(item) == BATCH_SIZE
 59 |         assert len(labels) == BATCH_SIZE
 60 |         assert max(labels) == min(labels)
 61 | 
 62 |         # right labels
 63 |         for u, i, is_pos in zip(user, item, labels):
 64 |             if is_pos:
 65 |                 assert i in positive_pool[u] 
 66 |             else: 
 67 |                 assert i not in positive_pool[u] 
 68 | 
 69 |     data.negative_sampling()
 70 |     label_list = []
 71 |     batches = []
 72 |     for idx, batch in enumerate(data.train_loader(batch_size=1)):
 73 |         user, item, labels = batch
 74 |         assert len(user) == 1
 75 |         assert len(item) == 1
 76 |         assert len(labels) == 1
 77 | 
 78 |         # right labels
 79 |         for u, i, is_pos in zip(user, item, labels):
 80 |             if is_pos:
 81 |                 assert i in positive_pool[u] 
 82 |             else: 
 83 |                 assert i not in positive_pool[u] 
 84 | 
 85 |             label_list.append(is_pos)
 86 | 
 87 |     # neagtive smapling
 88 |     assert len(label_list) == (N_NEG + 1) * sum(label_list)
 89 | 
 90 | 
 91 | def test_test_loader(python_dataset_ncf):
 92 |     train, test = python_dataset_ncf
 93 |     data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST)
 94 | 
 95 |     # positive user-item dict, noting that the pool is train+test
 96 |     positive_pool = {}
 97 |     df = train.append(test)
 98 |     for u in df[DEFAULT_USER_COL].unique(): 
 99 |         positive_pool[u] = set(df[df[DEFAULT_USER_COL] == u][DEFAULT_ITEM_COL])
100 | 
101 |     for batch in data.test_loader():
102 |         user, item, labels = batch
103 |         # shape
104 |         assert len(user) == N_NEG_TEST + 1
105 |         assert len(item) == N_NEG_TEST + 1
106 |         assert len(labels) == N_NEG_TEST + 1
107 | 
108 |         label_list = []
109 | 
110 |         for u, i, is_pos in zip(user, item, labels):
111 |             if is_pos:
112 |                 assert i in positive_pool[u] 
113 |             else: 
114 |                 assert i not in positive_pool[u] 
115 | 
116 |             label_list.append(is_pos)
117 | 
118 |         # leave-one-out
119 |         assert sum(label_list) == 1
120 |         # right labels
121 |         assert len(label_list) == (N_NEG_TEST + 1) * sum(label_list)
122 | 


--------------------------------------------------------------------------------
/tests/unit/test_sparse.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Microsoft Corporation. All rights reserved.
  2 | # Licensed under the MIT License.
  3 | 
  4 | import pandas as pd
  5 | import numpy as np
  6 | import pytest
  7 | 
  8 | from reco_utils.dataset.sparse import AffinityMatrix
  9 | from reco_utils.common.constants import (
 10 |     DEFAULT_USER_COL,
 11 |     DEFAULT_ITEM_COL,
 12 |     DEFAULT_RATING_COL,
 13 |     DEFAULT_TIMESTAMP_COL,
 14 | )
 15 | 
 16 | 
 17 | @pytest.fixture(scope="module")
 18 | def test_specs():
 19 |     return {"number_of_items": 50, "number_of_users": 20, "seed": 123}
 20 | 
 21 | 
 22 | # generate a syntetic dataset
 23 | @pytest.fixture(scope="module")
 24 | def python_dataset(test_specs):
 25 | 
 26 |     """Get Python labels"""
 27 | 
 28 |     def random_date_generator(start_date, range_in_days):
 29 |         """Helper function to generate random timestamps.
 30 | 
 31 |         Reference: https://stackoverflow.com/questions/41006182/generate-random-dates-within-a
 32 |         -range-in-numpy
 33 | 
 34 |         """
 35 | 
 36 |         days_to_add = np.arange(0, range_in_days)
 37 |         random_dates = []
 38 | 
 39 |         for i in range(range_in_days):
 40 |             random_date = np.datetime64(start_date) + np.random.choice(days_to_add)
 41 |             random_dates.append(random_date)
 42 | 
 43 |         return random_dates
 44 | 
 45 |     # fix the the random seed
 46 |     np.random.seed(test_specs["seed"])
 47 | 
 48 |     # generates the user/item affinity matrix. Ratings are from 1 to 5, with 0s denoting unrated items
 49 |     X = np.random.randint(
 50 |         low=0,
 51 |         high=6,
 52 |         size=(test_specs["number_of_users"], test_specs["number_of_items"]),
 53 |     )
 54 | 
 55 |     # In the main code, input data are passed as pandas dataframe. Below we generate such df from the above matrix
 56 |     userids = []
 57 | 
 58 |     for i in range(1, test_specs["number_of_users"] + 1):
 59 |         userids.extend([i] * test_specs["number_of_items"])
 60 | 
 61 |     itemids = [i for i in range(1, test_specs["number_of_items"] + 1)] * test_specs[
 62 |         "number_of_users"
 63 |     ]
 64 |     ratings = np.reshape(X, -1)
 65 | 
 66 |     # create dataframe
 67 |     results = pd.DataFrame.from_dict(
 68 |         {
 69 |             DEFAULT_USER_COL: userids,
 70 |             DEFAULT_ITEM_COL: itemids,
 71 |             DEFAULT_RATING_COL: ratings,
 72 |             DEFAULT_TIMESTAMP_COL: random_date_generator(
 73 |                 "2018-01-01",
 74 |                 test_specs["number_of_users"] * test_specs["number_of_items"],
 75 |             ),
 76 |         }
 77 |     )
 78 | 
 79 |     # here we eliminate the missing ratings to obtain a standard form of the df as that of real data.
 80 |     results = results[results.rating != 0]
 81 | 
 82 |     return results
 83 | 
 84 | 
 85 | def test_df_to_sparse(test_specs, python_dataset):
 86 |     # initialize the splitter
 87 |     header = {
 88 |         "col_user": DEFAULT_USER_COL,
 89 |         "col_item": DEFAULT_ITEM_COL,
 90 |         "col_rating": DEFAULT_RATING_COL,
 91 |     }
 92 | 
 93 |     # instantiate the affinity matrix
 94 |     am = AffinityMatrix(DF=python_dataset, **header)
 95 | 
 96 |     # obtain the sparse matrix representation of the input dataframe
 97 |     X = am.gen_affinity_matrix()
 98 | 
 99 |     # check that the generated matrix has the correct dimensions
100 |     assert (X.shape[0] == python_dataset.userID.unique().shape[0]) & (
101 |         X.shape[1] == python_dataset.itemID.unique().shape[0]
102 |     )
103 | 
104 | 
105 | def test_sparse_to_df(test_specs, python_dataset):
106 |     # initialize the splitter
107 |     header = {
108 |         "col_user": DEFAULT_USER_COL,
109 |         "col_item": DEFAULT_ITEM_COL,
110 |         "col_rating": DEFAULT_RATING_COL,
111 |     }
112 | 
113 |     # instantiate the the affinity matrix
114 |     am = AffinityMatrix(DF=python_dataset, **header)
115 | 
116 |     # generate the sparse matrix representation
117 |     X = am.gen_affinity_matrix()
118 | 
119 |     # use the inverse function to generate a pandas df from a sparse matrix ordered by userID
120 |     DF = am.map_back_sparse(X, kind="ratings")
121 | 
122 |     # tests: check that the two dataframes have the same elements in the same positions.
123 |     assert (
124 |         DF.userID.values.all()
125 |         == python_dataset.sort_values(by=["userID"]).userID.values.all()
126 |     )
127 | 
128 |     assert (
129 |         DF.itemID.values.all()
130 |         == python_dataset.sort_values(by=["userID"]).itemID.values.all()
131 |     )
132 | 
133 |     assert (
134 |         DF.rating.values.all()
135 |         == python_dataset.sort_values(by=["userID"]).rating.values.all()
136 |     )
137 | 


--------------------------------------------------------------------------------
/tests/ci/submitpytest.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Microsoft Corporation. All rights reserved.
  2 | # Licensed under the MIT License.
  3 | 
  4 | import os
  5 | import json
  6 | import azureml.core
  7 | from azureml.core.authentication import AzureCliAuthentication
  8 | from azureml.core import Workspace
  9 | from azureml.core import Experiment
 10 | from azureml.core.runconfig import RunConfiguration
 11 | from azureml.core.conda_dependencies import CondaDependencies
 12 | from azureml.core.runconfig import DEFAULT_CPU_IMAGE
 13 | # uncomment if using gpu
 14 | from azureml.core.runconfig import DEFAULT_GPU_IMAGE
 15 | from azureml.core.script_run_config import ScriptRunConfig
 16 | 
 17 | 
 18 | from azureml.core.compute import ComputeTarget, AmlCompute
 19 | from azureml.core.compute_target import ComputeTargetException
 20 | 
 21 | 
 22 | with open("tests/ci/config.json") as f:
 23 |         config = json.load(f)
 24 | 
 25 |         workspace_name= config["workspace_name"]
 26 |         resource_group = config["resource_group"]
 27 |         subscription_id = config["subscription_id"]
 28 |         location = config["location"]
 29 | 
 30 |         print(" WS name ", workspace_name)
 31 |         print("subscription_id ", subscription_id)
 32 |         print("location",location)
 33 | 
 34 |         cli_auth = AzureCliAuthentication()
 35 |         print("cliauth")
 36 | 
 37 | try:
 38 |     print("Trying to get ws")
 39 |     ws = Workspace.get(
 40 |         name=workspace_name,
 41 |         subscription_id=subscription_id,
 42 |         resource_group=resource_group,
 43 |         auth=cli_auth
 44 |     )
 45 | 
 46 | except Exception:
 47 |     # this call might take a minute or two.
 48 |     print("Creating new workspace")
 49 |     ws = Workspace.create(
 50 |         name=ws,
 51 |         subscription_id=subscription_id,
 52 |         resource_group=resource_group,
 53 |         # create_resource_group=True,
 54 |         location=location,
 55 |         auth=cli_auth
 56 |     )
 57 | 
 58 | # Choose a name for your CPU cluster
 59 | cpu_cluster_name = "persistentcpu"
 60 | #cpu_cluster_name = "cpucluster"
 61 | print("cpu_cluster_name",cpu_cluster_name)
 62 | # Verify that cluster does not exist already
 63 | # https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets
 64 | 
 65 | try:
 66 |     cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
 67 |     print('Found existing cluster, use it.')
 68 | except ComputeTargetException:
 69 |     print("create cluster")
 70 |     compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
 71 |                                                         max_nodes=4)
 72 |     cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
 73 | 
 74 | cpu_cluster.wait_for_completion(show_output=True)
 75 | 
 76 | from azureml.core.runconfig import RunConfiguration
 77 | from azureml.core.conda_dependencies import CondaDependencies
 78 | from azureml.core.runconfig import DEFAULT_CPU_IMAGE
 79 | 
 80 | # Create a new runconfig object
 81 | run_amlcompute = RunConfiguration()
 82 | 
 83 | # Use the cpu_cluster you created above. 
 84 | run_amlcompute.target = cpu_cluster
 85 | 
 86 | # Enable Docker
 87 | run_amlcompute.environment.docker.enabled = True
 88 | 
 89 | # Set Docker base image to the default CPU-based image
 90 | run_amlcompute.environment.docker.base_image = DEFAULT_CPU_IMAGE
 91 | 
 92 | # Use conda_dependencies.yml to create a conda environment in the Docker image for execution
 93 | run_amlcompute.environment.python.user_managed_dependencies = False
 94 | 
 95 | # Auto-prepare the Docker image when used for execution (if it is not already prepared)
 96 | run_amlcompute.auto_prepare_environment = True
 97 | 
 98 | # Specify CondaDependencies obj, add necessary packages
 99 | 
100 | run_amlcompute.environment.python.conda_dependencies = CondaDependencies(
101 |         conda_dependencies_file_path='./reco_base.yaml')
102 | 
103 | from azureml.core import Experiment
104 | experiment_name = 'PersistentAML'
105 | 
106 | experiment = Experiment(workspace=ws, name=experiment_name)
107 | project_folder = "."
108 | script_run_config = ScriptRunConfig(source_directory=project_folder,
109 |                                         script='./tests/ci/runpytest.py',
110 |                                         run_config=run_amlcompute)
111 |                                         
112 | print('before submit')
113 | run = experiment.submit(script_run_config)
114 | print('after submit')
115 | run.wait_for_completion(show_output=True, wait_post_processing=True)
116 | 
117 | # go to azure portal to see log in azure ws and look for experiment name and
118 | # look for individual run
119 | print('files', run.get_file_names())
120 | run.download_files(prefix='reports')
121 | run.tag('persistentaml tag')
122 | 
123 | 


--------------------------------------------------------------------------------
/tests/unit/test_tf_utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Microsoft Corporation. All rights reserved.
  2 | # Licensed under the MIT License.
  3 | 
  4 | import pytest
  5 | import shutil
  6 | 
  7 | import pandas as pd
  8 | import tensorflow as tf
  9 | 
 10 | from reco_utils.common.tf_utils import (
 11 |     pandas_input_fn,
 12 |     build_optimizer,
 13 |     evaluation_log_hook,
 14 |     Logger,
 15 |     MODEL_DIR
 16 | )
 17 | from reco_utils.recommender.wide_deep.wide_deep_utils import (
 18 |     build_model,
 19 |     build_feature_columns,
 20 | )
 21 | from reco_utils.common.constants import (
 22 |     DEFAULT_USER_COL,
 23 |     DEFAULT_ITEM_COL,
 24 |     DEFAULT_RATING_COL
 25 | )
 26 | from reco_utils.evaluation.python_evaluation import rmse
 27 | 
 28 | ITEM_FEAT_COL = 'itemFeat'
 29 | 
 30 | 
 31 | @pytest.fixture(scope='module')
 32 | def pd_df():
 33 |     df = pd.DataFrame(
 34 |         {
 35 |             DEFAULT_USER_COL: [1, 1, 1, 2, 2, 2],
 36 |             DEFAULT_ITEM_COL: [1, 2, 3, 1, 4, 5],
 37 |             ITEM_FEAT_COL: [[1, 1, 1], [2, 2, 2], [3, 3, 3], [1, 1, 1], [4, 4, 4], [5, 5, 5]],
 38 |             DEFAULT_RATING_COL: [5, 4, 3, 5, 5, 3],
 39 |         }
 40 |     )
 41 |     users = df.drop_duplicates(DEFAULT_USER_COL)[DEFAULT_USER_COL].values
 42 |     items = df.drop_duplicates(DEFAULT_ITEM_COL)[DEFAULT_ITEM_COL].values
 43 |     return df, users, items
 44 | 
 45 | 
 46 | @pytest.mark.gpu
 47 | def test_pandas_input_fn(pd_df):
 48 |     df, _, _ = pd_df
 49 | 
 50 |     input_fn = pandas_input_fn(df)
 51 |     sample = input_fn()
 52 | 
 53 |     # check the input function returns all the columns
 54 |     assert len(df.columns) == len(sample)
 55 |     for k, v in sample.items():
 56 |         assert k in df.columns.values
 57 |         # check if a list feature column converted correctly
 58 |         if len(v.shape) == 2:
 59 |             assert v.shape[1] == len(df[k][0])
 60 | 
 61 |     input_fn_with_label = pandas_input_fn(df, y_col=DEFAULT_RATING_COL)
 62 |     X, y = input_fn_with_label()
 63 |     features = df.copy()
 64 |     features.pop(DEFAULT_RATING_COL)
 65 |     assert len(X) == len(features.columns)
 66 | 
 67 | 
 68 | @pytest.mark.gpu
 69 | def test_build_optimizer():
 70 |     adadelta = build_optimizer('Adadelta')
 71 |     assert isinstance(adadelta, tf.train.AdadeltaOptimizer)
 72 | 
 73 |     adagrad = build_optimizer('Adagrad')
 74 |     assert isinstance(adagrad, tf.train.AdagradOptimizer)
 75 | 
 76 |     adam = build_optimizer('Adam')
 77 |     assert isinstance(adam, tf.train.AdamOptimizer)
 78 | 
 79 |     ftrl = build_optimizer('Ftrl', **{'l1_regularization_strength': 0.001})
 80 |     assert isinstance(ftrl,  tf.train.FtrlOptimizer)
 81 | 
 82 |     momentum = build_optimizer('Momentum', **{'momentum': 0.5})
 83 |     assert isinstance(momentum, tf.train.MomentumOptimizer)
 84 | 
 85 |     rmsprop = build_optimizer('RMSProp')
 86 |     assert isinstance(rmsprop, tf.train.RMSPropOptimizer)
 87 | 
 88 |     sgd = build_optimizer('SGD')
 89 |     assert isinstance(sgd, tf.train.GradientDescentOptimizer)
 90 | 
 91 | 
 92 | @pytest.mark.gpu
 93 | def test_evaluation_log_hook(pd_df):
 94 |     data, users, items = pd_df
 95 | 
 96 |     # Run hook 10 times
 97 |     hook_frequency = 10
 98 |     train_steps = 101
 99 | 
100 |     _, deep_columns = build_feature_columns(users, items, model_type='deep')
101 | 
102 |     model = build_model(
103 |         'deep_'+MODEL_DIR, deep_columns=deep_columns, save_checkpoints_steps=train_steps//hook_frequency
104 |     )
105 | 
106 |     class EvaluationLogger(Logger):
107 |         def __init__(self):
108 |             self.eval_log = {}
109 | 
110 |         def log(self, metric, value):
111 |             if metric not in self.eval_log:
112 |                 self.eval_log[metric] = []
113 |             self.eval_log[metric].append(value)
114 | 
115 |         def get_log(self):
116 |             return self.eval_log
117 | 
118 |     evaluation_logger = EvaluationLogger()
119 | 
120 |     hooks = [
121 |         evaluation_log_hook(
122 |             model,
123 |             logger=evaluation_logger,
124 |             true_df=data,
125 |             y_col=DEFAULT_RATING_COL,
126 |             eval_df=data.drop(DEFAULT_RATING_COL, axis=1),
127 |             every_n_iter=train_steps//hook_frequency,
128 |             model_dir='deep_'+MODEL_DIR,
129 |             eval_fns=[rmse],
130 |         )
131 |     ]
132 |     model.train(
133 |         input_fn=pandas_input_fn(df=data, y_col=DEFAULT_RATING_COL, batch_size=1, num_epochs=None, shuffle=True),
134 |         hooks=hooks,
135 |         steps=train_steps
136 |     )
137 |     shutil.rmtree('deep_' + MODEL_DIR, ignore_errors=True)
138 | 
139 |     # Check if hook logged the given metric
140 |     assert rmse.__name__ in evaluation_logger.get_log()
141 |     assert len(evaluation_logger.get_log()[rmse.__name__]) == hook_frequency
142 | 


--------------------------------------------------------------------------------
/tests/unit/test_rbm.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Microsoft Corporation. All rights reserved.
  2 | # Licensed under the MIT License.
  3 | 
  4 | import pytest
  5 | import numpy as np
  6 | from reco_utils.recommender.rbm.rbm import RBM
  7 | from tests.rbm_common import test_specs, affinity_matrix
  8 | 
  9 | 
 10 | @pytest.fixture(scope="module")
 11 | def init_rbm():
 12 |     return {
 13 |         "n_hidden": 100,
 14 |         "epochs": 10,
 15 |         "minibatch": 50,
 16 |         "keep_prob": 0.8,
 17 |         "learning_rate": 0.002,
 18 |         "init_stdv": 0.01,
 19 |         "sampling_protocol": [30, 50, 80, 90, 100],
 20 |         "display": 20,
 21 |     }
 22 | 
 23 | 
 24 | @pytest.mark.gpu
 25 | def test_class_init(init_rbm):
 26 |     model = RBM(
 27 |         hidden_units=init_rbm["n_hidden"],
 28 |         training_epoch=init_rbm["epochs"],
 29 |         minibatch_size=init_rbm["minibatch"],
 30 |         keep_prob=init_rbm["keep_prob"],
 31 |         learning_rate=init_rbm["learning_rate"],
 32 |         init_stdv=init_rbm["init_stdv"],
 33 |         sampling_protocol=init_rbm["sampling_protocol"],
 34 |         display_epoch=init_rbm["display"],
 35 |     )
 36 | 
 37 |     # number of hidden units
 38 |     assert model.Nhidden == init_rbm["n_hidden"]
 39 |     # number of training epochs
 40 |     assert model.epochs == init_rbm["epochs"] + 1
 41 |     # minibatch size
 42 |     assert model.minibatch == init_rbm["minibatch"]
 43 |     # keep probability for dropout regulrization
 44 |     assert model.keep == init_rbm["keep_prob"]
 45 |     # learning rate
 46 |     assert model.learning_rate == init_rbm["learning_rate"]
 47 |     # standard deviation used to initialize the weight matrix from a normal distribution
 48 |     assert model.stdv == init_rbm["init_stdv"]
 49 |     # sampling protocol used to increase the number of steps in Gibbs sampling
 50 |     assert model.sampling_protocol == init_rbm["sampling_protocol"]
 51 |     # number of epochs after which the rmse is displayed
 52 |     assert model.display == init_rbm["display"]
 53 | 
 54 | 
 55 | @pytest.mark.gpu
 56 | def test_train_param_init(init_rbm, affinity_matrix):
 57 |     # obtain the train/test set matrices
 58 |     Xtr, Xtst = affinity_matrix
 59 | 
 60 |     # initialize the model
 61 |     model = RBM(
 62 |         hidden_units=init_rbm["n_hidden"],
 63 |         training_epoch=init_rbm["epochs"],
 64 |         minibatch_size=init_rbm["minibatch"],
 65 |     )
 66 |     # fit the model to the data
 67 |     model.fit(Xtr, Xtst)
 68 | 
 69 |     # visible units placeholder (tensor)
 70 |     model.vu.shape[1] == Xtr.shape[1]
 71 |     # weight matrix
 72 |     assert model.w.shape == [Xtr.shape[1], init_rbm["n_hidden"]]
 73 |     # bias, visible units
 74 |     assert model.bv.shape == [1, Xtr.shape[1]]
 75 |     # bias, hidden units
 76 |     assert model.bh.shape == [1, init_rbm["n_hidden"]]
 77 | 
 78 | 
 79 | @pytest.mark.gpu
 80 | def test_sampling_funct(init_rbm, affinity_matrix):
 81 |     # obtain the train/test set matrices
 82 |     Xtr, Xtst = affinity_matrix
 83 | 
 84 |     # initialize the model
 85 |     model = RBM(
 86 |         hidden_units=init_rbm["n_hidden"],
 87 |         training_epoch=init_rbm["epochs"],
 88 |         minibatch_size=init_rbm["minibatch"],
 89 |     )
 90 | 
 91 |     def check_sampled_values(sampled, s):
 92 |         """
 93 |         Check if the elements of the sampled units are in {0,s}
 94 |         """
 95 |         a = []
 96 | 
 97 |         for i in range(0, s + 1):
 98 |             l = sampled == i
 99 |             a.append(l)
100 | 
101 |         return sum(a)
102 | 
103 |     r = Xtr.max()  # obtain the rating scale
104 | 
105 |     # fit the model to the data
106 |     model.fit(Xtr, Xtst)
107 | 
108 |     # evaluate the activation probabilities of the hidden units and their sampled values
109 |     phv, h = model.sess.run(model.sample_hidden_units(model.v))
110 | 
111 |     # check the dimensions of the two matrices
112 |     assert phv.shape == (Xtr.shape[0], 100)
113 |     assert h.shape == (Xtr.shape[0], 100)
114 | 
115 |     # check that the activation probabilities are in [0,1]
116 |     assert (phv <= 1).all() & (phv >= 0).all()
117 | 
118 |     # check that the sampled value of the hidden units is either 1 or 0
119 |     assert check_sampled_values(h, 1).all()
120 | 
121 |     # evaluate the activation probabilities of the visible units and their sampled values
122 |     pvh, v_sampled = model.sess.run(model.sample_visible_units(h))
123 | 
124 |     assert pvh.shape == (Xtr.shape[0], Xtr.shape[1], r)
125 |     assert v_sampled.shape == Xtr.shape
126 | 
127 |     # check that the multinomial distribution is normalized over the r classes for all users/items
128 |     assert np.sum(pvh, axis=2) == pytest.approx(np.ones(Xtr.shape))
129 | 
130 |     # check that the sampled values of the visible units is in [0,r]
131 |     assert check_sampled_values(v_sampled, r).all()
132 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/azureml/wide_deep.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Microsoft Corporation. All rights reserved.
  2 | # Licensed under the MIT License.
  3 | """
  4 | AzureML Hyperdrive entry script for wide-deep model
  5 | """
  6 | import argparse
  7 | import os
  8 | import shutil
  9 | 
 10 | import papermill as pm
 11 | import tensorflow as tf
 12 | print("TensorFlow version:", tf.VERSION)
 13 | 
 14 | try:
 15 |     from azureml.core import Run
 16 |     run = Run.get_context()
 17 | except ImportError:
 18 |     run = None
 19 | 
 20 | from reco_utils.common.constants import (
 21 |     DEFAULT_USER_COL,
 22 |     DEFAULT_ITEM_COL,
 23 |     DEFAULT_RATING_COL
 24 | )
 25 | 
 26 | 
 27 | NOTEBOOK_NAME = os.path.join(
 28 |     "notebooks",
 29 |     "00_quick_start",
 30 |     "wide_deep_movielens.ipynb"
 31 | )
 32 | OUTPUT_NOTEBOOK = "wide_deep.ipynb"
 33 | 
 34 | 
 35 | def _log(metric, value):
 36 |     if run is not None:
 37 |         if isinstance(value, list) and len(value) > 0 and isinstance(value[0], (int, float))
 38 |             run.log_list(metric, value)
 39 |         else:
 40 |             run.log(metric, value)
 41 |     print(metric, "=", value)
 42 | 
 43 | 
 44 | # Parse arguments passed by Hyperdrive
 45 | parser = argparse.ArgumentParser()
 46 | 
 47 | parser.add_argument('--top-k', type=int, dest='TOP_K', help="Top k recommendation", default=10)
 48 | # Data path
 49 | parser.add_argument('--datastore', type=str, dest='DATA_DIR', help="Datastore path")
 50 | parser.add_argument('--train-datapath', type=str, dest='TRAIN_PICKLE_PATH')
 51 | parser.add_argument('--test-datapath', type=str, dest='TEST_PICKLE_PATH')
 52 | parser.add_argument('--model-dir', type=str, dest='MODEL_DIR', default='model_checkpoints')
 53 | # Data column names
 54 | parser.add_argument('--user-col', type=str, dest='USER_COL', default=DEFAULT_USER_COL)
 55 | parser.add_argument('--item-col', type=str, dest='ITEM_COL', default=DEFAULT_ITEM_COL)
 56 | parser.add_argument('--rating-col', type=str, dest='RATING_COL', default=DEFAULT_RATING_COL)
 57 | parser.add_argument('--item-feat-col', type=str, dest='ITEM_FEAT_COL')  # Optional
 58 | parser.add_argument('--ranking-metrics', type=str, nargs='*', dest='RANKING_METRICS', default=['ndcg_at_k'])
 59 | parser.add_argument('--rating-metrics', type=str, nargs='*', dest='RATING_METRICS', default=['rmse'])
 60 | # Model type: either 'wide', 'deep', or 'wide_deep'
 61 | parser.add_argument('--model-type', type=str, dest='MODEL_TYPE', default='wide_deep')
 62 | # Wide model params
 63 | parser.add_argument('--linear-optimizer', type=str, dest='LINEAR_OPTIMIZER', default='Ftrl')
 64 | parser.add_argument('--linear-optimizer-lr', type=float, dest='LINEAR_OPTIMIZER_LR', default=0.01)
 65 | parser.add_argument('--linear-l1-reg', type=float, dest='LINEAR_L1_REG', default=0.0)
 66 | parser.add_argument('--linear-momentum', type=float, dest='LINEAR_MOMENTUM', default=0.9)
 67 | # Deep model params
 68 | parser.add_argument('--dnn-optimizer', type=str, dest='DNN_OPTIMIZER', default='Adagrad')
 69 | parser.add_argument('--dnn-optimizer-lr', type=float, dest='DNN_OPTIMIZER_LR', default=0.01)
 70 | parser.add_argument('--dnn-l1-reg', type=float, dest='DNN_L1_REG', default=0.0)
 71 | parser.add_argument('--dnn-momentum', type=float, dest='DNN_MOMENTUM', default=0.9)
 72 | parser.add_argument('--dnn-hidden-layer-1', type=int, dest='DNN_HIDDEN_LAYER_1', default=0)
 73 | parser.add_argument('--dnn-hidden-layer-2', type=int, dest='DNN_HIDDEN_LAYER_2', default=0)
 74 | parser.add_argument('--dnn-hidden-layer-3', type=int, dest='DNN_HIDDEN_LAYER_3', default=128)
 75 | parser.add_argument('--dnn-hidden-layer-4', type=int, dest='DNN_HIDDEN_LAYER_4', default=128)
 76 | parser.add_argument('--dnn-user-embedding-dim', type=int, dest='DNN_USER_DIM', default=8)
 77 | parser.add_argument('--dnn-item-embedding-dim', type=int, dest='DNN_ITEM_DIM', default=8)
 78 | parser.add_argument('--dnn-batch-norm', type=int, dest='DNN_BATCH_NORM', default=1)
 79 | parser.add_argument('--dnn-dropout', type=float, dest='DNN_DROPOUT', default=0.0)
 80 | # Training parameters
 81 | parser.add_argument('--epochs', type=int, dest='EPOCHS', default=50)
 82 | parser.add_argument('--batch-size', type=int, dest='BATCH_SIZE', default=128)
 83 | parser.add_argument('--evaluate-while-training', dest='EVALUATE_WHILE_TRAINING', action='store_true')
 84 | 
 85 | 
 86 | args = parser.parse_args()
 87 | 
 88 | params = vars(args)
 89 | 
 90 | if params['TOP_K'] <= 0:
 91 |     raise ValueError("Top K should be larger than 0")
 92 | 
 93 | if params['MODEL_TYPE'] not in {'wide', 'deep', 'wide_deep'}:
 94 |     raise ValueError("Model type should be either 'wide', 'deep', or 'wide_deep'")
 95 | 
 96 | if params['DATA_DIR'] is None:
 97 |     raise ValueError("Datastore path should be given")
 98 | 
 99 | print("Args:")
100 | for k, v in params.items():
101 |     _log(k, v)
102 | 
103 | 
104 | print("Run", NOTEBOOK_NAME)
105 | 
106 | pm.execute_notebook(
107 |     NOTEBOOK_NAME,
108 |     OUTPUT_NOTEBOOK,
109 |     parameters=params,
110 |     kernel_name='python3'
111 | )
112 | nb = pm.read_notebook(OUTPUT_NOTEBOOK)
113 | 
114 | for m, v in nb.data.items():
115 |     _log(m, v)
116 | 
117 | # clean-up
118 | os.remove(OUTPUT_NOTEBOOK)
119 | shutil.rmtree(params['MODEL_DIR'], ignore_errors=True)
120 | 


--------------------------------------------------------------------------------
/reco_utils/azureml/wide_deep.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Microsoft Corporation. All rights reserved.
  2 | # Licensed under the MIT License.
  3 | """
  4 | AzureML Hyperdrive entry script for wide-deep model
  5 | """
  6 | import argparse
  7 | import os
  8 | import shutil
  9 | 
 10 | import papermill as pm
 11 | import tensorflow as tf
 12 | print("TensorFlow version:", tf.VERSION)
 13 | 
 14 | try:
 15 |     from azureml.core import Run
 16 |     run = Run.get_context()
 17 | except ImportError:
 18 |     run = None
 19 | 
 20 | from reco_utils.common.constants import (
 21 |     DEFAULT_USER_COL,
 22 |     DEFAULT_ITEM_COL,
 23 |     DEFAULT_RATING_COL
 24 | )
 25 | 
 26 | 
 27 | NOTEBOOK_NAME = os.path.join(
 28 |     "notebooks",
 29 |     "00_quick_start",
 30 |     "wide_deep_movielens.ipynb"
 31 | )
 32 | OUTPUT_NOTEBOOK = "wide_deep.ipynb"
 33 | 
 34 | 
 35 | def _log(metric, value):
 36 |     """AzureML log wrapper.
 37 | 
 38 |     Record list of int or float as a list metrics so that we can plot it from AzureML workspace portal.
 39 |     Otherwise, record as a single value of the metric.
 40 |     """
 41 |     if run is not None:
 42 |         if isinstance(value, list) and len(value) > 0 and isinstance(value[0], (int, float)):
 43 |             run.log_list(metric, value)
 44 |         else:
 45 |             # Force cast to str since run.log will raise an error if the value is iterable.
 46 |             run.log(metric, str(value))
 47 |     print(metric, "=", value)
 48 | 
 49 | 
 50 | # Parse arguments passed by Hyperdrive
 51 | parser = argparse.ArgumentParser()
 52 | 
 53 | parser.add_argument('--top-k', type=int, dest='TOP_K', help="Top k recommendation", default=10)
 54 | # Data path
 55 | parser.add_argument('--datastore', type=str, dest='DATA_DIR', help="Datastore path")
 56 | parser.add_argument('--train-datapath', type=str, dest='TRAIN_PICKLE_PATH')
 57 | parser.add_argument('--test-datapath', type=str, dest='TEST_PICKLE_PATH')
 58 | parser.add_argument('--model-dir', type=str, dest='MODEL_DIR', default='model_checkpoints')
 59 | # Data column names
 60 | parser.add_argument('--user-col', type=str, dest='USER_COL', default=DEFAULT_USER_COL)
 61 | parser.add_argument('--item-col', type=str, dest='ITEM_COL', default=DEFAULT_ITEM_COL)
 62 | parser.add_argument('--rating-col', type=str, dest='RATING_COL', default=DEFAULT_RATING_COL)
 63 | parser.add_argument('--item-feat-col', type=str, dest='ITEM_FEAT_COL')  # Optional
 64 | parser.add_argument('--ranking-metrics', type=str, nargs='*', dest='RANKING_METRICS', default=['ndcg_at_k'])
 65 | parser.add_argument('--rating-metrics', type=str, nargs='*', dest='RATING_METRICS', default=['rmse'])
 66 | # Model type: either 'wide', 'deep', or 'wide_deep'
 67 | parser.add_argument('--model-type', type=str, dest='MODEL_TYPE', default='wide_deep')
 68 | # Wide model params
 69 | parser.add_argument('--linear-optimizer', type=str, dest='LINEAR_OPTIMIZER', default='Ftrl')
 70 | parser.add_argument('--linear-optimizer-lr', type=float, dest='LINEAR_OPTIMIZER_LR', default=0.01)
 71 | parser.add_argument('--linear-l1-reg', type=float, dest='LINEAR_L1_REG', default=0.0)
 72 | parser.add_argument('--linear-momentum', type=float, dest='LINEAR_MOMENTUM', default=0.9)
 73 | # Deep model params
 74 | parser.add_argument('--dnn-optimizer', type=str, dest='DNN_OPTIMIZER', default='Adagrad')
 75 | parser.add_argument('--dnn-optimizer-lr', type=float, dest='DNN_OPTIMIZER_LR', default=0.01)
 76 | parser.add_argument('--dnn-l1-reg', type=float, dest='DNN_L1_REG', default=0.0)
 77 | parser.add_argument('--dnn-momentum', type=float, dest='DNN_MOMENTUM', default=0.9)
 78 | parser.add_argument('--dnn-hidden-layer-1', type=int, dest='DNN_HIDDEN_LAYER_1', default=0)
 79 | parser.add_argument('--dnn-hidden-layer-2', type=int, dest='DNN_HIDDEN_LAYER_2', default=0)
 80 | parser.add_argument('--dnn-hidden-layer-3', type=int, dest='DNN_HIDDEN_LAYER_3', default=128)
 81 | parser.add_argument('--dnn-hidden-layer-4', type=int, dest='DNN_HIDDEN_LAYER_4', default=128)
 82 | parser.add_argument('--dnn-user-embedding-dim', type=int, dest='DNN_USER_DIM', default=8)
 83 | parser.add_argument('--dnn-item-embedding-dim', type=int, dest='DNN_ITEM_DIM', default=8)
 84 | parser.add_argument('--dnn-batch-norm', type=int, dest='DNN_BATCH_NORM', default=1)
 85 | parser.add_argument('--dnn-dropout', type=float, dest='DNN_DROPOUT', default=0.0)
 86 | # Training parameters
 87 | parser.add_argument('--epochs', type=int, dest='EPOCHS', default=50)
 88 | parser.add_argument('--batch-size', type=int, dest='BATCH_SIZE', default=128)
 89 | parser.add_argument('--evaluate-while-training', dest='EVALUATE_WHILE_TRAINING', action='store_true')
 90 | 
 91 | 
 92 | args = parser.parse_args()
 93 | 
 94 | params = vars(args)
 95 | 
 96 | if params['TOP_K'] <= 0:
 97 |     raise ValueError("Top K should be larger than 0")
 98 | 
 99 | if params['MODEL_TYPE'] not in {'wide', 'deep', 'wide_deep'}:
100 |     raise ValueError("Model type should be either 'wide', 'deep', or 'wide_deep'")
101 | 
102 | if params['DATA_DIR'] is None:
103 |     raise ValueError("Datastore path should be given")
104 | 
105 | print("Args:")
106 | for k, v in params.items():
107 |     _log(k, v)
108 | 
109 | 
110 | print("Run", NOTEBOOK_NAME)
111 | 
112 | pm.execute_notebook(
113 |     NOTEBOOK_NAME,
114 |     OUTPUT_NOTEBOOK,
115 |     parameters=params,
116 |     kernel_name='python3'
117 | )
118 | nb = pm.read_notebook(OUTPUT_NOTEBOOK)
119 | 
120 | for m, v in nb.data.items():
121 |     _log(m, v)
122 | 
123 | # clean-up
124 | os.remove(OUTPUT_NOTEBOOK)
125 | shutil.rmtree(params['MODEL_DIR'], ignore_errors=True)
126 | 


--------------------------------------------------------------------------------
/reco_utils/azureml/svd_training.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Microsoft Corporation. All rights reserved.
  2 | # Licensed under the MIT License.
  3 | import sys
  4 | 
  5 | sys.path.append("../../")
  6 | 
  7 | import argparse
  8 | import os
  9 | import pandas as pd
 10 | import surprise
 11 | 
 12 | try:
 13 |     from azureml.core import Run
 14 | 
 15 |     HAS_AML = True
 16 |     run = Run.get_context()
 17 | except ModuleNotFoundError:
 18 |     HAS_AML = False
 19 | 
 20 | from reco_utils.evaluation.python_evaluation import *
 21 | from reco_utils.recommender.surprise.surprise_utils import compute_rating_predictions, compute_ranking_predictions
 22 | 
 23 | 
 24 | def svd_training(args):
 25 |     """
 26 |     Train Surprise SVD using the given hyper-parameters
 27 |     """
 28 |     print("Start training...")
 29 |     train_data = pd.read_pickle(path=os.path.join(args.datastore, args.train_datapath))
 30 |     validation_data = pd.read_pickle(path=os.path.join(args.datastore, args.validation_datapath))
 31 | 
 32 |     svd = surprise.SVD(random_state=args.random_state, n_epochs=args.epochs, verbose=args.verbose, biased=args.biased,
 33 |                        n_factors=args.n_factors, init_mean=args.init_mean, init_std_dev=args.init_std_dev,
 34 |                        lr_all=args.lr_all, reg_all=args.reg_all, lr_bu=args.lr_bu, lr_bi=args.lr_bi, lr_pu=args.lr_pu,
 35 |                        lr_qi=args.lr_qi, reg_bu=args.reg_bu, reg_bi=args.reg_bi, reg_pu=args.reg_pu,
 36 |                        reg_qi=args.reg_qi)
 37 | 
 38 |     train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(args.surprise_reader)) \
 39 |         .build_full_trainset()
 40 |     svd.fit(train_set)
 41 | 
 42 |     print("Evaluating...")
 43 | 
 44 |     rating_metrics = args.rating_metrics
 45 |     if len(rating_metrics) > 0:
 46 |         predictions = compute_rating_predictions(svd, validation_data, usercol=args.usercol, itemcol=args.itemcol)
 47 |         for metric in rating_metrics:
 48 |             result = eval(metric)(validation_data, predictions)
 49 |             print(metric, result)
 50 |             if HAS_AML:
 51 |                 run.log(metric, result)
 52 | 
 53 |     ranking_metrics = args.ranking_metrics
 54 |     if len(ranking_metrics) > 0:
 55 |         all_predictions = compute_ranking_predictions(svd, train_data, usercol=args.usercol, itemcol=args.itemcol,
 56 |                                                   recommend_seen=args.recommend_seen)
 57 |         k = args.k
 58 |         for metric in ranking_metrics:
 59 |             result = eval(metric)(validation_data, all_predictions, col_prediction='prediction', k=k)
 60 |             print("{}@{}".format(metric, k), result)
 61 |             if HAS_AML:
 62 |                 run.log(metric, result)
 63 | 
 64 |     if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
 65 |         raise ValueError("No metrics were specified.")
 66 | 
 67 |     return svd
 68 | 
 69 | 
 70 | def main():
 71 |     parser = argparse.ArgumentParser()
 72 |     # Data path
 73 |     parser.add_argument('--datastore', type=str, dest='datastore', help="Datastore path")
 74 |     parser.add_argument('--train-datapath', type=str, dest='train_datapath')
 75 |     parser.add_argument('--validation-datapath', type=str, dest='validation_datapath')
 76 |     parser.add_argument('--output_dir', type=str, help='output directory')
 77 |     parser.add_argument('--surprise-reader', type=str, dest='surprise_reader')
 78 |     parser.add_argument('--usercol', type=str, dest='usercol', default='userID')
 79 |     parser.add_argument('--itemcol', type=str, dest='itemcol', default='itemID')
 80 |     # Metrics
 81 |     parser.add_argument('--rating-metrics', type=str, nargs='*', dest='rating_metrics', default=[])
 82 |     parser.add_argument('--ranking-metrics', type=str, nargs='*', dest='ranking_metrics', default=[])
 83 |     parser.add_argument('--k', type=int, dest='k', default=None)
 84 |     parser.add_argument('--recommend-seen', dest='recommend_seen', action='store_true')
 85 |     # Training parameters
 86 |     parser.add_argument('--random-state', type=int, dest='random_state', default=0)
 87 |     parser.add_argument('--verbose', dest='verbose', action='store_true')
 88 |     parser.add_argument('--epochs', type=int, dest='epochs', default=30)
 89 |     parser.add_argument('--biased', dest='biased', action='store_true')
 90 |     # Hyperparameters to be tuned
 91 |     parser.add_argument('--n_factors', type=int, dest='n_factors', default=100)
 92 |     parser.add_argument('--init_mean', type=float, dest='init_mean', default=0.0)
 93 |     parser.add_argument('--init_std_dev', type=float, dest='init_std_dev', default=0.1)
 94 |     parser.add_argument('--lr_all', type=float, dest='lr_all', default=0.005)
 95 |     parser.add_argument('--reg_all', type=float, dest='reg_all', default=0.02)
 96 |     parser.add_argument('--lr_bu', type=float, dest='lr_bu', default=None)
 97 |     parser.add_argument('--lr_bi', type=float, dest='lr_bi', default=None)
 98 |     parser.add_argument('--lr_pu', type=float, dest='lr_pu', default=None)
 99 |     parser.add_argument('--lr_qi', type=float, dest='lr_qi', default=None)
100 |     parser.add_argument('--reg_bu', type=float, dest='reg_bu', default=None)
101 |     parser.add_argument('--reg_bi', type=float, dest='reg_bi', default=None)
102 |     parser.add_argument('--reg_pu', type=float, dest='reg_pu', default=None)
103 |     parser.add_argument('--reg_qi', type=float, dest='reg_qi', default=None)
104 | 
105 |     args = parser.parse_args()
106 | 
107 |     print("Args:", str(vars(args)), sep='\n')
108 | 
109 |     if HAS_AML:
110 |         run.log('Number of epochs', args.epochs)
111 | 
112 |     svd = svd_training(args)
113 |     # Save SVD model to the output directory for later use
114 |     os.makedirs(args.output_dir, exist_ok=True)
115 |     surprise.dump.dump(os.path.join(args.output_dir, 'model.dump'), algo=svd)
116 | 
117 | 
118 | if __name__ == "__main__":
119 |     main()
120 | 


--------------------------------------------------------------------------------
/notebooks/reco_utils/azureml/svd_training.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Microsoft Corporation. All rights reserved.
  2 | # Licensed under the MIT License.
  3 | import sys
  4 | 
  5 | sys.path.append("../../")
  6 | 
  7 | import argparse
  8 | import os
  9 | import pandas as pd
 10 | import surprise
 11 | 
 12 | try:
 13 |     from azureml.core import Run
 14 | 
 15 |     HAS_AML = True
 16 |     run = Run.get_context()
 17 | except ModuleNotFoundError:
 18 |     HAS_AML = False
 19 | 
 20 | from reco_utils.evaluation.python_evaluation import *
 21 | from reco_utils.recommender.surprise.surprise_utils import compute_rating_predictions, compute_ranking_predictions
 22 | 
 23 | 
 24 | def svd_training(args):
 25 |     """
 26 |     Train Surprise SVD using the given hyper-parameters
 27 |     """
 28 |     print("Start training...")
 29 |     train_data = pd.read_pickle(path=os.path.join(args.datastore, args.train_datapath))
 30 |     validation_data = pd.read_pickle(path=os.path.join(args.datastore, args.validation_datapath))
 31 | 
 32 |     svd = surprise.SVD(random_state=args.random_state, n_epochs=args.epochs, verbose=args.verbose, biased=args.biased,
 33 |                        n_factors=args.n_factors, init_mean=args.init_mean, init_std_dev=args.init_std_dev,
 34 |                        lr_all=args.lr_all, reg_all=args.reg_all, lr_bu=args.lr_bu, lr_bi=args.lr_bi, lr_pu=args.lr_pu,
 35 |                        lr_qi=args.lr_qi, reg_bu=args.reg_bu, reg_bi=args.reg_bi, reg_pu=args.reg_pu,
 36 |                        reg_qi=args.reg_qi)
 37 | 
 38 |     train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(args.surprise_reader)) \
 39 |         .build_full_trainset()
 40 |     svd.fit(train_set)
 41 | 
 42 |     print("Evaluating...")
 43 | 
 44 |     rating_metrics = args.rating_metrics
 45 |     if len(rating_metrics) > 0:
 46 |         predictions = compute_rating_predictions(svd, validation_data, usercol=args.usercol, itemcol=args.itemcol)
 47 |         for metric in rating_metrics:
 48 |             result = eval(metric)(validation_data, predictions)
 49 |             print(metric, result)
 50 |             if HAS_AML:
 51 |                 run.log(metric, result)
 52 | 
 53 |     ranking_metrics = args.ranking_metrics
 54 |     if len(ranking_metrics) > 0:
 55 |         all_predictions = compute_ranking_predictions(svd, train_data, usercol=args.usercol, itemcol=args.itemcol,
 56 |                                                   recommend_seen=args.recommend_seen)
 57 |         k = args.k
 58 |         for metric in ranking_metrics:
 59 |             result = eval(metric)(validation_data, all_predictions, col_prediction='prediction', k=k)
 60 |             print("{}@{}".format(metric, k), result)
 61 |             if HAS_AML:
 62 |                 run.log(metric, result)
 63 | 
 64 |     if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
 65 |         raise ValueError("No metrics were specified.")
 66 | 
 67 |     return svd
 68 | 
 69 | 
 70 | def main():
 71 |     parser = argparse.ArgumentParser()
 72 |     # Data path
 73 |     parser.add_argument('--datastore', type=str, dest='datastore', help="Datastore path")
 74 |     parser.add_argument('--train-datapath', type=str, dest='train_datapath')
 75 |     parser.add_argument('--validation-datapath', type=str, dest='validation_datapath')
 76 |     parser.add_argument('--output_dir', type=str, help='output directory')
 77 |     parser.add_argument('--surprise-reader', type=str, dest='surprise_reader')
 78 |     parser.add_argument('--usercol', type=str, dest='usercol', default='userID')
 79 |     parser.add_argument('--itemcol', type=str, dest='itemcol', default='itemID')
 80 |     # Metrics
 81 |     parser.add_argument('--rating-metrics', type=str, nargs='*', dest='rating_metrics', default=[])
 82 |     parser.add_argument('--ranking-metrics', type=str, nargs='*', dest='ranking_metrics', default=[])
 83 |     parser.add_argument('--k', type=int, dest='k', default=None)
 84 |     parser.add_argument('--recommend-seen', dest='recommend_seen', action='store_true')
 85 |     # Training parameters
 86 |     parser.add_argument('--random-state', type=int, dest='random_state', default=0)
 87 |     parser.add_argument('--verbose', dest='verbose', action='store_true')
 88 |     parser.add_argument('--epochs', type=int, dest='epochs', default=30)
 89 |     parser.add_argument('--biased', dest='biased', action='store_true')
 90 |     # Hyperparameters to be tuned
 91 |     parser.add_argument('--n_factors', type=int, dest='n_factors', default=100)
 92 |     parser.add_argument('--init_mean', type=float, dest='init_mean', default=0.0)
 93 |     parser.add_argument('--init_std_dev', type=float, dest='init_std_dev', default=0.1)
 94 |     parser.add_argument('--lr_all', type=float, dest='lr_all', default=0.005)
 95 |     parser.add_argument('--reg_all', type=float, dest='reg_all', default=0.02)
 96 |     parser.add_argument('--lr_bu', type=float, dest='lr_bu', default=None)
 97 |     parser.add_argument('--lr_bi', type=float, dest='lr_bi', default=None)
 98 |     parser.add_argument('--lr_pu', type=float, dest='lr_pu', default=None)
 99 |     parser.add_argument('--lr_qi', type=float, dest='lr_qi', default=None)
100 |     parser.add_argument('--reg_bu', type=float, dest='reg_bu', default=None)
101 |     parser.add_argument('--reg_bi', type=float, dest='reg_bi', default=None)
102 |     parser.add_argument('--reg_pu', type=float, dest='reg_pu', default=None)
103 |     parser.add_argument('--reg_qi', type=float, dest='reg_qi', default=None)
104 | 
105 |     args = parser.parse_args()
106 | 
107 |     print("Args:", str(vars(args)), sep='\n')
108 | 
109 |     if HAS_AML:
110 |         run.log('Number of epochs', args.epochs)
111 | 
112 |     svd = svd_training(args)
113 |     # Save SVD model to the output directory for later use
114 |     os.makedirs(args.output_dir, exist_ok=True)
115 |     surprise.dump.dump(os.path.join(args.output_dir, 'model.dump'), algo=svd)
116 | 
117 | 
118 | if __name__ == "__main__":
119 |     main()
120 | 


--------------------------------------------------------------------------------
/scripts/repo_metrics/track_metrics.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Microsoft Corporation. All rights reserved.
  2 | # Licensed under the MIT License.
  3 | 
  4 | import sys
  5 | import os
  6 | 
  7 | # Need to append a full path instead of relative path.
  8 | # This seems to be an issue from Azure DevOps command line task.
  9 | # NOTE this does not affect running directly in the shell.
 10 | sys.path.append(os.getcwd())
 11 | import argparse
 12 | import traceback
 13 | import logging
 14 | from dateutil.parser import isoparse
 15 | from pymongo import MongoClient
 16 | from datetime import datetime
 17 | from scripts.repo_metrics.git_stats import Github
 18 | from scripts.repo_metrics.config import (
 19 |     GITHUB_TOKEN,
 20 |     CONNECTION_STRING,
 21 |     DATABASE,
 22 |     COLLECTION_GITHUB_STATS,
 23 |     COLLECTION_EVENTS,
 24 | )
 25 | 
 26 | format_str = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)s]: %(message)s"
 27 | format_time = "%Y-%m-%d %H:%M:%S"
 28 | logging.basicConfig(level=logging.INFO, format=format_str, datefmt=format_time)
 29 | log = logging.getLogger()
 30 | 
 31 | 
 32 | def parse_args():
 33 |     """Argument parser.
 34 |     Returns:
 35 |         obj: Parser.
 36 |     """
 37 |     parser = argparse.ArgumentParser(
 38 |         description="Metrics Tracker",
 39 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 40 |     )
 41 |     parser.add_argument("--github_repo", type=str, help="GitHub repository")
 42 |     parser.add_argument(
 43 |         "--event",
 44 |         type=str,
 45 |         help="Input a general event that can be saved to the database",
 46 |     )
 47 |     parser.add_argument(
 48 |         "--save_to_database",
 49 |         action="store_true",
 50 |         help="Whether or not to save the information to the database",
 51 |     )
 52 |     parser.add_argument(
 53 |         "--event_date",
 54 |         default=datetime.now().isoformat(),
 55 |         type=isoparse,
 56 |         help="Date for an event (format: YYYY-MM-DD)",
 57 |     )
 58 |     return parser.parse_args()
 59 | 
 60 | 
 61 | def connect(uri="mongodb://localhost"):
 62 |     """Mongo connector.
 63 |     Args:
 64 |         uri (str): Connection string.
 65 |     Returns:
 66 |         obj: Mongo client.
 67 |     """
 68 |     client = MongoClient(uri, serverSelectionTimeoutMS=1000)
 69 | 
 70 |     # Send a query to the server to see if the connection is working.
 71 |     try:
 72 |         client.server_info()
 73 |     except Exception:
 74 |         raise
 75 |     return client
 76 | 
 77 | 
 78 | def event_as_dict(event, date):
 79 |     """Encodes an string event input as a dictionary with the date.
 80 |     Args:
 81 |         event (str): Details of a event.
 82 |         date (datetime): Date of the event.
 83 |     Returns:
 84 |         dict: Dictionary with the event and the date.
 85 |     """
 86 |     return {"date": date.strftime("%b %d %Y %H:%M:%S"), "event": event}
 87 | 
 88 | 
 89 | def github_stats_as_dict(github):
 90 |     """Encodes Github statistics as a dictionary with the date.
 91 |     Args:
 92 |         obj: Github object.
 93 |     Returns:
 94 |         dict: Dictionary with Github details and the date.
 95 |     """
 96 |     return {
 97 |         "date": datetime.now().strftime("%b %d %Y %H:%M:%S"),
 98 |         "stars": github.stars,
 99 |         "forks": github.forks,
100 |         "watchers": github.watchers,
101 |         "subscribers": github.subscribers,
102 |         "open_issues": github.open_issues,
103 |         "open_pull_requests": github.open_pull_requests,
104 |         "unique_views": github.number_unique_views,
105 |         "total_views": github.number_total_views,
106 |         "details_views": github.views,
107 |         "unique_clones": github.number_unique_clones,
108 |         "total_clones": github.number_total_clones,
109 |         "details_clones": github.clones,
110 |         "last_year_commit_frequency": github.last_year_commit_frequency,
111 |         "details_referrers": github.top_ten_referrers,
112 |         "total_referrers": github.number_total_referrers,
113 |         "unique_referrers": github.number_unique_referrers,
114 |         "details_content": github.top_ten_content,
115 |         "repo_size": github.repo_size,
116 |         "commits": github.number_commits,
117 |         "contributors": github.number_contributors,
118 |         "branches": github.number_branches,
119 |         "tags": github.number_tags,
120 |         "total_lines": github.number_total_lines,
121 |         "added_lines": github.number_added_lines,
122 |         "deleted_lines": github.number_deleted_lines,
123 |     }
124 | 
125 | 
126 | def tracker(args):
127 |     """Main function to track metrics.
128 |     Args:
129 |         args (obj): Parsed arguments.
130 |     """
131 |     if args.github_repo:
132 |         # if there is an env variable, overwrite it
133 |         token = os.environ.get("GITHUB_TOKEN", GITHUB_TOKEN)
134 |         g = Github(token, args.github_repo)
135 |         git_doc = github_stats_as_dict(g)
136 |         log.info("GitHub stats -- {}".format(git_doc))
137 |         g.clean()
138 | 
139 |     if args.event:
140 |         event_doc = event_as_dict(args.event, args.event_date)
141 |         log.info("Event -- {}".format(event_doc))
142 | 
143 |     if args.save_to_database:
144 |         # if there is an env variable, overwrite it
145 |         connection = token = os.environ.get("CONNECTION_STRING", CONNECTION_STRING)
146 |         cli = connect(connection)
147 |         db = cli[DATABASE]
148 |         if args.github_repo:
149 |             db[COLLECTION_GITHUB_STATS].insert_one(git_doc)
150 |         if args.event:
151 |             db[COLLECTION_EVENTS].insert_one(event_doc)
152 | 
153 | 
154 | if __name__ == "__main__":
155 |     log.info("Starting routine")
156 |     args = parse_args()
157 |     try:
158 |         log.info("Arguments: {}".format(args))
159 |         tracker(args)
160 |     except Exception as e:
161 |         trace = traceback.format_exc()
162 |         log.error("Traceback: {}".format(trace))
163 |         log.error("Exception: {}".format(e))
164 |     finally:
165 |         log.info("Routine finished")
166 | 


--------------------------------------------------------------------------------
/scripts/generate_conda_file.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # Copyright (c) Microsoft Corporation. All rights reserved.
  4 | # Licensed under the MIT License.
  5 | 
  6 | # This script creates yaml files to build conda environments
  7 | # For generating a conda file for running only python code:
  8 | # $ python generate_conda_file.py
  9 | # For generating a conda file for running python gpu:
 10 | # $ python generate_conda_file.py --gpu
 11 | # For generating a conda file for running pyspark:
 12 | # $ python generate_conda_file.py --pyspark
 13 | # For generating a conda file for running python gpu and pyspark:
 14 | # $ python generate_conda_file.py --gpu --pyspark
 15 | # For generating a conda file for running python gpu and pyspark with a particular version:
 16 | # $ python generate_conda_file.py --gpu --pyspark-version 2.4.0
 17 | 
 18 | import argparse
 19 | import textwrap
 20 | 
 21 | 
 22 | HELP_MSG = """
 23 | To create the conda environment:
 24 | $ conda env create -f {conda_env}.yaml
 25 | 
 26 | To update the conda environment:
 27 | $ conda env update -f {conda_env}.yaml
 28 | 
 29 | To register the conda environment in Jupyter:
 30 | $ conda activate {conda_env}
 31 | $ python -m ipykernel install --user --name {conda_env} --display-name "Python ({conda_env})"
 32 | """
 33 | 
 34 | CHANNELS = [ "defaults", "conda-forge", "pytorch", "fastai"]
 35 | 
 36 | CONDA_BASE = {
 37 |     "mock": "mock==2.0.0",
 38 |     "dask": "dask>=0.17.1",
 39 |     "fastparquet": "fastparquet>=0.1.6",
 40 |     "gitpython": "gitpython>=2.1.8",
 41 |     "ipykernel": "ipykernel>=4.6.1",
 42 |     "jupyter": "jupyter>=1.0.0",
 43 |     "matplotlib": "matplotlib>=2.2.2",
 44 |     "numpy": "numpy>=1.13.3",
 45 |     "pandas": "pandas>=0.23.4",
 46 |     "pymongo": "pymongo>=3.6.1",
 47 |     "python": "python==3.6.8",
 48 |     "pytest": "pytest>=3.6.4",
 49 |     "pytorch": "pytorch-cpu>=1.0.0",
 50 |     "seaborn": "seaborn>=0.8.1",
 51 |     "scikit-learn": "scikit-learn==0.19.1",
 52 |     "scipy": "scipy>=1.0.0",
 53 |     "scikit-surprise": "scikit-surprise>=1.0.6",
 54 |     "tensorflow": "tensorflow==1.12.0",
 55 | }
 56 | 
 57 | CONDA_PYSPARK = {"pyarrow": "pyarrow>=0.8.0", "pyspark": "pyspark==2.3.1"}
 58 | 
 59 | CONDA_GPU = {"numba": "numba>=0.38.1", "pytorch": "pytorch>=1.0.0", "tensorflow": "tensorflow-gpu==1.12.0"}
 60 | 
 61 | PIP_BASE = {
 62 |     "azureml-sdk[notebooks,contrib]": "azureml-sdk[notebooks,contrib]",
 63 |     "azure-storage": "azure-storage>=0.36.0",
 64 |     "black": "black>=18.6b4",
 65 |     "dataclasses": "dataclasses>=0.6",
 66 |     "hyperopt": "hyperopt==0.1.1",
 67 |     "idna": "idna==2.7",
 68 |     "memory-profiler": "memory-profiler>=0.54.0",
 69 |     "nvidia-ml-py3": "nvidia-ml-py3>=7.352.0",
 70 |     "papermill": "papermill==0.18.2",
 71 |     "pydocumentdb": "pydocumentdb>=2.3.3",
 72 |     "fastai": "fastai==1.0.46",
 73 | }
 74 | 
 75 | PIP_PYSPARK = {}
 76 | PIP_GPU = {}
 77 | 
 78 | 
 79 | if __name__ == "__main__":
 80 |     parser = argparse.ArgumentParser(
 81 |         description=textwrap.dedent(
 82 |             """
 83 |         This script generates a conda file for different environments.
 84 |         Plain python is the default, but flags can be used to support PySpark and GPU functionality"""
 85 |         ),
 86 |         epilog=HELP_MSG,
 87 |         formatter_class=argparse.RawDescriptionHelpFormatter,
 88 |     )
 89 |     parser.add_argument("--name", help="specify name of conda environment")
 90 |     parser.add_argument(
 91 |         "--gpu", action="store_true", help="include packages for GPU support"
 92 |     )
 93 |     parser.add_argument(
 94 |         "--pyspark", action="store_true", help="include packages for PySpark support"
 95 |     )
 96 |     parser.add_argument(
 97 |         "--pyspark-version", help="provide specific version of PySpark to use"
 98 |     )
 99 |     args = parser.parse_args()
100 | 
101 |     # check pyspark version
102 |     if args.pyspark_version is not None:
103 |         args.pyspark = True
104 |         pyspark_version_info = args.pyspark_version.split(".")
105 |         if len(pyspark_version_info) != 3 or any(
106 |             [not x.isdigit() for x in pyspark_version_info]
107 |         ):
108 |             raise TypeError(
109 |                 "PySpark version input must be valid numeric format (e.g. --pyspark-version=2.3.1)"
110 |             )
111 |     else:
112 |         args.pyspark_version = "2.3.1"
113 | 
114 |     # set name for environment and output yaml file
115 |     conda_env = "reco_base"
116 |     if args.gpu and args.pyspark:
117 |         conda_env = "reco_full"
118 |     elif args.gpu:
119 |         conda_env = "reco_gpu"
120 |     elif args.pyspark:
121 |         conda_env = "reco_pyspark"
122 | 
123 |     # overwrite environment name with user input
124 |     if args.name is not None:
125 |         conda_env = args.name
126 | 
127 |     # update conda and pip packages based on flags provided
128 |     conda_packages = CONDA_BASE
129 |     pip_packages = PIP_BASE
130 |     if args.pyspark:
131 |         conda_packages.update(CONDA_PYSPARK)
132 |         conda_packages["pyspark"] = "pyspark=={}".format(args.pyspark_version)
133 |         pip_packages.update(PIP_PYSPARK)
134 |     if args.gpu:
135 |         conda_packages.update(CONDA_GPU)
136 |         pip_packages.update(PIP_GPU)
137 | 
138 |     # write out yaml file
139 |     conda_file = "{}.yaml".format(conda_env)
140 |     with open(conda_file, "w") as f:
141 |         for line in HELP_MSG.format(conda_env=conda_env).split("\n"):
142 |             f.write("# {}\n".format(line))
143 |         f.write("name: {}\n".format(conda_env))
144 |         f.write("channels:\n")
145 |         for channel in CHANNELS:
146 |             f.write("- {}\n".format(channel))
147 |         f.write("dependencies:\n")
148 |         for conda_package in conda_packages.values():
149 |             f.write("- {}\n".format(conda_package))
150 |         f.write("- pip:\n")
151 |         for pip_package in pip_packages.values():
152 |             f.write("  - {}\n".format(pip_package))
153 | 
154 |     print("Generated conda file: {}".format(conda_file))
155 |     print(HELP_MSG.format(conda_env=conda_env))
156 | 


--------------------------------------------------------------------------------
/notebooks/scripts/generate_conda_file.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # Copyright (c) Microsoft Corporation. All rights reserved.
  4 | # Licensed under the MIT License.
  5 | 
  6 | # This script creates yaml files to build conda environments
  7 | # For generating a conda file for running only python code:
  8 | # $ python generate_conda_file.py
  9 | # For generating a conda file for running python gpu:
 10 | # $ python generate_conda_file.py --gpu
 11 | # For generating a conda file for running pyspark:
 12 | # $ python generate_conda_file.py --pyspark
 13 | # For generating a conda file for running python gpu and pyspark:
 14 | # $ python generate_conda_file.py --gpu --pyspark
 15 | # For generating a conda file for running python gpu and pyspark with a particular version:
 16 | # $ python generate_conda_file.py --gpu --pyspark-version 2.4.0
 17 | 
 18 | import argparse
 19 | import textwrap
 20 | 
 21 | 
 22 | HELP_MSG = """
 23 | To create the conda environment:
 24 | $ conda env create -f {conda_env}.yaml
 25 | 
 26 | To update the conda environment:
 27 | $ conda env update -f {conda_env}.yaml
 28 | 
 29 | To register the conda environment in Jupyter:
 30 | $ conda activate {conda_env}
 31 | $ python -m ipykernel install --user --name {conda_env} --display-name "Python ({conda_env})"
 32 | """
 33 | 
 34 | CHANNELS = [ "defaults", "conda-forge", "pytorch", "fastai"]
 35 | 
 36 | CONDA_BASE = {
 37 |     "mock": "mock==2.0.0",
 38 |     "dask": "dask>=0.17.1",
 39 |     "fastparquet": "fastparquet>=0.1.6",
 40 |     "gitpython": "gitpython>=2.1.8",
 41 |     "ipykernel": "ipykernel>=4.6.1",
 42 |     "jupyter": "jupyter>=1.0.0",
 43 |     "matplotlib": "matplotlib>=2.2.2",
 44 |     "numpy": "numpy>=1.13.3",
 45 |     "pandas": "pandas>=0.23.4",
 46 |     "pymongo": "pymongo>=3.6.1",
 47 |     "python": "python==3.6.8",
 48 |     "pytest": "pytest>=3.6.4",
 49 |     "pytorch": "pytorch-cpu>=1.0.0",
 50 |     "seaborn": "seaborn>=0.8.1",
 51 |     "scikit-learn": "scikit-learn==0.19.1",
 52 |     "scipy": "scipy>=1.0.0",
 53 |     "scikit-surprise": "scikit-surprise>=1.0.6",
 54 |     "tensorflow": "tensorflow==1.12.0",
 55 | }
 56 | 
 57 | CONDA_PYSPARK = {"pyarrow": "pyarrow>=0.8.0", "pyspark": "pyspark==2.3.1"}
 58 | 
 59 | CONDA_GPU = {"numba": "numba>=0.38.1", "pytorch": "pytorch>=1.0.0", "tensorflow": "tensorflow-gpu==1.12.0"}
 60 | 
 61 | PIP_BASE = {
 62 |     "azureml-sdk[notebooks,contrib]": "azureml-sdk[notebooks,contrib]",
 63 |     "azure-storage": "azure-storage>=0.36.0",
 64 |     "black": "black>=18.6b4",
 65 |     "dataclasses": "dataclasses>=0.6",
 66 |     "hyperopt": "hyperopt==0.1.1",
 67 |     "idna": "idna==2.7",
 68 |     "memory-profiler": "memory-profiler>=0.54.0",
 69 |     "nvidia-ml-py3": "nvidia-ml-py3>=7.352.0",
 70 |     "papermill": "papermill>=0.15.0",
 71 |     "pydocumentdb": "pydocumentdb>=2.3.3",
 72 |     "fastai": "fastai==1.0.46",
 73 | }
 74 | 
 75 | PIP_PYSPARK = {}
 76 | PIP_GPU = {}
 77 | 
 78 | 
 79 | if __name__ == "__main__":
 80 |     parser = argparse.ArgumentParser(
 81 |         description=textwrap.dedent(
 82 |             """
 83 |         This script generates a conda file for different environments.
 84 |         Plain python is the default, but flags can be used to support PySpark and GPU functionality"""
 85 |         ),
 86 |         epilog=HELP_MSG,
 87 |         formatter_class=argparse.RawDescriptionHelpFormatter,
 88 |     )
 89 |     parser.add_argument("--name", help="specify name of conda environment")
 90 |     parser.add_argument(
 91 |         "--gpu", action="store_true", help="include packages for GPU support"
 92 |     )
 93 |     parser.add_argument(
 94 |         "--pyspark", action="store_true", help="include packages for PySpark support"
 95 |     )
 96 |     parser.add_argument(
 97 |         "--pyspark-version", help="provide specific version of PySpark to use"
 98 |     )
 99 |     args = parser.parse_args()
100 | 
101 |     # check pyspark version
102 |     if args.pyspark_version is not None:
103 |         args.pyspark = True
104 |         pyspark_version_info = args.pyspark_version.split(".")
105 |         if len(pyspark_version_info) != 3 or any(
106 |             [not x.isdigit() for x in pyspark_version_info]
107 |         ):
108 |             raise TypeError(
109 |                 "PySpark version input must be valid numeric format (e.g. --pyspark-version=2.3.1)"
110 |             )
111 |     else:
112 |         args.pyspark_version = "2.3.1"
113 | 
114 |     # set name for environment and output yaml file
115 |     conda_env = "reco_base"
116 |     if args.gpu and args.pyspark:
117 |         conda_env = "reco_full"
118 |     elif args.gpu:
119 |         conda_env = "reco_gpu"
120 |     elif args.pyspark:
121 |         conda_env = "reco_pyspark"
122 | 
123 |     # overwrite environment name with user input
124 |     if args.name is not None:
125 |         conda_env = args.name
126 | 
127 |     # update conda and pip packages based on flags provided
128 |     conda_packages = CONDA_BASE
129 |     pip_packages = PIP_BASE
130 |     if args.pyspark:
131 |         conda_packages.update(CONDA_PYSPARK)
132 |         conda_packages["pyspark"] = "pyspark=={}".format(args.pyspark_version)
133 |         pip_packages.update(PIP_PYSPARK)
134 |     if args.gpu:
135 |         conda_packages.update(CONDA_GPU)
136 |         pip_packages.update(PIP_GPU)
137 | 
138 |     # write out yaml file
139 |     conda_file = "{}.yaml".format(conda_env)
140 |     with open(conda_file, "w") as f:
141 |         for line in HELP_MSG.format(conda_env=conda_env).split("\n"):
142 |             f.write("# {}\n".format(line))
143 |         f.write("name: {}\n".format(conda_env))
144 |         f.write("channels:\n")
145 |         for channel in CHANNELS:
146 |             f.write("- {}\n".format(channel))
147 |         f.write("dependencies:\n")
148 |         for conda_package in conda_packages.values():
149 |             f.write("- {}\n".format(conda_package))
150 |         f.write("- pip:\n")
151 |         for pip_package in pip_packages.values():
152 |             f.write("  - {}\n".format(pip_package))
153 | 
154 |     print("Generated conda file: {}".format(conda_file))
155 |     print(HELP_MSG.format(conda_env=conda_env))
156 | 


--------------------------------------------------------------------------------
/tests/unit/test_ncf_singlenode.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Microsoft Corporation. All rights reserved.
  2 | # Licensed under the MIT License.
  3 | 
  4 | import pytest
  5 | import itertools
  6 | import numpy as np
  7 | import pandas as pd
  8 | import os
  9 | import shutil
 10 | from reco_utils.recommender.ncf.ncf_singlenode import NCF
 11 | from reco_utils.recommender.ncf.dataset import Dataset
 12 | from reco_utils.common.constants import (
 13 |     DEFAULT_USER_COL,
 14 |     DEFAULT_ITEM_COL,
 15 |     DEFAULT_RATING_COL,
 16 |     DEFAULT_TIMESTAMP_COL,
 17 | )
 18 | from tests.ncf_common import python_dataset_ncf, test_specs_ncf
 19 | 
 20 | 
 21 | N_NEG = 5
 22 | N_NEG_TEST = 10
 23 | 
 24 | 
 25 | @pytest.mark.gpu
 26 | @pytest.mark.parametrize(
 27 |     "model_type, n_users, n_items", [("NeuMF", 1, 1), ("GMF", 10, 10), ("MLP", 4, 8)]
 28 | )
 29 | def test_init(model_type, n_users, n_items):
 30 |     model = NCF(n_users=n_users, n_items=n_items, model_type=model_type)
 31 |     # model type
 32 |     assert model.model_type == model_type.lower()
 33 |     # number of users in dataset
 34 |     assert model.n_users == n_users
 35 |     # number of items in dataset
 36 |     assert model.n_items == n_items
 37 |     # dimension of gmf user embedding
 38 |     assert model.embedding_gmf_P.shape == [n_users, model.n_factors]
 39 |     # dimension of gmf item embedding
 40 |     assert model.embedding_gmf_Q.shape == [n_items, model.n_factors]
 41 |     # dimension of mlp user embedding
 42 |     assert model.embedding_mlp_P.shape == [n_users, model.n_factors]
 43 |     # dimension of mlp item embedding
 44 |     assert model.embedding_mlp_Q.shape == [n_items, model.n_factors]
 45 |     
 46 |     # TODO: more parameters
 47 | 
 48 | 
 49 | @pytest.mark.gpu
 50 | @pytest.mark.parametrize(
 51 |     "model_type, n_users, n_items", [("NeuMF", 5, 5), ("GMF", 5, 5), ("MLP", 5, 5)]
 52 | )
 53 | def test_regular_save_load(model_type, n_users, n_items):
 54 |     ckpt = ".%s" % model_type
 55 |     if os.path.exists(ckpt):
 56 |         shutil.rmtree(ckpt)
 57 | 
 58 |     model = NCF(n_users=n_users, n_items=n_items, model_type=model_type)
 59 |     model.save(ckpt)
 60 |     if model.model_type == "neumf":
 61 |         P = model.sess.run(model.embedding_gmf_P)
 62 |         Q = model.sess.run(model.embedding_mlp_Q)
 63 |     elif model.model_type == "gmf":
 64 |         P = model.sess.run(model.embedding_gmf_P)
 65 |         Q = model.sess.run(model.embedding_gmf_Q)
 66 |     elif model.model_type == "mlp":
 67 |         P = model.sess.run(model.embedding_mlp_P)
 68 |         Q = model.sess.run(model.embedding_mlp_Q)
 69 | 
 70 |     del model
 71 |     model = NCF(n_users=n_users, n_items=n_items, model_type=model_type)
 72 | 
 73 |     if model.model_type == "neumf":
 74 |         model.load(neumf_dir=ckpt)
 75 |         P_ = model.sess.run(model.embedding_gmf_P)
 76 |         Q_ = model.sess.run(model.embedding_mlp_Q)
 77 |     elif model.model_type == "gmf":
 78 |         model.load(gmf_dir=ckpt)
 79 |         P_ = model.sess.run(model.embedding_gmf_P)
 80 |         Q_ = model.sess.run(model.embedding_gmf_Q)
 81 |     elif model.model_type == "mlp":
 82 |         model.load(mlp_dir=ckpt)
 83 |         P_ = model.sess.run(model.embedding_mlp_P)
 84 |         Q_ = model.sess.run(model.embedding_mlp_Q)
 85 | 
 86 |     # test load function
 87 |     assert np.array_equal(P, P_)
 88 |     assert np.array_equal(Q, Q_)
 89 | 
 90 |     if os.path.exists(ckpt):
 91 |         shutil.rmtree(ckpt)
 92 | 
 93 | @pytest.mark.gpu
 94 | @pytest.mark.parametrize(
 95 |     "n_users, n_items", [(5, 5), (4, 8)]
 96 | )
 97 | def test_neumf_save_load(n_users, n_items):
 98 |     model_type = "gmf"
 99 |     ckpt_gmf = ".%s" % model_type
100 |     if os.path.exists(ckpt_gmf):
101 |         shutil.rmtree(ckpt_gmf)
102 |     model = NCF(n_users=n_users, n_items=n_items, model_type=model_type)
103 |     model.save(ckpt_gmf)
104 |     P_gmf = model.sess.run(model.embedding_gmf_P)
105 |     Q_gmf = model.sess.run(model.embedding_gmf_Q)
106 |     del model
107 | 
108 |     model_type = "mlp"
109 |     ckpt_mlp = ".%s" % model_type
110 |     if os.path.exists(ckpt_mlp):
111 |         shutil.rmtree(ckpt_mlp)
112 |     model = NCF(n_users=n_users, n_items=n_items, model_type=model_type)
113 |     model.save(".%s" % model_type)
114 |     P_mlp = model.sess.run(model.embedding_mlp_P)
115 |     Q_mlp = model.sess.run(model.embedding_mlp_Q)
116 |     del model
117 | 
118 |     model_type = "neumf"
119 |     model = NCF(n_users=n_users, n_items=n_items, model_type=model_type)
120 |     model.load(gmf_dir=ckpt_gmf, mlp_dir=ckpt_mlp)
121 | 
122 |     P_gmf_ = model.sess.run(model.embedding_gmf_P)
123 |     Q_gmf_ = model.sess.run(model.embedding_gmf_Q)
124 | 
125 |     P_mlp_ = model.sess.run(model.embedding_mlp_P)
126 |     Q_mlp_ = model.sess.run(model.embedding_mlp_Q)
127 | 
128 |     assert np.array_equal(P_gmf, P_gmf_)
129 |     assert np.array_equal(Q_gmf, Q_gmf_)
130 |     assert np.array_equal(Q_mlp, Q_mlp_)
131 |     assert np.array_equal(Q_mlp, Q_mlp_)
132 | 
133 |     if os.path.exists(ckpt_gmf):
134 |         shutil.rmtree(ckpt_gmf)
135 |     if os.path.exists(ckpt_mlp):
136 |         shutil.rmtree(ckpt_mlp)
137 | 
138 |     # TODO: test loading fc-concat
139 | 
140 | @pytest.mark.gpu
141 | @pytest.mark.parametrize(
142 |     "model_type", ["NeuMF", "GMF", "MLP"]
143 | )
144 | def test_fit(python_dataset_ncf, model_type):
145 |     train, test = python_dataset_ncf
146 |     data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST)
147 |     model = NCF(n_users=data.n_users, n_items=data.n_items, model_type=model_type)
148 |     model.fit(data)
149 | 
150 | @pytest.mark.gpu
151 | @pytest.mark.parametrize(
152 |     "model_type", ["NeuMF", "GMF", "MLP"]
153 | )
154 | def test_predict(python_dataset_ncf, model_type):
155 |     # test data format
156 |     train, test = python_dataset_ncf
157 |     data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST)
158 |     model = NCF(n_users=data.n_users, n_items=data.n_items, model_type=model_type)
159 |     model.fit(data)
160 | 
161 |     test_users, test_items = list(test[DEFAULT_USER_COL]), list(test[DEFAULT_ITEM_COL])
162 | 
163 |     assert type(model.predict(test_users[0], test_items[0])) == float
164 |     
165 |     res = model.predict(test_users, test_items, is_list=True)
166 | 
167 |     assert type(res) == list
168 |     assert len(res) == len(test)
169 | 
170 | 
171 |     
172 | 


--------------------------------------------------------------------------------