├── .circleci └── config.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── appveyor.yml ├── docs └── images │ ├── MetaOD_Flowchart.jpg │ └── meta_vis.jpg ├── examples ├── meta_feature_generation_example.py └── model_selection_example.py ├── metaod ├── __init__.py ├── models │ ├── __init__.py │ ├── base_detectors.py │ ├── core.py │ ├── gen_meta_features.py │ ├── predict_metaod.py │ ├── train_metaod.py │ ├── trained_models │ │ ├── meta_scalar.joblib │ │ ├── model_list.joblib │ │ └── train_0.joblib │ └── utility.py ├── test │ ├── __init__.py │ └── test_predict_metaod.py └── version.py ├── requirements.txt ├── saved_models └── trained_models.zip ├── setup.cfg └── setup.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # Adapted from https://github.com/NeuralEnsemble/python-neo 5 | version: 2 6 | workflows: 7 | version: 2 8 | test: 9 | jobs: 10 | - test-3.6 11 | jobs: 12 | test-3.6: 13 | docker: 14 | - image: circleci/python:3.6-stretch 15 | 16 | working_directory: ~/repo 17 | 18 | steps: 19 | - checkout 20 | - run: sudo chown -R circleci:circleci /usr/local/bin 21 | 22 | # Download and cache dependencies 23 | - restore_cache: 24 | keys: 25 | - v1-py3-dependencies-{{ checksum "requirements_ci.txt" }} 26 | # fallback to using the latest cache if no exact match is found 27 | - v1-py3-dependencies- 28 | 29 | - run: 30 | name: install dependencies 31 | command: | 32 | python3 -m venv venv 33 | . venv/bin/activate 34 | pip install --upgrade pip 35 | pip install -r requirements.txt 36 | pip install pandas 37 | pip install pytest 38 | pip install pytest-cov 39 | 40 | 41 | - save_cache: 42 | paths: 43 | - ./venv 44 | key: v1-py3-dependencies-{{ checksum "requirements.txt" }} 45 | 46 | 47 | # run tests! 48 | - run: 49 | name: run tests 50 | command: | 51 | . venv/bin/activate 52 | pytest 53 | 54 | - store_artifacts: 55 | path: test-reports 56 | destination: test-reports 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | dist: xenial 3 | 4 | env: 5 | global: 6 | PIP_PREFER_BINARY=true # https://github.com/numba/llvmlite/issues/471 7 | 8 | python: 9 | - "3.5" # disabled for now as warning messages crash travis ci. turn it back in 3.5.7 10 | - "3.5-dev" # 3.5 development branch 11 | - "3.6" 12 | - "3.6-dev" # 3.6 development branch 13 | - "3.7" # 3.6 development branch 14 | 15 | install: 16 | - pip install --upgrade pip 17 | - pip install -r requirements.txt 18 | - pip install pytest 19 | - pip install pandas 20 | - pip install pytest-cov 21 | - pip install coveralls 22 | 23 | # command to run tests 24 | script: 25 | pytest --cov=metaod/ 26 | 27 | after_success: 28 | - coveralls 29 | 30 | notifications: 31 | email: 32 | recipients: 33 | - yzhao062@gmail.com 34 | on_success: never # default: change 35 | on_failure: always # default: always -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2021, Yue Zhao 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | prune examples 2 | prune notebooks 3 | prune paper_reproducibility 4 | prune metaod/test 5 | prune README.md 6 | include README.rst 7 | include requirements.txt -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Automating Outlier Detection via Meta-Learning (MetaOD) 2 | ===================================================================== 3 | 4 | 5 | .. image:: https://img.shields.io/pypi/v/metaod.svg?color=brightgreen 6 | :target: https://pypi.org/project/metaod/ 7 | :alt: PyPI version 8 | 9 | .. image:: https://img.shields.io/github/stars/yzhao062/metaod.svg 10 | :target: https://github.com/yzhao062/metaod/stargazers 11 | :alt: GitHub stars 12 | 13 | .. image:: https://img.shields.io/github/forks/yzhao062/metaod.svg?color=blue 14 | :target: https://github.com/yzhao062/metaod/network 15 | :alt: GitHub forks 16 | 17 | .. image:: https://circleci.com/gh/yzhao062/MetaOD.svg?style=svg 18 | :target: https://circleci.com/gh/yzhao062/MetaOD 19 | :alt: Circle CI 20 | 21 | .. image:: https://travis-ci.org/yzhao062/MetaOD.svg?branch=master 22 | :target: https://travis-ci.org/yzhao062/MetaOD 23 | 24 | ---- 25 | 26 | **Development Status**: **As of 09/26/2020, MetaOD is under active development and in its alpha stage. Please follow, star, and fork to get the latest updates**! 27 | For paper reproducibility, please see the paper_reproducibility folder for instructions. 28 | 29 | **Given an unsupervised outlier detection (OD) task on a new dataset, how can we automatically select a good outlier detection method and its hyperparameter(s) (collectively called a model)?** 30 | Thus far, model selection for OD has been a "black art", as model evaluation is infeasible due to the lack of (i) hold-out data with labels, and (ii) a universal objective function. 31 | In this work, we develop the first principled data-driven approach to model selection for OD, called MetaOD, based on meta-learning. 32 | In short, MetaOD is trained on extensive OD benchmark datasets to capitalize on prior experience so that **it can select the potentially best performing model for unseen datasets**. 33 | 34 | Using MetaOD is easy.
35 | **You could pass in a dataset, and MetaOD will return the best performing outlier detection models for it**, which both boosts detection quality and reduces the cost of running multiple models. 36 | 37 | 38 | **API Demo for selecting an outlier detection model on a new dataset (within 3 lines)**\ : 39 | 40 | 41 | .. code-block:: python 42 | 43 | from metaod.models.utility import prepare_trained_model 44 | from metaod.models.predict_metaod import select_model 45 | 46 | # load pretrained MetaOD model 47 | prepare_trained_model() 48 | 49 | # use MetaOD to recommend models. It returns the top n models for the new data X_train 50 | selected_models = select_model(X_train, n_selection=100) 51 | 52 | 53 | 54 | `Preprint paper `_ | `Reproducibility instruction `_ 55 | 56 | **Citing MetaOD**\ : 57 | 58 | If you use MetaOD in a scientific publication, we would appreciate 59 | citations to the following paper:: 60 | 61 | @article{zhao2020automating, 62 | author = {Zhao, Yue and Ryan Rossi and Leman Akoglu}, 63 | title = {Automating Outlier Detection via Meta-Learning}, 64 | journal = {arXiv preprint arXiv:2009.10606}, 65 | year = {2020}, 66 | } 67 | 68 | or:: 69 | 70 | Zhao, Y., Rossi, R., and Akoglu, L., 2020. Automating Outlier Detection via Meta-Learning. arXiv preprint arXiv:2009.10606. 71 | 72 | 73 | **Table of Contents**\ : 74 | 75 | 76 | * `Installation <#installation>`_ 77 | * `API Cheatsheet & Reference <#api-cheatsheet--reference>`_ 78 | * `Quick Start for Model Selection <#quick-start-for-model-selection>`_ 79 | * `Quick Start for Meta Feature Generation <#quick-start-for-meta-feature-generation>`_ 80 | 81 | 82 | ------------ 83 | 84 | System Introduction 85 | ^^^^^^^^^^^^^^^^^^^ 86 | 87 | As shown in the figure below, MetaOD contains offline meta-learner training and online model selection. 88 | For selecting an outlier detection model for a new dataset, one only needs the online model selection. Specifically, to be finished. 89 | 90 | 91 | .. image:: https://raw.githubusercontent.com/yzhao062/MetaOD/master/docs/images/MetaOD_Flowchart.jpg 92 | :target: https://raw.githubusercontent.com/yzhao062/MetaOD/master/docs/images/MetaOD_Flowchart.jpg 93 | :alt: metaod_flow 94 | :align: center 95 | 96 | ----- 97 | 98 | 99 | Installation 100 | ^^^^^^^^^^^^ 101 | 102 | It is recommended to use **pip** for installation. Please make sure 103 | **the latest version** is installed, as MetaOD is updated frequently: 104 | 105 | .. code-block:: bash 106 | 107 | pip install metaod # normal install 108 | pip install --upgrade metaod # or update if needed 109 | pip install --pre metaod # or include pre-release version for new features 110 | 111 | Alternatively, you could clone the repository and run the setup.py file: 112 | 113 | .. code-block:: bash 114 | 115 | git clone https://github.com/yzhao062/metaod.git 116 | cd metaod 117 | pip install . 118 | 119 | 120 | **Required Dependencies**\ : 121 | 122 | 123 | * Python 3.5, 3.6, or 3.7 124 | * joblib>=0.14.1 125 | * liac-arff 126 | * numpy>=1.18.1 127 | * scipy>=0.20 128 | * **scikit_learn==0.22.1** 129 | * pandas>=0.20 130 | * pyod>=0.8 131 | 132 | **Note**: Since we need to load trained models, we fix the scikit-learn version 133 | to 0.22.1. We recommend using MetaOD in a fresh environment to ensure the right dependencies. 134 | 135 | 136 | Quick Start for Model Selection 137 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 138 | 139 | `"examples/model_selection_example.py" `_ 140 | provides an example of using MetaOD to select top models on a new dataset, fully unsupervised.
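For context, the sketch below condenses that example script end to end: generate data, ask MetaOD for recommendations, then fit and score one recommended detector with pyod. The specific detector shown (LODA with ``n_bins=5, n_random_cuts=100``) is simply the top pick from one particular run, as in the example script, and may differ on yours; the key procedures are then broken down step by step.

.. code-block:: python

    from pyod.utils.data import generate_data
    from pyod.models.loda import LODA
    from sklearn.metrics import average_precision_score

    from metaod.models.utility import prepare_trained_model
    from metaod.models.predict_metaod import select_model

    # synthetic data with 10% outliers, as in examples/model_selection_example.py
    X_train, y_train, X_test, y_test = generate_data(
        n_train=1000, n_test=100, n_features=3,
        contamination=0.1, random_state=42)

    # load the pretrained MetaOD model and get the top 100 recommendations
    prepare_trained_model()
    selected_models = select_model(X_train, n_selection=100)

    # fit the 1st recommended model from one run (LODA here) and
    # evaluate it with average precision on the training data
    model_1 = LODA(n_bins=5, n_random_cuts=100)
    print("1st model Average Precision",
          average_precision_score(y_train, model_1.fit(X_train).decision_scores_))
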
141 | 142 | The key procedures are below: 143 | 144 | #. Load some synthetic datasets 145 | 146 | .. code-block:: python 147 | 148 | # Generate sample data 149 | X_train, y_train, X_test, y_test = \ 150 | generate_data(n_train=1000, 151 | n_test=100, 152 | n_features=3, 153 | contamination=0.5, 154 | random_state=42) 155 | 156 | #. Use MetaOD to select top 100 models 157 | 158 | .. code-block:: python 159 | 160 | from metaod.models.utility import prepare_trained_model 161 | from metaod.models.predict_metaod import select_model 162 | 163 | # load pretrained models 164 | prepare_trained_model() 165 | 166 | # recommended models. this returns the top 100 models for X_train 167 | selected_models = select_model(X_train, n_selection=100) 168 | 169 | 170 | #. Show the selected models' performance evaluation (results may vary slightly due to built-in randomness). 171 | 172 | .. code-block:: python 173 | 174 | 175 | 1st model Average Precision 0.9780551579734139 176 | 10th model Average Precision 0.959749602397687 177 | 50th model Average Precision 0.6211392467111937 178 | 179 | 180 | Quick Start for Meta Feature Generation 181 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 182 | 183 | Getting the embedding of an arbitrary dataset is the first step of MetaOD, which 184 | can be done by our specialized meta-feature generation function. 185 | 186 | It may be used for other purposes as well, e.g., measuring the similarity of 187 | two datasets. 188 | 189 | .. code-block:: python 190 | 191 | # import meta-feature generator 192 | from metaod.models.gen_meta_features import generate_meta_features 193 | 194 | meta_features, _ = generate_meta_features(X) 195 | 196 | A simple example of visualizing two different environments using TSNE with 197 | our meta-features is shown below. The environment on the left is composed of 198 | 100 datasets with known similarity, and the same color stands for the same group of datasets. 199 | The environment on the right is composed of 200 | 62 datasets without known similarity. Our meta-features successfully capture 201 | the underlying similarity in the left figure. 202 | 203 | ..
image:: https://raw.githubusercontent.com/yzhao062/MetaOD/master/docs/images/meta_vis.jpg 204 | :target: https://raw.githubusercontent.com/yzhao062/MetaOD/master/docs/images/meta_vis.jpg 205 | :alt: meta_viz 206 | :align: center 207 | 208 | 209 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | build: off 2 | 3 | # branches to build 4 | branches: 5 | # whitelist 6 | only: 7 | - master 8 | - development 9 | environment: 10 | matrix: 11 | - PYTHON: "C:\\Python36-x64" 12 | - PYTHON: "C:\\Python37-x64" 13 | 14 | skip_commits: 15 | files: 16 | - "*.yml" 17 | - "*.rst" 18 | - "*.md" 19 | - "LICENSE" 20 | 21 | init: 22 | - "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%" 23 | 24 | install: 25 | - "%PYTHON%\\python.exe -m pip install --upgrade pip setuptools" 26 | - "%PYTHON%\\python.exe -m pip install wheel" 27 | - "%PYTHON%\\python.exe -m pip install pytest" 28 | - "%PYTHON%\\python.exe -m pip install pandas" 29 | - "%PYTHON%\\python.exe -m pip install -r requirements.txt" 30 | 31 | 32 | 33 | test_script: 34 | - "%PYTHON%\\python.exe -m pytest" 35 | 36 | after_test: 37 | - "%PYTHON%\\python.exe setup.py bdist_wheel" 38 | 39 | artifacts: 40 | - path: dist\* 41 | 42 | notifications: 43 | - provider: Email 44 | to: 45 | - yzhao062@gmail.com 46 | on_build_success: false 47 | on_build_failure: true 48 | on_build_status_changed: true 49 | -------------------------------------------------------------------------------- /docs/images/MetaOD_Flowchart.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/docs/images/MetaOD_Flowchart.jpg -------------------------------------------------------------------------------- /docs/images/meta_vis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/docs/images/meta_vis.jpg -------------------------------------------------------------------------------- /examples/meta_feature_generation_example.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/examples/meta_feature_generation_example.py -------------------------------------------------------------------------------- /examples/model_selection_example.py: -------------------------------------------------------------------------------- 1 | """MetaOD prediction with the trained model 2 | """ 3 | # License: BSD 2 clause 4 | 5 | from sklearn.metrics import average_precision_score 6 | 7 | from pyod.utils.data import generate_data 8 | from pyod.models.loda import LODA 9 | from pyod.models.knn import KNN 10 | from pyod.models.iforest import IForest 11 | from pyod.models.ocsvm import OCSVM 12 | 13 | 14 | from metaod.models.utility import prepare_trained_model 15 | from metaod.models.predict_metaod import select_model 16 | 17 | 18 | if __name__ == "__main__": 19 | contamination = 0.1 # percentage of outliers 20 | n_train = 1000 # number of training points 21 | n_test = 100 # number of testing points 22 | 23 | # Generate sample data 24 | X_train, y_train, X_test, y_test = \ 25 | generate_data(n_train=n_train, 26 | n_test=n_test, 27 | n_features=3, 28 | contamination=contamination, 29 | random_state=42) 30 | # load pretrained models 
31 | prepare_trained_model() 32 | 33 | # recommended models 34 | selected_models = select_model(X_train, n_selection=100) 35 | 36 | 37 | print("Showing the top recommended models...") 38 | for i, model in enumerate(selected_models): 39 | print(i, model) 40 | 41 | print() 42 | 43 | model_1 = LODA(n_bins=5, n_random_cuts=100) 44 | print("1st model Average Precision", average_precision_score(y_train, model_1.fit(X_train).decision_scores_)) 45 | 46 | model_10 = LODA(n_bins=5, n_random_cuts=20) 47 | print("10th model Average Precision", average_precision_score(y_train, model_10.fit(X_train).decision_scores_)) 48 | 49 | 50 | model_50 = OCSVM(kernel= 'sigmoid', nu=0.6) 51 | print("50th model Average Precision", average_precision_score(y_train, model_50.fit(X_train).decision_scores_)) 52 | -------------------------------------------------------------------------------- /metaod/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/metaod/__init__.py -------------------------------------------------------------------------------- /metaod/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/metaod/models/__init__.py -------------------------------------------------------------------------------- /metaod/models/base_detectors.py: -------------------------------------------------------------------------------- 1 | from pyod.models.iforest import IForest 2 | from pyod.models.lof import LOF 3 | from pyod.models.ocsvm import OCSVM 4 | from pyod.models.knn import KNN 5 | from pyod.models.hbos import HBOS 6 | from pyod.models.abod import ABOD 7 | from pyod.models.loda import LODA 8 | from pyod.models.cof import COF 9 | 10 | 11 | def get_detectors(): 12 | # randomness_flags = [] 13 | BASE_ESTIMATORS = [ 14 | LODA(n_bins=5, n_random_cuts=10), 15 | LODA(n_bins=5, n_random_cuts=20), 16 | LODA(n_bins=5, n_random_cuts=30), 17 | LODA(n_bins=5, n_random_cuts=40), 18 | LODA(n_bins=5, n_random_cuts=50), 19 | LODA(n_bins=5, n_random_cuts=75), 20 | LODA(n_bins=5, n_random_cuts=100), 21 | LODA(n_bins=5, n_random_cuts=150), 22 | LODA(n_bins=5, n_random_cuts=200), 23 | 24 | LODA(n_bins=10, n_random_cuts=10), 25 | LODA(n_bins=10, n_random_cuts=20), 26 | LODA(n_bins=10, n_random_cuts=30), 27 | LODA(n_bins=10, n_random_cuts=40), 28 | LODA(n_bins=10, n_random_cuts=50), 29 | LODA(n_bins=10, n_random_cuts=75), 30 | LODA(n_bins=10, n_random_cuts=100), 31 | LODA(n_bins=10, n_random_cuts=150), 32 | LODA(n_bins=10, n_random_cuts=200), 33 | 34 | LODA(n_bins=15, n_random_cuts=10), 35 | LODA(n_bins=15, n_random_cuts=20), 36 | LODA(n_bins=15, n_random_cuts=30), 37 | LODA(n_bins=15, n_random_cuts=40), 38 | LODA(n_bins=15, n_random_cuts=50), 39 | LODA(n_bins=15, n_random_cuts=75), 40 | LODA(n_bins=15, n_random_cuts=100), 41 | LODA(n_bins=15, n_random_cuts=150), 42 | LODA(n_bins=15, n_random_cuts=200), 43 | 44 | LODA(n_bins=20, n_random_cuts=10), 45 | LODA(n_bins=20, n_random_cuts=20), 46 | LODA(n_bins=20, n_random_cuts=30), 47 | LODA(n_bins=20, n_random_cuts=40), 48 | LODA(n_bins=20, n_random_cuts=50), 49 | LODA(n_bins=20, n_random_cuts=75), 50 | LODA(n_bins=20, n_random_cuts=100), 51 | LODA(n_bins=20, n_random_cuts=150), 52 | LODA(n_bins=20, n_random_cuts=200), 53 | 54 | LODA(n_bins=25, n_random_cuts=10), 55 | LODA(n_bins=25, n_random_cuts=20), 56 | LODA(n_bins=25, 
n_random_cuts=30), 57 | LODA(n_bins=25, n_random_cuts=40), 58 | LODA(n_bins=25, n_random_cuts=50), 59 | LODA(n_bins=25, n_random_cuts=75), 60 | LODA(n_bins=25, n_random_cuts=100), 61 | LODA(n_bins=25, n_random_cuts=150), 62 | LODA(n_bins=25, n_random_cuts=200), 63 | 64 | LODA(n_bins=30, n_random_cuts=10), 65 | LODA(n_bins=30, n_random_cuts=20), 66 | LODA(n_bins=30, n_random_cuts=30), 67 | LODA(n_bins=30, n_random_cuts=40), 68 | LODA(n_bins=30, n_random_cuts=50), 69 | LODA(n_bins=30, n_random_cuts=75), 70 | LODA(n_bins=30, n_random_cuts=100), 71 | LODA(n_bins=30, n_random_cuts=150), 72 | LODA(n_bins=30, n_random_cuts=200), 73 | 74 | ABOD(n_neighbors=3), 75 | ABOD(n_neighbors=5), 76 | ABOD(n_neighbors=10), 77 | ABOD(n_neighbors=15), 78 | ABOD(n_neighbors=20), 79 | ABOD(n_neighbors=25), 80 | ABOD(n_neighbors=50), 81 | ABOD(n_neighbors=60), 82 | ABOD(n_neighbors=75), 83 | ABOD(n_neighbors=80), 84 | ABOD(n_neighbors=90), 85 | ABOD(n_neighbors=100), 86 | 87 | IForest(n_estimators=10, max_features=0.1), 88 | IForest(n_estimators=10, max_features=0.2), 89 | IForest(n_estimators=10, max_features=0.3), 90 | IForest(n_estimators=10, max_features=0.4), 91 | IForest(n_estimators=10, max_features=0.5), 92 | IForest(n_estimators=10, max_features=0.6), 93 | IForest(n_estimators=10, max_features=0.7), 94 | IForest(n_estimators=10, max_features=0.8), 95 | IForest(n_estimators=10, max_features=0.9), 96 | 97 | IForest(n_estimators=20, max_features=0.1), 98 | IForest(n_estimators=20, max_features=0.2), 99 | IForest(n_estimators=20, max_features=0.3), 100 | IForest(n_estimators=20, max_features=0.4), 101 | IForest(n_estimators=20, max_features=0.5), 102 | IForest(n_estimators=20, max_features=0.6), 103 | IForest(n_estimators=20, max_features=0.7), 104 | IForest(n_estimators=20, max_features=0.8), 105 | IForest(n_estimators=20, max_features=0.9), 106 | 107 | IForest(n_estimators=30, max_features=0.1), 108 | IForest(n_estimators=30, max_features=0.2), 109 | IForest(n_estimators=30, max_features=0.3), 110 | IForest(n_estimators=30, max_features=0.4), 111 | IForest(n_estimators=30, max_features=0.5), 112 | IForest(n_estimators=30, max_features=0.6), 113 | IForest(n_estimators=30, max_features=0.7), 114 | IForest(n_estimators=30, max_features=0.8), 115 | IForest(n_estimators=30, max_features=0.9), 116 | 117 | IForest(n_estimators=40, max_features=0.1), 118 | IForest(n_estimators=40, max_features=0.2), 119 | IForest(n_estimators=40, max_features=0.3), 120 | IForest(n_estimators=40, max_features=0.4), 121 | IForest(n_estimators=40, max_features=0.5), 122 | IForest(n_estimators=40, max_features=0.6), 123 | IForest(n_estimators=40, max_features=0.7), 124 | IForest(n_estimators=40, max_features=0.8), 125 | IForest(n_estimators=40, max_features=0.9), 126 | 127 | IForest(n_estimators=50, max_features=0.1), 128 | IForest(n_estimators=50, max_features=0.2), 129 | IForest(n_estimators=50, max_features=0.3), 130 | IForest(n_estimators=50, max_features=0.4), 131 | IForest(n_estimators=50, max_features=0.5), 132 | IForest(n_estimators=50, max_features=0.6), 133 | IForest(n_estimators=50, max_features=0.7), 134 | IForest(n_estimators=50, max_features=0.8), 135 | IForest(n_estimators=50, max_features=0.9), 136 | 137 | IForest(n_estimators=75, max_features=0.1), 138 | IForest(n_estimators=75, max_features=0.2), 139 | IForest(n_estimators=75, max_features=0.3), 140 | IForest(n_estimators=75, max_features=0.4), 141 | IForest(n_estimators=75, max_features=0.5), 142 | IForest(n_estimators=75, max_features=0.6), 143 | 
IForest(n_estimators=75, max_features=0.7), 144 | IForest(n_estimators=75, max_features=0.8), 145 | IForest(n_estimators=75, max_features=0.9), 146 | 147 | IForest(n_estimators=100, max_features=0.1), 148 | IForest(n_estimators=100, max_features=0.2), 149 | IForest(n_estimators=100, max_features=0.3), 150 | IForest(n_estimators=100, max_features=0.4), 151 | IForest(n_estimators=100, max_features=0.5), 152 | IForest(n_estimators=100, max_features=0.6), 153 | IForest(n_estimators=100, max_features=0.7), 154 | IForest(n_estimators=100, max_features=0.8), 155 | IForest(n_estimators=100, max_features=0.9), 156 | 157 | IForest(n_estimators=150, max_features=0.1), 158 | IForest(n_estimators=150, max_features=0.2), 159 | IForest(n_estimators=150, max_features=0.3), 160 | IForest(n_estimators=150, max_features=0.4), 161 | IForest(n_estimators=150, max_features=0.5), 162 | IForest(n_estimators=150, max_features=0.6), 163 | IForest(n_estimators=150, max_features=0.7), 164 | IForest(n_estimators=150, max_features=0.8), 165 | IForest(n_estimators=150, max_features=0.9), 166 | 167 | IForest(n_estimators=200, max_features=0.1), 168 | IForest(n_estimators=200, max_features=0.2), 169 | IForest(n_estimators=200, max_features=0.3), 170 | IForest(n_estimators=200, max_features=0.4), 171 | IForest(n_estimators=200, max_features=0.5), 172 | IForest(n_estimators=200, max_features=0.6), 173 | IForest(n_estimators=200, max_features=0.7), 174 | IForest(n_estimators=200, max_features=0.8), 175 | IForest(n_estimators=200, max_features=0.9), 176 | 177 | KNN(n_neighbors=1, method='largest'), 178 | KNN(n_neighbors=5, method='largest'), 179 | KNN(n_neighbors=10, method='largest'), 180 | KNN(n_neighbors=15, method='largest'), 181 | KNN(n_neighbors=20, method='largest'), 182 | KNN(n_neighbors=25, method='largest'), 183 | KNN(n_neighbors=50, method='largest'), 184 | KNN(n_neighbors=60, method='largest'), 185 | KNN(n_neighbors=70, method='largest'), 186 | KNN(n_neighbors=80, method='largest'), 187 | KNN(n_neighbors=90, method='largest'), 188 | KNN(n_neighbors=100, method='largest'), 189 | 190 | KNN(n_neighbors=1, method='mean'), 191 | KNN(n_neighbors=5, method='mean'), 192 | KNN(n_neighbors=10, method='mean'), 193 | KNN(n_neighbors=15, method='mean'), 194 | KNN(n_neighbors=20, method='mean'), 195 | KNN(n_neighbors=25, method='mean'), 196 | KNN(n_neighbors=50, method='mean'), 197 | KNN(n_neighbors=60, method='mean'), 198 | KNN(n_neighbors=70, method='mean'), 199 | KNN(n_neighbors=80, method='mean'), 200 | KNN(n_neighbors=90, method='mean'), 201 | KNN(n_neighbors=100, method='mean'), 202 | 203 | KNN(n_neighbors=1, method='median'), 204 | KNN(n_neighbors=5, method='median'), 205 | KNN(n_neighbors=10, method='median'), 206 | KNN(n_neighbors=15, method='median'), 207 | KNN(n_neighbors=20, method='median'), 208 | KNN(n_neighbors=25, method='median'), 209 | KNN(n_neighbors=50, method='median'), 210 | KNN(n_neighbors=60, method='median'), 211 | KNN(n_neighbors=70, method='median'), 212 | KNN(n_neighbors=80, method='median'), 213 | KNN(n_neighbors=90, method='median'), 214 | KNN(n_neighbors=100, method='median'), 215 | 216 | LOF(n_neighbors=1, metric='manhattan'), 217 | LOF(n_neighbors=5, metric='manhattan'), 218 | LOF(n_neighbors=10, metric='manhattan'), 219 | LOF(n_neighbors=15, metric='manhattan'), 220 | LOF(n_neighbors=20, metric='manhattan'), 221 | LOF(n_neighbors=25, metric='manhattan'), 222 | LOF(n_neighbors=50, metric='manhattan'), 223 | LOF(n_neighbors=60, metric='manhattan'), 224 | LOF(n_neighbors=70, metric='manhattan'), 
225 | LOF(n_neighbors=80, metric='manhattan'), 226 | LOF(n_neighbors=90, metric='manhattan'), 227 | LOF(n_neighbors=100, metric='manhattan'), 228 | 229 | LOF(n_neighbors=1, metric='euclidean'), 230 | LOF(n_neighbors=5, metric='euclidean'), 231 | LOF(n_neighbors=10, metric='euclidean'), 232 | LOF(n_neighbors=15, metric='euclidean'), 233 | LOF(n_neighbors=20, metric='euclidean'), 234 | LOF(n_neighbors=25, metric='euclidean'), 235 | LOF(n_neighbors=50, metric='euclidean'), 236 | LOF(n_neighbors=60, metric='euclidean'), 237 | LOF(n_neighbors=70, metric='euclidean'), 238 | LOF(n_neighbors=80, metric='euclidean'), 239 | LOF(n_neighbors=90, metric='euclidean'), 240 | LOF(n_neighbors=100, metric='euclidean'), 241 | 242 | LOF(n_neighbors=1, metric='minkowski'), 243 | LOF(n_neighbors=5, metric='minkowski'), 244 | LOF(n_neighbors=10, metric='minkowski'), 245 | LOF(n_neighbors=15, metric='minkowski'), 246 | LOF(n_neighbors=20, metric='minkowski'), 247 | LOF(n_neighbors=25, metric='minkowski'), 248 | LOF(n_neighbors=50, metric='minkowski'), 249 | LOF(n_neighbors=60, metric='minkowski'), 250 | LOF(n_neighbors=70, metric='minkowski'), 251 | LOF(n_neighbors=80, metric='minkowski'), 252 | LOF(n_neighbors=90, metric='minkowski'), 253 | LOF(n_neighbors=100, metric='minkowski'), 254 | 255 | HBOS(n_bins=5, alpha=0.1), 256 | HBOS(n_bins=5, alpha=0.2), 257 | HBOS(n_bins=5, alpha=0.3), 258 | HBOS(n_bins=5, alpha=0.4), 259 | HBOS(n_bins=5, alpha=0.5), 260 | 261 | HBOS(n_bins=10, alpha=0.1), 262 | HBOS(n_bins=10, alpha=0.2), 263 | HBOS(n_bins=10, alpha=0.3), 264 | HBOS(n_bins=10, alpha=0.4), 265 | HBOS(n_bins=10, alpha=0.5), 266 | 267 | HBOS(n_bins=20, alpha=0.1), 268 | HBOS(n_bins=20, alpha=0.2), 269 | HBOS(n_bins=20, alpha=0.3), 270 | HBOS(n_bins=20, alpha=0.4), 271 | HBOS(n_bins=20, alpha=0.5), 272 | 273 | HBOS(n_bins=30, alpha=0.1), 274 | HBOS(n_bins=30, alpha=0.2), 275 | HBOS(n_bins=30, alpha=0.3), 276 | HBOS(n_bins=30, alpha=0.4), 277 | HBOS(n_bins=30, alpha=0.5), 278 | 279 | HBOS(n_bins=40, alpha=0.1), 280 | HBOS(n_bins=40, alpha=0.2), 281 | HBOS(n_bins=40, alpha=0.3), 282 | HBOS(n_bins=40, alpha=0.4), 283 | HBOS(n_bins=40, alpha=0.5), 284 | 285 | HBOS(n_bins=50, alpha=0.1), 286 | HBOS(n_bins=50, alpha=0.2), 287 | HBOS(n_bins=50, alpha=0.3), 288 | HBOS(n_bins=50, alpha=0.4), 289 | HBOS(n_bins=50, alpha=0.5), 290 | 291 | HBOS(n_bins=75, alpha=0.1), 292 | HBOS(n_bins=75, alpha=0.2), 293 | HBOS(n_bins=75, alpha=0.3), 294 | HBOS(n_bins=75, alpha=0.4), 295 | HBOS(n_bins=75, alpha=0.5), 296 | 297 | HBOS(n_bins=100, alpha=0.1), 298 | HBOS(n_bins=100, alpha=0.2), 299 | HBOS(n_bins=100, alpha=0.3), 300 | HBOS(n_bins=100, alpha=0.4), 301 | HBOS(n_bins=100, alpha=0.5), 302 | 303 | OCSVM(nu=0.1, kernel="linear"), 304 | OCSVM(nu=0.2, kernel="linear"), 305 | OCSVM(nu=0.3, kernel="linear"), 306 | OCSVM(nu=0.4, kernel="linear"), 307 | OCSVM(nu=0.5, kernel="linear"), 308 | OCSVM(nu=0.6, kernel="linear"), 309 | OCSVM(nu=0.7, kernel="linear"), 310 | OCSVM(nu=0.8, kernel="linear"), 311 | OCSVM(nu=0.9, kernel="linear"), 312 | 313 | OCSVM(nu=0.1, kernel="poly"), 314 | OCSVM(nu=0.2, kernel="poly"), 315 | OCSVM(nu=0.3, kernel="poly"), 316 | OCSVM(nu=0.4, kernel="poly"), 317 | OCSVM(nu=0.5, kernel="poly"), 318 | OCSVM(nu=0.6, kernel="poly"), 319 | OCSVM(nu=0.7, kernel="poly"), 320 | OCSVM(nu=0.8, kernel="poly"), 321 | OCSVM(nu=0.9, kernel="poly"), 322 | 323 | OCSVM(nu=0.1, kernel="rbf"), 324 | OCSVM(nu=0.2, kernel="rbf"), 325 | OCSVM(nu=0.3, kernel="rbf"), 326 | OCSVM(nu=0.4, kernel="rbf"), 327 | OCSVM(nu=0.5, kernel="rbf"), 328 | 
OCSVM(nu=0.6, kernel="rbf"), 329 | OCSVM(nu=0.7, kernel="rbf"), 330 | OCSVM(nu=0.8, kernel="rbf"), 331 | OCSVM(nu=0.9, kernel="rbf"), 332 | 333 | OCSVM(nu=0.1, kernel="sigmoid"), 334 | OCSVM(nu=0.2, kernel="sigmoid"), 335 | OCSVM(nu=0.3, kernel="sigmoid"), 336 | OCSVM(nu=0.4, kernel="sigmoid"), 337 | OCSVM(nu=0.5, kernel="sigmoid"), 338 | OCSVM(nu=0.6, kernel="sigmoid"), 339 | OCSVM(nu=0.7, kernel="sigmoid"), 340 | OCSVM(nu=0.8, kernel="sigmoid"), 341 | OCSVM(nu=0.9, kernel="sigmoid"), 342 | 343 | COF(n_neighbors=3), 344 | COF(n_neighbors=5), 345 | COF(n_neighbors=10), 346 | COF(n_neighbors=15), 347 | COF(n_neighbors=20), 348 | COF(n_neighbors=25), 349 | COF(n_neighbors=50), 350 | ] 351 | 352 | # randomness_flags.extend([True] * 54) # LODA 353 | # randomness_flags.extend([False] * 7) # ABOD 354 | # randomness_flags.extend([True] * 81) # IForest 355 | # randomness_flags.extend([False] * 36) # KNN 356 | # randomness_flags.extend([False] * 36) # LOF 357 | # randomness_flags.extend([False] * 40) # HBOS 358 | # randomness_flags.extend([False] * 36) # OCSVM 359 | # randomness_flags.extend([False] * 7) # COF 360 | # return BASE_ESTIMATORS, randomness_flags 361 | return BASE_ESTIMATORS 362 | -------------------------------------------------------------------------------- /metaod/models/core.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import numpy as np 3 | 4 | from sklearn.metrics import dcg_score, ndcg_score 5 | from sklearn.utils import check_array 6 | from sklearn.metrics import mean_squared_error 7 | 8 | from sklearn.multioutput import MultiOutputRegressor 9 | from sklearn.ensemble import RandomForestRegressor 10 | from sklearn.decomposition import PCA 11 | from sklearn.preprocessing import MinMaxScaler, StandardScaler 12 | from copy import deepcopy 13 | 14 | 15 | def get_mse(pred, actual): 16 | # Ignore nonzero terms. 17 | pred = pred[actual.nonzero()].flatten() 18 | actual = actual[actual.nonzero()].flatten() 19 | return mean_squared_error(pred, actual) 20 | 21 | 22 | def sigmoid(x, a=1): 23 | return 1 / (1 + np.exp(-1 * a * x)) 24 | 25 | 26 | def sigmoid_derivate(x, a=1): 27 | return sigmoid(x, a) * (1 - sigmoid(x, a)) 28 | 29 | 30 | class MetaODClass(object): 31 | def __init__(self, 32 | train_performance, 33 | valid_performance, 34 | n_factors=40, 35 | learning='sgd', 36 | verbose=False): 37 | """ 38 | Train a matrix factorization model to predict empty 39 | entries in a matrix. The terminology assumes a 40 | train_performance matrix which is ~ user x item 41 | 42 | Params 43 | ====== 44 | train_performance : (ndarray) 45 | User x Item matrix with corresponding train_performance 46 | 47 | n_factors : (int) 48 | Number of latent factors to use in matrix 49 | factorization model 50 | learning : (str) 51 | Method of optimization. Options include 52 | 'sgd' or 'als'. 
53 | 54 | item_fact_reg : (float) 55 | Regularization term for item latent factors 56 | 57 | user_fact_reg : (float) 58 | Regularization term for user latent factors 59 | 60 | item_bias_reg : (float) 61 | Regularization term for item biases 62 | 63 | user_bias_reg : (float) 64 | Regularization term for user biases 65 | 66 | verbose : (bool) 67 | Whether or not to printout training progress 68 | """ 69 | 70 | self.ratings = train_performance 71 | self.valid_ratings = valid_performance 72 | self.n_users, self.n_items = train_performance.shape 73 | self.n_factors = n_factors 74 | self.learning = learning 75 | if self.learning == 'sgd': 76 | self.n_samples, self.n_models = self.ratings.shape[0], \ 77 | self.ratings.shape[1] 78 | self._v = verbose 79 | self.train_loss_ = [0] 80 | self.valid_loss_ = [0] 81 | self.learning_rates_ = [] 82 | self.scalar_ = None 83 | self.pca_ = None 84 | 85 | def get_train_dcg(self, user_vecs, item_vecs): 86 | # make sure it is non zero 87 | user_vecs[np.isnan(self.user_vecs)] = 0 88 | 89 | ndcg_s = [] 90 | for w in range(self.ratings.shape[0]): 91 | ndcg_s.append(ndcg_score([self.ratings[w, :]], 92 | [np.dot(user_vecs[w, :], item_vecs.T)])) 93 | 94 | return np.mean(ndcg_s) 95 | 96 | def train(self, meta_features, valid_meta=None, n_iter=10, 97 | learning_rate=0.1, n_estimators=100, max_depth=10, max_rate=1.05, 98 | min_rate=0.1, discount=0.95, n_steps=10): 99 | """ Train model for n_iter iterations from scratch.""" 100 | 101 | self.pca_ = PCA(n_components=self.n_factors) 102 | self.pca_.fit(meta_features) 103 | 104 | meta_features_pca = self.pca_.transform(meta_features) 105 | meta_valid_pca = self.pca_.transform(valid_meta) 106 | 107 | self.scalar_ = StandardScaler() 108 | self.scalar_.fit(meta_features_pca) 109 | 110 | meta_features_scaled = self.scalar_.transform(meta_features_pca) 111 | meta_valid_scaled = self.scalar_.transform(meta_valid_pca) 112 | 113 | self.user_vecs = meta_features_scaled 114 | 115 | self.item_vecs = np.random.normal(scale=1. 
/ self.n_factors, 116 | size=(self.n_items, self.n_factors)) 117 | 118 | step_size = (max_rate - min_rate) / (n_steps - 1) 119 | lr_list = list(np.arange(min_rate, max_rate, step_size)) 120 | lr_list.append(max_rate) 121 | lr_list_reverse = deepcopy(lr_list) 122 | lr_list_reverse.reverse() 123 | 124 | learning_rate_full = [] 125 | for w in range(n_iter): 126 | learning_rate_full.extend(lr_list) 127 | learning_rate_full.extend(lr_list_reverse) 128 | 129 | self.learning_rate_ = min_rate 130 | self.learning_rates_.append(self.learning_rate_) 131 | 132 | ctr = 1 133 | np_ctr = 1 134 | while ctr <= n_iter: 135 | 136 | self.learning_rate_ = learning_rate_full[ctr - 1] 137 | self.learning_rates_.append(self.learning_rate_) 138 | 139 | self.regr_multirf = MultiOutputRegressor(RandomForestRegressor( 140 | n_estimators=n_estimators, max_depth=max_depth, n_jobs=4)) 141 | 142 | # make sure it is non zero 143 | self.user_vecs[np.isnan(self.user_vecs)] = 0 144 | 145 | self.regr_multirf.fit(meta_features_scaled, self.user_vecs) 146 | 147 | meta_valid_scaled_new = self.regr_multirf.predict( 148 | meta_valid_scaled) 149 | 150 | # if ctr % 10 == 0 and self._v: 151 | # print ('\tcurrent iteration: {}'.format(ctr)) 152 | # print('ALORS Rank Fixed iteration', ctr, ndcg_score(self.train_performance, np.dot(self.user_vecs, self.item_vecs.T))) 153 | # self.learning_rates_.append(self.learning_rate) 154 | ndcg_s = [] 155 | for w in range(self.ratings.shape[0]): 156 | ndcg_s.append(ndcg_score([self.ratings[w, :]], [ 157 | np.dot(self.user_vecs[w, :], self.item_vecs.T)], 158 | k=self.n_items)) 159 | 160 | # print('ALORS Fixed iteration', ctr, ndcg_score(self.train_performance, np.dot(self.user_vecs, self.item_vecs.T))) 161 | # print('ALORS Rank Fixed iteration', ctr, 'training', np.mean(ndcg_s)) 162 | self.train_loss_.append(np.mean(ndcg_s)) 163 | 164 | ndcg_s = [] 165 | for w in range(self.valid_ratings.shape[0]): 166 | ndcg_s.append(ndcg_score([self.valid_ratings[w, :]], [ 167 | np.dot(meta_valid_scaled_new[w, :], self.item_vecs.T)], 168 | k=self.n_items)) 169 | 170 | # print('ALORS Fixed iteration', ctr, ndcg_score(self.train_performance, np.dot(self.user_vecs, self.item_vecs.T))) 171 | # print('ALORS Rank Fixed iteration', ctr, 'valid', np.mean(ndcg_s)) 172 | self.valid_loss_.append(np.mean(ndcg_s)) 173 | 174 | print('MetaOD', ctr, 'train', 175 | self.train_loss_[-1], 'valid', self.valid_loss_[-1], 176 | 'learning rate', self.learning_rates_[-1]) 177 | 178 | # improvement is smaller than 1 perc 179 | if ((self.valid_loss_[-1] - self.valid_loss_[-2]) / 180 | self.valid_loss_[-2]) <= 0.001: 181 | # print(((self.valid_loss_[-1] - self.valid_loss_[-2])/self.valid_loss_[-2])) 182 | np_ctr += 1 183 | else: 184 | np_ctr = 1 185 | if np_ctr > 5: 186 | break 187 | 188 | # update learning rates 189 | # self.learning_rate_ = self.learning_rate_ + 0.05 190 | # self.learning_rates_.append(self.learning_rate_) 191 | # if ctr % 2: 192 | # if ctr <=50: 193 | # self.learning_rate_ = min_rate * np.power(discount,ctr) 194 | # else: 195 | # self.learning_rate_ = min_rate * np.power(discount,50) 196 | 197 | # else: 198 | # if ctr <=50: 199 | # self.learning_rate_ = max_rate * np.power(discount,ctr) 200 | # else: 201 | # self.learning_rate_ = max_rate * np.power(discount,50) 202 | 203 | # self.learning_rates_.append(self.learning_rate_) 204 | 205 | train_indices = list(range(self.n_samples)) 206 | np.random.shuffle(train_indices) 207 | # print(train_indices) 208 | 209 | for h in train_indices: 210 | 211 | uh = self.user_vecs[h, 
:].reshape(1, -1) 212 | # print(uh.shape) 213 | grads = [] 214 | 215 | for i in range(self.n_models): 216 | # outler loop 217 | vi = self.item_vecs[i, :].reshape(-1, 1) 218 | phis = [] 219 | rights = [] 220 | rights_v = [] 221 | # remove i from js 222 | js = list(range(self.n_models)) 223 | js.remove(i) 224 | 225 | for j in js: 226 | vj = self.item_vecs[j, :].reshape(-1, 1) 227 | # temp_vt = np.exp(np.matmul(uh, (vj-vi))) 228 | # temp_vt = np.ndarray.item(temp_vt) 229 | temp_vt = sigmoid( 230 | np.ndarray.item(np.matmul(uh, (vj - vi))), a=1) 231 | temp_vt_derivative = sigmoid_derivate( 232 | np.ndarray.item(np.matmul(uh, (vj - vi))), a=1) 233 | # print(uh.re, (self.item_vecs[j,:]-self.item_vecs[i,:]).T.shape) 234 | # print((self.item_vecs[j,:]-self.item_vecs[i,:]).reshape(-1, 1).shape) 235 | # print(temp_vt.shape) 236 | # assert (len(temp_vt)==1) 237 | phis.append(temp_vt) 238 | rights.append(temp_vt_derivative * (vj - vi)) 239 | rights_v.append(temp_vt_derivative * uh) 240 | phi = np.sum(phis) + 1.5 241 | rights = np.asarray(rights).reshape(self.n_models - 1, 242 | self.n_factors) 243 | rights_v = np.asarray(rights_v).reshape(self.n_models - 1, 244 | self.n_factors) 245 | 246 | # print(rights.shape, rights_v.shape) 247 | 248 | right = np.sum(np.asarray(rights), axis=0) 249 | right_v = np.sum(np.asarray(rights_v), axis=0) 250 | # print(right, right_v) 251 | 252 | # print(np.asarray(rights).shape, np.asarray(right).shape) 253 | grad = (10 ** (self.ratings[h, i]) - 1) / ( 254 | phi * (np.log(phi)) ** 2) * right 255 | grad_v = (10 ** (self.ratings[h, i]) - 1) / ( 256 | phi * (np.log(phi)) ** 2) * right_v 257 | 258 | self.item_vecs[i, :] += self.learning_rate_ * grad_v 259 | 260 | # print(h, i, grad.shape) 261 | grads.append(grad) 262 | 263 | grads_uh = np.asarray(grads) 264 | grad_uh = np.sum(grads_uh, axis=0) 265 | 266 | self.user_vecs[h, :] -= self.learning_rate_ * grad_uh 267 | # print(self.learning_rate_) 268 | 269 | ctr += 1 270 | 271 | # self.regr_multirf = MultiOutputRegressor(RandomForestRegressor( 272 | # n_estimators=n_estimators, max_depth=max_depth, n_jobs=4)) 273 | 274 | # self.regr_multirf = MultiOutputRegressor(Lasso())) 275 | # self.regr_multirf = MultiOutputRegressor(xgb.XGBRegressor(n_estimators=n_estimators)) 276 | 277 | # self.regr_multirf.fit(meta_features, self.user_vecs) 278 | 279 | # disable unnecessary information 280 | self.ratings = None 281 | self.valid_ratings = None 282 | return self 283 | 284 | # def predict(self, u, i): 285 | # """ Single user and item prediction.""" 286 | # # prediction = self.global_bias + self.user_bias[u] + self.item_bias[i] 287 | # prediction = self.user_vecs[u, :].dot(self.item_vecs[i, :].T) 288 | # # prediction += self.user_vecs[u, :].dot(self.item_vecs[i, :].T) 289 | # return prediction 290 | 291 | # def predict_all(self): 292 | # """ Predict train_performance for every user and item.""" 293 | # predictions = np.zeros((self.user_vecs.shape[0], 294 | # self.item_vecs.shape[0])) 295 | # for u in range(self.user_vecs.shape[0]): 296 | # for i in range(self.item_vecs.shape[0]): 297 | # predictions[u, i] = self.predict(u, i) 298 | 299 | # return predictions 300 | 301 | def predict(self, test_meta): 302 | test_meta = check_array(test_meta) 303 | assert (test_meta.shape[1]==200) 304 | 305 | test_meta_scaled = self.pca_.transform(test_meta) 306 | # print('B', test_meta_scaled.shape) 307 | 308 | test_meta_scaled = self.scalar_.transform(test_meta_scaled) 309 | test_meta_scaled = self.regr_multirf.predict(test_meta_scaled) 310 | 311 | # 
predicted_scores = np.dot(test_k, self.item_vecs.T) + self.item_bias 312 | predicted_scores = np.dot(test_meta_scaled, self.item_vecs.T) 313 | # print(predicted_scores.shape) 314 | assert (predicted_scores.shape[0] == test_meta.shape[0]) 315 | assert (predicted_scores.shape[1] == self.n_models) 316 | 317 | return predicted_scores 318 | 319 | ##################################### 320 | # random_state = np.random.RandomState(42) 321 | 322 | # r = list(range(100)) 323 | # X = random_state.choice(r, size=[100, 5], replace=True)/100 324 | # X_meta = random_state.choice(r, size=[100, 200], replace=True) 325 | 326 | # X_train, X_test, X_train_meta, X_test_meta = train_test_split(X, X_meta, test_size=0.33, random_state=42) 327 | 328 | # train_data_cv, valid_data_cv, train_roc_cv, valid_roc_cv = train_test_split(X_train_meta, X_train, test_size=0.2) 329 | 330 | 331 | # EMF = MetaODClass(train_roc_cv, valid_roc_cv, n_factors=3, learning='sgd', verbose=False) 332 | # EMF.train(n_iter=200, meta_features=train_data_cv, valid_meta=valid_data_cv, learning_rate=0.05, min_rate=0.05, max_rate=0.2, discount=0.98) 333 | 334 | # U = EMF.user_vecs 335 | # V = EMF.item_vecs 336 | 337 | # pred_scores = np.dot(U, V.T) 338 | 339 | # print('rating matrix size:', train_roc_cv.shape) 340 | # print('Our modified loss and gradient results in NDCG:', ndcg_score(train_roc_cv, pred_scores)) 341 | # print() 342 | 343 | # for j in range(10): 344 | # U = np.random.normal(size=U.shape) 345 | # V = np.random.normal(size=V.shape) 346 | # pred_scores = np.dot(U, V.T) 347 | 348 | # print('trial', j, 'random U, V result in NDCG:', ndcg_score(train_roc_cv, pred_scores)) 349 | 350 | # # bias_global = EMF.global_bias 351 | # # bias_user = EMF.user_bias 352 | # # bias_item = EMF.item_bias 353 | 354 | # # # print(EMF.regr_multirf.predict(test_meta).shape) 355 | # predicted_scores = EMF.predict(X_test_meta) 356 | # # predicted_scores_max = np.nanargmax(predicted_scores, axis=1) 357 | -------------------------------------------------------------------------------- /metaod/models/gen_meta_features.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Generate meta-features from an arbitrary dataset. 
3 | """ 4 | # Author: Yue Zhao 5 | # License: BSD 2 clause 6 | import pandas as pd 7 | import numpy as np 8 | import itertools 9 | 10 | from sklearn.decomposition import PCA as sklearn_PCA 11 | from scipy.stats import skew, kurtosis 12 | from scipy.stats import f_oneway 13 | from scipy.stats import entropy 14 | 15 | from scipy.stats import moment 16 | from scipy.stats import normaltest 17 | 18 | from pyod.models.hbos import HBOS 19 | from pyod.models.iforest import IForest 20 | from pyod.models.pca import PCA 21 | from pyod.models.loda import LODA 22 | from sklearn.utils import check_array 23 | 24 | 25 | def gini(array): 26 | """Calculate the Gini coefficient of a numpy array.""" 27 | # based on bottom eq: 28 | # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg 29 | # from: 30 | # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm 31 | # All values are treated equally, arrays must be 1d: 32 | array = array.flatten() 33 | if np.amin(array) < 0: 34 | # Values cannot be negative: 35 | array -= np.amin(array) 36 | # Values cannot be 0: 37 | array = np.add(array, 0.0000001, casting="unsafe") 38 | # Values must be sorted: 39 | array = np.sort(array) 40 | # Index per array element: 41 | index = np.arange(1, array.shape[0] + 1) 42 | # Number of array elements: 43 | n = array.shape[0] 44 | # Gini coefficient: 45 | return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array))) 46 | 47 | 48 | def Diff(li1, li2): 49 | """Calculate the difference of two list 50 | 51 | Parameters 52 | ---------- 53 | li1 54 | li2 55 | 56 | Returns 57 | ------- 58 | 59 | """ 60 | return (list(set(li1) - set(li2))) 61 | 62 | 63 | def argmaxn(w, nth): 64 | w = np.asarray(w).ravel() 65 | t = np.argsort(w) 66 | return t[-1 * nth] 67 | 68 | 69 | def flatten_diagonally(x, diags=None): 70 | diags = np.array(diags) 71 | if x.shape[1] > x.shape[0]: 72 | diags += x.shape[1] - x.shape[0] 73 | n = max(x.shape) 74 | ndiags = 2 * n - 1 75 | i, j = np.indices(x.shape) 76 | d = np.array([]) 77 | for ndi in range(ndiags): 78 | if diags != None: 79 | if not ndi in diags: 80 | continue 81 | d = np.concatenate((d, x[i == j + (n - 1) - ndi])) 82 | return d 83 | 84 | 85 | def list_process(x, r_min=True, r_max=True, r_mean=True, r_std=True, 86 | r_skew=True, r_kurtosis=True): 87 | """Return statistics of a list 88 | 89 | Parameters 90 | ---------- 91 | x 92 | r_min 93 | r_max 94 | r_mean 95 | r_std 96 | r_skew 97 | r_kurtosis 98 | 99 | Returns 100 | ------- 101 | 102 | """ 103 | x = np.asarray(x).reshape(-1, 1) 104 | return_list = [] 105 | 106 | if r_min: 107 | return_list.append(np.nanmin(x)) 108 | 109 | if r_max: 110 | return_list.append(np.nanmax(x)) 111 | 112 | if r_mean: 113 | return_list.append(np.nanmean(x)) 114 | 115 | if r_std: 116 | return_list.append(np.nanstd(x)) 117 | 118 | if r_skew: 119 | return_list.append(skew(x, nan_policy='omit')[0]) 120 | 121 | if r_kurtosis: 122 | return_list.append(kurtosis(x, nan_policy='omit')[0]) 123 | 124 | return return_list 125 | 126 | 127 | def list_process_name(var): 128 | return [var + '_min', var + '_max', var + '_mean', var + '_std', 129 | var + '_skewness', var + '_kurtosis'] 130 | 131 | 132 | def generate_meta_features(X): 133 | """Get the meta-features of a datasets X 134 | 135 | Parameters 136 | ---------- 137 | X : numpy array of shape (n_samples, n_features) 138 | Input array 139 | 140 | Returns 141 | ------- 142 | meta_features : numpy array of shape (1, 200) 143 | Meta-feature in dimension of 200 144 | 145 | """ 146 | # 
outliers_fraction = np.count_nonzero(y) / len(y) 147 | # outliers_percentage = round(outliers_fraction * 100, ndigits=4) 148 | X = check_array(X) 149 | 150 | meta_vec = [] 151 | meta_vec_names = [] 152 | 153 | # on the sample level 154 | n_samples, n_features = X.shape[0], X.shape[1] 155 | 156 | meta_vec.append(n_samples) 157 | meta_vec.append(n_features) 158 | 159 | meta_vec_names.append('n_samples') 160 | meta_vec_names.append('n_features') 161 | 162 | sample_mean = np.mean(X) 163 | sample_median = np.median(X) 164 | sample_var = np.var(X) 165 | sample_min = np.min(X) 166 | sample_max = np.max(X) 167 | sample_std = np.std(X) 168 | 169 | q1, q25, q75, q99 = np.percentile(X, [0.01, 0.25, 0.75, 0.99]) 170 | iqr = q75 - q25 171 | 172 | normalized_mean = sample_mean / sample_max 173 | normalized_median = sample_median / sample_max 174 | sample_range = sample_max - sample_min 175 | sample_gini = gini(X) 176 | med_abs_dev = np.median(np.absolute(X - sample_median)) 177 | avg_abs_dev = np.mean(np.absolute(X - sample_mean)) 178 | quant_coeff_disp = (q75 - q25) / (q75 + q25) 179 | coeff_var = sample_var / sample_mean 180 | 181 | outliers_15iqr = np.logical_or( 182 | X < (q25 - 1.5 * iqr), X > (q75 + 1.5 * iqr)) 183 | outliers_3iqr = np.logical_or(X < (q25 - 3 * iqr), X > (q75 + 3 * iqr)) 184 | outliers_1_99 = np.logical_or(X < q1, X > q99) 185 | outliers_3std = np.logical_or(X < (sample_mean - 3 * sample_std), 186 | X > (sample_mean + 3 * sample_std)) 187 | 188 | percent_outliers_15iqr = np.sum(outliers_15iqr) / len(X) 189 | percent_outliers_3iqr = np.sum(outliers_3iqr) / len(X) 190 | percent_outliers_1_99 = np.sum(outliers_1_99) / len(X) 191 | percent_outliers_3std = np.sum(outliers_3std) / len(X) 192 | 193 | has_outliers_15iqr = np.any(outliers_15iqr).astype(int) 194 | has_outliers_3iqr = np.any(outliers_3iqr).astype(int) 195 | has_outliers_1_99 = np.any(outliers_1_99).astype(int) 196 | has_outliers_3std = np.any(outliers_3std).astype(int) 197 | 198 | meta_vec.extend( 199 | [sample_mean, sample_median, sample_var, sample_min, sample_max, 200 | sample_std, 201 | q1, q25, q75, q99, iqr, normalized_mean, normalized_median, 202 | sample_range, sample_gini, 203 | med_abs_dev, avg_abs_dev, quant_coeff_disp, coeff_var, 204 | # moment_5, moment_6, moment_7, moment_8, moment_9, moment_10, 205 | percent_outliers_15iqr, percent_outliers_3iqr, percent_outliers_1_99, 206 | percent_outliers_3std, 207 | has_outliers_15iqr, has_outliers_3iqr, has_outliers_1_99, 208 | has_outliers_3std]) 209 | 210 | meta_vec_names.extend( 211 | ['sample_mean', 'sample_median', 'sample_var', 'sample_min', 212 | 'sample_max', 'sample_std', 213 | 'q1', 'q25', 'q75', 'q99', 'iqr', 'normalized_mean', 214 | 'normalized_median', 'sample_range', 'sample_gini', 215 | 'med_abs_dev', 'avg_abs_dev', 'quant_coeff_disp', 'coeff_var', 216 | # moment_5, moment_6, moment_7, moment_8, moment_9, moment_10, 217 | 'percent_outliers_15iqr', 'percent_outliers_3iqr', 218 | 'percent_outliers_1_99', 'percent_outliers_3std', 219 | 'has_outliers_15iqr', 'has_outliers_3iqr', 'has_outliers_1_99', 220 | 'has_outliers_3std']) 221 | 222 | ########################################################################### 223 | 224 | normality_k2, normality_p = normaltest(X) 225 | is_normal_5 = (normality_p < 0.05).astype(int) 226 | is_normal_1 = (normality_p < 0.01).astype(int) 227 | 228 | meta_vec.extend(list_process(normality_p)) 229 | meta_vec.extend(list_process(is_normal_5)) 230 | meta_vec.extend(list_process(is_normal_1)) 231 | 232 | 
meta_vec_names.extend(list_process_name('normality_p')) 233 | meta_vec_names.extend(list_process_name('is_normal_5')) 234 | meta_vec_names.extend(list_process_name('is_normal_1')) 235 | 236 | moment_5 = moment(X, moment=5) 237 | moment_6 = moment(X, moment=6) 238 | moment_7 = moment(X, moment=7) 239 | moment_8 = moment(X, moment=8) 240 | moment_9 = moment(X, moment=9) 241 | moment_10 = moment(X, moment=10) 242 | meta_vec.extend(list_process(moment_5)) 243 | meta_vec.extend(list_process(moment_6)) 244 | meta_vec.extend(list_process(moment_7)) 245 | meta_vec.extend(list_process(moment_8)) 246 | meta_vec.extend(list_process(moment_9)) 247 | meta_vec.extend(list_process(moment_10)) 248 | meta_vec_names.extend(list_process_name('moment_5')) 249 | meta_vec_names.extend(list_process_name('moment_6')) 250 | meta_vec_names.extend(list_process_name('moment_7')) 251 | meta_vec_names.extend(list_process_name('moment_8')) 252 | meta_vec_names.extend(list_process_name('moment_9')) 253 | meta_vec_names.extend(list_process_name('moment_10')) 254 | 255 | # note: this is for each dimension == the number of dimensions 256 | skewness_list = skew(X).reshape(-1, 1) 257 | skew_values = list_process(skewness_list) 258 | meta_vec.extend(skew_values) 259 | meta_vec_names.extend(list_process_name('skewness')) 260 | 261 | # note: this is for each dimension == the number of dimensions 262 | kurtosis_list = kurtosis(X) 263 | kurtosis_values = list_process(kurtosis_list) 264 | meta_vec.extend(kurtosis_values) 265 | meta_vec_names.extend(list_process_name('kurtosis')) 266 | 267 | correlation = np.nan_to_num(pd.DataFrame(X).corr(), nan=0) 268 | correlation_list = flatten_diagonally(correlation)[ 269 | 0:int((n_features * n_features - n_features) / 2)] 270 | correlation_values = list_process(correlation_list) 271 | meta_vec.extend(correlation_values) 272 | meta_vec_names.extend(list_process_name('correlation')) 273 | 274 | covariance = np.cov(X.T) 275 | covariance_list = flatten_diagonally(covariance)[ 276 | 0:int((n_features * n_features - n_features) / 2)] 277 | covariance_values = list_process(covariance_list) 278 | meta_vec.extend(covariance_values) 279 | meta_vec_names.extend(list_process_name('covariance')) 280 | 281 | # sparsity 282 | rep_counts = [] 283 | for i in range(n_features): 284 | rep_counts.append(len(np.unique(X[:, i]))) 285 | sparsity_list = np.asarray(rep_counts) / (n_samples) 286 | sparsity = list_process(sparsity_list) 287 | meta_vec.extend(sparsity) 288 | meta_vec_names.extend(list_process_name('sparsity')) 289 | 290 | # ANOVA p value 291 | p_values_list = [] 292 | all_perm = list(itertools.combinations(list(range(n_features)), 2)) 293 | for j in all_perm: 294 | p_values_list.append(f_oneway(X[:, j[0]], X[:, j[1]])[1]) 295 | anova_p_value = list_process(np.asarray(p_values_list)) 296 | # anova_p_value = np.mean(p_values_list) 297 | # anova_p_value_exceed_thresh = np.mean((np.asarray(p_values_list)<0.05).astype(int)) 298 | meta_vec.extend(anova_p_value) 299 | meta_vec_names.extend(list_process_name('anova_p_value')) 300 | 301 | # pca 302 | pca_transformer = sklearn_PCA(n_components=3) 303 | X_transform = pca_transformer.fit_transform(X) 304 | 305 | # first pc 306 | pca_fpc = list_process(X_transform[0, :], r_min=False, r_max=False, 307 | r_mean=False, 308 | r_std=True, r_skew=True, r_kurtosis=True) 309 | meta_vec.extend(pca_fpc) 310 | meta_vec_names.extend( 311 | ['first_pca_std', 'first_pca_skewness', 'first_pca_kurtosis']) 312 | 313 | # entropy 314 | entropy_list = [] 315 | for i in 
range(n_features): 316 | counts = pd.Series(X[:, i]).value_counts() 317 | entropy_list.append(entropy(counts) / n_samples) 318 | entropy_values = list_process(entropy_list) 319 | meta_vec.extend(entropy_values) 320 | meta_vec_names.extend(list_process_name('entropy')) 321 | 322 | ##############################Landmarkers###################################### 323 | # HBOS 324 | clf = HBOS(n_bins=10) 325 | clf.fit(X) 326 | HBOS_hists = clf.hist_ 327 | HBOS_mean = np.mean(HBOS_hists, axis=0) 328 | HBOS_max = np.max(HBOS_hists, axis=0) 329 | HBOS_min = np.min(HBOS_hists, axis=0) 330 | meta_vec.extend(list_process(HBOS_mean)) 331 | meta_vec.extend(list_process(HBOS_max)) 332 | meta_vec.extend(list_process(HBOS_min)) 333 | meta_vec_names.extend(list_process_name('HBOS_mean')) 334 | meta_vec_names.extend(list_process_name('HBOS_max')) 335 | meta_vec_names.extend(list_process_name('HBOS_min')) 336 | 337 | # IForest 338 | n_estimators = 100 339 | clf = IForest(n_estimators=n_estimators) 340 | clf.fit(X) 341 | 342 | n_leaves = [] 343 | n_depth = [] 344 | fi_mean = [] 345 | fi_max = [] 346 | 347 | # doing this for each sub-trees 348 | for i in range(n_estimators): 349 | n_leaves.append(clf.estimators_[i].get_n_leaves()) 350 | n_depth.append(clf.estimators_[i].get_depth()) 351 | fi_mean.append(clf.estimators_[i].feature_importances_.mean()) 352 | fi_max.append(clf.estimators_[i].feature_importances_.max()) 353 | # print(clf.estimators_[i].tree_) 354 | 355 | meta_vec.extend(list_process(n_leaves)) 356 | meta_vec.extend(list_process(n_depth)) 357 | meta_vec.extend(list_process(fi_mean)) 358 | meta_vec.extend(list_process(fi_max)) 359 | 360 | meta_vec_names.extend(list_process_name('IForest_n_leaves')) 361 | meta_vec_names.extend(list_process_name('IForest_n_depth')) 362 | meta_vec_names.extend(list_process_name('IForest_fi_mean')) 363 | meta_vec_names.extend(list_process_name('IForest_fi_max')) 364 | 365 | # PCA 366 | clf = PCA(n_components=3) 367 | clf.fit(X) 368 | meta_vec.extend(clf.explained_variance_ratio_) 369 | meta_vec.extend(clf.singular_values_) 370 | meta_vec_names.extend( 371 | ['pca_expl_ratio_1', 'pca_expl_ratio_2', 'pca_expl_ratio_3']) 372 | meta_vec_names.extend(['pca_sv_1', 'pca_sv_2', 'pca_sv_3']) 373 | 374 | # LODA 375 | n_bins = 10 376 | n_random_cuts = 100 377 | 378 | n_hists_mean = [] 379 | n_hists_max = [] 380 | 381 | n_cuts_mean = [] 382 | n_cuts_max = [] 383 | 384 | clf = LODA(n_bins=n_bins, n_random_cuts=n_random_cuts) 385 | clf.fit(X) 386 | 387 | for i in range(n_bins): 388 | n_hists_mean.append(clf.histograms_[:, i].mean()) 389 | n_hists_max.append(clf.histograms_[:, i].max()) 390 | for i in range(n_random_cuts): 391 | n_cuts_mean.append(clf.histograms_[i, :].mean()) 392 | n_cuts_max.append(clf.histograms_[i, :].max()) 393 | 394 | meta_vec.extend(list_process(n_hists_mean)) 395 | meta_vec.extend(list_process(n_hists_max)) 396 | meta_vec.extend(list_process(n_cuts_mean)) 397 | meta_vec.extend(list_process(n_cuts_max)) 398 | 399 | meta_vec_names.extend(list_process_name('LODA_n_hists_mean')) 400 | meta_vec_names.extend(list_process_name('LODA_n_hists_max')) 401 | meta_vec_names.extend(list_process_name('LODA_n_cuts_mean')) 402 | meta_vec_names.extend(list_process_name('LODA_n_cuts_max')) 403 | 404 | return meta_vec, meta_vec_names 405 | -------------------------------------------------------------------------------- /metaod/models/predict_metaod.py: -------------------------------------------------------------------------------- 1 | """MetaOD prediction with the trained model 2 
| """ 3 | # License: BSD 2 clause 4 | 5 | 6 | # environment setting 7 | from zipfile import ZipFile 8 | import os 9 | from joblib import load 10 | from pyod.utils.data import generate_data 11 | import numpy as np 12 | 13 | from metaod.models.gen_meta_features import generate_meta_features 14 | from metaod.models.utility import fix_nan 15 | from pyod.utils.data import generate_data 16 | 17 | def get_top_models(p, n): 18 | return np.flip(np.argsort(p))[:n] 19 | 20 | def select_model(X, trained_model_location="trained_models", n_selection=1): 21 | 22 | # print(os.path.realpath(__file__)) 23 | # unzip trained models 24 | # with ZipFile(os.path.join(os.path.dirname(os.path.realpath(__file__)), 25 | # 'trained_models.zip'), 'r') as zip: 26 | # # # printing all the contents of the zip file 27 | # # zip.printdir() 28 | 29 | # # extracting all the files 30 | # print('Extracting trained models now...') 31 | # zip.extractall(path='trained_models') 32 | # print('Finish extracting models') 33 | 34 | # load PCA scalar 35 | meta_scalar = load(os.path.join(trained_model_location,"meta_scalar.joblib")) 36 | # generate meta features 37 | meta_X, _ = generate_meta_features(X) 38 | meta_X = np.nan_to_num(meta_X,nan=0) 39 | # replace nan by 0 for now 40 | # todo: replace by mean is better as fix_nan 41 | meta_X = meta_scalar.transform(np.asarray(meta_X).reshape(1, -1)).astype(float) 42 | 43 | # use all trained models for ensemble 44 | trained_models = [ 45 | "train_0.joblib", 46 | "train_2.joblib", 47 | # "train_42.joblib" 48 | ] 49 | print(os.getcwd()) 50 | # # load trained models 51 | model_lists = list(load(os.path.join(trained_model_location,"model_list.joblib"))) 52 | 53 | predict_scores = np.zeros([len(trained_models), len(model_lists)]) 54 | 55 | for i, model in enumerate(trained_models): 56 | clf = load(os.path.join(trained_model_location, model)) 57 | # w = load (model) 58 | predict_scores[i,] = clf.predict(meta_X) 59 | predicted_scores_max = np.nanargmax(predict_scores[i,]) 60 | # print('top model', model_lists[predicted_scores_max]) 61 | combined_predict = np.average(predict_scores, axis=0) 62 | 63 | predicted_scores_sorted = get_top_models(combined_predict, n_selection) 64 | predicted_scores_max = np.nanargmax(combined_predict) 65 | 66 | print('top model', model_lists[predicted_scores_sorted[0]]) 67 | 68 | return np.asarray(model_lists)[predicted_scores_sorted] 69 | 70 | # if __name__ == "__main__": 71 | 72 | # contamination = 0.1 # percentage of outliers 73 | # n_train = 1000 # number of training points 74 | # n_test = 100 # number of testing points 75 | 76 | # # Generate sample data 77 | # X_train, y_train, X_test, y_test = \ 78 | # generate_data(n_train=n_train, 79 | # n_test=n_test, 80 | # n_features=3, 81 | # contamination=contamination, 82 | # random_state=42) 83 | 84 | # clf_setting = select_model(X_train, n_selection=10) 85 | -------------------------------------------------------------------------------- /metaod/models/train_metaod.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Sep 24 17:13:27 2020 4 | 5 | @author: yuezh 6 | """ 7 | 8 | import os 9 | import random 10 | import pandas as pd 11 | import numpy as np 12 | 13 | from sklearn.utils import check_array 14 | from sklearn.preprocessing import MinMaxScaler 15 | 16 | from scipy.io import loadmat 17 | 18 | from joblib import dump 19 | 20 | from metaod.models.utility import read_arff, fix_nan 21 | from metaod.models.gen_meta_features import 
generate_meta_features 22 | from metaod.models.core import MetaODClass 23 | 24 | # read in performance table 25 | roc_df = pd.read_excel(os.path.join('data', 'performance_table.xlsx'), 26 | sheet_name='AP') 27 | 28 | # trim the table 29 | roc_mat = roc_df.to_numpy() 30 | roc_mat_red = fix_nan(roc_mat[2:, 4:].astype('float')) 31 | 32 | # get statistics of the training data 33 | n_datasets, n_configs = roc_mat_red.shape[0], roc_mat_red.shape[1] 34 | data_headers = roc_mat[2:, 0] 35 | config_headers = roc_df.columns[4:] 36 | dump(config_headers, 'model_list.joblib') 37 | 38 | # %% 39 | 40 | # build meta-features 41 | meta_mat = np.zeros([n_datasets, 200]) 42 | 43 | # read in mat files 44 | mat_file_list = [ 45 | 'annthyroid.mat', 46 | 'arrhythmia.mat', 47 | 'breastw.mat', 48 | 'glass.mat', 49 | 'ionosphere.mat', 50 | 'letter.mat', 51 | 'lympho.mat', 52 | 'mammography.mat', 53 | 'mnist.mat', 54 | 'musk.mat', 55 | 'optdigits.mat', 56 | 'pendigits.mat', 57 | 'pima.mat', 58 | 'satellite.mat', 59 | 'satimage-2.mat', 60 | 'shuttle.mat', 61 | 'smtp_n.mat', 62 | 'speech.mat', 63 | 'thyroid.mat', 64 | 'vertebral.mat', 65 | 'vowels.mat', 66 | 'wbc.mat', 67 | 'wine.mat', 68 | 'Annthyroid', 69 | 'Arrhythmia', 70 | 'Cardiotocography', 71 | 'HeartDisease', # too small 72 | 'Hepatitis', # too small 73 | 'InternetAds', 74 | 'PageBlocks', 75 | 'Pima', 76 | 'SpamBase', 77 | 'Stamps', 78 | 'Wilt', 79 | 80 | 'ALOI', # too large 81 | 'Glass', # too small 82 | 'PenDigits', 83 | 'Shuttle', 84 | 'Waveform', 85 | 'WBC', # too small 86 | 'WDBC', # too small 87 | 'WPBC', # too small 88 | ] 89 | 90 | for j in range(23): 91 | mat_file = mat_file_list[j] 92 | mat = loadmat(os.path.join("data", "ODDS", mat_file)) 93 | X = mat['X'] 94 | meta_mat[j, :], meta_vec_names = generate_meta_features(X) 95 | print(j, mat_file) 96 | 97 | # read arff files 98 | file_names = [ 99 | 'Annthyroid', 100 | 'Arrhythmia', 101 | 'Cardiotocography', 102 | 'HeartDisease', # too small 103 | 'Hepatitis', # too small 104 | 'InternetAds', 105 | 'PageBlocks', 106 | 'Pima', 107 | 'SpamBase', 108 | 'Stamps', 109 | 'Wilt', 110 | 111 | 'ALOI', # too large 112 | 'Glass', # too small 113 | 'PenDigits', 114 | 'Shuttle', 115 | 'Waveform', 116 | 'WBC', # too small 117 | 'WDBC', # too small 118 | 'WPBC', # too small 119 | ] 120 | 121 | ############################################################################# 122 | misplaced_list = ['Arrhythmia', 'Cardiotocography', 'Hepatitis', 'ALOI', 123 | 'KDDCup99'] 124 | arff_list = [ 125 | os.path.join('semantic', 'Annthyroid', 'Annthyroid_withoutdupl_07.arff'), 126 | os.path.join('semantic', 'Arrhythmia', 'Arrhythmia_withoutdupl_46.arff'), 127 | os.path.join('semantic', 'Cardiotocography', 128 | 'Cardiotocography_withoutdupl_22.arff'), 129 | os.path.join('semantic', 'HeartDisease', 130 | 'HeartDisease_withoutdupl_44.arff'), 131 | os.path.join('semantic', 'Hepatitis', 'Hepatitis_withoutdupl_16.arff'), 132 | os.path.join('semantic', 'InternetAds', 133 | 'InternetAds_withoutdupl_norm_19.arff'), 134 | os.path.join('semantic', 'PageBlocks', 'PageBlocks_withoutdupl_09.arff'), 135 | os.path.join('semantic', 'Pima', 'Pima_withoutdupl_35.arff'), 136 | os.path.join('semantic', 'SpamBase', 'SpamBase_withoutdupl_40.arff'), 137 | os.path.join('semantic', 'Stamps', 'Stamps_withoutdupl_09.arff'), 138 | os.path.join('semantic', 'Wilt', 'Wilt_withoutdupl_05.arff'), 139 | 140 | os.path.join('literature', 'ALOI', 'ALOI_withoutdupl.arff'), 141 | os.path.join('literature', 'Glass', 'Glass_withoutdupl_norm.arff'), 142 | 
os.path.join('literature', 'PenDigits', 143 | 'PenDigits_withoutdupl_norm_v01.arff'), 144 | os.path.join('literature', 'Shuttle', 'Shuttle_withoutdupl_v01.arff'), 145 | os.path.join('literature', 'Waveform', 'Waveform_withoutdupl_v01.arff'), 146 | os.path.join('literature', 'WBC', 'WBC_withoutdupl_v01.arff'), 147 | os.path.join('literature', 'WDBC', 'WDBC_withoutdupl_v01.arff'), 148 | os.path.join('literature', 'WPBC', 'WPBC_withoutdupl_norm.arff') 149 | ] 150 | 151 | for j in range(23, 42): 152 | mat_file = file_names[j - 23] 153 | mat_file_path = os.path.join("data", "DAMI", arff_list[j - 24]) 154 | X, y, attributes = read_arff(mat_file_path, misplaced_list) 155 | X = check_array(X).astype('float64') 156 | meta_mat[j, :], meta_vec_names = generate_meta_features(X) 157 | print("processing", j, mat_file) 158 | 159 | # read emmott dataset 160 | selected_bench = pd.read_csv(os.path.join('data', 'childsets.csv'))[ 161 | 'bench.id'].values.tolist() 162 | selected_bench_loc = pd.read_csv(os.path.join('data', 'childsets.csv'))[ 163 | 'location'].values.tolist() 164 | 165 | for j in range(42, 142): 166 | print("processing", j, selected_bench_loc[j - 42]) 167 | mat = pd.read_csv( 168 | os.path.join("data", "Emmott", selected_bench_loc[j - 42])) 169 | X = mat.to_numpy()[:, 6:].astype(float) 170 | meta_mat[j, :], meta_vec_names = generate_meta_features(X) 171 | 172 | # use cleaned and transformed meta-features 173 | meta_scalar = MinMaxScaler() 174 | meta_mat_transformed = meta_scalar.fit_transform(meta_mat) 175 | meta_mat_transformed = fix_nan(meta_mat_transformed) 176 | dump(meta_scalar, 'meta_scalar.joblib') 177 | # %% train model 178 | 179 | # split data into train and valid 180 | seed = 0 181 | full_list = list(range(n_datasets)) 182 | random.Random(seed).shuffle(full_list) 183 | n_train = int(0.85 * n_datasets) 184 | 185 | train_index = full_list[:n_train] 186 | valid_index = full_list[n_train:] 187 | 188 | train_set = roc_mat_red[train_index, :].astype('float64') 189 | valid_set = roc_mat_red[valid_index, :].astype('float64') 190 | 191 | train_meta = meta_mat_transformed[train_index, :].astype('float64') 192 | valid_meta = meta_mat_transformed[valid_index, :].astype('float64') 193 | 194 | clf = MetaODClass(train_set, valid_performance=valid_set, n_factors=30, 195 | learning='sgd') 196 | clf.train(n_iter=50, meta_features=train_meta, valid_meta=valid_meta, 197 | learning_rate=0.05, max_rate=0.9, min_rate=0.1, discount=1, 198 | n_steps=8) 199 | 200 | # U = clf.user_vecs 201 | # V = clf.item_vecs 202 | 203 | # # # print(EMF.regr_multirf.predict(test_meta).shape) 204 | # predicted_scores = clf.predict(valid_meta) 205 | # predicted_scores_max = np.nanargmax(predicted_scores, axis=1) 206 | # print() 207 | # output transformer (for meta-feature) and the trained clf 208 | dump(clf, 'train_' + str(seed) + '.joblib') 209 | 210 | #%% 211 | # # %% 212 | # import pickle 213 | # from metaod.models.core import MetaODClass 214 | 215 | # if __name__ == "__main__": 216 | # # # code for standalone use 217 | # # t = Thing("foo") 218 | # # Thing.__module__ = "thing" 219 | # # t.save("foo.pickle") 220 | # # MetaODClass.__module__ = "metaod" 221 | # file = open('test.pk', 'wb') 222 | # pickle.dump(clf, file) 223 | 224 | # # # file = open('rf.pk', 'wb') 225 | # # # pickle.dump(clf.user_vecs, file) 226 | -------------------------------------------------------------------------------- /metaod/models/trained_models/meta_scalar.joblib: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/metaod/models/trained_models/meta_scalar.joblib -------------------------------------------------------------------------------- /metaod/models/trained_models/model_list.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/metaod/models/trained_models/model_list.joblib -------------------------------------------------------------------------------- /metaod/models/trained_models/train_0.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/metaod/models/trained_models/train_0.joblib -------------------------------------------------------------------------------- /metaod/models/utility.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | import arff 5 | from zipfile import ZipFile 6 | import urllib.request 7 | 8 | def Diff(li1, li2): 9 | return (list(set(li1) - set(li2))) 10 | 11 | def argmaxatn(w, nth): 12 | w = np.asarray(w).ravel() 13 | t = np.argsort(w) 14 | return t[-1*nth] 15 | 16 | def fix_nan(X): 17 | # TODO: should store the mean of the meta features to be used for test_meta 18 | # replace by 0 for now 19 | col_mean = np.nanmean(X, axis = 0) 20 | inds = np.where(np.isnan(X)) 21 | X[inds] = np.take(col_mean, inds[1]) 22 | 23 | return X 24 | 25 | 26 | def read_arff(file_path, misplaced_list): 27 | misplaced = False 28 | for item in misplaced_list: 29 | if item in file_path: 30 | misplaced = True 31 | 32 | file = arff.load(open(file_path)) 33 | data_value = np.asarray(file['data']) 34 | attributes = file['attributes'] 35 | 36 | X = data_value[:, 0:-2] 37 | if not misplaced: 38 | y = data_value[:, -1] 39 | else: 40 | y = data_value[:, -2] 41 | y[y == 'no'] = 0 42 | y[y == 'yes'] = 1 43 | y = y.astype('float').astype('int').ravel() 44 | 45 | if y.sum() > len(y): 46 | print(attributes) 47 | raise ValueError('wrong sum') 48 | 49 | return X, y, attributes 50 | 51 | def prepare_trained_model(url='https://github.com/yzhao062/MetaOD/raw/master/saved_models/trained_models.zip', 52 | filename='trained_models.zip', 53 | save_path='trained_models'): 54 | 55 | if not os.path.exists(save_path): 56 | os.makedirs(save_path) 57 | 58 | urllib.request.urlretrieve(url, filename) 59 | 60 | # print(os.path.join(os.path.dirname(os.path.realpath(__file__)), 61 | # # 'trained_models.zip')) 62 | # #todo: verify file exists 63 | with ZipFile(filename, 'r') as zip: 64 | # # printing all the contents of the zip file 65 | # zip.printdir() 66 | # extracting all the files 67 | print('Extracting trained models now...') 68 | zip.extractall() 69 | print('Finish extracting models') 70 | 71 | 72 | # url='https://github.com/yzhao062/MetaOD/raw/master/saved_models/trained_models.zip' 73 | # filename='trained_models.zip' 74 | # save_path='trained_models' 75 | 76 | # if not os.path.exists(save_path): 77 | # os.makedirs(save_path) 78 | 79 | # urllib.request.urlretrieve(url, os.path.join(save_path, filename)) 80 | 81 | # print(os.path.join(os.path.dirname(os.path.realpath(__file__)), 82 | # 'trained_models.zip')) 83 | # #todo: verify file exists 84 | # with ZipFile(os.path.join(save_path, filename), 'r') as zip: 85 | # # # printing all the contents of the zip file 86 | # # zip.printdir() 87 
| 88 | # # extracting all the files 89 | # print('Extracting trained models now...') 90 | # zip.extractall(path='trained_models') 91 | # print('Finish extracting models') -------------------------------------------------------------------------------- /metaod/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/metaod/test/__init__.py -------------------------------------------------------------------------------- /metaod/test/test_predict_metaod.py: -------------------------------------------------------------------------------- 1 | """MetaOD prediction with the trained model 2 | """ 3 | # License: BSD 2 clause 4 | import os 5 | import unittest 6 | 7 | from pyod.utils.data import generate_data 8 | from metaod.models.utility import prepare_trained_model 9 | from metaod.models.predict_metaod import select_model 10 | 11 | 12 | class TestPredictMetaOD(unittest.TestCase): 13 | def setUp(self): 14 | self.contamination = 0.05 # percentage of outliers 15 | self.n_train = 1000 # number of training points 16 | self.n_test = 100 # number of testing points 17 | 18 | # Generate sample data 19 | self.X_train, self.y_train, self.X_test, self.y_test = \ 20 | generate_data(n_train=self.n_train, 21 | n_test=self.n_test, 22 | n_features=3, 23 | contamination=self.contamination, 24 | random_state=42) 25 | 26 | def test_prepare_trained_model(self): 27 | # load pretrained models 28 | prepare_trained_model() 29 | print(os.path.join(os.getcwd(), "trained_models")) 30 | assert (os.path.isfile("trained_models.zip")) 31 | assert (os.path.isdir("trained_models")) 32 | 33 | def test_model_selection(self): 34 | prepare_trained_model() 35 | # recommended models 36 | selected_models = select_model(self.X_train, n_selection=100) 37 | assert ((len(selected_models) == 100)) 38 | -------------------------------------------------------------------------------- /metaod/version.py: -------------------------------------------------------------------------------- 1 | """ 2 | ``pyod`` is a python toolbox for scalable outlier detection 3 | """ 4 | # Based on NiLearn package 5 | # License: simplified BSD 6 | 7 | # PEP0440 compatible formatted version, see: 8 | # https://www.python.org/dev/peps/pep-0440/ 9 | # 10 | # Generic release markers: 11 | # X.Y 12 | # X.Y.Z # For bug fix releases 13 | # 14 | # Admissible pre-release markers: 15 | # X.YaN # Alpha release 16 | # X.YbN # Beta release 17 | # X.YrcN # Release Candidate 18 | # X.Y # Final release 19 | # 20 | # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. 
21 | # 'X.Y.dev0' is the canonical version of 'X.Y.dev' 22 | # 23 | __version__ = '0.0.6' # pragma: no cover 24 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib>=0.14.1 2 | liac-arff 3 | numpy>=1.18.1 4 | scipy>=0.20 5 | scikit_learn==0.22.1 6 | pandas>=0.20 7 | pyod>=0.8 -------------------------------------------------------------------------------- /saved_models/trained_models.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/saved_models/trained_models.zip -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.rst -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | # read the contents of README file 4 | from os import path 5 | from io import open # for Python 2 and 3 compatibility 6 | 7 | # get __version__ from _version.py 8 | ver_file = path.join('metaod', 'version.py') 9 | with open(ver_file) as f: 10 | exec(f.read()) 11 | 12 | this_directory = path.abspath(path.dirname(__file__)) 13 | 14 | 15 | # read the contents of README.rst 16 | def readme(): 17 | with open(path.join(this_directory, 'README.rst'), encoding='utf-8') as f: 18 | return f.read() 19 | 20 | 21 | # read the contents of requirements.txt 22 | with open(path.join(this_directory, 'requirements.txt'), 23 | encoding='utf-8') as f: 24 | requirements = f.read().splitlines() 25 | 26 | setup( 27 | name='metaod', 28 | version=__version__, 29 | description='Automating Outlier Detection via Meta-Learning (selece/recommend OD model(s) for new datasets)', 30 | long_description=readme(), 31 | long_description_content_type='text/x-rst', 32 | author='Yue Zhao', 33 | author_email='zhaoy@cmu.edu', 34 | url='https://github.com/yzhao062/metaod', 35 | download_url='https://github.com/yzhao062/metaod/archive/master.zip', 36 | keywords=['outlier detection', 'anomaly detection', 'outlier ensembles', 37 | 'data mining', 'meta learning', 'AutoML'], 38 | packages=find_packages(exclude=['test']), 39 | include_package_data=True, 40 | install_requires=requirements, 41 | setup_requires=['setuptools>=38.6.0'], 42 | classifiers=[ 43 | 'Development Status :: 2 - Pre-Alpha', 44 | 'Intended Audience :: Education', 45 | 'Intended Audience :: Financial and Insurance Industry', 46 | 'Intended Audience :: Science/Research', 47 | 'Intended Audience :: Developers', 48 | 'Intended Audience :: Information Technology', 49 | 'License :: OSI Approved :: BSD License', 50 | 'Programming Language :: Python :: 3.5', 51 | 'Programming Language :: Python :: 3.6', 52 | 'Programming Language :: Python :: 3.7', 53 | ], 54 | ) 55 | --------------------------------------------------------------------------------
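A minimal end-to-end usage sketch, based on the API exercised in metaod/test/test_predict_metaod.py above: prepare_trained_model() downloads and extracts the pretrained models into the working directory, and select_model() returns the top-n recommended outlier-detector configurations for a new dataset. The toy data comes from pyod's generate_data helper; any (n_samples, n_features) numpy array can be passed instead, and n_selection=10 here is only an illustrative choice.

from pyod.utils.data import generate_data

from metaod.models.utility import prepare_trained_model
from metaod.models.predict_metaod import select_model

# synthetic data with 5% outliers, matching the unit test above
X_train, y_train, X_test, y_test = generate_data(
    n_train=1000, n_test=100, n_features=3,
    contamination=0.05, random_state=42)

# fetch and unpack the pretrained MetaOD models (select_model reads them
# from the default trained_model_location="trained_models")
prepare_trained_model()

# recommend the 10 most promising detector configurations for X_train
selected_models = select_model(X_train, n_selection=10)
print(selected_models)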