├── .circleci └── config.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── appveyor.yml ├── docs └── images │ ├── MetaOD_Flowchart.jpg │ └── meta_vis.jpg ├── examples ├── meta_feature_generation_example.py └── model_selection_example.py ├── metaod ├── __init__.py ├── models │ ├── __init__.py │ ├── base_detectors.py │ ├── core.py │ ├── gen_meta_features.py │ ├── predict_metaod.py │ ├── train_metaod.py │ ├── trained_models │ │ ├── meta_scalar.joblib │ │ ├── model_list.joblib │ │ └── train_0.joblib │ └── utility.py ├── test │ ├── __init__.py │ └── test_predict_metaod.py └── version.py ├── requirements.txt ├── saved_models └── trained_models.zip ├── setup.cfg └── setup.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # Adapted from https://github.com/NeuralEnsemble/python-neo 5 | version: 2 6 | workflows: 7 | version: 2 8 | test: 9 | jobs: 10 | - test-3.6 11 | jobs: 12 | test-3.6: 13 | docker: 14 | - image: circleci/python:3.6-stretch 15 | 16 | working_directory: ~/repo 17 | 18 | steps: 19 | - checkout 20 | - run: sudo chown -R circleci:circleci /usr/local/bin 21 | 22 | # Download and cache dependencies 23 | - restore_cache: 24 | keys: 25 | - v1-py3-dependencies-{{ checksum "requirements_ci.txt" }} 26 | # fallback to using the latest cache if no exact match is found 27 | - v1-py3-dependencies- 28 | 29 | - run: 30 | name: install dependencies 31 | command: | 32 | python3 -m venv venv 33 | . venv/bin/activate 34 | pip install --upgrade pip 35 | pip install -r requirements.txt 36 | pip install pandas 37 | pip install pytest 38 | pip install pytest-cov 39 | 40 | 41 | - save_cache: 42 | paths: 43 | - ./venv 44 | key: v1-py3-dependencies-{{ checksum "requirements.txt" }} 45 | 46 | 47 | # run tests! 48 | - run: 49 | name: run tests 50 | command: | 51 | . venv/bin/activate 52 | pytest 53 | 54 | - store_artifacts: 55 | path: test-reports 56 | destination: test-reports 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | dist: xenial 3 | 4 | env: 5 | global: 6 | PIP_PREFER_BINARY=true # https://github.com/numba/llvmlite/issues/471 7 | 8 | python: 9 | - "3.5" # disabled for now as warning messages crash travis ci. turn it back in 3.5.7 10 | - "3.5-dev" # 3.5 development branch 11 | - "3.6" 12 | - "3.6-dev" # 3.6 development branch 13 | - "3.7" # 3.6 development branch 14 | 15 | install: 16 | - pip install --upgrade pip 17 | - pip install -r requirements.txt 18 | - pip install pytest 19 | - pip install pandas 20 | - pip install pytest-cov 21 | - pip install coveralls 22 | 23 | # command to run tests 24 | script: 25 | pytest --cov=metaod/ 26 | 27 | after_success: 28 | - coveralls 29 | 30 | notifications: 31 | email: 32 | recipients: 33 | - yzhao062@gmail.com 34 | on_success: never # default: change 35 | on_failure: always # default: always -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2021, Yue Zhao 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | prune examples 2 | prune notebooks 3 | prune paper_reproducibility 4 | prune metaod/test 5 | prune README.md 6 | include README.rst 7 | include requirements.txt -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Automating Outlier Detection via Meta-Learning (MetaOD) 2 | ===================================================================== 3 | 4 | 5 | .. image:: https://img.shields.io/pypi/v/metaod.svg?color=brightgreen 6 | :target: https://pypi.org/project/metaod/ 7 | :alt: PyPI version 8 | 9 | .. image:: https://img.shields.io/github/stars/yzhao062/metaod.svg 10 | :target: https://github.com/yzhao062/metaod/stargazers 11 | :alt: GitHub stars 12 | 13 | .. image:: https://img.shields.io/github/forks/yzhao062/metaod.svg?color=blue 14 | :target: https://github.com/yzhao062/metaod/network 15 | :alt: GitHub forks 16 | 17 | .. image:: https://circleci.com/gh/yzhao062/MetaOD.svg?style=svg 18 | :target: https://circleci.com/gh/yzhao062/MetaOD 19 | :alt: Circle CI 20 | 21 | .. image:: https://travis-ci.org/yzhao062/MetaOD.svg?branch=master 22 | :target: https://travis-ci.org/yzhao062/MetaOD 23 | 24 | ---- 25 | 26 | **Development Status**: **As of 09/26/2020, MetaOD is under active development and in its alpha stage. Please follow, star, and fork to get the latest updates**! 27 | For paper reproducibility, please see the paper_reproducibility folder for instructions. 28 | 29 | **Given an unsupervised outlier detection (OD) task on a new dataset, how can we automatically select a good outlier detection method and its hyperparameter(s) (collectively called a model)?** 30 | Thus far, model selection for OD has been a "black art", as model evaluation is infeasible due to the lack of (i) hold-out data with labels, and (ii) a universal objective function. 31 | In this work, we develop the first principled data-driven approach to model selection for OD, called MetaOD, based on meta-learning. 32 | In short, MetaOD is trained on extensive OD benchmark datasets to capitalize on prior experience so that **it can select the potentially best performing model for unseen datasets**. 33 | 34 | Using MetaOD is easy.
35 | **You could pass in a dataset, and MetaOD will return the best performing outlier detection models for it**, which both boosts detection quality and reduces the cost of running multiple models. 36 | 37 | 38 | **API Demo for selecting an outlier detection model on a new dataset (within 3 lines)**\ : 39 | 40 | 41 | .. code-block:: python 42 | 43 | from metaod.models.utility import prepare_trained_model 44 | from metaod.models.predict_metaod import select_model 45 | 46 | # load pretrained MetaOD model 47 | prepare_trained_model() 48 | 49 | # use MetaOD to recommend models. It returns the top n models for the new data X_train 50 | selected_models = select_model(X_train, n_selection=100) 51 | 52 | 53 | 54 | `Preprint paper `_ | `Reproducibility instruction `_ 55 | 56 | **Citing MetaOD**\ : 57 | 58 | If you use MetaOD in a scientific publication, we would appreciate 59 | citations to the following paper:: 60 | 61 | @article{zhao2020automating, 62 | author = {Zhao, Yue and Ryan Rossi and Leman Akoglu}, 63 | title = {Automating Outlier Detection via Meta-Learning}, 64 | journal = {arXiv preprint arXiv:2009.10606}, 65 | year = {2020}, 66 | } 67 | 68 | or:: 69 | 70 | Zhao, Y., Rossi, R., and Akoglu, L., 2020. Automating Outlier Detection via Meta-Learning. arXiv preprint arXiv:2009.10606. 71 | 72 | 73 | **Table of Contents**\ : 74 | 75 | 76 | * `Installation <#installation>`_ 77 | * `API Cheatsheet & Reference <#api-cheatsheet--reference>`_ 78 | * `Quick Start for Model Selection <#quick-start-for-model-selection>`_ 79 | * `Quick Start for Meta Feature Generation <#quick-start-for-meta-feature-generation>`_ 80 | 81 | 82 | ------------ 83 | 84 | System Introduction 85 | ^^^^^^^^^^^^^^^^^^^ 86 | 87 | As shown in the figure below, MetaOD contains offline meta-learner training and online model selection. 88 | For selecting an outlier detection model for a new dataset, one only needs the online model selection. Specifically, to be finished. 89 | 90 | 91 | .. image:: https://raw.githubusercontent.com/yzhao062/MetaOD/master/docs/images/MetaOD_Flowchart.jpg 92 | :target: https://raw.githubusercontent.com/yzhao062/MetaOD/master/docs/images/MetaOD_Flowchart.jpg 93 | :alt: metaod_flow 94 | :align: center 95 | 96 | ----- 97 | 98 | 99 | Installation 100 | ^^^^^^^^^^^^ 101 | 102 | It is recommended to use **pip** for installation. Please make sure 103 | **the latest version** is installed, as MetaOD is updated frequently: 104 | 105 | .. code-block:: bash 106 | 107 | pip install metaod # normal install 108 | pip install --upgrade metaod # or update if needed 109 | pip install --pre metaod # or include pre-release version for new features 110 | 111 | Alternatively, you could clone the repository and run the setup.py file: 112 | 113 | .. code-block:: bash 114 | 115 | git clone https://github.com/yzhao062/metaod.git 116 | cd metaod 117 | pip install . 118 | 119 | 120 | **Required Dependencies**\ : 121 | 122 | 123 | * Python 3.5, 3.6, or 3.7 124 | * joblib>=0.14.1 125 | * liac-arff 126 | * numpy>=1.18.1 127 | * scipy>=0.20 128 | * **scikit_learn==0.22.1** 129 | * pandas>=0.20 130 | * pyod>=0.8 131 | 132 | **Note**: Since we need to load trained models, we fix the scikit-learn version 133 | to 0.22.1. We recommend using MetaOD in a fresh environment to ensure the right dependencies. 134 | 135 | 136 | Quick Start for Model Selection 137 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 138 | 139 | `"examples/model_selection_example.py" `_ 140 | provides an example of using MetaOD to select top models on a new dataset, fully unsupervised.
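For context, the sketch below condenses that example script end to end: generate data, ask MetaOD for recommendations, then fit and score one recommended detector with pyod. The specific detector shown (LODA with ``n_bins=5, n_random_cuts=100``) is simply the top pick from one particular run, as in the example script, and may differ on yours; the key procedures are then broken down step by step.

.. code-block:: python

    from pyod.utils.data import generate_data
    from pyod.models.loda import LODA
    from sklearn.metrics import average_precision_score

    from metaod.models.utility import prepare_trained_model
    from metaod.models.predict_metaod import select_model

    # synthetic data with 10% outliers, as in examples/model_selection_example.py
    X_train, y_train, X_test, y_test = generate_data(
        n_train=1000, n_test=100, n_features=3,
        contamination=0.1, random_state=42)

    # load the pretrained MetaOD model and get the top 100 recommendations
    prepare_trained_model()
    selected_models = select_model(X_train, n_selection=100)

    # fit the 1st recommended model from one run (LODA here) and
    # evaluate it with average precision on the training data
    model_1 = LODA(n_bins=5, n_random_cuts=100)
    print("1st model Average Precision",
          average_precision_score(y_train, model_1.fit(X_train).decision_scores_))
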
141 | 142 | The key procedures are below: 143 | 144 | #. Load some synthetic datasets 145 | 146 | .. code-block:: python 147 | 148 | # Generate sample data 149 | X_train, y_train, X_test, y_test = \ 150 | generate_data(n_train=1000, 151 | n_test=100, 152 | n_features=3, 153 | contamination=0.5, 154 | random_state=42) 155 | 156 | #. Use MetaOD to select top 100 models 157 | 158 | .. code-block:: python 159 | 160 | from metaod.models.utility import prepare_trained_model 161 | from metaod.models.predict_metaod import select_model 162 | 163 | # load pretrained models 164 | prepare_trained_model() 165 | 166 | # recommended models. this returns the top 100 models for X_train 167 | selected_models = select_model(X_train, n_selection=100) 168 | 169 | 170 | #. Show the selected models' performance evaluation (results may vary slightly due to built-in randomness). 171 | 172 | .. code-block:: python 173 | 174 | 175 | 1st model Average Precision 0.9780551579734139 176 | 10th model Average Precision 0.959749602397687 177 | 50th model Average Precision 0.6211392467111937 178 | 179 | 180 | Quick Start for Meta Feature Generation 181 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 182 | 183 | Getting the embedding of an arbitrary dataset is the first step of MetaOD, which 184 | can be done by our specialized meta-feature generation function. 185 | 186 | It may be used for other purposes as well, e.g., measuring the similarity of 187 | two datasets. 188 | 189 | .. code-block:: python 190 | 191 | # import meta-feature generator 192 | from metaod.models.gen_meta_features import generate_meta_features 193 | 194 | meta_features, _ = generate_meta_features(X) 195 | 196 | A simple example of visualizing two different environments using TSNE with 197 | our meta-features is shown below. The environment on the left is composed of 198 | 100 datasets with known similarity, and the same color stands for the same group of datasets. 199 | The environment on the right is composed of 200 | 62 datasets without known similarity. Our meta-features successfully capture 201 | the underlying similarity in the left figure. 202 | 203 | ..
image:: https://raw.githubusercontent.com/yzhao062/MetaOD/master/docs/images/meta_vis.jpg 204 | :target: https://raw.githubusercontent.com/yzhao062/MetaOD/master/docs/images/meta_vis.jpg 205 | :alt: meta_viz 206 | :align: center 207 | 208 | 209 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | build: off 2 | 3 | # branches to build 4 | branches: 5 | # whitelist 6 | only: 7 | - master 8 | - development 9 | environment: 10 | matrix: 11 | - PYTHON: "C:\\Python36-x64" 12 | - PYTHON: "C:\\Python37-x64" 13 | 14 | skip_commits: 15 | files: 16 | - "*.yml" 17 | - "*.rst" 18 | - "*.md" 19 | - "LICENSE" 20 | 21 | init: 22 | - "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%" 23 | 24 | install: 25 | - "%PYTHON%\\python.exe -m pip install --upgrade pip setuptools" 26 | - "%PYTHON%\\python.exe -m pip install wheel" 27 | - "%PYTHON%\\python.exe -m pip install pytest" 28 | - "%PYTHON%\\python.exe -m pip install pandas" 29 | - "%PYTHON%\\python.exe -m pip install -r requirements.txt" 30 | 31 | 32 | 33 | test_script: 34 | - "%PYTHON%\\python.exe -m pytest" 35 | 36 | after_test: 37 | - "%PYTHON%\\python.exe setup.py bdist_wheel" 38 | 39 | artifacts: 40 | - path: dist\* 41 | 42 | notifications: 43 | - provider: Email 44 | to: 45 | - yzhao062@gmail.com 46 | on_build_success: false 47 | on_build_failure: true 48 | on_build_status_changed: true 49 | -------------------------------------------------------------------------------- /docs/images/MetaOD_Flowchart.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/docs/images/MetaOD_Flowchart.jpg -------------------------------------------------------------------------------- /docs/images/meta_vis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/docs/images/meta_vis.jpg -------------------------------------------------------------------------------- /examples/meta_feature_generation_example.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/examples/meta_feature_generation_example.py -------------------------------------------------------------------------------- /examples/model_selection_example.py: -------------------------------------------------------------------------------- 1 | """MetaOD prediction with the trained model 2 | """ 3 | # License: BSD 2 clause 4 | 5 | from sklearn.metrics import average_precision_score 6 | 7 | from pyod.utils.data import generate_data 8 | from pyod.models.loda import LODA 9 | from pyod.models.knn import KNN 10 | from pyod.models.iforest import IForest 11 | from pyod.models.ocsvm import OCSVM 12 | 13 | 14 | from metaod.models.utility import prepare_trained_model 15 | from metaod.models.predict_metaod import select_model 16 | 17 | 18 | if __name__ == "__main__": 19 | contamination = 0.1 # percentage of outliers 20 | n_train = 1000 # number of training points 21 | n_test = 100 # number of testing points 22 | 23 | # Generate sample data 24 | X_train, y_train, X_test, y_test = \ 25 | generate_data(n_train=n_train, 26 | n_test=n_test, 27 | n_features=3, 28 | contamination=contamination, 29 | random_state=42) 30 | # load pretrained models 
31 | prepare_trained_model() 32 | 33 | # recommended models 34 | selected_models = select_model(X_train, n_selection=100) 35 | 36 | 37 | print("Showing the top recommended models...") 38 | for i, model in enumerate(selected_models): 39 | print(i, model) 40 | 41 | print() 42 | 43 | model_1 = LODA(n_bins=5, n_random_cuts=100) 44 | print("1st model Average Precision", average_precision_score(y_train, model_1.fit(X_train).decision_scores_)) 45 | 46 | model_10 = LODA(n_bins=5, n_random_cuts=20) 47 | print("10th model Average Precision", average_precision_score(y_train, model_10.fit(X_train).decision_scores_)) 48 | 49 | 50 | model_50 = OCSVM(kernel= 'sigmoid', nu=0.6) 51 | print("50th model Average Precision", average_precision_score(y_train, model_50.fit(X_train).decision_scores_)) 52 | -------------------------------------------------------------------------------- /metaod/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/metaod/__init__.py -------------------------------------------------------------------------------- /metaod/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/metaod/models/__init__.py -------------------------------------------------------------------------------- /metaod/models/base_detectors.py: -------------------------------------------------------------------------------- 1 | from pyod.models.iforest import IForest 2 | from pyod.models.lof import LOF 3 | from pyod.models.ocsvm import OCSVM 4 | from pyod.models.knn import KNN 5 | from pyod.models.hbos import HBOS 6 | from pyod.models.abod import ABOD 7 | from pyod.models.loda import LODA 8 | from pyod.models.cof import COF 9 | 10 | 11 | def get_detectors(): 12 | # randomness_flags = [] 13 | BASE_ESTIMATORS = [ 14 | LODA(n_bins=5, n_random_cuts=10), 15 | LODA(n_bins=5, n_random_cuts=20), 16 | LODA(n_bins=5, n_random_cuts=30), 17 | LODA(n_bins=5, n_random_cuts=40), 18 | LODA(n_bins=5, n_random_cuts=50), 19 | LODA(n_bins=5, n_random_cuts=75), 20 | LODA(n_bins=5, n_random_cuts=100), 21 | LODA(n_bins=5, n_random_cuts=150), 22 | LODA(n_bins=5, n_random_cuts=200), 23 | 24 | LODA(n_bins=10, n_random_cuts=10), 25 | LODA(n_bins=10, n_random_cuts=20), 26 | LODA(n_bins=10, n_random_cuts=30), 27 | LODA(n_bins=10, n_random_cuts=40), 28 | LODA(n_bins=10, n_random_cuts=50), 29 | LODA(n_bins=10, n_random_cuts=75), 30 | LODA(n_bins=10, n_random_cuts=100), 31 | LODA(n_bins=10, n_random_cuts=150), 32 | LODA(n_bins=10, n_random_cuts=200), 33 | 34 | LODA(n_bins=15, n_random_cuts=10), 35 | LODA(n_bins=15, n_random_cuts=20), 36 | LODA(n_bins=15, n_random_cuts=30), 37 | LODA(n_bins=15, n_random_cuts=40), 38 | LODA(n_bins=15, n_random_cuts=50), 39 | LODA(n_bins=15, n_random_cuts=75), 40 | LODA(n_bins=15, n_random_cuts=100), 41 | LODA(n_bins=15, n_random_cuts=150), 42 | LODA(n_bins=15, n_random_cuts=200), 43 | 44 | LODA(n_bins=20, n_random_cuts=10), 45 | LODA(n_bins=20, n_random_cuts=20), 46 | LODA(n_bins=20, n_random_cuts=30), 47 | LODA(n_bins=20, n_random_cuts=40), 48 | LODA(n_bins=20, n_random_cuts=50), 49 | LODA(n_bins=20, n_random_cuts=75), 50 | LODA(n_bins=20, n_random_cuts=100), 51 | LODA(n_bins=20, n_random_cuts=150), 52 | LODA(n_bins=20, n_random_cuts=200), 53 | 54 | LODA(n_bins=25, n_random_cuts=10), 55 | LODA(n_bins=25, n_random_cuts=20), 56 | LODA(n_bins=25, 
n_random_cuts=30), 57 | LODA(n_bins=25, n_random_cuts=40), 58 | LODA(n_bins=25, n_random_cuts=50), 59 | LODA(n_bins=25, n_random_cuts=75), 60 | LODA(n_bins=25, n_random_cuts=100), 61 | LODA(n_bins=25, n_random_cuts=150), 62 | LODA(n_bins=25, n_random_cuts=200), 63 | 64 | LODA(n_bins=30, n_random_cuts=10), 65 | LODA(n_bins=30, n_random_cuts=20), 66 | LODA(n_bins=30, n_random_cuts=30), 67 | LODA(n_bins=30, n_random_cuts=40), 68 | LODA(n_bins=30, n_random_cuts=50), 69 | LODA(n_bins=30, n_random_cuts=75), 70 | LODA(n_bins=30, n_random_cuts=100), 71 | LODA(n_bins=30, n_random_cuts=150), 72 | LODA(n_bins=30, n_random_cuts=200), 73 | 74 | ABOD(n_neighbors=3), 75 | ABOD(n_neighbors=5), 76 | ABOD(n_neighbors=10), 77 | ABOD(n_neighbors=15), 78 | ABOD(n_neighbors=20), 79 | ABOD(n_neighbors=25), 80 | ABOD(n_neighbors=50), 81 | ABOD(n_neighbors=60), 82 | ABOD(n_neighbors=75), 83 | ABOD(n_neighbors=80), 84 | ABOD(n_neighbors=90), 85 | ABOD(n_neighbors=100), 86 | 87 | IForest(n_estimators=10, max_features=0.1), 88 | IForest(n_estimators=10, max_features=0.2), 89 | IForest(n_estimators=10, max_features=0.3), 90 | IForest(n_estimators=10, max_features=0.4), 91 | IForest(n_estimators=10, max_features=0.5), 92 | IForest(n_estimators=10, max_features=0.6), 93 | IForest(n_estimators=10, max_features=0.7), 94 | IForest(n_estimators=10, max_features=0.8), 95 | IForest(n_estimators=10, max_features=0.9), 96 | 97 | IForest(n_estimators=20, max_features=0.1), 98 | IForest(n_estimators=20, max_features=0.2), 99 | IForest(n_estimators=20, max_features=0.3), 100 | IForest(n_estimators=20, max_features=0.4), 101 | IForest(n_estimators=20, max_features=0.5), 102 | IForest(n_estimators=20, max_features=0.6), 103 | IForest(n_estimators=20, max_features=0.7), 104 | IForest(n_estimators=20, max_features=0.8), 105 | IForest(n_estimators=20, max_features=0.9), 106 | 107 | IForest(n_estimators=30, max_features=0.1), 108 | IForest(n_estimators=30, max_features=0.2), 109 | IForest(n_estimators=30, max_features=0.3), 110 | IForest(n_estimators=30, max_features=0.4), 111 | IForest(n_estimators=30, max_features=0.5), 112 | IForest(n_estimators=30, max_features=0.6), 113 | IForest(n_estimators=30, max_features=0.7), 114 | IForest(n_estimators=30, max_features=0.8), 115 | IForest(n_estimators=30, max_features=0.9), 116 | 117 | IForest(n_estimators=40, max_features=0.1), 118 | IForest(n_estimators=40, max_features=0.2), 119 | IForest(n_estimators=40, max_features=0.3), 120 | IForest(n_estimators=40, max_features=0.4), 121 | IForest(n_estimators=40, max_features=0.5), 122 | IForest(n_estimators=40, max_features=0.6), 123 | IForest(n_estimators=40, max_features=0.7), 124 | IForest(n_estimators=40, max_features=0.8), 125 | IForest(n_estimators=40, max_features=0.9), 126 | 127 | IForest(n_estimators=50, max_features=0.1), 128 | IForest(n_estimators=50, max_features=0.2), 129 | IForest(n_estimators=50, max_features=0.3), 130 | IForest(n_estimators=50, max_features=0.4), 131 | IForest(n_estimators=50, max_features=0.5), 132 | IForest(n_estimators=50, max_features=0.6), 133 | IForest(n_estimators=50, max_features=0.7), 134 | IForest(n_estimators=50, max_features=0.8), 135 | IForest(n_estimators=50, max_features=0.9), 136 | 137 | IForest(n_estimators=75, max_features=0.1), 138 | IForest(n_estimators=75, max_features=0.2), 139 | IForest(n_estimators=75, max_features=0.3), 140 | IForest(n_estimators=75, max_features=0.4), 141 | IForest(n_estimators=75, max_features=0.5), 142 | IForest(n_estimators=75, max_features=0.6), 143 | 
IForest(n_estimators=75, max_features=0.7), 144 | IForest(n_estimators=75, max_features=0.8), 145 | IForest(n_estimators=75, max_features=0.9), 146 | 147 | IForest(n_estimators=100, max_features=0.1), 148 | IForest(n_estimators=100, max_features=0.2), 149 | IForest(n_estimators=100, max_features=0.3), 150 | IForest(n_estimators=100, max_features=0.4), 151 | IForest(n_estimators=100, max_features=0.5), 152 | IForest(n_estimators=100, max_features=0.6), 153 | IForest(n_estimators=100, max_features=0.7), 154 | IForest(n_estimators=100, max_features=0.8), 155 | IForest(n_estimators=100, max_features=0.9), 156 | 157 | IForest(n_estimators=150, max_features=0.1), 158 | IForest(n_estimators=150, max_features=0.2), 159 | IForest(n_estimators=150, max_features=0.3), 160 | IForest(n_estimators=150, max_features=0.4), 161 | IForest(n_estimators=150, max_features=0.5), 162 | IForest(n_estimators=150, max_features=0.6), 163 | IForest(n_estimators=150, max_features=0.7), 164 | IForest(n_estimators=150, max_features=0.8), 165 | IForest(n_estimators=150, max_features=0.9), 166 | 167 | IForest(n_estimators=200, max_features=0.1), 168 | IForest(n_estimators=200, max_features=0.2), 169 | IForest(n_estimators=200, max_features=0.3), 170 | IForest(n_estimators=200, max_features=0.4), 171 | IForest(n_estimators=200, max_features=0.5), 172 | IForest(n_estimators=200, max_features=0.6), 173 | IForest(n_estimators=200, max_features=0.7), 174 | IForest(n_estimators=200, max_features=0.8), 175 | IForest(n_estimators=200, max_features=0.9), 176 | 177 | KNN(n_neighbors=1, method='largest'), 178 | KNN(n_neighbors=5, method='largest'), 179 | KNN(n_neighbors=10, method='largest'), 180 | KNN(n_neighbors=15, method='largest'), 181 | KNN(n_neighbors=20, method='largest'), 182 | KNN(n_neighbors=25, method='largest'), 183 | KNN(n_neighbors=50, method='largest'), 184 | KNN(n_neighbors=60, method='largest'), 185 | KNN(n_neighbors=70, method='largest'), 186 | KNN(n_neighbors=80, method='largest'), 187 | KNN(n_neighbors=90, method='largest'), 188 | KNN(n_neighbors=100, method='largest'), 189 | 190 | KNN(n_neighbors=1, method='mean'), 191 | KNN(n_neighbors=5, method='mean'), 192 | KNN(n_neighbors=10, method='mean'), 193 | KNN(n_neighbors=15, method='mean'), 194 | KNN(n_neighbors=20, method='mean'), 195 | KNN(n_neighbors=25, method='mean'), 196 | KNN(n_neighbors=50, method='mean'), 197 | KNN(n_neighbors=60, method='mean'), 198 | KNN(n_neighbors=70, method='mean'), 199 | KNN(n_neighbors=80, method='mean'), 200 | KNN(n_neighbors=90, method='mean'), 201 | KNN(n_neighbors=100, method='mean'), 202 | 203 | KNN(n_neighbors=1, method='median'), 204 | KNN(n_neighbors=5, method='median'), 205 | KNN(n_neighbors=10, method='median'), 206 | KNN(n_neighbors=15, method='median'), 207 | KNN(n_neighbors=20, method='median'), 208 | KNN(n_neighbors=25, method='median'), 209 | KNN(n_neighbors=50, method='median'), 210 | KNN(n_neighbors=60, method='median'), 211 | KNN(n_neighbors=70, method='median'), 212 | KNN(n_neighbors=80, method='median'), 213 | KNN(n_neighbors=90, method='median'), 214 | KNN(n_neighbors=100, method='median'), 215 | 216 | LOF(n_neighbors=1, metric='manhattan'), 217 | LOF(n_neighbors=5, metric='manhattan'), 218 | LOF(n_neighbors=10, metric='manhattan'), 219 | LOF(n_neighbors=15, metric='manhattan'), 220 | LOF(n_neighbors=20, metric='manhattan'), 221 | LOF(n_neighbors=25, metric='manhattan'), 222 | LOF(n_neighbors=50, metric='manhattan'), 223 | LOF(n_neighbors=60, metric='manhattan'), 224 | LOF(n_neighbors=70, metric='manhattan'), 
225 | LOF(n_neighbors=80, metric='manhattan'), 226 | LOF(n_neighbors=90, metric='manhattan'), 227 | LOF(n_neighbors=100, metric='manhattan'), 228 | 229 | LOF(n_neighbors=1, metric='euclidean'), 230 | LOF(n_neighbors=5, metric='euclidean'), 231 | LOF(n_neighbors=10, metric='euclidean'), 232 | LOF(n_neighbors=15, metric='euclidean'), 233 | LOF(n_neighbors=20, metric='euclidean'), 234 | LOF(n_neighbors=25, metric='euclidean'), 235 | LOF(n_neighbors=50, metric='euclidean'), 236 | LOF(n_neighbors=60, metric='euclidean'), 237 | LOF(n_neighbors=70, metric='euclidean'), 238 | LOF(n_neighbors=80, metric='euclidean'), 239 | LOF(n_neighbors=90, metric='euclidean'), 240 | LOF(n_neighbors=100, metric='euclidean'), 241 | 242 | LOF(n_neighbors=1, metric='minkowski'), 243 | LOF(n_neighbors=5, metric='minkowski'), 244 | LOF(n_neighbors=10, metric='minkowski'), 245 | LOF(n_neighbors=15, metric='minkowski'), 246 | LOF(n_neighbors=20, metric='minkowski'), 247 | LOF(n_neighbors=25, metric='minkowski'), 248 | LOF(n_neighbors=50, metric='minkowski'), 249 | LOF(n_neighbors=60, metric='minkowski'), 250 | LOF(n_neighbors=70, metric='minkowski'), 251 | LOF(n_neighbors=80, metric='minkowski'), 252 | LOF(n_neighbors=90, metric='minkowski'), 253 | LOF(n_neighbors=100, metric='minkowski'), 254 | 255 | HBOS(n_bins=5, alpha=0.1), 256 | HBOS(n_bins=5, alpha=0.2), 257 | HBOS(n_bins=5, alpha=0.3), 258 | HBOS(n_bins=5, alpha=0.4), 259 | HBOS(n_bins=5, alpha=0.5), 260 | 261 | HBOS(n_bins=10, alpha=0.1), 262 | HBOS(n_bins=10, alpha=0.2), 263 | HBOS(n_bins=10, alpha=0.3), 264 | HBOS(n_bins=10, alpha=0.4), 265 | HBOS(n_bins=10, alpha=0.5), 266 | 267 | HBOS(n_bins=20, alpha=0.1), 268 | HBOS(n_bins=20, alpha=0.2), 269 | HBOS(n_bins=20, alpha=0.3), 270 | HBOS(n_bins=20, alpha=0.4), 271 | HBOS(n_bins=20, alpha=0.5), 272 | 273 | HBOS(n_bins=30, alpha=0.1), 274 | HBOS(n_bins=30, alpha=0.2), 275 | HBOS(n_bins=30, alpha=0.3), 276 | HBOS(n_bins=30, alpha=0.4), 277 | HBOS(n_bins=30, alpha=0.5), 278 | 279 | HBOS(n_bins=40, alpha=0.1), 280 | HBOS(n_bins=40, alpha=0.2), 281 | HBOS(n_bins=40, alpha=0.3), 282 | HBOS(n_bins=40, alpha=0.4), 283 | HBOS(n_bins=40, alpha=0.5), 284 | 285 | HBOS(n_bins=50, alpha=0.1), 286 | HBOS(n_bins=50, alpha=0.2), 287 | HBOS(n_bins=50, alpha=0.3), 288 | HBOS(n_bins=50, alpha=0.4), 289 | HBOS(n_bins=50, alpha=0.5), 290 | 291 | HBOS(n_bins=75, alpha=0.1), 292 | HBOS(n_bins=75, alpha=0.2), 293 | HBOS(n_bins=75, alpha=0.3), 294 | HBOS(n_bins=75, alpha=0.4), 295 | HBOS(n_bins=75, alpha=0.5), 296 | 297 | HBOS(n_bins=100, alpha=0.1), 298 | HBOS(n_bins=100, alpha=0.2), 299 | HBOS(n_bins=100, alpha=0.3), 300 | HBOS(n_bins=100, alpha=0.4), 301 | HBOS(n_bins=100, alpha=0.5), 302 | 303 | OCSVM(nu=0.1, kernel="linear"), 304 | OCSVM(nu=0.2, kernel="linear"), 305 | OCSVM(nu=0.3, kernel="linear"), 306 | OCSVM(nu=0.4, kernel="linear"), 307 | OCSVM(nu=0.5, kernel="linear"), 308 | OCSVM(nu=0.6, kernel="linear"), 309 | OCSVM(nu=0.7, kernel="linear"), 310 | OCSVM(nu=0.8, kernel="linear"), 311 | OCSVM(nu=0.9, kernel="linear"), 312 | 313 | OCSVM(nu=0.1, kernel="poly"), 314 | OCSVM(nu=0.2, kernel="poly"), 315 | OCSVM(nu=0.3, kernel="poly"), 316 | OCSVM(nu=0.4, kernel="poly"), 317 | OCSVM(nu=0.5, kernel="poly"), 318 | OCSVM(nu=0.6, kernel="poly"), 319 | OCSVM(nu=0.7, kernel="poly"), 320 | OCSVM(nu=0.8, kernel="poly"), 321 | OCSVM(nu=0.9, kernel="poly"), 322 | 323 | OCSVM(nu=0.1, kernel="rbf"), 324 | OCSVM(nu=0.2, kernel="rbf"), 325 | OCSVM(nu=0.3, kernel="rbf"), 326 | OCSVM(nu=0.4, kernel="rbf"), 327 | OCSVM(nu=0.5, kernel="rbf"), 328 | 
OCSVM(nu=0.6, kernel="rbf"), 329 | OCSVM(nu=0.7, kernel="rbf"), 330 | OCSVM(nu=0.8, kernel="rbf"), 331 | OCSVM(nu=0.9, kernel="rbf"), 332 | 333 | OCSVM(nu=0.1, kernel="sigmoid"), 334 | OCSVM(nu=0.2, kernel="sigmoid"), 335 | OCSVM(nu=0.3, kernel="sigmoid"), 336 | OCSVM(nu=0.4, kernel="sigmoid"), 337 | OCSVM(nu=0.5, kernel="sigmoid"), 338 | OCSVM(nu=0.6, kernel="sigmoid"), 339 | OCSVM(nu=0.7, kernel="sigmoid"), 340 | OCSVM(nu=0.8, kernel="sigmoid"), 341 | OCSVM(nu=0.9, kernel="sigmoid"), 342 | 343 | COF(n_neighbors=3), 344 | COF(n_neighbors=5), 345 | COF(n_neighbors=10), 346 | COF(n_neighbors=15), 347 | COF(n_neighbors=20), 348 | COF(n_neighbors=25), 349 | COF(n_neighbors=50), 350 | ] 351 | 352 | # randomness_flags.extend([True] * 54) # LODA 353 | # randomness_flags.extend([False] * 7) # ABOD 354 | # randomness_flags.extend([True] * 81) # IForest 355 | # randomness_flags.extend([False] * 36) # KNN 356 | # randomness_flags.extend([False] * 36) # LOF 357 | # randomness_flags.extend([False] * 40) # HBOS 358 | # randomness_flags.extend([False] * 36) # OCSVM 359 | # randomness_flags.extend([False] * 7) # COF 360 | # return BASE_ESTIMATORS, randomness_flags 361 | return BASE_ESTIMATORS 362 | -------------------------------------------------------------------------------- /metaod/models/core.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import numpy as np 3 | 4 | from sklearn.metrics import dcg_score, ndcg_score 5 | from sklearn.utils import check_array 6 | from sklearn.metrics import mean_squared_error 7 | 8 | from sklearn.multioutput import MultiOutputRegressor 9 | from sklearn.ensemble import RandomForestRegressor 10 | from sklearn.decomposition import PCA 11 | from sklearn.preprocessing import MinMaxScaler, StandardScaler 12 | from copy import deepcopy 13 | 14 | 15 | def get_mse(pred, actual): 16 | # Ignore nonzero terms. 17 | pred = pred[actual.nonzero()].flatten() 18 | actual = actual[actual.nonzero()].flatten() 19 | return mean_squared_error(pred, actual) 20 | 21 | 22 | def sigmoid(x, a=1): 23 | return 1 / (1 + np.exp(-1 * a * x)) 24 | 25 | 26 | def sigmoid_derivate(x, a=1): 27 | return sigmoid(x, a) * (1 - sigmoid(x, a)) 28 | 29 | 30 | class MetaODClass(object): 31 | def __init__(self, 32 | train_performance, 33 | valid_performance, 34 | n_factors=40, 35 | learning='sgd', 36 | verbose=False): 37 | """ 38 | Train a matrix factorization model to predict empty 39 | entries in a matrix. The terminology assumes a 40 | train_performance matrix which is ~ user x item 41 | 42 | Params 43 | ====== 44 | train_performance : (ndarray) 45 | User x Item matrix with corresponding train_performance 46 | 47 | n_factors : (int) 48 | Number of latent factors to use in matrix 49 | factorization model 50 | learning : (str) 51 | Method of optimization. Options include 52 | 'sgd' or 'als'. 
53 | 54 | item_fact_reg : (float) 55 | Regularization term for item latent factors 56 | 57 | user_fact_reg : (float) 58 | Regularization term for user latent factors 59 | 60 | item_bias_reg : (float) 61 | Regularization term for item biases 62 | 63 | user_bias_reg : (float) 64 | Regularization term for user biases 65 | 66 | verbose : (bool) 67 | Whether or not to printout training progress 68 | """ 69 | 70 | self.ratings = train_performance 71 | self.valid_ratings = valid_performance 72 | self.n_users, self.n_items = train_performance.shape 73 | self.n_factors = n_factors 74 | self.learning = learning 75 | if self.learning == 'sgd': 76 | self.n_samples, self.n_models = self.ratings.shape[0], \ 77 | self.ratings.shape[1] 78 | self._v = verbose 79 | self.train_loss_ = [0] 80 | self.valid_loss_ = [0] 81 | self.learning_rates_ = [] 82 | self.scalar_ = None 83 | self.pca_ = None 84 | 85 | def get_train_dcg(self, user_vecs, item_vecs): 86 | # make sure it is non zero 87 | user_vecs[np.isnan(self.user_vecs)] = 0 88 | 89 | ndcg_s = [] 90 | for w in range(self.ratings.shape[0]): 91 | ndcg_s.append(ndcg_score([self.ratings[w, :]], 92 | [np.dot(user_vecs[w, :], item_vecs.T)])) 93 | 94 | return np.mean(ndcg_s) 95 | 96 | def train(self, meta_features, valid_meta=None, n_iter=10, 97 | learning_rate=0.1, n_estimators=100, max_depth=10, max_rate=1.05, 98 | min_rate=0.1, discount=0.95, n_steps=10): 99 | """ Train model for n_iter iterations from scratch.""" 100 | 101 | self.pca_ = PCA(n_components=self.n_factors) 102 | self.pca_.fit(meta_features) 103 | 104 | meta_features_pca = self.pca_.transform(meta_features) 105 | meta_valid_pca = self.pca_.transform(valid_meta) 106 | 107 | self.scalar_ = StandardScaler() 108 | self.scalar_.fit(meta_features_pca) 109 | 110 | meta_features_scaled = self.scalar_.transform(meta_features_pca) 111 | meta_valid_scaled = self.scalar_.transform(meta_valid_pca) 112 | 113 | self.user_vecs = meta_features_scaled 114 | 115 | self.item_vecs = np.random.normal(scale=1. 
/ self.n_factors, 116 | size=(self.n_items, self.n_factors)) 117 | 118 | step_size = (max_rate - min_rate) / (n_steps - 1) 119 | lr_list = list(np.arange(min_rate, max_rate, step_size)) 120 | lr_list.append(max_rate) 121 | lr_list_reverse = deepcopy(lr_list) 122 | lr_list_reverse.reverse() 123 | 124 | learning_rate_full = [] 125 | for w in range(n_iter): 126 | learning_rate_full.extend(lr_list) 127 | learning_rate_full.extend(lr_list_reverse) 128 | 129 | self.learning_rate_ = min_rate 130 | self.learning_rates_.append(self.learning_rate_) 131 | 132 | ctr = 1 133 | np_ctr = 1 134 | while ctr <= n_iter: 135 | 136 | self.learning_rate_ = learning_rate_full[ctr - 1] 137 | self.learning_rates_.append(self.learning_rate_) 138 | 139 | self.regr_multirf = MultiOutputRegressor(RandomForestRegressor( 140 | n_estimators=n_estimators, max_depth=max_depth, n_jobs=4)) 141 | 142 | # make sure it is non zero 143 | self.user_vecs[np.isnan(self.user_vecs)] = 0 144 | 145 | self.regr_multirf.fit(meta_features_scaled, self.user_vecs) 146 | 147 | meta_valid_scaled_new = self.regr_multirf.predict( 148 | meta_valid_scaled) 149 | 150 | # if ctr % 10 == 0 and self._v: 151 | # print ('\tcurrent iteration: {}'.format(ctr)) 152 | # print('ALORS Rank Fixed iteration', ctr, ndcg_score(self.train_performance, np.dot(self.user_vecs, self.item_vecs.T))) 153 | # self.learning_rates_.append(self.learning_rate) 154 | ndcg_s = [] 155 | for w in range(self.ratings.shape[0]): 156 | ndcg_s.append(ndcg_score([self.ratings[w, :]], [ 157 | np.dot(self.user_vecs[w, :], self.item_vecs.T)], 158 | k=self.n_items)) 159 | 160 | # print('ALORS Fixed iteration', ctr, ndcg_score(self.train_performance, np.dot(self.user_vecs, self.item_vecs.T))) 161 | # print('ALORS Rank Fixed iteration', ctr, 'training', np.mean(ndcg_s)) 162 | self.train_loss_.append(np.mean(ndcg_s)) 163 | 164 | ndcg_s = [] 165 | for w in range(self.valid_ratings.shape[0]): 166 | ndcg_s.append(ndcg_score([self.valid_ratings[w, :]], [ 167 | np.dot(meta_valid_scaled_new[w, :], self.item_vecs.T)], 168 | k=self.n_items)) 169 | 170 | # print('ALORS Fixed iteration', ctr, ndcg_score(self.train_performance, np.dot(self.user_vecs, self.item_vecs.T))) 171 | # print('ALORS Rank Fixed iteration', ctr, 'valid', np.mean(ndcg_s)) 172 | self.valid_loss_.append(np.mean(ndcg_s)) 173 | 174 | print('MetaOD', ctr, 'train', 175 | self.train_loss_[-1], 'valid', self.valid_loss_[-1], 176 | 'learning rate', self.learning_rates_[-1]) 177 | 178 | # improvement is smaller than 1 perc 179 | if ((self.valid_loss_[-1] - self.valid_loss_[-2]) / 180 | self.valid_loss_[-2]) <= 0.001: 181 | # print(((self.valid_loss_[-1] - self.valid_loss_[-2])/self.valid_loss_[-2])) 182 | np_ctr += 1 183 | else: 184 | np_ctr = 1 185 | if np_ctr > 5: 186 | break 187 | 188 | # update learning rates 189 | # self.learning_rate_ = self.learning_rate_ + 0.05 190 | # self.learning_rates_.append(self.learning_rate_) 191 | # if ctr % 2: 192 | # if ctr <=50: 193 | # self.learning_rate_ = min_rate * np.power(discount,ctr) 194 | # else: 195 | # self.learning_rate_ = min_rate * np.power(discount,50) 196 | 197 | # else: 198 | # if ctr <=50: 199 | # self.learning_rate_ = max_rate * np.power(discount,ctr) 200 | # else: 201 | # self.learning_rate_ = max_rate * np.power(discount,50) 202 | 203 | # self.learning_rates_.append(self.learning_rate_) 204 | 205 | train_indices = list(range(self.n_samples)) 206 | np.random.shuffle(train_indices) 207 | # print(train_indices) 208 | 209 | for h in train_indices: 210 | 211 | uh = self.user_vecs[h, 
:].reshape(1, -1) 212 | # print(uh.shape) 213 | grads = [] 214 | 215 | for i in range(self.n_models): 216 | # outler loop 217 | vi = self.item_vecs[i, :].reshape(-1, 1) 218 | phis = [] 219 | rights = [] 220 | rights_v = [] 221 | # remove i from js 222 | js = list(range(self.n_models)) 223 | js.remove(i) 224 | 225 | for j in js: 226 | vj = self.item_vecs[j, :].reshape(-1, 1) 227 | # temp_vt = np.exp(np.matmul(uh, (vj-vi))) 228 | # temp_vt = np.ndarray.item(temp_vt) 229 | temp_vt = sigmoid( 230 | np.ndarray.item(np.matmul(uh, (vj - vi))), a=1) 231 | temp_vt_derivative = sigmoid_derivate( 232 | np.ndarray.item(np.matmul(uh, (vj - vi))), a=1) 233 | # print(uh.re, (self.item_vecs[j,:]-self.item_vecs[i,:]).T.shape) 234 | # print((self.item_vecs[j,:]-self.item_vecs[i,:]).reshape(-1, 1).shape) 235 | # print(temp_vt.shape) 236 | # assert (len(temp_vt)==1) 237 | phis.append(temp_vt) 238 | rights.append(temp_vt_derivative * (vj - vi)) 239 | rights_v.append(temp_vt_derivative * uh) 240 | phi = np.sum(phis) + 1.5 241 | rights = np.asarray(rights).reshape(self.n_models - 1, 242 | self.n_factors) 243 | rights_v = np.asarray(rights_v).reshape(self.n_models - 1, 244 | self.n_factors) 245 | 246 | # print(rights.shape, rights_v.shape) 247 | 248 | right = np.sum(np.asarray(rights), axis=0) 249 | right_v = np.sum(np.asarray(rights_v), axis=0) 250 | # print(right, right_v) 251 | 252 | # print(np.asarray(rights).shape, np.asarray(right).shape) 253 | grad = (10 ** (self.ratings[h, i]) - 1) / ( 254 | phi * (np.log(phi)) ** 2) * right 255 | grad_v = (10 ** (self.ratings[h, i]) - 1) / ( 256 | phi * (np.log(phi)) ** 2) * right_v 257 | 258 | self.item_vecs[i, :] += self.learning_rate_ * grad_v 259 | 260 | # print(h, i, grad.shape) 261 | grads.append(grad) 262 | 263 | grads_uh = np.asarray(grads) 264 | grad_uh = np.sum(grads_uh, axis=0) 265 | 266 | self.user_vecs[h, :] -= self.learning_rate_ * grad_uh 267 | # print(self.learning_rate_) 268 | 269 | ctr += 1 270 | 271 | # self.regr_multirf = MultiOutputRegressor(RandomForestRegressor( 272 | # n_estimators=n_estimators, max_depth=max_depth, n_jobs=4)) 273 | 274 | # self.regr_multirf = MultiOutputRegressor(Lasso())) 275 | # self.regr_multirf = MultiOutputRegressor(xgb.XGBRegressor(n_estimators=n_estimators)) 276 | 277 | # self.regr_multirf.fit(meta_features, self.user_vecs) 278 | 279 | # disable unnecessary information 280 | self.ratings = None 281 | self.valid_ratings = None 282 | return self 283 | 284 | # def predict(self, u, i): 285 | # """ Single user and item prediction.""" 286 | # # prediction = self.global_bias + self.user_bias[u] + self.item_bias[i] 287 | # prediction = self.user_vecs[u, :].dot(self.item_vecs[i, :].T) 288 | # # prediction += self.user_vecs[u, :].dot(self.item_vecs[i, :].T) 289 | # return prediction 290 | 291 | # def predict_all(self): 292 | # """ Predict train_performance for every user and item.""" 293 | # predictions = np.zeros((self.user_vecs.shape[0], 294 | # self.item_vecs.shape[0])) 295 | # for u in range(self.user_vecs.shape[0]): 296 | # for i in range(self.item_vecs.shape[0]): 297 | # predictions[u, i] = self.predict(u, i) 298 | 299 | # return predictions 300 | 301 | def predict(self, test_meta): 302 | test_meta = check_array(test_meta) 303 | assert (test_meta.shape[1]==200) 304 | 305 | test_meta_scaled = self.pca_.transform(test_meta) 306 | # print('B', test_meta_scaled.shape) 307 | 308 | test_meta_scaled = self.scalar_.transform(test_meta_scaled) 309 | test_meta_scaled = self.regr_multirf.predict(test_meta_scaled) 310 | 311 | # 
predicted_scores = np.dot(test_k, self.item_vecs.T) + self.item_bias 312 | predicted_scores = np.dot(test_meta_scaled, self.item_vecs.T) 313 | # print(predicted_scores.shape) 314 | assert (predicted_scores.shape[0] == test_meta.shape[0]) 315 | assert (predicted_scores.shape[1] == self.n_models) 316 | 317 | return predicted_scores 318 | 319 | ##################################### 320 | # random_state = np.random.RandomState(42) 321 | 322 | # r = list(range(100)) 323 | # X = random_state.choice(r, size=[100, 5], replace=True)/100 324 | # X_meta = random_state.choice(r, size=[100, 200], replace=True) 325 | 326 | # X_train, X_test, X_train_meta, X_test_meta = train_test_split(X, X_meta, test_size=0.33, random_state=42) 327 | 328 | # train_data_cv, valid_data_cv, train_roc_cv, valid_roc_cv = train_test_split(X_train_meta, X_train, test_size=0.2) 329 | 330 | 331 | # EMF = MetaODClass(train_roc_cv, valid_roc_cv, n_factors=3, learning='sgd', verbose=False) 332 | # EMF.train(n_iter=200, meta_features=train_data_cv, valid_meta=valid_data_cv, learning_rate=0.05, min_rate=0.05, max_rate=0.2, discount=0.98) 333 | 334 | # U = EMF.user_vecs 335 | # V = EMF.item_vecs 336 | 337 | # pred_scores = np.dot(U, V.T) 338 | 339 | # print('rating matrix size:', train_roc_cv.shape) 340 | # print('Our modified loss and gradient results in NDCG:', ndcg_score(train_roc_cv, pred_scores)) 341 | # print() 342 | 343 | # for j in range(10): 344 | # U = np.random.normal(size=U.shape) 345 | # V = np.random.normal(size=V.shape) 346 | # pred_scores = np.dot(U, V.T) 347 | 348 | # print('trial', j, 'random U, V result in NDCG:', ndcg_score(train_roc_cv, pred_scores)) 349 | 350 | # # bias_global = EMF.global_bias 351 | # # bias_user = EMF.user_bias 352 | # # bias_item = EMF.item_bias 353 | 354 | # # # print(EMF.regr_multirf.predict(test_meta).shape) 355 | # predicted_scores = EMF.predict(X_test_meta) 356 | # # predicted_scores_max = np.nanargmax(predicted_scores, axis=1) 357 | -------------------------------------------------------------------------------- /metaod/models/gen_meta_features.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Generate meta-features from an arbitrary dataset. 
3 | """ 4 | # Author: Yue Zhao 5 | # License: BSD 2 clause 6 | import pandas as pd 7 | import numpy as np 8 | import itertools 9 | 10 | from sklearn.decomposition import PCA as sklearn_PCA 11 | from scipy.stats import skew, kurtosis 12 | from scipy.stats import f_oneway 13 | from scipy.stats import entropy 14 | 15 | from scipy.stats import moment 16 | from scipy.stats import normaltest 17 | 18 | from pyod.models.hbos import HBOS 19 | from pyod.models.iforest import IForest 20 | from pyod.models.pca import PCA 21 | from pyod.models.loda import LODA 22 | from sklearn.utils import check_array 23 | 24 | 25 | def gini(array): 26 | """Calculate the Gini coefficient of a numpy array.""" 27 | # based on bottom eq: 28 | # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg 29 | # from: 30 | # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm 31 | # All values are treated equally, arrays must be 1d: 32 | array = array.flatten() 33 | if np.amin(array) < 0: 34 | # Values cannot be negative: 35 | array -= np.amin(array) 36 | # Values cannot be 0: 37 | array = np.add(array, 0.0000001, casting="unsafe") 38 | # Values must be sorted: 39 | array = np.sort(array) 40 | # Index per array element: 41 | index = np.arange(1, array.shape[0] + 1) 42 | # Number of array elements: 43 | n = array.shape[0] 44 | # Gini coefficient: 45 | return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array))) 46 | 47 | 48 | def Diff(li1, li2): 49 | """Calculate the difference of two list 50 | 51 | Parameters 52 | ---------- 53 | li1 54 | li2 55 | 56 | Returns 57 | ------- 58 | 59 | """ 60 | return (list(set(li1) - set(li2))) 61 | 62 | 63 | def argmaxn(w, nth): 64 | w = np.asarray(w).ravel() 65 | t = np.argsort(w) 66 | return t[-1 * nth] 67 | 68 | 69 | def flatten_diagonally(x, diags=None): 70 | diags = np.array(diags) 71 | if x.shape[1] > x.shape[0]: 72 | diags += x.shape[1] - x.shape[0] 73 | n = max(x.shape) 74 | ndiags = 2 * n - 1 75 | i, j = np.indices(x.shape) 76 | d = np.array([]) 77 | for ndi in range(ndiags): 78 | if diags != None: 79 | if not ndi in diags: 80 | continue 81 | d = np.concatenate((d, x[i == j + (n - 1) - ndi])) 82 | return d 83 | 84 | 85 | def list_process(x, r_min=True, r_max=True, r_mean=True, r_std=True, 86 | r_skew=True, r_kurtosis=True): 87 | """Return statistics of a list 88 | 89 | Parameters 90 | ---------- 91 | x 92 | r_min 93 | r_max 94 | r_mean 95 | r_std 96 | r_skew 97 | r_kurtosis 98 | 99 | Returns 100 | ------- 101 | 102 | """ 103 | x = np.asarray(x).reshape(-1, 1) 104 | return_list = [] 105 | 106 | if r_min: 107 | return_list.append(np.nanmin(x)) 108 | 109 | if r_max: 110 | return_list.append(np.nanmax(x)) 111 | 112 | if r_mean: 113 | return_list.append(np.nanmean(x)) 114 | 115 | if r_std: 116 | return_list.append(np.nanstd(x)) 117 | 118 | if r_skew: 119 | return_list.append(skew(x, nan_policy='omit')[0]) 120 | 121 | if r_kurtosis: 122 | return_list.append(kurtosis(x, nan_policy='omit')[0]) 123 | 124 | return return_list 125 | 126 | 127 | def list_process_name(var): 128 | return [var + '_min', var + '_max', var + '_mean', var + '_std', 129 | var + '_skewness', var + '_kurtosis'] 130 | 131 | 132 | def generate_meta_features(X): 133 | """Get the meta-features of a datasets X 134 | 135 | Parameters 136 | ---------- 137 | X : numpy array of shape (n_samples, n_features) 138 | Input array 139 | 140 | Returns 141 | ------- 142 | meta_features : numpy array of shape (1, 200) 143 | Meta-feature in dimension of 200 144 | 145 | """ 146 | # 
outliers_fraction = np.count_nonzero(y) / len(y) 147 | # outliers_percentage = round(outliers_fraction * 100, ndigits=4) 148 | X = check_array(X) 149 | 150 | meta_vec = [] 151 | meta_vec_names = [] 152 | 153 | # on the sample level 154 | n_samples, n_features = X.shape[0], X.shape[1] 155 | 156 | meta_vec.append(n_samples) 157 | meta_vec.append(n_features) 158 | 159 | meta_vec_names.append('n_samples') 160 | meta_vec_names.append('n_features') 161 | 162 | sample_mean = np.mean(X) 163 | sample_median = np.median(X) 164 | sample_var = np.var(X) 165 | sample_min = np.min(X) 166 | sample_max = np.max(X) 167 | sample_std = np.std(X) 168 | 169 | q1, q25, q75, q99 = np.percentile(X, [0.01, 0.25, 0.75, 0.99]) 170 | iqr = q75 - q25 171 | 172 | normalized_mean = sample_mean / sample_max 173 | normalized_median = sample_median / sample_max 174 | sample_range = sample_max - sample_min 175 | sample_gini = gini(X) 176 | med_abs_dev = np.median(np.absolute(X - sample_median)) 177 | avg_abs_dev = np.mean(np.absolute(X - sample_mean)) 178 | quant_coeff_disp = (q75 - q25) / (q75 + q25) 179 | coeff_var = sample_var / sample_mean 180 | 181 | outliers_15iqr = np.logical_or( 182 | X < (q25 - 1.5 * iqr), X > (q75 + 1.5 * iqr)) 183 | outliers_3iqr = np.logical_or(X < (q25 - 3 * iqr), X > (q75 + 3 * iqr)) 184 | outliers_1_99 = np.logical_or(X < q1, X > q99) 185 | outliers_3std = np.logical_or(X < (sample_mean - 3 * sample_std), 186 | X > (sample_mean + 3 * sample_std)) 187 | 188 | percent_outliers_15iqr = np.sum(outliers_15iqr) / len(X) 189 | percent_outliers_3iqr = np.sum(outliers_3iqr) / len(X) 190 | percent_outliers_1_99 = np.sum(outliers_1_99) / len(X) 191 | percent_outliers_3std = np.sum(outliers_3std) / len(X) 192 | 193 | has_outliers_15iqr = np.any(outliers_15iqr).astype(int) 194 | has_outliers_3iqr = np.any(outliers_3iqr).astype(int) 195 | has_outliers_1_99 = np.any(outliers_1_99).astype(int) 196 | has_outliers_3std = np.any(outliers_3std).astype(int) 197 | 198 | meta_vec.extend( 199 | [sample_mean, sample_median, sample_var, sample_min, sample_max, 200 | sample_std, 201 | q1, q25, q75, q99, iqr, normalized_mean, normalized_median, 202 | sample_range, sample_gini, 203 | med_abs_dev, avg_abs_dev, quant_coeff_disp, coeff_var, 204 | # moment_5, moment_6, moment_7, moment_8, moment_9, moment_10, 205 | percent_outliers_15iqr, percent_outliers_3iqr, percent_outliers_1_99, 206 | percent_outliers_3std, 207 | has_outliers_15iqr, has_outliers_3iqr, has_outliers_1_99, 208 | has_outliers_3std]) 209 | 210 | meta_vec_names.extend( 211 | ['sample_mean', 'sample_median', 'sample_var', 'sample_min', 212 | 'sample_max', 'sample_std', 213 | 'q1', 'q25', 'q75', 'q99', 'iqr', 'normalized_mean', 214 | 'normalized_median', 'sample_range', 'sample_gini', 215 | 'med_abs_dev', 'avg_abs_dev', 'quant_coeff_disp', 'coeff_var', 216 | # moment_5, moment_6, moment_7, moment_8, moment_9, moment_10, 217 | 'percent_outliers_15iqr', 'percent_outliers_3iqr', 218 | 'percent_outliers_1_99', 'percent_outliers_3std', 219 | 'has_outliers_15iqr', 'has_outliers_3iqr', 'has_outliers_1_99', 220 | 'has_outliers_3std']) 221 | 222 | ########################################################################### 223 | 224 | normality_k2, normality_p = normaltest(X) 225 | is_normal_5 = (normality_p < 0.05).astype(int) 226 | is_normal_1 = (normality_p < 0.01).astype(int) 227 | 228 | meta_vec.extend(list_process(normality_p)) 229 | meta_vec.extend(list_process(is_normal_5)) 230 | meta_vec.extend(list_process(is_normal_1)) 231 | 232 | 
meta_vec_names.extend(list_process_name('normality_p')) 233 | meta_vec_names.extend(list_process_name('is_normal_5')) 234 | meta_vec_names.extend(list_process_name('is_normal_1')) 235 | 236 | moment_5 = moment(X, moment=5) 237 | moment_6 = moment(X, moment=6) 238 | moment_7 = moment(X, moment=7) 239 | moment_8 = moment(X, moment=8) 240 | moment_9 = moment(X, moment=9) 241 | moment_10 = moment(X, moment=10) 242 | meta_vec.extend(list_process(moment_5)) 243 | meta_vec.extend(list_process(moment_6)) 244 | meta_vec.extend(list_process(moment_7)) 245 | meta_vec.extend(list_process(moment_8)) 246 | meta_vec.extend(list_process(moment_9)) 247 | meta_vec.extend(list_process(moment_10)) 248 | meta_vec_names.extend(list_process_name('moment_5')) 249 | meta_vec_names.extend(list_process_name('moment_6')) 250 | meta_vec_names.extend(list_process_name('moment_7')) 251 | meta_vec_names.extend(list_process_name('moment_8')) 252 | meta_vec_names.extend(list_process_name('moment_9')) 253 | meta_vec_names.extend(list_process_name('moment_10')) 254 | 255 | # note: this is for each dimension == the number of dimensions 256 | skewness_list = skew(X).reshape(-1, 1) 257 | skew_values = list_process(skewness_list) 258 | meta_vec.extend(skew_values) 259 | meta_vec_names.extend(list_process_name('skewness')) 260 | 261 | # note: this is for each dimension == the number of dimensions 262 | kurtosis_list = kurtosis(X) 263 | kurtosis_values = list_process(kurtosis_list) 264 | meta_vec.extend(kurtosis_values) 265 | meta_vec_names.extend(list_process_name('kurtosis')) 266 | 267 | correlation = np.nan_to_num(pd.DataFrame(X).corr(), nan=0) 268 | correlation_list = flatten_diagonally(correlation)[ 269 | 0:int((n_features * n_features - n_features) / 2)] 270 | correlation_values = list_process(correlation_list) 271 | meta_vec.extend(correlation_values) 272 | meta_vec_names.extend(list_process_name('correlation')) 273 | 274 | covariance = np.cov(X.T) 275 | covariance_list = flatten_diagonally(covariance)[ 276 | 0:int((n_features * n_features - n_features) / 2)] 277 | covariance_values = list_process(covariance_list) 278 | meta_vec.extend(covariance_values) 279 | meta_vec_names.extend(list_process_name('covariance')) 280 | 281 | # sparsity 282 | rep_counts = [] 283 | for i in range(n_features): 284 | rep_counts.append(len(np.unique(X[:, i]))) 285 | sparsity_list = np.asarray(rep_counts) / (n_samples) 286 | sparsity = list_process(sparsity_list) 287 | meta_vec.extend(sparsity) 288 | meta_vec_names.extend(list_process_name('sparsity')) 289 | 290 | # ANOVA p value 291 | p_values_list = [] 292 | all_perm = list(itertools.combinations(list(range(n_features)), 2)) 293 | for j in all_perm: 294 | p_values_list.append(f_oneway(X[:, j[0]], X[:, j[1]])[1]) 295 | anova_p_value = list_process(np.asarray(p_values_list)) 296 | # anova_p_value = np.mean(p_values_list) 297 | # anova_p_value_exceed_thresh = np.mean((np.asarray(p_values_list)<0.05).astype(int)) 298 | meta_vec.extend(anova_p_value) 299 | meta_vec_names.extend(list_process_name('anova_p_value')) 300 | 301 | # pca 302 | pca_transformer = sklearn_PCA(n_components=3) 303 | X_transform = pca_transformer.fit_transform(X) 304 | 305 | # first pc 306 | pca_fpc = list_process(X_transform[0, :], r_min=False, r_max=False, 307 | r_mean=False, 308 | r_std=True, r_skew=True, r_kurtosis=True) 309 | meta_vec.extend(pca_fpc) 310 | meta_vec_names.extend( 311 | ['first_pca_std', 'first_pca_skewness', 'first_pca_kurtosis']) 312 | 313 | # entropy 314 | entropy_list = [] 315 | for i in 
range(n_features): 316 | counts = pd.Series(X[:, i]).value_counts() 317 | entropy_list.append(entropy(counts) / n_samples) 318 | entropy_values = list_process(entropy_list) 319 | meta_vec.extend(entropy_values) 320 | meta_vec_names.extend(list_process_name('entropy')) 321 | 322 | ##############################Landmarkers###################################### 323 | # HBOS 324 | clf = HBOS(n_bins=10) 325 | clf.fit(X) 326 | HBOS_hists = clf.hist_ 327 | HBOS_mean = np.mean(HBOS_hists, axis=0) 328 | HBOS_max = np.max(HBOS_hists, axis=0) 329 | HBOS_min = np.min(HBOS_hists, axis=0) 330 | meta_vec.extend(list_process(HBOS_mean)) 331 | meta_vec.extend(list_process(HBOS_max)) 332 | meta_vec.extend(list_process(HBOS_min)) 333 | meta_vec_names.extend(list_process_name('HBOS_mean')) 334 | meta_vec_names.extend(list_process_name('HBOS_max')) 335 | meta_vec_names.extend(list_process_name('HBOS_min')) 336 | 337 | # IForest 338 | n_estimators = 100 339 | clf = IForest(n_estimators=n_estimators) 340 | clf.fit(X) 341 | 342 | n_leaves = [] 343 | n_depth = [] 344 | fi_mean = [] 345 | fi_max = [] 346 | 347 | # doing this for each sub-trees 348 | for i in range(n_estimators): 349 | n_leaves.append(clf.estimators_[i].get_n_leaves()) 350 | n_depth.append(clf.estimators_[i].get_depth()) 351 | fi_mean.append(clf.estimators_[i].feature_importances_.mean()) 352 | fi_max.append(clf.estimators_[i].feature_importances_.max()) 353 | # print(clf.estimators_[i].tree_) 354 | 355 | meta_vec.extend(list_process(n_leaves)) 356 | meta_vec.extend(list_process(n_depth)) 357 | meta_vec.extend(list_process(fi_mean)) 358 | meta_vec.extend(list_process(fi_max)) 359 | 360 | meta_vec_names.extend(list_process_name('IForest_n_leaves')) 361 | meta_vec_names.extend(list_process_name('IForest_n_depth')) 362 | meta_vec_names.extend(list_process_name('IForest_fi_mean')) 363 | meta_vec_names.extend(list_process_name('IForest_fi_max')) 364 | 365 | # PCA 366 | clf = PCA(n_components=3) 367 | clf.fit(X) 368 | meta_vec.extend(clf.explained_variance_ratio_) 369 | meta_vec.extend(clf.singular_values_) 370 | meta_vec_names.extend( 371 | ['pca_expl_ratio_1', 'pca_expl_ratio_2', 'pca_expl_ratio_3']) 372 | meta_vec_names.extend(['pca_sv_1', 'pca_sv_2', 'pca_sv_3']) 373 | 374 | # LODA 375 | n_bins = 10 376 | n_random_cuts = 100 377 | 378 | n_hists_mean = [] 379 | n_hists_max = [] 380 | 381 | n_cuts_mean = [] 382 | n_cuts_max = [] 383 | 384 | clf = LODA(n_bins=n_bins, n_random_cuts=n_random_cuts) 385 | clf.fit(X) 386 | 387 | for i in range(n_bins): 388 | n_hists_mean.append(clf.histograms_[:, i].mean()) 389 | n_hists_max.append(clf.histograms_[:, i].max()) 390 | for i in range(n_random_cuts): 391 | n_cuts_mean.append(clf.histograms_[i, :].mean()) 392 | n_cuts_max.append(clf.histograms_[i, :].max()) 393 | 394 | meta_vec.extend(list_process(n_hists_mean)) 395 | meta_vec.extend(list_process(n_hists_max)) 396 | meta_vec.extend(list_process(n_cuts_mean)) 397 | meta_vec.extend(list_process(n_cuts_max)) 398 | 399 | meta_vec_names.extend(list_process_name('LODA_n_hists_mean')) 400 | meta_vec_names.extend(list_process_name('LODA_n_hists_max')) 401 | meta_vec_names.extend(list_process_name('LODA_n_cuts_mean')) 402 | meta_vec_names.extend(list_process_name('LODA_n_cuts_max')) 403 | 404 | return meta_vec, meta_vec_names 405 | -------------------------------------------------------------------------------- /metaod/models/predict_metaod.py: -------------------------------------------------------------------------------- 1 | """MetaOD prediction with the trained model 2 
| """ 3 | # License: BSD 2 clause 4 | 5 | 6 | # environment setting 7 | from zipfile import ZipFile 8 | import os 9 | from joblib import load 10 | from pyod.utils.data import generate_data 11 | import numpy as np 12 | 13 | from metaod.models.gen_meta_features import generate_meta_features 14 | from metaod.models.utility import fix_nan 15 | from pyod.utils.data import generate_data 16 | 17 | def get_top_models(p, n): 18 | return np.flip(np.argsort(p))[:n] 19 | 20 | def select_model(X, trained_model_location="trained_models", n_selection=1): 21 | 22 | # print(os.path.realpath(__file__)) 23 | # unzip trained models 24 | # with ZipFile(os.path.join(os.path.dirname(os.path.realpath(__file__)), 25 | # 'trained_models.zip'), 'r') as zip: 26 | # # # printing all the contents of the zip file 27 | # # zip.printdir() 28 | 29 | # # extracting all the files 30 | # print('Extracting trained models now...') 31 | # zip.extractall(path='trained_models') 32 | # print('Finish extracting models') 33 | 34 | # load PCA scalar 35 | meta_scalar = load(os.path.join(trained_model_location,"meta_scalar.joblib")) 36 | # generate meta features 37 | meta_X, _ = generate_meta_features(X) 38 | meta_X = np.nan_to_num(meta_X,nan=0) 39 | # replace nan by 0 for now 40 | # todo: replace by mean is better as fix_nan 41 | meta_X = meta_scalar.transform(np.asarray(meta_X).reshape(1, -1)).astype(float) 42 | 43 | # use all trained models for ensemble 44 | trained_models = [ 45 | "train_0.joblib", 46 | "train_2.joblib", 47 | # "train_42.joblib" 48 | ] 49 | print(os.getcwd()) 50 | # # load trained models 51 | model_lists = list(load(os.path.join(trained_model_location,"model_list.joblib"))) 52 | 53 | predict_scores = np.zeros([len(trained_models), len(model_lists)]) 54 | 55 | for i, model in enumerate(trained_models): 56 | clf = load(os.path.join(trained_model_location, model)) 57 | # w = load (model) 58 | predict_scores[i,] = clf.predict(meta_X) 59 | predicted_scores_max = np.nanargmax(predict_scores[i,]) 60 | # print('top model', model_lists[predicted_scores_max]) 61 | combined_predict = np.average(predict_scores, axis=0) 62 | 63 | predicted_scores_sorted = get_top_models(combined_predict, n_selection) 64 | predicted_scores_max = np.nanargmax(combined_predict) 65 | 66 | print('top model', model_lists[predicted_scores_sorted[0]]) 67 | 68 | return np.asarray(model_lists)[predicted_scores_sorted] 69 | 70 | # if __name__ == "__main__": 71 | 72 | # contamination = 0.1 # percentage of outliers 73 | # n_train = 1000 # number of training points 74 | # n_test = 100 # number of testing points 75 | 76 | # # Generate sample data 77 | # X_train, y_train, X_test, y_test = \ 78 | # generate_data(n_train=n_train, 79 | # n_test=n_test, 80 | # n_features=3, 81 | # contamination=contamination, 82 | # random_state=42) 83 | 84 | # clf_setting = select_model(X_train, n_selection=10) 85 | -------------------------------------------------------------------------------- /metaod/models/train_metaod.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Sep 24 17:13:27 2020 4 | 5 | @author: yuezh 6 | """ 7 | 8 | import os 9 | import random 10 | import pandas as pd 11 | import numpy as np 12 | 13 | from sklearn.utils import check_array 14 | from sklearn.preprocessing import MinMaxScaler 15 | 16 | from scipy.io import loadmat 17 | 18 | from joblib import dump 19 | 20 | from metaod.models.utility import read_arff, fix_nan 21 | from metaod.models.gen_meta_features import 
generate_meta_features 22 | from metaod.models.core import MetaODClass 23 | 24 | # read in performance table 25 | roc_df = pd.read_excel(os.path.join('data', 'performance_table.xlsx'), 26 | sheet_name='AP') 27 | 28 | # trim the table 29 | roc_mat = roc_df.to_numpy() 30 | roc_mat_red = fix_nan(roc_mat[2:, 4:].astype('float')) 31 | 32 | # get statistics of the training data 33 | n_datasets, n_configs = roc_mat_red.shape[0], roc_mat_red.shape[1] 34 | data_headers = roc_mat[2:, 0] 35 | config_headers = roc_df.columns[4:] 36 | dump(config_headers, 'model_list.joblib') 37 | 38 | # %% 39 | 40 | # build meta-features 41 | meta_mat = np.zeros([n_datasets, 200]) 42 | 43 | # read in mat files 44 | mat_file_list = [ 45 | 'annthyroid.mat', 46 | 'arrhythmia.mat', 47 | 'breastw.mat', 48 | 'glass.mat', 49 | 'ionosphere.mat', 50 | 'letter.mat', 51 | 'lympho.mat', 52 | 'mammography.mat', 53 | 'mnist.mat', 54 | 'musk.mat', 55 | 'optdigits.mat', 56 | 'pendigits.mat', 57 | 'pima.mat', 58 | 'satellite.mat', 59 | 'satimage-2.mat', 60 | 'shuttle.mat', 61 | 'smtp_n.mat', 62 | 'speech.mat', 63 | 'thyroid.mat', 64 | 'vertebral.mat', 65 | 'vowels.mat', 66 | 'wbc.mat', 67 | 'wine.mat', 68 | 'Annthyroid', 69 | 'Arrhythmia', 70 | 'Cardiotocography', 71 | 'HeartDisease', # too small 72 | 'Hepatitis', # too small 73 | 'InternetAds', 74 | 'PageBlocks', 75 | 'Pima', 76 | 'SpamBase', 77 | 'Stamps', 78 | 'Wilt', 79 | 80 | 'ALOI', # too large 81 | 'Glass', # too small 82 | 'PenDigits', 83 | 'Shuttle', 84 | 'Waveform', 85 | 'WBC', # too small 86 | 'WDBC', # too small 87 | 'WPBC', # too small 88 | ] 89 | 90 | for j in range(23): 91 | mat_file = mat_file_list[j] 92 | mat = loadmat(os.path.join("data", "ODDS", mat_file)) 93 | X = mat['X'] 94 | meta_mat[j, :], meta_vec_names = generate_meta_features(X) 95 | print(j, mat_file) 96 | 97 | # read arff files 98 | file_names = [ 99 | 'Annthyroid', 100 | 'Arrhythmia', 101 | 'Cardiotocography', 102 | 'HeartDisease', # too small 103 | 'Hepatitis', # too small 104 | 'InternetAds', 105 | 'PageBlocks', 106 | 'Pima', 107 | 'SpamBase', 108 | 'Stamps', 109 | 'Wilt', 110 | 111 | 'ALOI', # too large 112 | 'Glass', # too small 113 | 'PenDigits', 114 | 'Shuttle', 115 | 'Waveform', 116 | 'WBC', # too small 117 | 'WDBC', # too small 118 | 'WPBC', # too small 119 | ] 120 | 121 | ############################################################################# 122 | misplaced_list = ['Arrhythmia', 'Cardiotocography', 'Hepatitis', 'ALOI', 123 | 'KDDCup99'] 124 | arff_list = [ 125 | os.path.join('semantic', 'Annthyroid', 'Annthyroid_withoutdupl_07.arff'), 126 | os.path.join('semantic', 'Arrhythmia', 'Arrhythmia_withoutdupl_46.arff'), 127 | os.path.join('semantic', 'Cardiotocography', 128 | 'Cardiotocography_withoutdupl_22.arff'), 129 | os.path.join('semantic', 'HeartDisease', 130 | 'HeartDisease_withoutdupl_44.arff'), 131 | os.path.join('semantic', 'Hepatitis', 'Hepatitis_withoutdupl_16.arff'), 132 | os.path.join('semantic', 'InternetAds', 133 | 'InternetAds_withoutdupl_norm_19.arff'), 134 | os.path.join('semantic', 'PageBlocks', 'PageBlocks_withoutdupl_09.arff'), 135 | os.path.join('semantic', 'Pima', 'Pima_withoutdupl_35.arff'), 136 | os.path.join('semantic', 'SpamBase', 'SpamBase_withoutdupl_40.arff'), 137 | os.path.join('semantic', 'Stamps', 'Stamps_withoutdupl_09.arff'), 138 | os.path.join('semantic', 'Wilt', 'Wilt_withoutdupl_05.arff'), 139 | 140 | os.path.join('literature', 'ALOI', 'ALOI_withoutdupl.arff'), 141 | os.path.join('literature', 'Glass', 'Glass_withoutdupl_norm.arff'), 142 | 
os.path.join('literature', 'PenDigits', 143 | 'PenDigits_withoutdupl_norm_v01.arff'), 144 | os.path.join('literature', 'Shuttle', 'Shuttle_withoutdupl_v01.arff'), 145 | os.path.join('literature', 'Waveform', 'Waveform_withoutdupl_v01.arff'), 146 | os.path.join('literature', 'WBC', 'WBC_withoutdupl_v01.arff'), 147 | os.path.join('literature', 'WDBC', 'WDBC_withoutdupl_v01.arff'), 148 | os.path.join('literature', 'WPBC', 'WPBC_withoutdupl_norm.arff') 149 | ] 150 | 151 | for j in range(23, 42): 152 | mat_file = file_names[j - 23] 153 | mat_file_path = os.path.join("data", "DAMI", arff_list[j - 24]) 154 | X, y, attributes = read_arff(mat_file_path, misplaced_list) 155 | X = check_array(X).astype('float64') 156 | meta_mat[j, :], meta_vec_names = generate_meta_features(X) 157 | print("processing", j, mat_file) 158 | 159 | # read emmott dataset 160 | selected_bench = pd.read_csv(os.path.join('data', 'childsets.csv'))[ 161 | 'bench.id'].values.tolist() 162 | selected_bench_loc = pd.read_csv(os.path.join('data', 'childsets.csv'))[ 163 | 'location'].values.tolist() 164 | 165 | for j in range(42, 142): 166 | print("processing", j, selected_bench_loc[j - 42]) 167 | mat = pd.read_csv( 168 | os.path.join("data", "Emmott", selected_bench_loc[j - 42])) 169 | X = mat.to_numpy()[:, 6:].astype(float) 170 | meta_mat[j, :], meta_vec_names = generate_meta_features(X) 171 | 172 | # use cleaned and transformed meta-features 173 | meta_scalar = MinMaxScaler() 174 | meta_mat_transformed = meta_scalar.fit_transform(meta_mat) 175 | meta_mat_transformed = fix_nan(meta_mat_transformed) 176 | dump(meta_scalar, 'meta_scalar.joblib') 177 | # %% train model 178 | 179 | # split data into train and valid 180 | seed = 0 181 | full_list = list(range(n_datasets)) 182 | random.Random(seed).shuffle(full_list) 183 | n_train = int(0.85 * n_datasets) 184 | 185 | train_index = full_list[:n_train] 186 | valid_index = full_list[n_train:] 187 | 188 | train_set = roc_mat_red[train_index, :].astype('float64') 189 | valid_set = roc_mat_red[valid_index, :].astype('float64') 190 | 191 | train_meta = meta_mat_transformed[train_index, :].astype('float64') 192 | valid_meta = meta_mat_transformed[valid_index, :].astype('float64') 193 | 194 | clf = MetaODClass(train_set, valid_performance=valid_set, n_factors=30, 195 | learning='sgd') 196 | clf.train(n_iter=50, meta_features=train_meta, valid_meta=valid_meta, 197 | learning_rate=0.05, max_rate=0.9, min_rate=0.1, discount=1, 198 | n_steps=8) 199 | 200 | # U = clf.user_vecs 201 | # V = clf.item_vecs 202 | 203 | # # # print(EMF.regr_multirf.predict(test_meta).shape) 204 | # predicted_scores = clf.predict(valid_meta) 205 | # predicted_scores_max = np.nanargmax(predicted_scores, axis=1) 206 | # print() 207 | # output transformer (for meta-feature) and the trained clf 208 | dump(clf, 'train_' + str(seed) + '.joblib') 209 | 210 | #%% 211 | # # %% 212 | # import pickle 213 | # from metaod.models.core import MetaODClass 214 | 215 | # if __name__ == "__main__": 216 | # # # code for standalone use 217 | # # t = Thing("foo") 218 | # # Thing.__module__ = "thing" 219 | # # t.save("foo.pickle") 220 | # # MetaODClass.__module__ = "metaod" 221 | # file = open('test.pk', 'wb') 222 | # pickle.dump(clf, file) 223 | 224 | # # # file = open('rf.pk', 'wb') 225 | # # # pickle.dump(clf.user_vecs, file) 226 | -------------------------------------------------------------------------------- /metaod/models/trained_models/meta_scalar.joblib: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/metaod/models/trained_models/meta_scalar.joblib -------------------------------------------------------------------------------- /metaod/models/trained_models/model_list.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/metaod/models/trained_models/model_list.joblib -------------------------------------------------------------------------------- /metaod/models/trained_models/train_0.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/metaod/models/trained_models/train_0.joblib -------------------------------------------------------------------------------- /metaod/models/utility.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | import arff 5 | from zipfile import ZipFile 6 | import urllib.request 7 | 8 | def Diff(li1, li2): 9 | return (list(set(li1) - set(li2))) 10 | 11 | def argmaxatn(w, nth): 12 | w = np.asarray(w).ravel() 13 | t = np.argsort(w) 14 | return t[-1*nth] 15 | 16 | def fix_nan(X): 17 | # TODO: should store the mean of the meta features to be used for test_meta 18 | # replace by 0 for now 19 | col_mean = np.nanmean(X, axis = 0) 20 | inds = np.where(np.isnan(X)) 21 | X[inds] = np.take(col_mean, inds[1]) 22 | 23 | return X 24 | 25 | 26 | def read_arff(file_path, misplaced_list): 27 | misplaced = False 28 | for item in misplaced_list: 29 | if item in file_path: 30 | misplaced = True 31 | 32 | file = arff.load(open(file_path)) 33 | data_value = np.asarray(file['data']) 34 | attributes = file['attributes'] 35 | 36 | X = data_value[:, 0:-2] 37 | if not misplaced: 38 | y = data_value[:, -1] 39 | else: 40 | y = data_value[:, -2] 41 | y[y == 'no'] = 0 42 | y[y == 'yes'] = 1 43 | y = y.astype('float').astype('int').ravel() 44 | 45 | if y.sum() > len(y): 46 | print(attributes) 47 | raise ValueError('wrong sum') 48 | 49 | return X, y, attributes 50 | 51 | def prepare_trained_model(url='https://github.com/yzhao062/MetaOD/raw/master/saved_models/trained_models.zip', 52 | filename='trained_models.zip', 53 | save_path='trained_models'): 54 | 55 | if not os.path.exists(save_path): 56 | os.makedirs(save_path) 57 | 58 | urllib.request.urlretrieve(url, filename) 59 | 60 | # print(os.path.join(os.path.dirname(os.path.realpath(__file__)), 61 | # # 'trained_models.zip')) 62 | # #todo: verify file exists 63 | with ZipFile(filename, 'r') as zip: 64 | # # printing all the contents of the zip file 65 | # zip.printdir() 66 | # extracting all the files 67 | print('Extracting trained models now...') 68 | zip.extractall() 69 | print('Finish extracting models') 70 | 71 | 72 | # url='https://github.com/yzhao062/MetaOD/raw/master/saved_models/trained_models.zip' 73 | # filename='trained_models.zip' 74 | # save_path='trained_models' 75 | 76 | # if not os.path.exists(save_path): 77 | # os.makedirs(save_path) 78 | 79 | # urllib.request.urlretrieve(url, os.path.join(save_path, filename)) 80 | 81 | # print(os.path.join(os.path.dirname(os.path.realpath(__file__)), 82 | # 'trained_models.zip')) 83 | # #todo: verify file exists 84 | # with ZipFile(os.path.join(save_path, filename), 'r') as zip: 85 | # # # printing all the contents of the zip file 86 | # # zip.printdir() 87 
| 88 | # # extracting all the files 89 | # print('Extracting trained models now...') 90 | # zip.extractall(path='trained_models') 91 | # print('Finish extracting models') -------------------------------------------------------------------------------- /metaod/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/metaod/test/__init__.py -------------------------------------------------------------------------------- /metaod/test/test_predict_metaod.py: -------------------------------------------------------------------------------- 1 | """MetaOD prediction with the trained model 2 | """ 3 | # License: BSD 2 clause 4 | import os 5 | import unittest 6 | 7 | from pyod.utils.data import generate_data 8 | from metaod.models.utility import prepare_trained_model 9 | from metaod.models.predict_metaod import select_model 10 | 11 | 12 | class TestPredictMetaOD(unittest.TestCase): 13 | def setUp(self): 14 | self.contamination = 0.05 # percentage of outliers 15 | self.n_train = 1000 # number of training points 16 | self.n_test = 100 # number of testing points 17 | 18 | # Generate sample data 19 | self.X_train, self.y_train, self.X_test, self.y_test = \ 20 | generate_data(n_train=self.n_train, 21 | n_test=self.n_test, 22 | n_features=3, 23 | contamination=self.contamination, 24 | random_state=42) 25 | 26 | def test_prepare_trained_model(self): 27 | # load pretrained models 28 | prepare_trained_model() 29 | print(os.path.join(os.getcwd(), "trained_models")) 30 | assert (os.path.isfile("trained_models.zip")) 31 | assert (os.path.isdir("trained_models")) 32 | 33 | def test_model_selection(self): 34 | prepare_trained_model() 35 | # recommended models 36 | selected_models = select_model(self.X_train, n_selection=100) 37 | assert ((len(selected_models) == 100)) 38 | -------------------------------------------------------------------------------- /metaod/version.py: -------------------------------------------------------------------------------- 1 | """ 2 | ``pyod`` is a python toolbox for scalable outlier detection 3 | """ 4 | # Based on NiLearn package 5 | # License: simplified BSD 6 | 7 | # PEP0440 compatible formatted version, see: 8 | # https://www.python.org/dev/peps/pep-0440/ 9 | # 10 | # Generic release markers: 11 | # X.Y 12 | # X.Y.Z # For bug fix releases 13 | # 14 | # Admissible pre-release markers: 15 | # X.YaN # Alpha release 16 | # X.YbN # Beta release 17 | # X.YrcN # Release Candidate 18 | # X.Y # Final release 19 | # 20 | # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. 
21 | # 'X.Y.dev0' is the canonical version of 'X.Y.dev' 22 | # 23 | __version__ = '0.0.6' # pragma: no cover 24 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib>=0.14.1 2 | liac-arff 3 | numpy>=1.18.1 4 | scipy>=0.20 5 | scikit_learn==0.22.1 6 | pandas>=0.20 7 | pyod>=0.8 -------------------------------------------------------------------------------- /saved_models/trained_models.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/MetaOD/2a8ed2761468d2f8ee2cd8194ce36b0f817576d1/saved_models/trained_models.zip -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.rst -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | # read the contents of README file 4 | from os import path 5 | from io import open # for Python 2 and 3 compatibility 6 | 7 | # get __version__ from _version.py 8 | ver_file = path.join('metaod', 'version.py') 9 | with open(ver_file) as f: 10 | exec(f.read()) 11 | 12 | this_directory = path.abspath(path.dirname(__file__)) 13 | 14 | 15 | # read the contents of README.rst 16 | def readme(): 17 | with open(path.join(this_directory, 'README.rst'), encoding='utf-8') as f: 18 | return f.read() 19 | 20 | 21 | # read the contents of requirements.txt 22 | with open(path.join(this_directory, 'requirements.txt'), 23 | encoding='utf-8') as f: 24 | requirements = f.read().splitlines() 25 | 26 | setup( 27 | name='metaod', 28 | version=__version__, 29 | description='Automating Outlier Detection via Meta-Learning (selece/recommend OD model(s) for new datasets)', 30 | long_description=readme(), 31 | long_description_content_type='text/x-rst', 32 | author='Yue Zhao', 33 | author_email='zhaoy@cmu.edu', 34 | url='https://github.com/yzhao062/metaod', 35 | download_url='https://github.com/yzhao062/metaod/archive/master.zip', 36 | keywords=['outlier detection', 'anomaly detection', 'outlier ensembles', 37 | 'data mining', 'meta learning', 'AutoML'], 38 | packages=find_packages(exclude=['test']), 39 | include_package_data=True, 40 | install_requires=requirements, 41 | setup_requires=['setuptools>=38.6.0'], 42 | classifiers=[ 43 | 'Development Status :: 2 - Pre-Alpha', 44 | 'Intended Audience :: Education', 45 | 'Intended Audience :: Financial and Insurance Industry', 46 | 'Intended Audience :: Science/Research', 47 | 'Intended Audience :: Developers', 48 | 'Intended Audience :: Information Technology', 49 | 'License :: OSI Approved :: BSD License', 50 | 'Programming Language :: Python :: 3.5', 51 | 'Programming Language :: Python :: 3.6', 52 | 'Programming Language :: Python :: 3.7', 53 | ], 54 | ) 55 | --------------------------------------------------------------------------------
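A minimal end-to-end usage sketch, based on the API exercised in metaod/test/test_predict_metaod.py above: prepare_trained_model() downloads and extracts the pretrained models into the working directory, and select_model() returns the top-n recommended outlier-detector configurations for a new dataset. The toy data comes from pyod's generate_data helper; any (n_samples, n_features) numpy array can be passed instead, and n_selection=10 here is only an illustrative choice.

from pyod.utils.data import generate_data

from metaod.models.utility import prepare_trained_model
from metaod.models.predict_metaod import select_model

# synthetic data with 5% outliers, matching the unit test above
X_train, y_train, X_test, y_test = generate_data(
    n_train=1000, n_test=100, n_features=3,
    contamination=0.05, random_state=42)

# fetch and unpack the pretrained MetaOD models (select_model reads them
# from the default trained_model_location="trained_models")
prepare_trained_model()

# recommend the 10 most promising detector configurations for X_train
selected_models = select_model(X_train, n_selection=10)
print(selected_models)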