├── rankeval ├── VERSION ├── test │ ├── dataset │ │ ├── __init__.py │ │ ├── test_dataset.py │ │ └── test_svmlight_format.py │ ├── metrics │ │ └── __init__.py │ ├── model │ │ ├── __init__.py │ │ ├── test_proxy_model.py │ │ ├── test_proxy_ScikitLearn.py │ │ ├── test_proxy_LightGBM.py │ │ ├── test_proxy_XGBoost.py │ │ ├── test_proxy_CatBoost.py │ │ ├── test_proxy_Jforests.py │ │ └── test_proxy_QuickRank.py │ ├── scoring │ │ ├── __init__.py │ │ └── test_scoring.py │ ├── analysis │ │ ├── __init__.py │ │ ├── test_statistical.py │ │ └── test_feature.py │ ├── __init__.py │ ├── data │ │ ├── svmlight_invalid.txt │ │ ├── CatBoost.model.coreml │ │ ├── svmlight_classification.txt │ │ ├── svmlight_classification_qid.txt │ │ ├── ScikitLearn.model.txt │ │ ├── XGBoost.model.txt │ │ ├── Jforests.model.xml │ │ ├── quickrank.model.v2.xml │ │ ├── quickrank.model.xml │ │ └── LightGBM.model.txt │ └── base.py ├── visualization │ ├── __init__.py │ └── feature.py ├── scoring │ ├── __init__.py │ ├── _efficient_scoring.pyx │ └── scorer.py ├── dataset │ ├── __init__.py │ ├── dataset_container.py │ └── write_json_dataset_catalogue.py ├── analysis │ ├── __init__.py │ ├── _efficient_feature_impl.h │ ├── _efficient_feature.pyx │ ├── _efficient_topological.pyx │ ├── topological.py │ └── _efficient_feature_impl.cpp ├── __init__.py ├── model │ ├── __init__.py │ ├── proxy_CatBoost.py │ ├── proxy_XGBoost.py │ └── proxy_Jforests.py └── metrics │ ├── __init__.py │ ├── spearman_rho.py │ ├── rmse.py │ ├── kendall_tau.py │ ├── mse.py │ ├── err.py │ ├── precision_max.py │ ├── dcg.py │ ├── precision.py │ ├── mrr.py │ ├── metric.py │ ├── recall.py │ ├── pfound.py │ ├── map.py │ ├── rbp.py │ └── ndcg.py ├── doc ├── banner.png ├── src │ ├── rankeval.rst │ ├── rankeval.scoring.rst │ ├── Makefile │ ├── rankeval.visualization.rst │ ├── make.bat │ ├── rankeval.analysis.rst │ ├── index.rst │ ├── static-index.rst │ ├── rankeval.dataset.rst │ ├── rankeval.model.rst │ ├── rankeval.metrics.rst │ └── conf.py └── Makefile ├── .gitattributes ├── MANIFEST.in ├── Makefile ├── .gitignore ├── AUTHORS.md ├── CONTRIBUTING.md ├── .travis.yml └── README.md /rankeval/VERSION: -------------------------------------------------------------------------------- 1 | 0.8.2 -------------------------------------------------------------------------------- /rankeval/test/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rankeval/test/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rankeval/test/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rankeval/test/scoring/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rankeval/test/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rankeval/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
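The tree above shows the package layout: dataset loading, model proxies for several learning-to-rank tools, scoring, metrics, analysis and visualization. For orientation, the following sketch strings together the calls exercised by the test suite further down (`RTEnsemble`, `Dataset.load`, `Scorer`, `NDCG`); the dataset path is a placeholder, since the MSN fold referenced by the tests is not shipped under `rankeval/test/data`, and any SVMLight-format learning-to-rank file can be used in its place.

```python
# Minimal end-to-end sketch assembled from the calls used in the tests below
# (test_scoring.py, test_statistical.py). The dataset path is a placeholder:
# any learning-to-rank file in SVMLight format can be used instead.
import os

from rankeval.dataset import Dataset
from rankeval.metrics import NDCG
from rankeval.model import RTEnsemble
from rankeval.scoring import Scorer

data_dir = os.path.join("rankeval", "test", "data")

# Load a QuickRank model shipped with the tests and an SVMLight dataset
model = RTEnsemble(os.path.join(data_dir, "quickrank.model.xml"),
                   format="QuickRank")
dataset = Dataset.load("msn1.fold1.test.5k.txt", format="svmlight")

# Score every document in the dataset with the model
scorer = Scorer(model, dataset)
scorer.score(detailed=False)
y_pred = scorer.get_predicted_scores()

# Evaluate the predictions: Metric.eval returns the average score
# and the per-query scores
avg_ndcg, per_query_ndcg = NDCG().eval(dataset, y_pred)
print(avg_ndcg)
```

The scoring tests shown later exercise exactly these calls, including the per-tree partial scores obtained with `scorer.score(detailed=True)`.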
/rankeval/test/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | -------------------------------------------------------------------------------- /doc/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpclab/rankeval/HEAD/doc/banner.png -------------------------------------------------------------------------------- /rankeval/test/data/svmlight_invalid.txt: -------------------------------------------------------------------------------- 1 | python 2:2.5 10:-5.2 15:1.5 2 | 2.0 5:1.0 12:-3 3 | 3.0 20:27 4 | -------------------------------------------------------------------------------- /rankeval/test/data/CatBoost.model.coreml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpclab/rankeval/HEAD/rankeval/test/data/CatBoost.model.coreml -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | doc/* linguist-documentation 2 | notebooks/*.ipynb linguist-detectable=false 3 | Makefile linguist-detectable=false 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include AUTHORS.md 4 | include VERSION 5 | recursive-include doc * 6 | recursive-include rankeval * 7 | -------------------------------------------------------------------------------- /rankeval/test/data/svmlight_classification.txt: -------------------------------------------------------------------------------- 1 | # comment 2 | 1.0 2:2.5 10:-5.2 15:1.5 # an inline comment 3 | 2.0 5:1.0 12:-3 4 | # another comment 5 | 3.0 20:27 6 | -------------------------------------------------------------------------------- /rankeval/test/data/svmlight_classification_qid.txt: -------------------------------------------------------------------------------- 1 | # comment 2 | 1.0 qid:1 2:2.5 10:-5.2 15:0 # an inline comment 3 | 2.0 qid:37 5:1.0 12:-3 33:0.7 4 | 0.0 qid:37 6:7 8:9 10:11 5 | # another comment 6 | 3.0 qid:12 20:27 22:30 23:40 7 | -------------------------------------------------------------------------------- /doc/src/rankeval.rst: -------------------------------------------------------------------------------- 1 | rankeval package 2 | ================ 3 | 4 | .. automodule:: rankeval 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. 
toctree:: 13 | 14 | rankeval.analysis 15 | rankeval.dataset 16 | rankeval.metrics 17 | rankeval.model 18 | rankeval.scoring 19 | rankeval.visualization 20 | 21 | -------------------------------------------------------------------------------- /rankeval/test/data/ScikitLearn.model.txt: -------------------------------------------------------------------------------- 1 | base_score=0.606000 2 | learning_rate=0.100000 3 | booster[0] [f54 f133]: 4 | 0:[f54<=0.005769] 5 | 1:leaf=-0.226426222524 6 | 2:[f133<=0.500000] 7 | 3:leaf=0.187204691788 8 | 4:leaf=0.905627900778 9 | booster[1] [f14 f52]: 10 | 0:[f52<=0.044467] 11 | 1:[f14<=28.500000] 12 | 3:leaf=0.413148111893 13 | 4:leaf=-0.203144770188 14 | 2:leaf=0.269475234496 15 | -------------------------------------------------------------------------------- /doc/src/rankeval.scoring.rst: -------------------------------------------------------------------------------- 1 | rankeval.scoring package 2 | ======================== 3 | 4 | .. automodule:: rankeval.scoring 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | rankeval.scoring.scorer module 13 | ------------------------------ 14 | 15 | .. automodule:: rankeval.scoring.scorer 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | 21 | -------------------------------------------------------------------------------- /rankeval/test/data/XGBoost.model.txt: -------------------------------------------------------------------------------- 1 | booster[0]: 2 | 0:[f52<0.0444665] yes=1,no=2,missing=1 3 | 1:[f14<26.5] yes=3,no=4,missing=3 4 | 3:leaf=0.0330693 5 | 4:leaf=-0.0274553 6 | 2:[f17<23.1987] yes=5,no=6,missing=5 7 | 5:leaf=0.0289187 8 | 6:leaf=0.067713 9 | booster[1]: 10 | 0:[f54<0.0035545] yes=1,no=2,missing=1 11 | 1:[f10<209.5] yes=3,no=4,missing=3 12 | 3:leaf=-0.0018294 13 | 4:leaf=-0.0425189 14 | 2:[f52<0.0444665] yes=5,no=6,missing=5 15 | 5:leaf=0.00209278 16 | 6:leaf=0.0306171 17 | -------------------------------------------------------------------------------- /rankeval/test/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """Generic method useful for testing""" 9 | 10 | import os 11 | 12 | test_dir = os.path.dirname(os.path.abspath(__file__)) 13 | data_dir = os.path.join(test_dir, "data") -------------------------------------------------------------------------------- /rankeval/scoring/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | The :mod:`rankeval.scoring` module includes utilities to score a model on a given dataset. 
10 | """ 11 | 12 | from .scorer import Scorer 13 | 14 | __all__ = ['Scorer'] 15 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYTHON ?= python 2 | NOSETESTS ?= nosetests 3 | 4 | clean: 5 | $(PYTHON) setup.py clean 6 | rm -rf dist 7 | rm -rf rankeval.egg-info 8 | rm -rf build 9 | rm -rf .ipynb_checkpoints 10 | rm -rf .eggs 11 | find . -name "*.so" -delete 12 | find . -name "*.pyc" -delete 13 | find . -name "*.egg" -delete 14 | 15 | test: 16 | $(NOSETESTS) 17 | 18 | ### Handling Sphinx for generating documentation 19 | .PHONY: doc 20 | doc: 21 | @echo "===================================" 22 | @echo "Producing documentation..." 23 | 24 | make -C doc doc 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *.pyd 4 | *~ 5 | .#* 6 | *.lprof 7 | *.swp 8 | *.swo 9 | .DS_Store 10 | 11 | # ignored dirs 12 | build 13 | dist 14 | *.egg-info 15 | .out-of-repo/ 16 | 17 | # ignored files 18 | distribute-* 19 | *eggs 20 | 21 | # ipython files 22 | .ipynb_checkpoints 23 | /*.ipynb 24 | 25 | *.prefs 26 | .pydevproject 27 | .idea 28 | 29 | # doc dirs and auto generated files 30 | doc/src/_build 31 | 32 | # auto compiled files 33 | rankeval/scoring/_efficient_scoring.c 34 | rankeval/analysis/_efficient_topological.c 35 | rankeval/analysis/_efficient_feature.cpp 36 | -------------------------------------------------------------------------------- /rankeval/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | The :mod:`rankeval.dataset` module includes utilities to load datasets 10 | and dump datasets according to several supported formats. 11 | """ 12 | 13 | from .dataset import Dataset 14 | from .dataset_container import DatasetContainer 15 | 16 | __all__ = ['Dataset', 17 | 'DatasetContainer'] -------------------------------------------------------------------------------- /doc/src/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = RankEval 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /rankeval/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | The :mod:`rankeval.analysis` module implements the functionalities for analysing the 10 | behaviour of several ranking models with respect to several metrics and 11 | datasets. It proposes a comprehensive set of analysis for tuning, evaluating 12 | and comparing Gradient Boosted Regression Tree models devoted to learning a 13 | ranking function. 14 | """ -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | History 2 | ------- 3 | 4 | This project was started in 2017 as a demo presented at the 40th International 5 | ACM SIGIR Conference on Research and Development in Information Retrieval in Tokyo (Japan). 6 | 7 | People 8 | ------ 9 | 10 | The following people have been core contributors to rankeval's development and maintenance: 11 | 12 | * Claudio Lucchese - HPC-ISTI researcher - Pisa (Italy) 13 | * Franco Maria Nardini - HPC-ISTI researcher - Pisa (Italy) 14 | * Cristina Muntean - HPC-ISTI researcher - Pisa (Italy) 15 | * Salvatore Trani - HPC-ISTI researcher - Pisa (Italy) 16 | 17 | Please do not email the authors directly to ask for assistance or report issues. 18 | Instead, please use GitHub issues or email to `rankeval@isti.cnr.it` for requests 19 | and information. -------------------------------------------------------------------------------- /rankeval/dataset/dataset_container.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Franco Maria Nardini 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | class DatasetContainer(object): 9 | """ 10 | This class is a container used to easily manage a dataset and associated 11 | learning to rank models trained by using it. It also offers the possibility 12 | to store the license coming with public dataset. 13 | """ 14 | train_dataset = None 15 | validation_dataset = None 16 | test_dataset = None 17 | 18 | license_agreement = '' 19 | 20 | model_filenames = None -------------------------------------------------------------------------------- /rankeval/test/model/test_proxy_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from rankeval.model import RTEnsemble 6 | from rankeval.test.base import data_dir 7 | 8 | model_file = os.path.join(data_dir, "quickrank.model.xml") 9 | 10 | 11 | class ProxyModelTestCase(unittest.TestCase): 12 | 13 | def test_not_supported_model(self): 14 | try: 15 | RTEnsemble(model_file, format="unsupported") 16 | # if we reach the code below, it means the constructor 17 | # has not failed...raise error! 
18 | assert False 19 | except TypeError: 20 | pass 21 | 22 | 23 | if __name__ == '__main__': 24 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 25 | level=logging.DEBUG) 26 | unittest.main() 27 | -------------------------------------------------------------------------------- /rankeval/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package serves as the root of the RankEval package. 3 | """ 4 | 5 | import os 6 | import io 7 | 8 | cur_dir = os.path.dirname(__file__) 9 | 10 | __version__ = io.open(os.path.join(cur_dir, 'VERSION'), 11 | encoding='utf-8').read().strip() 12 | 13 | 14 | def is_notebook(): 15 | try: 16 | from IPython import get_ipython 17 | shell = get_ipython().__class__.__name__ 18 | if shell == 'ZMQInteractiveShell': 19 | return True # Jupyter notebook or qtconsole 20 | elif shell == 'TerminalInteractiveShell': 21 | return False # Terminal running IPython 22 | else: 23 | return False # Other type (?) 24 | except (NameError, ImportError): 25 | return False # Probably standard Python interpreter 26 | -------------------------------------------------------------------------------- /doc/src/rankeval.visualization.rst: -------------------------------------------------------------------------------- 1 | rankeval.visualization package 2 | ============================== 3 | 4 | .. automodule:: rankeval.visualization 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | rankeval.visualization.effectiveness module 13 | ------------------------------------------- 14 | 15 | .. automodule:: rankeval.visualization.effectiveness 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | rankeval.visualization.feature module 21 | ------------------------------------- 22 | 23 | .. automodule:: rankeval.visualization.feature 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | rankeval.visualization.topological module 29 | ----------------------------------------- 30 | 31 | .. automodule:: rankeval.visualization.topological 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | 37 | -------------------------------------------------------------------------------- /rankeval/test/dataset/test_dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from rankeval.dataset import Dataset 6 | from ..base import data_dir 7 | 8 | datafile = os.path.join(data_dir, "msn1.fold1.test.5k.txt") 9 | 10 | 11 | class SVMLightLoaderTestCase(unittest.TestCase): 12 | 13 | def test_svmlight_dataset(self): 14 | try: 15 | dataset = Dataset.load(datafile, format="svmlight") 16 | except TypeError: 17 | assert False 18 | 19 | def test_not_supported_dataset(self): 20 | try: 21 | Dataset.load(datafile, format="unsupported") 22 | # if we reach the code below, it means the constructor has not failed...raise error! 
23 | assert False 24 | except TypeError: 25 | pass 26 | 27 | if __name__ == '__main__': 28 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 29 | unittest.main() 30 | -------------------------------------------------------------------------------- /doc/src/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=RankEval 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /rankeval/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | The :mod:`rankeval.model` module includes utilities to load a model 10 | and dump it according to several supported model's format. 11 | """ 12 | 13 | from .proxy_LightGBM import ProxyLightGBM 14 | from .proxy_QuickRank import ProxyQuickRank 15 | from .proxy_ScikitLearn import ProxyScikitLearn 16 | from .proxy_XGBoost import ProxyXGBoost 17 | from .proxy_Jforests import ProxyJforests 18 | from .proxy_CatBoost import ProxyCatBoost 19 | from .rt_ensemble import RTEnsemble 20 | 21 | __all__ = ['RTEnsemble', 22 | 'ProxyQuickRank', 23 | 'ProxyLightGBM', 24 | 'ProxyXGBoost', 25 | 'ProxyScikitLearn', 26 | 'ProxyJforests', 27 | 'ProxyCatBoost' 28 | ] 29 | -------------------------------------------------------------------------------- /doc/src/rankeval.analysis.rst: -------------------------------------------------------------------------------- 1 | rankeval.analysis package 2 | ========================= 3 | 4 | .. automodule:: rankeval.analysis 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | rankeval.analysis.effectiveness module 13 | -------------------------------------- 14 | 15 | .. automodule:: rankeval.analysis.effectiveness 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | rankeval.analysis.feature module 21 | -------------------------------- 22 | 23 | .. automodule:: rankeval.analysis.feature 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | rankeval.analysis.statistical module 29 | ------------------------------------ 30 | 31 | .. 
automodule:: rankeval.analysis.statistical 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | rankeval.analysis.topological module 37 | ------------------------------------ 38 | 39 | .. automodule:: rankeval.analysis.topological 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | 45 | -------------------------------------------------------------------------------- /doc/src/index.rst: -------------------------------------------------------------------------------- 1 | RankEval -- Analysis and evaluation of Learning-to-Rank models 2 | ============================================================== 3 | 4 | RankEval is a Python library for the *analysis* and *evaluation* of Learning-to-Rank models 5 | based on ensembles of regression trees. 6 | Target audience includes the *machine learning* (ML) and *information retrieval* (IR) communities. 7 | 8 | Citing RankEval 9 | --------------- 10 | 11 | Please cite:: 12 | 13 | @inproceedings{rankeval-sigir17, 14 | author = {Claudio Lucchese and Cristina Ioana Muntean and Franco Maria Nardini and 15 | Raffaele Perego and Salvatore Trani}, 16 | title = {RankEval: An Evaluation and Analysis Framework for Learning-to-Rank Solutions}, 17 | booktitle = {SIGIR 2017: Proceedings of the 40th International {ACM} {SIGIR} 18 | Conference on Research and Development in Information Retrieval}, 19 | year = {2017}, 20 | location = {Tokyo, Japan} 21 | } 22 | 23 | 24 | .. toctree:: 25 | :maxdepth: 1 26 | :caption: Contents: 27 | 28 | rankeval 29 | 30 | 31 | Indices and tables 32 | ================== 33 | 34 | * :ref:`genindex` 35 | * :ref:`modindex` 36 | * :ref:`search` 37 | -------------------------------------------------------------------------------- /doc/src/static-index.rst: -------------------------------------------------------------------------------- 1 | RankEval -- Analysis and evaluation of Learning-to-Rank models 2 | ============================================================== 3 | 4 | RankEval is a Python library for the *analysis* and *evaluation* of Learning-to-Rank models 5 | based on ensembles of regression trees. 6 | Target audience includes the *machine learning* (ML) and *information retrieval* (IR) communities. 7 | 8 | Citing RankEval 9 | --------------- 10 | 11 | Please cite:: 12 | 13 | @inproceedings{rankeval-sigir17, 14 | author = {Claudio Lucchese and Cristina Ioana Muntean and Franco Maria Nardini and 15 | Raffaele Perego and Salvatore Trani}, 16 | title = {RankEval: An Evaluation and Analysis Framework for Learning-to-Rank Solutions}, 17 | booktitle = {SIGIR 2017: Proceedings of the 40th International {ACM} {SIGIR} 18 | Conference on Research and Development in Information Retrieval}, 19 | year = {2017}, 20 | location = {Tokyo, Japan} 21 | } 22 | 23 | 24 | .. 
toctree:: 25 | :maxdepth: 1 26 | :caption: Contents: 27 | 28 | rankeval 29 | 30 | 31 | Indices and tables 32 | ================== 33 | 34 | * :ref:`genindex` 35 | * :ref:`modindex` 36 | * :ref:`search` 37 | -------------------------------------------------------------------------------- /rankeval/test/data/Jforests.model.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 129 129 107 72 55 54 4 | 1 3 5 -1 -3 -2 5 | 2 4 -4 -5 -6 -7 6 | 14884 14826 11171 485 320 1214 7 | 268.00791914235936 265.01449371350714 13.917409408136482 19.112358794482084 0.009766221082829762 0.018525300616492706 8 | -1.2156555533251343 -0.2370371246276912 -1.9329095710207922 0.8030836898094491 -0.010194310883442196 -1.939557007673878 0.584062922565639 9 | 10 | 11 | 133 72 105 130 62 121 12 | 1 4 -3 -4 -1 -6 13 | -2 2 3 -5 5 -7 14 | 0 543 7553 15359 10922 31960 15 | 0.0 21.397960464750046 13.263678873222243 181.01421193338518 0.3333333333333333 -1.5976164200695848 16 | -0.14431346869079953 1.3819800608689703 1.770799198202688 1.7353641847897887 0.22409245535128064 -0.3769555284258838 -1.793789588633319 17 | 18 | 19 | -------------------------------------------------------------------------------- /rankeval/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | The :mod:`rankeval.metrics` module includes the definition and implementation of 10 | the most common metrics adopted in the Learning to Rank community. 11 | """ 12 | 13 | from .metric import Metric 14 | from .precision import Precision 15 | from .recall import Recall 16 | from .ndcg import NDCG 17 | from .dcg import DCG 18 | from .err import ERR 19 | from .kendall_tau import Kendalltau 20 | from .map import MAP 21 | from .mrr import MRR 22 | from .pfound import Pfound 23 | from .rbp import RBP 24 | from .mse import MSE 25 | from .rmse import RMSE 26 | from .spearman_rho import SpearmanRho 27 | 28 | __all__ = ['Metric', 29 | 'Precision', 30 | 'Recall', 31 | 'NDCG', 32 | 'DCG', 33 | 'ERR', 34 | 'Kendalltau', 35 | 'MAP', 36 | 'MRR', 37 | 'Pfound', 38 | 'RBP', 39 | 'MSE', 40 | 'RMSE', 41 | 'SpearmanRho'] 42 | -------------------------------------------------------------------------------- /doc/src/rankeval.dataset.rst: -------------------------------------------------------------------------------- 1 | rankeval.dataset package 2 | ======================== 3 | 4 | .. automodule:: rankeval.dataset 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | rankeval.dataset.dataset module 13 | ------------------------------- 14 | 15 | .. automodule:: rankeval.dataset.dataset 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | rankeval.dataset.dataset\_container module 21 | ------------------------------------------ 22 | 23 | .. automodule:: rankeval.dataset.dataset_container 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | rankeval.dataset.datasets\_fetcher module 29 | ----------------------------------------- 30 | 31 | .. 
automodule:: rankeval.dataset.datasets_fetcher 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | rankeval.dataset.svmlight\_format module 37 | ---------------------------------------- 38 | 39 | .. automodule:: rankeval.dataset.svmlight_format 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | rankeval.dataset.write\_json\_dataset\_catalogue module 45 | ------------------------------------------------------- 46 | 47 | .. automodule:: rankeval.dataset.write_json_dataset_catalogue 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | 52 | 53 | -------------------------------------------------------------------------------- /rankeval/test/data/quickrank.model.v2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | MART 4 | 2 5 | 3 6 | 0.10000000149011612 7 | 1 8 | 0 9 | 100 10 | 11 | 12 | 13 | 14 | 108 15 | 14.895151138305664 16 | 17 | 115 18 | -8.0245580673217773 19 | 20 | 0.3412887828162291 21 | 22 | 23 | 0.66845277963831218 24 | 25 | 26 | 27 | 0.96317280453257792 28 | 29 | 30 | 31 | 32 | 33 | 8 34 | 0.66666698455810547 35 | 36 | 0.5 37 | 38 | 39 | 106 40 | 17.0 41 | 42 | 0.5 43 | 44 | 45 | 1.0 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /rankeval/test/data/quickrank.model.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | MART 4 | 2 5 | 3 6 | 0.10000000149011612 7 | 1 8 | 0 9 | 100 10 | 11 | 12 | 13 | 14 | 108 15 | 14.895151138305664 16 | 17 | 115 18 | -8.0245580673217773 19 | 20 | 0.3412887828162291 21 | 22 | 23 | 0.66845277963831218 24 | 25 | 26 | 27 | 0.96317280453257792 28 | 29 | 30 | 31 | 32 | 33 | 8 34 | 0.66666698455810547 35 | 36 | 0.37133907932286642 37 | 38 | 39 | 106 40 | 17.135160446166992 41 | 42 | 0.54762687170967062 43 | 44 | 45 | 0.98651670670179537 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Steps for contributing 2 | 3 | Fixing a bug you found in RankEval? Suggesting a feature? Listed here are some guidelines to keep in mind when contributing. 4 | 5 | 1. **Open an issue** along with a detailed explanation. For bug reports, include the code to reproduce the bug. For feature requests, explain why you think the feature could be useful. 6 | 7 | 2. **Fork the repository**. If you're contributing code, clone the forked repository to your local machine. 8 | 9 | 3. **Run the tests** to make sure they pass on your machine. Simply run `nosetests` at the root folder and make sure all tests pass. 10 | 11 | 4. **Create a new branch**. Please do not commit directly to the master branch. Create your own branch and place your additions there. 12 | 13 | 5. **Write your code**. For Python, please follow PEP8 coding standards. Also, if you're adding a function, you must [write a docstring using the Numpy format](http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_numpy.html#example-numpy) detailing the API of your function. Take a look at the docstrings of the other RankEval functions to get an idea of what the docstring of yours should look like. 14 | 15 | 6. **Write/modify the corresponding unit tests**. After adding in your code and the corresponding unit tests, run `nosetests` again to make sure they pass. 16 | 17 | 7. **Submit a pull request**. 
After submitting a PR, if all tests pass, your code will be reviewed and merged promptly. 18 | 19 | Thank you for taking the time to make RankEval better! -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | DOCDIR=./src 2 | SRCDIR=../rankeval 3 | 4 | # documentation is compiled by using sphinx 5 | # excluded from documentation 6 | DOCEXCLUDED=../rankeval/test 7 | 8 | ### Handling Sphinx for generating documentation 9 | .PHONY: doc 10 | doc: 11 | @echo "===================================" 12 | @echo "Producing documentation..." 13 | 14 | # generate sphinx data 15 | sphinx-apidoc -o $(DOCDIR) -d 1 -f -F -M -H "RankEval" -A "HPC Lab" -V 0 -R 0.00 $(SRCDIR) $(DOCEXCLUDED) 16 | @cp $(DOCDIR)/static-index.rst $(DOCDIR)/index.rst 17 | 18 | # customize sphinx generation 19 | @echo "# custom" >> $(DOCDIR)/conf.py 20 | @echo "extensions += ['sphinx.ext.todo']" >> $(DOCDIR)/conf.py 21 | @echo "todo_include_todos = True" >> $(DOCDIR)/conf.py 22 | #@echo "extensions += ['numpydoc']" >> $(DOCDIR)/conf.py 23 | #@echo "extensions += ['sphinxcontrib.napoleon']" >> doc/conf.py 24 | @echo "extensions += ['sphinx.ext.autosummary']" >> $(DOCDIR)/conf.py 25 | @echo "extensions += ['sphinx.ext.imgmath']" >> $(DOCDIR)/conf.py 26 | @echo "numpydoc_show_class_members = False" >> $(DOCDIR)/conf.py 27 | # customize themes 28 | @echo "html_theme = \"sphinx_rtd_theme\"" >> $(DOCDIR)/conf.py 29 | 30 | @echo "import sys,os" >> $(DOCDIR)/conf.py 31 | @echo "sys.path.insert(0, os.path.abspath('../../') )" >> $(DOCDIR)/conf.py 32 | #@echo "print (sys.path)" >> $(DOCDIR)/conf.py 33 | 34 | @echo "from setuptools import sandbox" >> $(DOCDIR)/conf.py 35 | @echo "sandbox.run_setup(os.path.abspath('../../setup.py'), ['build_ext','-i'])" >> $(DOCDIR)/conf.py 36 | 37 | @echo "autoclass_content = 'both'" >> $(DOCDIR)/conf.py 38 | 39 | # compile HTML files 40 | make -C $(DOCDIR) html 41 | -------------------------------------------------------------------------------- /doc/src/rankeval.model.rst: -------------------------------------------------------------------------------- 1 | rankeval.model package 2 | ====================== 3 | 4 | .. automodule:: rankeval.model 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | rankeval.model.proxy\_CatBoost module 13 | ------------------------------------- 14 | 15 | .. automodule:: rankeval.model.proxy_CatBoost 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | rankeval.model.proxy\_Jforests module 21 | ------------------------------------- 22 | 23 | .. automodule:: rankeval.model.proxy_Jforests 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | rankeval.model.proxy\_LightGBM module 29 | ------------------------------------- 30 | 31 | .. automodule:: rankeval.model.proxy_LightGBM 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | rankeval.model.proxy\_QuickRank module 37 | -------------------------------------- 38 | 39 | .. automodule:: rankeval.model.proxy_QuickRank 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | rankeval.model.proxy\_ScikitLearn module 45 | ---------------------------------------- 46 | 47 | .. automodule:: rankeval.model.proxy_ScikitLearn 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | 52 | rankeval.model.proxy\_XGBoost module 53 | ------------------------------------ 54 | 55 | .. 
automodule:: rankeval.model.proxy_XGBoost 56 | :members: 57 | :undoc-members: 58 | :show-inheritance: 59 | 60 | rankeval.model.rt\_ensemble module 61 | ---------------------------------- 62 | 63 | .. automodule:: rankeval.model.rt_ensemble 64 | :members: 65 | :undoc-members: 66 | :show-inheritance: 67 | 68 | 69 | -------------------------------------------------------------------------------- /rankeval/analysis/_efficient_feature_impl.h: -------------------------------------------------------------------------------- 1 | class TreeNode { 2 | public: 3 | unsigned int node_id; 4 | unsigned int start_id; 5 | unsigned int end_id; 6 | 7 | TreeNode(unsigned int node_id, 8 | unsigned int start_id, 9 | unsigned int end_id) : 10 | node_id(node_id), 11 | start_id(start_id), 12 | end_id(end_id) {} 13 | 14 | int get_n_instances() { 15 | return end_id - start_id + 1; 16 | } 17 | }; 18 | 19 | void c_feature_importance( 20 | const float* X, 21 | const float* y, 22 | const int* trees_root, 23 | const float* trees_weight, 24 | const short* trees_nodes_feature, 25 | const float* trees_nodes_value, 26 | const int* trees_left_child, 27 | const int* trees_right_child, 28 | float* feature_imp, 29 | short* feature_count, 30 | const int n_instances, 31 | const int n_features, 32 | const int n_trees); 33 | 34 | void c_feature_importance_tree( 35 | const float* X, 36 | const float* y, 37 | const int* trees_root, 38 | const float* trees_weight, 39 | const short* trees_nodes_feature, 40 | const float* trees_nodes_value, 41 | const int* trees_left_child, 42 | const int* trees_right_child, 43 | const int tree_id, 44 | float* feature_imp, 45 | short* feature_count, 46 | const int n_instances, 47 | const int n_features, 48 | float* y_pred, 49 | float* y_pred_tree); 50 | 51 | inline bool is_leaf_node(int node_id, 52 | const int* trees_left_child, 53 | const int* trees_right_child) { 54 | return trees_left_child[node_id] == -1 && trees_right_child[node_id] == -1; 55 | } -------------------------------------------------------------------------------- /rankeval/test/analysis/test_statistical.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | import numpy as np 6 | from numpy.testing import assert_allclose 7 | 8 | from rankeval.analysis.statistical import _randomization 9 | from rankeval.analysis.statistical import statistical_significance 10 | from rankeval.dataset import Dataset 11 | from rankeval.metrics.ndcg import NDCG 12 | from rankeval.model import RTEnsemble 13 | from ..base import data_dir 14 | 15 | 16 | class StatisticalSignificanceTestCase(unittest.TestCase): 17 | 18 | def setUp(self): 19 | self.model_a = RTEnsemble( 20 | os.path.join(data_dir, "quickrank.model.xml"), format="QuickRank") 21 | self.model_b = RTEnsemble( 22 | os.path.join(data_dir, "quickrank.model.v2.xml"), format="QuickRank") 23 | self.dataset = Dataset.load( 24 | os.path.join(data_dir, "msn1.fold1.test.5k.txt"), format="svmlight") 25 | self.metric = NDCG() 26 | 27 | def tearDown(self): 28 | del self.model_a 29 | self.model_a = None 30 | del self.model_b 31 | self.model_b = None 32 | del self.dataset 33 | self.dataset = None 34 | del self.metric 35 | self.metric = None 36 | 37 | def test_statistical_significance(self): 38 | statistical_significance([self.dataset], self.model_a, self.model_b, 39 | [self.metric], n_perm=100) 40 | 41 | def test_randomization(self): 42 | A = np.array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0]) 43 | B = np.array([0, 0, 0, 0, 0, 0, 0, 1, 
1, 1]) 44 | n_perm = 20000 45 | p1, p2 = _randomization( A, B, n_perm) 46 | # expected value computed with https://github.com/searchivarius/PermTest 47 | assert_allclose(p2, .34, atol=0.02) 48 | 49 | 50 | if __name__ == '__main__': 51 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 52 | unittest.main() 53 | -------------------------------------------------------------------------------- /rankeval/metrics/spearman_rho.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import scipy.stats as stats 9 | from rankeval.metrics.metric import Metric 10 | 11 | 12 | class SpearmanRho(Metric): 13 | """ 14 | This class implements Spearman's Rho. 15 | We use the Spearman Rho coefficient implementation from scipy. 16 | 17 | """ 18 | 19 | def __init__(self, name='Rho'): 20 | """ 21 | This is the constructor of Spearman Rho, an object of type Metric, with 22 | the name Rho. The constructor also allows setting custom values in the 23 | following parameters. 24 | 25 | Parameters 26 | ---------- 27 | name: string 28 | Rho 29 | """ 30 | super(SpearmanRho, self).__init__(name) 31 | 32 | def eval(self, dataset, y_pred): 33 | """ 34 | This method computes the Spearman Rho score over the entire dataset 35 | and the detailed scores per query. It calls the eval_per_query method 36 | for each query in order to get the detailed Spearman Rho score. 37 | 38 | Parameters 39 | ---------- 40 | dataset : Dataset 41 | Represents the Dataset object on which to apply Spearman Rho. 42 | y_pred : numpy 1d array of float 43 | Represents the predicted document scores for each instance in the 44 | dataset. 45 | 46 | Returns 47 | ------- 48 | avg_score: float 49 | The overall Spearman Rho score (averages over the detailed scores). 50 | detailed_scores: numpy 1d array of floats 51 | The detailed Spearman Rho scores for each query, an array of length 52 | of the number of queries. 53 | """ 54 | return super(SpearmanRho, self).eval(dataset, y_pred) 55 | 56 | def eval_per_query(self, y, y_pred): 57 | """ 58 | This method computes Spearman Rho at the per-query level (on the instances 59 | belonging to a specific query). 60 | 61 | Parameters 62 | ---------- 63 | y: numpy array 64 | Represents the labels of instances corresponding to one query in the 65 | dataset (ground truth). 66 | y_pred: numpy array. 67 | Represents the predicted document scores obtained during the model 68 | scoring phase for that query. 69 | 70 | Returns 71 | ------- 72 | rho: float 73 | The Spearman Rho per query. 74 | """ 75 | spearman_rho = stats.spearmanr(y, y_pred) 76 | return spearman_rho.correlation 77 | 78 | def __str__(self): 79 | s = self.name 80 | return s 81 | -------------------------------------------------------------------------------- /doc/src/rankeval.metrics.rst: -------------------------------------------------------------------------------- 1 | rankeval.metrics package 2 | ======================== 3 | 4 | .. automodule:: rankeval.metrics 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | rankeval.metrics.dcg module 13 | --------------------------- 14 | 15 | .. 
automodule:: rankeval.metrics.dcg 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | rankeval.metrics.err module 21 | --------------------------- 22 | 23 | .. automodule:: rankeval.metrics.err 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | rankeval.metrics.kendall\_tau module 29 | ------------------------------------ 30 | 31 | .. automodule:: rankeval.metrics.kendall_tau 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | rankeval.metrics.map module 37 | --------------------------- 38 | 39 | .. automodule:: rankeval.metrics.map 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | rankeval.metrics.metric module 45 | ------------------------------ 46 | 47 | .. automodule:: rankeval.metrics.metric 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | 52 | rankeval.metrics.mrr module 53 | --------------------------- 54 | 55 | .. automodule:: rankeval.metrics.mrr 56 | :members: 57 | :undoc-members: 58 | :show-inheritance: 59 | 60 | rankeval.metrics.mse module 61 | --------------------------- 62 | 63 | .. automodule:: rankeval.metrics.mse 64 | :members: 65 | :undoc-members: 66 | :show-inheritance: 67 | 68 | rankeval.metrics.ndcg module 69 | ---------------------------- 70 | 71 | .. automodule:: rankeval.metrics.ndcg 72 | :members: 73 | :undoc-members: 74 | :show-inheritance: 75 | 76 | rankeval.metrics.pfound module 77 | ------------------------------ 78 | 79 | .. automodule:: rankeval.metrics.pfound 80 | :members: 81 | :undoc-members: 82 | :show-inheritance: 83 | 84 | rankeval.metrics.precision module 85 | --------------------------------- 86 | 87 | .. automodule:: rankeval.metrics.precision 88 | :members: 89 | :undoc-members: 90 | :show-inheritance: 91 | 92 | rankeval.metrics.rbp module 93 | --------------------------- 94 | 95 | .. automodule:: rankeval.metrics.rbp 96 | :members: 97 | :undoc-members: 98 | :show-inheritance: 99 | 100 | rankeval.metrics.recall module 101 | ------------------------------ 102 | 103 | .. automodule:: rankeval.metrics.recall 104 | :members: 105 | :undoc-members: 106 | :show-inheritance: 107 | 108 | rankeval.metrics.rmse module 109 | ---------------------------- 110 | 111 | .. automodule:: rankeval.metrics.rmse 112 | :members: 113 | :undoc-members: 114 | :show-inheritance: 115 | 116 | rankeval.metrics.spearman\_rho module 117 | ------------------------------------- 118 | 119 | .. 
automodule:: rankeval.metrics.spearman_rho 120 | :members: 121 | :undoc-members: 122 | :show-inheritance: 123 | 124 | 125 | -------------------------------------------------------------------------------- /rankeval/test/analysis/test_feature.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | import numpy as np 6 | from numpy.testing import assert_array_almost_equal, assert_allclose, \ 7 | assert_array_equal 8 | 9 | from rankeval.analysis.feature import feature_importance, \ 10 | _feature_importance_tree 11 | from rankeval.dataset import Dataset 12 | from rankeval.metrics import MSE 13 | from rankeval.model import RTEnsemble 14 | from ..base import data_dir 15 | 16 | 17 | class FeatureImportanceTestCase(unittest.TestCase): 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | cls.model = RTEnsemble( 22 | os.path.join(data_dir, "quickrank.model.xml"), 23 | format="QuickRank") 24 | cls.dataset = Dataset.load( 25 | os.path.join(data_dir, "msn1.fold1.train.5k.txt"), 26 | format="svmlight") 27 | 28 | @classmethod 29 | def tearDownClass(cls): 30 | del cls.model 31 | cls.model = None 32 | del cls.dataset 33 | cls.dataset = None 34 | 35 | def test_feature_importance(self): 36 | feature_imp, feature_cnt = feature_importance( 37 | self.model, self.dataset, normalize=False) 38 | 39 | features = [7, 105, 107, 114] 40 | assert_allclose(feature_imp[features], 41 | [0.0405271754093, 0.0215954124466, 42 | 0.0478155618964, 0.018661751695], 43 | atol=1e-6) 44 | 45 | assert_array_equal(feature_cnt[features], 46 | [1, 1, 1, 1]) 47 | 48 | def test_scoring_feature_importance(self): 49 | 50 | # default scores on the root node of the first tree 51 | y_pred = np.zeros(self.dataset.n_instances, dtype=np.float32) 52 | 53 | # initialize features importance 54 | feature_imp = np.zeros(self.dataset.n_features, dtype=np.float32) 55 | 56 | # initialize features count 57 | feature_count = np.zeros(self.dataset.n_features, dtype=np.uint16) 58 | 59 | y_pred_m, partial_y_pred, y_leaves = \ 60 | self.model.score(self.dataset, detailed=True, cache=True) 61 | 62 | metric = MSE() 63 | 64 | for tree_id in np.arange(self.model.n_trees): 65 | y_pred_tree = _feature_importance_tree(self.model, self.dataset, 66 | tree_id, y_pred, metric, 67 | feature_imp, feature_count) 68 | # y_pred_tree *= self.model.trees_weight[tree_id] 69 | 70 | # Check the partial scores of each tree are compatible with 71 | # traditional scoring 72 | assert_allclose(y_pred_tree, 73 | partial_y_pred[:, tree_id], 74 | atol=1e-6) 75 | 76 | # Check the usual scoring and the scoring performed by analyzing also 77 | # the feature importance compute the same predictions 78 | assert_array_almost_equal(y_pred, y_pred_m) 79 | 80 | 81 | if __name__ == '__main__': 82 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /rankeval/metrics/rmse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
7 | 8 | import numpy as np 9 | from rankeval.metrics import Metric, MSE 10 | 11 | 12 | class RMSE(Metric): 13 | """ 14 | This class implements Root mean squared error (RMSE) with 15 | several parameters. 16 | 17 | """ 18 | def __init__(self, name='RMSE', cutoff=None): 19 | """ 20 | This is the constructor of RMSE, an object of type Metric, with the 21 | name RMSE. The constructor also allows setting custom values in the 22 | following parameters. 23 | 24 | Parameters 25 | ---------- 26 | name: string 27 | RMSE 28 | cutoff: int 29 | The top k results to be considered at per query level (e.g. 10), 30 | otherwise the default value is None and is computed on all the 31 | instances of a query. 32 | """ 33 | super(self.__class__, self).__init__(name) 34 | self.cutoff = cutoff 35 | self._mse = MSE(cutoff=cutoff) 36 | 37 | def eval(self, dataset, y_pred): 38 | """ 39 | This method takes the RMSE for each query and calculates 40 | the average RMSE. 41 | 42 | Parameters 43 | ---------- 44 | dataset : Dataset 45 | Represents the Dataset object on which to apply RMSE. 46 | y_pred : numpy 1d array of float 47 | Represents the predicted document scores for each instance 48 | in the dataset. 49 | 50 | Returns 51 | ------- 52 | avg_score: float 53 | The overall RMSE score (averages over the detailed RMSE scores). 54 | detailed_scores: numpy 1d array of floats 55 | The detailed RMSE@k scores for each query, an array of length of 56 | the number of queries. 57 | """ 58 | return super(self.__class__, self).eval(dataset, y_pred) 59 | 60 | def eval_per_query(self, y, y_pred): 61 | """ 62 | This method helps compute the RMSE score per query. It is called by 63 | the eval function which averages and aggregates the scores 64 | for each query. 65 | 66 | Parameters 67 | ---------- 68 | y: numpy array 69 | Represents the labels of instances corresponding to one query in 70 | the dataset (ground truth). 71 | y_pred: numpy array. 72 | Represents the predicted document scores obtained during the model 73 | scoring phase for that query. 74 | 75 | Returns 76 | ------- 77 | rmse: float 78 | Represents the RMSE score for one query. 
79 | """ 80 | mse = self._mse.eval_per_query(y, y_pred) 81 | return np.sqrt(mse) 82 | 83 | def __str__(self): 84 | s = self.name 85 | if self.cutoff is not None: 86 | s += "@{}".format(self.cutoff) 87 | return s 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | 3 | cache: 4 | pip: true 5 | 6 | os: 7 | - linux 8 | - osx 9 | 10 | dist: xenial 11 | 12 | language: generic 13 | 14 | env: 15 | - TASK=sdist PYTHON_VERSION="2.7" 16 | - TASK=sdist PYTHON_VERSION="3.5" 17 | - TASK=sdist PYTHON_VERSION="3.6" 18 | - TASK=sdist PYTHON_VERSION="3.7" 19 | 20 | - TASK=bdist PYTHON_VERSION="2.7" 21 | - TASK=bdist PYTHON_VERSION="3.5" 22 | - TASK=bdist PYTHON_VERSION="3.6" 23 | - TASK=bdist PYTHON_VERSION="3.7" 24 | 25 | before_install: 26 | - export BUILD_DIRECTORY="$TRAVIS_BUILD_DIR" 27 | - if [[ $TRAVIS_OS_NAME == "osx" ]]; then 28 | export OS_NAME="macos"; 29 | if test -z "$COMPILER"; then 30 | export COMPILER="clang"; 31 | fi 32 | else 33 | export OS_NAME="linux"; 34 | export COMPILER="gcc"; 35 | fi 36 | 37 | install: 38 | - source build_tools/travis/setup.sh 39 | 40 | script: 41 | - bash build_tools/travis/test.sh 42 | 43 | jobs: 44 | include: 45 | - stage: test 46 | os: osx 47 | env: 48 | COMPILER=gcc 49 | PYTHON_VERSION="3.7" 50 | 51 | - stage: deploy 52 | if: tag IS present 53 | os: osx 54 | env: 55 | TASK=bdist 56 | PYTHON_VERSION="2.7" 57 | script: 58 | - build_tools/travis/deploy.sh 59 | 60 | - stage: deploy 61 | if: tag IS present 62 | os: osx 63 | env: 64 | TASK=bdist 65 | PYTHON_VERSION="3.5" 66 | script: 67 | - build_tools/travis/deploy.sh 68 | 69 | - stage: deploy 70 | if: tag IS present 71 | os: osx 72 | env: 73 | TASK=bdist 74 | PYTHON_VERSION="3.6" 75 | script: 76 | - build_tools/travis/deploy.sh 77 | 78 | - stage: deploy 79 | if: tag IS present 80 | os: osx 81 | env: 82 | TASK=bdist 83 | PYTHON_VERSION="3.7" 84 | script: 85 | - build_tools/travis/deploy.sh 86 | 87 | - stage: deploy 88 | if: tag IS present 89 | sudo: required 90 | services: 91 | - docker 92 | env: 93 | DOCKER_IMAGE=quay.io/pypa/manylinux1_x86_64 94 | PLAT=manylinux1_x86_64 95 | before_install: skip 96 | install: 97 | - docker pull $DOCKER_IMAGE 98 | script: 99 | - docker run --rm -e PLAT=$PLAT -e PYPI_USER=$PYPI_USER -e PYPI_PASS=$PYPI_PASS -v `pwd`:/io $DOCKER_IMAGE $PRE_CMD /io/build_tools/build_wheels.sh 100 | - ls wheelhouse/ 101 | 102 | - stage: deploy 103 | if: tag IS present 104 | sudo: required 105 | services: 106 | - docker 107 | env: 108 | DOCKER_IMAGE=quay.io/pypa/manylinux2010_x86_64 109 | PLAT=manylinux2010_x86_64 110 | before_install: skip 111 | install: 112 | - docker pull $DOCKER_IMAGE 113 | script: 114 | - docker run --rm -e PLAT=$PLAT -e PYPI_USER=$PYPI_USER -e PYPI_PASS=$PYPI_PASS -v `pwd`:/io $DOCKER_IMAGE $PRE_CMD /io/build_tools/build_wheels.sh 115 | - ls wheelhouse/ 116 | 117 | notifications: 118 | slack: 119 | rooms: $SLACK_TOKEN 120 | on_success: change 121 | on_failure: always -------------------------------------------------------------------------------- /rankeval/test/scoring/test_scoring.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from numpy.testing import assert_array_almost_equal, assert_almost_equal 6 | 7 | from rankeval.dataset import Dataset 8 | from rankeval.model import RTEnsemble 9 | from rankeval.scoring.scorer import 
Scorer 10 | from rankeval.test.base import data_dir 11 | 12 | model_file = os.path.join(data_dir, "quickrank.model.xml") 13 | data_file = os.path.join(data_dir, "msn1.fold1.test.5k.txt") 14 | 15 | 16 | class ScoringTestCase(unittest.TestCase): 17 | 18 | @classmethod 19 | def setUpClass(cls): 20 | cls.model = RTEnsemble(model_file, format="QuickRank") 21 | cls.dataset = Dataset.load(data_file, format="svmlight") 22 | cls.scorer = Scorer(cls.model, cls.dataset) 23 | 24 | @classmethod 25 | def tearDownClass(cls): 26 | del cls.model 27 | cls.model = None 28 | del cls.dataset 29 | cls.dataset = None 30 | del cls.scorer 31 | cls.scorer = None 32 | 33 | def test_basic_scoring_values(self): 34 | self.scorer.score(detailed=False) 35 | assert_array_almost_equal(self.scorer.get_predicted_scores()[:3], 36 | [0.16549695, 0.07126279, 0.10397919]) 37 | assert_array_almost_equal(self.scorer.get_predicted_scores()[-3:], 38 | [0.13345119, 0.13345119, 0.07126279]) 39 | 40 | def test_basic_scoring_sum(self): 41 | self.scorer.score(detailed=False) 42 | assert_almost_equal(self.scorer.get_predicted_scores().sum(), 43 | 598.72852, decimal=5) 44 | 45 | def test_detailed_scoring_values(self): 46 | self.scorer.score(detailed=True) 47 | assert_array_almost_equal( 48 | self.scorer.get_partial_predicted_scores()[:3], 49 | [[0.06684528, 0.09865167], 50 | [0.03412888, 0.03713391], 51 | [0.06684528, 0.03713391]]) 52 | assert_array_almost_equal( 53 | self.scorer.get_partial_predicted_scores()[-3:], 54 | [[0.09631728, 0.03713391], 55 | [0.09631728, 0.03713391], 56 | [0.03412888, 0.03713391]]) 57 | 58 | def test_basic_and_detailed_scoring(self): 59 | self.scorer.score(detailed=False) 60 | y_pred_basic = self.scorer.y_pred 61 | self.scorer.score(detailed=True) 62 | y_pred_detailed = self.scorer.y_pred 63 | assert_array_almost_equal(y_pred_basic, y_pred_detailed) 64 | 65 | def test_detailed_scoring_sum(self): 66 | self.scorer.score(detailed=True) 67 | assert_almost_equal(self.scorer.get_partial_predicted_scores().sum(), 68 | 598.72852, decimal=5) 69 | assert_array_almost_equal( 70 | self.scorer.get_partial_predicted_scores().sum(axis=0), 71 | [312.43994141, 286.2948]) 72 | assert_array_almost_equal( 73 | self.scorer.get_partial_predicted_scores().sum(axis=1)[:3], 74 | [0.16549695, 0.07126279, 0.10397919]) 75 | assert_array_almost_equal( 76 | self.scorer.get_partial_predicted_scores().sum(axis=1)[-3:], 77 | [0.13345119, 0.13345119, 0.07126279]) 78 | 79 | 80 | if __name__ == '__main__': 81 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 82 | level=logging.DEBUG) 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /rankeval/metrics/kendall_tau.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | 9 | import scipy.stats as stats 10 | from rankeval.metrics.metric import Metric 11 | 12 | 13 | class Kendalltau(Metric): 14 | """ 15 | This class implements Kendall's Tau. 16 | We use the Kendall tau coefficient implementation from scipy. 17 | 18 | """ 19 | 20 | def __init__(self, name='K'): 21 | """ 22 | This is the constructor of Kendall Tau, an object of type Metric, 23 | with the name K. 
The constructor also allows setting custom values in 24 | the following parameters. 25 | 26 | Parameters 27 | ---------- 28 | name: string 29 | K 30 | 31 | """ 32 | super(Kendalltau, self).__init__(name) 33 | 34 | 35 | def eval(self, dataset, y_pred): 36 | """ 37 | This method computes the Kendall tau score over the entire dataset and 38 | the detailed scores per query. It calls the eval_per query method 39 | for each query in order to get the detailed Kendall tau score. 40 | 41 | Parameters 42 | ---------- 43 | dataset : Dataset 44 | Represents the Dataset object on which to apply Kendall Tau. 45 | y_pred : numpy 1d array of float 46 | Represents the predicted document scores for each instance 47 | in the dataset. 48 | 49 | Returns 50 | ------- 51 | avg_score: float 52 | The overall Kendall tau score (averages over the detailed scores). 53 | detailed_scores: numpy 1d array of floats 54 | The detailed Kendall tau scores for each query, an array with length 55 | of the number of queries. 56 | """ 57 | return super(Kendalltau, self).eval(dataset, y_pred) 58 | 59 | 60 | def eval_per_query(self, y, y_pred): 61 | """ 62 | This methods computes Kendall tau at per query level (on the instances 63 | belonging to a specific query). The Kendall tau per query is 64 | calculated as: 65 | 66 | tau = (P - Q) / sqrt((P + Q + T) * (P + Q + U)) 67 | 68 | where P is the number of concordant pairs, Q the number of discordant 69 | pairs, T the number of ties only in x, and U the number of ties only 70 | in y. If a tie occurs for the same pair in both x and y, it is not 71 | added to either T or U. 72 | s 73 | Whether to use lexsort or quicksort as the sorting method for the 74 | initial sort of the inputs. Default is lexsort (True), for which 75 | kendalltau is of complexity O(n log(n)). If False, the complexity 76 | is O(n^2), but with a smaller pre-factor (so quicksort may be faster 77 | for small arrays). 78 | 79 | Parameters 80 | ---------- 81 | y: numpy array 82 | Represents the labels of instances corresponding to one query in 83 | the dataset (ground truth). 84 | y_pred: numpy array. 85 | Represents the predicted document scores obtained during the model 86 | scoring phase for that query. 87 | 88 | Returns 89 | ------- 90 | kendalltau: float 91 | The Kendall tau per query. 92 | """ 93 | kendall_tau = stats.kendalltau(y, y_pred, initial_lexsort=True) 94 | return kendall_tau.correlation 95 | 96 | 97 | def __str__(self): 98 | s = self.name 99 | return s -------------------------------------------------------------------------------- /rankeval/metrics/mse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import numpy as np 9 | 10 | from rankeval.metrics import Metric 11 | 12 | 13 | class MSE(Metric): 14 | """ 15 | This class implements Mean squared error (MSE) with several parameters. 16 | 17 | """ 18 | def __init__(self, name='MSE', cutoff=None): 19 | """ 20 | This is the constructor of MSE, an object of type Metric, with 21 | the name MSE. The constructor also allows setting custom values in 22 | the following parameters. 
23 | 24 | Parameters 25 | ---------- 26 | name: string 27 | MSE 28 | cutoff: int 29 | The top k results to be considered at per query level (e.g. 10), 30 | otherwise the default value is None and is computed on all the 31 | instances of a query. 32 | """ 33 | super(self.__class__, self).__init__(name) 34 | self.cutoff = cutoff 35 | 36 | def eval(self, dataset, y_pred): 37 | """ 38 | This method takes the MSE for each query and calculates 39 | the average MSE. 40 | 41 | Parameters 42 | ---------- 43 | dataset : Dataset 44 | Represents the Dataset object on which to apply MSE. 45 | y_pred : numpy 1d array of float 46 | Represents the predicted document scores for each instance 47 | in the dataset. 48 | 49 | Returns 50 | ------- 51 | avg_score: float 52 | The overall MSE score (summed over the detailed MSE scores). 53 | detailed_scores: numpy 1d array of floats 54 | The detailed MSE@k scores for each query, an array of length of 55 | the number of queries. 56 | """ 57 | # return super(self.__class__, self).eval(dataset, y_pred) 58 | 59 | self.detailed_scores = np.zeros(dataset.n_queries, dtype=np.float32) 60 | 61 | for qid, q_y, q_y_pred in self.query_iterator(dataset, y_pred): 62 | self.detailed_scores[qid] = \ 63 | self.eval_per_query(q_y, q_y_pred) / dataset.n_instances 64 | return self.detailed_scores.sum(), self.detailed_scores 65 | 66 | def eval_per_query(self, y, y_pred): 67 | """ 68 | This method helps compute the MSE score per query. It is called by 69 | the eval function which averages and aggregates the scores 70 | for each query. 71 | 72 | Parameters 73 | ---------- 74 | y: numpy array 75 | Represents the labels of instances corresponding to one query in 76 | the dataset (ground truth). 77 | y_pred: numpy array. 78 | Represents the predicted document scores obtained during the model 79 | scoring phase for that query. 80 | 81 | Returns 82 | ------- 83 | rmse: float 84 | Represents the MSE score for one query. 85 | 86 | """ 87 | if self.cutoff is not None: 88 | idx = np.argsort(y_pred)[::-1][:self.cutoff] 89 | return ((y[idx] - y_pred[idx]) ** 2).sum() 90 | else: 91 | return ((y - y_pred) ** 2.0).sum() 92 | 93 | def __str__(self): 94 | s = self.name 95 | if self.cutoff is not None: 96 | s += "@{}".format(self.cutoff) 97 | return s 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /rankeval/metrics/err.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | # http://olivier.chapelle.cc/pub/err.pdf 9 | import numpy as np 10 | from rankeval.metrics import Metric 11 | 12 | 13 | class ERR(Metric): 14 | """ 15 | This class implements Expected Reciprocal Rank as proposed 16 | in http://olivier.chapelle.cc/pub/err.pdf 17 | 18 | """ 19 | 20 | def __init__(self, name='ERR', cutoff=None): 21 | """ 22 | This is the constructor of ERR, an object of type Metric, 23 | with the name ERR. The constructor also allows setting custom values 24 | in the following parameters. 25 | 26 | Parameters 27 | ---------- 28 | name: string 29 | ERR 30 | cutoff: int 31 | The top k results to be considered at per query level (e.g. 
10) 32 | 33 | """ 34 | 35 | super(ERR, self).__init__(name) 36 | self.cutoff = cutoff 37 | 38 | def eval(self, dataset, y_pred): 39 | """ 40 | The method computes ERR by taking as input the dataset and the 41 | predicted document scores. It returns the averaged ERR score over 42 | the entire dataset and the detailed ERR scores per query. 43 | 44 | Parameters 45 | ---------- 46 | dataset : Dataset 47 | Represents the Dataset object on which to apply ERR. 48 | y_pred : numpy 1d array of float 49 | Represents the predicted document scores for each instance 50 | in the dataset. 51 | 52 | Returns 53 | ------- 54 | avg_score: float 55 | Represents the average ERR over all ERR scores per query. 56 | detailed_scores: numpy 1d array of floats 57 | Represents the detailed ERR scores for each query. It has the 58 | length of n_queries. 59 | 60 | """ 61 | return super(ERR, self).eval(dataset, y_pred) 62 | 63 | def eval_per_query(self, y, y_pred): 64 | """ 65 | This method helps compute the ERR score per query. It is called by 66 | the eval function which averages and aggregates the scores 67 | for each query. 68 | 69 | Parameters 70 | ---------- 71 | y: numpy array 72 | Represents the labels of instances corresponding to one query in 73 | the dataset (ground truth). 74 | y_pred: numpy array. 75 | Represents the predicted document scores obtained during 76 | the model scoring phase for that query. 77 | 78 | Returns 79 | ------- 80 | err: float 81 | Represents the ERR score for one query. 82 | 83 | """ 84 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 85 | if self.cutoff is not None: 86 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 87 | 88 | max_grade = y.max() # max relevance score 89 | prob_step_down = 1.0 90 | err = 0.0 91 | 92 | for i, idx in enumerate(idx_y_pred_sorted): 93 | utility = (pow(2., y[idx]) - 1.) / pow(2., max_grade) 94 | err += prob_step_down * (utility / (i + 1.)) 95 | prob_step_down *= (1. 
- utility) 96 | 97 | return err 98 | 99 | def __str__(self): 100 | s = self.name 101 | if self.cutoff is not None: 102 | s += "@{}".format(self.cutoff) 103 | return s -------------------------------------------------------------------------------- /rankeval/test/model/test_proxy_ScikitLearn.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from numpy.testing import assert_equal, assert_array_equal, \ 6 | assert_array_almost_equal 7 | 8 | from rankeval.dataset import Dataset 9 | from rankeval.model import ProxyXGBoost 10 | from rankeval.model import RTEnsemble 11 | from rankeval.test.base import data_dir 12 | 13 | model_file = os.path.join(data_dir, "ScikitLearn.model.txt") 14 | data_file = os.path.join(data_dir, "msn1.fold1.test.5k.txt") 15 | 16 | 17 | class ProxyXGBoostTestCase(unittest.TestCase): 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | cls.model = RTEnsemble(model_file, format="ScikitLearn") 22 | cls.dataset = Dataset.load(data_file, format="svmlight") 23 | 24 | @classmethod 25 | def tearDownClass(cls): 26 | del cls.model 27 | cls.model = None 28 | del cls.dataset 29 | cls.dataset = None 30 | 31 | def test_count_nodes(self): 32 | n_trees, n_nodes = ProxyXGBoost._count_nodes(model_file) 33 | # print "Num Trees: %d\nNum Nodes: %d" % (n_trees, n_nodes), 34 | assert_equal(n_trees, 2) 35 | assert_equal(n_nodes, 10) 36 | assert_equal(n_trees, self.model.trees_root.size) 37 | assert_equal(n_nodes, self.model.trees_nodes_value.size) 38 | 39 | def test_root_nodes(self): 40 | assert_equal((self.model.trees_root > -1).all(), True, 41 | err_msg="Root nodes not set correctly") 42 | 43 | def test_root_nodes_adv(self): 44 | assert_array_equal(self.model.trees_root, [0, 5], 45 | err_msg="Root nodes are not correct") 46 | 47 | def test_split_features(self): 48 | assert_array_equal(self.model.trees_nodes_feature, 49 | [54, -1, 133, -1, -1, 52, 14, -1, -1, -1]) 50 | 51 | def test_tree_values(self): 52 | assert_array_almost_equal( 53 | self.model.trees_nodes_value, 54 | [5.769000e-03, -2.264262e-01, 5.000000e-01, 1.872047e-01, 55 | 9.056279e-01, 4.446700e-02, 2.850000e+01, 2.694752e-01, 56 | 4.131481e-01, -2.031448e-01], 57 | err_msg="Split thresholds or leaf outputs value are not correct") 58 | 59 | def test_left_children(self): 60 | assert_array_equal(self.model.trees_left_child, 61 | [1, -1, 3, -1, -1, 6, 8, -1, -1, -1]) 62 | 63 | def test_right_children(self): 64 | assert_array_equal(self.model.trees_right_child, 65 | [2, -1, 4, -1, -1, 7, 9, -1, -1, -1]) 66 | 67 | def test_leaf_correctness(self): 68 | for idx, feature in enumerate(self.model.trees_nodes_feature): 69 | if feature == -1: 70 | assert_equal(self.model.trees_left_child[idx], -1, 71 | "Left child of a leaf node is not empty (-1)") 72 | assert_equal(self.model.trees_right_child[idx], -1, 73 | "Right child of a leaf node is not empty (-1)") 74 | assert_equal(self.model.is_leaf_node(idx), True, 75 | "Leaf node not detected as a leaf") 76 | 77 | def test_prediction(self): 78 | y_pred = self.model.score(self.dataset, cache=True) 79 | assert_array_almost_equal(y_pred[:5], 80 | [0.651668, 0.604406, 0.604406, 81 | 0.610305, 0.563043]) 82 | assert_array_almost_equal(y_pred[-5:], 83 | [0.563043, 0.563043, 0.563043, 84 | 0.563043, 0.563043]) 85 | 86 | if __name__ == '__main__': 87 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 88 | level=logging.DEBUG) 89 | unittest.main() 90 | 
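The proxy-model tests above (QuickRank, ScikitLearn, LightGBM, XGBoost, CatBoost, Jforests) all exercise the same load-and-score flow: build an RTEnsemble from a model file, load a Dataset in svmlight format, then call model.score to obtain per-document predictions. The snippet below is a minimal usage sketch of that flow and is not one of the package source files: the model and dataset file names are placeholders, and it assumes Precision can be imported directly from rankeval/metrics/precision.py as defined further below.

# Minimal sketch, assuming placeholder file paths and a direct import of
# Precision from its module; it mirrors the pattern used by the tests above.
from rankeval.dataset import Dataset
from rankeval.model import RTEnsemble
from rankeval.metrics.precision import Precision

# Load a trained ensemble (any supported format string works the same way,
# e.g. "LightGBM", "XGBoost", "CatBoost", "Jforests", "QuickRank").
model = RTEnsemble("ScikitLearn.model.txt", format="ScikitLearn")
dataset = Dataset.load("msn1.fold1.test.5k.txt", format="svmlight")

# Per-document predicted scores, cached on the model as in the tests.
y_pred = model.score(dataset, cache=True)

# Metric objects expose eval(dataset, y_pred) -> (average, per-query scores).
avg_p10, per_query_p10 = Precision(cutoff=10).eval(dataset, y_pred)
print("P@10 = {:.4f}".format(avg_p10))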
-------------------------------------------------------------------------------- /rankeval/metrics/precision_max.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import numpy as np 9 | from rankeval.metrics.metric import Metric 10 | 11 | 12 | class PrecisionMax(Metric): 13 | """ 14 | This class implements Precision as: 15 | (relevant docs & retrieved docs) / retrieved docs. 16 | 17 | The particularity of this implementation is that the threshold is not global 18 | to all the queries as in `Precision`, but is dynamically computed for every 19 | query considering the document with top label. Thus this metric enables to 20 | compute the precision regardless of a fixed label. 21 | """ 22 | 23 | def __init__(self, name='P', cutoff=None): 24 | """ 25 | This is the constructor of Precision, an object of type Metric, with 26 | the name P. The constructor also allows setting custom values for cutoff 27 | and threshold, otherwise it uses the default values. 28 | 29 | Parameters 30 | ---------- 31 | name: string 32 | P 33 | cutoff: int 34 | The top k results to be considered at per query level (e.g. 10) 35 | 36 | """ 37 | super(PrecisionMax, self).__init__(name) 38 | self.cutoff = cutoff 39 | 40 | def eval(self, dataset, y_pred): 41 | """ 42 | This method computes the Precision score over the entire dataset and 43 | the detailed scores per query. It calls the eval_per query method for 44 | each query in order to get the detailed Precision score. 45 | 46 | Parameters 47 | ---------- 48 | dataset : Dataset 49 | Represents the Dataset object on which to apply Precision. 50 | y_pred : numpy 1d array of float 51 | Represents the predicted document scores for each instance in the 52 | dataset. 53 | 54 | Returns 55 | ------- 56 | avg_score: float 57 | The overall Precision score (averages over the detailed precision 58 | scores). 59 | detailed_scores: numpy 1d array of floats 60 | The detailed Precision scores for each query, an array of length of 61 | the number of queries. 62 | """ 63 | return super(PrecisionMax, self).eval(dataset, y_pred) 64 | 65 | def eval_per_query(self, y, y_pred): 66 | """ 67 | This methods computes Precision at per query level (on the instances 68 | belonging to a specific query). The Precision per query is calculated as 69 | <(relevant docs & retrieved docs) / retrieved docs>. 70 | 71 | Parameters 72 | ---------- 73 | y: numpy array 74 | Represents the labels of instances corresponding to one query in 75 | the dataset (ground truth). 76 | y_pred: numpy array. 77 | Represents the predicted document scores obtained during the model 78 | scoring phase for that query. 79 | 80 | Returns 81 | ------- 82 | precision: float 83 | The precision per query. 
84 | """ 85 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 86 | if self.cutoff is not None: 87 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 88 | 89 | n_relevant_retrieved = (y[idx_y_pred_sorted] >= y.max()).sum() 90 | return float(n_relevant_retrieved) / len(idx_y_pred_sorted) 91 | 92 | def __str__(self): 93 | s = self.name 94 | if self.cutoff is not None: 95 | s += "@{}".format(self.cutoff) 96 | return s 97 | -------------------------------------------------------------------------------- /rankeval/test/model/test_proxy_LightGBM.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from numpy.testing import assert_equal, assert_array_equal, \ 6 | assert_array_almost_equal 7 | 8 | from rankeval.dataset import Dataset 9 | from rankeval.model import ProxyLightGBM 10 | from rankeval.model import RTEnsemble 11 | from rankeval.test.base import data_dir 12 | 13 | model_file = os.path.join(data_dir, "LightGBM.model.txt") 14 | data_file = os.path.join(data_dir, "msn1.fold1.test.5k.txt") 15 | 16 | 17 | class ProxyLightGBMTestCase(unittest.TestCase): 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | cls.model = RTEnsemble(model_file, format="LightGBM") 22 | cls.dataset = Dataset.load(data_file, format="svmlight") 23 | 24 | @classmethod 25 | def tearDownClass(cls): 26 | del cls.model 27 | cls.model = None 28 | del cls.dataset 29 | cls.dataset = None 30 | 31 | def test_count_nodes(self): 32 | n_trees, n_nodes = ProxyLightGBM._count_nodes(model_file) 33 | # print "Num Trees: %d\nNum Nodes: %d" % (n_trees, n_nodes), 34 | assert_equal(n_trees, 2) 35 | assert_equal(n_nodes, 10) 36 | assert_equal(n_trees, self.model.trees_root.size) 37 | assert_equal(n_nodes, self.model.trees_nodes_value.size) 38 | 39 | def test_root_nodes(self): 40 | assert_equal((self.model.trees_root > -1).all(), True, 41 | err_msg="Root nodes not set correctly") 42 | 43 | def test_root_nodes_adv(self): 44 | assert_array_equal(self.model.trees_root, [0, 5], 45 | err_msg="Root nodes are not correct") 46 | 47 | def test_split_features(self): 48 | assert_array_equal(self.model.trees_nodes_feature, 49 | [55, 134, -1, -1, -1, 133, 48, -1, -1, -1]) 50 | 51 | def test_tree_values(self): 52 | assert_array_almost_equal(self.model.trees_nodes_value, 53 | [3.63099994e-03, 2.06000000e+02, -3.71787995e-02, -1.57113143e-04, 54 | 3.06654684e-02, 9.99999968e-21, 2.33031496e-01, -4.58943285e-03, 55 | 3.43261547e-02, 1.79146975e-02], 56 | err_msg="Split thresholds or leaf outputs value are not correct") 57 | 58 | def test_left_children(self): 59 | assert_array_equal(self.model.trees_left_child, 60 | [2, 3, -1, -1, -1, 6, 7, -1, -1, -1]) 61 | 62 | def test_right_children(self): 63 | assert_array_equal(self.model.trees_right_child, 64 | [1, 4, -1, -1, -1, 8, 9, -1, -1, -1]) 65 | 66 | def test_leaf_correctness(self): 67 | for idx, feature in enumerate(self.model.trees_nodes_feature): 68 | if feature == -1: 69 | assert_equal(self.model.trees_left_child[idx], -1, 70 | "Left child of a leaf node is not empty (-1)") 71 | assert_equal(self.model.trees_right_child[idx], -1, 72 | "Right child of a leaf node is not empty (-1)") 73 | assert_equal(self.model.is_leaf_node(idx), True, 74 | "Leaf node not detected as a leaf") 75 | 76 | def test_prediction(self): 77 | y_pred = self.model.score(self.dataset, cache=True) 78 | assert_array_almost_equal(y_pred[:5], 79 | [-0.00474655, -0.00474655, -0.00474655, 80 | -0.00474655, -0.00474655]) 81 | 
assert_array_almost_equal(y_pred[-5:], 82 | [0.01775758, -0.00474655, -0.00474655, 83 | -0.00474655, -0.00474655]) 84 | 85 | 86 | if __name__ == '__main__': 87 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 88 | level=logging.DEBUG) 89 | unittest.main() 90 | -------------------------------------------------------------------------------- /rankeval/metrics/dcg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import numpy as np 9 | from rankeval.metrics.metric import Metric 10 | 11 | 12 | class DCG(Metric): 13 | """ 14 | This class implements DCG with several parameters. 15 | """ 16 | 17 | def __init__(self, name='DCG', cutoff=None, implementation="flat"): 18 | """ 19 | This is the constructor of DCG, an object of type Metric, 20 | with the name DCG. The constructor also allows setting custom values 21 | in the following parameters. 22 | 23 | Parameters 24 | ---------- 25 | name: string 26 | DCG 27 | cutoff: int 28 | The top k results to be considered at per query level (e.g. 10). 29 | implementation: string 30 | Indicates whether to consider the flat or the exponential DCG 31 | formula (e.g. {"flat", "exp"}). 32 | 33 | """ 34 | 35 | super(DCG, self).__init__(name) 36 | self.cutoff = cutoff 37 | self.implementation = implementation 38 | 39 | def eval(self, dataset, y_pred): 40 | """ 41 | The method computes DCG by taking as input the dataset and 42 | the predicted document scores. It returns the averaged DCG score 43 | over the entire dataset and the detailed DCG scores per query. 44 | 45 | Parameters 46 | ---------- 47 | dataset : Dataset 48 | Represents the Dataset object on which to apply DCG. 49 | y_pred : numpy 1d array of float 50 | Represents the predicted document scores for each instance 51 | in the dataset. 52 | 53 | Returns 54 | ------- 55 | avg_score: float 56 | Represents the average DCG over all DCG scores per query. 57 | detailed_scores: numpy 1d array of floats 58 | Represents the detailed DCG scores for each query. 59 | It has the length of n_queries. 60 | 61 | """ 62 | 63 | return super(DCG, self).eval(dataset, y_pred) 64 | 65 | def eval_per_query(self, y, y_pred): 66 | """ 67 | This method helps compute the DCG score per query. It is called by 68 | the eval function which averages and aggregates the scores 69 | for each query. 70 | 71 | Parameters 72 | ---------- 73 | y: numpy array 74 | Represents the labels of instances corresponding to one query in 75 | the dataset (ground truth). 76 | y_pred: numpy array. 77 | Represents the predicted document scores obtained during the model 78 | scoring phase for that query. 79 | 80 | Returns 81 | ------- 82 | dcg: float 83 | Represents the DCG score for one query. 
84 | 85 | """ 86 | 87 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 88 | if self.cutoff is not None: 89 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 90 | 91 | discount = np.log2(np.arange(2, idx_y_pred_sorted.size + 2)) 92 | 93 | if self.implementation == "flat": 94 | gain = y[idx_y_pred_sorted] 95 | elif self.implementation == "exp": 96 | gain = np.exp2(y[idx_y_pred_sorted]) - 1.0 97 | 98 | dcg = (gain / discount).sum() 99 | return dcg 100 | 101 | def __str__(self): 102 | s = self.name 103 | if self.cutoff is not None: 104 | s += "@{}".format(self.cutoff) 105 | return s -------------------------------------------------------------------------------- /rankeval/test/model/test_proxy_XGBoost.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from numpy.testing import assert_equal, assert_array_equal, \ 6 | assert_array_almost_equal 7 | 8 | from rankeval.dataset import Dataset 9 | from rankeval.model import ProxyXGBoost 10 | from rankeval.model import RTEnsemble 11 | from rankeval.test.base import data_dir 12 | 13 | model_file = os.path.join(data_dir, "XGBoost.model.txt") 14 | data_file = os.path.join(data_dir, "msn1.fold1.test.5k.txt") 15 | 16 | 17 | class ProxyXGBoostTestCase(unittest.TestCase): 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | cls.model = RTEnsemble(model_file, format="XGBoost") 22 | cls.dataset = Dataset.load(data_file, format="svmlight") 23 | 24 | @classmethod 25 | def tearDownClass(cls): 26 | del cls.model 27 | cls.model = None 28 | del cls.dataset 29 | cls.dataset = None 30 | 31 | def test_count_nodes(self): 32 | n_trees, n_nodes = ProxyXGBoost._count_nodes(model_file) 33 | # print "Num Trees: %d\nNum Nodes: %d" % (n_trees, n_nodes), 34 | assert_equal(n_trees, 2) 35 | assert_equal(n_nodes, 14) 36 | assert_equal(n_trees, self.model.trees_root.size) 37 | assert_equal(n_nodes, self.model.trees_nodes_value.size) 38 | 39 | def test_root_nodes(self): 40 | assert_equal((self.model.trees_root > -1).all(), True, 41 | err_msg="Root nodes not set correctly") 42 | 43 | def test_root_nodes_adv(self): 44 | assert_array_equal(self.model.trees_root, [0, 7], 45 | err_msg="Root nodes are not correct") 46 | 47 | def test_split_features(self): 48 | assert_array_equal(self.model.trees_nodes_feature, 49 | [52, 14, -1, -1, 17, -1, -1, 50 | 54, 10, -1, -1, 52, -1, -1]) 51 | 52 | def test_tree_values(self): 53 | assert_array_almost_equal( 54 | self.model.trees_nodes_value, 55 | [4.4466496e-02, 2.6499998e+01, 3.3069301e-02, -2.7455300e-02, 56 | 2.3198698e+01, 2.8918700e-02, 6.7713000e-02, 3.5544997e-03, 57 | 2.0949998e+02, -1.8294000e-03, -4.2518899e-02, 4.4466496e-02, 58 | 2.0927801e-03, 3.0617099e-02], 59 | decimal=5, 60 | err_msg="Split thresholds or leaf outputs value are not correct") 61 | 62 | def test_left_children(self): 63 | assert_array_equal(self.model.trees_left_child, 64 | [1, 2, -1, -1, 5, -1, -1, 8, 9, -1, -1, 12, -1, -1]) 65 | 66 | def test_right_children(self): 67 | assert_array_equal(self.model.trees_right_child, 68 | [4, 3, -1, -1, 6, -1, -1, 11, 10, -1, -1, 13, -1, -1]) 69 | 70 | def test_leaf_correctness(self): 71 | for idx, feature in enumerate(self.model.trees_nodes_feature): 72 | if feature == -1: 73 | assert_equal(self.model.trees_left_child[idx], -1, 74 | "Left child of a leaf node is not empty (-1)") 75 | assert_equal(self.model.trees_right_child[idx], -1, 76 | "Right child of a leaf node is not empty (-1)") 77 | assert_equal(self.model.is_leaf_node(idx), True, 
78 | "Leaf node not detected as a leaf") 79 | 80 | def test_prediction(self): 81 | y_pred = self.model.score(self.dataset, cache=True) 82 | assert_array_almost_equal(y_pred[:5], 83 | [0.55953574, 0.47463751, 0.47463751, 84 | 0.48639977, 0.47071534]) 85 | assert_array_almost_equal(y_pred[-5:], 86 | [0.43002582, 0.43002582, 0.43002582, 87 | 0.47071534, 0.43002582]) 88 | 89 | if __name__ == '__main__': 90 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 91 | level=logging.DEBUG) 92 | unittest.main() 93 | -------------------------------------------------------------------------------- /rankeval/analysis/_efficient_feature.pyx: -------------------------------------------------------------------------------- 1 | """ 2 | This file implements the feature importance analysis in an efficient way. The 3 | limit is that the metric used to compute the gain for each split is hardcoded 4 | in the source code and is the MSE. 5 | """ 6 | 7 | import cython 8 | cimport cython 9 | 10 | # Import the Python-level symbols of numpy 11 | import numpy as np 12 | 13 | # Import the C-level symbols of numpy 14 | cimport numpy as np 15 | 16 | # Numpy must be initialized. When using numpy from C or Cython you must 17 | # _always_ do that, or you will have segfaults 18 | np.import_array() 19 | 20 | cdef extern from "_efficient_feature_impl.h": 21 | void c_feature_importance( 22 | const float* X, 23 | const float* y, 24 | const int* trees_root, 25 | const float* trees_weight, 26 | const short* trees_nodes_feature, 27 | const float* trees_nodes_value, 28 | const int* trees_left_child, 29 | const int* trees_right_child, 30 | float* feature_imp, 31 | short* feature_count, 32 | const int n_instances, 33 | const int n_features, 34 | const int n_trees); 35 | 36 | void c_feature_importance_tree( 37 | const float* X, 38 | const float* y, 39 | const int* trees_root, 40 | const float* trees_weight, 41 | const short* trees_nodes_feature, 42 | const float* trees_nodes_value, 43 | const int* trees_left_child, 44 | const int* trees_right_child, 45 | const int tree_id, 46 | float* feature_imp, 47 | short* feature_count, 48 | const int n_instances, 49 | const int n_features, 50 | float* y_pred, 51 | float* y_pred_tree); 52 | 53 | @cython.boundscheck(False) 54 | @cython.wraparound(False) 55 | def eff_feature_importance(model, dataset): 56 | 57 | # initialize features importance 58 | feature_imp = np.zeros(dataset.n_features, dtype=np.float32) 59 | 60 | # initialize features importance 61 | feature_count = np.zeros(dataset.n_features, dtype=np.uint16) 62 | 63 | c_feature_importance( 64 | np.PyArray_DATA(dataset.X), 65 | np.PyArray_DATA(dataset.y), 66 | np.PyArray_DATA(model.trees_root), 67 | np.PyArray_DATA(model.trees_weight), 68 | np.PyArray_DATA(model.trees_nodes_feature), 69 | np.PyArray_DATA(model.trees_nodes_value), 70 | np.PyArray_DATA(model.trees_left_child), 71 | np.PyArray_DATA(model.trees_right_child), 72 | np.PyArray_DATA(feature_imp), 73 | np.PyArray_DATA(feature_count), 74 | dataset.X.shape[0], 75 | dataset.X.shape[1], 76 | model.n_trees); 77 | 78 | return np.asarray(feature_imp, dtype=np.float32), \ 79 | np.asarray(feature_count, dtype=np.uint16) 80 | 81 | @cython.boundscheck(False) 82 | @cython.wraparound(False) 83 | def eff_feature_importance_tree(model, dataset, tree_id, y_pred, 84 | feature_imp, feature_count): 85 | 86 | y_pred_tree = np.zeros(dataset.n_instances, dtype=np.float32); 87 | 88 | c_feature_importance_tree( 89 | np.PyArray_DATA(dataset.X), 90 | np.PyArray_DATA(dataset.y), 91 | 
np.PyArray_DATA(model.trees_root), 92 | np.PyArray_DATA(model.trees_weight), 93 | np.PyArray_DATA(model.trees_nodes_feature), 94 | np.PyArray_DATA(model.trees_nodes_value), 95 | np.PyArray_DATA(model.trees_left_child), 96 | np.PyArray_DATA(model.trees_right_child), 97 | tree_id, 98 | np.PyArray_DATA(feature_imp), 99 | np.PyArray_DATA(feature_count), 100 | dataset.X.shape[0], 101 | dataset.X.shape[1], 102 | np.PyArray_DATA(y_pred), 103 | np.PyArray_DATA(y_pred_tree)); 104 | 105 | return np.asarray(y_pred_tree, dtype=np.float32) -------------------------------------------------------------------------------- /rankeval/metrics/precision.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import numpy as np 9 | from rankeval.metrics.metric import Metric 10 | 11 | 12 | class Precision(Metric): 13 | """ 14 | This class implements Precision as: 15 | (relevant docs & retrieved docs) / retrieved docs. 16 | 17 | It allows setting custom values for cutoff and threshold, otherwise it uses 18 | the default values. 19 | 20 | """ 21 | 22 | _threshold = 1 23 | 24 | def __init__(self, name='P', cutoff=None, threshold=_threshold): 25 | """ 26 | This is the constructor of Precision, an object of type Metric, with 27 | the name P. The constructor also allows setting custom values for cutoff 28 | and threshold, otherwise it uses the default values. 29 | 30 | Parameters 31 | ---------- 32 | name: string 33 | P 34 | cutoff: int 35 | The top k results to be considered at per query level (e.g. 10) 36 | threshold: float 37 | This parameter considers relevant results all instances with labels 38 | different from 0, thus with a minimum label value of 1. It can be 39 | set to other values as well (e.g. 3), in the range of possible 40 | labels. 41 | 42 | """ 43 | super(Precision, self).__init__(name) 44 | self.cutoff = cutoff 45 | self.threshold = threshold 46 | 47 | def eval(self, dataset, y_pred): 48 | """ 49 | This method computes the Precision score over the entire dataset and 50 | the detailed scores per query. It calls the eval_per query method for 51 | each query in order to get the detailed Precision score. 52 | 53 | Parameters 54 | ---------- 55 | dataset : Dataset 56 | Represents the Dataset object on which to apply Precision. 57 | y_pred : numpy 1d array of float 58 | Represents the predicted document scores for each instance in the 59 | dataset. 60 | 61 | Returns 62 | ------- 63 | avg_score: float 64 | The overall Precision score (averages over the detailed precision 65 | scores). 66 | detailed_scores: numpy 1d array of floats 67 | The detailed Precision scores for each query, an array of length of 68 | the number of queries. 69 | """ 70 | return super(Precision, self).eval(dataset, y_pred) 71 | 72 | def eval_per_query(self, y, y_pred): 73 | """ 74 | This methods computes Precision at per query level (on the instances 75 | belonging to a specific query). The Precision per query is calculated as 76 | <(relevant docs & retrieved docs) / retrieved docs>. 77 | 78 | Parameters 79 | ---------- 80 | y: numpy array 81 | Represents the labels of instances corresponding to one query in 82 | the dataset (ground truth). 83 | y_pred: numpy array. 
84 | Represents the predicted document scores obtained during the model 85 | scoring phase for that query. 86 | 87 | Returns 88 | ------- 89 | precision: float 90 | The precision per query. 91 | """ 92 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 93 | if self.cutoff is not None: 94 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 95 | 96 | n_relevant_retrieved = (y[idx_y_pred_sorted] >= self.threshold).sum() 97 | return float(n_relevant_retrieved) / len(idx_y_pred_sorted) 98 | 99 | def __str__(self): 100 | s = self.name 101 | if self.cutoff is not None: 102 | s += "@{}".format(self.cutoff) 103 | if self.threshold != self._threshold: 104 | s += "[>{}]".format(self.threshold) 105 | return s 106 | -------------------------------------------------------------------------------- /rankeval/metrics/mrr.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | 10 | """ 11 | import numpy as np 12 | from rankeval.metrics import Metric 13 | 14 | 15 | class MRR(Metric): 16 | """ 17 | This class implements Mean Reciprocal Rank. 18 | 19 | """ 20 | 21 | _threshold = 1 22 | 23 | def __init__(self, name='MRR', cutoff=None, threshold=_threshold): 24 | """ 25 | This is the constructor of MRR, an object of type Metric, with the 26 | name MRR. The constructor also allows setting custom values in the 27 | following parameters. 28 | 29 | Parameters 30 | ---------- 31 | name: string 32 | MRR 33 | cutoff: int 34 | The top k results to be considered at per query level (e.g. 10) 35 | threshold: float 36 | This parameter considers relevant results all instances with labels 37 | different from 0, thus with a minimum label value of 1. It can be 38 | set to other values as well (e.g. 3), in the range of possible labels. 39 | """ 40 | super(MRR, self).__init__(name) 41 | self.cutoff = cutoff 42 | self.threshold = threshold 43 | 44 | def eval(self, dataset, y_pred): 45 | """ 46 | The method computes MRR by taking as input the dataset and the predicted 47 | document scores. It returns the averaged MRR score over the entire 48 | dataset and the detailed MRR scores per query. 49 | 50 | The mean reciprocal rank is the average of the reciprocal ranks of 51 | results for a sample of queries. 52 | 53 | Parameters 54 | ---------- 55 | dataset : Dataset 56 | Represents the Dataset object on which to apply MRR. 57 | y_pred : numpy 1d array of float 58 | Represents the predicted document scores for each instance 59 | in the dataset. 60 | 61 | Returns 62 | ------- 63 | avg_score: float 64 | Represents the average MRR over all MRR scores per query. 65 | detailed_scores: numpy 1d array of floats 66 | Represents the detailed MRR scores for each query. It has 67 | the length of n_queries. 68 | 69 | """ 70 | return super(MRR, self).eval(dataset, y_pred) 71 | 72 | def eval_per_query(self, y, y_pred): 73 | """ 74 | This method helps compute the MRR score per query. It is called by the 75 | eval function which averages and aggregates the scores for each query. 76 | 77 | We compute the reciprocal rank. The reciprocal rank of a query response 78 | is the multiplicative inverse of the rank of the first correct answer. 
79 | 80 | Parameters 81 | ---------- 82 | y: numpy array 83 | Represents the labels of instances corresponding to one query in the 84 | dataset (ground truth). 85 | y_pred: numpy array. 86 | Represents the predicted document scores obtained during the model 87 | scoring phase for that query. 88 | 89 | Returns 90 | ------- 91 | mrr: float 92 | Represents the MRR score for one query. 93 | """ 94 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 95 | if self.cutoff is not None: 96 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 97 | 98 | # rank of max predicted score 99 | rank_max = None 100 | for i, idx in enumerate(idx_y_pred_sorted): 101 | if y[idx] >= self.threshold: 102 | rank_max = i 103 | break 104 | 105 | if rank_max is not None: 106 | return 1./(rank_max+1) 107 | else: 108 | return 0. 109 | 110 | def __str__(self): 111 | s = self.name 112 | if self.cutoff is not None: 113 | s += "@{}".format(self.cutoff) 114 | if self.threshold != self._threshold: 115 | s += "[>{}]".format(self.threshold) 116 | return s 117 | -------------------------------------------------------------------------------- /rankeval/test/model/test_proxy_CatBoost.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from numpy.testing import assert_equal, assert_array_equal, \ 6 | assert_array_almost_equal, assert_raises 7 | 8 | from rankeval.dataset import Dataset 9 | from rankeval.model import ProxyCatBoost 10 | from rankeval.model import RTEnsemble 11 | from rankeval.test.base import data_dir 12 | 13 | model_file = os.path.join(data_dir, "CatBoost.model.coreml") 14 | data_file = os.path.join(data_dir, "msn1.fold1.test.5k.txt") 15 | 16 | try: 17 | import coremltools 18 | coremltools_missing = False 19 | except ImportError: 20 | coremltools_missing = True 21 | 22 | 23 | @unittest.skipIf(coremltools_missing, "coremltools package missing") 24 | class ProxyCatBoostTestCase(unittest.TestCase): 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.model = RTEnsemble(model_file, format="CatBoost") 29 | cls.dataset = Dataset.load(data_file, format="svmlight") 30 | 31 | @classmethod 32 | def tearDownClass(cls): 33 | del cls.model 34 | cls.model = None 35 | del cls.dataset 36 | cls.dataset = None 37 | 38 | def test_count_nodes(self): 39 | 40 | coreml_model = coremltools.models.model.MLModel(model_file) 41 | n_trees, n_nodes = ProxyCatBoost._count_nodes(coreml_model) 42 | # print "Num Trees: %d\nNum Nodes: %d" % (n_trees, n_nodes), 43 | assert_equal(n_trees, 2) 44 | assert_equal(n_nodes, 14) 45 | assert_equal(n_trees, self.model.trees_root.size) 46 | assert_equal(n_nodes, self.model.trees_nodes_value.size) 47 | 48 | def test_root_nodes(self): 49 | assert_equal((self.model.trees_root > -1).all(), True, 50 | err_msg="Root nodes not set correctly") 51 | 52 | def test_root_nodes_adv(self): 53 | assert_array_equal(self.model.trees_root, [0, 7], 54 | err_msg="Root nodes are not correct") 55 | 56 | def test_split_features(self): 57 | assert_array_equal(self.model.trees_nodes_feature, 58 | [124, 62, 62, -1, -1, -1, -1, 59 | 112, 107, 107, -1, -1, -1, -1]) 60 | # 61 | def test_tree_values(self): 62 | assert_array_almost_equal(self.model.trees_nodes_value, 63 | [-6.988956e+00, 6.712700e-02, 6.712700e-02, 8.421655e-03, 64 | 1.095791e-03, 8.926381e-03, -1.645530e-02, 65 | -1.208052e+01, 1.148206e+01, 1.148206e+01, 1.408405e-02, 66 | -9.354122e-04, 6.002808e-03, -1.578260e-02], 67 | decimal=5, 68 | err_msg="Split thresholds or leaf outputs 
value are not correct") 69 | 70 | def test_left_children(self): 71 | assert_array_equal(self.model.trees_left_child, 72 | [2, 4, 6, -1, -1, -1, -1, 73 | 9, 11, 13, -1, -1, -1, -1]) 74 | 75 | def test_right_children(self): 76 | assert_array_equal(self.model.trees_right_child, 77 | [1, 3, 5, -1, -1, -1, -1, 78 | 8, 10, 12, -1, -1, -1, -1]) 79 | 80 | def test_leaf_correctness(self): 81 | for idx, feature in enumerate(self.model.trees_nodes_feature): 82 | if feature == -1: 83 | assert_equal(self.model.trees_left_child[idx], -1, 84 | "Left child of a leaf node is not empty (-1)") 85 | assert_equal(self.model.trees_right_child[idx], -1, 86 | "Right child of a leaf node is not empty (-1)") 87 | assert_equal(self.model.is_leaf_node(idx), True, 88 | "Leaf node not detected as a leaf") 89 | 90 | def test_prediction(self): 91 | y_pred = self.model.score(self.dataset, cache=False) 92 | assert_array_almost_equal(y_pred[:5], 93 | [0.00748624, -0.03223789, -0.01468681, 94 | 0.02301043, -0.03223789]) 95 | assert_array_almost_equal(y_pred[-5:], 96 | [0.02301043, -0.03223789, 0.02301043, 97 | 0.02301043, -0.03223789]) 98 | 99 | 100 | if __name__ == '__main__': 101 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 102 | level=logging.DEBUG) 103 | unittest.main() 104 | -------------------------------------------------------------------------------- /rankeval/metrics/metric.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | 10 | """ 11 | import numpy as np 12 | from abc import ABCMeta, abstractmethod 13 | import six 14 | 15 | 16 | class Metric(six.with_metaclass(ABCMeta)): 17 | """ 18 | Metric is an abstract class which provides an interface for specific metrics. 19 | It also offers 2 methods, one for iterating over the indeces for a certain 20 | query and another for iterating over the entire dataset based on those 21 | indices. 22 | 23 | Some intuitions: 24 | https://stats.stackexchange.com/questions/159657/metrics-for-evaluating-ranking-algorithms 25 | """ 26 | 27 | @abstractmethod 28 | def __init__(self, name): 29 | """ 30 | The constructor for any metric; it initializes that metric with the 31 | proper name. 32 | 33 | Parameters 34 | ---------- 35 | name : string 36 | Represents the name of that metric instance. 37 | """ 38 | self.name = name 39 | self.detailed_scores = None 40 | 41 | @abstractmethod 42 | def eval(self, dataset, y_pred): 43 | """ 44 | This abstract method computes a specific metric over the predicted 45 | scores for a test dataset. It calls the eval_per query method for each 46 | query in order to get the detailed metric score. 47 | 48 | Parameters 49 | ---------- 50 | dataset : Dataset 51 | Represents the Dataset object on which we want to apply the metric. 52 | y_pred : numpy 1d array of float 53 | Represents the predicted document scores for each instance in the 54 | dataset. 55 | 56 | Returns 57 | ------- 58 | avg_score: float 59 | Represents the average values of a metric over all metric scores 60 | per query. 61 | detailed_scores: numpy 1d array of floats 62 | Represents the detailed metric scores for each query. It has the 63 | length of n_queries. 
64 | """ 65 | self.detailed_scores = np.zeros(dataset.n_queries, dtype=np.float32) 66 | 67 | for rel_qid, (qid, q_y, q_y_pred) in enumerate( 68 | self.query_iterator(dataset, y_pred)): 69 | self.detailed_scores[rel_qid] = self.eval_per_query(q_y, q_y_pred) 70 | return np.nanmean(self.detailed_scores), self.detailed_scores 71 | 72 | @abstractmethod 73 | def eval_per_query(self, y, y_pred): 74 | """ 75 | This methods helps to evaluate the predicted scores for a specific 76 | query within the dataset. 77 | 78 | Parameters 79 | ---------- 80 | y: numpy array 81 | Represents the instance labels corresponding to the queries in the 82 | dataset (ground truth). 83 | y_pred: numpy array. 84 | Represents the predicted document scores obtained during the model 85 | scoring phase for that query. 86 | 87 | Returns 88 | ------- 89 | : float 90 | Represents the metric score for one query. 91 | """ 92 | 93 | def query_iterator(self, dataset, y_pred): 94 | """ 95 | This method iterates over dataset document scores and predicted scores 96 | in blocks of instances which belong to the same query. 97 | Parameters 98 | ---------- 99 | dataset : Datatset 100 | y_pred : numpy array 101 | 102 | Returns 103 | ------- 104 | : int 105 | The query id. 106 | : numpy.array 107 | The document scores of the instances in the labeled dataset 108 | (instance labels) belonging to the same query id. 109 | : numpy.array 110 | The predicted scores for the instances in the dataset belonging to 111 | the same query id. 112 | """ 113 | for query_id, start_offset, end_offset in dataset.query_iterator(): 114 | yield (query_id, 115 | dataset.y[start_offset:end_offset], 116 | y_pred[start_offset:end_offset]) 117 | -------------------------------------------------------------------------------- /rankeval/test/model/test_proxy_Jforests.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from numpy.testing import assert_equal, assert_array_equal, \ 6 | assert_array_almost_equal 7 | 8 | from rankeval.dataset import Dataset 9 | from rankeval.model import ProxyJforests 10 | from rankeval.model import RTEnsemble 11 | from rankeval.test.base import data_dir 12 | 13 | model_file = os.path.join(data_dir, "Jforests.model.xml") 14 | data_file = os.path.join(data_dir, "msn1.fold1.test.5k.txt") 15 | 16 | 17 | class ProxyJforestsTestCase(unittest.TestCase): 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | cls.model = RTEnsemble(model_file, format="Jforests") 22 | cls.dataset = Dataset.load(data_file, format="svmlight") 23 | 24 | @classmethod 25 | def tearDownClass(cls): 26 | del cls.model 27 | cls.model = None 28 | del cls.dataset 29 | cls.dataset = None 30 | 31 | def test_count_nodes(self): 32 | n_trees, n_nodes = ProxyJforests._count_nodes(model_file) 33 | assert_equal(n_trees, 2) 34 | assert_equal(n_nodes, 26) 35 | assert_equal(n_trees, self.model.trees_root.size) 36 | assert_equal(n_nodes, self.model.trees_nodes_value.size) 37 | 38 | def test_root_nodes(self): 39 | assert_equal((self.model.trees_root > -1).all(), True, 40 | "Root nodes not set correctly") 41 | 42 | def test_root_nodes_adv(self): 43 | assert_array_equal(self.model.trees_root, [0, 13], 44 | "Root nodes are not correct") 45 | 46 | def test_tree_weights(self): 47 | assert_array_almost_equal(self.model.trees_weight, 48 | [1.0, 1.0], 49 | err_msg="Tree Weights are not correct") 50 | 51 | def test_split_features(self): 52 | assert_array_equal(self.model.trees_nodes_feature, 53 | [129, 129, 
107, 72, 55, 54, 54 | -1, -1, -1, -1, -1, -1, -1, 55 | 133, 72, 105, 130, 62, 121, 56 | -1, -1, -1, -1, -1, -1, -1]) 57 | 58 | def test_tree_values(self): 59 | assert_array_almost_equal(self.model.trees_nodes_value, 60 | [268.0079, 265.0144, 13.9174, 19.1123, 0.00976, 0.0185, 61 | -1.2156, -0.2370, -1.9329, 0.8030, -0.01019, -1.9395, 0.5840, 62 | 0.0, 21.3979, 13.2636, 181.0142, 0.3333, -1.5976, -0.1443, 63 | 1.3819, 1.7707, 1.7353, 0.2240, -0.3769, -1.7937], 64 | decimal=4, 65 | err_msg="Split threshold values or leaf outputs are not correct") 66 | 67 | def test_left_children(self): 68 | assert_array_equal(self.model.trees_left_child, 69 | [1, 3, 5, 6, 8, 7, -1, -1, -1, -1, -1, -1, -1, 70 | 14, 17, 21, 22, 19, 24, -1, -1, -1, -1, -1, -1, -1]) 71 | 72 | def test_right_children(self): 73 | assert_array_equal(self.model.trees_right_child, 74 | [2, 4, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1, 75 | 20, 15, 16, 23, 18, 25, -1, -1, -1, -1, -1, -1, -1]) 76 | 77 | def test_leaf_correctness(self): 78 | for idx, feature in enumerate(self.model.trees_nodes_feature): 79 | if feature == -1: 80 | assert_equal(self.model.trees_left_child[idx], -1, 81 | "Left child of a leaf node is not empty (-1)") 82 | assert_equal(self.model.trees_right_child[idx], -1, 83 | "Right child of a leaf node is not empty (-1)") 84 | assert_equal(self.model.is_leaf_node(idx), True, 85 | "Leaf node not detected as a leaf") 86 | 87 | def test_prediction(self): 88 | y_pred = self.model.score(self.dataset) 89 | assert_array_almost_equal(y_pred[:5], 90 | [-2.083870, -1.359969, -1.359969, 91 | 0.426128, -0.381351]) 92 | assert_array_almost_equal(y_pred[-5:], 93 | [1.027176, -0.381351, -2.077223, 94 | 0.658770, -0.381351]) 95 | 96 | 97 | if __name__ == '__main__': 98 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 99 | level=logging.DEBUG) 100 | unittest.main() 101 | -------------------------------------------------------------------------------- /rankeval/metrics/recall.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import numpy as np 9 | from rankeval.metrics.metric import Metric 10 | 11 | 12 | class Recall(Metric): 13 | """ 14 | This class implements Recall as: 15 | (relevant docs & retrieved docs) / relevant docs. 16 | 17 | It allows setting custom values for cutoff and threshold, otherwise it uses 18 | the default values. 19 | 20 | """ 21 | 22 | _threshold = 1 23 | 24 | def __init__(self, name='R', no_relevant_results=0.0, 25 | cutoff=None, threshold=_threshold): 26 | """ 27 | This is the constructor of Recall, an object of type Metric, with 28 | the name R. The constructor also allows setting custom values 29 | for cutoff and threshold, otherwise it uses the default values. 30 | 31 | Parameters 32 | ---------- 33 | name: string 34 | R 35 | no_relevant_results: float 36 | Float indicating how to treat the cases where then are no relevant 37 | results (e.g. 0.0). 38 | cutoff: int 39 | The top k results to be considered at per query level (e.g. 10) 40 | threshold: float 41 | This parameter considers relevant results all instances with labels 42 | different from 0, thus with a minimum label value of 1. It can be 43 | set to other values as well (e.g. 
3), in the range of possible 44 | labels. 45 | """ 46 | super(Recall, self).__init__(name) 47 | self.no_relevant_results = no_relevant_results 48 | self.cutoff = cutoff 49 | self.threshold = threshold 50 | 51 | def eval(self, dataset, y_pred): 52 | """ 53 | This method computes the Recall score over the entire dataset and the 54 | detailed scores per query. It calls the eval_per query method for each 55 | query in order to get the detailed Recall score. 56 | 57 | Parameters 58 | ---------- 59 | dataset : Dataset 60 | Represents the Dataset object on which to apply Recall. 61 | y_pred : numpy 1d array of float 62 | Represents the predicted document scores for each instance in the 63 | dataset. 64 | 65 | Returns 66 | ------- 67 | avg_score: float 68 | The overall Recall score (averages over the detailed precision 69 | scores). 70 | detailed_scores: numpy 1d array of floats 71 | The detailed Recall scores for each query, an array of length of 72 | the number of queries. 73 | """ 74 | return super(Recall, self).eval(dataset, y_pred) 75 | 76 | 77 | def eval_per_query(self, y, y_pred): 78 | """ 79 | This methods computes Recall at per query level (on the instances 80 | belonging to a specific query). The Recall per query is calculated as 81 | <(relevant docs & retrieved docs) / relevant docs>. 82 | 83 | Parameters 84 | ---------- 85 | y: numpy array 86 | Represents the labels of instances corresponding to one query in 87 | the dataset (ground truth). 88 | y_pred: numpy array. 89 | Represents the predicted document scores obtained during the model 90 | scoring phase for that query. 91 | 92 | Returns 93 | ------- 94 | recall: float 95 | The Recall score per query. 96 | 97 | """ 98 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 99 | if self.cutoff is not None: 100 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 101 | 102 | n_relevant_retrieved = (y[idx_y_pred_sorted] >= self.threshold).sum() 103 | n_relevant = (y >= self.threshold).sum() 104 | 105 | if n_relevant != 0: 106 | return float(n_relevant_retrieved) / n_relevant 107 | else: 108 | return self.no_relevant_results 109 | 110 | def __str__(self): 111 | s = self.name 112 | if self.cutoff is not None: 113 | s += "@{}".format(self.cutoff) 114 | if self.threshold != self._threshold: 115 | s += "[>{}]".format(self.threshold) 116 | return s 117 | -------------------------------------------------------------------------------- /rankeval/test/data/LightGBM.model.txt: -------------------------------------------------------------------------------- 1 | tree 2 | num_class=1 3 | num_tree_per_iteration=1 4 | label_index=0 5 | max_feature_idx=135 6 | objective=lambdarank 7 | feature_names=Column_0 Column_1 Column_2 Column_3 Column_4 Column_5 Column_6 Column_7 Column_8 Column_9 Column_10 Column_11 Column_12 Column_13 Column_14 Column_15 Column_16 Column_17 Column_18 Column_19 Column_20 Column_21 Column_22 Column_23 Column_24 Column_25 Column_26 Column_27 Column_28 Column_29 Column_30 Column_31 Column_32 Column_33 Column_34 Column_35 Column_36 Column_37 Column_38 Column_39 Column_40 Column_41 Column_42 Column_43 Column_44 Column_45 Column_46 Column_47 Column_48 Column_49 Column_50 Column_51 Column_52 Column_53 Column_54 Column_55 Column_56 Column_57 Column_58 Column_59 Column_60 Column_61 Column_62 Column_63 Column_64 Column_65 Column_66 Column_67 Column_68 Column_69 Column_70 Column_71 Column_72 Column_73 Column_74 Column_75 Column_76 Column_77 Column_78 Column_79 Column_80 Column_81 Column_82 Column_83 Column_84 Column_85 Column_86 Column_87 Column_88 
Column_89 Column_90 Column_91 Column_92 Column_93 Column_94 Column_95 Column_96 Column_97 Column_98 Column_99 Column_100 Column_101 Column_102 Column_103 Column_104 Column_105 Column_106 Column_107 Column_108 Column_109 Column_110 Column_111 Column_112 Column_113 Column_114 Column_115 Column_116 Column_117 Column_118 Column_119 Column_120 Column_121 Column_122 Column_123 Column_124 Column_125 Column_126 Column_127 Column_128 Column_129 Column_130 Column_131 Column_132 Column_133 Column_134 Column_135 8 | feature_infos=[0:7] [0:4] [0:6] [0:6] [0:7] [0:1] [0:1] [0:1] [0:1] [0:1] [0:5487] [0:175] [0:143] [3:39] [3:5499] [1.0614420175552368:26.475236892700195] [6.3643679618835449:55.236095428466797] [6.1725778579711914:49.514362335205078] [5.7439260482788086:52.309181213378906] [1.0482590198516846:26.469913482666016] [0:585] [0:29] [0:23] [0:7] [0:593] [0:194] [0:7] [0:11] [0:2] [0:197] [0:314] [0:12] [0:18] [0:4] [0:318] [0:195] [0:9.6666669845581055] [0:11.5] [0:2.3333330154418945] [0:197.66667175292969] [0:17193.5546875] [0:8] [0:42.25] [0:2] [0:17193.5546875] [0:1] [0:1] [0:1] [0:0.66666698455810547] [0:0.5] [0:1] [0:1] [0:1] [0:0.5] [0:0.42857098579406738] [0:1] [0:1] [0:1] [0:0.5] [0:0.42857098579406738] [0:1] [0:1] [0:1] [0:0.5] [0:0.42857098579406738] [0:0.020833000540733337] [0:0.25] [0:0.25] [0:0.027777999639511108] [0:0.014824000187218189] [-0.17334100604057312:2445.951904296875] [0:255.47145080566406] [0:171.33871459960938] [0:54.946834564208984] [-0.1763560026884079:2479.425048828125] [-3.6401700973510742:1988.513427734375] [0:55.871494293212891] [0:93.686378479003906] [0:31.425260543823242] [-3.7034800052642822:2029.4710693359375] [0:1988.513427734375] [0:105.70162200927734] [0:110.33704376220703] [0:32.652210235595703] [0:2029.4710693359375] [-0.024762999266386032:1988.513427734375] [0:85.157150268554688] [0:93.686378479003906] [0:31.425260543823242] [-0.025194000452756882:2029.4710693359375] [0:301643.53125] [0:593.0015869140625] [0:1452.83251953125] [0:137.64729309082031] [0:310211.625] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [-0.056940000504255295:74.395339965820312] [0:32.302749633789062] [0:47.534080505371094] [0:36.460765838623047] [-0.058655001223087311:74.817634582519531] [-79.574264526367188:0] [-44.824966430664062:0] [-60.532871246337891:0] [-64.522880554199219:0] [-79.512725830078125:0] [-70.377952575683594:0] [-54.401905059814453:0] [-62.203113555908203:0] [-66.05316162109375:0] [-70.324241638183594:0] [-79.111625671386719:0] [-52.357414245605469:0] [-71.119285583496094:0] [-75.108413696289062:0] [-79.036079406738281:0] [1:25] [9:219] [0:159613600] [0:91] [115:65534] [1:65535] [1:251] [1:254] [0:35578] [0:1060864] [0:590.566650390625] 9 | 10 | Tree=0 11 | num_leaves=3 12 | split_feature=55 134 13 | split_gain=24.316440938511686 19.637733333720618 14 | threshold=0.0036310000577941537 206 15 | decision_type=2 2 16 | default_value=0 0 17 | left_child=-1 -2 18 | right_child=1 -3 19 | leaf_parent=0 1 1 20 | leaf_value=-0.03717880105550684 -0.00015711314723408512 0.03066546801204972 21 | leaf_count=328 4540 132 22 | internal_value=0 0.017716937701040228 23 | internal_count=5000 4672 24 | shrinkage=0.1 25 | has_categorical=0 26 | 27 | 28 | Tree=1 29 | num_leaves=3 30 | split_feature=133 48 31 | split_gain=2386.6406638072776 1066.9165015298936 32 | threshold=9.9999996826552254e-21 0.2330314964056015 33 | decision_type=2 2 34 | default_value=0 0 35 | left_child=1 -1 36 | right_child=-2 -3 37 | leaf_parent=1 0 1 38 | leaf_value=-0.0045894326568938319 
0.034326154348027604 0.01791469712217866 39 | leaf_count=4507 158 335 40 | internal_value=0 -0.025416647783118203 41 | internal_count=5000 4842 42 | shrinkage=0.1 43 | has_categorical=0 44 | 45 | 46 | 47 | feature importances: 48 | Column_48=1 49 | Column_55=1 50 | Column_133=1 51 | Column_134=1 52 | 53 | pandas_categorical:null -------------------------------------------------------------------------------- /rankeval/metrics/pfound.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | 9 | import numpy as np 10 | from rankeval.metrics import Metric 11 | 12 | 13 | class Pfound(Metric): 14 | """ 15 | This class implements Pfound with several parameters. 16 | 17 | The ERR metric is very similar to the pFound metric used by 18 | Yandex (Segalovich, 2010). 19 | [http://proceedings.mlr.press/v14/chapelle11a/chapelle11a.pdf]. 20 | 21 | In fact pFound is identical to the ERR variant described in 22 | (Chapelle et al., 2009, Section 7.2). We implemented pFound similarly 23 | to ERR, following section 7.2 of http://olivier.chapelle.cc/pub/err.pdf. 24 | 25 | """ 26 | def __init__(self, name='Pf', cutoff=None, p_abandonment=0.15): 27 | """ 28 | This is the constructor of Pfound, an object of type Metric, with 29 | the name Pf. The constructor also allows setting custom values in 30 | the following parameters. 31 | 32 | Parameters 33 | ---------- 34 | name: string 35 | Pf 36 | cutoff: int 37 | The top k results to be considered at per query level (e.g. 10), 38 | otherwise the default value is None and is computed on all the 39 | instances of a query. 40 | p_abandonment: float 41 | This parameter indicates the probability of abandonment, i.e. 42 | the user stops looking at the ranked list due to an external reason. 43 | The original cascade model of ERR has later been extended to include 44 | an abandonment probability: if the user is not satisfied at a given 45 | position, he will examine the next url with probability y, but has 46 | a probability 1-y of abandoning. 47 | 48 | """ 49 | super(Pfound, self).__init__(name) 50 | self.cutoff = cutoff 51 | self.p_abandonment = p_abandonment 52 | 53 | def eval(self, dataset, y_pred): 54 | """ 55 | The method computes Pfound by taking as input the dataset and the 56 | predicted document scores. It returns the averaged Pfound score over 57 | the entire dataset and the detailed Pfound scores per query. 58 | 59 | Parameters 60 | ---------- 61 | dataset : Dataset 62 | Represents the Dataset object on which to apply Pfound. 63 | y_pred : numpy 1d array of float 64 | Represents the predicted document scores for each instance in 65 | the dataset. 66 | 67 | Returns 68 | ------- 69 | avg_score: float 70 | Represents the average Pfound over all Pfound scores per query. 71 | detailed_scores: numpy 1d array of floats 72 | Represents the detailed Pfound scores for each query. It has the 73 | length of n_queries. 74 | """ 75 | return super(Pfound, self).eval(dataset, y_pred) 76 | 77 | def eval_per_query(self, y, y_pred): 78 | """ 79 | This method helps compute the Pfound score per query. It is called by 80 | the eval function which averages and aggregates the scores for each 81 | query.
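Concretely, the loop below implements an ERR-style cascade with abandonment: pfound = sum_i prob_i * R_i * p_abandonment^i (ranks i starting from 0), where R_i = (2^y_i - 1) / 2^max_grade is the utility of the document ranked at position i and prob_i = prod_{j<i} (1 - R_j) is the probability that the user reaches position i.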
82 | 83 | Parameters 84 | ---------- 85 | y: numpy array 86 | Represents the labels of instances corresponding to one query in 87 | the dataset (ground truth). 88 | y_pred: numpy array 89 | Represents the predicted document scores obtained during the model 90 | scoring phase for that query. 91 | 92 | Returns 93 | ------- 94 | pfound: float 95 | Represents the Pfound score for one query. 96 | 97 | """ 98 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 99 | if self.cutoff is not None: 100 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 101 | 102 | max_grade = y.max() # max relevance score 103 | prob_step_down = 1.0 104 | pfound = 0.0 105 | 106 | for i, idx in enumerate(idx_y_pred_sorted): 107 | utility = (pow(2., y[idx]) - 1.) / pow(2., max_grade) 108 | pfound += prob_step_down * utility * pow(self.p_abandonment, i) 109 | prob_step_down *= (1. - utility) 110 | 111 | return pfound 112 | 113 | def __str__(self): 114 | s = self.name 115 | if self.cutoff is not None: 116 | s += "@{}".format(self.cutoff) 117 | return s -------------------------------------------------------------------------------- /rankeval/dataset/write_json_dataset_catalogue.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Franco Maria Nardini 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import json 9 | 10 | """ 11 | This is a simple python script that generates the dataset catalogue. It is then 12 | dumped in JSON for simple and easy handling. 13 | """ 14 | 15 | output_file = "dataset_dictionary.json" 16 | 17 | def __add_istella_full__(data): 18 | item = {'TRAIN_FILE': 'full/train.txt', 19 | 'TEST_FILE': 'full/test.txt', 20 | 'VALIDATION_FILE': 'None', 21 | 'LICENSE_FILE': 'istella-letor-LA.txt', 22 | 'DATASET_ARCHIVE_NAME': 'istella-letor.tar.gz', 23 | 'MODELS_ARCHIVE_NAME': 'istella-letor-models.tar.gz', 24 | # DATASET_URL = ("http://library.istella.it/" "dataset/istella-letor.tar.gz") 25 | 'DATASET_URL': ("http://rankeval.isti.cnr.it/" "rankeval-datasets/istella-letor/dataset/istella-letor.tar.gz"), 26 | 'MODELS_URL' : ("http://rankeval.isti.cnr.it/" "rankeval-datasets/istella-letor/models/istella-letor-models.tar.gz"), 27 | 'BLOG_POST_URL': 'http://blog.istella.it/istella-learning-to-rank-dataset/', 28 | 'DATASET_NAME': 'istella-full', 29 | 'DATASET_DESCRIPTION': 'The istella LETOR full dataset', 30 | 'DATASET_FORMAT': 'svmlight'} 31 | data[item['DATASET_NAME']] = item 32 | return data 33 | 34 | def __add_istella_sample__(data): 35 | item = {'TRAIN_FILE': 'sample/train.txt', 36 | 'TEST_FILE': 'sample/test.txt', 37 | 'VALIDATION_FILE': 'sample/vali.txt', 38 | 'LICENSE_FILE': 'istella-letor-LA.txt', 39 | 'DATASET_ARCHIVE_NAME': 'istella-s-letor.tar.gz', 40 | 'MODELS_ARCHIVE_NAME': 'istella-s-letor-models.tar.gz', 41 | # DATASET_URL = ("http://library.istella.it/" "dataset/istella-letor.tar.gz") 42 | 'DATASET_URL': ("http://rankeval.isti.cnr.it/" "rankeval-datasets/istella-s-letor/dataset/istella-s-letor.tar.gz"), 43 | 'MODELS_URL' : ("http://rankeval.isti.cnr.it/" "rankeval-datasets/istella-s-letor/models/istella-s-letor-models.tar.gz"), 44 | 'BLOG_POST_URL': 'http://blog.istella.it/istella-learning-to-rank-dataset/', 45 | 'DATASET_NAME': 'istella-sample', 46 | 'DATASET_DESCRIPTION': 'The istella LETOR sample dataset', 47 | 
'DATASET_FORMAT': 'svmlight'} 48 | data[item['DATASET_NAME']] = item 49 | return data 50 | 51 | def __add_msn10k__(data): 52 | item = {'COMMON_SUBFOLDER_NAME': 'Fold', 53 | 'TRAIN_FILE': 'train.txt', 54 | 'TEST_FILE': 'test.txt', 55 | 'VALIDATION_FILE': 'vali.txt', 56 | 'DATASET_ARCHIVE_NAME': 'msn10k.tar.gz', 57 | 'MODELS_ARCHIVE_NAME': 'msn10k-models.tar.gz', 58 | 'DATASET_URL': ("http://rankeval.isti.cnr.it/" "rankeval-datasets/msn10k/dataset/msn10k.tar.gz"), 59 | 'MODELS_URL' : ("http://rankeval.isti.cnr.it/" "rankeval-datasets/msn10k/models/msn10k-models.tar.gz"), 60 | 'BLOG_POST_URL': 'https://www.microsoft.com/en-us/research/project/mslr/', 61 | 'DATASET_NAME': 'msn10k', 62 | 'DATASET_DESCRIPTION': 'Microsoft Learning to Rank Datasets (WEB10K)', 63 | 'DATASET_FORMAT': 'svmlight'} 64 | data[item['DATASET_NAME']] = item 65 | return data 66 | 67 | def __add_msn30k__(data): 68 | item = {'COMMON_SUBFOLDER_NAME': 'Fold', 69 | 'TRAIN_FILE': 'train.txt', 70 | 'TEST_FILE': 'test.txt', 71 | 'VALIDATION_FILE': 'vali.txt', 72 | 'DATASET_ARCHIVE_NAME': 'msn30k.tar.gz', 73 | 'MODELS_ARCHIVE_NAME': 'msn30k-models.tar.gz', 74 | 'DATASET_URL': ("http://rankeval.isti.cnr.it/" "rankeval-datasets/msn30k/dataset/msn30k.tar.gz"), 75 | 'MODELS_URL' : ("http://rankeval.isti.cnr.it/" "rankeval-datasets/msn30k/models/msn30k-models.tar.gz"), 76 | 'BLOG_POST_URL': 'https://www.microsoft.com/en-us/research/project/mslr/', 77 | 'DATASET_NAME': 'msn30k', 78 | 'DATASET_DESCRIPTION': 'Microsoft Learning to Rank Datasets (WEB30K)', 79 | 'DATASET_FORMAT': 'svmlight'} 80 | data[item['DATASET_NAME']] = item 81 | return data 82 | 83 | def main(): 84 | data = dict() 85 | __add_istella_full__(data) 86 | __add_istella_sample__(data) 87 | __add_msn10k__(data) 88 | __add_msn30k__(data) 89 | with open(output_file, 'w') as fp: 90 | json.dump(data, fp, sort_keys=True, indent=4) 91 | 92 | if __name__ == "__main__": 93 | main() 94 | -------------------------------------------------------------------------------- /rankeval/metrics/map.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | 9 | import numpy as np 10 | from rankeval.metrics.metric import Metric 11 | 12 | 13 | class MAP(Metric): 14 | """ 15 | This class implements MAP with several parameters. We implemented MAP as in 16 | https://www.kaggle.com/wiki/MeanAveragePrecision, adapted from: 17 | http://en.wikipedia.org/wiki/Information_retrieval 18 | https://www.ethz.ch/content/dam/ethz/special-interest/gess/computational-social-science-dam/documents/education/Spring2017/ML/LinkPrediction.pdf 19 | http://sdsawtelle.github.io/blog/output/mean-average-precision-MAP-for-recommender-systems.html 20 | """ 21 | 22 | def __init__(self, name='MAP', cutoff=None, no_relevant_results=1.0): 23 | """ 24 | This is the constructor of MAP, an object of type Metric, with 25 | the name MAP. The constructor also allows setting custom values in the 26 | following parameters. 27 | 28 | Parameters 29 | ---------- 30 | name: string 31 | MAP 32 | cutoff: int 33 | The top k results to be considered at per query level (e.g. 10), 34 | otherwise the default value is None and is computed on all the 35 | instances of a query. 
36 | no_relevant_results: float 37 | Float indicating how to treat the cases where there are no relevant 38 | results (e.g. 0.5). Default is 1.0. 39 | """ 40 | super(MAP, self).__init__(name) 41 | self.cutoff = cutoff 42 | self.no_relevant_results = no_relevant_results 43 | 44 | def eval(self, dataset, y_pred): 45 | """ 46 | This method takes the AP@k for each query and calculates the average, 47 | thus MAP@k. 48 | 49 | Parameters 50 | ---------- 51 | dataset : Dataset 52 | Represents the Dataset object on which to apply MAP. 53 | y_pred : numpy 1d array of float 54 | Represents the predicted document scores for each instance in 55 | the dataset. 56 | 57 | Returns 58 | ------- 59 | avg_score: float 60 | The overall MAP@k score (averaged over the detailed AP@k scores). 61 | detailed_scores: numpy 1d array of floats 62 | The detailed AP@k scores for each query, an array of length of 63 | the number of queries. 64 | """ 65 | return super(MAP, self).eval(dataset, y_pred) 66 | 67 | def eval_per_query(self, y, y_pred): 68 | """ 69 | This method computes AP@k at per query level (on the instances 70 | belonging to a specific query). The AP@k per query is calculated as 71 | 72 | ap@k = sum( P(k) / min(m,n) ), for k=1,n 73 | 74 | where: 75 | - P(k) means the precision at cut-off k in the item list. P(k) 76 | equals 0 when the k-th retrieved item is not relevant 77 | - m is the overall number of relevant documents 78 | - n is the number of predicted documents 79 | 80 | If the denominator is zero, P(k)/min(m,n) is set to zero. 81 | 82 | Parameters 83 | ---------- 84 | y: numpy array 85 | Represents the labels of instances corresponding to one query in 86 | the dataset (ground truth). 87 | y_pred: numpy array. 88 | Represents the predicted document scores obtained during the model 89 | scoring phase for that query. 90 | 91 | Returns 92 | ------- 93 | ap : float 94 | The AP@k score per query. 95 | """ 96 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 97 | if self.cutoff is not None: 98 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 99 | 100 | n_retrieved = len(idx_y_pred_sorted) 101 | precision_at_i = 0. 102 | n_relevant_retrieved_at_i = 0.
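# Scan the ranking in decreasing order of predicted score: each time a relevant document (label > 0) is found at position i, accumulate the precision at that position; the accumulated sum is finally divided by min(n_retrieved, number of relevant documents), as in the AP@k formula above.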
103 | for i in range(n_retrieved): 104 | if y[idx_y_pred_sorted[i]] > 0: 105 | n_relevant_retrieved_at_i += 1 106 | precision_at_i += n_relevant_retrieved_at_i / (i + 1) 107 | 108 | if n_relevant_retrieved_at_i > 0: 109 | return precision_at_i / min(n_retrieved, np.count_nonzero(y)) 110 | else: 111 | return self.no_relevant_results 112 | 113 | def __str__(self): 114 | s = self.name 115 | if self.cutoff is not None: 116 | s += "@{}".format(self.cutoff) 117 | return s 118 | -------------------------------------------------------------------------------- /rankeval/test/model/test_proxy_QuickRank.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from numpy.testing import assert_equal, assert_array_equal, \ 6 | assert_array_almost_equal 7 | 8 | from rankeval.model import ProxyQuickRank 9 | from rankeval.model import RTEnsemble 10 | from rankeval.test.base import data_dir 11 | 12 | model_file = os.path.join(data_dir, "quickrank.model.xml") 13 | 14 | 15 | class ProxyQuickRankTestCase(unittest.TestCase): 16 | 17 | def setUp(self): 18 | self.model = RTEnsemble(model_file, format="QuickRank") 19 | 20 | def tearDown(self): 21 | del self.model 22 | self.model = None 23 | 24 | def test_count_nodes(self): 25 | n_trees, n_nodes = ProxyQuickRank._count_nodes(model_file) 26 | assert_equal(n_trees, 2) 27 | assert_equal(n_nodes, 10) 28 | assert_equal(n_trees, self.model.trees_root.size) 29 | assert_equal(n_nodes, self.model.trees_nodes_value.size) 30 | 31 | def test_root_nodes(self): 32 | assert_equal((self.model.trees_root > -1).all(), True, 33 | "Root nodes not set correctly") 34 | 35 | def test_root_nodes_adv(self): 36 | assert_array_equal(self.model.trees_root, [0, 5], 37 | "Root nodes are not correct") 38 | 39 | def test_tree_weights(self): 40 | assert_array_almost_equal(self.model.trees_weight, 41 | [0.10000000149011612, 0.10000000149011612], 42 | err_msg="Tree Weights are not correct") 43 | 44 | def test_split_features(self): 45 | assert_array_equal(self.model.trees_nodes_feature, 46 | [107, 114, -1, -1, -1, 7, -1, 105, -1, -1]) 47 | 48 | def test_tree_values(self): 49 | assert_array_almost_equal(self.model.trees_nodes_value, 50 | [14.895151138305664, -8.0245580673217773, 0.3412887828162291, 51 | 0.66845277963831218, 0.96317280453257792, 0.66666698455810547, 52 | 0.37133907932286642, 17.135160446166992, 0.54762687170967062, 53 | 0.98651670670179537], 54 | err_msg="Split threshold values or leaf outputs are not correct") 55 | 56 | def test_left_children(self): 57 | assert_array_equal(self.model.trees_left_child, 58 | [1, 2, -1, -1, -1, 6, -1, 8, -1, -1]) 59 | 60 | def test_right_children(self): 61 | assert_array_equal(self.model.trees_right_child, 62 | [4, 3, -1, -1, -1, 7, -1, 9, -1, -1]) 63 | 64 | def test_leaf_correctness(self): 65 | for idx, feature in enumerate(self.model.trees_nodes_feature): 66 | if feature == -1: 67 | assert_equal(self.model.trees_left_child[idx], -1, 68 | "Left child of a leaf node is not empty (-1)") 69 | assert_equal(self.model.trees_right_child[idx], -1, 70 | "Right child of a leaf node is not empty (-1)") 71 | assert_equal(self.model.is_leaf_node(idx), True, 72 | "Leaf node not detected as a leaf") 73 | 74 | def test_load_save_quickrank_model(self): 75 | # save the model 76 | saved_model_file = model_file + ".saved.xml" 77 | saved = self.model.save(saved_model_file, format="QuickRank") 78 | assert_equal(saved, True, "File not save correctly") 79 | 80 | # reload the model 81 | 
model_reloaded = RTEnsemble(saved_model_file, format="QuickRank") 82 | 83 | os.remove(saved_model_file) 84 | 85 | assert_array_almost_equal(self.model.trees_root, model_reloaded.trees_root, 86 | err_msg="Tree roots are incorrect") 87 | assert_array_almost_equal(self.model.trees_weight, model_reloaded.trees_weight, 88 | err_msg="Tree weights are incorrect") 89 | assert_array_almost_equal(self.model.trees_nodes_value, model_reloaded.trees_nodes_value, 90 | err_msg="Node thresholds are incorrect") 91 | assert_array_almost_equal(self.model.trees_nodes_feature, model_reloaded.trees_nodes_feature, 92 | err_msg="Node features are incorrect") 93 | assert_array_almost_equal(self.model.trees_left_child, model_reloaded.trees_left_child, 94 | err_msg="Left children are incorrect") 95 | assert_array_almost_equal(self.model.trees_right_child, model_reloaded.trees_right_child, 96 | err_msg="Right children are incorrect") 97 | 98 | 99 | if __name__ == '__main__': 100 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 101 | level=logging.DEBUG) 102 | unittest.main() 103 | -------------------------------------------------------------------------------- /rankeval/visualization/feature.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package provides support for feature analysis visualizations. 3 | """ 4 | 5 | from __future__ import print_function 6 | import six 7 | 8 | import matplotlib.pyplot as plt 9 | 10 | import numpy as np 11 | 12 | try: 13 | xrange 14 | except NameError: 15 | # Python3's range is Python2's xrange 16 | xrange = range 17 | 18 | 19 | def plot_feature_importance(feature_perf, max_features=10, sort_by="gain", 20 | feature_names=None): 21 | """ 22 | Shows the most important features as a bar plot. 23 | 24 | Parameters 25 | ---------- 26 | feature_perf : xarray.DataArray 27 | Feature importance stats of the model to be visualized 28 | max_features : int or None 29 | Maximul number of features to be visualized. If None is passed, it will 30 | show all the features 31 | sort_by : 'gain' or 'count' 32 | The method to use for selecting the top features to display. 'gain' 33 | method selects the top features by importance, 'count' selects the top 34 | features by usage (i.e., number of times it has been used by a split 35 | node). 36 | feature_names : list of string 37 | The name of the features to use for plotting. If None, their index is 38 | used in place of the name (starting from 1). 
39 | 40 | Returns 41 | ------- 42 | : matplotlib.figure.Figure 43 | The matpotlib Figure 44 | """ 45 | 46 | feature_importance = feature_perf.sel(type='importance').data 47 | feature_count = feature_perf.sel(type='count').data.astype(np.uint16) 48 | 49 | # figure 50 | fig, ax1 = plt.subplots(figsize=(16, 5)) 51 | ax2 = ax1.twinx() 52 | 53 | if sort_by == "gain": 54 | idx_sorted = np.argsort(feature_importance)[::-1] 55 | title_by = "Importance" 56 | elif sort_by == "count": 57 | idx_sorted = np.argsort(feature_count)[::-1] 58 | title_by = "Count" 59 | else: 60 | raise RuntimeError("Sorting of features for visualization " 61 | "not supported!") 62 | 63 | if isinstance(max_features, six.integer_types): 64 | idx_sorted = idx_sorted[:max_features] 65 | else: 66 | max_features = len(feature_importance) 67 | 68 | top_features = idx_sorted 69 | top_importances = feature_importance[idx_sorted] 70 | top_counts = feature_count[idx_sorted] 71 | 72 | index = np.arange(max_features) 73 | bar_width = 0.35 74 | 75 | opacity = 0.7 76 | 77 | bar1 = ax1.bar(index, top_importances, bar_width, 78 | alpha=opacity, 79 | color='r', 80 | align='center', 81 | zorder=5, 82 | edgecolor='black') 83 | bar2 = ax2.bar(index + bar_width, top_counts, bar_width, 84 | alpha=opacity, 85 | color='b', 86 | align='center', 87 | zorder=5, 88 | edgecolor='black') 89 | 90 | ax1.set_title('Top-k Features by %s' % title_by) 91 | 92 | ax1.set_xlabel("Features") 93 | if feature_names is not None: 94 | feature_names_f = np.array(["%16s" % f for f in feature_names]) 95 | ax1.set_xticks(index + bar_width / 2 + 0.15) 96 | ax1.set_xticklabels(feature_names_f[idx_sorted], rotation=45, 97 | ha="right") 98 | else: 99 | ax1.set_xticks(index + bar_width / 2) 100 | ax1.set_xticklabels(top_features + 1) 101 | 102 | ax1.set_xlim(-bar_width/2 - bar_width, max_features - 1 + bar_width*5/2) 103 | 104 | step_y = np.ceil(top_importances.max() * 10) / 100 105 | align_y_axis(ax1, ax2, step_y, 100, num_ticks=6) 106 | 107 | ax1.set_ylabel("Importance Gain") 108 | ax2.set_ylabel("Usage Count") 109 | 110 | ax1.grid(False) 111 | ax2.grid(False) 112 | ax1.yaxis.grid(True, ls='--', zorder=0) 113 | 114 | ax1.legend((bar1, bar2), ("Importance", "Count"), 115 | loc='best', shadow=True, frameon=True, fancybox=True) 116 | 117 | return fig 118 | 119 | 120 | def align_y_axis(ax1, ax2, minresax1, minresax2, num_ticks=7): 121 | """ Sets tick marks of twinx axes to line up with num_ticks total tick marks 122 | 123 | ax1 and ax2 are matplotlib axes 124 | Spacing between tick marks will be a factor of minresax1 and minresax2""" 125 | 126 | ax1ylims = ax1.get_ybound() 127 | ax2ylims = ax2.get_ybound() 128 | ax1factor = minresax1 * (num_ticks - 1) 129 | ax2factor = minresax2 * (num_ticks - 1) 130 | ax1.set_yticks(np.linspace(ax1ylims[0], 131 | ax1ylims[1]+(ax1factor - 132 | (ax1ylims[1]-ax1ylims[0]) % ax1factor) % 133 | ax1factor, 134 | num_ticks)) 135 | ax2.set_yticks(np.linspace(ax2ylims[0], 136 | ax2ylims[1]+(ax2factor - 137 | (ax2ylims[1]-ax2ylims[0]) % ax2factor) % 138 | ax2factor, 139 | num_ticks)) -------------------------------------------------------------------------------- /rankeval/metrics/rbp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. 
If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | 9 | import numpy as np 10 | from rankeval.metrics import Metric 11 | 12 | 13 | class RBP(Metric): 14 | """ 15 | This class implements Ranked biased Precision (RBP) with several parameters. 16 | We implemented RBP as in: Alistair Moffat and Justin Zobel. 2008. 17 | Rank-biased precision for measurement of retrieval effectiveness. 18 | ACM Trans. Inf. Syst. 27, 1, Article 2 (December 2008), 27 pages. 19 | DOI=http://dx.doi.org/10.1145/1416950.1416952 20 | 21 | RBP is an extension of P@k. User has certain chance to view each result. 22 | 23 | RBP = E(# viewed relevant results) / E(# viewed results) 24 | 25 | p is based on the user model perspective and allows simulating different 26 | types of users, e.g.: 27 | p = 0.95 for persistent user 28 | p = 0.8 for patient users 29 | p = 0.5 for impatient users 30 | p = 0 for i'm feeling lucky - P@1 31 | 32 | The use of different values of p reflects different ways in which ranked 33 | lists can be used. Values close to 1.0 are indicative of highly persistent 34 | users, who scrutinize many answers before ceasing their search. For example, 35 | at p = 0.95, there is a roughly 60% likelihood that a user will enter a 36 | second page of 10 results, and a 35% chance that they will go to a third 37 | page. Such users obtain a relatively low per-document utility from a search 38 | unless a high number of relevant documents are encountered, scattered 39 | through a long prefix of the ranking. 40 | 41 | """ 42 | 43 | _threshold = 1 44 | 45 | def __init__(self, name='RBP', cutoff=None, threshold=_threshold, p=0.5): 46 | """ 47 | This is the constructor of RBP, an object of type Metric, with the name 48 | RBP. The constructor also allows setting custom values in the following 49 | parameters. 50 | 51 | Parameters 52 | ---------- 53 | name: string 54 | RBP 55 | cutoff: int 56 | The top k results to be considered at per query level (e.g. 10) 57 | threshold: float 58 | This parameter considers relevant results all instances with labels 59 | different from 0, thus with a minimum label value of 1. It can be 60 | set to other values as well (e.g. 3), in the range of possible 61 | labels. 62 | p: float 63 | This parameter which simulates user type, and consequently the 64 | probability that a viewer actually inspects the document at rank k. 65 | """ 66 | super(RBP, self).__init__(name) 67 | self.cutoff = cutoff 68 | self.threshold = threshold 69 | self.p = p 70 | 71 | def eval(self, dataset, y_pred): 72 | """ 73 | This method takes the RBP for each query and calculates the average RBP. 74 | 75 | Parameters 76 | ---------- 77 | dataset : Dataset 78 | Represents the Dataset object on which to apply RBP. 79 | y_pred : numpy 1d array of float 80 | Represents the predicted document scores for each instance in the 81 | dataset. 82 | 83 | Returns 84 | ------- 85 | avg_score: float 86 | The overall RBP score (averages over the detailed MAP scores). 87 | detailed_scores: numpy 1d array of floats 88 | The detailed RBP@k scores for each query, an array of length of the 89 | number of queries. 90 | 91 | """ 92 | return super(RBP, self).eval(dataset, y_pred) 93 | 94 | def eval_per_query(self, y, y_pred): 95 | """ 96 | This method helps compute the RBP score per query. It is called by the 97 | eval function which averages and aggregates the scores for each query. 
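Following the implementation below, the per-query score is RBP = (1 - p) * sum_{i=1..k} r_i * p^(i-1), where r_i is 1 if the label of the document ranked at position i is at least the relevance threshold and 0 otherwise, and k is the cutoff (or the query length when no cutoff is set).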
98 | 99 | Parameters 100 | ---------- 101 | y: numpy array 102 | Represents the labels of instances corresponding to one query in 103 | the dataset (ground truth). 104 | y_pred: numpy array. 105 | Represents the predicted document scores obtained during the model 106 | scoring phase for that query. 107 | 108 | Returns 109 | ------- 110 | rbp: float 111 | Represents the RBP score for one query. 112 | """ 113 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 114 | if self.cutoff is not None: 115 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 116 | 117 | discount = np.power(self.p, np.arange(len(idx_y_pred_sorted))) 118 | gain = y[idx_y_pred_sorted] >= self.threshold 119 | 120 | rbp = (1. - self.p) * (gain * discount).sum() 121 | return rbp 122 | 123 | def __str__(self): 124 | s = self.name 125 | if self.cutoff is not None: 126 | s += "@{}".format(self.cutoff) 127 | if self.threshold != self._threshold: 128 | s += "[>{}]".format(self.threshold) 129 | return s 130 | -------------------------------------------------------------------------------- /rankeval/analysis/_efficient_topological.pyx: -------------------------------------------------------------------------------- 1 | import cython 2 | cimport cython 3 | 4 | # Import the Python-level symbols of numpy 5 | import numpy as np 6 | 7 | # Import the C-level symbols of numpy 8 | cimport numpy as np 9 | 10 | import scipy as sc 11 | import scipy.sparse 12 | 13 | # Numpy must be initialized. When using numpy from C or Cython you must 14 | # _always_ do that, or you will have segfaults 15 | np.import_array() 16 | 17 | from cython.parallel import prange, parallel 18 | 19 | @cython.boundscheck(False) 20 | @cython.wraparound(False) 21 | def efficient_topological_analysis(model, include_leaves=True): 22 | 23 | cdef np.intp_t n_trees = model.n_trees 24 | cdef np.intp_t n_nodes = model.n_nodes 25 | 26 | cdef int[:] trees_root = model.trees_root 27 | cdef int[:] trees_left_child = model.trees_left_child 28 | cdef int[:] trees_right_child = model.trees_right_child 29 | 30 | node_indices = np.zeros(model.n_nodes, dtype=np.uint64) 31 | cdef unsigned long long[:] node_indices_view = node_indices 32 | cdef unsigned int[:] height_trees = np.zeros(model.n_trees, dtype=np.uint32) 33 | 34 | cdef bint c_include_leaves = include_leaves 35 | 36 | cdef np.intp_t idx_tree 37 | cdef int idx_last_node 38 | with nogil, parallel(): 39 | for idx_tree in prange(n_trees): 40 | idx_last_node = trees_root[idx_tree+1] if idx_tree < n_trees-1 else n_nodes 41 | height_trees[idx_tree] = _compute_node_indices(idx_tree, 42 | trees_root, 43 | trees_left_child, 44 | trees_right_child, 45 | node_indices_view, 46 | idx_last_node, 47 | c_include_leaves) 48 | 49 | # Computes unique indices and counts the occurrences of each index (aggregate) 50 | unique_counts = np.unique(node_indices, return_counts=True) 51 | 52 | cdef unsigned long long[:] data_indices_view = unique_counts[0] 53 | cdef long[:] counts_view = unique_counts[1] 54 | 55 | # overwrite counts of 0-values since they should identify only the 56 | # root nodes but include also the leaves when include_leaves=False) 57 | counts_view[0] = n_trees 58 | 59 | cdef np.intp_t data_indices_size = data_indices_view.size 60 | 61 | # indices in a sparse matrix representation 62 | cdef unsigned long long[:] row_ind = np.zeros(data_indices_size, dtype=np.uint64) 63 | cdef unsigned long long[:] col_ind = np.zeros(data_indices_size, dtype=np.uint64) 64 | 65 | cdef np.intp_t idx_data 66 | cdef int exp 67 | with nogil, parallel(): 68 | for 
idx_data in prange(data_indices_size): 69 | row_ind[idx_data] = most_significant_bit(data_indices_view[idx_data] + 1) 70 | col_ind[idx_data] = data_indices_view[idx_data] + 1 - 2**row_ind[idx_data] 71 | 72 | return sc.sparse.csr_matrix((counts_view, (row_ind, col_ind)), dtype=np.float32), np.asarray(height_trees) 73 | 74 | @cython.boundscheck(False) 75 | @cython.wraparound(False) 76 | cdef int _compute_node_indices(np.intp_t idx_tree, 77 | int[:] trees_root, 78 | int[:] trees_left_child, 79 | int[:] trees_right_child, 80 | unsigned long long[:] node_indices, 81 | int idx_last_node, 82 | bint include_leaves) nogil: 83 | 84 | cdef int cur_node = trees_root[idx_tree] 85 | cdef unsigned long long left_value, right_value, max_index = 0 86 | while cur_node < idx_last_node: 87 | if _is_leaf_node(cur_node, trees_left_child, trees_right_child): 88 | if not include_leaves: 89 | node_indices[cur_node] = 0 90 | else: 91 | left_value = 2 * node_indices[cur_node] + 1 92 | right_value = 2 * node_indices[cur_node] + 2 93 | node_indices[trees_left_child[cur_node]] = left_value 94 | node_indices[trees_right_child[cur_node]] = right_value 95 | max_index = max(max_index, left_value) 96 | max_index = max(max_index, right_value) 97 | cur_node += 1 98 | 99 | cdef int height = most_significant_bit(max_index + 1) 100 | return height 101 | 102 | @cython.boundscheck(False) 103 | @cython.wraparound(False) 104 | cdef inline bint _is_leaf_node(int idx_node, 105 | int[:] trees_left_child, 106 | int[:] trees_right_child) nogil: 107 | return trees_left_child[idx_node] == -1 and trees_right_child[idx_node] == -1 108 | 109 | @cython.boundscheck(False) 110 | @cython.wraparound(False) 111 | cdef int most_significant_bit(long long v) nogil: 112 | 113 | cdef long long *b = [0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000, 0xFFFFFFFF00000000] 114 | cdef unsigned int *S = [1, 2, 4, 8, 16, 32] 115 | 116 | # result of log2(v) will go here 117 | cdef unsigned int r = 0 118 | # unroll for speed... 119 | cdef int i = 5 120 | while i >= 0: 121 | if (v & b[i]): 122 | v >>= S[i]; 123 | r |= S[i]; 124 | i -= 1 125 | 126 | return r -------------------------------------------------------------------------------- /rankeval/scoring/_efficient_scoring.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | Optimized scoring of RankEval. 10 | """ 11 | 12 | import cython 13 | cimport cython 14 | 15 | # Import the Python-level symbols of numpy 16 | import numpy as np 17 | 18 | # Import the C-level symbols of numpy 19 | cimport numpy as np 20 | 21 | 22 | # Numpy must be initialized. 
When using numpy from C or Cython you must 23 | # _always_ do that, or you will have segfaults 24 | np.import_array() 25 | 26 | from cython.parallel import prange, parallel 27 | 28 | @cython.boundscheck(False) 29 | @cython.wraparound(False) 30 | def basic_scoring(model, X): 31 | 32 | cdef np.intp_t n_instances = X.shape[0] 33 | cdef np.intp_t n_trees = model.n_trees 34 | 35 | cdef float[:, :] X_view = X 36 | y = np.zeros(n_instances, dtype=np.float32) 37 | cdef float[:] y_view = y 38 | 39 | cdef int[:] trees_root = model.trees_root 40 | cdef float[:] trees_weight = model.trees_weight 41 | cdef short[:] trees_nodes_feature = model.trees_nodes_feature 42 | cdef float[:] trees_nodes_value = model.trees_nodes_value 43 | cdef int[:] trees_left_child = model.trees_left_child 44 | cdef int[:] trees_right_child = model.trees_right_child 45 | 46 | cdef int leaf_node 47 | cdef np.intp_t idx_tree, idx_instance 48 | with nogil, parallel(): 49 | for idx_instance in prange(n_instances): 50 | for idx_tree in xrange(n_trees): 51 | leaf_node = _score_single_instance_single_tree( 52 | X_view, 53 | idx_instance, 54 | idx_tree, 55 | trees_root, 56 | trees_weight, 57 | trees_nodes_feature, 58 | trees_nodes_value, 59 | trees_left_child, 60 | trees_right_child 61 | ) 62 | 63 | y_view[idx_instance] += \ 64 | trees_nodes_value[leaf_node] * trees_weight[idx_tree] 65 | return y 66 | 67 | @cython.boundscheck(False) 68 | @cython.wraparound(False) 69 | def detailed_scoring(model, X): 70 | 71 | cdef np.intp_t n_instances = X.shape[0] 72 | cdef np.intp_t n_trees = model.n_trees 73 | 74 | cdef float[:, :] X_view = X 75 | y_leaves = np.zeros((X.shape[0], model.n_trees), dtype=np.int32) 76 | cdef int[:, :] y_leaves_view = y_leaves 77 | 78 | partial_y = np.zeros((X.shape[0], model.n_trees), dtype=np.float32) 79 | cdef float[:, :] partial_y_view = partial_y 80 | 81 | cdef int[:] trees_root = model.trees_root 82 | cdef float[:] trees_weight = model.trees_weight 83 | cdef short[:] trees_nodes_feature = model.trees_nodes_feature 84 | cdef float[:] trees_nodes_value = model.trees_nodes_value 85 | cdef int[:] trees_left_child = model.trees_left_child 86 | cdef int[:] trees_right_child = model.trees_right_child 87 | 88 | cdef int leaf_node 89 | cdef np.intp_t idx_tree, idx_instance 90 | with nogil, parallel(): 91 | for idx_tree in prange(n_trees): 92 | for idx_instance in xrange(n_instances): 93 | leaf_node = _score_single_instance_single_tree( 94 | X_view, 95 | idx_instance, 96 | idx_tree, 97 | trees_root, 98 | trees_weight, 99 | trees_nodes_feature, 100 | trees_nodes_value, 101 | trees_left_child, 102 | trees_right_child 103 | ) 104 | 105 | y_leaves_view[idx_instance, idx_tree] = leaf_node 106 | partial_y_view[idx_instance, idx_tree] = \ 107 | trees_nodes_value[leaf_node] * trees_weight[idx_tree] 108 | 109 | return np.asarray(y_leaves), np.asarray(partial_y) 110 | 111 | @cython.boundscheck(False) 112 | @cython.wraparound(False) 113 | cdef int _score_single_instance_single_tree(float[:,:] X, 114 | np.intp_t idx_instance, 115 | np.intp_t idx_tree, 116 | int[:] trees_root, 117 | float[:] trees_weight, 118 | short[:] trees_nodes_feature, 119 | float[:] trees_nodes_value, 120 | int[:] trees_left_child, 121 | int[:] trees_right_child) nogil: 122 | 123 | # Check the usage of np.intp_t in plave of np.int16_t 124 | cdef int cur_node = trees_root[idx_tree] 125 | cdef short feature_idx 126 | cdef float feature_threshold 127 | while trees_left_child[cur_node] != -1 and trees_right_child[cur_node] != -1: 128 | feature_idx = 
trees_nodes_feature[cur_node] 129 | feature_threshold = trees_nodes_value[cur_node] 130 | if X[idx_instance, feature_idx] <= feature_threshold: 131 | cur_node = trees_left_child[cur_node] 132 | else: 133 | cur_node = trees_right_child[cur_node] 134 | return cur_node -------------------------------------------------------------------------------- /rankeval/metrics/ndcg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | from collections import defaultdict 9 | import numpy as np 10 | 11 | from rankeval.metrics.dcg import DCG 12 | from rankeval.metrics.metric import Metric 13 | 14 | 15 | class NDCG(Metric): 16 | """ 17 | This class implements NDCG with several parameters. 18 | 19 | """ 20 | 21 | def __init__(self, name='NDCG', cutoff=None, no_relevant_results=1.0, 22 | implementation="exp"): 23 | """ 24 | This is the constructor of NDCG, an object of type Metric, with the 25 | name NDCG. 26 | The constructor also allows setting custom values 27 | - cutoff: the top k results to be considered at per query level 28 | - no_relevant_results: is a float value indicating how to treat 29 | the cases where there are no relevant results 30 | - ties: indicates how we should consider the ties 31 | - implementation: indicates whether to consider the flat or the 32 | exponential NDCG formula 33 | 34 | Parameters 35 | ---------- 36 | name: string 37 | NDCG 38 | cutoff: int 39 | The top k results to be considered at per query level (e.g. 10) 40 | no_relevant_results: float 41 | Float indicating how to treat the cases where there are no relevant 42 | results (e.g. 0.5). Default is 1.0. 43 | implementation: string 44 | Indicates whether to consider the flat or the exponential DCG 45 | formula: "flat" or "exp" (default). 46 | """ 47 | 48 | super(self.__class__, self).__init__(name) 49 | self.cutoff = cutoff 50 | self.no_relevant_results = no_relevant_results 51 | self.implementation = implementation 52 | self.dcg = DCG(cutoff=self.cutoff, 53 | implementation=self.implementation) 54 | 55 | self._current_dataset = None 56 | self._current_rel_qid = None 57 | self._cache_idcg_score = defaultdict(int) 58 | 59 | def eval(self, dataset, y_pred): 60 | """ 61 | The method computes NDCG by taking as input the dataset and the 62 | predicted document scores (obtained with the scoring methods). It 63 | returns the averaged NDCG score over the entire dataset and the 64 | detailed NDCG scores per query. 65 | 66 | Parameters 67 | ---------- 68 | dataset : Dataset 69 | Represents the Dataset object on which to apply NDCG. 70 | y_pred : numpy 1d array of float 71 | Represents the predicted document scores for each instance in the 72 | dataset. 73 | 74 | Returns 75 | ------- 76 | avg_score: float 77 | Represents the average NDCG over all NDCG scores per query. 78 | detailed_scores: numpy array of floats 79 | Represents the detailed NDCG scores for each query. It has the 80 | length of n_queries.
81 | 82 | """ 83 | # used to cache ideal DCG scores on a dataset basis 84 | self._current_dataset = dataset 85 | self._current_rel_qid = 0 86 | 87 | # Compute the ideal DCG scores only once and cache them 88 | if self._current_dataset not in self._cache_idcg_score: 89 | 90 | idcg_score = np.ndarray(shape=dataset.n_queries, dtype=np.float32) 91 | for rel_id, (qid, q_y, _) in enumerate( 92 | self.query_iterator(dataset, dataset.y)): 93 | idcg_score[rel_id] = self.dcg.eval_per_query(q_y, q_y) 94 | 95 | self._cache_idcg_score[self._current_dataset] = idcg_score 96 | 97 | return super(self.__class__, self).eval(dataset, y_pred) 98 | 99 | def eval_per_query(self, y, y_pred): 100 | """ 101 | This method helps compute the NDCG score per query. It is called by the 102 | eval function which averages and aggregates the scores for each query. 103 | 104 | It calculates NDCG per query as NDCG = DCG / IDCG, i.e., the DCG of the predicted ranking normalized by the ideal DCG of the query. 105 | If there are no relevant results, NDCG returns the value set by default 106 | or by the user when creating the metric. 107 | 108 | Parameters 109 | ---------- 110 | y: numpy array 111 | Represents the labels of instances corresponding to one query in the 112 | dataset (ground truth). 113 | y_pred: numpy array. 114 | Represents the predicted document scores obtained during the model 115 | scoring phase for that query. 116 | 117 | Returns 118 | ------- 119 | ndcg: float 120 | Represents the NDCG score for one query. 121 | """ 122 | dcg_score = self.dcg.eval_per_query(y, y_pred) 123 | 124 | if self._current_rel_qid is not None: 125 | idcg_score = \ 126 | self._cache_idcg_score[self._current_dataset][self._current_rel_qid] 127 | self._current_rel_qid += 1 128 | else: 129 | idcg_score = self.dcg.eval_per_query(y, y) 130 | 131 | if idcg_score != 0: 132 | ndcg = dcg_score / idcg_score 133 | else: 134 | ndcg = self.no_relevant_results 135 | return ndcg 136 | 137 | def __str__(self): 138 | s = self.name 139 | if self.cutoff is not None: 140 | s += "@{}".format(self.cutoff) 141 | return s 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /rankeval/scoring/scorer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | Class for efficient scoring of an ensemble-based model composed of binary regression trees on a given dataset. 10 | """ 11 | 12 | from ..dataset import Dataset 13 | from ._efficient_scoring import basic_scoring, detailed_scoring 14 | 15 | 16 | class Scorer(object): 17 | """ 18 | Class for efficient scoring of an ensemble-based model composed of binary regression trees on a given dataset. 19 | 20 | This class can be used for simple or detailed scoring, depending on the mode selected at scoring time. 21 | The document scores are cached so as to avoid useless re-scoring. Thus, calling the `score` method multiple times 22 | does not cause the scoring to be executed again, except for a detailed scoring following a basic scoring. 23 | In this situation the scoring has to be repeated so as to analyze the scoring behaviour in depth.
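A minimal usage sketch, assuming an already loaded RTEnsemble named model and a loaded Dataset named dataset: scorer = Scorer(model, dataset); y_pred = scorer.get_predicted_scores(); partial = scorer.get_partial_predicted_scores().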
24 | 25 | Parameters 26 | ---------- 27 | model: RTEnsemble 28 | The model to use for scoring 29 | dataset: Dataset 30 | The dataset to use for scoring 31 | 32 | Attributes 33 | ---------- 34 | model : RTEnsemble 35 | The model to use for scoring 36 | dataset : Dataset 37 | The dataset to use for scoring 38 | y_pred : numpy array of float 39 | The predicted scores produced by the given model for each sample of the given dataset X 40 | partial_y_pred : numpy 2d-array of float 41 | The predicted score of each tree of the model for each dataset instance 42 | 43 | """ 44 | 45 | def __init__(self, model, dataset): 46 | self.model = model 47 | self.dataset = dataset 48 | 49 | # Save the predicted scores for each dataset instance 50 | self.y_pred = None 51 | 52 | # Save the partial scores of each tree for each dataset instance 53 | # (if detailed scoring is True) 54 | self.partial_y_pred = None 55 | 56 | # Save the leaf id of each tree for each dataset instance 57 | # (if detailed scoring is True) 58 | self.out_leaves = None 59 | 60 | def score(self, detailed): 61 | """ 62 | 63 | Parameters 64 | ---------- 65 | detailed : bool 66 | True if the class has to perform a detailed scoring, false otherwise 67 | 68 | Returns 69 | ------- 70 | y : numpy array of float 71 | the predicted scores produced by the given model for each sample of the given dataset X 72 | 73 | Attributes 74 | ---------- 75 | self.y : array of float 76 | The predicted scores of each dataset instance 77 | """ 78 | 79 | # Skip the scoring if it has already been done (return cached results) 80 | if not detailed and self.y_pred is not None or \ 81 | detailed and self.out_leaves is not None: 82 | return self.y_pred 83 | 84 | if detailed: 85 | self.out_leaves, self.partial_y_pred = \ 86 | detailed_scoring(self.model, self.dataset.X) 87 | self.y_pred = self.partial_y_pred.sum(axis=1) 88 | else: 89 | self.y_pred = basic_scoring(self.model, self.dataset.X) 90 | 91 | return self.y_pred 92 | 93 | def get_predicted_scores(self): 94 | """ 95 | Provide an accessor to the predicted scores produced by the given model for each sample of the given dataset X 96 | 97 | Returns 98 | ------- 99 | scores : numpy array of float 100 | The predicted scores produced by the given model for each sample of the given dataset X 101 | 102 | """ 103 | if self.y_pred is None: 104 | self.score(detailed=False) 105 | return self.y_pred 106 | 107 | def get_partial_predicted_scores(self): 108 | """ 109 | Provide an accessor to the partial scores produced by the given model 110 | for each sample of the given dataset X. Each partial score reflects the 111 | score produced by a single tree of the ensemble model on a single 112 | dataset instance. Thus, the returned numpy matrix has a shape of 113 | (n_instances, n_trees). Each partial score already takes the tree 114 | weight into account, thus the final document score is simply the 115 | sum of each row. 116 | 117 | Returns 118 | ------- 119 | scores : numpy 2d-array of float 120 | The predicted score of each tree of the model for each dataset instance 121 | """ 122 | if self.partial_y_pred is None: 123 | self.score(detailed=True) 124 | return self.partial_y_pred 125 | 126 | def get_predicted_leaves(self): 127 | """ 128 | Provide an accessor to the leaves that identify the exit nodes of each 129 | sample of the given dataset X using the given model. 130 | 131 | Each leaf value reflects the output node of a single tree of the 132 | ensemble model on a single dataset instance.
Thus, the returned numpy 133 | matrix has a shape of (n_instances, n_trees). 134 | 135 | Returns 136 | ------- 137 | scores : numpy 2d-array of int 138 | The leaves predicted by each tree of the model on scoring 139 | each dataset instance. 140 | 141 | """ 142 | if self.out_leaves is None: 143 | self.score(detailed=True) 144 | return self.out_leaves 145 | 146 | -------------------------------------------------------------------------------- /rankeval/analysis/topological.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | 9 | """ 10 | This package implements several topological analyses focused on the 11 | topological characteristics of ensemble-based LtR models. These 12 | functionalities can be applied to several models, 13 | so as to have a direct comparison of the shape of the resulting 14 | forests (e.g., trained by different LtR algorithms). 15 | """ 16 | 17 | import numpy as np 18 | import scipy.stats 19 | 20 | from ..model import RTEnsemble 21 | from ._efficient_topological import efficient_topological_analysis 22 | 23 | 24 | def topological_analysis(model, include_leaves=True): 25 | """ 26 | This method implements the topological analysis of an ensemble-based 27 | LtR model. Given a model, it studies the shape of each tree composing 28 | the model and returns several pieces of information useful for gaining insights 29 | about the shape of the trees, their completeness (level by level) as 30 | well as min/max/mean height and the fraction of trees having a specific 31 | node (where each node is identified by a pair of coordinates row-col, 32 | with row highlighting the depth and col the column with respect to a 33 | full binary tree). 34 | 35 | Parameters 36 | ---------- 37 | model : RTEnsemble 38 | The model to analyze 39 | include_leaves : bool 40 | Whether the leaves have to be included in the analysis or not 41 | 42 | Returns 43 | ------- 44 | object : TopologicalAnalysisResult 45 | The topological result, to use for retrieving several pieces of information 46 | """ 47 | return TopologicalAnalysisResult(model, include_leaves) 48 | 49 | 50 | class TopologicalAnalysisResult(object): 51 | """ 52 | This class is used to return the topological analysis made on the model. 53 | Several pieces of low-level information are stored in this class, and then 54 | re-elaborated to provide high-level analysis. 55 | """ 56 | 57 | def __init__(self, model, include_leaves): 58 | """ 59 | Analyze the model from a topological perspective 60 | 61 | Parameters 62 | ---------- 63 | model : RTEnsemble 64 | the model to analyze from the topological perspective 65 | include_leaves : bool 66 | Whether the leaves have to be included in the analysis or not 67 | 68 | Attributes 69 | ---------- 70 | model : RTEnsemble 71 | The model analyzed 72 | height_trees : numpy array 73 | The ordered height of each tree composing the ensemble 74 | topology : scipy.sparse.csr_matrix 75 | The matrix used to store low-level information related to the 76 | aggregated shape of the trees. Each matrix cell identifies a 77 | tree node with a pair of coordinates row-col, with row 78 | highlighting the depth and col the column with respect 79 | to a full binary tree.
80 | """ 81 | self.model = model 82 | self.topology, self.height_trees = efficient_topological_analysis(model, include_leaves) 83 | 84 | def describe_tree_height(self): 85 | """ 86 | Computes several descriptive statistics of the height of the trees. 87 | 88 | Returns 89 | ------- 90 | nobs : int 91 | Number of trees 92 | minmax: tuple of ndarrays or floats 93 | Minimum and maximum height of trees 94 | mean : ndarray or float 95 | Arithmetic mean of tree heights. 96 | variance : ndarray or float 97 | Unbiased variance of the tree heights. 98 | denominator is number of trees minus one. 99 | skewness : ndarray or float 100 | Skewness, based on moment calculations with denominator equal to 101 | the number of trees, i.e. no degrees of freedom correction. 102 | kurtosis : ndarray or float 103 | Kurtosis (Fisher). The kurtosis is normalized so that it is 104 | zero for the normal distribution. No degrees of freedom are used. 105 | """ 106 | return scipy.stats.describe(self.height_trees) 107 | 108 | def avg_tree_shape(self): 109 | """ 110 | Computes the fraction of trees having each node with respect to a 111 | full binary tree. The fraction is obtained by normalizing the count 112 | by the number of trees composing the ensemble model. 113 | 114 | Returns 115 | ------- 116 | fractions : scipy.sparse.csr_matrix 117 | Sparse matrix with the same shape of the topology matrix, where 118 | each matrix cell identifies a tree node by a pair of coordinates 119 | row-col, with row highlighting the depth and col the column with 120 | respect to a full binary tree. Each cell value highlights how many 121 | trees have the specific node, normalized by the number of trees. 122 | """ 123 | return self.topology / self.model.n_trees 124 | 125 | def fullness_per_level(self): 126 | """ 127 | Computes the normalized number of trees with full level i, for each 128 | level of a full binary tree. The normalization is done by the number 129 | of trees. 130 | 131 | Returns 132 | ------- 133 | fullness : np.array 134 | An array long as the maximum height of a tree in the ensemble, and 135 | where the j-th cell highlight how much the j-th level of the trees 136 | is full (normalized by the number of trees). 137 | """ 138 | # Row-sums are directly supported, and the structure of the CSR format means that 139 | # the difference between successive values in the indptr array correspond exactly 140 | # to the number of nonzero elements in each row. 
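# sums[j] is the total number of (tree, node) occurrences recorded at depth j, while counts[j] is the number of distinct node positions stored for depth j; their ratio, divided by the number of trees, gives the average fraction of trees containing each observed node of level j.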
141 | sums = self.topology.sum(axis=1).A1 142 | counts = np.diff(self.topology.indptr) 143 | return sums / counts / self.model.n_trees 144 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://img.shields.io/travis/com/hpclab/rankeval/master.svg?logo=travis)](https://travis-ci.com/hpclab/rankeval) 2 | [![Python version](https://img.shields.io/pypi/pyversions/rankeval.svg)](https://badge.fury.io/py/rankeval) 3 | [![PyPI version](https://img.shields.io/pypi/v/rankeval.svg)](https://badge.fury.io/py/rankeval) 4 | [![Wheel](https://img.shields.io/badge/wheels-%E2%9C%93-4c1.svg?longCache=true&logo=python&logoColor=white)](https://badge.fury.io/py/rankeval) 5 | [![CPython Implementation](https://img.shields.io/pypi/implementation/rankeval.svg)](https://badge.fury.io/py/rankeval) 6 | [![License](https://img.shields.io/badge/license-MPL%202.0-blue.svg)](https://badge.fury.io/py/rankeval) 7 | [![DOI](https://img.shields.io/badge/DOI-10.1145%2F3077136.3084140-blue)](https://doi.org/10.1145/3077136.3084140) 8 | 9 | 10 | 11 | # RankEval: An Evaluation and Analysis Framework for Learning-to-Rank Solutions 12 | 13 | RankEval is an open-source tool for the analysis and evaluation of 14 | Learning-to-Rank models based on ensembles of regression trees. The 15 | success of ensembles of regression trees fostered the development of 16 | several open-source libraries targeting efficiency of the learning phase 17 | and effectiveness of the resulting models. However, these libraries offer 18 | only very limited help for the tuning and evaluation of the trained models. 19 | 20 | RankEval aims at providing a common ground for several Learning to Rank 21 | libraries by providing useful and interoperable tools for a comprehensive 22 | comparison and in-depth analysis of ranking models. Target audience is the 23 | *machine learning* (ML) and *information retrieval* (IR) communities. 24 | 25 | RankEval is available under Mozilla Public License 2.0. 26 | 27 | The official GitHub repository is: [here](https://github.com/hpclab/rankeval). 28 | 29 | For questions/suggestions on how to improve RankEval, send us an email: 30 | rankeval@isti.cnr.it 31 | 32 | ## Features 33 | 34 | Rankeval provides a common ground between several pre-existing tools and offers 35 | services which support the interpretation of differently generated models in a 36 | unified environment, allowing an easy, comprehensive comparison and in-depth 37 | analysis. 38 | 39 | The main functionalities of RankEval can be summarized along five dimensions: 40 | - effectiveness analysis 41 | - feature analysis 42 | - structural analysis 43 | - topological analysis 44 | - interoperability among GBRT libraries 45 | 46 | Regarding the interoperability, Rankeval is able to read and process ranking ensembles learned with learning-to-rank 47 | libraries such as QuickRank, RankLib, XGBoost, LightGBM, Scikit-Learn, CatBoost, JForest. This advanced 48 | interoperability is implemented through proxy classes that make possible to interpret and understand the specific 49 | format used to represent the ranking ensemble without using the codebase of the learning-to-rank library. Thus RankEval 50 | does not have any dependency from the learning-to-rank library of choice of the user. 51 | 52 | These functionalities can be applied to several models at the same time, so to 53 | have a direct comparison of the analysis performed. 
The tool has been written 54 | to ensure **flexibility**, **extensibility**, and **efficiency**. 55 | 56 | ## Documentation 57 | 58 | The official API documentation is available [here](http://rankeval.isti.cnr.it/docs/). 59 | Soon on ReadTheDocs! 60 | 61 | ## Installation 62 | 63 | The library works with OpenMP, so you need a compiler supporting it. 64 | If your machine uses a default compiler different from GNU GCC, change it 65 | appropriately before proceeding with the installation: 66 | 67 | ``` 68 | export CC=gcc-5 69 | export CXX=g++-5 70 | ``` 71 | 72 | Moreover, RankEval needs the following libraries to be installed before the 73 | installation process begins (they are used to compile the low-level code during installation): 74 | - numpy >= 1.13 75 | - scipy >= 0.14 76 | - cython >= 0.25 77 | - matplotlib >= 2.0.2 78 | 79 | Additional dependencies will be installed automatically by setuptools. 80 | RankEval can be installed from the source by running: 81 | 82 | ```python setup.py install``` 83 | 84 | RankEval can also be easily installed from the Python Package Index (PyPI). In this case, you most probably do not even need 85 | cython locally to compile the low-level code, since the binaries should already be available for your platform. 86 | You may download and install it by running: 87 | 88 | ```pip install rankeval``` 89 | 90 | Alternatively, you can build the library from the latest commit on the master branch of the repository. 91 | Below is an example of such an installation: 92 | 93 | ```pip install git+https://github.com/hpclab/rankeval``` 94 | 95 | ## Development 96 | 97 | If you would like to install the library in development mode, i.e., edit the source code and see the changes 98 | directly without having to reinstall it after every little change, then run the following 99 | command, which also installs the libraries required for development (documentation generation and unittests): 100 | 101 | ```pip install -e .[develop]``` 102 | 103 | Local installation of compiled libraries: 104 | 105 | ```python setup.py build_ext -i``` 106 | 107 | Execution of unit tests: 108 | 109 | ```python setup.py test``` 110 | 111 | or (if you have nose already installed): 112 | 113 | ```nosetests -v``` 114 | 115 | ## Cite RankEval 116 | 117 | If you use RankEval, please cite us!
118 | 119 | ``` 120 | @inproceedings{rankeval-sigir17, 121 | author = {Claudio Lucchese and Cristina Ioana Muntean and Franco Maria Nardini and 122 | Raffaele Perego and Salvatore Trani}, 123 | title = {RankEval: An Evaluation and Analysis Framework for Learning-to-Rank Solutions}, 124 | booktitle = {SIGIR 2017: Proceedings of the 40th International {ACM} {SIGIR} 125 | Conference on Research and Development in Information Retrieval}, 126 | year = {2017}, 127 | location = {Tokyo, Japan} 128 | } 129 | ``` 130 | 131 | ## Credits 132 | - Dataset loader: https://github.com/deronnek/svmlight-loader 133 | - Query id implementation: https://github.com/mblondel/svmlight-loader/pull/6 -------------------------------------------------------------------------------- /rankeval/test/dataset/test_svmlight_format.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | import numpy as np 6 | from nose.tools import raises 7 | from numpy.testing import assert_equal, assert_array_equal, \ 8 | assert_array_almost_equal 9 | 10 | try: 11 | from sklearn.datasets import load_svmlight_file as sk_load_svmlight_file 12 | scikit_missing = False 13 | except ImportError: 14 | scikit_missing = True 15 | 16 | from rankeval.dataset.svmlight_format import load_svmlight_file, \ 17 | load_svmlight_files, dump_svmlight_file 18 | from rankeval.test.base import data_dir 19 | 20 | datafile = os.path.join(data_dir, "svmlight_classification.txt") 21 | invalidfile = os.path.join(data_dir, "svmlight_invalid.txt") 22 | qid_datafile = os.path.join(data_dir, "svmlight_classification_qid.txt") 23 | 24 | 25 | class SVMLightLoaderTestCase(unittest.TestCase): 26 | 27 | def test_load_svmlight_qid_file(self): 28 | X, y, q = load_svmlight_file(qid_datafile, query_id=True) 29 | 30 | # test X's shape 31 | assert_array_equal(X.shape, (4, 33)) 32 | #print X 33 | 34 | # test X's non-zero values 35 | # tests X's zero values 36 | # test can change X's values 37 | 38 | # test y 39 | assert_array_equal(y, [1, 2, 0, 3]) 40 | 41 | # test q 42 | # print q 43 | assert_array_equal(q, [1, 37, 37, 12]) 44 | 45 | def test_load_svmlight_file_empty_qid(self): 46 | X, y, q = load_svmlight_file(datafile, query_id=True) 47 | 48 | # test X's shape 49 | assert_array_equal(X.shape, (3, 20)) 50 | 51 | # test X's non-zero values 52 | # tests X's zero values 53 | # test can change X's values 54 | 55 | # test y 56 | assert_array_equal(y, [1, 2, 3]) 57 | 58 | # test q 59 | assert_equal(q.shape[0], 0) 60 | 61 | def test_load_svmlight_file(self): 62 | X, y = load_svmlight_file(datafile) 63 | 64 | # test X's shape 65 | assert_array_equal(X.shape, (3, 20)) 66 | 67 | # test X's non-zero values 68 | # tests X's zero values 69 | # test can change X's values 70 | 71 | # test y 72 | assert_array_equal(y, [1, 2, 3]) 73 | 74 | def test_load_svmlight_file_descriptor(self): 75 | with open(datafile, 'rb') as reader: 76 | X, y = load_svmlight_file(reader) 77 | 78 | # test X's shape 79 | assert_array_equal(X.shape, (3, 20)) 80 | 81 | # test y 82 | assert_array_equal(y, [1, 2, 3]) 83 | 84 | def test_load_svmlight_files_comment_qid(self): 85 | X_train, y_train, q_train, X_test, y_test, q_test = \ 86 | load_svmlight_files([datafile] * 2, query_id=True) 87 | assert_array_equal(X_train, X_test) 88 | assert_array_equal(y_train, y_test) 89 | assert_equal(X_train.dtype, np.float32) 90 | assert_equal(X_test.dtype, np.float32) 91 | 92 | X1, y1, q1, X2, y2, q2, X3, y3, q3 = load_svmlight_files([datafile] * 3, 
query_id=True) 93 | assert_equal(X1.dtype, X2.dtype) 94 | assert_equal(X2.dtype, X3.dtype) 95 | assert_equal(X3.dtype, np.float32) 96 | 97 | def test_load_svmlight_files(self): 98 | # print load_svmlight_files([datafile] * 2) 99 | X_train, y_train, X_test, y_test = load_svmlight_files([datafile] * 2, query_id=False) 100 | assert_array_equal(X_train, X_test) 101 | assert_array_equal(y_train, y_test) 102 | assert_equal(X_train.dtype, np.float32) 103 | assert_equal(X_test.dtype, np.float32) 104 | 105 | X1, y1, X2, y2, X3, y3 = load_svmlight_files([datafile] * 3, query_id=False) 106 | assert_equal(X1.dtype, X2.dtype) 107 | assert_equal(X2.dtype, X3.dtype) 108 | assert_equal(X3.dtype, np.float32) 109 | 110 | def test_load_invalid_file(self): 111 | try: 112 | load_svmlight_file(invalidfile) 113 | assert False 114 | except RuntimeError: 115 | pass 116 | 117 | def test_load_invalid_file2(self): 118 | try: 119 | load_svmlight_files([datafile, invalidfile, datafile]) 120 | assert False 121 | except RuntimeError: 122 | pass 123 | 124 | @raises(IOError) 125 | def test_invalid_filename(self): 126 | load_svmlight_file("trou pic nic douille") 127 | 128 | @unittest.skipIf(scikit_missing, "Scikit-Learn package missing") 129 | def test_dump(self): 130 | tmpfile = "tmp_dump.txt" 131 | try: 132 | # loads from file 133 | Xs, y = load_svmlight_file(datafile) 134 | 135 | # dumps to file 136 | dump_svmlight_file(Xs, y, tmpfile, zero_based=False) 137 | 138 | # loads them as CSR MATRIX 139 | X2, y2 = sk_load_svmlight_file(tmpfile) 140 | 141 | X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype) 142 | X2.toarray(out=X3) 143 | 144 | # check assertions 145 | assert_array_almost_equal(Xs, X3) 146 | assert_array_almost_equal(y, y2) 147 | finally: 148 | if os.path.exists(tmpfile): 149 | os.remove(tmpfile) 150 | 151 | @unittest.skipIf(scikit_missing, "Scikit-Learn package missing") 152 | def test_dump_qid(self): 153 | tmpfile = "/tmp/tmp_dump.txt" 154 | try: 155 | # loads from file 156 | Xs, y, q = load_svmlight_file(qid_datafile, query_id=True) 157 | 158 | # dumps to file 159 | dump_svmlight_file(Xs, y, tmpfile, query_id=list(q), zero_based=False) 160 | 161 | # loads them as CSR MATRIX with scikit-learn 162 | X2, y2, q2 = sk_load_svmlight_file(tmpfile, query_id=True) 163 | 164 | X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype) 165 | X2.toarray(out=X3) 166 | 167 | # check assertions 168 | assert_array_almost_equal(Xs, X3) 169 | assert_array_almost_equal(y, y2) 170 | assert_array_equal(q, q2) 171 | finally: 172 | if os.path.exists(tmpfile): 173 | os.remove(tmpfile) 174 | 175 | 176 | if __name__ == '__main__': 177 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 178 | unittest.main() 179 | -------------------------------------------------------------------------------- /doc/src/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, u'/Users/salvatore/Documents/Projects/rankeval/rankeval') 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = u'RankEval' 23 | copyright = u'2018, HPC Lab' 24 | author = u'HPC Lab' 25 | 26 | # The short X.Y version 27 | version = u'0' 28 | # The full version, including alpha/beta/rc tags 29 | release = u'0.00' 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | 'sphinx.ext.autodoc', 43 | 'sphinx.ext.viewcode', 44 | 'sphinx.ext.todo', 45 | ] 46 | 47 | # Add any paths that contain templates here, relative to this directory. 48 | templates_path = ['_templates'] 49 | 50 | # The suffix(es) of source filenames. 51 | # You can specify multiple suffix as a list of string: 52 | # 53 | # source_suffix = ['.rst', '.md'] 54 | source_suffix = '.rst' 55 | 56 | # The master toctree document. 57 | master_doc = 'index' 58 | 59 | # The language for content autogenerated by Sphinx. Refer to documentation 60 | # for a list of supported languages. 61 | # 62 | # This is also used if you do content translation via gettext catalogs. 63 | # Usually you set "language" from the command line for these cases. 64 | language = 'en' 65 | 66 | # List of patterns, relative to source directory, that match files and 67 | # directories to ignore when looking for source files. 68 | # This pattern also affects html_static_path and html_extra_path . 69 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 70 | 71 | # The name of the Pygments (syntax highlighting) style to use. 72 | pygments_style = 'sphinx' 73 | 74 | 75 | # -- Options for HTML output ------------------------------------------------- 76 | 77 | # The theme to use for HTML and HTML Help pages. See the documentation for 78 | # a list of builtin themes. 79 | # 80 | html_theme = 'alabaster' 81 | 82 | # Theme options are theme-specific and customize the look and feel of a theme 83 | # further. For a list of options available for each theme, see the 84 | # documentation. 85 | # 86 | # html_theme_options = {} 87 | 88 | # Add any paths that contain custom static files (such as style sheets) here, 89 | # relative to this directory. They are copied after the builtin static files, 90 | # so a file named "default.css" will overwrite the builtin "default.css". 91 | html_static_path = ['_static'] 92 | 93 | # Custom sidebar templates, must be a dictionary that maps document names 94 | # to template names. 95 | # 96 | # The default sidebars (for documents that don't match any pattern) are 97 | # defined by theme itself. Builtin themes are using these templates by 98 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 99 | # 'searchbox.html']``. 100 | # 101 | # html_sidebars = {} 102 | 103 | 104 | # -- Options for HTMLHelp output --------------------------------------------- 105 | 106 | # Output file base name for HTML help builder. 107 | htmlhelp_basename = 'RankEvaldoc' 108 | 109 | 110 | # -- Options for LaTeX output ------------------------------------------------ 111 | 112 | latex_elements = { 113 | # The paper size ('letterpaper' or 'a4paper'). 
114 | # 115 | # 'papersize': 'letterpaper', 116 | 117 | # The font size ('10pt', '11pt' or '12pt'). 118 | # 119 | # 'pointsize': '10pt', 120 | 121 | # Additional stuff for the LaTeX preamble. 122 | # 123 | # 'preamble': '', 124 | 125 | # Latex figure (float) alignment 126 | # 127 | # 'figure_align': 'htbp', 128 | } 129 | 130 | # Grouping the document tree into LaTeX files. List of tuples 131 | # (source start file, target name, title, 132 | # author, documentclass [howto, manual, or own class]). 133 | latex_documents = [ 134 | (master_doc, 'RankEval.tex', u'RankEval Documentation', 135 | u'HPC Lab', 'manual'), 136 | ] 137 | 138 | 139 | # -- Options for manual page output ------------------------------------------ 140 | 141 | # One entry per manual page. List of tuples 142 | # (source start file, name, description, authors, manual section). 143 | man_pages = [ 144 | (master_doc, 'rankeval', u'RankEval Documentation', 145 | [author], 1) 146 | ] 147 | 148 | 149 | # -- Options for Texinfo output ---------------------------------------------- 150 | 151 | # Grouping the document tree into Texinfo files. List of tuples 152 | # (source start file, target name, title, author, 153 | # dir menu entry, description, category) 154 | texinfo_documents = [ 155 | (master_doc, 'RankEval', u'RankEval Documentation', 156 | author, 'RankEval', 'One line description of project.', 157 | 'Miscellaneous'), 158 | ] 159 | 160 | 161 | # -- Options for Epub output ------------------------------------------------- 162 | 163 | # Bibliographic Dublin Core info. 164 | epub_title = project 165 | epub_author = author 166 | epub_publisher = author 167 | epub_copyright = copyright 168 | 169 | # The unique identifier of the text. This can be a ISBN number 170 | # or the project homepage. 171 | # 172 | # epub_identifier = '' 173 | 174 | # A unique identification for the text. 175 | # 176 | # epub_uid = '' 177 | 178 | # A list of files that should not be packed into the epub file. 179 | epub_exclude_files = ['search.html'] 180 | 181 | 182 | # -- Extension configuration ------------------------------------------------- 183 | 184 | # -- Options for todo extension ---------------------------------------------- 185 | 186 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
187 | todo_include_todos = True# custom 188 | extensions += ['sphinx.ext.todo'] 189 | todo_include_todos = True 190 | extensions += ['sphinx.ext.autosummary'] 191 | extensions += ['sphinx.ext.imgmath'] 192 | numpydoc_show_class_members = False 193 | html_theme = "sphinx_rtd_theme" 194 | import sys,os 195 | sys.path.insert(0, os.path.abspath('../../') ) 196 | from setuptools import sandbox 197 | sandbox.run_setup(os.path.abspath('../../setup.py'), ['build_ext','-i']) 198 | autoclass_content = 'both' 199 | -------------------------------------------------------------------------------- /rankeval/analysis/_efficient_feature_impl.cpp: -------------------------------------------------------------------------------- 1 | #include "_efficient_feature_impl.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | void c_feature_importance( 8 | const float* X, 9 | const float* y, 10 | const int* trees_root, 11 | const float* trees_weight, 12 | const short* trees_nodes__feature, 13 | const float* trees_nodes_value, 14 | const int* trees_left_child, 15 | const int* trees_right_child, 16 | float* feature_imp, 17 | short* feature_count, 18 | int n_instances, 19 | int n_features, 20 | int n_trees) { 21 | 22 | // initialize features importance 23 | #pragma omp parallel for 24 | for (unsigned int feature = 0; feature < n_features; ++feature) { 25 | feature_imp[feature] = 0; 26 | } 27 | 28 | // default scores on the root node of the first tree 29 | std::vector y_pred(n_instances, 0); 30 | std::vector y_pred_tree(n_instances); 31 | 32 | for (unsigned int tree_id=0; tree_id split_instance(n_instances); 71 | // The residual scores to fit 72 | // y_target = y - y_pred 73 | std::vector y_target(n_instances); 74 | float mean_y_target = 0; 75 | #pragma omp parallel for reduction( + : mean_y_target ) 76 | for (unsigned int instance = 0; instance < n_instances; ++instance) { 77 | split_instance[instance] = instance; 78 | y_target[instance] = y[instance] - y_pred[instance]; 79 | mean_y_target += y_target[instance]; 80 | } 81 | mean_y_target /= n_instances; 82 | 83 | // initialize the y_pred_tree vector 84 | // y_pred_tree = np.full(n_instances, fill_value=y_target.mean()) 85 | #pragma omp parallel for 86 | for (unsigned int instance = 0; instance < n_instances; ++instance) 87 | y_pred_tree[instance] = mean_y_target; 88 | 89 | TreeNode root(trees_root[tree_id], 0, n_instances - 1); 90 | std::vector queue = { root }; 91 | 92 | while (!queue.empty()) { 93 | 94 | TreeNode node = queue.back(); 95 | queue.pop_back(); 96 | 97 | int node_id = node.node_id; 98 | short feature_id = trees_nodes_feature[node_id]; 99 | float threshold = trees_nodes_value[node_id]; 100 | 101 | feature_count[feature_id]++; 102 | 103 | // Split the instances in left-right (end_id will be the frontier) 104 | int start_id = node.start_id; 105 | int end_id = node.end_id; 106 | float y_target_mean_left = 0, y_target_mean_right = 0; 107 | unsigned int instance; 108 | while (start_id <= end_id) { 109 | instance = split_instance[start_id]; 110 | if (X[instance * n_features + feature_id] <= threshold) { 111 | y_target_mean_left += y_target[instance]; 112 | ++start_id; 113 | } else { 114 | y_target_mean_right += y_target[instance]; 115 | std::swap(split_instance[start_id], split_instance[end_id]); 116 | --end_id; 117 | } 118 | } 119 | 120 | int left_docs = end_id - node.start_id + 1; 121 | int right_docs = node.end_id - end_id; 122 | 123 | // we need to normalize the mean y_targets (left and right) 124 | if (left_docs > 0) 125 | y_target_mean_left /= left_docs; 
126 | if (right_docs > 0) 127 | y_target_mean_right /= right_docs; 128 | 129 | // compute split gain 130 | float delta_mse = 0; 131 | #pragma omp parallel for reduction( + : delta_mse ) 132 | for (unsigned int i = node.start_id; i <= node.end_id; ++i) { 133 | unsigned int instance = split_instance[i]; 134 | float pre_split_mse = 135 | pow(y_target[instance] - y_pred_tree[instance], 2); 136 | 137 | if (i <= end_id) 138 | y_pred_tree[instance] = y_target_mean_left; 139 | else 140 | y_pred_tree[instance] = y_target_mean_right; 141 | 142 | float post_split_mse = 143 | pow(y_target[instance] - y_pred_tree[instance], 2); 144 | 145 | delta_mse += pre_split_mse - post_split_mse; 146 | } 147 | 148 | // update feature importance 149 | feature_imp[feature_id] += delta_mse / n_instances; 150 | 151 | // if children are not leaves, add in the queue of the nodes to visit 152 | if (!is_leaf_node(trees_left_child[node_id], 153 | trees_left_child, 154 | trees_right_child) && end_id > node.start_id) { 155 | 156 | TreeNode left(trees_left_child[node_id], node.start_id, end_id); 157 | queue.push_back(left); 158 | } 159 | 160 | if (!is_leaf_node(trees_right_child[node_id], 161 | trees_left_child, 162 | trees_right_child) && node.end_id > (end_id + 1) ) { 163 | 164 | TreeNode right(trees_right_child[node_id], end_id + 1, node.end_id); 165 | queue.push_back(right); 166 | } 167 | } 168 | 169 | #pragma omp parallel for 170 | for (unsigned int instance = 0; instance < n_instances; ++instance) { 171 | y_pred_tree[instance] *= trees_weight[tree_id]; 172 | y_pred[instance] += y_pred_tree[instance]; 173 | } 174 | } -------------------------------------------------------------------------------- /rankeval/model/proxy_CatBoost.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """Class providing the implementation for loading/storing a CatBoost model 9 | from/to file. 10 | 11 | The CatBoost project is described here: 12 | https://github.com/catboost/catboost 13 | 14 | CatBoost allows saving the learned model in several formats (binary, CoreML, 15 | etc.). Among them, we chose to adopt the Apple CoreML format for reading and 16 | converting a model into the rankeval representation. It is possible to read the 17 | CoreML representation using the coremltools python package. Once read, it 18 | provides all the structured information of the ensemble, with split nodes (both 19 | features and thresholds), leaf values and tree structure. Not all the 20 | information reported in the model is useful for the different analyses, thus 21 | only the relevant parts are parsed. 22 | 23 | NOTE: CatBoost trains oblivious trees, i.e., trees where at each level a single 24 | condition is checked, independently of which node we are currently working 25 | on. Rankeval does not exploit oblivious trees, but instead represents them 26 | as normal decision trees. Thus the same condition will appear on all the nodes 27 | of a single level of a tree. The reason behind this choice is to speed up the 28 | development of the CatBoost proxy, allowing it to be analyzed without focusing too 29 | much on prediction time (which is not currently measured by rankeval).
30 | """ 31 | 32 | import numpy as np 33 | import logging 34 | 35 | from .rt_ensemble import RTEnsemble 36 | 37 | 38 | class ProxyCatBoost(object): 39 | """ 40 | Class providing the implementation for loading/storing a ProxyCatBoost model 41 | from/to file. 42 | """ 43 | 44 | @staticmethod 45 | def load(file_path, model): 46 | """ 47 | Load the model from the file identified by file_path. 48 | 49 | Parameters 50 | ---------- 51 | file_path : str 52 | The path to the filename where the model has been saved 53 | model : RTEnsemble 54 | The model instance to fill 55 | """ 56 | 57 | try: 58 | import coremltools 59 | except ImportError: 60 | logging.error('Missing coremltools package!.') 61 | return 62 | 63 | coreml_model = coremltools.models.model.MLModel(file_path) 64 | 65 | n_trees, n_nodes = ProxyCatBoost._count_nodes(coreml_model) 66 | # Initialize the model and allocate the needed space 67 | # given the shape and size of the ensemble 68 | model.initialize(n_trees, n_nodes) 69 | 70 | n_nodes_per_tree = int(n_nodes / n_trees) 71 | 72 | nodes = coreml_model.get_spec().treeEnsembleRegressor.treeEnsemble.nodes 73 | behaviors = coremltools.proto.TreeEnsemble_pb2.TreeEnsembleParameters.\ 74 | TreeNode.TreeNodeBehavior 75 | 76 | for node in nodes: 77 | tree_offset = node.treeId * n_nodes_per_tree 78 | node_id_remap = ProxyCatBoost.remap_nodeId(node.nodeId, 79 | n_nodes_per_tree) 80 | node_id_off = node_id_remap + tree_offset 81 | 82 | if node_id_remap == 0: # this is the root of a tree 83 | model.trees_root[node.treeId] = tree_offset 84 | model.trees_weight[node.treeId] = 1 85 | 86 | if node.nodeBehavior == behaviors.Value('LeafNode'): 87 | model.trees_nodes_value[node_id_off] = \ 88 | node.evaluationInfo[0].evaluationValue 89 | else: 90 | if node.nodeBehavior == behaviors.Value('BranchOnValueGreaterThan'): 91 | # we need to flip the condition given we use "<=" 92 | left = node.falseChildNodeId 93 | right = node.trueChildNodeId 94 | elif node.nodeBehavior == behaviors.Value('BranchOnValueLessThanEqual'): 95 | right = node.falseChildNodeId 96 | left = node.trueChildNodeId 97 | else: 98 | raise AssertionError( 99 | "Branching condition not supported. RankEval does not " 100 | "support branching conditions different from " 101 | "BranchOnValueGreaterThan or BranchOnValueLessThanEqual.") 102 | 103 | model.trees_nodes_value[node_id_off] = node.branchFeatureValue 104 | model.trees_nodes_feature[node_id_off] = node.branchFeatureIndex 105 | model.trees_left_child[node_id_off] = tree_offset +\ 106 | ProxyCatBoost.remap_nodeId(left, n_nodes_per_tree) 107 | model.trees_right_child[node_id_off] = tree_offset + \ 108 | ProxyCatBoost.remap_nodeId(right, n_nodes_per_tree) 109 | 110 | @staticmethod 111 | def remap_nodeId(nodeId, n_nodes_per_tree): 112 | return n_nodes_per_tree - 1 - nodeId 113 | 114 | @staticmethod 115 | def save(file_path, model): 116 | """ 117 | Save the model onto the file identified by file_path. 118 | 119 | Parameters 120 | ---------- 121 | file_path : str 122 | The path to the filename where the model has to be saved 123 | model : RTEnsemble 124 | The model RTEnsemble model to save on file 125 | 126 | Returns 127 | ------- 128 | status : bool 129 | Returns true if the save is successful, false otherwise 130 | """ 131 | raise NotImplementedError("Feature not implemented!") 132 | 133 | @staticmethod 134 | def _count_nodes(coreml_model): 135 | """ 136 | Count the total number of nodes (both split and leaf nodes) 137 | in the CoreML model. 
138 | 139 | Parameters 140 | ---------- 141 | coreml_model : CoreML model 142 | The CoreML model to load from 143 | 144 | Returns 145 | ------- 146 | tuple(n_trees, n_nodes) : tuple(int, int) 147 | The total number of trees and nodes (both split and leaf nodes) 148 | in the given CoreML model. 149 | """ 150 | 151 | nodes = coreml_model.get_spec().treeEnsembleRegressor.treeEnsemble.nodes 152 | 153 | n_trees = np.max([node.treeId for node in nodes]) + 1 154 | 155 | n_nodes_trees = np.empty(n_trees, dtype=np.uint16) 156 | for node in nodes: 157 | n_nodes_trees[node.treeId] = node.nodeId 158 | 159 | # node_Id starts from 0, thus + 1 160 | n_nodes = np.sum(n_nodes_trees + 1) 161 | 162 | return n_trees, n_nodes -------------------------------------------------------------------------------- /rankeval/model/proxy_XGBoost.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """Class providing the implementation for loading/storing an XGBoost model 9 | from/to file. The model has to be saved using the textual representation, i.e., by 10 | using the following method: 11 | .. code-block:: python 12 | import xgboost as xgb 13 | ... 14 | bst = xgb.train(param, dtrain, num_round) 15 | bst.dump_model('xgboost.model') 16 | 17 | The XGBoost project is described here: 18 | https://github.com/dmlc/xgboost 19 | 20 | The XGBoost format adopts a textual representation where each line of the file 21 | represents a single split node or a leaf node, with several attributes describing 22 | the feature and the threshold involved (in case of a split node) or the output 23 | (in case of a leaf). Each node is identified by a unique integer; additional 24 | information not useful for rankeval is ignored. 25 | 26 | NOTE: XGBoost version 0.6 does not properly dump the model. Indeed, as 27 | reported in the issue here: 28 | 29 | - https://github.com/dmlc/xgboost/issues/2077 30 | 31 | The precision of the dump is not sufficient and causes inconsistencies with 32 | the XGBoost model. These inconsistencies cause rankeval scoring to return 33 | predictions different from those of the original model. Without a fix by the 34 | XGBoost authors, DO NOT USE this proxy. 35 | """ 36 | 37 | import re 38 | import sys 39 | import numpy as np 40 | 41 | from .rt_ensemble import RTEnsemble 42 | 43 | tree_reg = re.compile("^booster\[(\d+)\]") 44 | node_reg = re.compile("(\d+):\[f(\d+)<(.*)\]") 45 | leaf_reg = re.compile("(\d+):leaf=(.+?)(,.*)?$") 46 | 47 | 48 | class ProxyXGBoost(object): 49 | """ 50 | Class providing the implementation for loading/storing an XGBoost model 51 | from/to file. 52 | """ 53 | 54 | @staticmethod 55 | def load(file_path, model): 56 | """ 57 | Load the model from the file identified by file_path.
58 | 59 | Parameters 60 | ---------- 61 | file_path : str 62 | The path to the filename where the model has been saved 63 | model : RTEnsemble 64 | The model instance to fill 65 | """ 66 | n_trees, n_nodes = ProxyXGBoost._count_nodes(file_path) 67 | # Initialize the model and allocate the needed space 68 | # given the shape and size of the ensemble 69 | model.initialize(n_trees, n_nodes) 70 | 71 | root_node = 0 72 | num_nodes = 0 73 | queue = list() 74 | with open(file_path, 'r') as f: 75 | for line in f: 76 | 77 | match_tree = tree_reg.match(line) 78 | if match_tree: 79 | assert(len(queue) == 0) 80 | curr_tree = int(match_tree.group(1)) 81 | root_node += num_nodes 82 | num_nodes = 0 83 | model.trees_root[curr_tree] = root_node 84 | model.trees_weight[curr_tree] = 1 85 | continue 86 | 87 | match_node = node_reg.search(line) 88 | if match_node: 89 | node_id = num_nodes + root_node 90 | feature_id = int(match_node.group(2).strip()) 91 | threshold = np.float32(match_node.group(3).strip()) 92 | 93 | # Needed because XGBoost use as split condition 94 | # < in place of <= 95 | threshold = np.nextafter( 96 | threshold, threshold - 1, 97 | dtype=model.trees_nodes_value.dtype) 98 | 99 | model.trees_nodes_feature[node_id] = feature_id 100 | model.trees_nodes_value[node_id] = threshold 101 | 102 | match_leaf = leaf_reg.search(line) 103 | if match_leaf: 104 | node_id = num_nodes + root_node 105 | leaf_value = float(match_leaf.group(2).strip()) 106 | model.trees_nodes_value[node_id] = leaf_value 107 | 108 | if match_node or match_leaf: 109 | num_nodes += 1 110 | if len(queue) > 0: 111 | parent_id, child = queue.pop() 112 | if child == 'L': 113 | model.trees_left_child[parent_id] = node_id 114 | else: 115 | model.trees_right_child[parent_id] = node_id 116 | 117 | if match_node: 118 | # two elements in the queue for the left and right children 119 | # Each element is identified by a node_id and the indication 120 | # of being the left or right child. 121 | queue.extend([(node_id, 'R'), (node_id, 'L')]) 122 | 123 | @staticmethod 124 | def save(file_path, model): 125 | """ 126 | Save the model onto the file identified by file_path. 127 | 128 | Parameters 129 | ---------- 130 | file_path : str 131 | The path to the filename where the model has to be saved 132 | model : RTEnsemble 133 | The model RTEnsemble model to save on file 134 | 135 | Returns 136 | ------- 137 | status : bool 138 | Returns true if the save is successful, false otherwise 139 | """ 140 | raise NotImplementedError("Feature not implemented!") 141 | 142 | @staticmethod 143 | def _count_nodes(file_path): 144 | """ 145 | Count the total number of nodes (both split and leaf nodes) 146 | in the model identified by file_path. 147 | 148 | Parameters 149 | ---------- 150 | file_path : str 151 | The path to the filename where the model has been saved 152 | 153 | Returns 154 | ------- 155 | tuple(n_trees, n_nodes) : tuple(int, int) 156 | The total number of trees and nodes (both split and leaf nodes) 157 | in the model identified by file_path. 
158 | """ 159 | 160 | n_nodes = 0 161 | n_trees = 0 162 | 163 | with open(file_path, 'r') as f: 164 | for line in f: 165 | 166 | match = tree_reg.match(line) 167 | if match: 168 | n_trees += 1 169 | continue 170 | 171 | match_node = node_reg.search(line) 172 | if match_node: 173 | n_nodes += 1 174 | 175 | match_leaf = leaf_reg.search(line) 176 | if match_leaf: 177 | n_nodes += 1 178 | 179 | return n_trees, n_nodes 180 | -------------------------------------------------------------------------------- /rankeval/model/proxy_Jforests.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | Class providing the implementation for loading/storing a QuickRank model 10 | from/to file. 11 | 12 | The Jforests project is described here: https://github.com/yasserg/jforests 13 | 14 | The Jforests format adopts an XML representation. There is an ensemble node, 15 | with a sub-node for each tree, identified by the "Tree" tag, followed by the 16 | description of the tree (with splitting and leaf nodes). The splitting nodes are 17 | described with two information: the feature-id used for splitting, and the 18 | threshold value. Leaf nodes on the other hand are described by a "LeafOutputs" 19 | tag with the value as content. 20 | """ 21 | 22 | from .rt_ensemble import RTEnsemble 23 | 24 | try: 25 | import xml.etree.cElementTree as etree 26 | except ImportError: 27 | import xml.etree.ElementTree as etree 28 | 29 | 30 | class ProxyJforests(object): 31 | """ 32 | Class providing the implementation for loading/storing a Jforests model 33 | from/to file. 34 | """ 35 | 36 | @staticmethod 37 | def load(file_path, model): 38 | """ 39 | Load the model from the file identified by file_path. 
40 | 41 | Parameters 42 | ---------- 43 | file_path : str 44 | The path to the filename where the model has been saved 45 | model : RTEnsemble 46 | The model instance to fill 47 | """ 48 | n_trees, n_nodes = ProxyJforests._count_nodes(file_path) 49 | # Initialize the model and allocate the needed space 50 | # given the shape and size of the ensemble 51 | model.initialize(n_trees, n_nodes) 52 | 53 | # get an iterable 54 | context = etree.iterparse(file_path, events=("start", "end")) 55 | 56 | # get the root element 57 | _, root = next(context) 58 | 59 | curr_tree = -1 60 | root_node = 0 61 | num_leaves = num_splits = 0 62 | 63 | for event, elem in context: 64 | 65 | if event == 'start' and elem.tag == 'Tree': 66 | curr_tree += 1 # increase the current number index 67 | root_node += num_leaves + num_splits 68 | # save the curr node as the root of a new tree 69 | model.trees_root[curr_tree] = root_node 70 | model.trees_weight[curr_tree] = elem.attrib['weight'] 71 | 72 | if event == 'end': 73 | 74 | if elem.tag == 'SplitFeatures': 75 | split_features = map(int, elem.text.split(" ")) 76 | num_splits = 0 77 | for pos, feature in enumerate(split_features): 78 | num_splits += 1 79 | model.trees_nodes_feature[root_node + pos] = feature 80 | elif elem.tag == 'LeftChildren': 81 | left_children = map(int, elem.text.split(" ")) 82 | for pos, child in enumerate(left_children): 83 | if child >= 0: 84 | model.trees_left_child[root_node + pos] = \ 85 | root_node + child 86 | else: 87 | model.trees_left_child[root_node + pos] = \ 88 | root_node + num_splits + abs(child) - 1 89 | elif elem.tag == 'RightChildren': 90 | right_children = map(int, elem.text.split(" ")) 91 | for pos, child in enumerate(right_children): 92 | if child >= 0: 93 | model.trees_right_child[root_node + pos] = \ 94 | root_node + child 95 | else: 96 | model.trees_right_child[root_node + pos] = \ 97 | root_node + num_splits + abs(child) - 1 98 | elif elem.tag == 'OriginalThresholds': 99 | thresholds = map(float, elem.text.split(" ")) 100 | for pos, threshold in enumerate(thresholds): 101 | model.trees_nodes_value[root_node + pos] = threshold 102 | elif elem.tag == 'LeafOutputs': 103 | leaf_values = map(float, elem.text.split(" ")) 104 | num_leaves = 0 105 | for pos, leaf_value in enumerate(leaf_values): 106 | num_leaves += 1 107 | model.trees_nodes_value[root_node + num_splits + pos] \ 108 | = leaf_value 109 | 110 | # clear the memory 111 | if event == 'end': 112 | elem.clear() # discard the element 113 | root.clear() # remove child reference from the root 114 | 115 | @staticmethod 116 | def save(file_path, model): 117 | """ 118 | Save the model onto the file identified by file_path. 119 | 120 | Parameters 121 | ---------- 122 | file_path : str 123 | The path to the filename where the model has to be saved 124 | model : RTEnsemble 125 | The model RTEnsemble model to save on file 126 | 127 | Returns 128 | ------- 129 | status : bool 130 | Returns true if the save is successful, false otherwise 131 | """ 132 | raise NotImplementedError("Feature not implemented!") 133 | 134 | @staticmethod 135 | def _count_nodes(file_path): 136 | """ 137 | Count the total number of nodes (both split and leaf nodes) 138 | in the model identified by file_path. 
139 | 140 | Parameters 141 | ---------- 142 | file_path : str 143 | The path to the filename where the model has been saved 144 | 145 | Returns 146 | ------- 147 | tuple(n_trees, n_nodes) : tuple(int, int) 148 | The total number of trees and nodes (both split and leaf nodes) 149 | in the model identified by file_path. 150 | """ 151 | # get an iterable 152 | # NOTE: it seems like there is a bug inside lxml: when selecting only 153 | # terminal tags with events=("end",), some tags are skipped... 154 | context = etree.iterparse(file_path, events=("start", "end")) 155 | 156 | # get the root element 157 | _, root = next(context) 158 | 159 | n_nodes = 0 160 | n_trees = 0 161 | for event, elem in context: 162 | if event != "end": 163 | continue 164 | if elem.tag == 'Tree': 165 | n_trees += 1 166 | elif elem.tag == 'SplitFeatures' or elem.tag == 'LeafOutputs': 167 | n_nodes += len(elem.text.split(" ")) 168 | 169 | elem.clear() # discard the element 170 | root.clear() # remove root reference to the child 171 | 172 | return n_trees, n_nodes 173 | --------------------------------------------------------------------------------
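
The child-index remapping performed by `ProxyJforests.load` above (non-negative children point to other split nodes, negative children point to leaves, with split nodes stored before the leaves of the same tree) can be summarised by the following standalone sketch. It is illustrative only: the function name is hypothetical, and `root_node`/`num_splits` simply mirror the variables used in the loader.

```python
def jforests_child_to_offset(child, root_node, num_splits):
    """Map a Jforests child index onto the flat node arrays used by RankEval.

    Non-negative values reference another split node of the same tree,
    while a negative value -k references the k-th leaf; split nodes are
    stored first, followed by the leaves.
    """
    if child >= 0:
        return root_node + child                    # another split node
    return root_node + num_splits + abs(child) - 1  # a leaf slot

# Example: a tree stored at offset 100 with 3 split nodes keeps its splits
# at positions 100-102 and its leaves starting at position 103.
assert jforests_child_to_offset(2, root_node=100, num_splits=3) == 102
assert jforests_child_to_offset(-2, root_node=100, num_splits=3) == 104
```
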