├── rankeval ├── VERSION ├── test │ ├── dataset │ │ ├── __init__.py │ │ ├── test_dataset.py │ │ └── test_svmlight_format.py │ ├── metrics │ │ └── __init__.py │ ├── model │ │ ├── __init__.py │ │ ├── test_proxy_model.py │ │ ├── test_proxy_ScikitLearn.py │ │ ├── test_proxy_LightGBM.py │ │ ├── test_proxy_XGBoost.py │ │ ├── test_proxy_CatBoost.py │ │ ├── test_proxy_Jforests.py │ │ └── test_proxy_QuickRank.py │ ├── scoring │ │ ├── __init__.py │ │ └── test_scoring.py │ ├── analysis │ │ ├── __init__.py │ │ ├── test_statistical.py │ │ └── test_feature.py │ ├── __init__.py │ ├── data │ │ ├── svmlight_invalid.txt │ │ ├── CatBoost.model.coreml │ │ ├── svmlight_classification.txt │ │ ├── svmlight_classification_qid.txt │ │ ├── ScikitLearn.model.txt │ │ ├── XGBoost.model.txt │ │ ├── Jforests.model.xml │ │ ├── quickrank.model.v2.xml │ │ ├── quickrank.model.xml │ │ └── LightGBM.model.txt │ └── base.py ├── visualization │ ├── __init__.py │ └── feature.py ├── scoring │ ├── __init__.py │ ├── _efficient_scoring.pyx │ └── scorer.py ├── dataset │ ├── __init__.py │ ├── dataset_container.py │ └── write_json_dataset_catalogue.py ├── analysis │ ├── __init__.py │ ├── _efficient_feature_impl.h │ ├── _efficient_feature.pyx │ ├── _efficient_topological.pyx │ ├── topological.py │ └── _efficient_feature_impl.cpp ├── __init__.py ├── model │ ├── __init__.py │ ├── proxy_CatBoost.py │ ├── proxy_XGBoost.py │ └── proxy_Jforests.py └── metrics │ ├── __init__.py │ ├── spearman_rho.py │ ├── rmse.py │ ├── kendall_tau.py │ ├── mse.py │ ├── err.py │ ├── precision_max.py │ ├── dcg.py │ ├── precision.py │ ├── mrr.py │ ├── metric.py │ ├── recall.py │ ├── pfound.py │ ├── map.py │ ├── rbp.py │ └── ndcg.py ├── doc ├── banner.png ├── src │ ├── rankeval.rst │ ├── rankeval.scoring.rst │ ├── Makefile │ ├── rankeval.visualization.rst │ ├── make.bat │ ├── rankeval.analysis.rst │ ├── index.rst │ ├── static-index.rst │ ├── rankeval.dataset.rst │ ├── rankeval.model.rst │ ├── rankeval.metrics.rst │ └── conf.py └── Makefile ├── .gitattributes ├── MANIFEST.in ├── Makefile ├── .gitignore ├── AUTHORS.md ├── CONTRIBUTING.md ├── .travis.yml └── README.md /rankeval/VERSION: -------------------------------------------------------------------------------- 1 | 0.8.2 -------------------------------------------------------------------------------- /rankeval/test/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rankeval/test/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rankeval/test/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rankeval/test/scoring/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rankeval/test/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rankeval/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
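The tree above shows the package layout: dataset loading, model proxies for several learning-to-rank tools, scoring, metrics, analysis and visualization. For orientation, the following sketch strings together the calls exercised by the test suite further down (`RTEnsemble`, `Dataset.load`, `Scorer`, `NDCG`); the dataset path is a placeholder, since the MSN fold referenced by the tests is not shipped under `rankeval/test/data`, and any SVMLight-format learning-to-rank file can be used in its place.

```python
# Minimal end-to-end sketch assembled from the calls used in the tests below
# (test_scoring.py, test_statistical.py). The dataset path is a placeholder:
# any learning-to-rank file in SVMLight format can be used instead.
import os

from rankeval.dataset import Dataset
from rankeval.metrics import NDCG
from rankeval.model import RTEnsemble
from rankeval.scoring import Scorer

data_dir = os.path.join("rankeval", "test", "data")

# Load a QuickRank model shipped with the tests and an SVMLight dataset
model = RTEnsemble(os.path.join(data_dir, "quickrank.model.xml"),
                   format="QuickRank")
dataset = Dataset.load("msn1.fold1.test.5k.txt", format="svmlight")

# Score every document in the dataset with the model
scorer = Scorer(model, dataset)
scorer.score(detailed=False)
y_pred = scorer.get_predicted_scores()

# Evaluate the predictions: Metric.eval returns the average score
# and the per-query scores
avg_ndcg, per_query_ndcg = NDCG().eval(dataset, y_pred)
print(avg_ndcg)
```

The scoring tests shown later exercise exactly these calls, including the per-tree partial scores obtained with `scorer.score(detailed=True)`.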
/rankeval/test/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | -------------------------------------------------------------------------------- /doc/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpclab/rankeval/HEAD/doc/banner.png -------------------------------------------------------------------------------- /rankeval/test/data/svmlight_invalid.txt: -------------------------------------------------------------------------------- 1 | python 2:2.5 10:-5.2 15:1.5 2 | 2.0 5:1.0 12:-3 3 | 3.0 20:27 4 | -------------------------------------------------------------------------------- /rankeval/test/data/CatBoost.model.coreml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpclab/rankeval/HEAD/rankeval/test/data/CatBoost.model.coreml -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | doc/* linguist-documentation 2 | notebooks/*.ipynb linguist-detectable=false 3 | Makefile linguist-detectable=false 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include AUTHORS.md 4 | include VERSION 5 | recursive-include doc * 6 | recursive-include rankeval * 7 | -------------------------------------------------------------------------------- /rankeval/test/data/svmlight_classification.txt: -------------------------------------------------------------------------------- 1 | # comment 2 | 1.0 2:2.5 10:-5.2 15:1.5 # an inline comment 3 | 2.0 5:1.0 12:-3 4 | # another comment 5 | 3.0 20:27 6 | -------------------------------------------------------------------------------- /rankeval/test/data/svmlight_classification_qid.txt: -------------------------------------------------------------------------------- 1 | # comment 2 | 1.0 qid:1 2:2.5 10:-5.2 15:0 # an inline comment 3 | 2.0 qid:37 5:1.0 12:-3 33:0.7 4 | 0.0 qid:37 6:7 8:9 10:11 5 | # another comment 6 | 3.0 qid:12 20:27 22:30 23:40 7 | -------------------------------------------------------------------------------- /doc/src/rankeval.rst: -------------------------------------------------------------------------------- 1 | rankeval package 2 | ================ 3 | 4 | .. automodule:: rankeval 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. 
toctree:: 13 | 14 | rankeval.analysis 15 | rankeval.dataset 16 | rankeval.metrics 17 | rankeval.model 18 | rankeval.scoring 19 | rankeval.visualization 20 | 21 | -------------------------------------------------------------------------------- /rankeval/test/data/ScikitLearn.model.txt: -------------------------------------------------------------------------------- 1 | base_score=0.606000 2 | learning_rate=0.100000 3 | booster[0] [f54 f133]: 4 | 0:[f54<=0.005769] 5 | 1:leaf=-0.226426222524 6 | 2:[f133<=0.500000] 7 | 3:leaf=0.187204691788 8 | 4:leaf=0.905627900778 9 | booster[1] [f14 f52]: 10 | 0:[f52<=0.044467] 11 | 1:[f14<=28.500000] 12 | 3:leaf=0.413148111893 13 | 4:leaf=-0.203144770188 14 | 2:leaf=0.269475234496 15 | -------------------------------------------------------------------------------- /doc/src/rankeval.scoring.rst: -------------------------------------------------------------------------------- 1 | rankeval.scoring package 2 | ======================== 3 | 4 | .. automodule:: rankeval.scoring 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | rankeval.scoring.scorer module 13 | ------------------------------ 14 | 15 | .. automodule:: rankeval.scoring.scorer 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | 21 | -------------------------------------------------------------------------------- /rankeval/test/data/XGBoost.model.txt: -------------------------------------------------------------------------------- 1 | booster[0]: 2 | 0:[f52<0.0444665] yes=1,no=2,missing=1 3 | 1:[f14<26.5] yes=3,no=4,missing=3 4 | 3:leaf=0.0330693 5 | 4:leaf=-0.0274553 6 | 2:[f17<23.1987] yes=5,no=6,missing=5 7 | 5:leaf=0.0289187 8 | 6:leaf=0.067713 9 | booster[1]: 10 | 0:[f54<0.0035545] yes=1,no=2,missing=1 11 | 1:[f10<209.5] yes=3,no=4,missing=3 12 | 3:leaf=-0.0018294 13 | 4:leaf=-0.0425189 14 | 2:[f52<0.0444665] yes=5,no=6,missing=5 15 | 5:leaf=0.00209278 16 | 6:leaf=0.0306171 17 | -------------------------------------------------------------------------------- /rankeval/test/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """Generic method useful for testing""" 9 | 10 | import os 11 | 12 | test_dir = os.path.dirname(os.path.abspath(__file__)) 13 | data_dir = os.path.join(test_dir, "data") -------------------------------------------------------------------------------- /rankeval/scoring/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | The :mod:`rankeval.scoring` module includes utilities to score a model on a given dataset. 
10 | """ 11 | 12 | from .scorer import Scorer 13 | 14 | __all__ = ['Scorer'] 15 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYTHON ?= python 2 | NOSETESTS ?= nosetests 3 | 4 | clean: 5 | $(PYTHON) setup.py clean 6 | rm -rf dist 7 | rm -rf rankeval.egg-info 8 | rm -rf build 9 | rm -rf .ipynb_checkpoints 10 | rm -rf .eggs 11 | find . -name "*.so" -delete 12 | find . -name "*.pyc" -delete 13 | find . -name "*.egg" -delete 14 | 15 | test: 16 | $(NOSETESTS) 17 | 18 | ### Handling Sphinx for generating documentation 19 | .PHONY: doc 20 | doc: 21 | @echo "===================================" 22 | @echo "Producing documentation..." 23 | 24 | make -C doc doc 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *.pyd 4 | *~ 5 | .#* 6 | *.lprof 7 | *.swp 8 | *.swo 9 | .DS_Store 10 | 11 | # ignored dirs 12 | build 13 | dist 14 | *.egg-info 15 | .out-of-repo/ 16 | 17 | # ignored files 18 | distribute-* 19 | *eggs 20 | 21 | # ipython files 22 | .ipynb_checkpoints 23 | /*.ipynb 24 | 25 | *.prefs 26 | .pydevproject 27 | .idea 28 | 29 | # doc dirs and auto generated files 30 | doc/src/_build 31 | 32 | # auto compiled files 33 | rankeval/scoring/_efficient_scoring.c 34 | rankeval/analysis/_efficient_topological.c 35 | rankeval/analysis/_efficient_feature.cpp 36 | -------------------------------------------------------------------------------- /rankeval/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | The :mod:`rankeval.dataset` module includes utilities to load datasets 10 | and dump datasets according to several supported formats. 11 | """ 12 | 13 | from .dataset import Dataset 14 | from .dataset_container import DatasetContainer 15 | 16 | __all__ = ['Dataset', 17 | 'DatasetContainer'] -------------------------------------------------------------------------------- /doc/src/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = RankEval 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /rankeval/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | The :mod:`rankeval.analysis` module implements the functionalities for analysing the 10 | behaviour of several ranking models with respect to several metrics and 11 | datasets. It proposes a comprehensive set of analysis for tuning, evaluating 12 | and comparing Gradient Boosted Regression Tree models devoted to learning a 13 | ranking function. 14 | """ -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | History 2 | ------- 3 | 4 | This project was started in 2017 as a demo presented at the 40th International 5 | ACM SIGIR Conference on Research and Development in Information Retrieval in Tokyo (Japan). 6 | 7 | People 8 | ------ 9 | 10 | The following people have been core contributors to rankeval's development and maintenance: 11 | 12 | * Claudio Lucchese - HPC-ISTI researcher - Pisa (Italy) 13 | * Franco Maria Nardini - HPC-ISTI researcher - Pisa (Italy) 14 | * Cristina Muntean - HPC-ISTI researcher - Pisa (Italy) 15 | * Salvatore Trani - HPC-ISTI researcher - Pisa (Italy) 16 | 17 | Please do not email the authors directly to ask for assistance or report issues. 18 | Instead, please use GitHub issues or email to `rankeval@isti.cnr.it` for requests 19 | and information. -------------------------------------------------------------------------------- /rankeval/dataset/dataset_container.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Franco Maria Nardini 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | class DatasetContainer(object): 9 | """ 10 | This class is a container used to easily manage a dataset and associated 11 | learning to rank models trained by using it. It also offers the possibility 12 | to store the license coming with public dataset. 13 | """ 14 | train_dataset = None 15 | validation_dataset = None 16 | test_dataset = None 17 | 18 | license_agreement = '' 19 | 20 | model_filenames = None -------------------------------------------------------------------------------- /rankeval/test/model/test_proxy_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from rankeval.model import RTEnsemble 6 | from rankeval.test.base import data_dir 7 | 8 | model_file = os.path.join(data_dir, "quickrank.model.xml") 9 | 10 | 11 | class ProxyModelTestCase(unittest.TestCase): 12 | 13 | def test_not_supported_model(self): 14 | try: 15 | RTEnsemble(model_file, format="unsupported") 16 | # if we reach the code below, it means the constructor 17 | # has not failed...raise error! 
18 | assert False 19 | except TypeError: 20 | pass 21 | 22 | 23 | if __name__ == '__main__': 24 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 25 | level=logging.DEBUG) 26 | unittest.main() 27 | -------------------------------------------------------------------------------- /rankeval/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package serves as the root of the RankEval package. 3 | """ 4 | 5 | import os 6 | import io 7 | 8 | cur_dir = os.path.dirname(__file__) 9 | 10 | __version__ = io.open(os.path.join(cur_dir, 'VERSION'), 11 | encoding='utf-8').read().strip() 12 | 13 | 14 | def is_notebook(): 15 | try: 16 | from IPython import get_ipython 17 | shell = get_ipython().__class__.__name__ 18 | if shell == 'ZMQInteractiveShell': 19 | return True # Jupyter notebook or qtconsole 20 | elif shell == 'TerminalInteractiveShell': 21 | return False # Terminal running IPython 22 | else: 23 | return False # Other type (?) 24 | except (NameError, ImportError): 25 | return False # Probably standard Python interpreter 26 | -------------------------------------------------------------------------------- /doc/src/rankeval.visualization.rst: -------------------------------------------------------------------------------- 1 | rankeval.visualization package 2 | ============================== 3 | 4 | .. automodule:: rankeval.visualization 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | rankeval.visualization.effectiveness module 13 | ------------------------------------------- 14 | 15 | .. automodule:: rankeval.visualization.effectiveness 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | rankeval.visualization.feature module 21 | ------------------------------------- 22 | 23 | .. automodule:: rankeval.visualization.feature 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | rankeval.visualization.topological module 29 | ----------------------------------------- 30 | 31 | .. automodule:: rankeval.visualization.topological 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | 37 | -------------------------------------------------------------------------------- /rankeval/test/dataset/test_dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from rankeval.dataset import Dataset 6 | from ..base import data_dir 7 | 8 | datafile = os.path.join(data_dir, "msn1.fold1.test.5k.txt") 9 | 10 | 11 | class SVMLightLoaderTestCase(unittest.TestCase): 12 | 13 | def test_svmlight_dataset(self): 14 | try: 15 | dataset = Dataset.load(datafile, format="svmlight") 16 | except TypeError: 17 | assert False 18 | 19 | def test_not_supported_dataset(self): 20 | try: 21 | Dataset.load(datafile, format="unsupported") 22 | # if we reach the code below, it means the constructor has not failed...raise error! 
23 | assert False 24 | except TypeError: 25 | pass 26 | 27 | if __name__ == '__main__': 28 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 29 | unittest.main() 30 | -------------------------------------------------------------------------------- /doc/src/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=RankEval 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /rankeval/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | The :mod:`rankeval.model` module includes utilities to load a model 10 | and dump it according to several supported model's format. 11 | """ 12 | 13 | from .proxy_LightGBM import ProxyLightGBM 14 | from .proxy_QuickRank import ProxyQuickRank 15 | from .proxy_ScikitLearn import ProxyScikitLearn 16 | from .proxy_XGBoost import ProxyXGBoost 17 | from .proxy_Jforests import ProxyJforests 18 | from .proxy_CatBoost import ProxyCatBoost 19 | from .rt_ensemble import RTEnsemble 20 | 21 | __all__ = ['RTEnsemble', 22 | 'ProxyQuickRank', 23 | 'ProxyLightGBM', 24 | 'ProxyXGBoost', 25 | 'ProxyScikitLearn', 26 | 'ProxyJforests', 27 | 'ProxyCatBoost' 28 | ] 29 | -------------------------------------------------------------------------------- /doc/src/rankeval.analysis.rst: -------------------------------------------------------------------------------- 1 | rankeval.analysis package 2 | ========================= 3 | 4 | .. automodule:: rankeval.analysis 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | rankeval.analysis.effectiveness module 13 | -------------------------------------- 14 | 15 | .. automodule:: rankeval.analysis.effectiveness 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | rankeval.analysis.feature module 21 | -------------------------------- 22 | 23 | .. automodule:: rankeval.analysis.feature 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | rankeval.analysis.statistical module 29 | ------------------------------------ 30 | 31 | .. 
automodule:: rankeval.analysis.statistical 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | rankeval.analysis.topological module 37 | ------------------------------------ 38 | 39 | .. automodule:: rankeval.analysis.topological 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | 45 | -------------------------------------------------------------------------------- /doc/src/index.rst: -------------------------------------------------------------------------------- 1 | RankEval -- Analysis and evaluation of Learning-to-Rank models 2 | ============================================================== 3 | 4 | RankEval is a Python library for the *analysis* and *evaluation* of Learning-to-Rank models 5 | based on ensembles of regression trees. 6 | Target audience includes the *machine learning* (ML) and *information retrieval* (IR) communities. 7 | 8 | Citing RankEval 9 | --------------- 10 | 11 | Please cite:: 12 | 13 | @inproceedings{rankeval-sigir17, 14 | author = {Claudio Lucchese and Cristina Ioana Muntean and Franco Maria Nardini and 15 | Raffaele Perego and Salvatore Trani}, 16 | title = {RankEval: An Evaluation and Analysis Framework for Learning-to-Rank Solutions}, 17 | booktitle = {SIGIR 2017: Proceedings of the 40th International {ACM} {SIGIR} 18 | Conference on Research and Development in Information Retrieval}, 19 | year = {2017}, 20 | location = {Tokyo, Japan} 21 | } 22 | 23 | 24 | .. toctree:: 25 | :maxdepth: 1 26 | :caption: Contents: 27 | 28 | rankeval 29 | 30 | 31 | Indices and tables 32 | ================== 33 | 34 | * :ref:`genindex` 35 | * :ref:`modindex` 36 | * :ref:`search` 37 | -------------------------------------------------------------------------------- /doc/src/static-index.rst: -------------------------------------------------------------------------------- 1 | RankEval -- Analysis and evaluation of Learning-to-Rank models 2 | ============================================================== 3 | 4 | RankEval is a Python library for the *analysis* and *evaluation* of Learning-to-Rank models 5 | based on ensembles of regression trees. 6 | Target audience includes the *machine learning* (ML) and *information retrieval* (IR) communities. 7 | 8 | Citing RankEval 9 | --------------- 10 | 11 | Please cite:: 12 | 13 | @inproceedings{rankeval-sigir17, 14 | author = {Claudio Lucchese and Cristina Ioana Muntean and Franco Maria Nardini and 15 | Raffaele Perego and Salvatore Trani}, 16 | title = {RankEval: An Evaluation and Analysis Framework for Learning-to-Rank Solutions}, 17 | booktitle = {SIGIR 2017: Proceedings of the 40th International {ACM} {SIGIR} 18 | Conference on Research and Development in Information Retrieval}, 19 | year = {2017}, 20 | location = {Tokyo, Japan} 21 | } 22 | 23 | 24 | .. 
toctree:: 25 | :maxdepth: 1 26 | :caption: Contents: 27 | 28 | rankeval 29 | 30 | 31 | Indices and tables 32 | ================== 33 | 34 | * :ref:`genindex` 35 | * :ref:`modindex` 36 | * :ref:`search` 37 | -------------------------------------------------------------------------------- /rankeval/test/data/Jforests.model.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 129 129 107 72 55 54 4 | 1 3 5 -1 -3 -2 5 | 2 4 -4 -5 -6 -7 6 | 14884 14826 11171 485 320 1214 7 | 268.00791914235936 265.01449371350714 13.917409408136482 19.112358794482084 0.009766221082829762 0.018525300616492706 8 | -1.2156555533251343 -0.2370371246276912 -1.9329095710207922 0.8030836898094491 -0.010194310883442196 -1.939557007673878 0.584062922565639 9 | 10 | 11 | 133 72 105 130 62 121 12 | 1 4 -3 -4 -1 -6 13 | -2 2 3 -5 5 -7 14 | 0 543 7553 15359 10922 31960 15 | 0.0 21.397960464750046 13.263678873222243 181.01421193338518 0.3333333333333333 -1.5976164200695848 16 | -0.14431346869079953 1.3819800608689703 1.770799198202688 1.7353641847897887 0.22409245535128064 -0.3769555284258838 -1.793789588633319 17 | 18 | 19 | -------------------------------------------------------------------------------- /rankeval/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | The :mod:`rankeval.metrics` module includes the definition and implementation of 10 | the most common metrics adopted in the Learning to Rank community. 11 | """ 12 | 13 | from .metric import Metric 14 | from .precision import Precision 15 | from .recall import Recall 16 | from .ndcg import NDCG 17 | from .dcg import DCG 18 | from .err import ERR 19 | from .kendall_tau import Kendalltau 20 | from .map import MAP 21 | from .mrr import MRR 22 | from .pfound import Pfound 23 | from .rbp import RBP 24 | from .mse import MSE 25 | from .rmse import RMSE 26 | from .spearman_rho import SpearmanRho 27 | 28 | __all__ = ['Metric', 29 | 'Precision', 30 | 'Recall', 31 | 'NDCG', 32 | 'DCG', 33 | 'ERR', 34 | 'Kendalltau', 35 | 'MAP', 36 | 'MRR', 37 | 'Pfound', 38 | 'RBP', 39 | 'MSE', 40 | 'RMSE', 41 | 'SpearmanRho'] 42 | -------------------------------------------------------------------------------- /doc/src/rankeval.dataset.rst: -------------------------------------------------------------------------------- 1 | rankeval.dataset package 2 | ======================== 3 | 4 | .. automodule:: rankeval.dataset 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | rankeval.dataset.dataset module 13 | ------------------------------- 14 | 15 | .. automodule:: rankeval.dataset.dataset 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | rankeval.dataset.dataset\_container module 21 | ------------------------------------------ 22 | 23 | .. automodule:: rankeval.dataset.dataset_container 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | rankeval.dataset.datasets\_fetcher module 29 | ----------------------------------------- 30 | 31 | .. 
automodule:: rankeval.dataset.datasets_fetcher 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | rankeval.dataset.svmlight\_format module 37 | ---------------------------------------- 38 | 39 | .. automodule:: rankeval.dataset.svmlight_format 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | rankeval.dataset.write\_json\_dataset\_catalogue module 45 | ------------------------------------------------------- 46 | 47 | .. automodule:: rankeval.dataset.write_json_dataset_catalogue 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | 52 | 53 | -------------------------------------------------------------------------------- /rankeval/test/data/quickrank.model.v2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | MART 4 | 2 5 | 3 6 | 0.10000000149011612 7 | 1 8 | 0 9 | 100 10 | 11 | 12 | 13 | 14 | 108 15 | 14.895151138305664 16 | 17 | 115 18 | -8.0245580673217773 19 | 20 | 0.3412887828162291 21 | 22 | 23 | 0.66845277963831218 24 | 25 | 26 | 27 | 0.96317280453257792 28 | 29 | 30 | 31 | 32 | 33 | 8 34 | 0.66666698455810547 35 | 36 | 0.5 37 | 38 | 39 | 106 40 | 17.0 41 | 42 | 0.5 43 | 44 | 45 | 1.0 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /rankeval/test/data/quickrank.model.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | MART 4 | 2 5 | 3 6 | 0.10000000149011612 7 | 1 8 | 0 9 | 100 10 | 11 | 12 | 13 | 14 | 108 15 | 14.895151138305664 16 | 17 | 115 18 | -8.0245580673217773 19 | 20 | 0.3412887828162291 21 | 22 | 23 | 0.66845277963831218 24 | 25 | 26 | 27 | 0.96317280453257792 28 | 29 | 30 | 31 | 32 | 33 | 8 34 | 0.66666698455810547 35 | 36 | 0.37133907932286642 37 | 38 | 39 | 106 40 | 17.135160446166992 41 | 42 | 0.54762687170967062 43 | 44 | 45 | 0.98651670670179537 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Steps for contributing 2 | 3 | Fixing a bug you found in RankEval? Suggesting a feature? Listed here are some guidelines to keep in mind when contributing. 4 | 5 | 1. **Open an issue** along with a detailed explanation. For bug reports, include the code to reproduce the bug. For feature requests, explain why you think the feature could be useful. 6 | 7 | 2. **Fork the repository**. If you're contributing code, clone the forked repository to your local machine. 8 | 9 | 3. **Run the tests** to make sure they pass on your machine. Simply run `nosetests` at the root folder and make sure all tests pass. 10 | 11 | 4. **Create a new branch**. Please do not commit directly to the master branch. Create your own branch and place your additions there. 12 | 13 | 5. **Write your code**. For Python, please follow PEP8 coding standards. Also, if you're adding a function, you must [write a docstring using the Numpy format](http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_numpy.html#example-numpy) detailing the API of your function. Take a look at the docstrings of the other RankEval functions to get an idea of what the docstring of yours should look like. 14 | 15 | 6. **Write/modify the corresponding unit tests**. After adding in your code and the corresponding unit tests, run `nosetests` again to make sure they pass. 16 | 17 | 7. **Submit a pull request**. 
After submitting a PR, if all tests pass, your code will be reviewed and merged promptly. 18 | 19 | Thank you for taking the time to make RankEval better! -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | DOCDIR=./src 2 | SRCDIR=../rankeval 3 | 4 | # documentation is compiled by using sphinx 5 | # excluded from documentation 6 | DOCEXCLUDED=../rankeval/test 7 | 8 | ### Handling Sphinx for generating documentation 9 | .PHONY: doc 10 | doc: 11 | @echo "===================================" 12 | @echo "Producing documentation..." 13 | 14 | # generate sphinx data 15 | sphinx-apidoc -o $(DOCDIR) -d 1 -f -F -M -H "RankEval" -A "HPC Lab" -V 0 -R 0.00 $(SRCDIR) $(DOCEXCLUDED) 16 | @cp $(DOCDIR)/static-index.rst $(DOCDIR)/index.rst 17 | 18 | # customize sphinx generation 19 | @echo "# custom" >> $(DOCDIR)/conf.py 20 | @echo "extensions += ['sphinx.ext.todo']" >> $(DOCDIR)/conf.py 21 | @echo "todo_include_todos = True" >> $(DOCDIR)/conf.py 22 | #@echo "extensions += ['numpydoc']" >> $(DOCDIR)/conf.py 23 | #@echo "extensions += ['sphinxcontrib.napoleon']" >> doc/conf.py 24 | @echo "extensions += ['sphinx.ext.autosummary']" >> $(DOCDIR)/conf.py 25 | @echo "extensions += ['sphinx.ext.imgmath']" >> $(DOCDIR)/conf.py 26 | @echo "numpydoc_show_class_members = False" >> $(DOCDIR)/conf.py 27 | # customize themes 28 | @echo "html_theme = \"sphinx_rtd_theme\"" >> $(DOCDIR)/conf.py 29 | 30 | @echo "import sys,os" >> $(DOCDIR)/conf.py 31 | @echo "sys.path.insert(0, os.path.abspath('../../') )" >> $(DOCDIR)/conf.py 32 | #@echo "print (sys.path)" >> $(DOCDIR)/conf.py 33 | 34 | @echo "from setuptools import sandbox" >> $(DOCDIR)/conf.py 35 | @echo "sandbox.run_setup(os.path.abspath('../../setup.py'), ['build_ext','-i'])" >> $(DOCDIR)/conf.py 36 | 37 | @echo "autoclass_content = 'both'" >> $(DOCDIR)/conf.py 38 | 39 | # compile HTML files 40 | make -C $(DOCDIR) html 41 | -------------------------------------------------------------------------------- /doc/src/rankeval.model.rst: -------------------------------------------------------------------------------- 1 | rankeval.model package 2 | ====================== 3 | 4 | .. automodule:: rankeval.model 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | rankeval.model.proxy\_CatBoost module 13 | ------------------------------------- 14 | 15 | .. automodule:: rankeval.model.proxy_CatBoost 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | rankeval.model.proxy\_Jforests module 21 | ------------------------------------- 22 | 23 | .. automodule:: rankeval.model.proxy_Jforests 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | rankeval.model.proxy\_LightGBM module 29 | ------------------------------------- 30 | 31 | .. automodule:: rankeval.model.proxy_LightGBM 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | rankeval.model.proxy\_QuickRank module 37 | -------------------------------------- 38 | 39 | .. automodule:: rankeval.model.proxy_QuickRank 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | rankeval.model.proxy\_ScikitLearn module 45 | ---------------------------------------- 46 | 47 | .. automodule:: rankeval.model.proxy_ScikitLearn 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | 52 | rankeval.model.proxy\_XGBoost module 53 | ------------------------------------ 54 | 55 | .. 
automodule:: rankeval.model.proxy_XGBoost 56 | :members: 57 | :undoc-members: 58 | :show-inheritance: 59 | 60 | rankeval.model.rt\_ensemble module 61 | ---------------------------------- 62 | 63 | .. automodule:: rankeval.model.rt_ensemble 64 | :members: 65 | :undoc-members: 66 | :show-inheritance: 67 | 68 | 69 | -------------------------------------------------------------------------------- /rankeval/analysis/_efficient_feature_impl.h: -------------------------------------------------------------------------------- 1 | class TreeNode { 2 | public: 3 | unsigned int node_id; 4 | unsigned int start_id; 5 | unsigned int end_id; 6 | 7 | TreeNode(unsigned int node_id, 8 | unsigned int start_id, 9 | unsigned int end_id) : 10 | node_id(node_id), 11 | start_id(start_id), 12 | end_id(end_id) {} 13 | 14 | int get_n_instances() { 15 | return end_id - start_id + 1; 16 | } 17 | }; 18 | 19 | void c_feature_importance( 20 | const float* X, 21 | const float* y, 22 | const int* trees_root, 23 | const float* trees_weight, 24 | const short* trees_nodes_feature, 25 | const float* trees_nodes_value, 26 | const int* trees_left_child, 27 | const int* trees_right_child, 28 | float* feature_imp, 29 | short* feature_count, 30 | const int n_instances, 31 | const int n_features, 32 | const int n_trees); 33 | 34 | void c_feature_importance_tree( 35 | const float* X, 36 | const float* y, 37 | const int* trees_root, 38 | const float* trees_weight, 39 | const short* trees_nodes_feature, 40 | const float* trees_nodes_value, 41 | const int* trees_left_child, 42 | const int* trees_right_child, 43 | const int tree_id, 44 | float* feature_imp, 45 | short* feature_count, 46 | const int n_instances, 47 | const int n_features, 48 | float* y_pred, 49 | float* y_pred_tree); 50 | 51 | inline bool is_leaf_node(int node_id, 52 | const int* trees_left_child, 53 | const int* trees_right_child) { 54 | return trees_left_child[node_id] == -1 && trees_right_child[node_id] == -1; 55 | } -------------------------------------------------------------------------------- /rankeval/test/analysis/test_statistical.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | import numpy as np 6 | from numpy.testing import assert_allclose 7 | 8 | from rankeval.analysis.statistical import _randomization 9 | from rankeval.analysis.statistical import statistical_significance 10 | from rankeval.dataset import Dataset 11 | from rankeval.metrics.ndcg import NDCG 12 | from rankeval.model import RTEnsemble 13 | from ..base import data_dir 14 | 15 | 16 | class StatisticalSignificanceTestCase(unittest.TestCase): 17 | 18 | def setUp(self): 19 | self.model_a = RTEnsemble( 20 | os.path.join(data_dir, "quickrank.model.xml"), format="QuickRank") 21 | self.model_b = RTEnsemble( 22 | os.path.join(data_dir, "quickrank.model.v2.xml"), format="QuickRank") 23 | self.dataset = Dataset.load( 24 | os.path.join(data_dir, "msn1.fold1.test.5k.txt"), format="svmlight") 25 | self.metric = NDCG() 26 | 27 | def tearDown(self): 28 | del self.model_a 29 | self.model_a = None 30 | del self.model_b 31 | self.model_b = None 32 | del self.dataset 33 | self.dataset = None 34 | del self.metric 35 | self.metric = None 36 | 37 | def test_statistical_significance(self): 38 | statistical_significance([self.dataset], self.model_a, self.model_b, 39 | [self.metric], n_perm=100) 40 | 41 | def test_randomization(self): 42 | A = np.array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0]) 43 | B = np.array([0, 0, 0, 0, 0, 0, 0, 1, 
1, 1]) 44 | n_perm = 20000 45 | p1, p2 = _randomization( A, B, n_perm) 46 | # expected value computed with https://github.com/searchivarius/PermTest 47 | assert_allclose(p2, .34, atol=0.02) 48 | 49 | 50 | if __name__ == '__main__': 51 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 52 | unittest.main() 53 | -------------------------------------------------------------------------------- /rankeval/metrics/spearman_rho.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import scipy.stats as stats 9 | from rankeval.metrics.metric import Metric 10 | 11 | 12 | class SpearmanRho(Metric): 13 | """ 14 | This class implements Spearman's Rho. 15 | We use the Spearman Rho coefficient implementation from scipy. 16 | 17 | """ 18 | 19 | def __init__(self, name='Rho'): 20 | """ 21 | This is the constructor of Spearman Rho, an object of type Metric, with 22 | the name Rho. The constructor also allows setting custom values in the 23 | following parameters. 24 | 25 | Parameters 26 | ---------- 27 | name: string 28 | Rho 29 | """ 30 | super(SpearmanRho, self).__init__(name) 31 | 32 | def eval(self, dataset, y_pred): 33 | """ 34 | This method computes the Spearman Rho score over the entire dataset 35 | and the detailed scores per query. It calls the eval_per_query method 36 | for each query in order to get the detailed Spearman Rho score. 37 | 38 | Parameters 39 | ---------- 40 | dataset : Dataset 41 | Represents the Dataset object on which to apply Spearman Rho. 42 | y_pred : numpy 1d array of float 43 | Represents the predicted document scores for each instance in the 44 | dataset. 45 | 46 | Returns 47 | ------- 48 | avg_score: float 49 | The overall Spearman Rho score (averages over the detailed scores). 50 | detailed_scores: numpy 1d array of floats 51 | The detailed Spearman Rho scores for each query, an array of length 52 | of the number of queries. 53 | """ 54 | return super(SpearmanRho, self).eval(dataset, y_pred) 55 | 56 | def eval_per_query(self, y, y_pred): 57 | """ 58 | This method computes Spearman Rho at the per-query level (on the instances 59 | belonging to a specific query). 60 | 61 | Parameters 62 | ---------- 63 | y: numpy array 64 | Represents the labels of instances corresponding to one query in the 65 | dataset (ground truth). 66 | y_pred: numpy array. 67 | Represents the predicted document scores obtained during the model 68 | scoring phase for that query. 69 | 70 | Returns 71 | ------- 72 | rho: float 73 | The Spearman Rho per query. 74 | """ 75 | spearman_rho = stats.spearmanr(y, y_pred) 76 | return spearman_rho.correlation 77 | 78 | def __str__(self): 79 | s = self.name 80 | return s 81 | -------------------------------------------------------------------------------- /doc/src/rankeval.metrics.rst: -------------------------------------------------------------------------------- 1 | rankeval.metrics package 2 | ======================== 3 | 4 | .. automodule:: rankeval.metrics 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | rankeval.metrics.dcg module 13 | --------------------------- 14 | 15 | .. 
automodule:: rankeval.metrics.dcg 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | rankeval.metrics.err module 21 | --------------------------- 22 | 23 | .. automodule:: rankeval.metrics.err 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | rankeval.metrics.kendall\_tau module 29 | ------------------------------------ 30 | 31 | .. automodule:: rankeval.metrics.kendall_tau 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | rankeval.metrics.map module 37 | --------------------------- 38 | 39 | .. automodule:: rankeval.metrics.map 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | rankeval.metrics.metric module 45 | ------------------------------ 46 | 47 | .. automodule:: rankeval.metrics.metric 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | 52 | rankeval.metrics.mrr module 53 | --------------------------- 54 | 55 | .. automodule:: rankeval.metrics.mrr 56 | :members: 57 | :undoc-members: 58 | :show-inheritance: 59 | 60 | rankeval.metrics.mse module 61 | --------------------------- 62 | 63 | .. automodule:: rankeval.metrics.mse 64 | :members: 65 | :undoc-members: 66 | :show-inheritance: 67 | 68 | rankeval.metrics.ndcg module 69 | ---------------------------- 70 | 71 | .. automodule:: rankeval.metrics.ndcg 72 | :members: 73 | :undoc-members: 74 | :show-inheritance: 75 | 76 | rankeval.metrics.pfound module 77 | ------------------------------ 78 | 79 | .. automodule:: rankeval.metrics.pfound 80 | :members: 81 | :undoc-members: 82 | :show-inheritance: 83 | 84 | rankeval.metrics.precision module 85 | --------------------------------- 86 | 87 | .. automodule:: rankeval.metrics.precision 88 | :members: 89 | :undoc-members: 90 | :show-inheritance: 91 | 92 | rankeval.metrics.rbp module 93 | --------------------------- 94 | 95 | .. automodule:: rankeval.metrics.rbp 96 | :members: 97 | :undoc-members: 98 | :show-inheritance: 99 | 100 | rankeval.metrics.recall module 101 | ------------------------------ 102 | 103 | .. automodule:: rankeval.metrics.recall 104 | :members: 105 | :undoc-members: 106 | :show-inheritance: 107 | 108 | rankeval.metrics.rmse module 109 | ---------------------------- 110 | 111 | .. automodule:: rankeval.metrics.rmse 112 | :members: 113 | :undoc-members: 114 | :show-inheritance: 115 | 116 | rankeval.metrics.spearman\_rho module 117 | ------------------------------------- 118 | 119 | .. 
automodule:: rankeval.metrics.spearman_rho 120 | :members: 121 | :undoc-members: 122 | :show-inheritance: 123 | 124 | 125 | -------------------------------------------------------------------------------- /rankeval/test/analysis/test_feature.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | import numpy as np 6 | from numpy.testing import assert_array_almost_equal, assert_allclose, \ 7 | assert_array_equal 8 | 9 | from rankeval.analysis.feature import feature_importance, \ 10 | _feature_importance_tree 11 | from rankeval.dataset import Dataset 12 | from rankeval.metrics import MSE 13 | from rankeval.model import RTEnsemble 14 | from ..base import data_dir 15 | 16 | 17 | class FeatureImportanceTestCase(unittest.TestCase): 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | cls.model = RTEnsemble( 22 | os.path.join(data_dir, "quickrank.model.xml"), 23 | format="QuickRank") 24 | cls.dataset = Dataset.load( 25 | os.path.join(data_dir, "msn1.fold1.train.5k.txt"), 26 | format="svmlight") 27 | 28 | @classmethod 29 | def tearDownClass(cls): 30 | del cls.model 31 | cls.model = None 32 | del cls.dataset 33 | cls.dataset = None 34 | 35 | def test_feature_importance(self): 36 | feature_imp, feature_cnt = feature_importance( 37 | self.model, self.dataset, normalize=False) 38 | 39 | features = [7, 105, 107, 114] 40 | assert_allclose(feature_imp[features], 41 | [0.0405271754093, 0.0215954124466, 42 | 0.0478155618964, 0.018661751695], 43 | atol=1e-6) 44 | 45 | assert_array_equal(feature_cnt[features], 46 | [1, 1, 1, 1]) 47 | 48 | def test_scoring_feature_importance(self): 49 | 50 | # default scores on the root node of the first tree 51 | y_pred = np.zeros(self.dataset.n_instances, dtype=np.float32) 52 | 53 | # initialize features importance 54 | feature_imp = np.zeros(self.dataset.n_features, dtype=np.float32) 55 | 56 | # initialize features count 57 | feature_count = np.zeros(self.dataset.n_features, dtype=np.uint16) 58 | 59 | y_pred_m, partial_y_pred, y_leaves = \ 60 | self.model.score(self.dataset, detailed=True, cache=True) 61 | 62 | metric = MSE() 63 | 64 | for tree_id in np.arange(self.model.n_trees): 65 | y_pred_tree = _feature_importance_tree(self.model, self.dataset, 66 | tree_id, y_pred, metric, 67 | feature_imp, feature_count) 68 | # y_pred_tree *= self.model.trees_weight[tree_id] 69 | 70 | # Check the partial scores of each tree are compatible with 71 | # traditional scoring 72 | assert_allclose(y_pred_tree, 73 | partial_y_pred[:, tree_id], 74 | atol=1e-6) 75 | 76 | # Check the usual scoring and the scoring performed by analyzing also 77 | # the feature importance compute the same predictions 78 | assert_array_almost_equal(y_pred, y_pred_m) 79 | 80 | 81 | if __name__ == '__main__': 82 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /rankeval/metrics/rmse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
7 | 8 | import numpy as np 9 | from rankeval.metrics import Metric, MSE 10 | 11 | 12 | class RMSE(Metric): 13 | """ 14 | This class implements Root mean squared error (RMSE) with 15 | several parameters. 16 | 17 | """ 18 | def __init__(self, name='RMSE', cutoff=None): 19 | """ 20 | This is the constructor of RMSE, an object of type Metric, with the 21 | name RMSE. The constructor also allows setting custom values in the 22 | following parameters. 23 | 24 | Parameters 25 | ---------- 26 | name: string 27 | RMSE 28 | cutoff: int 29 | The top k results to be considered at per query level (e.g. 10), 30 | otherwise the default value is None and is computed on all the 31 | instances of a query. 32 | """ 33 | super(self.__class__, self).__init__(name) 34 | self.cutoff = cutoff 35 | self._mse = MSE(cutoff=cutoff) 36 | 37 | def eval(self, dataset, y_pred): 38 | """ 39 | This method takes the RMSE for each query and calculates 40 | the average RMSE. 41 | 42 | Parameters 43 | ---------- 44 | dataset : Dataset 45 | Represents the Dataset object on which to apply RMSE. 46 | y_pred : numpy 1d array of float 47 | Represents the predicted document scores for each instance 48 | in the dataset. 49 | 50 | Returns 51 | ------- 52 | avg_score: float 53 | The overall RMSE score (averages over the detailed RMSE scores). 54 | detailed_scores: numpy 1d array of floats 55 | The detailed RMSE@k scores for each query, an array of length of 56 | the number of queries. 57 | """ 58 | return super(self.__class__, self).eval(dataset, y_pred) 59 | 60 | def eval_per_query(self, y, y_pred): 61 | """ 62 | This method helps compute the RMSE score per query. It is called by 63 | the eval function which averages and aggregates the scores 64 | for each query. 65 | 66 | Parameters 67 | ---------- 68 | y: numpy array 69 | Represents the labels of instances corresponding to one query in 70 | the dataset (ground truth). 71 | y_pred: numpy array. 72 | Represents the predicted document scores obtained during the model 73 | scoring phase for that query. 74 | 75 | Returns 76 | ------- 77 | rmse: float 78 | Represents the RMSE score for one query. 
79 | """ 80 | mse = self._mse.eval_per_query(y, y_pred) 81 | return np.sqrt(mse) 82 | 83 | def __str__(self): 84 | s = self.name 85 | if self.cutoff is not None: 86 | s += "@{}".format(self.cutoff) 87 | return s 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | 3 | cache: 4 | pip: true 5 | 6 | os: 7 | - linux 8 | - osx 9 | 10 | dist: xenial 11 | 12 | language: generic 13 | 14 | env: 15 | - TASK=sdist PYTHON_VERSION="2.7" 16 | - TASK=sdist PYTHON_VERSION="3.5" 17 | - TASK=sdist PYTHON_VERSION="3.6" 18 | - TASK=sdist PYTHON_VERSION="3.7" 19 | 20 | - TASK=bdist PYTHON_VERSION="2.7" 21 | - TASK=bdist PYTHON_VERSION="3.5" 22 | - TASK=bdist PYTHON_VERSION="3.6" 23 | - TASK=bdist PYTHON_VERSION="3.7" 24 | 25 | before_install: 26 | - export BUILD_DIRECTORY="$TRAVIS_BUILD_DIR" 27 | - if [[ $TRAVIS_OS_NAME == "osx" ]]; then 28 | export OS_NAME="macos"; 29 | if test -z "$COMPILER"; then 30 | export COMPILER="clang"; 31 | fi 32 | else 33 | export OS_NAME="linux"; 34 | export COMPILER="gcc"; 35 | fi 36 | 37 | install: 38 | - source build_tools/travis/setup.sh 39 | 40 | script: 41 | - bash build_tools/travis/test.sh 42 | 43 | jobs: 44 | include: 45 | - stage: test 46 | os: osx 47 | env: 48 | COMPILER=gcc 49 | PYTHON_VERSION="3.7" 50 | 51 | - stage: deploy 52 | if: tag IS present 53 | os: osx 54 | env: 55 | TASK=bdist 56 | PYTHON_VERSION="2.7" 57 | script: 58 | - build_tools/travis/deploy.sh 59 | 60 | - stage: deploy 61 | if: tag IS present 62 | os: osx 63 | env: 64 | TASK=bdist 65 | PYTHON_VERSION="3.5" 66 | script: 67 | - build_tools/travis/deploy.sh 68 | 69 | - stage: deploy 70 | if: tag IS present 71 | os: osx 72 | env: 73 | TASK=bdist 74 | PYTHON_VERSION="3.6" 75 | script: 76 | - build_tools/travis/deploy.sh 77 | 78 | - stage: deploy 79 | if: tag IS present 80 | os: osx 81 | env: 82 | TASK=bdist 83 | PYTHON_VERSION="3.7" 84 | script: 85 | - build_tools/travis/deploy.sh 86 | 87 | - stage: deploy 88 | if: tag IS present 89 | sudo: required 90 | services: 91 | - docker 92 | env: 93 | DOCKER_IMAGE=quay.io/pypa/manylinux1_x86_64 94 | PLAT=manylinux1_x86_64 95 | before_install: skip 96 | install: 97 | - docker pull $DOCKER_IMAGE 98 | script: 99 | - docker run --rm -e PLAT=$PLAT -e PYPI_USER=$PYPI_USER -e PYPI_PASS=$PYPI_PASS -v `pwd`:/io $DOCKER_IMAGE $PRE_CMD /io/build_tools/build_wheels.sh 100 | - ls wheelhouse/ 101 | 102 | - stage: deploy 103 | if: tag IS present 104 | sudo: required 105 | services: 106 | - docker 107 | env: 108 | DOCKER_IMAGE=quay.io/pypa/manylinux2010_x86_64 109 | PLAT=manylinux2010_x86_64 110 | before_install: skip 111 | install: 112 | - docker pull $DOCKER_IMAGE 113 | script: 114 | - docker run --rm -e PLAT=$PLAT -e PYPI_USER=$PYPI_USER -e PYPI_PASS=$PYPI_PASS -v `pwd`:/io $DOCKER_IMAGE $PRE_CMD /io/build_tools/build_wheels.sh 115 | - ls wheelhouse/ 116 | 117 | notifications: 118 | slack: 119 | rooms: $SLACK_TOKEN 120 | on_success: change 121 | on_failure: always -------------------------------------------------------------------------------- /rankeval/test/scoring/test_scoring.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from numpy.testing import assert_array_almost_equal, assert_almost_equal 6 | 7 | from rankeval.dataset import Dataset 8 | from rankeval.model import RTEnsemble 9 | from rankeval.scoring.scorer import 
Scorer 10 | from rankeval.test.base import data_dir 11 | 12 | model_file = os.path.join(data_dir, "quickrank.model.xml") 13 | data_file = os.path.join(data_dir, "msn1.fold1.test.5k.txt") 14 | 15 | 16 | class ScoringTestCase(unittest.TestCase): 17 | 18 | @classmethod 19 | def setUpClass(cls): 20 | cls.model = RTEnsemble(model_file, format="QuickRank") 21 | cls.dataset = Dataset.load(data_file, format="svmlight") 22 | cls.scorer = Scorer(cls.model, cls.dataset) 23 | 24 | @classmethod 25 | def tearDownClass(cls): 26 | del cls.model 27 | cls.model = None 28 | del cls.dataset 29 | cls.dataset = None 30 | del cls.scorer 31 | cls.scorer = None 32 | 33 | def test_basic_scoring_values(self): 34 | self.scorer.score(detailed=False) 35 | assert_array_almost_equal(self.scorer.get_predicted_scores()[:3], 36 | [0.16549695, 0.07126279, 0.10397919]) 37 | assert_array_almost_equal(self.scorer.get_predicted_scores()[-3:], 38 | [0.13345119, 0.13345119, 0.07126279]) 39 | 40 | def test_basic_scoring_sum(self): 41 | self.scorer.score(detailed=False) 42 | assert_almost_equal(self.scorer.get_predicted_scores().sum(), 43 | 598.72852, decimal=5) 44 | 45 | def test_detailed_scoring_values(self): 46 | self.scorer.score(detailed=True) 47 | assert_array_almost_equal( 48 | self.scorer.get_partial_predicted_scores()[:3], 49 | [[0.06684528, 0.09865167], 50 | [0.03412888, 0.03713391], 51 | [0.06684528, 0.03713391]]) 52 | assert_array_almost_equal( 53 | self.scorer.get_partial_predicted_scores()[-3:], 54 | [[0.09631728, 0.03713391], 55 | [0.09631728, 0.03713391], 56 | [0.03412888, 0.03713391]]) 57 | 58 | def test_basic_and_detailed_scoring(self): 59 | self.scorer.score(detailed=False) 60 | y_pred_basic = self.scorer.y_pred 61 | self.scorer.score(detailed=True) 62 | y_pred_detailed = self.scorer.y_pred 63 | assert_array_almost_equal(y_pred_basic, y_pred_detailed) 64 | 65 | def test_detailed_scoring_sum(self): 66 | self.scorer.score(detailed=True) 67 | assert_almost_equal(self.scorer.get_partial_predicted_scores().sum(), 68 | 598.72852, decimal=5) 69 | assert_array_almost_equal( 70 | self.scorer.get_partial_predicted_scores().sum(axis=0), 71 | [312.43994141, 286.2948]) 72 | assert_array_almost_equal( 73 | self.scorer.get_partial_predicted_scores().sum(axis=1)[:3], 74 | [0.16549695, 0.07126279, 0.10397919]) 75 | assert_array_almost_equal( 76 | self.scorer.get_partial_predicted_scores().sum(axis=1)[-3:], 77 | [0.13345119, 0.13345119, 0.07126279]) 78 | 79 | 80 | if __name__ == '__main__': 81 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 82 | level=logging.DEBUG) 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /rankeval/metrics/kendall_tau.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | 9 | import scipy.stats as stats 10 | from rankeval.metrics.metric import Metric 11 | 12 | 13 | class Kendalltau(Metric): 14 | """ 15 | This class implements Kendall's Tau. 16 | We use the Kendall tau coefficient implementation from scipy. 17 | 18 | """ 19 | 20 | def __init__(self, name='K'): 21 | """ 22 | This is the constructor of Kendall Tau, an object of type Metric, 23 | with the name K. 
The constructor also allows setting custom values in 24 | the following parameters. 25 | 26 | Parameters 27 | ---------- 28 | name: string 29 | K 30 | 31 | """ 32 | super(Kendalltau, self).__init__(name) 33 | 34 | 35 | def eval(self, dataset, y_pred): 36 | """ 37 | This method computes the Kendall tau score over the entire dataset and 38 | the detailed scores per query. It calls the eval_per query method 39 | for each query in order to get the detailed Kendall tau score. 40 | 41 | Parameters 42 | ---------- 43 | dataset : Dataset 44 | Represents the Dataset object on which to apply Kendall Tau. 45 | y_pred : numpy 1d array of float 46 | Represents the predicted document scores for each instance 47 | in the dataset. 48 | 49 | Returns 50 | ------- 51 | avg_score: float 52 | The overall Kendall tau score (averages over the detailed scores). 53 | detailed_scores: numpy 1d array of floats 54 | The detailed Kendall tau scores for each query, an array with length 55 | of the number of queries. 56 | """ 57 | return super(Kendalltau, self).eval(dataset, y_pred) 58 | 59 | 60 | def eval_per_query(self, y, y_pred): 61 | """ 62 | This methods computes Kendall tau at per query level (on the instances 63 | belonging to a specific query). The Kendall tau per query is 64 | calculated as: 65 | 66 | tau = (P - Q) / sqrt((P + Q + T) * (P + Q + U)) 67 | 68 | where P is the number of concordant pairs, Q the number of discordant 69 | pairs, T the number of ties only in x, and U the number of ties only 70 | in y. If a tie occurs for the same pair in both x and y, it is not 71 | added to either T or U. 72 | s 73 | Whether to use lexsort or quicksort as the sorting method for the 74 | initial sort of the inputs. Default is lexsort (True), for which 75 | kendalltau is of complexity O(n log(n)). If False, the complexity 76 | is O(n^2), but with a smaller pre-factor (so quicksort may be faster 77 | for small arrays). 78 | 79 | Parameters 80 | ---------- 81 | y: numpy array 82 | Represents the labels of instances corresponding to one query in 83 | the dataset (ground truth). 84 | y_pred: numpy array. 85 | Represents the predicted document scores obtained during the model 86 | scoring phase for that query. 87 | 88 | Returns 89 | ------- 90 | kendalltau: float 91 | The Kendall tau per query. 92 | """ 93 | kendall_tau = stats.kendalltau(y, y_pred, initial_lexsort=True) 94 | return kendall_tau.correlation 95 | 96 | 97 | def __str__(self): 98 | s = self.name 99 | return s -------------------------------------------------------------------------------- /rankeval/metrics/mse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import numpy as np 9 | 10 | from rankeval.metrics import Metric 11 | 12 | 13 | class MSE(Metric): 14 | """ 15 | This class implements Mean squared error (MSE) with several parameters. 16 | 17 | """ 18 | def __init__(self, name='MSE', cutoff=None): 19 | """ 20 | This is the constructor of MSE, an object of type Metric, with 21 | the name MSE. The constructor also allows setting custom values in 22 | the following parameters. 
23 | 24 | Parameters 25 | ---------- 26 | name: string 27 | MSE 28 | cutoff: int 29 | The top k results to be considered at per query level (e.g. 10), 30 | otherwise the default value is None and is computed on all the 31 | instances of a query. 32 | """ 33 | super(self.__class__, self).__init__(name) 34 | self.cutoff = cutoff 35 | 36 | def eval(self, dataset, y_pred): 37 | """ 38 | This method takes the MSE for each query and calculates 39 | the average MSE. 40 | 41 | Parameters 42 | ---------- 43 | dataset : Dataset 44 | Represents the Dataset object on which to apply MSE. 45 | y_pred : numpy 1d array of float 46 | Represents the predicted document scores for each instance 47 | in the dataset. 48 | 49 | Returns 50 | ------- 51 | avg_score: float 52 | The overall MSE score (summed over the detailed MSE scores). 53 | detailed_scores: numpy 1d array of floats 54 | The detailed MSE@k scores for each query, an array of length of 55 | the number of queries. 56 | """ 57 | # return super(self.__class__, self).eval(dataset, y_pred) 58 | 59 | self.detailed_scores = np.zeros(dataset.n_queries, dtype=np.float32) 60 | 61 | for qid, q_y, q_y_pred in self.query_iterator(dataset, y_pred): 62 | self.detailed_scores[qid] = \ 63 | self.eval_per_query(q_y, q_y_pred) / dataset.n_instances 64 | return self.detailed_scores.sum(), self.detailed_scores 65 | 66 | def eval_per_query(self, y, y_pred): 67 | """ 68 | This method helps compute the MSE score per query. It is called by 69 | the eval function which averages and aggregates the scores 70 | for each query. 71 | 72 | Parameters 73 | ---------- 74 | y: numpy array 75 | Represents the labels of instances corresponding to one query in 76 | the dataset (ground truth). 77 | y_pred: numpy array. 78 | Represents the predicted document scores obtained during the model 79 | scoring phase for that query. 80 | 81 | Returns 82 | ------- 83 | rmse: float 84 | Represents the MSE score for one query. 85 | 86 | """ 87 | if self.cutoff is not None: 88 | idx = np.argsort(y_pred)[::-1][:self.cutoff] 89 | return ((y[idx] - y_pred[idx]) ** 2).sum() 90 | else: 91 | return ((y - y_pred) ** 2.0).sum() 92 | 93 | def __str__(self): 94 | s = self.name 95 | if self.cutoff is not None: 96 | s += "@{}".format(self.cutoff) 97 | return s 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /rankeval/metrics/err.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | # http://olivier.chapelle.cc/pub/err.pdf 9 | import numpy as np 10 | from rankeval.metrics import Metric 11 | 12 | 13 | class ERR(Metric): 14 | """ 15 | This class implements Expected Reciprocal Rank as proposed 16 | in http://olivier.chapelle.cc/pub/err.pdf 17 | 18 | """ 19 | 20 | def __init__(self, name='ERR', cutoff=None): 21 | """ 22 | This is the constructor of ERR, an object of type Metric, 23 | with the name ERR. The constructor also allows setting custom values 24 | in the following parameters. 25 | 26 | Parameters 27 | ---------- 28 | name: string 29 | ERR 30 | cutoff: int 31 | The top k results to be considered at per query level (e.g. 
10) 32 | 33 | """ 34 | 35 | super(ERR, self).__init__(name) 36 | self.cutoff = cutoff 37 | 38 | def eval(self, dataset, y_pred): 39 | """ 40 | The method computes ERR by taking as input the dataset and the 41 | predicted document scores. It returns the averaged ERR score over 42 | the entire dataset and the detailed ERR scores per query. 43 | 44 | Parameters 45 | ---------- 46 | dataset : Dataset 47 | Represents the Dataset object on which to apply ERR. 48 | y_pred : numpy 1d array of float 49 | Represents the predicted document scores for each instance 50 | in the dataset. 51 | 52 | Returns 53 | ------- 54 | avg_score: float 55 | Represents the average ERR over all ERR scores per query. 56 | detailed_scores: numpy 1d array of floats 57 | Represents the detailed ERR scores for each query. It has the 58 | length of n_queries. 59 | 60 | """ 61 | return super(ERR, self).eval(dataset, y_pred) 62 | 63 | def eval_per_query(self, y, y_pred): 64 | """ 65 | This method helps compute the ERR score per query. It is called by 66 | the eval function which averages and aggregates the scores 67 | for each query. 68 | 69 | Parameters 70 | ---------- 71 | y: numpy array 72 | Represents the labels of instances corresponding to one query in 73 | the dataset (ground truth). 74 | y_pred: numpy array. 75 | Represents the predicted document scores obtained during 76 | the model scoring phase for that query. 77 | 78 | Returns 79 | ------- 80 | err: float 81 | Represents the ERR score for one query. 82 | 83 | """ 84 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 85 | if self.cutoff is not None: 86 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 87 | 88 | max_grade = y.max() # max relevance score 89 | prob_step_down = 1.0 90 | err = 0.0 91 | 92 | for i, idx in enumerate(idx_y_pred_sorted): 93 | utility = (pow(2., y[idx]) - 1.) / pow(2., max_grade) 94 | err += prob_step_down * (utility / (i + 1.)) 95 | prob_step_down *= (1. 
- utility) 96 | 97 | return err 98 | 99 | def __str__(self): 100 | s = self.name 101 | if self.cutoff is not None: 102 | s += "@{}".format(self.cutoff) 103 | return s -------------------------------------------------------------------------------- /rankeval/test/model/test_proxy_ScikitLearn.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from numpy.testing import assert_equal, assert_array_equal, \ 6 | assert_array_almost_equal 7 | 8 | from rankeval.dataset import Dataset 9 | from rankeval.model import ProxyXGBoost 10 | from rankeval.model import RTEnsemble 11 | from rankeval.test.base import data_dir 12 | 13 | model_file = os.path.join(data_dir, "ScikitLearn.model.txt") 14 | data_file = os.path.join(data_dir, "msn1.fold1.test.5k.txt") 15 | 16 | 17 | class ProxyXGBoostTestCase(unittest.TestCase): 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | cls.model = RTEnsemble(model_file, format="ScikitLearn") 22 | cls.dataset = Dataset.load(data_file, format="svmlight") 23 | 24 | @classmethod 25 | def tearDownClass(cls): 26 | del cls.model 27 | cls.model = None 28 | del cls.dataset 29 | cls.dataset = None 30 | 31 | def test_count_nodes(self): 32 | n_trees, n_nodes = ProxyXGBoost._count_nodes(model_file) 33 | # print "Num Trees: %d\nNum Nodes: %d" % (n_trees, n_nodes), 34 | assert_equal(n_trees, 2) 35 | assert_equal(n_nodes, 10) 36 | assert_equal(n_trees, self.model.trees_root.size) 37 | assert_equal(n_nodes, self.model.trees_nodes_value.size) 38 | 39 | def test_root_nodes(self): 40 | assert_equal((self.model.trees_root > -1).all(), True, 41 | err_msg="Root nodes not set correctly") 42 | 43 | def test_root_nodes_adv(self): 44 | assert_array_equal(self.model.trees_root, [0, 5], 45 | err_msg="Root nodes are not correct") 46 | 47 | def test_split_features(self): 48 | assert_array_equal(self.model.trees_nodes_feature, 49 | [54, -1, 133, -1, -1, 52, 14, -1, -1, -1]) 50 | 51 | def test_tree_values(self): 52 | assert_array_almost_equal( 53 | self.model.trees_nodes_value, 54 | [5.769000e-03, -2.264262e-01, 5.000000e-01, 1.872047e-01, 55 | 9.056279e-01, 4.446700e-02, 2.850000e+01, 2.694752e-01, 56 | 4.131481e-01, -2.031448e-01], 57 | err_msg="Split thresholds or leaf outputs value are not correct") 58 | 59 | def test_left_children(self): 60 | assert_array_equal(self.model.trees_left_child, 61 | [1, -1, 3, -1, -1, 6, 8, -1, -1, -1]) 62 | 63 | def test_right_children(self): 64 | assert_array_equal(self.model.trees_right_child, 65 | [2, -1, 4, -1, -1, 7, 9, -1, -1, -1]) 66 | 67 | def test_leaf_correctness(self): 68 | for idx, feature in enumerate(self.model.trees_nodes_feature): 69 | if feature == -1: 70 | assert_equal(self.model.trees_left_child[idx], -1, 71 | "Left child of a leaf node is not empty (-1)") 72 | assert_equal(self.model.trees_right_child[idx], -1, 73 | "Right child of a leaf node is not empty (-1)") 74 | assert_equal(self.model.is_leaf_node(idx), True, 75 | "Leaf node not detected as a leaf") 76 | 77 | def test_prediction(self): 78 | y_pred = self.model.score(self.dataset, cache=True) 79 | assert_array_almost_equal(y_pred[:5], 80 | [0.651668, 0.604406, 0.604406, 81 | 0.610305, 0.563043]) 82 | assert_array_almost_equal(y_pred[-5:], 83 | [0.563043, 0.563043, 0.563043, 84 | 0.563043, 0.563043]) 85 | 86 | if __name__ == '__main__': 87 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 88 | level=logging.DEBUG) 89 | unittest.main() 90 | 
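The proxy-model tests above (QuickRank, ScikitLearn, LightGBM, XGBoost, CatBoost, Jforests) all exercise the same load-and-score flow: build an RTEnsemble from a model file, load a Dataset in svmlight format, then call model.score to obtain per-document predictions. The snippet below is a minimal usage sketch of that flow and is not one of the package source files: the model and dataset file names are placeholders, and it assumes Precision can be imported directly from rankeval/metrics/precision.py as defined further below.

# Minimal sketch, assuming placeholder file paths and a direct import of
# Precision from its module; it mirrors the pattern used by the tests above.
from rankeval.dataset import Dataset
from rankeval.model import RTEnsemble
from rankeval.metrics.precision import Precision

# Load a trained ensemble (any supported format string works the same way,
# e.g. "LightGBM", "XGBoost", "CatBoost", "Jforests", "QuickRank").
model = RTEnsemble("ScikitLearn.model.txt", format="ScikitLearn")
dataset = Dataset.load("msn1.fold1.test.5k.txt", format="svmlight")

# Per-document predicted scores, cached on the model as in the tests.
y_pred = model.score(dataset, cache=True)

# Metric objects expose eval(dataset, y_pred) -> (average, per-query scores).
avg_p10, per_query_p10 = Precision(cutoff=10).eval(dataset, y_pred)
print("P@10 = {:.4f}".format(avg_p10))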
-------------------------------------------------------------------------------- /rankeval/metrics/precision_max.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import numpy as np 9 | from rankeval.metrics.metric import Metric 10 | 11 | 12 | class PrecisionMax(Metric): 13 | """ 14 | This class implements Precision as: 15 | (relevant docs & retrieved docs) / retrieved docs. 16 | 17 | The particularity of this implementation is that the threshold is not global 18 | to all the queries as in `Precision`, but is dynamically computed for every 19 | query considering the document with top label. Thus this metric enables to 20 | compute the precision regardless of a fixed label. 21 | """ 22 | 23 | def __init__(self, name='P', cutoff=None): 24 | """ 25 | This is the constructor of Precision, an object of type Metric, with 26 | the name P. The constructor also allows setting custom values for cutoff 27 | and threshold, otherwise it uses the default values. 28 | 29 | Parameters 30 | ---------- 31 | name: string 32 | P 33 | cutoff: int 34 | The top k results to be considered at per query level (e.g. 10) 35 | 36 | """ 37 | super(PrecisionMax, self).__init__(name) 38 | self.cutoff = cutoff 39 | 40 | def eval(self, dataset, y_pred): 41 | """ 42 | This method computes the Precision score over the entire dataset and 43 | the detailed scores per query. It calls the eval_per query method for 44 | each query in order to get the detailed Precision score. 45 | 46 | Parameters 47 | ---------- 48 | dataset : Dataset 49 | Represents the Dataset object on which to apply Precision. 50 | y_pred : numpy 1d array of float 51 | Represents the predicted document scores for each instance in the 52 | dataset. 53 | 54 | Returns 55 | ------- 56 | avg_score: float 57 | The overall Precision score (averages over the detailed precision 58 | scores). 59 | detailed_scores: numpy 1d array of floats 60 | The detailed Precision scores for each query, an array of length of 61 | the number of queries. 62 | """ 63 | return super(PrecisionMax, self).eval(dataset, y_pred) 64 | 65 | def eval_per_query(self, y, y_pred): 66 | """ 67 | This methods computes Precision at per query level (on the instances 68 | belonging to a specific query). The Precision per query is calculated as 69 | <(relevant docs & retrieved docs) / retrieved docs>. 70 | 71 | Parameters 72 | ---------- 73 | y: numpy array 74 | Represents the labels of instances corresponding to one query in 75 | the dataset (ground truth). 76 | y_pred: numpy array. 77 | Represents the predicted document scores obtained during the model 78 | scoring phase for that query. 79 | 80 | Returns 81 | ------- 82 | precision: float 83 | The precision per query. 
84 | """ 85 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 86 | if self.cutoff is not None: 87 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 88 | 89 | n_relevant_retrieved = (y[idx_y_pred_sorted] >= y.max()).sum() 90 | return float(n_relevant_retrieved) / len(idx_y_pred_sorted) 91 | 92 | def __str__(self): 93 | s = self.name 94 | if self.cutoff is not None: 95 | s += "@{}".format(self.cutoff) 96 | return s 97 | -------------------------------------------------------------------------------- /rankeval/test/model/test_proxy_LightGBM.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from numpy.testing import assert_equal, assert_array_equal, \ 6 | assert_array_almost_equal 7 | 8 | from rankeval.dataset import Dataset 9 | from rankeval.model import ProxyLightGBM 10 | from rankeval.model import RTEnsemble 11 | from rankeval.test.base import data_dir 12 | 13 | model_file = os.path.join(data_dir, "LightGBM.model.txt") 14 | data_file = os.path.join(data_dir, "msn1.fold1.test.5k.txt") 15 | 16 | 17 | class ProxyLightGBMTestCase(unittest.TestCase): 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | cls.model = RTEnsemble(model_file, format="LightGBM") 22 | cls.dataset = Dataset.load(data_file, format="svmlight") 23 | 24 | @classmethod 25 | def tearDownClass(cls): 26 | del cls.model 27 | cls.model = None 28 | del cls.dataset 29 | cls.dataset = None 30 | 31 | def test_count_nodes(self): 32 | n_trees, n_nodes = ProxyLightGBM._count_nodes(model_file) 33 | # print "Num Trees: %d\nNum Nodes: %d" % (n_trees, n_nodes), 34 | assert_equal(n_trees, 2) 35 | assert_equal(n_nodes, 10) 36 | assert_equal(n_trees, self.model.trees_root.size) 37 | assert_equal(n_nodes, self.model.trees_nodes_value.size) 38 | 39 | def test_root_nodes(self): 40 | assert_equal((self.model.trees_root > -1).all(), True, 41 | err_msg="Root nodes not set correctly") 42 | 43 | def test_root_nodes_adv(self): 44 | assert_array_equal(self.model.trees_root, [0, 5], 45 | err_msg="Root nodes are not correct") 46 | 47 | def test_split_features(self): 48 | assert_array_equal(self.model.trees_nodes_feature, 49 | [55, 134, -1, -1, -1, 133, 48, -1, -1, -1]) 50 | 51 | def test_tree_values(self): 52 | assert_array_almost_equal(self.model.trees_nodes_value, 53 | [3.63099994e-03, 2.06000000e+02, -3.71787995e-02, -1.57113143e-04, 54 | 3.06654684e-02, 9.99999968e-21, 2.33031496e-01, -4.58943285e-03, 55 | 3.43261547e-02, 1.79146975e-02], 56 | err_msg="Split thresholds or leaf outputs value are not correct") 57 | 58 | def test_left_children(self): 59 | assert_array_equal(self.model.trees_left_child, 60 | [2, 3, -1, -1, -1, 6, 7, -1, -1, -1]) 61 | 62 | def test_right_children(self): 63 | assert_array_equal(self.model.trees_right_child, 64 | [1, 4, -1, -1, -1, 8, 9, -1, -1, -1]) 65 | 66 | def test_leaf_correctness(self): 67 | for idx, feature in enumerate(self.model.trees_nodes_feature): 68 | if feature == -1: 69 | assert_equal(self.model.trees_left_child[idx], -1, 70 | "Left child of a leaf node is not empty (-1)") 71 | assert_equal(self.model.trees_right_child[idx], -1, 72 | "Right child of a leaf node is not empty (-1)") 73 | assert_equal(self.model.is_leaf_node(idx), True, 74 | "Leaf node not detected as a leaf") 75 | 76 | def test_prediction(self): 77 | y_pred = self.model.score(self.dataset, cache=True) 78 | assert_array_almost_equal(y_pred[:5], 79 | [-0.00474655, -0.00474655, -0.00474655, 80 | -0.00474655, -0.00474655]) 81 | 
assert_array_almost_equal(y_pred[-5:], 82 | [0.01775758, -0.00474655, -0.00474655, 83 | -0.00474655, -0.00474655]) 84 | 85 | 86 | if __name__ == '__main__': 87 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 88 | level=logging.DEBUG) 89 | unittest.main() 90 | -------------------------------------------------------------------------------- /rankeval/metrics/dcg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import numpy as np 9 | from rankeval.metrics.metric import Metric 10 | 11 | 12 | class DCG(Metric): 13 | """ 14 | This class implements DCG with several parameters. 15 | """ 16 | 17 | def __init__(self, name='DCG', cutoff=None, implementation="flat"): 18 | """ 19 | This is the constructor of DCG, an object of type Metric, 20 | with the name DCG. The constructor also allows setting custom values 21 | in the following parameters. 22 | 23 | Parameters 24 | ---------- 25 | name: string 26 | DCG 27 | cutoff: int 28 | The top k results to be considered at per query level (e.g. 10). 29 | implementation: string 30 | Indicates whether to consider the flat or the exponential DCG 31 | formula (e.g. {"flat", "exp"}). 32 | 33 | """ 34 | 35 | super(DCG, self).__init__(name) 36 | self.cutoff = cutoff 37 | self.implementation = implementation 38 | 39 | def eval(self, dataset, y_pred): 40 | """ 41 | The method computes DCG by taking as input the dataset and 42 | the predicted document scores. It returns the averaged DCG score 43 | over the entire dataset and the detailed DCG scores per query. 44 | 45 | Parameters 46 | ---------- 47 | dataset : Dataset 48 | Represents the Dataset object on which to apply DCG. 49 | y_pred : numpy 1d array of float 50 | Represents the predicted document scores for each instance 51 | in the dataset. 52 | 53 | Returns 54 | ------- 55 | avg_score: float 56 | Represents the average DCG over all DCG scores per query. 57 | detailed_scores: numpy 1d array of floats 58 | Represents the detailed DCG scores for each query. 59 | It has the length of n_queries. 60 | 61 | """ 62 | 63 | return super(DCG, self).eval(dataset, y_pred) 64 | 65 | def eval_per_query(self, y, y_pred): 66 | """ 67 | This method helps compute the DCG score per query. It is called by 68 | the eval function which averages and aggregates the scores 69 | for each query. 70 | 71 | Parameters 72 | ---------- 73 | y: numpy array 74 | Represents the labels of instances corresponding to one query in 75 | the dataset (ground truth). 76 | y_pred: numpy array. 77 | Represents the predicted document scores obtained during the model 78 | scoring phase for that query. 79 | 80 | Returns 81 | ------- 82 | dcg: float 83 | Represents the DCG score for one query. 
84 | 85 | """ 86 | 87 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 88 | if self.cutoff is not None: 89 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 90 | 91 | discount = np.log2(np.arange(2, idx_y_pred_sorted.size + 2)) 92 | 93 | if self.implementation == "flat": 94 | gain = y[idx_y_pred_sorted] 95 | elif self.implementation == "exp": 96 | gain = np.exp2(y[idx_y_pred_sorted]) - 1.0 97 | 98 | dcg = (gain / discount).sum() 99 | return dcg 100 | 101 | def __str__(self): 102 | s = self.name 103 | if self.cutoff is not None: 104 | s += "@{}".format(self.cutoff) 105 | return s -------------------------------------------------------------------------------- /rankeval/test/model/test_proxy_XGBoost.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from numpy.testing import assert_equal, assert_array_equal, \ 6 | assert_array_almost_equal 7 | 8 | from rankeval.dataset import Dataset 9 | from rankeval.model import ProxyXGBoost 10 | from rankeval.model import RTEnsemble 11 | from rankeval.test.base import data_dir 12 | 13 | model_file = os.path.join(data_dir, "XGBoost.model.txt") 14 | data_file = os.path.join(data_dir, "msn1.fold1.test.5k.txt") 15 | 16 | 17 | class ProxyXGBoostTestCase(unittest.TestCase): 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | cls.model = RTEnsemble(model_file, format="XGBoost") 22 | cls.dataset = Dataset.load(data_file, format="svmlight") 23 | 24 | @classmethod 25 | def tearDownClass(cls): 26 | del cls.model 27 | cls.model = None 28 | del cls.dataset 29 | cls.dataset = None 30 | 31 | def test_count_nodes(self): 32 | n_trees, n_nodes = ProxyXGBoost._count_nodes(model_file) 33 | # print "Num Trees: %d\nNum Nodes: %d" % (n_trees, n_nodes), 34 | assert_equal(n_trees, 2) 35 | assert_equal(n_nodes, 14) 36 | assert_equal(n_trees, self.model.trees_root.size) 37 | assert_equal(n_nodes, self.model.trees_nodes_value.size) 38 | 39 | def test_root_nodes(self): 40 | assert_equal((self.model.trees_root > -1).all(), True, 41 | err_msg="Root nodes not set correctly") 42 | 43 | def test_root_nodes_adv(self): 44 | assert_array_equal(self.model.trees_root, [0, 7], 45 | err_msg="Root nodes are not correct") 46 | 47 | def test_split_features(self): 48 | assert_array_equal(self.model.trees_nodes_feature, 49 | [52, 14, -1, -1, 17, -1, -1, 50 | 54, 10, -1, -1, 52, -1, -1]) 51 | 52 | def test_tree_values(self): 53 | assert_array_almost_equal( 54 | self.model.trees_nodes_value, 55 | [4.4466496e-02, 2.6499998e+01, 3.3069301e-02, -2.7455300e-02, 56 | 2.3198698e+01, 2.8918700e-02, 6.7713000e-02, 3.5544997e-03, 57 | 2.0949998e+02, -1.8294000e-03, -4.2518899e-02, 4.4466496e-02, 58 | 2.0927801e-03, 3.0617099e-02], 59 | decimal=5, 60 | err_msg="Split thresholds or leaf outputs value are not correct") 61 | 62 | def test_left_children(self): 63 | assert_array_equal(self.model.trees_left_child, 64 | [1, 2, -1, -1, 5, -1, -1, 8, 9, -1, -1, 12, -1, -1]) 65 | 66 | def test_right_children(self): 67 | assert_array_equal(self.model.trees_right_child, 68 | [4, 3, -1, -1, 6, -1, -1, 11, 10, -1, -1, 13, -1, -1]) 69 | 70 | def test_leaf_correctness(self): 71 | for idx, feature in enumerate(self.model.trees_nodes_feature): 72 | if feature == -1: 73 | assert_equal(self.model.trees_left_child[idx], -1, 74 | "Left child of a leaf node is not empty (-1)") 75 | assert_equal(self.model.trees_right_child[idx], -1, 76 | "Right child of a leaf node is not empty (-1)") 77 | assert_equal(self.model.is_leaf_node(idx), True, 
78 | "Leaf node not detected as a leaf") 79 | 80 | def test_prediction(self): 81 | y_pred = self.model.score(self.dataset, cache=True) 82 | assert_array_almost_equal(y_pred[:5], 83 | [0.55953574, 0.47463751, 0.47463751, 84 | 0.48639977, 0.47071534]) 85 | assert_array_almost_equal(y_pred[-5:], 86 | [0.43002582, 0.43002582, 0.43002582, 87 | 0.47071534, 0.43002582]) 88 | 89 | if __name__ == '__main__': 90 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 91 | level=logging.DEBUG) 92 | unittest.main() 93 | -------------------------------------------------------------------------------- /rankeval/analysis/_efficient_feature.pyx: -------------------------------------------------------------------------------- 1 | """ 2 | This file implements the feature importance analysis in an efficient way. The 3 | limit is that the metric used to compute the gain for each split is hardcoded 4 | in the source code and is the MSE. 5 | """ 6 | 7 | import cython 8 | cimport cython 9 | 10 | # Import the Python-level symbols of numpy 11 | import numpy as np 12 | 13 | # Import the C-level symbols of numpy 14 | cimport numpy as np 15 | 16 | # Numpy must be initialized. When using numpy from C or Cython you must 17 | # _always_ do that, or you will have segfaults 18 | np.import_array() 19 | 20 | cdef extern from "_efficient_feature_impl.h": 21 | void c_feature_importance( 22 | const float* X, 23 | const float* y, 24 | const int* trees_root, 25 | const float* trees_weight, 26 | const short* trees_nodes_feature, 27 | const float* trees_nodes_value, 28 | const int* trees_left_child, 29 | const int* trees_right_child, 30 | float* feature_imp, 31 | short* feature_count, 32 | const int n_instances, 33 | const int n_features, 34 | const int n_trees); 35 | 36 | void c_feature_importance_tree( 37 | const float* X, 38 | const float* y, 39 | const int* trees_root, 40 | const float* trees_weight, 41 | const short* trees_nodes_feature, 42 | const float* trees_nodes_value, 43 | const int* trees_left_child, 44 | const int* trees_right_child, 45 | const int tree_id, 46 | float* feature_imp, 47 | short* feature_count, 48 | const int n_instances, 49 | const int n_features, 50 | float* y_pred, 51 | float* y_pred_tree); 52 | 53 | @cython.boundscheck(False) 54 | @cython.wraparound(False) 55 | def eff_feature_importance(model, dataset): 56 | 57 | # initialize features importance 58 | feature_imp = np.zeros(dataset.n_features, dtype=np.float32) 59 | 60 | # initialize features importance 61 | feature_count = np.zeros(dataset.n_features, dtype=np.uint16) 62 | 63 | c_feature_importance( 64 | np.PyArray_DATA(dataset.X), 65 | np.PyArray_DATA(dataset.y), 66 | np.PyArray_DATA(model.trees_root), 67 | np.PyArray_DATA(model.trees_weight), 68 | np.PyArray_DATA(model.trees_nodes_feature), 69 | np.PyArray_DATA(model.trees_nodes_value), 70 | np.PyArray_DATA(model.trees_left_child), 71 | np.PyArray_DATA(model.trees_right_child), 72 | np.PyArray_DATA(feature_imp), 73 | np.PyArray_DATA(feature_count), 74 | dataset.X.shape[0], 75 | dataset.X.shape[1], 76 | model.n_trees); 77 | 78 | return np.asarray(feature_imp, dtype=np.float32), \ 79 | np.asarray(feature_count, dtype=np.uint16) 80 | 81 | @cython.boundscheck(False) 82 | @cython.wraparound(False) 83 | def eff_feature_importance_tree(model, dataset, tree_id, y_pred, 84 | feature_imp, feature_count): 85 | 86 | y_pred_tree = np.zeros(dataset.n_instances, dtype=np.float32); 87 | 88 | c_feature_importance_tree( 89 | np.PyArray_DATA(dataset.X), 90 | np.PyArray_DATA(dataset.y), 91 | 
np.PyArray_DATA(model.trees_root), 92 | np.PyArray_DATA(model.trees_weight), 93 | np.PyArray_DATA(model.trees_nodes_feature), 94 | np.PyArray_DATA(model.trees_nodes_value), 95 | np.PyArray_DATA(model.trees_left_child), 96 | np.PyArray_DATA(model.trees_right_child), 97 | tree_id, 98 | np.PyArray_DATA(feature_imp), 99 | np.PyArray_DATA(feature_count), 100 | dataset.X.shape[0], 101 | dataset.X.shape[1], 102 | np.PyArray_DATA(y_pred), 103 | np.PyArray_DATA(y_pred_tree)); 104 | 105 | return np.asarray(y_pred_tree, dtype=np.float32) -------------------------------------------------------------------------------- /rankeval/metrics/precision.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import numpy as np 9 | from rankeval.metrics.metric import Metric 10 | 11 | 12 | class Precision(Metric): 13 | """ 14 | This class implements Precision as: 15 | (relevant docs & retrieved docs) / retrieved docs. 16 | 17 | It allows setting custom values for cutoff and threshold, otherwise it uses 18 | the default values. 19 | 20 | """ 21 | 22 | _threshold = 1 23 | 24 | def __init__(self, name='P', cutoff=None, threshold=_threshold): 25 | """ 26 | This is the constructor of Precision, an object of type Metric, with 27 | the name P. The constructor also allows setting custom values for cutoff 28 | and threshold, otherwise it uses the default values. 29 | 30 | Parameters 31 | ---------- 32 | name: string 33 | P 34 | cutoff: int 35 | The top k results to be considered at per query level (e.g. 10) 36 | threshold: float 37 | This parameter considers relevant results all instances with labels 38 | different from 0, thus with a minimum label value of 1. It can be 39 | set to other values as well (e.g. 3), in the range of possible 40 | labels. 41 | 42 | """ 43 | super(Precision, self).__init__(name) 44 | self.cutoff = cutoff 45 | self.threshold = threshold 46 | 47 | def eval(self, dataset, y_pred): 48 | """ 49 | This method computes the Precision score over the entire dataset and 50 | the detailed scores per query. It calls the eval_per query method for 51 | each query in order to get the detailed Precision score. 52 | 53 | Parameters 54 | ---------- 55 | dataset : Dataset 56 | Represents the Dataset object on which to apply Precision. 57 | y_pred : numpy 1d array of float 58 | Represents the predicted document scores for each instance in the 59 | dataset. 60 | 61 | Returns 62 | ------- 63 | avg_score: float 64 | The overall Precision score (averages over the detailed precision 65 | scores). 66 | detailed_scores: numpy 1d array of floats 67 | The detailed Precision scores for each query, an array of length of 68 | the number of queries. 69 | """ 70 | return super(Precision, self).eval(dataset, y_pred) 71 | 72 | def eval_per_query(self, y, y_pred): 73 | """ 74 | This methods computes Precision at per query level (on the instances 75 | belonging to a specific query). The Precision per query is calculated as 76 | <(relevant docs & retrieved docs) / retrieved docs>. 77 | 78 | Parameters 79 | ---------- 80 | y: numpy array 81 | Represents the labels of instances corresponding to one query in 82 | the dataset (ground truth). 83 | y_pred: numpy array. 
84 | Represents the predicted document scores obtained during the model 85 | scoring phase for that query. 86 | 87 | Returns 88 | ------- 89 | precision: float 90 | The precision per query. 91 | """ 92 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 93 | if self.cutoff is not None: 94 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 95 | 96 | n_relevant_retrieved = (y[idx_y_pred_sorted] >= self.threshold).sum() 97 | return float(n_relevant_retrieved) / len(idx_y_pred_sorted) 98 | 99 | def __str__(self): 100 | s = self.name 101 | if self.cutoff is not None: 102 | s += "@{}".format(self.cutoff) 103 | if self.threshold != self._threshold: 104 | s += "[>{}]".format(self.threshold) 105 | return s 106 | -------------------------------------------------------------------------------- /rankeval/metrics/mrr.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | 10 | """ 11 | import numpy as np 12 | from rankeval.metrics import Metric 13 | 14 | 15 | class MRR(Metric): 16 | """ 17 | This class implements Mean Reciprocal Rank. 18 | 19 | """ 20 | 21 | _threshold = 1 22 | 23 | def __init__(self, name='MRR', cutoff=None, threshold=_threshold): 24 | """ 25 | This is the constructor of MRR, an object of type Metric, with the 26 | name MRR. The constructor also allows setting custom values in the 27 | following parameters. 28 | 29 | Parameters 30 | ---------- 31 | name: string 32 | MRR 33 | cutoff: int 34 | The top k results to be considered at per query level (e.g. 10) 35 | threshold: float 36 | This parameter considers relevant results all instances with labels 37 | different from 0, thus with a minimum label value of 1. It can be 38 | set to other values as well (e.g. 3), in the range of possible labels. 39 | """ 40 | super(MRR, self).__init__(name) 41 | self.cutoff = cutoff 42 | self.threshold = threshold 43 | 44 | def eval(self, dataset, y_pred): 45 | """ 46 | The method computes MRR by taking as input the dataset and the predicted 47 | document scores. It returns the averaged MRR score over the entire 48 | dataset and the detailed MRR scores per query. 49 | 50 | The mean reciprocal rank is the average of the reciprocal ranks of 51 | results for a sample of queries. 52 | 53 | Parameters 54 | ---------- 55 | dataset : Dataset 56 | Represents the Dataset object on which to apply MRR. 57 | y_pred : numpy 1d array of float 58 | Represents the predicted document scores for each instance 59 | in the dataset. 60 | 61 | Returns 62 | ------- 63 | avg_score: float 64 | Represents the average MRR over all MRR scores per query. 65 | detailed_scores: numpy 1d array of floats 66 | Represents the detailed MRR scores for each query. It has 67 | the length of n_queries. 68 | 69 | """ 70 | return super(MRR, self).eval(dataset, y_pred) 71 | 72 | def eval_per_query(self, y, y_pred): 73 | """ 74 | This method helps compute the MRR score per query. It is called by the 75 | eval function which averages and aggregates the scores for each query. 76 | 77 | We compute the reciprocal rank. The reciprocal rank of a query response 78 | is the multiplicative inverse of the rank of the first correct answer. 
79 | 80 | Parameters 81 | ---------- 82 | y: numpy array 83 | Represents the labels of instances corresponding to one query in the 84 | dataset (ground truth). 85 | y_pred: numpy array. 86 | Represents the predicted document scores obtained during the model 87 | scoring phase for that query. 88 | 89 | Returns 90 | ------- 91 | mrr: float 92 | Represents the MRR score for one query. 93 | """ 94 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 95 | if self.cutoff is not None: 96 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 97 | 98 | # rank of max predicted score 99 | rank_max = None 100 | for i, idx in enumerate(idx_y_pred_sorted): 101 | if y[idx] >= self.threshold: 102 | rank_max = i 103 | break 104 | 105 | if rank_max is not None: 106 | return 1./(rank_max+1) 107 | else: 108 | return 0. 109 | 110 | def __str__(self): 111 | s = self.name 112 | if self.cutoff is not None: 113 | s += "@{}".format(self.cutoff) 114 | if self.threshold != self._threshold: 115 | s += "[>{}]".format(self.threshold) 116 | return s 117 | -------------------------------------------------------------------------------- /rankeval/test/model/test_proxy_CatBoost.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from numpy.testing import assert_equal, assert_array_equal, \ 6 | assert_array_almost_equal, assert_raises 7 | 8 | from rankeval.dataset import Dataset 9 | from rankeval.model import ProxyCatBoost 10 | from rankeval.model import RTEnsemble 11 | from rankeval.test.base import data_dir 12 | 13 | model_file = os.path.join(data_dir, "CatBoost.model.coreml") 14 | data_file = os.path.join(data_dir, "msn1.fold1.test.5k.txt") 15 | 16 | try: 17 | import coremltools 18 | coremltools_missing = False 19 | except ImportError: 20 | coremltools_missing = True 21 | 22 | 23 | @unittest.skipIf(coremltools_missing, "coremltools package missing") 24 | class ProxyCatBoostTestCase(unittest.TestCase): 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.model = RTEnsemble(model_file, format="CatBoost") 29 | cls.dataset = Dataset.load(data_file, format="svmlight") 30 | 31 | @classmethod 32 | def tearDownClass(cls): 33 | del cls.model 34 | cls.model = None 35 | del cls.dataset 36 | cls.dataset = None 37 | 38 | def test_count_nodes(self): 39 | 40 | coreml_model = coremltools.models.model.MLModel(model_file) 41 | n_trees, n_nodes = ProxyCatBoost._count_nodes(coreml_model) 42 | # print "Num Trees: %d\nNum Nodes: %d" % (n_trees, n_nodes), 43 | assert_equal(n_trees, 2) 44 | assert_equal(n_nodes, 14) 45 | assert_equal(n_trees, self.model.trees_root.size) 46 | assert_equal(n_nodes, self.model.trees_nodes_value.size) 47 | 48 | def test_root_nodes(self): 49 | assert_equal((self.model.trees_root > -1).all(), True, 50 | err_msg="Root nodes not set correctly") 51 | 52 | def test_root_nodes_adv(self): 53 | assert_array_equal(self.model.trees_root, [0, 7], 54 | err_msg="Root nodes are not correct") 55 | 56 | def test_split_features(self): 57 | assert_array_equal(self.model.trees_nodes_feature, 58 | [124, 62, 62, -1, -1, -1, -1, 59 | 112, 107, 107, -1, -1, -1, -1]) 60 | # 61 | def test_tree_values(self): 62 | assert_array_almost_equal(self.model.trees_nodes_value, 63 | [-6.988956e+00, 6.712700e-02, 6.712700e-02, 8.421655e-03, 64 | 1.095791e-03, 8.926381e-03, -1.645530e-02, 65 | -1.208052e+01, 1.148206e+01, 1.148206e+01, 1.408405e-02, 66 | -9.354122e-04, 6.002808e-03, -1.578260e-02], 67 | decimal=5, 68 | err_msg="Split thresholds or leaf outputs 
value are not correct") 69 | 70 | def test_left_children(self): 71 | assert_array_equal(self.model.trees_left_child, 72 | [2, 4, 6, -1, -1, -1, -1, 73 | 9, 11, 13, -1, -1, -1, -1]) 74 | 75 | def test_right_children(self): 76 | assert_array_equal(self.model.trees_right_child, 77 | [1, 3, 5, -1, -1, -1, -1, 78 | 8, 10, 12, -1, -1, -1, -1]) 79 | 80 | def test_leaf_correctness(self): 81 | for idx, feature in enumerate(self.model.trees_nodes_feature): 82 | if feature == -1: 83 | assert_equal(self.model.trees_left_child[idx], -1, 84 | "Left child of a leaf node is not empty (-1)") 85 | assert_equal(self.model.trees_right_child[idx], -1, 86 | "Right child of a leaf node is not empty (-1)") 87 | assert_equal(self.model.is_leaf_node(idx), True, 88 | "Leaf node not detected as a leaf") 89 | 90 | def test_prediction(self): 91 | y_pred = self.model.score(self.dataset, cache=False) 92 | assert_array_almost_equal(y_pred[:5], 93 | [0.00748624, -0.03223789, -0.01468681, 94 | 0.02301043, -0.03223789]) 95 | assert_array_almost_equal(y_pred[-5:], 96 | [0.02301043, -0.03223789, 0.02301043, 97 | 0.02301043, -0.03223789]) 98 | 99 | 100 | if __name__ == '__main__': 101 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 102 | level=logging.DEBUG) 103 | unittest.main() 104 | -------------------------------------------------------------------------------- /rankeval/metrics/metric.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | 10 | """ 11 | import numpy as np 12 | from abc import ABCMeta, abstractmethod 13 | import six 14 | 15 | 16 | class Metric(six.with_metaclass(ABCMeta)): 17 | """ 18 | Metric is an abstract class which provides an interface for specific metrics. 19 | It also offers 2 methods, one for iterating over the indeces for a certain 20 | query and another for iterating over the entire dataset based on those 21 | indices. 22 | 23 | Some intuitions: 24 | https://stats.stackexchange.com/questions/159657/metrics-for-evaluating-ranking-algorithms 25 | """ 26 | 27 | @abstractmethod 28 | def __init__(self, name): 29 | """ 30 | The constructor for any metric; it initializes that metric with the 31 | proper name. 32 | 33 | Parameters 34 | ---------- 35 | name : string 36 | Represents the name of that metric instance. 37 | """ 38 | self.name = name 39 | self.detailed_scores = None 40 | 41 | @abstractmethod 42 | def eval(self, dataset, y_pred): 43 | """ 44 | This abstract method computes a specific metric over the predicted 45 | scores for a test dataset. It calls the eval_per query method for each 46 | query in order to get the detailed metric score. 47 | 48 | Parameters 49 | ---------- 50 | dataset : Dataset 51 | Represents the Dataset object on which we want to apply the metric. 52 | y_pred : numpy 1d array of float 53 | Represents the predicted document scores for each instance in the 54 | dataset. 55 | 56 | Returns 57 | ------- 58 | avg_score: float 59 | Represents the average values of a metric over all metric scores 60 | per query. 61 | detailed_scores: numpy 1d array of floats 62 | Represents the detailed metric scores for each query. It has the 63 | length of n_queries. 
64 | """ 65 | self.detailed_scores = np.zeros(dataset.n_queries, dtype=np.float32) 66 | 67 | for rel_qid, (qid, q_y, q_y_pred) in enumerate( 68 | self.query_iterator(dataset, y_pred)): 69 | self.detailed_scores[rel_qid] = self.eval_per_query(q_y, q_y_pred) 70 | return np.nanmean(self.detailed_scores), self.detailed_scores 71 | 72 | @abstractmethod 73 | def eval_per_query(self, y, y_pred): 74 | """ 75 | This methods helps to evaluate the predicted scores for a specific 76 | query within the dataset. 77 | 78 | Parameters 79 | ---------- 80 | y: numpy array 81 | Represents the instance labels corresponding to the queries in the 82 | dataset (ground truth). 83 | y_pred: numpy array. 84 | Represents the predicted document scores obtained during the model 85 | scoring phase for that query. 86 | 87 | Returns 88 | ------- 89 | : float 90 | Represents the metric score for one query. 91 | """ 92 | 93 | def query_iterator(self, dataset, y_pred): 94 | """ 95 | This method iterates over dataset document scores and predicted scores 96 | in blocks of instances which belong to the same query. 97 | Parameters 98 | ---------- 99 | dataset : Datatset 100 | y_pred : numpy array 101 | 102 | Returns 103 | ------- 104 | : int 105 | The query id. 106 | : numpy.array 107 | The document scores of the instances in the labeled dataset 108 | (instance labels) belonging to the same query id. 109 | : numpy.array 110 | The predicted scores for the instances in the dataset belonging to 111 | the same query id. 112 | """ 113 | for query_id, start_offset, end_offset in dataset.query_iterator(): 114 | yield (query_id, 115 | dataset.y[start_offset:end_offset], 116 | y_pred[start_offset:end_offset]) 117 | -------------------------------------------------------------------------------- /rankeval/test/model/test_proxy_Jforests.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from numpy.testing import assert_equal, assert_array_equal, \ 6 | assert_array_almost_equal 7 | 8 | from rankeval.dataset import Dataset 9 | from rankeval.model import ProxyJforests 10 | from rankeval.model import RTEnsemble 11 | from rankeval.test.base import data_dir 12 | 13 | model_file = os.path.join(data_dir, "Jforests.model.xml") 14 | data_file = os.path.join(data_dir, "msn1.fold1.test.5k.txt") 15 | 16 | 17 | class ProxyJforestsTestCase(unittest.TestCase): 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | cls.model = RTEnsemble(model_file, format="Jforests") 22 | cls.dataset = Dataset.load(data_file, format="svmlight") 23 | 24 | @classmethod 25 | def tearDownClass(cls): 26 | del cls.model 27 | cls.model = None 28 | del cls.dataset 29 | cls.dataset = None 30 | 31 | def test_count_nodes(self): 32 | n_trees, n_nodes = ProxyJforests._count_nodes(model_file) 33 | assert_equal(n_trees, 2) 34 | assert_equal(n_nodes, 26) 35 | assert_equal(n_trees, self.model.trees_root.size) 36 | assert_equal(n_nodes, self.model.trees_nodes_value.size) 37 | 38 | def test_root_nodes(self): 39 | assert_equal((self.model.trees_root > -1).all(), True, 40 | "Root nodes not set correctly") 41 | 42 | def test_root_nodes_adv(self): 43 | assert_array_equal(self.model.trees_root, [0, 13], 44 | "Root nodes are not correct") 45 | 46 | def test_tree_weights(self): 47 | assert_array_almost_equal(self.model.trees_weight, 48 | [1.0, 1.0], 49 | err_msg="Tree Weights are not correct") 50 | 51 | def test_split_features(self): 52 | assert_array_equal(self.model.trees_nodes_feature, 53 | [129, 129, 
107, 72, 55, 54, 54 | -1, -1, -1, -1, -1, -1, -1, 55 | 133, 72, 105, 130, 62, 121, 56 | -1, -1, -1, -1, -1, -1, -1]) 57 | 58 | def test_tree_values(self): 59 | assert_array_almost_equal(self.model.trees_nodes_value, 60 | [268.0079, 265.0144, 13.9174, 19.1123, 0.00976, 0.0185, 61 | -1.2156, -0.2370, -1.9329, 0.8030, -0.01019, -1.9395, 0.5840, 62 | 0.0, 21.3979, 13.2636, 181.0142, 0.3333, -1.5976, -0.1443, 63 | 1.3819, 1.7707, 1.7353, 0.2240, -0.3769, -1.7937], 64 | decimal=4, 65 | err_msg="Split threshold values or leaf outputs are not correct") 66 | 67 | def test_left_children(self): 68 | assert_array_equal(self.model.trees_left_child, 69 | [1, 3, 5, 6, 8, 7, -1, -1, -1, -1, -1, -1, -1, 70 | 14, 17, 21, 22, 19, 24, -1, -1, -1, -1, -1, -1, -1]) 71 | 72 | def test_right_children(self): 73 | assert_array_equal(self.model.trees_right_child, 74 | [2, 4, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1, 75 | 20, 15, 16, 23, 18, 25, -1, -1, -1, -1, -1, -1, -1]) 76 | 77 | def test_leaf_correctness(self): 78 | for idx, feature in enumerate(self.model.trees_nodes_feature): 79 | if feature == -1: 80 | assert_equal(self.model.trees_left_child[idx], -1, 81 | "Left child of a leaf node is not empty (-1)") 82 | assert_equal(self.model.trees_right_child[idx], -1, 83 | "Right child of a leaf node is not empty (-1)") 84 | assert_equal(self.model.is_leaf_node(idx), True, 85 | "Leaf node not detected as a leaf") 86 | 87 | def test_prediction(self): 88 | y_pred = self.model.score(self.dataset) 89 | assert_array_almost_equal(y_pred[:5], 90 | [-2.083870, -1.359969, -1.359969, 91 | 0.426128, -0.381351]) 92 | assert_array_almost_equal(y_pred[-5:], 93 | [1.027176, -0.381351, -2.077223, 94 | 0.658770, -0.381351]) 95 | 96 | 97 | if __name__ == '__main__': 98 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 99 | level=logging.DEBUG) 100 | unittest.main() 101 | -------------------------------------------------------------------------------- /rankeval/metrics/recall.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import numpy as np 9 | from rankeval.metrics.metric import Metric 10 | 11 | 12 | class Recall(Metric): 13 | """ 14 | This class implements Recall as: 15 | (relevant docs & retrieved docs) / relevant docs. 16 | 17 | It allows setting custom values for cutoff and threshold, otherwise it uses 18 | the default values. 19 | 20 | """ 21 | 22 | _threshold = 1 23 | 24 | def __init__(self, name='R', no_relevant_results=0.0, 25 | cutoff=None, threshold=_threshold): 26 | """ 27 | This is the constructor of Recall, an object of type Metric, with 28 | the name R. The constructor also allows setting custom values 29 | for cutoff and threshold, otherwise it uses the default values. 30 | 31 | Parameters 32 | ---------- 33 | name: string 34 | R 35 | no_relevant_results: float 36 | Float indicating how to treat the cases where then are no relevant 37 | results (e.g. 0.0). 38 | cutoff: int 39 | The top k results to be considered at per query level (e.g. 10) 40 | threshold: float 41 | This parameter considers relevant results all instances with labels 42 | different from 0, thus with a minimum label value of 1. It can be 43 | set to other values as well (e.g. 
3), in the range of possible 44 | labels. 45 | """ 46 | super(Recall, self).__init__(name) 47 | self.no_relevant_results = no_relevant_results 48 | self.cutoff = cutoff 49 | self.threshold = threshold 50 | 51 | def eval(self, dataset, y_pred): 52 | """ 53 | This method computes the Recall score over the entire dataset and the 54 | detailed scores per query. It calls the eval_per query method for each 55 | query in order to get the detailed Recall score. 56 | 57 | Parameters 58 | ---------- 59 | dataset : Dataset 60 | Represents the Dataset object on which to apply Recall. 61 | y_pred : numpy 1d array of float 62 | Represents the predicted document scores for each instance in the 63 | dataset. 64 | 65 | Returns 66 | ------- 67 | avg_score: float 68 | The overall Recall score (averages over the detailed precision 69 | scores). 70 | detailed_scores: numpy 1d array of floats 71 | The detailed Recall scores for each query, an array of length of 72 | the number of queries. 73 | """ 74 | return super(Recall, self).eval(dataset, y_pred) 75 | 76 | 77 | def eval_per_query(self, y, y_pred): 78 | """ 79 | This methods computes Recall at per query level (on the instances 80 | belonging to a specific query). The Recall per query is calculated as 81 | <(relevant docs & retrieved docs) / relevant docs>. 82 | 83 | Parameters 84 | ---------- 85 | y: numpy array 86 | Represents the labels of instances corresponding to one query in 87 | the dataset (ground truth). 88 | y_pred: numpy array. 89 | Represents the predicted document scores obtained during the model 90 | scoring phase for that query. 91 | 92 | Returns 93 | ------- 94 | recall: float 95 | The Recall score per query. 96 | 97 | """ 98 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 99 | if self.cutoff is not None: 100 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 101 | 102 | n_relevant_retrieved = (y[idx_y_pred_sorted] >= self.threshold).sum() 103 | n_relevant = (y >= self.threshold).sum() 104 | 105 | if n_relevant != 0: 106 | return float(n_relevant_retrieved) / n_relevant 107 | else: 108 | return self.no_relevant_results 109 | 110 | def __str__(self): 111 | s = self.name 112 | if self.cutoff is not None: 113 | s += "@{}".format(self.cutoff) 114 | if self.threshold != self._threshold: 115 | s += "[>{}]".format(self.threshold) 116 | return s 117 | -------------------------------------------------------------------------------- /rankeval/test/data/LightGBM.model.txt: -------------------------------------------------------------------------------- 1 | tree 2 | num_class=1 3 | num_tree_per_iteration=1 4 | label_index=0 5 | max_feature_idx=135 6 | objective=lambdarank 7 | feature_names=Column_0 Column_1 Column_2 Column_3 Column_4 Column_5 Column_6 Column_7 Column_8 Column_9 Column_10 Column_11 Column_12 Column_13 Column_14 Column_15 Column_16 Column_17 Column_18 Column_19 Column_20 Column_21 Column_22 Column_23 Column_24 Column_25 Column_26 Column_27 Column_28 Column_29 Column_30 Column_31 Column_32 Column_33 Column_34 Column_35 Column_36 Column_37 Column_38 Column_39 Column_40 Column_41 Column_42 Column_43 Column_44 Column_45 Column_46 Column_47 Column_48 Column_49 Column_50 Column_51 Column_52 Column_53 Column_54 Column_55 Column_56 Column_57 Column_58 Column_59 Column_60 Column_61 Column_62 Column_63 Column_64 Column_65 Column_66 Column_67 Column_68 Column_69 Column_70 Column_71 Column_72 Column_73 Column_74 Column_75 Column_76 Column_77 Column_78 Column_79 Column_80 Column_81 Column_82 Column_83 Column_84 Column_85 Column_86 Column_87 Column_88 
Column_89 Column_90 Column_91 Column_92 Column_93 Column_94 Column_95 Column_96 Column_97 Column_98 Column_99 Column_100 Column_101 Column_102 Column_103 Column_104 Column_105 Column_106 Column_107 Column_108 Column_109 Column_110 Column_111 Column_112 Column_113 Column_114 Column_115 Column_116 Column_117 Column_118 Column_119 Column_120 Column_121 Column_122 Column_123 Column_124 Column_125 Column_126 Column_127 Column_128 Column_129 Column_130 Column_131 Column_132 Column_133 Column_134 Column_135 8 | feature_infos=[0:7] [0:4] [0:6] [0:6] [0:7] [0:1] [0:1] [0:1] [0:1] [0:1] [0:5487] [0:175] [0:143] [3:39] [3:5499] [1.0614420175552368:26.475236892700195] [6.3643679618835449:55.236095428466797] [6.1725778579711914:49.514362335205078] [5.7439260482788086:52.309181213378906] [1.0482590198516846:26.469913482666016] [0:585] [0:29] [0:23] [0:7] [0:593] [0:194] [0:7] [0:11] [0:2] [0:197] [0:314] [0:12] [0:18] [0:4] [0:318] [0:195] [0:9.6666669845581055] [0:11.5] [0:2.3333330154418945] [0:197.66667175292969] [0:17193.5546875] [0:8] [0:42.25] [0:2] [0:17193.5546875] [0:1] [0:1] [0:1] [0:0.66666698455810547] [0:0.5] [0:1] [0:1] [0:1] [0:0.5] [0:0.42857098579406738] [0:1] [0:1] [0:1] [0:0.5] [0:0.42857098579406738] [0:1] [0:1] [0:1] [0:0.5] [0:0.42857098579406738] [0:0.020833000540733337] [0:0.25] [0:0.25] [0:0.027777999639511108] [0:0.014824000187218189] [-0.17334100604057312:2445.951904296875] [0:255.47145080566406] [0:171.33871459960938] [0:54.946834564208984] [-0.1763560026884079:2479.425048828125] [-3.6401700973510742:1988.513427734375] [0:55.871494293212891] [0:93.686378479003906] [0:31.425260543823242] [-3.7034800052642822:2029.4710693359375] [0:1988.513427734375] [0:105.70162200927734] [0:110.33704376220703] [0:32.652210235595703] [0:2029.4710693359375] [-0.024762999266386032:1988.513427734375] [0:85.157150268554688] [0:93.686378479003906] [0:31.425260543823242] [-0.025194000452756882:2029.4710693359375] [0:301643.53125] [0:593.0015869140625] [0:1452.83251953125] [0:137.64729309082031] [0:310211.625] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [0:1] [-0.056940000504255295:74.395339965820312] [0:32.302749633789062] [0:47.534080505371094] [0:36.460765838623047] [-0.058655001223087311:74.817634582519531] [-79.574264526367188:0] [-44.824966430664062:0] [-60.532871246337891:0] [-64.522880554199219:0] [-79.512725830078125:0] [-70.377952575683594:0] [-54.401905059814453:0] [-62.203113555908203:0] [-66.05316162109375:0] [-70.324241638183594:0] [-79.111625671386719:0] [-52.357414245605469:0] [-71.119285583496094:0] [-75.108413696289062:0] [-79.036079406738281:0] [1:25] [9:219] [0:159613600] [0:91] [115:65534] [1:65535] [1:251] [1:254] [0:35578] [0:1060864] [0:590.566650390625] 9 | 10 | Tree=0 11 | num_leaves=3 12 | split_feature=55 134 13 | split_gain=24.316440938511686 19.637733333720618 14 | threshold=0.0036310000577941537 206 15 | decision_type=2 2 16 | default_value=0 0 17 | left_child=-1 -2 18 | right_child=1 -3 19 | leaf_parent=0 1 1 20 | leaf_value=-0.03717880105550684 -0.00015711314723408512 0.03066546801204972 21 | leaf_count=328 4540 132 22 | internal_value=0 0.017716937701040228 23 | internal_count=5000 4672 24 | shrinkage=0.1 25 | has_categorical=0 26 | 27 | 28 | Tree=1 29 | num_leaves=3 30 | split_feature=133 48 31 | split_gain=2386.6406638072776 1066.9165015298936 32 | threshold=9.9999996826552254e-21 0.2330314964056015 33 | decision_type=2 2 34 | default_value=0 0 35 | left_child=1 -1 36 | right_child=-2 -3 37 | leaf_parent=1 0 1 38 | leaf_value=-0.0045894326568938319 
0.034326154348027604 0.01791469712217866 39 | leaf_count=4507 158 335 40 | internal_value=0 -0.025416647783118203 41 | internal_count=5000 4842 42 | shrinkage=0.1 43 | has_categorical=0 44 | 45 | 46 | 47 | feature importances: 48 | Column_48=1 49 | Column_55=1 50 | Column_133=1 51 | Column_134=1 52 | 53 | pandas_categorical:null -------------------------------------------------------------------------------- /rankeval/metrics/pfound.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | 9 | import numpy as np 10 | from rankeval.metrics import Metric 11 | 12 | 13 | class Pfound(Metric): 14 | """ 15 | This class implements Pfound with several parameters. 16 | 17 | The ERR metric is very similar to the pFound metric used by 18 | Yandex (Segalovich, 2010). 19 | [http://proceedings.mlr.press/v14/chapelle11a/chapelle11a.pdf]. 20 | 21 | In fact pFound is identical to the ERR variant described in 22 | (Chapelle et al., 2009, Section 7.2). We implemented pFound similarly 23 | to ERR, following section 7.2 of http://olivier.chapelle.cc/pub/err.pdf. 24 | 25 | """ 26 | def __init__(self, name='Pf', cutoff=None, p_abandonment=0.15): 27 | """ 28 | This is the constructor of Pfound, an object of type Metric, with 29 | the name Pf. The constructor also allows setting custom values in 30 | the following parameters. 31 | 32 | Parameters 33 | ---------- 34 | name: string 35 | Pf 36 | cutoff: int 37 | The top k results to be considered at per query level (e.g. 10), 38 | otherwise the default value is None and is computed on all the 39 | instances of a query. 40 | p_abandonment: float 41 | This parameter indicates the probability of abandonment, i.e. 42 | the user stops looking at the ranked list due to an external reason. 43 | The original cascade model of ERR has later been extended to include 44 | an abandonment probability: if the user is not satisfied at a given 45 | position, he will examine the next url with probability y, but has 46 | a probability 1-y of abandoning. 47 | 48 | """ 49 | super(Pfound, self).__init__(name) 50 | self.cutoff = cutoff 51 | self.p_abandonment = p_abandonment 52 | 53 | def eval(self, dataset, y_pred): 54 | """ 55 | The method computes Pfound by taking as input the dataset and the 56 | predicted document scores. It returns the averaged Pfound score over 57 | the entire dataset and the detailed Pfound scores per query. 58 | 59 | Parameters 60 | ---------- 61 | dataset : Dataset 62 | Represents the Dataset object on which to apply Pfound. 63 | y_pred : numpy 1d array of float 64 | Represents the predicted document scores for each instance in 65 | the dataset. 66 | 67 | Returns 68 | ------- 69 | avg_score: float 70 | Represents the average Pfound over all Pfound scores per query. 71 | detailed_scores: numpy 1d array of floats 72 | Represents the detailed Pfound scores for each query. It has the 73 | length of n_queries. 74 | """ 75 | return super(Pfound, self).eval(dataset, y_pred) 76 | 77 | def eval_per_query(self, y, y_pred): 78 | """ 79 | This method helps compute the Pfound score per query. It is called by 80 | the eval function which averages and aggregates the scores for each 81 | query.
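Concretely, the loop below implements an ERR-style cascade with abandonment: pfound = sum_i prob_i * R_i * p_abandonment^i (ranks i starting from 0), where R_i = (2^y_i - 1) / 2^max_grade is the utility of the document ranked at position i and prob_i = prod_{j<i} (1 - R_j) is the probability that the user reaches position i.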
82 | 83 | Parameters 84 | ---------- 85 | y: numpy array 86 | Represents the labels of instances corresponding to one query in 87 | the dataset (ground truth). 88 | y_pred: numpy array 89 | Represents the predicted document scores obtained during the model 90 | scoring phase for that query. 91 | 92 | Returns 93 | ------- 94 | pfound: float 95 | Represents the Pfound score for one query. 96 | 97 | """ 98 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 99 | if self.cutoff is not None: 100 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 101 | 102 | max_grade = y.max() # max relevance score 103 | prob_step_down = 1.0 104 | pfound = 0.0 105 | 106 | for i, idx in enumerate(idx_y_pred_sorted): 107 | utility = (pow(2., y[idx]) - 1.) / pow(2., max_grade) 108 | pfound += prob_step_down * utility * pow(self.p_abandonment, i) 109 | prob_step_down *= (1. - utility) 110 | 111 | return pfound 112 | 113 | def __str__(self): 114 | s = self.name 115 | if self.cutoff is not None: 116 | s += "@{}".format(self.cutoff) 117 | return s -------------------------------------------------------------------------------- /rankeval/dataset/write_json_dataset_catalogue.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Franco Maria Nardini 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | import json 9 | 10 | """ 11 | This is a simple python script that generates the dataset catalogue. It is then 12 | dumped in JSON for simple and easy handling. 13 | """ 14 | 15 | output_file = "dataset_dictionary.json" 16 | 17 | def __add_istella_full__(data): 18 | item = {'TRAIN_FILE': 'full/train.txt', 19 | 'TEST_FILE': 'full/test.txt', 20 | 'VALIDATION_FILE': 'None', 21 | 'LICENSE_FILE': 'istella-letor-LA.txt', 22 | 'DATASET_ARCHIVE_NAME': 'istella-letor.tar.gz', 23 | 'MODELS_ARCHIVE_NAME': 'istella-letor-models.tar.gz', 24 | # DATASET_URL = ("http://library.istella.it/" "dataset/istella-letor.tar.gz") 25 | 'DATASET_URL': ("http://rankeval.isti.cnr.it/" "rankeval-datasets/istella-letor/dataset/istella-letor.tar.gz"), 26 | 'MODELS_URL' : ("http://rankeval.isti.cnr.it/" "rankeval-datasets/istella-letor/models/istella-letor-models.tar.gz"), 27 | 'BLOG_POST_URL': 'http://blog.istella.it/istella-learning-to-rank-dataset/', 28 | 'DATASET_NAME': 'istella-full', 29 | 'DATASET_DESCRIPTION': 'The istella LETOR full dataset', 30 | 'DATASET_FORMAT': 'svmlight'} 31 | data[item['DATASET_NAME']] = item 32 | return data 33 | 34 | def __add_istella_sample__(data): 35 | item = {'TRAIN_FILE': 'sample/train.txt', 36 | 'TEST_FILE': 'sample/test.txt', 37 | 'VALIDATION_FILE': 'sample/vali.txt', 38 | 'LICENSE_FILE': 'istella-letor-LA.txt', 39 | 'DATASET_ARCHIVE_NAME': 'istella-s-letor.tar.gz', 40 | 'MODELS_ARCHIVE_NAME': 'istella-s-letor-models.tar.gz', 41 | # DATASET_URL = ("http://library.istella.it/" "dataset/istella-letor.tar.gz") 42 | 'DATASET_URL': ("http://rankeval.isti.cnr.it/" "rankeval-datasets/istella-s-letor/dataset/istella-s-letor.tar.gz"), 43 | 'MODELS_URL' : ("http://rankeval.isti.cnr.it/" "rankeval-datasets/istella-s-letor/models/istella-s-letor-models.tar.gz"), 44 | 'BLOG_POST_URL': 'http://blog.istella.it/istella-learning-to-rank-dataset/', 45 | 'DATASET_NAME': 'istella-sample', 46 | 'DATASET_DESCRIPTION': 'The istella LETOR sample dataset', 47 | 
'DATASET_FORMAT': 'svmlight'} 48 | data[item['DATASET_NAME']] = item 49 | return data 50 | 51 | def __add_msn10k__(data): 52 | item = {'COMMON_SUBFOLDER_NAME': 'Fold', 53 | 'TRAIN_FILE': 'train.txt', 54 | 'TEST_FILE': 'test.txt', 55 | 'VALIDATION_FILE': 'vali.txt', 56 | 'DATASET_ARCHIVE_NAME': 'msn10k.tar.gz', 57 | 'MODELS_ARCHIVE_NAME': 'msn10k-models.tar.gz', 58 | 'DATASET_URL': ("http://rankeval.isti.cnr.it/" "rankeval-datasets/msn10k/dataset/msn10k.tar.gz"), 59 | 'MODELS_URL' : ("http://rankeval.isti.cnr.it/" "rankeval-datasets/msn10k/models/msn10k-models.tar.gz"), 60 | 'BLOG_POST_URL': 'https://www.microsoft.com/en-us/research/project/mslr/', 61 | 'DATASET_NAME': 'msn10k', 62 | 'DATASET_DESCRIPTION': 'Microsoft Learning to Rank Datasets (WEB10K)', 63 | 'DATASET_FORMAT': 'svmlight'} 64 | data[item['DATASET_NAME']] = item 65 | return data 66 | 67 | def __add_msn30k__(data): 68 | item = {'COMMON_SUBFOLDER_NAME': 'Fold', 69 | 'TRAIN_FILE': 'train.txt', 70 | 'TEST_FILE': 'test.txt', 71 | 'VALIDATION_FILE': 'vali.txt', 72 | 'DATASET_ARCHIVE_NAME': 'msn30k.tar.gz', 73 | 'MODELS_ARCHIVE_NAME': 'msn30k-models.tar.gz', 74 | 'DATASET_URL': ("http://rankeval.isti.cnr.it/" "rankeval-datasets/msn30k/dataset/msn30k.tar.gz"), 75 | 'MODELS_URL' : ("http://rankeval.isti.cnr.it/" "rankeval-datasets/msn30k/models/msn30k-models.tar.gz"), 76 | 'BLOG_POST_URL': 'https://www.microsoft.com/en-us/research/project/mslr/', 77 | 'DATASET_NAME': 'msn30k', 78 | 'DATASET_DESCRIPTION': 'Microsoft Learning to Rank Datasets (WEB30K)', 79 | 'DATASET_FORMAT': 'svmlight'} 80 | data[item['DATASET_NAME']] = item 81 | return data 82 | 83 | def main(): 84 | data = dict() 85 | __add_istella_full__(data) 86 | __add_istella_sample__(data) 87 | __add_msn10k__(data) 88 | __add_msn30k__(data) 89 | with open(output_file, 'w') as fp: 90 | json.dump(data, fp, sort_keys=True, indent=4) 91 | 92 | if __name__ == "__main__": 93 | main() 94 | -------------------------------------------------------------------------------- /rankeval/metrics/map.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | 9 | import numpy as np 10 | from rankeval.metrics.metric import Metric 11 | 12 | 13 | class MAP(Metric): 14 | """ 15 | This class implements MAP with several parameters. We implemented MAP as in 16 | https://www.kaggle.com/wiki/MeanAveragePrecision, adapted from: 17 | http://en.wikipedia.org/wiki/Information_retrieval 18 | https://www.ethz.ch/content/dam/ethz/special-interest/gess/computational-social-science-dam/documents/education/Spring2017/ML/LinkPrediction.pdf 19 | http://sdsawtelle.github.io/blog/output/mean-average-precision-MAP-for-recommender-systems.html 20 | """ 21 | 22 | def __init__(self, name='MAP', cutoff=None, no_relevant_results=1.0): 23 | """ 24 | This is the constructor of MAP, an object of type Metric, with 25 | the name MAP. The constructor also allows setting custom values in the 26 | following parameters. 27 | 28 | Parameters 29 | ---------- 30 | name: string 31 | MAP 32 | cutoff: int 33 | The top k results to be considered at per query level (e.g. 10), 34 | otherwise the default value is None and is computed on all the 35 | instances of a query. 
36 | no_relevant_results: float 37 | Float indicating how to treat the cases where there are no relevant 38 | results (e.g. 0.5). Default is 1.0. 39 | """ 40 | super(MAP, self).__init__(name) 41 | self.cutoff = cutoff 42 | self.no_relevant_results = no_relevant_results 43 | 44 | def eval(self, dataset, y_pred): 45 | """ 46 | This method takes the AP@k for each query and calculates the average, 47 | thus MAP@k. 48 | 49 | Parameters 50 | ---------- 51 | dataset : Dataset 52 | Represents the Dataset object on which to apply MAP. 53 | y_pred : numpy 1d array of float 54 | Represents the predicted document scores for each instance in 55 | the dataset. 56 | 57 | Returns 58 | ------- 59 | avg_score: float 60 | The overall MAP@k score (averaged over the detailed AP@k scores). 61 | detailed_scores: numpy 1d array of floats 62 | The detailed AP@k scores for each query, an array of length of 63 | the number of queries. 64 | """ 65 | return super(MAP, self).eval(dataset, y_pred) 66 | 67 | def eval_per_query(self, y, y_pred): 68 | """ 69 | This method computes AP@k at per query level (on the instances 70 | belonging to a specific query). The AP@k per query is calculated as 71 | 72 | ap@k = sum( P(k) / min(m,n) ), for k=1,n 73 | 74 | where: 75 | - P(k) means the precision at cut-off k in the item list. P(k) 76 | equals 0 when the k-th retrieved item is not relevant 77 | - m is the overall number of relevant documents 78 | - n is the number of predicted documents 79 | 80 | If the denominator is zero, P(k)/min(m,n) is set to zero. 81 | 82 | Parameters 83 | ---------- 84 | y: numpy array 85 | Represents the labels of instances corresponding to one query in 86 | the dataset (ground truth). 87 | y_pred: numpy array. 88 | Represents the predicted document scores obtained during the model 89 | scoring phase for that query. 90 | 91 | Returns 92 | ------- 93 | ap : float 94 | The AP@k score per query. 95 | """ 96 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 97 | if self.cutoff is not None: 98 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 99 | 100 | n_retrieved = len(idx_y_pred_sorted) 101 | precision_at_i = 0. 102 | n_relevant_retrieved_at_i = 0.
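# Scan the ranking in decreasing order of predicted score: each time a relevant document (label > 0) is found at position i, accumulate the precision at that position; the accumulated sum is finally divided by min(n_retrieved, number of relevant documents), as in the AP@k formula above.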
103 | for i in range(n_retrieved): 104 | if y[idx_y_pred_sorted[i]] > 0: 105 | n_relevant_retrieved_at_i += 1 106 | precision_at_i += n_relevant_retrieved_at_i / (i + 1) 107 | 108 | if n_relevant_retrieved_at_i > 0: 109 | return precision_at_i / min(n_retrieved, np.count_nonzero(y)) 110 | else: 111 | return self.no_relevant_results 112 | 113 | def __str__(self): 114 | s = self.name 115 | if self.cutoff is not None: 116 | s += "@{}".format(self.cutoff) 117 | return s 118 | -------------------------------------------------------------------------------- /rankeval/test/model/test_proxy_QuickRank.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from numpy.testing import assert_equal, assert_array_equal, \ 6 | assert_array_almost_equal 7 | 8 | from rankeval.model import ProxyQuickRank 9 | from rankeval.model import RTEnsemble 10 | from rankeval.test.base import data_dir 11 | 12 | model_file = os.path.join(data_dir, "quickrank.model.xml") 13 | 14 | 15 | class ProxyQuickRankTestCase(unittest.TestCase): 16 | 17 | def setUp(self): 18 | self.model = RTEnsemble(model_file, format="QuickRank") 19 | 20 | def tearDown(self): 21 | del self.model 22 | self.model = None 23 | 24 | def test_count_nodes(self): 25 | n_trees, n_nodes = ProxyQuickRank._count_nodes(model_file) 26 | assert_equal(n_trees, 2) 27 | assert_equal(n_nodes, 10) 28 | assert_equal(n_trees, self.model.trees_root.size) 29 | assert_equal(n_nodes, self.model.trees_nodes_value.size) 30 | 31 | def test_root_nodes(self): 32 | assert_equal((self.model.trees_root > -1).all(), True, 33 | "Root nodes not set correctly") 34 | 35 | def test_root_nodes_adv(self): 36 | assert_array_equal(self.model.trees_root, [0, 5], 37 | "Root nodes are not correct") 38 | 39 | def test_tree_weights(self): 40 | assert_array_almost_equal(self.model.trees_weight, 41 | [0.10000000149011612, 0.10000000149011612], 42 | err_msg="Tree Weights are not correct") 43 | 44 | def test_split_features(self): 45 | assert_array_equal(self.model.trees_nodes_feature, 46 | [107, 114, -1, -1, -1, 7, -1, 105, -1, -1]) 47 | 48 | def test_tree_values(self): 49 | assert_array_almost_equal(self.model.trees_nodes_value, 50 | [14.895151138305664, -8.0245580673217773, 0.3412887828162291, 51 | 0.66845277963831218, 0.96317280453257792, 0.66666698455810547, 52 | 0.37133907932286642, 17.135160446166992, 0.54762687170967062, 53 | 0.98651670670179537], 54 | err_msg="Split threshold values or leaf outputs are not correct") 55 | 56 | def test_left_children(self): 57 | assert_array_equal(self.model.trees_left_child, 58 | [1, 2, -1, -1, -1, 6, -1, 8, -1, -1]) 59 | 60 | def test_right_children(self): 61 | assert_array_equal(self.model.trees_right_child, 62 | [4, 3, -1, -1, -1, 7, -1, 9, -1, -1]) 63 | 64 | def test_leaf_correctness(self): 65 | for idx, feature in enumerate(self.model.trees_nodes_feature): 66 | if feature == -1: 67 | assert_equal(self.model.trees_left_child[idx], -1, 68 | "Left child of a leaf node is not empty (-1)") 69 | assert_equal(self.model.trees_right_child[idx], -1, 70 | "Right child of a leaf node is not empty (-1)") 71 | assert_equal(self.model.is_leaf_node(idx), True, 72 | "Leaf node not detected as a leaf") 73 | 74 | def test_load_save_quickrank_model(self): 75 | # save the model 76 | saved_model_file = model_file + ".saved.xml" 77 | saved = self.model.save(saved_model_file, format="QuickRank") 78 | assert_equal(saved, True, "File not save correctly") 79 | 80 | # reload the model 81 | 
model_reloaded = RTEnsemble(saved_model_file, format="QuickRank") 82 | 83 | os.remove(saved_model_file) 84 | 85 | assert_array_almost_equal(self.model.trees_root, model_reloaded.trees_root, 86 | err_msg="Tree roots are incorrect") 87 | assert_array_almost_equal(self.model.trees_weight, model_reloaded.trees_weight, 88 | err_msg="Tree weights are incorrect") 89 | assert_array_almost_equal(self.model.trees_nodes_value, model_reloaded.trees_nodes_value, 90 | err_msg="Node thresholds are incorrect") 91 | assert_array_almost_equal(self.model.trees_nodes_feature, model_reloaded.trees_nodes_feature, 92 | err_msg="Node features are incorrect") 93 | assert_array_almost_equal(self.model.trees_left_child, model_reloaded.trees_left_child, 94 | err_msg="Left children are incorrect") 95 | assert_array_almost_equal(self.model.trees_right_child, model_reloaded.trees_right_child, 96 | err_msg="Right children are incorrect") 97 | 98 | 99 | if __name__ == '__main__': 100 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 101 | level=logging.DEBUG) 102 | unittest.main() 103 | -------------------------------------------------------------------------------- /rankeval/visualization/feature.py: -------------------------------------------------------------------------------- 1 | """ 2 | This package provides support for feature analysis visualizations. 3 | """ 4 | 5 | from __future__ import print_function 6 | import six 7 | 8 | import matplotlib.pyplot as plt 9 | 10 | import numpy as np 11 | 12 | try: 13 | xrange 14 | except NameError: 15 | # Python3's range is Python2's xrange 16 | xrange = range 17 | 18 | 19 | def plot_feature_importance(feature_perf, max_features=10, sort_by="gain", 20 | feature_names=None): 21 | """ 22 | Shows the most important features as a bar plot. 23 | 24 | Parameters 25 | ---------- 26 | feature_perf : xarray.DataArray 27 | Feature importance stats of the model to be visualized 28 | max_features : int or None 29 | Maximul number of features to be visualized. If None is passed, it will 30 | show all the features 31 | sort_by : 'gain' or 'count' 32 | The method to use for selecting the top features to display. 'gain' 33 | method selects the top features by importance, 'count' selects the top 34 | features by usage (i.e., number of times it has been used by a split 35 | node). 36 | feature_names : list of string 37 | The name of the features to use for plotting. If None, their index is 38 | used in place of the name (starting from 1). 
39 | 40 | Returns 41 | ------- 42 | : matplotlib.figure.Figure 43 | The matpotlib Figure 44 | """ 45 | 46 | feature_importance = feature_perf.sel(type='importance').data 47 | feature_count = feature_perf.sel(type='count').data.astype(np.uint16) 48 | 49 | # figure 50 | fig, ax1 = plt.subplots(figsize=(16, 5)) 51 | ax2 = ax1.twinx() 52 | 53 | if sort_by == "gain": 54 | idx_sorted = np.argsort(feature_importance)[::-1] 55 | title_by = "Importance" 56 | elif sort_by == "count": 57 | idx_sorted = np.argsort(feature_count)[::-1] 58 | title_by = "Count" 59 | else: 60 | raise RuntimeError("Sorting of features for visualization " 61 | "not supported!") 62 | 63 | if isinstance(max_features, six.integer_types): 64 | idx_sorted = idx_sorted[:max_features] 65 | else: 66 | max_features = len(feature_importance) 67 | 68 | top_features = idx_sorted 69 | top_importances = feature_importance[idx_sorted] 70 | top_counts = feature_count[idx_sorted] 71 | 72 | index = np.arange(max_features) 73 | bar_width = 0.35 74 | 75 | opacity = 0.7 76 | 77 | bar1 = ax1.bar(index, top_importances, bar_width, 78 | alpha=opacity, 79 | color='r', 80 | align='center', 81 | zorder=5, 82 | edgecolor='black') 83 | bar2 = ax2.bar(index + bar_width, top_counts, bar_width, 84 | alpha=opacity, 85 | color='b', 86 | align='center', 87 | zorder=5, 88 | edgecolor='black') 89 | 90 | ax1.set_title('Top-k Features by %s' % title_by) 91 | 92 | ax1.set_xlabel("Features") 93 | if feature_names is not None: 94 | feature_names_f = np.array(["%16s" % f for f in feature_names]) 95 | ax1.set_xticks(index + bar_width / 2 + 0.15) 96 | ax1.set_xticklabels(feature_names_f[idx_sorted], rotation=45, 97 | ha="right") 98 | else: 99 | ax1.set_xticks(index + bar_width / 2) 100 | ax1.set_xticklabels(top_features + 1) 101 | 102 | ax1.set_xlim(-bar_width/2 - bar_width, max_features - 1 + bar_width*5/2) 103 | 104 | step_y = np.ceil(top_importances.max() * 10) / 100 105 | align_y_axis(ax1, ax2, step_y, 100, num_ticks=6) 106 | 107 | ax1.set_ylabel("Importance Gain") 108 | ax2.set_ylabel("Usage Count") 109 | 110 | ax1.grid(False) 111 | ax2.grid(False) 112 | ax1.yaxis.grid(True, ls='--', zorder=0) 113 | 114 | ax1.legend((bar1, bar2), ("Importance", "Count"), 115 | loc='best', shadow=True, frameon=True, fancybox=True) 116 | 117 | return fig 118 | 119 | 120 | def align_y_axis(ax1, ax2, minresax1, minresax2, num_ticks=7): 121 | """ Sets tick marks of twinx axes to line up with num_ticks total tick marks 122 | 123 | ax1 and ax2 are matplotlib axes 124 | Spacing between tick marks will be a factor of minresax1 and minresax2""" 125 | 126 | ax1ylims = ax1.get_ybound() 127 | ax2ylims = ax2.get_ybound() 128 | ax1factor = minresax1 * (num_ticks - 1) 129 | ax2factor = minresax2 * (num_ticks - 1) 130 | ax1.set_yticks(np.linspace(ax1ylims[0], 131 | ax1ylims[1]+(ax1factor - 132 | (ax1ylims[1]-ax1ylims[0]) % ax1factor) % 133 | ax1factor, 134 | num_ticks)) 135 | ax2.set_yticks(np.linspace(ax2ylims[0], 136 | ax2ylims[1]+(ax2factor - 137 | (ax2ylims[1]-ax2ylims[0]) % ax2factor) % 138 | ax2factor, 139 | num_ticks)) -------------------------------------------------------------------------------- /rankeval/metrics/rbp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. 
If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | 9 | import numpy as np 10 | from rankeval.metrics import Metric 11 | 12 | 13 | class RBP(Metric): 14 | """ 15 | This class implements Ranked biased Precision (RBP) with several parameters. 16 | We implemented RBP as in: Alistair Moffat and Justin Zobel. 2008. 17 | Rank-biased precision for measurement of retrieval effectiveness. 18 | ACM Trans. Inf. Syst. 27, 1, Article 2 (December 2008), 27 pages. 19 | DOI=http://dx.doi.org/10.1145/1416950.1416952 20 | 21 | RBP is an extension of P@k. User has certain chance to view each result. 22 | 23 | RBP = E(# viewed relevant results) / E(# viewed results) 24 | 25 | p is based on the user model perspective and allows simulating different 26 | types of users, e.g.: 27 | p = 0.95 for persistent user 28 | p = 0.8 for patient users 29 | p = 0.5 for impatient users 30 | p = 0 for i'm feeling lucky - P@1 31 | 32 | The use of different values of p reflects different ways in which ranked 33 | lists can be used. Values close to 1.0 are indicative of highly persistent 34 | users, who scrutinize many answers before ceasing their search. For example, 35 | at p = 0.95, there is a roughly 60% likelihood that a user will enter a 36 | second page of 10 results, and a 35% chance that they will go to a third 37 | page. Such users obtain a relatively low per-document utility from a search 38 | unless a high number of relevant documents are encountered, scattered 39 | through a long prefix of the ranking. 40 | 41 | """ 42 | 43 | _threshold = 1 44 | 45 | def __init__(self, name='RBP', cutoff=None, threshold=_threshold, p=0.5): 46 | """ 47 | This is the constructor of RBP, an object of type Metric, with the name 48 | RBP. The constructor also allows setting custom values in the following 49 | parameters. 50 | 51 | Parameters 52 | ---------- 53 | name: string 54 | RBP 55 | cutoff: int 56 | The top k results to be considered at per query level (e.g. 10) 57 | threshold: float 58 | This parameter considers relevant results all instances with labels 59 | different from 0, thus with a minimum label value of 1. It can be 60 | set to other values as well (e.g. 3), in the range of possible 61 | labels. 62 | p: float 63 | This parameter which simulates user type, and consequently the 64 | probability that a viewer actually inspects the document at rank k. 65 | """ 66 | super(RBP, self).__init__(name) 67 | self.cutoff = cutoff 68 | self.threshold = threshold 69 | self.p = p 70 | 71 | def eval(self, dataset, y_pred): 72 | """ 73 | This method takes the RBP for each query and calculates the average RBP. 74 | 75 | Parameters 76 | ---------- 77 | dataset : Dataset 78 | Represents the Dataset object on which to apply RBP. 79 | y_pred : numpy 1d array of float 80 | Represents the predicted document scores for each instance in the 81 | dataset. 82 | 83 | Returns 84 | ------- 85 | avg_score: float 86 | The overall RBP score (averages over the detailed MAP scores). 87 | detailed_scores: numpy 1d array of floats 88 | The detailed RBP@k scores for each query, an array of length of the 89 | number of queries. 90 | 91 | """ 92 | return super(RBP, self).eval(dataset, y_pred) 93 | 94 | def eval_per_query(self, y, y_pred): 95 | """ 96 | This method helps compute the RBP score per query. It is called by the 97 | eval function which averages and aggregates the scores for each query. 
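Following the implementation below, the per-query score is RBP = (1 - p) * sum_{i=1..k} r_i * p^(i-1), where r_i is 1 if the label of the document ranked at position i is at least the relevance threshold and 0 otherwise, and k is the cutoff (or the query length when no cutoff is set).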
98 | 99 | Parameters 100 | ---------- 101 | y: numpy array 102 | Represents the labels of instances corresponding to one query in 103 | the dataset (ground truth). 104 | y_pred: numpy array. 105 | Represents the predicted document scores obtained during the model 106 | scoring phase for that query. 107 | 108 | Returns 109 | ------- 110 | rbp: float 111 | Represents the RBP score for one query. 112 | """ 113 | idx_y_pred_sorted = np.argsort(y_pred)[::-1] 114 | if self.cutoff is not None: 115 | idx_y_pred_sorted = idx_y_pred_sorted[:self.cutoff] 116 | 117 | discount = np.power(self.p, np.arange(len(idx_y_pred_sorted))) 118 | gain = y[idx_y_pred_sorted] >= self.threshold 119 | 120 | rbp = (1. - self.p) * (gain * discount).sum() 121 | return rbp 122 | 123 | def __str__(self): 124 | s = self.name 125 | if self.cutoff is not None: 126 | s += "@{}".format(self.cutoff) 127 | if self.threshold != self._threshold: 128 | s += "[>{}]".format(self.threshold) 129 | return s 130 | -------------------------------------------------------------------------------- /rankeval/analysis/_efficient_topological.pyx: -------------------------------------------------------------------------------- 1 | import cython 2 | cimport cython 3 | 4 | # Import the Python-level symbols of numpy 5 | import numpy as np 6 | 7 | # Import the C-level symbols of numpy 8 | cimport numpy as np 9 | 10 | import scipy as sc 11 | import scipy.sparse 12 | 13 | # Numpy must be initialized. When using numpy from C or Cython you must 14 | # _always_ do that, or you will have segfaults 15 | np.import_array() 16 | 17 | from cython.parallel import prange, parallel 18 | 19 | @cython.boundscheck(False) 20 | @cython.wraparound(False) 21 | def efficient_topological_analysis(model, include_leaves=True): 22 | 23 | cdef np.intp_t n_trees = model.n_trees 24 | cdef np.intp_t n_nodes = model.n_nodes 25 | 26 | cdef int[:] trees_root = model.trees_root 27 | cdef int[:] trees_left_child = model.trees_left_child 28 | cdef int[:] trees_right_child = model.trees_right_child 29 | 30 | node_indices = np.zeros(model.n_nodes, dtype=np.uint64) 31 | cdef unsigned long long[:] node_indices_view = node_indices 32 | cdef unsigned int[:] height_trees = np.zeros(model.n_trees, dtype=np.uint32) 33 | 34 | cdef bint c_include_leaves = include_leaves 35 | 36 | cdef np.intp_t idx_tree 37 | cdef int idx_last_node 38 | with nogil, parallel(): 39 | for idx_tree in prange(n_trees): 40 | idx_last_node = trees_root[idx_tree+1] if idx_tree < n_trees-1 else n_nodes 41 | height_trees[idx_tree] = _compute_node_indices(idx_tree, 42 | trees_root, 43 | trees_left_child, 44 | trees_right_child, 45 | node_indices_view, 46 | idx_last_node, 47 | c_include_leaves) 48 | 49 | # Computes unique indices and counts the occurrences of each index (aggregate) 50 | unique_counts = np.unique(node_indices, return_counts=True) 51 | 52 | cdef unsigned long long[:] data_indices_view = unique_counts[0] 53 | cdef long[:] counts_view = unique_counts[1] 54 | 55 | # overwrite counts of 0-values since they should identify only the 56 | # root nodes but include also the leaves when include_leaves=False) 57 | counts_view[0] = n_trees 58 | 59 | cdef np.intp_t data_indices_size = data_indices_view.size 60 | 61 | # indices in a sparse matrix representation 62 | cdef unsigned long long[:] row_ind = np.zeros(data_indices_size, dtype=np.uint64) 63 | cdef unsigned long long[:] col_ind = np.zeros(data_indices_size, dtype=np.uint64) 64 | 65 | cdef np.intp_t idx_data 66 | cdef int exp 67 | with nogil, parallel(): 68 | for 
idx_data in prange(data_indices_size): 69 | row_ind[idx_data] = most_significant_bit(data_indices_view[idx_data] + 1) 70 | col_ind[idx_data] = data_indices_view[idx_data] + 1 - 2**row_ind[idx_data] 71 | 72 | return sc.sparse.csr_matrix((counts_view, (row_ind, col_ind)), dtype=np.float32), np.asarray(height_trees) 73 | 74 | @cython.boundscheck(False) 75 | @cython.wraparound(False) 76 | cdef int _compute_node_indices(np.intp_t idx_tree, 77 | int[:] trees_root, 78 | int[:] trees_left_child, 79 | int[:] trees_right_child, 80 | unsigned long long[:] node_indices, 81 | int idx_last_node, 82 | bint include_leaves) nogil: 83 | 84 | cdef int cur_node = trees_root[idx_tree] 85 | cdef unsigned long long left_value, right_value, max_index = 0 86 | while cur_node < idx_last_node: 87 | if _is_leaf_node(cur_node, trees_left_child, trees_right_child): 88 | if not include_leaves: 89 | node_indices[cur_node] = 0 90 | else: 91 | left_value = 2 * node_indices[cur_node] + 1 92 | right_value = 2 * node_indices[cur_node] + 2 93 | node_indices[trees_left_child[cur_node]] = left_value 94 | node_indices[trees_right_child[cur_node]] = right_value 95 | max_index = max(max_index, left_value) 96 | max_index = max(max_index, right_value) 97 | cur_node += 1 98 | 99 | cdef int height = most_significant_bit(max_index + 1) 100 | return height 101 | 102 | @cython.boundscheck(False) 103 | @cython.wraparound(False) 104 | cdef inline bint _is_leaf_node(int idx_node, 105 | int[:] trees_left_child, 106 | int[:] trees_right_child) nogil: 107 | return trees_left_child[idx_node] == -1 and trees_right_child[idx_node] == -1 108 | 109 | @cython.boundscheck(False) 110 | @cython.wraparound(False) 111 | cdef int most_significant_bit(long long v) nogil: 112 | 113 | cdef long long *b = [0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000, 0xFFFFFFFF00000000] 114 | cdef unsigned int *S = [1, 2, 4, 8, 16, 32] 115 | 116 | # result of log2(v) will go here 117 | cdef unsigned int r = 0 118 | # unroll for speed... 119 | cdef int i = 5 120 | while i >= 0: 121 | if (v & b[i]): 122 | v >>= S[i]; 123 | r |= S[i]; 124 | i -= 1 125 | 126 | return r -------------------------------------------------------------------------------- /rankeval/scoring/_efficient_scoring.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | Optimized scoring of RankEval. 10 | """ 11 | 12 | import cython 13 | cimport cython 14 | 15 | # Import the Python-level symbols of numpy 16 | import numpy as np 17 | 18 | # Import the C-level symbols of numpy 19 | cimport numpy as np 20 | 21 | 22 | # Numpy must be initialized. 
When using numpy from C or Cython you must 23 | # _always_ do that, or you will have segfaults 24 | np.import_array() 25 | 26 | from cython.parallel import prange, parallel 27 | 28 | @cython.boundscheck(False) 29 | @cython.wraparound(False) 30 | def basic_scoring(model, X): 31 | 32 | cdef np.intp_t n_instances = X.shape[0] 33 | cdef np.intp_t n_trees = model.n_trees 34 | 35 | cdef float[:, :] X_view = X 36 | y = np.zeros(n_instances, dtype=np.float32) 37 | cdef float[:] y_view = y 38 | 39 | cdef int[:] trees_root = model.trees_root 40 | cdef float[:] trees_weight = model.trees_weight 41 | cdef short[:] trees_nodes_feature = model.trees_nodes_feature 42 | cdef float[:] trees_nodes_value = model.trees_nodes_value 43 | cdef int[:] trees_left_child = model.trees_left_child 44 | cdef int[:] trees_right_child = model.trees_right_child 45 | 46 | cdef int leaf_node 47 | cdef np.intp_t idx_tree, idx_instance 48 | with nogil, parallel(): 49 | for idx_instance in prange(n_instances): 50 | for idx_tree in xrange(n_trees): 51 | leaf_node = _score_single_instance_single_tree( 52 | X_view, 53 | idx_instance, 54 | idx_tree, 55 | trees_root, 56 | trees_weight, 57 | trees_nodes_feature, 58 | trees_nodes_value, 59 | trees_left_child, 60 | trees_right_child 61 | ) 62 | 63 | y_view[idx_instance] += \ 64 | trees_nodes_value[leaf_node] * trees_weight[idx_tree] 65 | return y 66 | 67 | @cython.boundscheck(False) 68 | @cython.wraparound(False) 69 | def detailed_scoring(model, X): 70 | 71 | cdef np.intp_t n_instances = X.shape[0] 72 | cdef np.intp_t n_trees = model.n_trees 73 | 74 | cdef float[:, :] X_view = X 75 | y_leaves = np.zeros((X.shape[0], model.n_trees), dtype=np.int32) 76 | cdef int[:, :] y_leaves_view = y_leaves 77 | 78 | partial_y = np.zeros((X.shape[0], model.n_trees), dtype=np.float32) 79 | cdef float[:, :] partial_y_view = partial_y 80 | 81 | cdef int[:] trees_root = model.trees_root 82 | cdef float[:] trees_weight = model.trees_weight 83 | cdef short[:] trees_nodes_feature = model.trees_nodes_feature 84 | cdef float[:] trees_nodes_value = model.trees_nodes_value 85 | cdef int[:] trees_left_child = model.trees_left_child 86 | cdef int[:] trees_right_child = model.trees_right_child 87 | 88 | cdef int leaf_node 89 | cdef np.intp_t idx_tree, idx_instance 90 | with nogil, parallel(): 91 | for idx_tree in prange(n_trees): 92 | for idx_instance in xrange(n_instances): 93 | leaf_node = _score_single_instance_single_tree( 94 | X_view, 95 | idx_instance, 96 | idx_tree, 97 | trees_root, 98 | trees_weight, 99 | trees_nodes_feature, 100 | trees_nodes_value, 101 | trees_left_child, 102 | trees_right_child 103 | ) 104 | 105 | y_leaves_view[idx_instance, idx_tree] = leaf_node 106 | partial_y_view[idx_instance, idx_tree] = \ 107 | trees_nodes_value[leaf_node] * trees_weight[idx_tree] 108 | 109 | return np.asarray(y_leaves), np.asarray(partial_y) 110 | 111 | @cython.boundscheck(False) 112 | @cython.wraparound(False) 113 | cdef int _score_single_instance_single_tree(float[:,:] X, 114 | np.intp_t idx_instance, 115 | np.intp_t idx_tree, 116 | int[:] trees_root, 117 | float[:] trees_weight, 118 | short[:] trees_nodes_feature, 119 | float[:] trees_nodes_value, 120 | int[:] trees_left_child, 121 | int[:] trees_right_child) nogil: 122 | 123 | # Check the usage of np.intp_t in plave of np.int16_t 124 | cdef int cur_node = trees_root[idx_tree] 125 | cdef short feature_idx 126 | cdef float feature_threshold 127 | while trees_left_child[cur_node] != -1 and trees_right_child[cur_node] != -1: 128 | feature_idx = 
trees_nodes_feature[cur_node] 129 | feature_threshold = trees_nodes_value[cur_node] 130 | if X[idx_instance, feature_idx] <= feature_threshold: 131 | cur_node = trees_left_child[cur_node] 132 | else: 133 | cur_node = trees_right_child[cur_node] 134 | return cur_node -------------------------------------------------------------------------------- /rankeval/metrics/ndcg.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Cristina Muntean 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | from collections import defaultdict 9 | import numpy as np 10 | 11 | from rankeval.metrics.dcg import DCG 12 | from rankeval.metrics.metric import Metric 13 | 14 | 15 | class NDCG(Metric): 16 | """ 17 | This class implements NDCG with several parameters. 18 | 19 | """ 20 | 21 | def __init__(self, name='NDCG', cutoff=None, no_relevant_results=1.0, 22 | implementation="exp"): 23 | """ 24 | This is the constructor of NDCG, an object of type Metric, with the 25 | name NDCG. 26 | The constructor also allows setting custom values 27 | - cutoff: the top k results to be considered at per query level 28 | - no_relevant_results: is a float value indicating how to treat 29 | the cases where there are no relevant results 30 | - ties: indicates how we should consider the ties 31 | - implementation: indicates whether to consider the flat or the 32 | exponential NDCG formula 33 | 34 | Parameters 35 | ---------- 36 | name: string 37 | NDCG 38 | cutoff: int 39 | The top k results to be considered at per query level (e.g. 10) 40 | no_relevant_results: float 41 | Float indicating how to treat the cases where there are no relevant 42 | results (e.g. 0.5). Default is 1.0. 43 | implementation: string 44 | Indicates whether to consider the flat or the exponential DCG 45 | formula: "flat" or "exp" (default). 46 | """ 47 | 48 | super(self.__class__, self).__init__(name) 49 | self.cutoff = cutoff 50 | self.no_relevant_results = no_relevant_results 51 | self.implementation = implementation 52 | self.dcg = DCG(cutoff=self.cutoff, 53 | implementation=self.implementation) 54 | 55 | self._current_dataset = None 56 | self._current_rel_qid = None 57 | self._cache_idcg_score = defaultdict(int) 58 | 59 | def eval(self, dataset, y_pred): 60 | """ 61 | The method computes NDCG by taking as input the dataset and the 62 | predicted document scores (obtained with the scoring methods). It 63 | returns the averaged NDCG score over the entire dataset and the 64 | detailed NDCG scores per query. 65 | 66 | Parameters 67 | ---------- 68 | dataset : Dataset 69 | Represents the Dataset object on which to apply NDCG. 70 | y_pred : numpy 1d array of float 71 | Represents the predicted document scores for each instance in the 72 | dataset. 73 | 74 | Returns 75 | ------- 76 | avg_score: float 77 | Represents the average NDCG over all NDCG scores per query. 78 | detailed_scores: numpy array of floats 79 | Represents the detailed NDCG scores for each query. It has the 80 | length of n_queries.
81 | 82 | """ 83 | # used to cache ideal DCG scores on a dataset basis 84 | self._current_dataset = dataset 85 | self._current_rel_qid = 0 86 | 87 | # Compute the ideal DCG scores only once and cache them 88 | if self._current_dataset not in self._cache_idcg_score: 89 | 90 | idcg_score = np.ndarray(shape=dataset.n_queries, dtype=np.float32) 91 | for rel_id, (qid, q_y, _) in enumerate( 92 | self.query_iterator(dataset, dataset.y)): 93 | idcg_score[rel_id] = self.dcg.eval_per_query(q_y, q_y) 94 | 95 | self._cache_idcg_score[self._current_dataset] = idcg_score 96 | 97 | return super(self.__class__, self).eval(dataset, y_pred) 98 | 99 | def eval_per_query(self, y, y_pred): 100 | """ 101 | This method helps compute the NDCG score per query. It is called by the 102 | eval function which averages and aggregates the scores for each query. 103 | 104 | It calculates NDCG per query as NDCG = DCG / IDCG, i.e., the DCG of the predicted ranking normalized by the ideal DCG of the query. 105 | If there are no relevant results, NDCG returns the value set by default 106 | or by the user when creating the metric. 107 | 108 | Parameters 109 | ---------- 110 | y: numpy array 111 | Represents the labels of instances corresponding to one query in the 112 | dataset (ground truth). 113 | y_pred: numpy array. 114 | Represents the predicted document scores obtained during the model 115 | scoring phase for that query. 116 | 117 | Returns 118 | ------- 119 | ndcg: float 120 | Represents the NDCG score for one query. 121 | """ 122 | dcg_score = self.dcg.eval_per_query(y, y_pred) 123 | 124 | if self._current_rel_qid is not None: 125 | idcg_score = \ 126 | self._cache_idcg_score[self._current_dataset][self._current_rel_qid] 127 | self._current_rel_qid += 1 128 | else: 129 | idcg_score = self.dcg.eval_per_query(y, y) 130 | 131 | if idcg_score != 0: 132 | ndcg = dcg_score / idcg_score 133 | else: 134 | ndcg = self.no_relevant_results 135 | return ndcg 136 | 137 | def __str__(self): 138 | s = self.name 139 | if self.cutoff is not None: 140 | s += "@{}".format(self.cutoff) 141 | return s 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /rankeval/scoring/scorer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | Class for efficient scoring of an ensemble-based model composed of binary regression trees on a given dataset. 10 | """ 11 | 12 | from ..dataset import Dataset 13 | from ._efficient_scoring import basic_scoring, detailed_scoring 14 | 15 | 16 | class Scorer(object): 17 | """ 18 | Class for efficient scoring of an ensemble-based model composed of binary regression trees on a given dataset. 19 | 20 | This class can be used for simple or detailed scoring, depending on the mode selected at scoring time. 21 | The document scores are cached so as to avoid useless re-scoring. Thus, calling the `score` method multiple times 22 | does not cause the scoring to be executed again, except for a detailed scoring following a basic scoring. 23 | In this situation the scoring has to be repeated so as to analyze the scoring behaviour in depth.
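A minimal usage sketch, assuming an already loaded RTEnsemble named model and a loaded Dataset named dataset: scorer = Scorer(model, dataset); y_pred = scorer.get_predicted_scores(); partial = scorer.get_partial_predicted_scores().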
24 | 25 | Parameters 26 | ---------- 27 | model: RTEnsemble 28 | The model to use for scoring 29 | dataset: Dataset 30 | The dataset to use for scoring 31 | 32 | Attributes 33 | ---------- 34 | model : RTEnsemble 35 | The model to use for scoring 36 | dataset : Dataset 37 | The dataset to use for scoring 38 | y_pred : numpy array of float 39 | The predicted scores produced by the given model for each sample of the given dataset X 40 | partial_y_pred : numpy 2d-array of float 41 | The predicted score of each tree of the model for each dataset instance 42 | 43 | """ 44 | 45 | def __init__(self, model, dataset): 46 | self.model = model 47 | self.dataset = dataset 48 | 49 | # Save the predicted scores for each dataset instance 50 | self.y_pred = None 51 | 52 | # Save the partial scores of each tree for each dataset instance 53 | # (if detailed scoring is True) 54 | self.partial_y_pred = None 55 | 56 | # Save the leaf id of each tree for each dataset instance 57 | # (if detailed scoring is True) 58 | self.out_leaves = None 59 | 60 | def score(self, detailed): 61 | """ 62 | 63 | Parameters 64 | ---------- 65 | detailed : bool 66 | True if the class has to perform a detailed scoring, false otherwise 67 | 68 | Returns 69 | ------- 70 | y : numpy array of float 71 | the predicted scores produced by the given model for each sample of the given dataset X 72 | 73 | Attributes 74 | ---------- 75 | self.y : array of float 76 | The predicted scores of each dataset instance 77 | """ 78 | 79 | # Skip the scoring if it has already been done (return cached results) 80 | if not detailed and self.y_pred is not None or \ 81 | detailed and self.out_leaves is not None: 82 | return self.y_pred 83 | 84 | if detailed: 85 | self.out_leaves, self.partial_y_pred = \ 86 | detailed_scoring(self.model, self.dataset.X) 87 | self.y_pred = self.partial_y_pred.sum(axis=1) 88 | else: 89 | self.y_pred = basic_scoring(self.model, self.dataset.X) 90 | 91 | return self.y_pred 92 | 93 | def get_predicted_scores(self): 94 | """ 95 | Provide an accessor to the predicted scores produced by the given model for each sample of the given dataset X 96 | 97 | Returns 98 | ------- 99 | scores : numpy array of float 100 | The predicted scores produced by the given model for each sample of the given dataset X 101 | 102 | """ 103 | if self.y_pred is None: 104 | self.score(detailed=False) 105 | return self.y_pred 106 | 107 | def get_partial_predicted_scores(self): 108 | """ 109 | Provide an accessor to the partial scores produced by the given model 110 | for each sample of the given dataset X. Each partial score reflects the 111 | score produced by a single tree of the ensemble model on a single 112 | dataset instance. Thus, the returned numpy matrix has a shape of 113 | (n_instances, n_trees). Each partial score already takes the tree 114 | weight into account, thus the final document score is simply the 115 | sum of each row. 116 | 117 | Returns 118 | ------- 119 | scores : numpy 2d-array of float 120 | The predicted score of each tree of the model for each dataset instance 121 | """ 122 | if self.partial_y_pred is None: 123 | self.score(detailed=True) 124 | return self.partial_y_pred 125 | 126 | def get_predicted_leaves(self): 127 | """ 128 | Provide an accessor to the leaves that identify the exit nodes of each 129 | sample of the given dataset X using the given model. 130 | 131 | Each leaf value reflects the output node of a single tree of the 132 | ensemble model on a single dataset instance.
Thus, the returned numpy 133 | matrix has a shape of (n_instances, n_trees). 134 | 135 | Returns 136 | ------- 137 | scores : numpy 2d-array of int 138 | The leaves predicted by each tree of the model on scoring 139 | each dataset instance. 140 | 141 | """ 142 | if self.out_leaves is None: 143 | self.score(detailed=True) 144 | return self.out_leaves 145 | 146 | -------------------------------------------------------------------------------- /rankeval/analysis/topological.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | 9 | """ 10 | This package implements several topological analyses focused on the 11 | topological characteristics of ensemble-based LtR models. These 12 | functionalities can be applied to several models, 13 | so as to have a direct comparison of the shape of the resulting 14 | forests (e.g., trained by different LtR algorithms). 15 | """ 16 | 17 | import numpy as np 18 | import scipy.stats 19 | 20 | from ..model import RTEnsemble 21 | from ._efficient_topological import efficient_topological_analysis 22 | 23 | 24 | def topological_analysis(model, include_leaves=True): 25 | """ 26 | This method implements the topological analysis of an ensemble-based 27 | LtR model. Given a model, it studies the shape of each tree composing 28 | the model and returns several pieces of information useful for gaining insights 29 | about the shape of the trees, their completeness (level by level) as 30 | well as min/max/mean height and the fraction of trees having a specific 31 | node (where each node is identified by a pair of coordinates row-col, 32 | with row highlighting the depth and col the column with respect to a 33 | full binary tree). 34 | 35 | Parameters 36 | ---------- 37 | model : RTEnsemble 38 | The model to analyze 39 | include_leaves : bool 40 | Whether the leaves have to be included in the analysis or not 41 | 42 | Returns 43 | ------- 44 | object : TopologicalAnalysisResult 45 | The topological result, to use for retrieving several pieces of information 46 | """ 47 | return TopologicalAnalysisResult(model, include_leaves) 48 | 49 | 50 | class TopologicalAnalysisResult(object): 51 | """ 52 | This class is used to return the topological analysis made on the model. 53 | Several pieces of low-level information are stored in this class, and then 54 | re-elaborated to provide high-level analysis. 55 | """ 56 | 57 | def __init__(self, model, include_leaves): 58 | """ 59 | Analyze the model from a topological perspective 60 | 61 | Parameters 62 | ---------- 63 | model : RTEnsemble 64 | the model to analyze from the topological perspective 65 | include_leaves : bool 66 | Whether the leaves have to be included in the analysis or not 67 | 68 | Attributes 69 | ---------- 70 | model : RTEnsemble 71 | The model analyzed 72 | height_trees : numpy array 73 | The ordered height of each tree composing the ensemble 74 | topology : scipy.sparse.csr_matrix 75 | The matrix used to store low-level information related to the 76 | aggregated shape of the trees. Each matrix cell identifies a 77 | tree node with a pair of coordinates row-col, with row 78 | highlighting the depth and col the column with respect 79 | to a full binary tree.
80 | """ 81 | self.model = model 82 | self.topology, self.height_trees = efficient_topological_analysis(model, include_leaves) 83 | 84 | def describe_tree_height(self): 85 | """ 86 | Computes several descriptive statistics of the height of the trees. 87 | 88 | Returns 89 | ------- 90 | nobs : int 91 | Number of trees 92 | minmax: tuple of ndarrays or floats 93 | Minimum and maximum height of trees 94 | mean : ndarray or float 95 | Arithmetic mean of tree heights. 96 | variance : ndarray or float 97 | Unbiased variance of the tree heights. 98 | denominator is number of trees minus one. 99 | skewness : ndarray or float 100 | Skewness, based on moment calculations with denominator equal to 101 | the number of trees, i.e. no degrees of freedom correction. 102 | kurtosis : ndarray or float 103 | Kurtosis (Fisher). The kurtosis is normalized so that it is 104 | zero for the normal distribution. No degrees of freedom are used. 105 | """ 106 | return scipy.stats.describe(self.height_trees) 107 | 108 | def avg_tree_shape(self): 109 | """ 110 | Computes the fraction of trees having each node with respect to a 111 | full binary tree. The fraction is obtained by normalizing the count 112 | by the number of trees composing the ensemble model. 113 | 114 | Returns 115 | ------- 116 | fractions : scipy.sparse.csr_matrix 117 | Sparse matrix with the same shape of the topology matrix, where 118 | each matrix cell identifies a tree node by a pair of coordinates 119 | row-col, with row highlighting the depth and col the column with 120 | respect to a full binary tree. Each cell value highlights how many 121 | trees have the specific node, normalized by the number of trees. 122 | """ 123 | return self.topology / self.model.n_trees 124 | 125 | def fullness_per_level(self): 126 | """ 127 | Computes the normalized number of trees with full level i, for each 128 | level of a full binary tree. The normalization is done by the number 129 | of trees. 130 | 131 | Returns 132 | ------- 133 | fullness : np.array 134 | An array long as the maximum height of a tree in the ensemble, and 135 | where the j-th cell highlight how much the j-th level of the trees 136 | is full (normalized by the number of trees). 137 | """ 138 | # Row-sums are directly supported, and the structure of the CSR format means that 139 | # the difference between successive values in the indptr array correspond exactly 140 | # to the number of nonzero elements in each row. 
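# sums[j] is the total number of (tree, node) occurrences recorded at depth j, while counts[j] is the number of distinct node positions stored for depth j; their ratio, divided by the number of trees, gives the average fraction of trees containing each observed node of level j.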
141 | sums = self.topology.sum(axis=1).A1 142 | counts = np.diff(self.topology.indptr) 143 | return sums / counts / self.model.n_trees 144 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://img.shields.io/travis/com/hpclab/rankeval/master.svg?logo=travis)](https://travis-ci.com/hpclab/rankeval) 2 | [![Python version](https://img.shields.io/pypi/pyversions/rankeval.svg)](https://badge.fury.io/py/rankeval) 3 | [![PyPI version](https://img.shields.io/pypi/v/rankeval.svg)](https://badge.fury.io/py/rankeval) 4 | [![Wheel](https://img.shields.io/badge/wheels-%E2%9C%93-4c1.svg?longCache=true&logo=python&logoColor=white)](https://badge.fury.io/py/rankeval) 5 | [![CPython Implementation](https://img.shields.io/pypi/implementation/rankeval.svg)](https://badge.fury.io/py/rankeval) 6 | [![License](https://img.shields.io/badge/license-MPL%202.0-blue.svg)](https://badge.fury.io/py/rankeval) 7 | [![DOI](https://img.shields.io/badge/DOI-10.1145%2F3077136.3084140-blue)](https://doi.org/10.1145/3077136.3084140) 8 | 9 | 10 | 11 | # RankEval: An Evaluation and Analysis Framework for Learning-to-Rank Solutions 12 | 13 | RankEval is an open-source tool for the analysis and evaluation of 14 | Learning-to-Rank models based on ensembles of regression trees. The 15 | success of ensembles of regression trees fostered the development of 16 | several open-source libraries targeting efficiency of the learning phase 17 | and effectiveness of the resulting models. However, these libraries offer 18 | only very limited help for the tuning and evaluation of the trained models. 19 | 20 | RankEval aims at providing a common ground for several Learning to Rank 21 | libraries by providing useful and interoperable tools for a comprehensive 22 | comparison and in-depth analysis of ranking models. Target audience is the 23 | *machine learning* (ML) and *information retrieval* (IR) communities. 24 | 25 | RankEval is available under Mozilla Public License 2.0. 26 | 27 | The official GitHub repository is: [here](https://github.com/hpclab/rankeval). 28 | 29 | For questions/suggestions on how to improve RankEval, send us an email: 30 | rankeval@isti.cnr.it 31 | 32 | ## Features 33 | 34 | Rankeval provides a common ground between several pre-existing tools and offers 35 | services which support the interpretation of differently generated models in a 36 | unified environment, allowing an easy, comprehensive comparison and in-depth 37 | analysis. 38 | 39 | The main functionalities of RankEval can be summarized along five dimensions: 40 | - effectiveness analysis 41 | - feature analysis 42 | - structural analysis 43 | - topological analysis 44 | - interoperability among GBRT libraries 45 | 46 | Regarding the interoperability, Rankeval is able to read and process ranking ensembles learned with learning-to-rank 47 | libraries such as QuickRank, RankLib, XGBoost, LightGBM, Scikit-Learn, CatBoost, JForest. This advanced 48 | interoperability is implemented through proxy classes that make possible to interpret and understand the specific 49 | format used to represent the ranking ensemble without using the codebase of the learning-to-rank library. Thus RankEval 50 | does not have any dependency from the learning-to-rank library of choice of the user. 51 | 52 | These functionalities can be applied to several models at the same time, so to 53 | have a direct comparison of the analysis performed. 
The tool has been written 54 | to ensure **flexibility**, **extensibility**, and **efficiency**. 55 | 56 | ## Documentation 57 | 58 | The official API documentation is available [here](http://rankeval.isti.cnr.it/docs/). 59 | Soon on ReadTheDocs! 60 | 61 | ## Installation 62 | 63 | The library works with OpenMP, so you need a compiler supporting it. 64 | If your machine uses a default compiler different from GNU GCC, change it 65 | appropriately before proceeding with the installation: 66 | 67 | ``` 68 | export CC=gcc-5 69 | export CXX=g++-5 70 | ``` 71 | 72 | Moreover, RankEval needs the following libraries to be installed before the 73 | installation process begins (they are used to compile the low-level code during installation): 74 | - numpy >= 1.13 75 | - scipy >= 0.14 76 | - cython >= 0.25 77 | - matplotlib >= 2.0.2 78 | 79 | Additional dependencies will be installed automatically by setuptools. 80 | RankEval can be installed from the source by running: 81 | 82 | ```python setup.py install``` 83 | 84 | RankEval can also be easily installed from the Python Package Index (PyPI). In this case, you most probably do not even need 85 | cython locally to compile the low-level code, since the binaries should already be available for your platform. 86 | You may download and install it by running: 87 | 88 | ```pip install rankeval``` 89 | 90 | Alternatively, you can build the library from the latest commit on the master branch of the repository. 91 | Below is an example of such an installation: 92 | 93 | ```pip install git+https://github.com/hpclab/rankeval``` 94 | 95 | ## Development 96 | 97 | If you would like to install the library in development mode, i.e., edit the source code and see the changes 98 | directly without having to reinstall it after every little change, then run the following 99 | command, which also installs the libraries required for development (documentation generation and unittests): 100 | 101 | ```pip install -e .[develop]``` 102 | 103 | Local installation of compiled libraries: 104 | 105 | ```python setup.py build_ext -i``` 106 | 107 | Execution of unit tests: 108 | 109 | ```python setup.py test``` 110 | 111 | or (if you have nose already installed): 112 | 113 | ```nosetests -v``` 114 | 115 | ## Cite RankEval 116 | 117 | If you use RankEval, please cite us!
118 | 119 | ``` 120 | @inproceedings{rankeval-sigir17, 121 | author = {Claudio Lucchese and Cristina Ioana Muntean and Franco Maria Nardini and 122 | Raffaele Perego and Salvatore Trani}, 123 | title = {RankEval: An Evaluation and Analysis Framework for Learning-to-Rank Solutions}, 124 | booktitle = {SIGIR 2017: Proceedings of the 40th International {ACM} {SIGIR} 125 | Conference on Research and Development in Information Retrieval}, 126 | year = {2017}, 127 | location = {Tokyo, Japan} 128 | } 129 | ``` 130 | 131 | ## Credits 132 | - Dataset loader: https://github.com/deronnek/svmlight-loader 133 | - Query id implementation: https://github.com/mblondel/svmlight-loader/pull/6 -------------------------------------------------------------------------------- /rankeval/test/dataset/test_svmlight_format.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | import numpy as np 6 | from nose.tools import raises 7 | from numpy.testing import assert_equal, assert_array_equal, \ 8 | assert_array_almost_equal 9 | 10 | try: 11 | from sklearn.datasets import load_svmlight_file as sk_load_svmlight_file 12 | scikit_missing = False 13 | except ImportError: 14 | scikit_missing = True 15 | 16 | from rankeval.dataset.svmlight_format import load_svmlight_file, \ 17 | load_svmlight_files, dump_svmlight_file 18 | from rankeval.test.base import data_dir 19 | 20 | datafile = os.path.join(data_dir, "svmlight_classification.txt") 21 | invalidfile = os.path.join(data_dir, "svmlight_invalid.txt") 22 | qid_datafile = os.path.join(data_dir, "svmlight_classification_qid.txt") 23 | 24 | 25 | class SVMLightLoaderTestCase(unittest.TestCase): 26 | 27 | def test_load_svmlight_qid_file(self): 28 | X, y, q = load_svmlight_file(qid_datafile, query_id=True) 29 | 30 | # test X's shape 31 | assert_array_equal(X.shape, (4, 33)) 32 | #print X 33 | 34 | # test X's non-zero values 35 | # tests X's zero values 36 | # test can change X's values 37 | 38 | # test y 39 | assert_array_equal(y, [1, 2, 0, 3]) 40 | 41 | # test q 42 | # print q 43 | assert_array_equal(q, [1, 37, 37, 12]) 44 | 45 | def test_load_svmlight_file_empty_qid(self): 46 | X, y, q = load_svmlight_file(datafile, query_id=True) 47 | 48 | # test X's shape 49 | assert_array_equal(X.shape, (3, 20)) 50 | 51 | # test X's non-zero values 52 | # tests X's zero values 53 | # test can change X's values 54 | 55 | # test y 56 | assert_array_equal(y, [1, 2, 3]) 57 | 58 | # test q 59 | assert_equal(q.shape[0], 0) 60 | 61 | def test_load_svmlight_file(self): 62 | X, y = load_svmlight_file(datafile) 63 | 64 | # test X's shape 65 | assert_array_equal(X.shape, (3, 20)) 66 | 67 | # test X's non-zero values 68 | # tests X's zero values 69 | # test can change X's values 70 | 71 | # test y 72 | assert_array_equal(y, [1, 2, 3]) 73 | 74 | def test_load_svmlight_file_descriptor(self): 75 | with open(datafile, 'rb') as reader: 76 | X, y = load_svmlight_file(reader) 77 | 78 | # test X's shape 79 | assert_array_equal(X.shape, (3, 20)) 80 | 81 | # test y 82 | assert_array_equal(y, [1, 2, 3]) 83 | 84 | def test_load_svmlight_files_comment_qid(self): 85 | X_train, y_train, q_train, X_test, y_test, q_test = \ 86 | load_svmlight_files([datafile] * 2, query_id=True) 87 | assert_array_equal(X_train, X_test) 88 | assert_array_equal(y_train, y_test) 89 | assert_equal(X_train.dtype, np.float32) 90 | assert_equal(X_test.dtype, np.float32) 91 | 92 | X1, y1, q1, X2, y2, q2, X3, y3, q3 = load_svmlight_files([datafile] * 3, 
query_id=True) 93 | assert_equal(X1.dtype, X2.dtype) 94 | assert_equal(X2.dtype, X3.dtype) 95 | assert_equal(X3.dtype, np.float32) 96 | 97 | def test_load_svmlight_files(self): 98 | # print load_svmlight_files([datafile] * 2) 99 | X_train, y_train, X_test, y_test = load_svmlight_files([datafile] * 2, query_id=False) 100 | assert_array_equal(X_train, X_test) 101 | assert_array_equal(y_train, y_test) 102 | assert_equal(X_train.dtype, np.float32) 103 | assert_equal(X_test.dtype, np.float32) 104 | 105 | X1, y1, X2, y2, X3, y3 = load_svmlight_files([datafile] * 3, query_id=False) 106 | assert_equal(X1.dtype, X2.dtype) 107 | assert_equal(X2.dtype, X3.dtype) 108 | assert_equal(X3.dtype, np.float32) 109 | 110 | def test_load_invalid_file(self): 111 | try: 112 | load_svmlight_file(invalidfile) 113 | assert False 114 | except RuntimeError: 115 | pass 116 | 117 | def test_load_invalid_file2(self): 118 | try: 119 | load_svmlight_files([datafile, invalidfile, datafile]) 120 | assert False 121 | except RuntimeError: 122 | pass 123 | 124 | @raises(IOError) 125 | def test_invalid_filename(self): 126 | load_svmlight_file("trou pic nic douille") 127 | 128 | @unittest.skipIf(scikit_missing, "Scikit-Learn package missing") 129 | def test_dump(self): 130 | tmpfile = "tmp_dump.txt" 131 | try: 132 | # loads from file 133 | Xs, y = load_svmlight_file(datafile) 134 | 135 | # dumps to file 136 | dump_svmlight_file(Xs, y, tmpfile, zero_based=False) 137 | 138 | # loads them as CSR MATRIX 139 | X2, y2 = sk_load_svmlight_file(tmpfile) 140 | 141 | X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype) 142 | X2.toarray(out=X3) 143 | 144 | # check assertions 145 | assert_array_almost_equal(Xs, X3) 146 | assert_array_almost_equal(y, y2) 147 | finally: 148 | if os.path.exists(tmpfile): 149 | os.remove(tmpfile) 150 | 151 | @unittest.skipIf(scikit_missing, "Scikit-Learn package missing") 152 | def test_dump_qid(self): 153 | tmpfile = "/tmp/tmp_dump.txt" 154 | try: 155 | # loads from file 156 | Xs, y, q = load_svmlight_file(qid_datafile, query_id=True) 157 | 158 | # dumps to file 159 | dump_svmlight_file(Xs, y, tmpfile, query_id=list(q), zero_based=False) 160 | 161 | # loads them as CSR MATRIX with scikit-learn 162 | X2, y2, q2 = sk_load_svmlight_file(tmpfile, query_id=True) 163 | 164 | X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype) 165 | X2.toarray(out=X3) 166 | 167 | # check assertions 168 | assert_array_almost_equal(Xs, X3) 169 | assert_array_almost_equal(y, y2) 170 | assert_array_equal(q, q2) 171 | finally: 172 | if os.path.exists(tmpfile): 173 | os.remove(tmpfile) 174 | 175 | 176 | if __name__ == '__main__': 177 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) 178 | unittest.main() 179 | -------------------------------------------------------------------------------- /doc/src/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, u'/Users/salvatore/Documents/Projects/rankeval/rankeval') 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = u'RankEval' 23 | copyright = u'2018, HPC Lab' 24 | author = u'HPC Lab' 25 | 26 | # The short X.Y version 27 | version = u'0' 28 | # The full version, including alpha/beta/rc tags 29 | release = u'0.00' 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | 'sphinx.ext.autodoc', 43 | 'sphinx.ext.viewcode', 44 | 'sphinx.ext.todo', 45 | ] 46 | 47 | # Add any paths that contain templates here, relative to this directory. 48 | templates_path = ['_templates'] 49 | 50 | # The suffix(es) of source filenames. 51 | # You can specify multiple suffix as a list of string: 52 | # 53 | # source_suffix = ['.rst', '.md'] 54 | source_suffix = '.rst' 55 | 56 | # The master toctree document. 57 | master_doc = 'index' 58 | 59 | # The language for content autogenerated by Sphinx. Refer to documentation 60 | # for a list of supported languages. 61 | # 62 | # This is also used if you do content translation via gettext catalogs. 63 | # Usually you set "language" from the command line for these cases. 64 | language = 'en' 65 | 66 | # List of patterns, relative to source directory, that match files and 67 | # directories to ignore when looking for source files. 68 | # This pattern also affects html_static_path and html_extra_path . 69 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 70 | 71 | # The name of the Pygments (syntax highlighting) style to use. 72 | pygments_style = 'sphinx' 73 | 74 | 75 | # -- Options for HTML output ------------------------------------------------- 76 | 77 | # The theme to use for HTML and HTML Help pages. See the documentation for 78 | # a list of builtin themes. 79 | # 80 | html_theme = 'alabaster' 81 | 82 | # Theme options are theme-specific and customize the look and feel of a theme 83 | # further. For a list of options available for each theme, see the 84 | # documentation. 85 | # 86 | # html_theme_options = {} 87 | 88 | # Add any paths that contain custom static files (such as style sheets) here, 89 | # relative to this directory. They are copied after the builtin static files, 90 | # so a file named "default.css" will overwrite the builtin "default.css". 91 | html_static_path = ['_static'] 92 | 93 | # Custom sidebar templates, must be a dictionary that maps document names 94 | # to template names. 95 | # 96 | # The default sidebars (for documents that don't match any pattern) are 97 | # defined by theme itself. Builtin themes are using these templates by 98 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 99 | # 'searchbox.html']``. 100 | # 101 | # html_sidebars = {} 102 | 103 | 104 | # -- Options for HTMLHelp output --------------------------------------------- 105 | 106 | # Output file base name for HTML help builder. 107 | htmlhelp_basename = 'RankEvaldoc' 108 | 109 | 110 | # -- Options for LaTeX output ------------------------------------------------ 111 | 112 | latex_elements = { 113 | # The paper size ('letterpaper' or 'a4paper'). 
114 | # 115 | # 'papersize': 'letterpaper', 116 | 117 | # The font size ('10pt', '11pt' or '12pt'). 118 | # 119 | # 'pointsize': '10pt', 120 | 121 | # Additional stuff for the LaTeX preamble. 122 | # 123 | # 'preamble': '', 124 | 125 | # Latex figure (float) alignment 126 | # 127 | # 'figure_align': 'htbp', 128 | } 129 | 130 | # Grouping the document tree into LaTeX files. List of tuples 131 | # (source start file, target name, title, 132 | # author, documentclass [howto, manual, or own class]). 133 | latex_documents = [ 134 | (master_doc, 'RankEval.tex', u'RankEval Documentation', 135 | u'HPC Lab', 'manual'), 136 | ] 137 | 138 | 139 | # -- Options for manual page output ------------------------------------------ 140 | 141 | # One entry per manual page. List of tuples 142 | # (source start file, name, description, authors, manual section). 143 | man_pages = [ 144 | (master_doc, 'rankeval', u'RankEval Documentation', 145 | [author], 1) 146 | ] 147 | 148 | 149 | # -- Options for Texinfo output ---------------------------------------------- 150 | 151 | # Grouping the document tree into Texinfo files. List of tuples 152 | # (source start file, target name, title, author, 153 | # dir menu entry, description, category) 154 | texinfo_documents = [ 155 | (master_doc, 'RankEval', u'RankEval Documentation', 156 | author, 'RankEval', 'One line description of project.', 157 | 'Miscellaneous'), 158 | ] 159 | 160 | 161 | # -- Options for Epub output ------------------------------------------------- 162 | 163 | # Bibliographic Dublin Core info. 164 | epub_title = project 165 | epub_author = author 166 | epub_publisher = author 167 | epub_copyright = copyright 168 | 169 | # The unique identifier of the text. This can be a ISBN number 170 | # or the project homepage. 171 | # 172 | # epub_identifier = '' 173 | 174 | # A unique identification for the text. 175 | # 176 | # epub_uid = '' 177 | 178 | # A list of files that should not be packed into the epub file. 179 | epub_exclude_files = ['search.html'] 180 | 181 | 182 | # -- Extension configuration ------------------------------------------------- 183 | 184 | # -- Options for todo extension ---------------------------------------------- 185 | 186 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
187 | todo_include_todos = True# custom 188 | extensions += ['sphinx.ext.todo'] 189 | todo_include_todos = True 190 | extensions += ['sphinx.ext.autosummary'] 191 | extensions += ['sphinx.ext.imgmath'] 192 | numpydoc_show_class_members = False 193 | html_theme = "sphinx_rtd_theme" 194 | import sys,os 195 | sys.path.insert(0, os.path.abspath('../../') ) 196 | from setuptools import sandbox 197 | sandbox.run_setup(os.path.abspath('../../setup.py'), ['build_ext','-i']) 198 | autoclass_content = 'both' 199 | -------------------------------------------------------------------------------- /rankeval/analysis/_efficient_feature_impl.cpp: -------------------------------------------------------------------------------- 1 | #include "_efficient_feature_impl.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | void c_feature_importance( 8 | const float* X, 9 | const float* y, 10 | const int* trees_root, 11 | const float* trees_weight, 12 | const short* trees_nodes__feature, 13 | const float* trees_nodes_value, 14 | const int* trees_left_child, 15 | const int* trees_right_child, 16 | float* feature_imp, 17 | short* feature_count, 18 | int n_instances, 19 | int n_features, 20 | int n_trees) { 21 | 22 | // initialize features importance 23 | #pragma omp parallel for 24 | for (unsigned int feature = 0; feature < n_features; ++feature) { 25 | feature_imp[feature] = 0; 26 | } 27 | 28 | // default scores on the root node of the first tree 29 | std::vector y_pred(n_instances, 0); 30 | std::vector y_pred_tree(n_instances); 31 | 32 | for (unsigned int tree_id=0; tree_id split_instance(n_instances); 71 | // The residual scores to fit 72 | // y_target = y - y_pred 73 | std::vector y_target(n_instances); 74 | float mean_y_target = 0; 75 | #pragma omp parallel for reduction( + : mean_y_target ) 76 | for (unsigned int instance = 0; instance < n_instances; ++instance) { 77 | split_instance[instance] = instance; 78 | y_target[instance] = y[instance] - y_pred[instance]; 79 | mean_y_target += y_target[instance]; 80 | } 81 | mean_y_target /= n_instances; 82 | 83 | // initialize the y_pred_tree vector 84 | // y_pred_tree = np.full(n_instances, fill_value=y_target.mean()) 85 | #pragma omp parallel for 86 | for (unsigned int instance = 0; instance < n_instances; ++instance) 87 | y_pred_tree[instance] = mean_y_target; 88 | 89 | TreeNode root(trees_root[tree_id], 0, n_instances - 1); 90 | std::vector queue = { root }; 91 | 92 | while (!queue.empty()) { 93 | 94 | TreeNode node = queue.back(); 95 | queue.pop_back(); 96 | 97 | int node_id = node.node_id; 98 | short feature_id = trees_nodes_feature[node_id]; 99 | float threshold = trees_nodes_value[node_id]; 100 | 101 | feature_count[feature_id]++; 102 | 103 | // Split the instances in left-right (end_id will be the frontier) 104 | int start_id = node.start_id; 105 | int end_id = node.end_id; 106 | float y_target_mean_left = 0, y_target_mean_right = 0; 107 | unsigned int instance; 108 | while (start_id <= end_id) { 109 | instance = split_instance[start_id]; 110 | if (X[instance * n_features + feature_id] <= threshold) { 111 | y_target_mean_left += y_target[instance]; 112 | ++start_id; 113 | } else { 114 | y_target_mean_right += y_target[instance]; 115 | std::swap(split_instance[start_id], split_instance[end_id]); 116 | --end_id; 117 | } 118 | } 119 | 120 | int left_docs = end_id - node.start_id + 1; 121 | int right_docs = node.end_id - end_id; 122 | 123 | // we need to normalize the mean y_targets (left and right) 124 | if (left_docs > 0) 125 | y_target_mean_left /= left_docs; 
126 | if (right_docs > 0) 127 | y_target_mean_right /= right_docs; 128 | 129 | // compute split gain 130 | float delta_mse = 0; 131 | #pragma omp parallel for reduction( + : delta_mse ) 132 | for (unsigned int i = node.start_id; i <= node.end_id; ++i) { 133 | unsigned int instance = split_instance[i]; 134 | float pre_split_mse = 135 | pow(y_target[instance] - y_pred_tree[instance], 2); 136 | 137 | if (i <= end_id) 138 | y_pred_tree[instance] = y_target_mean_left; 139 | else 140 | y_pred_tree[instance] = y_target_mean_right; 141 | 142 | float post_split_mse = 143 | pow(y_target[instance] - y_pred_tree[instance], 2); 144 | 145 | delta_mse += pre_split_mse - post_split_mse; 146 | } 147 | 148 | // update feature importance 149 | feature_imp[feature_id] += delta_mse / n_instances; 150 | 151 | // if children are not leaves, add in the queue of the nodes to visit 152 | if (!is_leaf_node(trees_left_child[node_id], 153 | trees_left_child, 154 | trees_right_child) && end_id > node.start_id) { 155 | 156 | TreeNode left(trees_left_child[node_id], node.start_id, end_id); 157 | queue.push_back(left); 158 | } 159 | 160 | if (!is_leaf_node(trees_right_child[node_id], 161 | trees_left_child, 162 | trees_right_child) && node.end_id > (end_id + 1) ) { 163 | 164 | TreeNode right(trees_right_child[node_id], end_id + 1, node.end_id); 165 | queue.push_back(right); 166 | } 167 | } 168 | 169 | #pragma omp parallel for 170 | for (unsigned int instance = 0; instance < n_instances; ++instance) { 171 | y_pred_tree[instance] *= trees_weight[tree_id]; 172 | y_pred[instance] += y_pred_tree[instance]; 173 | } 174 | } -------------------------------------------------------------------------------- /rankeval/model/proxy_CatBoost.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """Class providing the implementation for loading/storing a CatBoost model 9 | from/to file. 10 | 11 | The CatBoost project is described here: 12 | https://github.com/catboost/catboost 13 | 14 | CatBoost allows saving the learned model in several formats (binary, CoreML, 15 | etc.). Among them, we chose to adopt the Apple CoreML format for reading and 16 | converting a model into the rankeval representation. It is possible to read the 17 | CoreML representation using the coremltools python package. Once read, it 18 | provides all the structured information of the ensemble, with split nodes (both 19 | features and thresholds), leaf values and tree structure. Not all the 20 | information reported in the model is useful for the different analyses, thus 21 | only the relevant parts are parsed. 22 | 23 | NOTE: CatBoost trains oblivious trees, i.e., trees where at each level a single 24 | condition is checked, independently of which node we are currently working 25 | on. Rankeval does not exploit oblivious trees, but instead represents them 26 | as normal decision trees. Thus the same condition will appear on all the nodes 27 | of a single level of a tree. The reason behind this choice is to speed up the 28 | development of the CatBoost proxy, allowing it to be analyzed without focusing too 29 | much on prediction time (which is not currently measured by rankeval).
30 | """ 31 | 32 | import numpy as np 33 | import logging 34 | 35 | from .rt_ensemble import RTEnsemble 36 | 37 | 38 | class ProxyCatBoost(object): 39 | """ 40 | Class providing the implementation for loading/storing a ProxyCatBoost model 41 | from/to file. 42 | """ 43 | 44 | @staticmethod 45 | def load(file_path, model): 46 | """ 47 | Load the model from the file identified by file_path. 48 | 49 | Parameters 50 | ---------- 51 | file_path : str 52 | The path to the filename where the model has been saved 53 | model : RTEnsemble 54 | The model instance to fill 55 | """ 56 | 57 | try: 58 | import coremltools 59 | except ImportError: 60 | logging.error('Missing coremltools package!.') 61 | return 62 | 63 | coreml_model = coremltools.models.model.MLModel(file_path) 64 | 65 | n_trees, n_nodes = ProxyCatBoost._count_nodes(coreml_model) 66 | # Initialize the model and allocate the needed space 67 | # given the shape and size of the ensemble 68 | model.initialize(n_trees, n_nodes) 69 | 70 | n_nodes_per_tree = int(n_nodes / n_trees) 71 | 72 | nodes = coreml_model.get_spec().treeEnsembleRegressor.treeEnsemble.nodes 73 | behaviors = coremltools.proto.TreeEnsemble_pb2.TreeEnsembleParameters.\ 74 | TreeNode.TreeNodeBehavior 75 | 76 | for node in nodes: 77 | tree_offset = node.treeId * n_nodes_per_tree 78 | node_id_remap = ProxyCatBoost.remap_nodeId(node.nodeId, 79 | n_nodes_per_tree) 80 | node_id_off = node_id_remap + tree_offset 81 | 82 | if node_id_remap == 0: # this is the root of a tree 83 | model.trees_root[node.treeId] = tree_offset 84 | model.trees_weight[node.treeId] = 1 85 | 86 | if node.nodeBehavior == behaviors.Value('LeafNode'): 87 | model.trees_nodes_value[node_id_off] = \ 88 | node.evaluationInfo[0].evaluationValue 89 | else: 90 | if node.nodeBehavior == behaviors.Value('BranchOnValueGreaterThan'): 91 | # we need to flip the condition given we use "<=" 92 | left = node.falseChildNodeId 93 | right = node.trueChildNodeId 94 | elif node.nodeBehavior == behaviors.Value('BranchOnValueLessThanEqual'): 95 | right = node.falseChildNodeId 96 | left = node.trueChildNodeId 97 | else: 98 | raise AssertionError( 99 | "Branching condition not supported. RankEval does not " 100 | "support branching conditions different from " 101 | "BranchOnValueGreaterThan or BranchOnValueLessThanEqual.") 102 | 103 | model.trees_nodes_value[node_id_off] = node.branchFeatureValue 104 | model.trees_nodes_feature[node_id_off] = node.branchFeatureIndex 105 | model.trees_left_child[node_id_off] = tree_offset +\ 106 | ProxyCatBoost.remap_nodeId(left, n_nodes_per_tree) 107 | model.trees_right_child[node_id_off] = tree_offset + \ 108 | ProxyCatBoost.remap_nodeId(right, n_nodes_per_tree) 109 | 110 | @staticmethod 111 | def remap_nodeId(nodeId, n_nodes_per_tree): 112 | return n_nodes_per_tree - 1 - nodeId 113 | 114 | @staticmethod 115 | def save(file_path, model): 116 | """ 117 | Save the model onto the file identified by file_path. 118 | 119 | Parameters 120 | ---------- 121 | file_path : str 122 | The path to the filename where the model has to be saved 123 | model : RTEnsemble 124 | The model RTEnsemble model to save on file 125 | 126 | Returns 127 | ------- 128 | status : bool 129 | Returns true if the save is successful, false otherwise 130 | """ 131 | raise NotImplementedError("Feature not implemented!") 132 | 133 | @staticmethod 134 | def _count_nodes(coreml_model): 135 | """ 136 | Count the total number of nodes (both split and leaf nodes) 137 | in the CoreML model. 
138 | 139 | Parameters 140 | ---------- 141 | coreml_model : CoreML model 142 | The CoreML model to load from 143 | 144 | Returns 145 | ------- 146 | tuple(n_trees, n_nodes) : tuple(int, int) 147 | The total number of trees and nodes (both split and leaf nodes) 148 | in the given CoreML model. 149 | """ 150 | 151 | nodes = coreml_model.get_spec().treeEnsembleRegressor.treeEnsemble.nodes 152 | 153 | n_trees = np.max([node.treeId for node in nodes]) + 1 154 | 155 | n_nodes_trees = np.empty(n_trees, dtype=np.uint16) 156 | for node in nodes: 157 | n_nodes_trees[node.treeId] = node.nodeId 158 | 159 | # node_Id starts from 0, thus + 1 160 | n_nodes = np.sum(n_nodes_trees + 1) 161 | 162 | return n_trees, n_nodes -------------------------------------------------------------------------------- /rankeval/model/proxy_XGBoost.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """Class providing the implementation for loading/storing an XGBoost model 9 | from/to file. The model has to be saved using the textual representation, i.e., by 10 | using the following method: 11 | .. code-block:: python 12 | import xgboost as xgb 13 | ... 14 | bst = xgb.train(param, dtrain, num_round) 15 | bst.dump_model('xgboost.model') 16 | 17 | The XGBoost project is described here: 18 | https://github.com/dmlc/xgboost 19 | 20 | The XGBoost format adopts a textual representation where each line of the file 21 | represents a single split node or a leaf node, with several attributes describing 22 | the feature and the threshold involved (in case of a split node) or the output 23 | (in case of a leaf). Each node is identified by a unique integer; additional 24 | information not useful for rankeval is ignored. 25 | 26 | NOTE: XGBoost version 0.6 does not properly dump the model. Indeed, as 27 | reported in the issue here: 28 | 29 | - https://github.com/dmlc/xgboost/issues/2077 30 | 31 | The precision of the dump is not sufficient and causes inconsistencies with 32 | the XGBoost model. These inconsistencies cause rankeval scoring to return 33 | predictions different from those of the original model. Without a fix by the 34 | XGBoost authors, DO NOT USE this proxy. 35 | """ 36 | 37 | import re 38 | import sys 39 | import numpy as np 40 | 41 | from .rt_ensemble import RTEnsemble 42 | 43 | tree_reg = re.compile("^booster\[(\d+)\]") 44 | node_reg = re.compile("(\d+):\[f(\d+)<(.*)\]") 45 | leaf_reg = re.compile("(\d+):leaf=(.+?)(,.*)?$") 46 | 47 | 48 | class ProxyXGBoost(object): 49 | """ 50 | Class providing the implementation for loading/storing an XGBoost model 51 | from/to file. 52 | """ 53 | 54 | @staticmethod 55 | def load(file_path, model): 56 | """ 57 | Load the model from the file identified by file_path.
58 | 59 | Parameters 60 | ---------- 61 | file_path : str 62 | The path to the filename where the model has been saved 63 | model : RTEnsemble 64 | The model instance to fill 65 | """ 66 | n_trees, n_nodes = ProxyXGBoost._count_nodes(file_path) 67 | # Initialize the model and allocate the needed space 68 | # given the shape and size of the ensemble 69 | model.initialize(n_trees, n_nodes) 70 | 71 | root_node = 0 72 | num_nodes = 0 73 | queue = list() 74 | with open(file_path, 'r') as f: 75 | for line in f: 76 | 77 | match_tree = tree_reg.match(line) 78 | if match_tree: 79 | assert(len(queue) == 0) 80 | curr_tree = int(match_tree.group(1)) 81 | root_node += num_nodes 82 | num_nodes = 0 83 | model.trees_root[curr_tree] = root_node 84 | model.trees_weight[curr_tree] = 1 85 | continue 86 | 87 | match_node = node_reg.search(line) 88 | if match_node: 89 | node_id = num_nodes + root_node 90 | feature_id = int(match_node.group(2).strip()) 91 | threshold = np.float32(match_node.group(3).strip()) 92 | 93 | # Needed because XGBoost use as split condition 94 | # < in place of <= 95 | threshold = np.nextafter( 96 | threshold, threshold - 1, 97 | dtype=model.trees_nodes_value.dtype) 98 | 99 | model.trees_nodes_feature[node_id] = feature_id 100 | model.trees_nodes_value[node_id] = threshold 101 | 102 | match_leaf = leaf_reg.search(line) 103 | if match_leaf: 104 | node_id = num_nodes + root_node 105 | leaf_value = float(match_leaf.group(2).strip()) 106 | model.trees_nodes_value[node_id] = leaf_value 107 | 108 | if match_node or match_leaf: 109 | num_nodes += 1 110 | if len(queue) > 0: 111 | parent_id, child = queue.pop() 112 | if child == 'L': 113 | model.trees_left_child[parent_id] = node_id 114 | else: 115 | model.trees_right_child[parent_id] = node_id 116 | 117 | if match_node: 118 | # two elements in the queue for the left and right children 119 | # Each element is identified by a node_id and the indication 120 | # of being the left or right child. 121 | queue.extend([(node_id, 'R'), (node_id, 'L')]) 122 | 123 | @staticmethod 124 | def save(file_path, model): 125 | """ 126 | Save the model onto the file identified by file_path. 127 | 128 | Parameters 129 | ---------- 130 | file_path : str 131 | The path to the filename where the model has to be saved 132 | model : RTEnsemble 133 | The model RTEnsemble model to save on file 134 | 135 | Returns 136 | ------- 137 | status : bool 138 | Returns true if the save is successful, false otherwise 139 | """ 140 | raise NotImplementedError("Feature not implemented!") 141 | 142 | @staticmethod 143 | def _count_nodes(file_path): 144 | """ 145 | Count the total number of nodes (both split and leaf nodes) 146 | in the model identified by file_path. 147 | 148 | Parameters 149 | ---------- 150 | file_path : str 151 | The path to the filename where the model has been saved 152 | 153 | Returns 154 | ------- 155 | tuple(n_trees, n_nodes) : tuple(int, int) 156 | The total number of trees and nodes (both split and leaf nodes) 157 | in the model identified by file_path. 
158 | """ 159 | 160 | n_nodes = 0 161 | n_trees = 0 162 | 163 | with open(file_path, 'r') as f: 164 | for line in f: 165 | 166 | match = tree_reg.match(line) 167 | if match: 168 | n_trees += 1 169 | continue 170 | 171 | match_node = node_reg.search(line) 172 | if match_node: 173 | n_nodes += 1 174 | 175 | match_leaf = leaf_reg.search(line) 176 | if match_leaf: 177 | n_nodes += 1 178 | 179 | return n_trees, n_nodes 180 | -------------------------------------------------------------------------------- /rankeval/model/proxy_Jforests.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017, All Contributors (see CONTRIBUTORS file) 2 | # Authors: Salvatore Trani 3 | # 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | """ 9 | Class providing the implementation for loading/storing a QuickRank model 10 | from/to file. 11 | 12 | The Jforests project is described here: https://github.com/yasserg/jforests 13 | 14 | The Jforests format adopts an XML representation. There is an ensemble node, 15 | with a sub-node for each tree, identified by the "Tree" tag, followed by the 16 | description of the tree (with splitting and leaf nodes). The splitting nodes are 17 | described with two information: the feature-id used for splitting, and the 18 | threshold value. Leaf nodes on the other hand are described by a "LeafOutputs" 19 | tag with the value as content. 20 | """ 21 | 22 | from .rt_ensemble import RTEnsemble 23 | 24 | try: 25 | import xml.etree.cElementTree as etree 26 | except ImportError: 27 | import xml.etree.ElementTree as etree 28 | 29 | 30 | class ProxyJforests(object): 31 | """ 32 | Class providing the implementation for loading/storing a Jforests model 33 | from/to file. 34 | """ 35 | 36 | @staticmethod 37 | def load(file_path, model): 38 | """ 39 | Load the model from the file identified by file_path. 
40 | 41 | Parameters 42 | ---------- 43 | file_path : str 44 | The path to the filename where the model has been saved 45 | model : RTEnsemble 46 | The model instance to fill 47 | """ 48 | n_trees, n_nodes = ProxyJforests._count_nodes(file_path) 49 | # Initialize the model and allocate the needed space 50 | # given the shape and size of the ensemble 51 | model.initialize(n_trees, n_nodes) 52 | 53 | # get an iterable 54 | context = etree.iterparse(file_path, events=("start", "end")) 55 | 56 | # get the root element 57 | _, root = next(context) 58 | 59 | curr_tree = -1 60 | root_node = 0 61 | num_leaves = num_splits = 0 62 | 63 | for event, elem in context: 64 | 65 | if event == 'start' and elem.tag == 'Tree': 66 | curr_tree += 1 # increase the current number index 67 | root_node += num_leaves + num_splits 68 | # save the curr node as the root of a new tree 69 | model.trees_root[curr_tree] = root_node 70 | model.trees_weight[curr_tree] = elem.attrib['weight'] 71 | 72 | if event == 'end': 73 | 74 | if elem.tag == 'SplitFeatures': 75 | split_features = map(int, elem.text.split(" ")) 76 | num_splits = 0 77 | for pos, feature in enumerate(split_features): 78 | num_splits += 1 79 | model.trees_nodes_feature[root_node + pos] = feature 80 | elif elem.tag == 'LeftChildren': 81 | left_children = map(int, elem.text.split(" ")) 82 | for pos, child in enumerate(left_children): 83 | if child >= 0: 84 | model.trees_left_child[root_node + pos] = \ 85 | root_node + child 86 | else: 87 | model.trees_left_child[root_node + pos] = \ 88 | root_node + num_splits + abs(child) - 1 89 | elif elem.tag == 'RightChildren': 90 | right_children = map(int, elem.text.split(" ")) 91 | for pos, child in enumerate(right_children): 92 | if child >= 0: 93 | model.trees_right_child[root_node + pos] = \ 94 | root_node + child 95 | else: 96 | model.trees_right_child[root_node + pos] = \ 97 | root_node + num_splits + abs(child) - 1 98 | elif elem.tag == 'OriginalThresholds': 99 | thresholds = map(float, elem.text.split(" ")) 100 | for pos, threshold in enumerate(thresholds): 101 | model.trees_nodes_value[root_node + pos] = threshold 102 | elif elem.tag == 'LeafOutputs': 103 | leaf_values = map(float, elem.text.split(" ")) 104 | num_leaves = 0 105 | for pos, leaf_value in enumerate(leaf_values): 106 | num_leaves += 1 107 | model.trees_nodes_value[root_node + num_splits + pos] \ 108 | = leaf_value 109 | 110 | # clear the memory 111 | if event == 'end': 112 | elem.clear() # discard the element 113 | root.clear() # remove child reference from the root 114 | 115 | @staticmethod 116 | def save(file_path, model): 117 | """ 118 | Save the model onto the file identified by file_path. 119 | 120 | Parameters 121 | ---------- 122 | file_path : str 123 | The path to the filename where the model has to be saved 124 | model : RTEnsemble 125 | The model RTEnsemble model to save on file 126 | 127 | Returns 128 | ------- 129 | status : bool 130 | Returns true if the save is successful, false otherwise 131 | """ 132 | raise NotImplementedError("Feature not implemented!") 133 | 134 | @staticmethod 135 | def _count_nodes(file_path): 136 | """ 137 | Count the total number of nodes (both split and leaf nodes) 138 | in the model identified by file_path. 
139 | 140 | Parameters 141 | ---------- 142 | file_path : str 143 | The path to the filename where the model has been saved 144 | 145 | Returns 146 | ------- 147 | tuple(n_trees, n_nodes) : tuple(int, int) 148 | The total number of trees and nodes (both split and leaf nodes) 149 | in the model identified by file_path. 150 | """ 151 | # get an iterable 152 | # NOTE: it seems like there is a bug inside lxml: when selecting only 153 | # terminal tags with events=("end",), some tags are skipped... 154 | context = etree.iterparse(file_path, events=("start", "end")) 155 | 156 | # get the root element 157 | _, root = next(context) 158 | 159 | n_nodes = 0 160 | n_trees = 0 161 | for event, elem in context: 162 | if event != "end": 163 | continue 164 | if elem.tag == 'Tree': 165 | n_trees += 1 166 | elif elem.tag == 'SplitFeatures' or elem.tag == 'LeafOutputs': 167 | n_nodes += len(elem.text.split(" ")) 168 | 169 | elem.clear() # discard the element 170 | root.clear() # remove root reference to the child 171 | 172 | return n_trees, n_nodes 173 | --------------------------------------------------------------------------------
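
The child-index remapping performed by `ProxyJforests.load` above (non-negative children point to other split nodes, negative children point to leaves, with split nodes stored before the leaves of the same tree) can be summarised by the following standalone sketch. It is illustrative only: the function name is hypothetical, and `root_node`/`num_splits` simply mirror the variables used in the loader.

```python
def jforests_child_to_offset(child, root_node, num_splits):
    """Map a Jforests child index onto the flat node arrays used by RankEval.

    Non-negative values reference another split node of the same tree,
    while a negative value -k references the k-th leaf; split nodes are
    stored first, followed by the leaves.
    """
    if child >= 0:
        return root_node + child                    # another split node
    return root_node + num_splits + abs(child) - 1  # a leaf slot

# Example: a tree stored at offset 100 with 3 split nodes keeps its splits
# at positions 100-102 and its leaves starting at position 103.
assert jforests_child_to_offset(2, root_node=100, num_splits=3) == 102
assert jforests_child_to_offset(-2, root_node=100, num_splits=3) == 104
```
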