├── elote ├── py.typed ├── arenas │ └── __init__.py ├── competitors │ └── __init__.py ├── datasets │ ├── __init__.py │ ├── synthetic.py │ └── utils.py ├── logging.py └── benchmark.py ├── tests ├── __init__.py ├── test_DWZCompetitor.py ├── test_ECFCompetitor.py ├── test_EloCompetitor.py ├── test_ColleyMatrixCompetitor.py ├── test_BlendedCompetitor.py ├── test_ECFCompetitor_known_values.py ├── test_examples.py ├── test_EloCompetitor_known_values.py ├── test_visualization.py ├── test_Arenas.py ├── test_GlickoCompetitor_known_values.py └── test_ColleyMatrixCompetitor_known_values.py ├── examples ├── __init__.py ├── use_cases │ └── __init__.py ├── dwz_arena.py ├── ecf_arena.py ├── prediction.py ├── sample_bout.py ├── bout_with_initialization.py ├── bout_with_ties.py ├── sample_arena.py ├── persist_state_arena.py ├── trueskill_tournament.py ├── glicko_arena.py ├── trueskill_example.py ├── colley_matrix_example.py ├── glicko2_example.py └── dataset_example.py ├── docs ├── source │ ├── CNAME │ ├── arenas.rst │ ├── api │ │ ├── arenas.rst │ │ └── competitors.rst │ ├── blog_posts.rst │ ├── competitors.rst │ ├── index.rst │ ├── getting_started.rst │ ├── conf.py │ ├── installation.rst │ ├── rating_systems │ │ ├── elo.rst │ │ ├── ecf.rst │ │ ├── dwz.rst │ │ ├── glicko.rst │ │ └── ensemble.rst │ ├── contributing.rst │ └── advance_example.rst ├── requirements.txt ├── Makefile └── make.bat ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── ISSUE_TEMPLATE.md └── workflows │ ├── test-docs-build.yml │ ├── docs.yml │ ├── test-suite.yml │ └── pypi-publish.yml ├── images ├── colley_matrix_ratings.png ├── cfb │ ├── calibration_curves.png │ ├── calibration_comparison.png │ ├── accuracy_by_prior_bouts.png │ ├── rating_systems_comparison.png │ └── optimized_accuracy_comparison.png └── chess │ ├── calibration_curves.png │ ├── calibration_comparison.png │ ├── accuracy_by_prior_bouts.png │ ├── rating_systems_comparison.png │ └── optimized_accuracy_comparison.png ├── .gitignore ├── CONTRIBUTING.md ├── tox.ini ├── LICENSE.md ├── CHANGELOG.md ├── .cursor └── rules │ ├── elote_testing.mdc │ ├── pytest_standards.mdc │ ├── python_standards.mdc │ └── sphinx_docs_standards.mdc ├── CODE_OF_CONDUCT.md ├── pyproject.toml ├── Makefile └── scripts └── run_benchmarks.py /elote/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elote/arenas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elote/competitors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/use_cases/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/source/CNAME: -------------------------------------------------------------------------------- 1 | elote.mcginniscommawill.com 
-------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Fixes # 2 | 3 | ## Proposed Changes 4 | 5 | - 6 | - 7 | - -------------------------------------------------------------------------------- /images/colley_matrix_ratings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/colley_matrix_ratings.png -------------------------------------------------------------------------------- /images/cfb/calibration_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/cfb/calibration_curves.png -------------------------------------------------------------------------------- /images/cfb/calibration_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/cfb/calibration_comparison.png -------------------------------------------------------------------------------- /images/chess/calibration_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/chess/calibration_curves.png -------------------------------------------------------------------------------- /images/cfb/accuracy_by_prior_bouts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/cfb/accuracy_by_prior_bouts.png -------------------------------------------------------------------------------- /images/chess/calibration_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/chess/calibration_comparison.png -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=6.1.3 2 | docutils>=0.19 3 | sphinx_rtd_theme 4 | sphinx-rtd-dark-mode 5 | sphinxcontrib-googleanalytics -------------------------------------------------------------------------------- /images/cfb/rating_systems_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/cfb/rating_systems_comparison.png -------------------------------------------------------------------------------- /images/chess/accuracy_by_prior_bouts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/chess/accuracy_by_prior_bouts.png -------------------------------------------------------------------------------- /images/chess/rating_systems_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/chess/rating_systems_comparison.png -------------------------------------------------------------------------------- /images/cfb/optimized_accuracy_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/cfb/optimized_accuracy_comparison.png 
-------------------------------------------------------------------------------- /images/chess/optimized_accuracy_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/chess/optimized_accuracy_comparison.png -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Expected Behavior 2 | 3 | 4 | ## Actual Behavior 5 | 6 | 7 | ## Steps to Reproduce the Problem 8 | 9 | 1. 10 | 1. 11 | 1. 12 | 13 | ## Specifications 14 | 15 | - Version: 16 | - Platform: 17 | - Subsystem: -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | docs/build/* 2 | *~ 3 | .DS_Store 4 | .idea 5 | *.pyc 6 | *.sublime-project 7 | *.sublime-workspace 8 | docs/build/ 9 | benchmark_results/ 10 | .tox/ 11 | .coverage 12 | htmlcov/ 13 | .pytest_cache/ 14 | .ruff_cache/ 15 | *.egg-info/ 16 | .venv/ 17 | .benchmarks/ 18 | .coverage* 19 | .uv.lock -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributing 2 | ============ 3 | 4 | The idea is that we will add more and more rating methods first, then work on being a little smarter about arenas and 5 | maybe even ensemble arenas. 6 | 7 | Then we'll gather some good datasets to evaluate the different rating systems, figure out which one would have made Auburn win 8 | the BCS title in 2004, and call that the best one. 9 | 10 | -------------------------------------------------------------------------------- /docs/source/arenas.rst: -------------------------------------------------------------------------------- 1 | Arenas 2 | ====== 3 | 4 | Arenas are objects that manage populations of competitors and their matchups. Currently there is only one 5 | type of arena implemented, the ``LambdaArena``. 6 | 7 | Lambda Arena 8 | ------------ 9 | 10 | .. autoclass:: elote.arenas.lambda_arena.LambdaArena 11 | :members: 12 | 13 | 14 | Helpers 15 | ------- 16 | 17 | .. autoclass:: elote.arenas.base.History 18 | :members: 19 | 20 | .. autoclass:: elote.arenas.base.Bout 21 | :members: 22 | 23 | 24 | -------------------------------------------------------------------------------- /docs/source/api/arenas.rst: -------------------------------------------------------------------------------- 1 | Arenas API Reference 2 | ==================== 3 | 4 | This page provides detailed API documentation for all arena classes in Elote. 5 | 6 | Base Arena 7 | ---------- 8 | 9 | .. automodule:: elote.arenas.base 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | :special-members: __init__ 14 | 15 | Lambda Arena 16 | ------------ 17 | 18 | ..
automodule:: elote.arenas.lambda_arena 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | :special-members: __init__ -------------------------------------------------------------------------------- /examples/dwz_arena.py: -------------------------------------------------------------------------------- 1 | from elote import LambdaArena, DWZCompetitor 2 | import json 3 | import random 4 | 5 | 6 | # sample bout function which just compares the two inputs 7 | def func(a, b): 8 | if a == b: 9 | return None 10 | else: 11 | return a > b 12 | 13 | 14 | matchups = [(random.randint(1, 10), random.randint(1, 10)) for _ in range(1000)] 15 | 16 | arena = LambdaArena(func, base_competitor=DWZCompetitor) 17 | arena.tournament(matchups) 18 | 19 | print("Arena results:") 20 | print(json.dumps(arena.leaderboard(), indent=4)) 21 | -------------------------------------------------------------------------------- /examples/ecf_arena.py: -------------------------------------------------------------------------------- 1 | from elote import LambdaArena, ECFCompetitor 2 | import json 3 | import random 4 | 5 | 6 | # sample bout function which just compares the two inputs 7 | def func(a, b): 8 | if a == b: 9 | return None 10 | else: 11 | return a > b 12 | 13 | 14 | matchups = [(random.randint(1, 10), random.randint(1, 10)) for _ in range(1000)] 15 | 16 | arena = LambdaArena(func, base_competitor=ECFCompetitor) 17 | arena.tournament(matchups) 18 | 19 | print("Arena results:") 20 | print(json.dumps(arena.leaderboard(), indent=4)) 21 | -------------------------------------------------------------------------------- /examples/prediction.py: -------------------------------------------------------------------------------- 1 | from elote import EloCompetitor 2 | 3 | good = EloCompetitor(initial_rating=400) 4 | better = EloCompetitor(initial_rating=500) 5 | 6 | print("probability of better beating good: %5.2f%%" % (better.expected_score(good) * 100,)) 7 | print("probability of good beating better: %5.2f%%" % (good.expected_score(better) * 100,)) 8 | 9 | good.beat(better) 10 | 11 | print("probability of better beating good: %5.2f%%" % (better.expected_score(good) * 100,)) 12 | print("probability of good beating better: %5.2f%%" % (good.expected_score(better) * 100,)) 13 | -------------------------------------------------------------------------------- /examples/sample_bout.py: -------------------------------------------------------------------------------- 1 | from elote import EloCompetitor 2 | 3 | good = EloCompetitor() 4 | better = EloCompetitor() 5 | best = EloCompetitor() 6 | 7 | print("Starting ratings:") 8 | print( 9 | "%7.2f, %7.2f, %7.2f" 10 | % ( 11 | good.rating, 12 | better.rating, 13 | best.rating, 14 | ) 15 | ) 16 | 17 | print("\nAfter matches") 18 | 19 | for _ in range(10): 20 | better.beat(good) 21 | best.beat(better) 22 | print( 23 | "%7.2f, %7.2f, %7.2f" 24 | % ( 25 | good.rating, 26 | better.rating, 27 | best.rating, 28 | ) 29 | ) 30 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py310, py311, py312 3 | isolated_build = True 4 | requires = 5 | tox-uv 6 | 7 | [gh-actions] 8 | python = 9 | 3.10: py310 10 | 3.11: py311 11 | 3.12: py312, lint 12 | 13 | [testenv] 14 | deps = 15 | pytest 16 | pytest-cov 17 | pytest-benchmark 18 | allowlist_externals = 19 | uv 20 | commands = 21 | uv pip install -e ".[datasets]" 22 | pytest {posargs:tests} --cov=elote 
--cov-report=term 23 | 24 | [testenv:lint] 25 | deps = 26 | ruff 27 | commands = 28 | ruff check . 29 | 30 | [testenv:format] 31 | deps = 32 | ruff 33 | commands = 34 | ruff format . -------------------------------------------------------------------------------- /examples/bout_with_initialization.py: -------------------------------------------------------------------------------- 1 | from elote import EloCompetitor 2 | 3 | good = EloCompetitor(initial_rating=500) 4 | better = EloCompetitor(initial_rating=450) 5 | best = EloCompetitor(initial_rating=400) 6 | 7 | print("Starting ratings:") 8 | print( 9 | "%7.2f, %7.2f, %7.2f" 10 | % ( 11 | good.rating, 12 | better.rating, 13 | best.rating, 14 | ) 15 | ) 16 | 17 | print("\nAfter matches") 18 | 19 | for _ in range(20): 20 | better.beat(good) 21 | best.beat(better) 22 | print( 23 | "%7.2f, %7.2f, %7.2f" 24 | % ( 25 | good.rating, 26 | better.rating, 27 | best.rating, 28 | ) 29 | ) 30 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS = 7 | SPHINXBUILD = sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /.github/workflows/test-docs-build.yml: -------------------------------------------------------------------------------- 1 | name: "Pull Request Docs Check" 2 | on: 3 | - pull_request 4 | 5 | jobs: 6 | docs: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout 10 | uses: actions/checkout@v1 11 | 12 | - name: Set up Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.10' 16 | 17 | - name: Install uv 18 | run: | 19 | curl -LsSf https://astral.sh/uv/install.sh | sh 20 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 21 | 22 | - name: Install Dependencies 23 | run: | 24 | uv pip install --system -e ".[dev]" 25 | 26 | - name: Build Docs 27 | uses: ammaraskar/sphinx-action@master 28 | with: 29 | docs-folder: "docs/" -------------------------------------------------------------------------------- /docs/source/blog_posts.rst: -------------------------------------------------------------------------------- 1 | Blog Posts 2 | ========== 3 | 4 | Here are some blog posts about Elote written by the author: 5 | 6 | - `Elote: A Python Package for Rating Systems `_ - Introduction to the library and its initial design 7 | - `Using Cursor for Library Maintenance `_ - How Cursor helps maintain Elote and other open source libraries 8 | - `Year's End: Looking Back at 2017 `_ - Reflections including Elote development 9 | 10 | These posts provide additional context about the development and maintenance of Elote, as well as real-world use cases and insights from the author. 
-------------------------------------------------------------------------------- /examples/bout_with_ties.py: -------------------------------------------------------------------------------- 1 | from elote import EloCompetitor 2 | 3 | good = EloCompetitor(initial_rating=500) 4 | better = EloCompetitor(initial_rating=450) 5 | best = EloCompetitor(initial_rating=400) 6 | also_best = EloCompetitor(initial_rating=400) 7 | 8 | print("Starting ratings:") 9 | print( 10 | "%7.2f, %7.2f, %7.2f, %7.2f" 11 | % ( 12 | good.rating, 13 | better.rating, 14 | best.rating, 15 | also_best.rating, 16 | ) 17 | ) 18 | 19 | print("\nAfter matches with ties") 20 | 21 | for _ in range(20): 22 | better.beat(good) 23 | better.lost_to(best) 24 | best.tied(also_best) 25 | print( 26 | "%7.2f, %7.2f, %7.2f, %7.2f" 27 | % ( 28 | good.rating, 29 | better.rating, 30 | best.rating, 31 | also_best.rating, 32 | ) 33 | ) 34 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /examples/sample_arena.py: -------------------------------------------------------------------------------- 1 | from elote import LambdaArena, EloCompetitor 2 | import json 3 | import random 4 | 5 | 6 | # sample bout function which just compares the two inputs 7 | def func(a, b): 8 | if a == b: 9 | return None 10 | else: 11 | return a > b 12 | 13 | 14 | # Configure the EloCompetitor class with a moderate k_factor 15 | # Note: Using a more moderate k_factor (20) to prevent ratings from changing too drastically 16 | EloCompetitor.configure_class(k_factor=20) 17 | 18 | # Create arena with a higher initial rating for all competitors 19 | # Using 1200 as initial rating (standard chess starting rating) to prevent negative ratings 20 | matchups = [(random.randint(1, 10), random.randint(1, 10)) for _ in range(1000)] 21 | arena = LambdaArena(func, base_competitor=EloCompetitor, base_competitor_kwargs={"initial_rating": 1200}) 22 | arena.tournament(matchups) 23 | 24 | print("Arena results:") 25 | print(json.dumps(arena.leaderboard(), indent=4)) 26 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: "Master Docs Publication" 2 | on: 3 | push: 4 | branches: [ master ] 5 | 6 | jobs: 7 | docs: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Clone 11 | uses: actions/checkout@v1 12 | 13 | - name: Set up Python 14 | 
uses: actions/setup-python@v4 15 | with: 16 | python-version: '3.10' 17 | 18 | - name: Install uv 19 | run: | 20 | curl -LsSf https://astral.sh/uv/install.sh | sh 21 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 22 | 23 | - name: Install Dependencies 24 | run: | 25 | uv pip install --system -e ".[dev]" 26 | 27 | - name: Build Docs 28 | uses: ammaraskar/sphinx-action@master 29 | with: 30 | docs-folder: "./docs/" 31 | 32 | - name: Deploy Docs 33 | uses: peaceiris/actions-gh-pages@v3 34 | with: 35 | github_token: ${{ secrets.GITHUB_TOKEN }} 36 | publish_dir: "./docs/build/html/" -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright © 2024 Will McGinnis 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | 6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | 8 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /.github/workflows/test-suite.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: "Test Suite and Linting" 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | test: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v3 19 | 20 | - name: Set up Python 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: '3.10' 24 | 25 | - name: Install uv 26 | run: | 27 | curl -LsSf https://astral.sh/uv/install.sh | sh 28 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 29 | 30 | - name: Install dependencies 31 | run: | 32 | uv pip install --system -e ".[dev,datasets]" 33 | 34 | - name: Run tests 35 | run: | 36 | make test-all 37 | 38 | - name: Run linting 39 | run: | 40 | make lint 41 | -------------------------------------------------------------------------------- /tests/test_DWZCompetitor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from elote import DWZCompetitor 3 | 4 | 5 | class TestDWZ(unittest.TestCase): 6 | def test_Improvement(self): 7 | initial_rating = 100 8 | player1 = DWZCompetitor(initial_rating=initial_rating) 9 | 10 | # if player1 beats someone with a high rating, their rating should go up. 
11 | for _ in range(10): 12 | player2 = DWZCompetitor(initial_rating=800) 13 | player1.beat(player2) 14 | self.assertGreater(player1.rating, initial_rating) 15 | initial_rating = player1.rating 16 | 17 | def test_Decay(self): 18 | initial_rating = 800 19 | player1 = DWZCompetitor(initial_rating=initial_rating) 20 | 21 | # if player1 loses to someone with a lower rating, their rating should go down. 22 | for _ in range(10): 23 | player2 = DWZCompetitor(initial_rating=100) 24 | player2.beat(player1) 25 | self.assertLess(player1.rating, initial_rating) 26 | initial_rating = player1.rating 27 | 28 | def test_Expectation(self): 29 | player1 = DWZCompetitor(initial_rating=1000) 30 | player2 = DWZCompetitor(initial_rating=100) 31 | self.assertGreater(player1.expected_score(player2), player2.expected_score(player1)) 32 | -------------------------------------------------------------------------------- /tests/test_ECFCompetitor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from elote import ECFCompetitor 3 | 4 | 5 | class TestECF(unittest.TestCase): 6 | def test_Improvement(self): 7 | initial_rating = 100 8 | player1 = ECFCompetitor(initial_rating=initial_rating) 9 | 10 | # if player1 beats someone with a high rating, their rating should go up. 11 | for _ in range(10): 12 | player2 = ECFCompetitor(initial_rating=800) 13 | player1.beat(player2) 14 | self.assertGreater(player1.rating, initial_rating) 15 | initial_rating = player1.rating 16 | 17 | def test_Decay(self): 18 | initial_rating = 800 19 | player1 = ECFCompetitor(initial_rating=initial_rating) 20 | 21 | # if player1 loses to someone with a lower rating, their rating should go down. 22 | for _ in range(10): 23 | player2 = ECFCompetitor(initial_rating=100) 24 | player2.beat(player1) 25 | self.assertLess(player1.rating, initial_rating) 26 | initial_rating = player1.rating 27 | 28 | def test_Expectation(self): 29 | player1 = ECFCompetitor(initial_rating=1000) 30 | player2 = ECFCompetitor(initial_rating=100) 31 | self.assertGreater(player1.expected_score(player2), player2.expected_score(player1)) 32 | -------------------------------------------------------------------------------- /examples/persist_state_arena.py: -------------------------------------------------------------------------------- 1 | from elote import LambdaArena, GlickoCompetitor 2 | import json 3 | import random 4 | 5 | 6 | # sample bout function which just compares the two inputs 7 | def func(a, b): 8 | if a == b: 9 | return None 10 | else: 11 | return a > b 12 | 13 | 14 | # start scoring, stop and save state 15 | matchups = [(random.randint(1, 10), random.randint(1, 10)) for _ in range(10)] 16 | arena = LambdaArena(func, base_competitor=GlickoCompetitor) 17 | arena.tournament(matchups) 18 | print("Arena results:") 19 | print(json.dumps(arena.leaderboard(), indent=4)) 20 | 21 | # Export state and create a deep copy to avoid modifying the original 22 | # Use a simple dict comprehension instead of deepcopy to avoid issues with non-serializable types 23 | saved_state = {k: v for k, v in arena.export_state().items()} 24 | 25 | # Create a new arena with the saved state 26 | matchups = [(random.randint(1, 10), random.randint(1, 10)) for _ in range(100)] 27 | new_arena = LambdaArena(func, base_competitor=GlickoCompetitor) 28 | 29 | # Use from_state to recreate competitors 30 | for k, v in saved_state.items(): 31 | new_arena.competitors[k] = GlickoCompetitor.from_state(v) 32 | 33 | # Run more matches 34 | new_arena.tournament(matchups) 35
| print("Arena results:") 36 | print(json.dumps(new_arena.leaderboard(), indent=4)) 37 | -------------------------------------------------------------------------------- /docs/source/api/competitors.rst: -------------------------------------------------------------------------------- 1 | Competitors API Reference 2 | ========================= 3 | 4 | This page provides detailed API documentation for all competitor classes in Elote. 5 | 6 | Base Competitor 7 | --------------- 8 | 9 | .. automodule:: elote.competitors.base 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | :special-members: __init__ 14 | 15 | Elo Competitor 16 | -------------- 17 | 18 | .. automodule:: elote.competitors.elo 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | :special-members: __init__ 23 | 24 | Glicko Competitor 25 | ----------------- 26 | 27 | .. automodule:: elote.competitors.glicko 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | :special-members: __init__ 32 | 33 | DWZ Competitor 34 | -------------- 35 | 36 | .. automodule:: elote.competitors.dwz 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | :special-members: __init__ 41 | 42 | ECF Competitor 43 | -------------- 44 | 45 | .. automodule:: elote.competitors.ecf 46 | :members: 47 | :undoc-members: 48 | :show-inheritance: 49 | :special-members: __init__ 50 | 51 | Blended Competitor 52 | ------------------ 53 | 54 | .. automodule:: elote.competitors.ensemble 55 | :members: 56 | :undoc-members: 57 | :show-inheritance: 58 | :special-members: __init__ -------------------------------------------------------------------------------- /tests/test_EloCompetitor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from elote import EloCompetitor, GlickoCompetitor 3 | from elote.competitors.base import MissMatchedCompetitorTypesException 4 | 5 | 6 | class TestElo(unittest.TestCase): 7 | def test_Improvement(self): 8 | initial_rating = 100 9 | player1 = EloCompetitor(initial_rating=initial_rating) 10 | 11 | # if player1 beats someone with a high rating, their rating should go up. 12 | for _ in range(10): 13 | player2 = EloCompetitor(initial_rating=800) 14 | player1.beat(player2) 15 | self.assertGreater(player1.rating, initial_rating) 16 | initial_rating = player1.rating 17 | 18 | def test_Decay(self): 19 | initial_rating = 800 20 | player1 = EloCompetitor(initial_rating=initial_rating) 21 | 22 | # if player1 loses to someone with a lower rating, their rating should go down.
23 | for _ in range(10): 24 | player2 = EloCompetitor(initial_rating=100) 25 | player2.beat(player1) 26 | self.assertLess(player1.rating, initial_rating) 27 | initial_rating = player1.rating 28 | 29 | def test_Expectation(self): 30 | player1 = EloCompetitor(initial_rating=1000) 31 | player2 = EloCompetitor(initial_rating=100) 32 | self.assertGreater(player1.expected_score(player2), player2.expected_score(player1)) 33 | 34 | def test_Exceptions(self): 35 | player1 = EloCompetitor(initial_rating=1000) 36 | player2 = GlickoCompetitor(initial_rating=100) 37 | 38 | with self.assertRaises(MissMatchedCompetitorTypesException): 39 | player1.verify_competitor_types(player2) 40 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | v1.2.0 2 | ====== 3 | 4 | * Began adding type hints 5 | * Fixed DWZ to correctly calculate development coefficient based on the competitors age at time of match. 6 | 7 | v1.1.0 8 | ====== 9 | 10 | * Glicko and Glicko-2 now properly handle time since last match 11 | * Bugfix in evaluation of draws in benchmarking 12 | 13 | v1.0.0 14 | ====== 15 | 16 | * [] Added end to end examples using the chess and cfb datasets 17 | * [] Added Glicko-2 Competitor 18 | * [] Added TrueSkill Competitor 19 | * [] Added datasets module to read sample data for development 20 | * [] Added a visualization module to plot rating systems performance 21 | * [] Added a benchmark module to compare rating systems 22 | * [] Added scipy optimization to find optimal thresholds for rating systems 23 | * [CORE-3] Standardized the `Competitor` serialization formats 24 | * [CORE-1] Fixed minimum rating enforcement across all competitor classes 25 | * [CORE-1] Updated documentation examples to use higher initial ratings 26 | * [CORE-1] Made `reset` method abstract in `BaseCompetitor` class 27 | * [CORE-1] Updated ECFCompetitor default initial rating from 40 to 100 28 | * [CORE-1] Fixed benchmark tests to prevent negative ratings 29 | 30 | v0.1.0 31 | ====== 32 | 33 | * Many bugfixes 34 | * Improved testing and documentation 35 | * Added notion of history object and bout objects for arenas to track progress 36 | 37 | v0.0.3,4 and 5 38 | ============== 39 | 40 | * No change, debugging CI 41 | 42 | v0.0.2 43 | ====== 44 | 45 | * bugfixes in glicko expected score 46 | * bugfixes in elo score that wouldn't allow ratings to drop properly 47 | * added some testing and CI 48 | 49 | v0.0.1 50 | ====== 51 | 52 | * initial release 53 | * lambda arena added 54 | * elo competitor added 55 | * glicko competitor added -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | name: "PyPI Packaging" 2 | on: 3 | release: 4 | types: created 5 | 6 | jobs: 7 | build_and_publish: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | # IMPORTANT: this permission is mandatory for Trusted Publishing 11 | id-token: write 12 | steps: 13 | - name: Clone 14 | uses: actions/checkout@v3 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: '3.11' 20 | 21 | - name: Install uv 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install uv 25 | 26 | - name: Setup virtual environment 27 | run: | 28 | uv venv 29 | echo "VIRTUAL_ENV=$(pwd)/.venv" >> $GITHUB_ENV 30 | echo "$(pwd)/.venv/bin" >> $GITHUB_PATH 31 | 32 | - name: 
Install dependencies 33 | run: | 34 | uv pip install -e ".[dev]" 35 | uv pip install build 36 | 37 | - name: Build source distribution 38 | run: | 39 | # Clean the build directories first 40 | rm -rf build/ 41 | rm -rf dist/ 42 | rm -rf .eggs/ 43 | 44 | # Find and remove egg-info directories, excluding the virtual environment 45 | find . -path ./.venv -prune -o -name '*.egg-info' -type d -exec rm -rf {} \; 2>/dev/null || true 46 | find . -path ./.venv -prune -o -name '*.egg' -type f -exec rm -f {} \; 2>/dev/null || true 47 | find . -path ./.venv -prune -o -name '__pycache__' -type d -exec rm -rf {} \; 2>/dev/null || true 48 | 49 | # Build only the source distribution 50 | python -m build --sdist 51 | 52 | - name: Publish package to PyPI 53 | uses: pypa/gh-action-pypi-publish@release/v1 54 | with: 55 | packages-dir: dist/ -------------------------------------------------------------------------------- /.cursor/rules/elote_testing.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: how to use the makefile to run tests 3 | globs: * 4 | --- 5 | # Elote Testing Standards 6 | 7 | ## Running Tests 8 | 9 | ### Using Make Commands 10 | - Always use `make test` to run the test suite 11 | - Use `make test-cov` to run tests with coverage reports 12 | - Use `make test-all` to run tests on all supported Python versions using tox 13 | 14 | ### Test Commands 15 | ```bash 16 | # Run the standard test suite 17 | make test 18 | 19 | # Run tests with coverage 20 | make test-cov 21 | 22 | # Run tests on all supported Python versions 23 | make test-all 24 | 25 | # Run a specific test file 26 | make test PYTEST_ARGS="tests/test_unified_interface.py" 27 | 28 | # Run a specific test class 29 | make test PYTEST_ARGS="tests/test_unified_interface.py::TestUnifiedInterface" 30 | 31 | # Run a specific test method 32 | make test PYTEST_ARGS="tests/test_unified_interface.py::TestUnifiedInterface::test_base_methods_elo" 33 | 34 | # Run a specitif example: 35 | make run-example EXAMPLE=use_cases/chess_w_lib.py 36 | ``` 37 | 38 | ### Test Environment 39 | - Tests are run using `uv run pytest` through the Makefile 40 | - The test environment is automatically set up with the correct dependencies 41 | - Always run tests in a clean environment to avoid dependency conflicts 42 | 43 | ### Continuous Integration 44 | - All tests must pass in CI before merging 45 | - Coverage should not decrease with new code 46 | - New features should include corresponding tests 47 | 48 | ### Benchmarks 49 | - Use `make benchmark` to run performance benchmarks 50 | - Benchmark results are stored in the `benchmark_results` directory 51 | - Performance regressions should be addressed before merging 52 | 53 | ### Linting and Formatting 54 | - Use `make lint` to check code quality 55 | - Use `make lint-fix` to automatically fix linting issues 56 | - Use `make format` to format code according to project standards -------------------------------------------------------------------------------- /.cursor/rules/pytest_standards.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: 3 | globs: tests/* 4 | --- 5 | # Pytest Standards 6 | 7 | ## Test Organization 8 | - All test files should be named with the prefix `test_` (e.g., `test_module.py`) 9 | - Test classes should be named with the prefix `Test` (e.g., `TestClassName`) 10 | - Test methods should be named with the prefix `test_` (e.g., `test_function_name`) 11 | - Tests should be organized 
in the same directory structure as the source code they test 12 | 13 | ## Test Structure 14 | - Each test should focus on testing a single functionality 15 | - Use descriptive test names that explain what is being tested 16 | - Group related tests in test classes 17 | - Use appropriate fixtures for test setup and teardown 18 | - Avoid test interdependence - tests should be able to run in any order 19 | 20 | ## Assertions 21 | - Use pytest's built-in assertions instead of Python's `assert` statement when possible 22 | - Use appropriate assertion methods for the type being tested (e.g., `assert_almost_equal` for floats) 23 | - Include descriptive error messages in assertions to aid debugging 24 | 25 | ## Fixtures 26 | - Use fixtures for common setup and teardown operations 27 | - Define fixtures at the appropriate scope (function, class, module, or session) 28 | - Use parameterized fixtures for testing multiple inputs 29 | - Use conftest.py for sharing fixtures across multiple test files 30 | 31 | ## Coverage 32 | - Aim for at least 80% code coverage 33 | - Write tests for both success and failure cases 34 | - Test edge cases and boundary conditions 35 | - Use `pytest-cov` to generate coverage reports 36 | 37 | ## Best Practices 38 | - Keep tests fast and independent 39 | - Avoid mocking unless necessary 40 | - Use markers to categorize tests (e.g., `@pytest.mark.slow`) 41 | - Use parametrize for testing multiple inputs 42 | - Write tests before or alongside code (TDD approach) 43 | - Run the full test suite before committing changes 44 | 45 | ## Commands 46 | - Run tests with `pytest` 47 | - Generate coverage reports with `pytest --cov=elote` 48 | - Run specific tests with `pytest path/to/test_file.py::TestClass::test_method` -------------------------------------------------------------------------------- /docs/source/competitors.rst: -------------------------------------------------------------------------------- 1 | Competitors 2 | =========== 3 | 4 | Elo Competitor 5 | -------------- 6 | 7 | .. autoclass:: elote.competitors.elo.EloCompetitor 8 | :members: export_state,expected_score,beat,tied,rating,to_json,from_json 9 | 10 | Glicko Competitor 11 | ----------------- 12 | 13 | .. autoclass:: elote.competitors.glicko.GlickoCompetitor 14 | :members: export_state,expected_score,beat,tied,rating,to_json,from_json 15 | 16 | DWZ Competitor 17 | -------------- 18 | 19 | .. autoclass:: elote.competitors.dwz.DWZCompetitor 20 | :members: export_state,expected_score,beat,tied,rating,to_json,from_json 21 | 22 | ECF Competitor 23 | -------------- 24 | 25 | .. autoclass:: elote.competitors.ecf.ECFCompetitor 26 | :members: export_state,expected_score,beat,tied,rating,to_json,from_json 27 | 28 | BlendedCompetitor 29 | ----------------- 30 | 31 | .. autoclass:: elote.competitors.ensemble.BlendedCompetitor 32 | :members: export_state,expected_score,beat,tied,rating,to_json,from_json 33 | 34 | Serialization 35 | ------------ 36 | 37 | All competitor types in Elote support a standardized serialization format that allows for saving and loading competitor states. 
38 | The serialization format includes the following fields: 39 | 40 | - **type**: The class name of the competitor 41 | - **version**: The version of the serialization format 42 | - **created_at**: Timestamp when the state was exported 43 | - **id**: A unique identifier for this state export 44 | - **parameters**: The parameters used to initialize the competitor 45 | - **state**: The current state variables of the competitor 46 | - **class_vars**: Class variables for backward compatibility 47 | 48 | To serialize a competitor to JSON: 49 | 50 | .. code-block:: python 51 | 52 | # Create a competitor 53 | competitor = EloCompetitor(initial_rating=1500) 54 | 55 | # Serialize to JSON 56 | json_str = competitor.to_json() 57 | 58 | To deserialize a competitor from JSON: 59 | 60 | .. code-block:: python 61 | 62 | # Deserialize from JSON 63 | competitor = EloCompetitor.from_json(json_str) 64 | 65 | For backward compatibility, the serialized format also includes flattened parameters and state variables at the top level of the dictionary. 66 | -------------------------------------------------------------------------------- /examples/trueskill_tournament.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Example demonstrating the use of TrueSkill in a tournament setting. 4 | 5 | This example shows how to use the TrueSkill rating system with the LambdaArena 6 | to run a tournament and rank competitors. 7 | """ 8 | 9 | import random 10 | import json 11 | from elote import TrueSkillCompetitor, LambdaArena 12 | 13 | 14 | def main(): 15 | """Run the TrueSkill tournament example.""" 16 | 17 | # Create a comparison function that compares two numbers 18 | # Returns True if a beats b (i.e., a > b) 19 | def comparison_func(a, b): 20 | return a > b 21 | 22 | # Create a LambdaArena with TrueSkill competitors 23 | arena = LambdaArena( 24 | comparison_func, 25 | base_competitor=TrueSkillCompetitor, 26 | base_competitor_kwargs={"initial_mu": 25.0, "initial_sigma": 8.333}, 27 | ) 28 | 29 | # Generate 1000 random matchups between numbers 1-10 30 | matchups = [(random.randint(1, 10), random.randint(1, 10)) for _ in range(1000)] 31 | 32 | # Run the tournament 33 | print("Running tournament with 1000 matchups...") 34 | arena.tournament(matchups) 35 | 36 | # Display the leaderboard 37 | print("\nFinal rankings:") 38 | leaderboard = arena.leaderboard() 39 | print(json.dumps(leaderboard, indent=4)) 40 | 41 | # Display detailed competitor information 42 | print("\nDetailed competitor information:") 43 | for entry in leaderboard: 44 | competitor_id = entry["competitor"] 45 | rating = entry["rating"] 46 | competitor = arena.competitors[competitor_id] 47 | print(f"Competitor {competitor_id}: rating={rating:.2f}, mu={competitor.mu:.2f}, sigma={competitor.sigma:.2f}") 48 | 49 | # Calculate match quality between top competitors 50 | if len(leaderboard) >= 2: 51 | top1_id = leaderboard[0]["competitor"] 52 | top2_id = leaderboard[1]["competitor"] 53 | top1 = arena.competitors[top1_id] 54 | top2 = arena.competitors[top2_id] 55 | match_quality = TrueSkillCompetitor.match_quality(top1, top2) 56 | print(f"\nMatch quality between top two competitors ({top1_id} vs {top2_id}): {match_quality:.4f}") 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /examples/glicko_arena.py: -------------------------------------------------------------------------------- 1 | from elote import 
LambdaArena, GlickoCompetitor 2 | import json 3 | import random 4 | from datetime import datetime, timedelta 5 | 6 | 7 | # sample bout function which just compares the two inputs 8 | def func(a, b): 9 | if a == b: 10 | return None 11 | else: 12 | return a > b 13 | 14 | 15 | # Create initial time and a list of matchups with timestamps spread over a month 16 | initial_time = datetime(2024, 1, 1) 17 | matchups_with_time = [] 18 | for _i in range(1000): 19 | # Random matchup 20 | a = random.randint(1, 10) 21 | b = random.randint(1, 10) 22 | # Random time within the month (0-30 days from initial time) 23 | match_time = initial_time + timedelta( 24 | days=random.randint(0, 30), hours=random.randint(0, 23), minutes=random.randint(0, 59) 25 | ) 26 | matchups_with_time.append((a, b, match_time)) 27 | 28 | # Sort matchups by time to ensure chronological order 29 | matchups_with_time.sort(key=lambda x: x[2]) 30 | 31 | # Create arena with GlickoCompetitor and set initial time 32 | arena = LambdaArena( 33 | func, 34 | base_competitor=GlickoCompetitor, 35 | base_competitor_kwargs={"initial_rating": 1500, "initial_rd": 350, "initial_time": initial_time}, 36 | ) 37 | 38 | # Process matches in chronological order 39 | for a, b, match_time in matchups_with_time: 40 | # Use matchup() instead of tournament() to handle match times 41 | if func(a, b): # If a wins 42 | arena.matchup(a, b, match_time=match_time) 43 | else: # If b wins 44 | arena.matchup(b, a, match_time=match_time) 45 | 46 | print("\nArena results after one month of matches:") 47 | print("(Notice how less active players have higher RD values)") 48 | leaderboard = arena.leaderboard() 49 | 50 | # Convert leaderboard list to a dictionary and add RD values and last activity times 51 | leaderboard_dict = {} 52 | for entry in leaderboard: 53 | player_id = entry["competitor"] 54 | leaderboard_dict[player_id] = entry 55 | competitor = arena.competitors.get(player_id) 56 | if competitor: 57 | leaderboard_dict[player_id]["rd"] = round(competitor.rd, 2) 58 | leaderboard_dict[player_id]["last_activity"] = competitor._last_activity.strftime("%Y-%m-%d %H:%M") 59 | 60 | print(json.dumps(leaderboard_dict, indent=4)) 61 | -------------------------------------------------------------------------------- /elote/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Datasets module for elote. 3 | 4 | This module provides a common interface for getting datasets and splitting them into train and test sets 5 | for evaluating different rating algorithms. 
6 | """ 7 | 8 | # Core datasets - always available 9 | from elote.datasets.base import BaseDataset, DataSplit 10 | from elote.datasets.synthetic import SyntheticDataset 11 | from typing import Any 12 | 13 | # Base __all__ list with always-available exports 14 | __all__ = [ 15 | "BaseDataset", 16 | "DataSplit", 17 | "SyntheticDataset", 18 | ] 19 | 20 | # Optional datasets - only import if dependencies are available 21 | _optional_imports = {} 22 | 23 | # Try to import ChessDataset 24 | try: 25 | from elote.datasets.chess import ChessDataset 26 | _optional_imports["ChessDataset"] = ChessDataset 27 | __all__.append("ChessDataset") 28 | except ImportError as e: 29 | _optional_imports["ChessDataset"] = e 30 | 31 | # Try to import CollegeFootballDataset 32 | try: 33 | from elote.datasets.football import CollegeFootballDataset 34 | _optional_imports["CollegeFootballDataset"] = CollegeFootballDataset 35 | __all__.append("CollegeFootballDataset") 36 | except ImportError as e: 37 | _optional_imports["CollegeFootballDataset"] = e 38 | 39 | 40 | def __getattr__(name: str) -> Any: 41 | """Handle access to optional imports with helpful error messages.""" 42 | if name in _optional_imports: 43 | obj = _optional_imports[name] 44 | if isinstance(obj, ImportError): 45 | if name == "ChessDataset": 46 | raise ImportError( 47 | f"ChessDataset requires optional dependencies that are not installed.\n" 48 | f"Install them with: pip install 'elote[datasets]' or pip install python-chess pyzstd\n" 49 | f"Original error: {obj}" 50 | ) 51 | elif name == "CollegeFootballDataset": 52 | raise ImportError( 53 | f"CollegeFootballDataset requires optional dependencies that are not installed.\n" 54 | f"Install them with: pip install 'elote[datasets]' or pip install 'sportsdataverse[all]'\n" 55 | f"Original error: {obj}" 56 | ) 57 | return obj 58 | raise AttributeError(f"module '{__name__}' has no attribute '{name}'") 59 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. elote documentation master file, created by 2 | sphinx-quickstart on Sat Mar 21 13:38:36 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Elote: Elegant Rating Systems in Python 7 | ======================================= 8 | 9 | **Elote** is a powerful Python library for implementing and comparing rating systems. Whether you're ranking chess players, sports teams, or prioritizing features in your product backlog, Elote provides a simple, elegant API for all your competitive ranking needs. 10 | 11 | Rating systems allow you to rank competitors based on their performance in head-to-head matchups. The most famous example is the Elo rating system used in chess, but these systems have applications far beyond sports: 12 | 13 | - Ranking products based on A/B comparisons 14 | - Prioritizing features through pairwise voting 15 | - Creating recommendation systems 16 | - Matchmaking in games and competitions 17 | - Collaborative filtering and ranking 18 | 19 | Elote makes implementing these systems simple and intuitive, with a clean API that handles all the mathematical complexity for you. 20 | 21 | .. toctree:: 22 | :maxdepth: 1 23 | :caption: Getting Started 24 | 25 | getting_started 26 | installation 27 | quickstart 28 | 29 | .. 
toctree:: 30 | :maxdepth: 1 31 | :caption: Core Concepts 32 | 33 | competitors 34 | arenas 35 | serialization 36 | 37 | .. toctree:: 38 | :maxdepth: 1 39 | :caption: Rating Systems 40 | 41 | rating_systems/elo 42 | rating_systems/glicko 43 | rating_systems/ecf 44 | rating_systems/dwz 45 | rating_systems/ensemble 46 | rating_systems/comparison 47 | 48 | .. toctree:: 49 | :maxdepth: 1 50 | :caption: Examples 51 | 52 | examples 53 | advance_example 54 | use_cases/product_ranking 55 | use_cases/matchmaking 56 | use_cases/feature_prioritization 57 | 58 | .. toctree:: 59 | :maxdepth: 1 60 | :caption: API Reference 61 | 62 | api/competitors 63 | api/arenas 64 | 65 | .. toctree:: 66 | :maxdepth: 1 67 | :caption: Resources 68 | 69 | blog_posts 70 | 71 | .. toctree:: 72 | :maxdepth: 1 73 | :caption: Development 74 | 75 | contributing 76 | 77 | Indices and tables 78 | ================== 79 | 80 | * :ref:`genindex` 81 | * :ref:`modindex` 82 | * :ref:`search` 83 | -------------------------------------------------------------------------------- /.cursor/rules/python_standards.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: 3 | globs: *.py 4 | --- 5 | # Python Coding Standards 6 | 7 | Do not ever include test-specific code into the library implementation. We should never be checking to see if we are running in a test context to modify the output, the library code should work the exact same in both test and non-test use cases. 8 | 9 | ## Code Style 10 | - Follow PEP 8 style guide for Python code 11 | - Use 4 spaces for indentation (no tabs) 12 | - Maximum line length of 88 characters (Black default) 13 | - Use snake_case for variables, functions, and methods 14 | - Use CamelCase for classes 15 | - Use UPPER_CASE for constants 16 | - Add a blank line at the end of each file 17 | 18 | ## Imports 19 | - Group imports in the following order: 20 | 1. Standard library imports 21 | 2. Related third-party imports 22 | 3. 
Local application/library specific imports 23 | - Use absolute imports when possible 24 | - Avoid wildcard imports (`from module import *`) 25 | - Use import aliases for long module names 26 | 27 | ## Documentation 28 | - Document all public modules, classes, methods, and functions 29 | - Use docstrings that follow the Google style guide 30 | - Include type hints for function parameters and return values 31 | - Document parameters, return values, and exceptions raised 32 | 33 | ## Error Handling 34 | - Use specific exception types instead of generic ones 35 | - Handle exceptions at the appropriate level 36 | - Use context managers (`with` statements) for resource management 37 | - Avoid catching exceptions without proper handling 38 | 39 | ## Code Organization 40 | - Keep functions and methods short and focused 41 | - Follow the Single Responsibility Principle 42 | - Use classes to encapsulate related functionality 43 | - Separate concerns into different modules 44 | 45 | ## Testing 46 | - Write unit tests for all code 47 | - Use meaningful test names 48 | - Test both normal and edge cases 49 | - Mock external dependencies in tests 50 | 51 | ## Performance 52 | - Prefer list/dict/set comprehensions over loops when appropriate 53 | - Use generators for large data sets 54 | - Profile code before optimizing 55 | - Consider using NumPy/Pandas for numerical operations 56 | 57 | ## Tools 58 | - Use Black for code formatting 59 | - Use Ruff for linting and static analysis 60 | - Use mypy for type checking 61 | - Use isort for import sorting 62 | 63 | ## Version Control 64 | - Write meaningful commit messages 65 | - Keep commits focused on a single change 66 | - Use feature branches for development 67 | - Review code before merging -------------------------------------------------------------------------------- /docs/source/getting_started.rst: -------------------------------------------------------------------------------- 1 | Getting Started 2 | =============== 3 | 4 | To install latest release: 5 | 6 | .. code-block:: 7 | 8 | pip install elote 9 | 10 | To install bleeding edge, clone the repository and run: 11 | 12 | .. code-block:: 13 | 14 | pip install -e . 15 | 16 | 17 | Basic Usage 18 | ----------- 19 | 20 | The most basic object in ``elote`` is a competitor. To start with, let's take a look at ``EloCompetitor``. Let's make 3 21 | objects, one for each of 3 players in a game: 22 | 23 | .. code-block:: python 24 | 25 | from elote import EloCompetitor 26 | 27 | good_player = EloCompetitor(initial_rating=1200) 28 | better_player = EloCompetitor(initial_rating=1200) 29 | best_player = EloCompetitor(initial_rating=1200) 30 | 31 | print('Starting ratings:') 32 | print('%7.2f, %7.2f, %7.2f' % (good_player.rating, better_player.rating, best_player.rating, )) 33 | 34 | All we do is initialize them, and print out their starting ratings. Rating is our measure of how good we think a 35 | competitor is with the information at hand. Here we don't really have any information, so they are all rated the same: 36 | 37 | .. code-block:: 38 | 39 | Starting ratings: 40 | 1200.00, 1200.00, 1200.00 41 | 42 | To make things a little more interesting, let's do 20 ``matches``. A ``match`` is an instance where two players compete, 43 | and one of them wins. This gives us some new information to update our ratings with. For each of the matches we simulate 44 | we will have ``better_player`` beat ``good_player`` or ``best_player`` beat ``better_player``. 
At each iteration, we will 45 | print out the ratings to get an idea of how they change over time. 46 | 47 | 48 | .. code-block:: python 49 | 50 | print('\nAfter matches') 51 | for _ in range(10): 52 | better_player.beat(good_player) 53 | best_player.beat(better_player) 54 | print('%7.2f, %7.2f, %7.2f' % (good_player.rating, better_player.rating, best_player.rating, )) 55 | 56 | .. code-block:: 57 | 58 | After matches 59 | good, better, best 60 | 1184.00, 1199.26, 1216.74 61 | 1168.70, 1198.66, 1232.64 62 | 1154.08, 1198.18, 1247.75 63 | 1140.10, 1197.79, 1262.11 64 | 1126.73, 1197.49, 1275.78 65 | 1113.95, 1197.25, 1288.80 66 | 1101.71, 1197.08, 1301.21 67 | 1089.99, 1196.95, 1313.05 68 | 1078.77, 1196.87, 1324.36 69 | 1068.01, 1196.81, 1335.18 70 | 71 | So as you can see, over time, the ratings gradually update to reflect our hierarchy. 72 | 73 | For more information on the types of competitors available, or the different configuration options, please see the detailed API 74 | docs on the competitors page. -------------------------------------------------------------------------------- /.cursor/rules/sphinx_docs_standards.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: 3 | globs: docs/* 4 | --- 5 | # Sphinx Documentation Standards 6 | 7 | ## Project Setup 8 | - Use `sphinx-quickstart` to initialize the documentation structure 9 | - Configure `conf.py` with appropriate project information 10 | - Use the ReadTheDocs theme for consistent styling 11 | - Enable necessary extensions (e.g., `autodoc`, `napoleon`, `viewcode`) 12 | 13 | ## Directory Structure 14 | - Keep documentation source files in the `docs/` directory 15 | - Organize documentation into logical sections (e.g., user guide, API reference) 16 | - Use a clear and consistent file naming convention 17 | - Include an `index.rst` file as the documentation entry point 18 | 19 | ## Documentation Style 20 | - Write in clear, concise language 21 | - Use present tense and active voice 22 | - Be consistent with terminology 23 | - Include examples where appropriate 24 | - Target the appropriate audience (users, developers, etc.)
25 | 26 | ## reStructuredText Formatting 27 | - Use proper heading hierarchy (=, -, ~, ^, ") 28 | - Use bullet lists for unordered items and numbered lists for sequences 29 | - Use code blocks with appropriate language for syntax highlighting 30 | - Use cross-references to link between documentation sections 31 | - Include images and diagrams where they add value 32 | 33 | ## API Documentation 34 | - Use autodoc to generate API documentation from docstrings 35 | - Document all public modules, classes, methods, and functions 36 | - Follow Google or NumPy docstring style consistently 37 | - Include type information for parameters and return values 38 | - Document exceptions that may be raised 39 | 40 | ## Examples and Tutorials 41 | - Include practical examples for common use cases 42 | - Provide step-by-step tutorials for complex operations 43 | - Ensure all examples are tested and working 44 | - Use `literalinclude` to include code examples from actual source files 45 | 46 | ## Building and Testing 47 | - Build documentation locally before committing changes 48 | - Check for and fix all warnings during the build process 49 | - Verify that cross-references work correctly 50 | - Test documentation on different screen sizes 51 | 52 | ## Deployment 53 | - Configure automatic documentation builds on ReadTheDocs or GitHub Pages 54 | - Include a link to the documentation in the project README 55 | - Version documentation to match software releases 56 | - Provide a changelog or release notes section 57 | 58 | ## Maintenance 59 | - Keep documentation up-to-date with code changes 60 | - Review and update documentation during each release cycle 61 | - Address user feedback and questions in the documentation 62 | - Remove outdated or deprecated information -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(".")))) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = "elote" 22 | copyright = "2020, Will McGinnis" 23 | author = "Will McGinnis" 24 | 25 | # The full version, including alpha/beta/rc tags 26 | try: 27 | # Try to get version from importlib.metadata (Python 3.8+) 28 | from importlib.metadata import version as get_version 29 | 30 | release = get_version("elote") 31 | except ImportError: 32 | # Fallback for older Python versions 33 | try: 34 | import pkg_resources 35 | 36 | release = pkg_resources.get_distribution("elote").version 37 | except Exception: # Replace bare except with specific exception type 38 | # Hardcoded fallback 39 | release = "0.1.0" 40 | 41 | 42 | # -- General configuration --------------------------------------------------- 43 | 44 | # Add any Sphinx extension modules here, as strings. 
They can be 45 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 46 | # ones. 47 | extensions = [ 48 | "sphinx.ext.autodoc", 49 | "sphinx.ext.viewcode", 50 | "sphinx_rtd_dark_mode", 51 | "sphinxcontrib.googleanalytics", 52 | ] 53 | 54 | # Add any paths that contain templates here, relative to this directory. 55 | templates_path = ["_templates"] 56 | 57 | # List of patterns, relative to source directory, that match files and 58 | # directories to ignore when looking for source files. 59 | # This pattern also affects html_static_path and html_extra_path. 60 | exclude_patterns = [] 61 | html_extra_path = ["CNAME"] 62 | 63 | # -- Options for HTML output ------------------------------------------------- 64 | 65 | # The theme to use for HTML and HTML Help pages. See the documentation for 66 | # a list of builtin themes. 67 | # 68 | html_theme = "sphinx_rtd_theme" 69 | 70 | # Default to dark theme 71 | default_dark_mode = True 72 | 73 | # Google Analytics configuration 74 | googleanalytics_id = "G-Z43R9PWW0B" 75 | googleanalytics_enabled = True 76 | 77 | # Add any paths that contain custom static files (such as style sheets) here, 78 | # relative to this directory. They are copied after the builtin static files, 79 | # so a file named "default.css" will overwrite the builtin "default.css". 80 | html_static_path = ["_static"] 81 | 82 | autoclass_content = "both" 83 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 
30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at will@pedalwrencher.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /elote/logging.py: -------------------------------------------------------------------------------- 1 | """Centralized logging configuration for the elote library.""" 2 | 3 | import logging 4 | import sys 5 | from typing import Union, Optional, TextIO 6 | 7 | # The main logger for the elote library 8 | # Users can configure this logger using standard logging methods 9 | # or the helper functions below. 10 | logger = logging.getLogger("elote") 11 | 12 | # Add a NullHandler by default to prevent logs from propagating 13 | # unless the user configures logging. 14 | logger.addHandler(logging.NullHandler()) 15 | 16 | # Set a reasonable default level to avoid excessive debug logging 17 | logger.setLevel(logging.WARNING) 18 | 19 | # Default log format 20 | DEFAULT_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 21 | 22 | 23 | def set_level(level: Union[int, str]) -> None: 24 | """Set the logging level for the elote logger. 25 | 26 | Args: 27 | level: The logging level (e.g., logging.DEBUG, logging.INFO, 'DEBUG', 'INFO'). 28 | """ 29 | if isinstance(level, str): 30 | level = getattr(logging, level.upper()) 31 | logger.setLevel(level) 32 | 33 | 34 | def add_handler(handler: logging.Handler) -> None: 35 | """Add a handler to the elote logger. 36 | 37 | Args: 38 | handler: A logging handler to add. 39 | """ 40 | # Remove existing handlers of the same type to avoid duplicates 41 | for existing_handler in logger.handlers[:]: 42 | if isinstance(existing_handler, type(handler)): 43 | logger.removeHandler(existing_handler) 44 | 45 | logger.addHandler(handler) 46 | 47 | 48 | def basic_config( 49 | level: Union[int, str] = logging.WARNING, 50 | stream: Optional[TextIO] = None, 51 | format: str = DEFAULT_FORMAT, 52 | force: bool = False 53 | ) -> None: 54 | """Configure basic logging for elote. 55 | 56 | Sets the level and adds a StreamHandler (defaults to stderr) 57 | with the specified format. 58 | 59 | Args: 60 | level: The minimum logging level to output. 
61 | stream: The stream to log to (e.g., sys.stdout). Defaults to sys.stderr. 62 | format: The log message format string. 63 | force: If True, remove existing handlers before adding new one. 64 | """ 65 | if force: 66 | # Remove all existing handlers 67 | for handler in logger.handlers[:]: 68 | logger.removeHandler(handler) 69 | 70 | set_level(level) 71 | handler = logging.StreamHandler(stream or sys.stderr) 72 | formatter = logging.Formatter(format) 73 | handler.setFormatter(formatter) 74 | add_handler(handler) 75 | 76 | 77 | def get_logger(name: Optional[str] = None) -> logging.Logger: 78 | """Get a logger instance. 79 | 80 | Args: 81 | name: Optional name for the logger. If None, returns the main elote logger. 82 | 83 | Returns: 84 | A logger instance. 85 | """ 86 | if name is None: 87 | return logger 88 | return logging.getLogger(f"elote.{name}") 89 | 90 | 91 | def disable_debug_logging() -> None: 92 | """Disable debug logging for performance in production environments.""" 93 | if logger.level <= logging.DEBUG: 94 | logger.setLevel(logging.INFO) 95 | 96 | 97 | def is_debug_enabled() -> bool: 98 | """Check if debug logging is enabled. 99 | 100 | Returns: 101 | True if debug logging is enabled, False otherwise. 102 | """ 103 | return logger.isEnabledFor(logging.DEBUG) 104 | -------------------------------------------------------------------------------- /tests/test_ColleyMatrixCompetitor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from elote import ColleyMatrixCompetitor, EloCompetitor 4 | from elote.competitors.base import MissMatchedCompetitorTypesException 5 | 6 | 7 | class TestColleyMatrix(unittest.TestCase): 8 | def test_improvement(self): 9 | """Test that beating stronger opponents improves rating.""" 10 | initial_rating = 0.5 11 | player1 = ColleyMatrixCompetitor(initial_rating=initial_rating) 12 | 13 | # If player1 beats someone with a higher rating, their rating should go up 14 | for _ in range(5): 15 | player2 = ColleyMatrixCompetitor(initial_rating=0.8) 16 | player1.beat(player2) 17 | self.assertGreater(player1.rating, initial_rating) 18 | initial_rating = player1.rating 19 | 20 | def test_decay(self): 21 | """Test that losing to weaker opponents decreases rating.""" 22 | initial_rating = 0.8 23 | player1 = ColleyMatrixCompetitor(initial_rating=initial_rating) 24 | 25 | # If player1 loses to someone with a lower rating, their rating should go down 26 | for _ in range(5): 27 | player2 = ColleyMatrixCompetitor(initial_rating=0.2) 28 | player2.beat(player1) 29 | self.assertLess(player1.rating, initial_rating) 30 | initial_rating = player1.rating 31 | 32 | def test_expectation(self): 33 | """Test that expected scores are calculated correctly.""" 34 | player1 = ColleyMatrixCompetitor(initial_rating=0.8) 35 | player2 = ColleyMatrixCompetitor(initial_rating=0.2) 36 | 37 | # Higher rated player should have higher expected score 38 | self.assertGreater(player1.expected_score(player2), player2.expected_score(player1)) 39 | 40 | def test_network_recalculation(self): 41 | """Test that ratings are recalculated across the network of connected competitors.""" 42 | # Create a network of 5 competitors 43 | competitors = [ColleyMatrixCompetitor(initial_rating=0.5) for _ in range(5)] 44 | 45 | # Create some matches to establish a network 46 | # 0 beats 1, 1 beats 2, 2 beats 3, 3 beats 4, 4 beats 0 (circular) 47 | competitors[0].beat(competitors[1]) 48 | competitors[1].beat(competitors[2]) 49 | 
competitors[2].beat(competitors[3]) 50 | competitors[3].beat(competitors[4]) 51 | competitors[4].beat(competitors[0]) 52 | 53 | # All ratings should be different after this circular pattern 54 | ratings = [c.rating for c in competitors] 55 | self.assertEqual( 56 | len(set(ratings)), len(ratings), "Each competitor should have a unique rating after circular matches" 57 | ) 58 | 59 | # Ratings should sum to n/2 = 2.5 (property of Colley Matrix Method) 60 | self.assertAlmostEqual(sum(ratings), len(competitors) / 2) 61 | 62 | # Additional test: if a new player beats the highest rated player, they should improve 63 | new_player = ColleyMatrixCompetitor(initial_rating=0.5) 64 | highest_player = competitors[np.argmax([c.rating for c in competitors])] 65 | initial_rating = new_player.rating 66 | new_player.beat(highest_player) 67 | self.assertGreater(new_player.rating, initial_rating) 68 | 69 | def test_exceptions(self): 70 | """Test that appropriate exceptions are raised.""" 71 | player1 = ColleyMatrixCompetitor(initial_rating=0.5) 72 | player2 = EloCompetitor(initial_rating=1000) 73 | 74 | with self.assertRaises(MissMatchedCompetitorTypesException): 75 | player1.verify_competitor_types(player2) 76 | 77 | with self.assertRaises(MissMatchedCompetitorTypesException): 78 | player1.expected_score(player2) 79 | 80 | 81 | if __name__ == "__main__": 82 | unittest.main() 83 | -------------------------------------------------------------------------------- /examples/trueskill_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Example demonstrating the use of the TrueSkill rating system. 4 | 5 | This example shows how to create TrueSkill competitors, calculate win probabilities, 6 | update ratings after matches, and work with teams. 
7 | """ 8 | 9 | from elote import TrueSkillCompetitor 10 | 11 | 12 | def main(): 13 | """Run the TrueSkill example.""" 14 | # Create players with different initial skill levels 15 | player1 = TrueSkillCompetitor(initial_mu=25.0, initial_sigma=8.333) 16 | player2 = TrueSkillCompetitor(initial_mu=30.0, initial_sigma=7.0) 17 | player3 = TrueSkillCompetitor(initial_mu=20.0, initial_sigma=6.0) 18 | player4 = TrueSkillCompetitor(initial_mu=35.0, initial_sigma=5.0) 19 | 20 | # Print initial ratings 21 | print("Initial ratings:") 22 | print(f"Player 1: mu={player1.mu:.2f}, sigma={player1.sigma:.2f}, rating={player1.rating:.2f}") 23 | print(f"Player 2: mu={player2.mu:.2f}, sigma={player2.sigma:.2f}, rating={player2.rating:.2f}") 24 | print(f"Player 3: mu={player3.mu:.2f}, sigma={player3.sigma:.2f}, rating={player3.rating:.2f}") 25 | print(f"Player 4: mu={player4.mu:.2f}, sigma={player4.sigma:.2f}, rating={player4.rating:.2f}") 26 | print() 27 | 28 | # Calculate win probabilities 29 | print("Win probabilities:") 30 | print(f"Player 1 vs Player 2: {player1.expected_score(player2):.4f}") 31 | print(f"Player 1 vs Player 3: {player1.expected_score(player3):.4f}") 32 | print(f"Player 2 vs Player 4: {player2.expected_score(player4):.4f}") 33 | print() 34 | 35 | # Calculate match quality 36 | print("Match quality:") 37 | print(f"Player 1 vs Player 2: {TrueSkillCompetitor.match_quality(player1, player2):.4f}") 38 | print(f"Player 1 vs Player 3: {TrueSkillCompetitor.match_quality(player1, player3):.4f}") 39 | print(f"Player 2 vs Player 4: {TrueSkillCompetitor.match_quality(player2, player4):.4f}") 40 | print() 41 | 42 | # Simulate some matches 43 | print("Simulating matches...") 44 | print("Match 1: Player 1 beats Player 2 (upset!)") 45 | player1.beat(player2) 46 | 47 | print("Match 2: Player 3 beats Player 1 (another upset!)") 48 | player3.beat(player1) 49 | 50 | print("Match 3: Player 2 and Player 4 tie") 51 | player2.tied(player4) 52 | print() 53 | 54 | # Print updated ratings 55 | print("Updated ratings after matches:") 56 | print(f"Player 1: mu={player1.mu:.2f}, sigma={player1.sigma:.2f}, rating={player1.rating:.2f}") 57 | print(f"Player 2: mu={player2.mu:.2f}, sigma={player2.sigma:.2f}, rating={player2.rating:.2f}") 58 | print(f"Player 3: mu={player3.mu:.2f}, sigma={player3.sigma:.2f}, rating={player3.rating:.2f}") 59 | print(f"Player 4: mu={player4.mu:.2f}, sigma={player4.sigma:.2f}, rating={player4.rating:.2f}") 60 | print() 61 | 62 | # Calculate new win probabilities 63 | print("New win probabilities:") 64 | print(f"Player 1 vs Player 2: {player1.expected_score(player2):.4f}") 65 | print(f"Player 1 vs Player 3: {player1.expected_score(player3):.4f}") 66 | print(f"Player 2 vs Player 4: {player2.expected_score(player4):.4f}") 67 | print() 68 | 69 | # Demonstrate team creation 70 | print("Team creation:") 71 | team1_mu, team1_sigma = TrueSkillCompetitor.create_team([player1, player3]) 72 | team2_mu, team2_sigma = TrueSkillCompetitor.create_team([player2, player4]) 73 | print(f"Team 1 (Players 1 & 3): mu={team1_mu:.2f}, sigma={team1_sigma:.2f}") 74 | print(f"Team 2 (Players 2 & 4): mu={team2_mu:.2f}, sigma={team2_sigma:.2f}") 75 | print() 76 | 77 | # Demonstrate serialization and deserialization 78 | print("Demonstrating serialization and deserialization...") 79 | state = player1.export_state() 80 | player1_copy = TrueSkillCompetitor.from_state(state) 81 | 82 | print(f"Original player: {player1}") 83 | print(f"Deserialized player: {player1_copy}") 84 | print(f"Are they equal? 
{player1.mu == player1_copy.mu and player1.sigma == player1_copy.sigma}") 85 | 86 | 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | This guide covers different ways to install Elote for both users and developers. 5 | 6 | Requirements 7 | ------------ 8 | 9 | Elote requires: 10 | 11 | - Python 3.10 or higher 12 | - NumPy, SciPy, pandas, and matplotlib (installed automatically as dependencies) 13 | 14 | Basic Installation 15 | ------------------ 16 | 17 | For most users, the simplest way to install Elote is via pip: 18 | 19 | .. code-block:: bash 20 | 21 | pip install elote 22 | 23 | This will install the latest stable release from PyPI along with all required dependencies. 24 | 25 | If you prefer using Conda, you can install Elote via pip within your Conda environment: 26 | 27 | .. code-block:: bash 28 | 29 | conda create -n elote-env python=3.11 30 | conda activate elote-env 31 | pip install elote 32 | 33 | Development Installation 34 | ------------------------ 35 | 36 | If you want to contribute to Elote or need the latest development version, you can install directly from the GitHub repository: 37 | 38 | .. code-block:: bash 39 | 40 | # Using Make (recommended) 41 | git clone https://github.com/wdm0006/elote.git 42 | cd elote 43 | make install-dev 44 | 45 | # Or using pip 46 | git clone https://github.com/wdm0006/elote.git 47 | cd elote 48 | pip install -e ".[dev]" 49 | 50 | # Or using uv 51 | git clone https://github.com/wdm0006/elote.git 52 | cd elote 53 | uv pip install -e ".[dev]" 54 | 55 | The development installation includes additional dependencies needed for testing, linting, and documentation. 56 | 57 | Verifying Installation 58 | ---------------------- 59 | 60 | To verify that Elote is installed correctly, you can run a simple test in Python: 61 | 62 | .. code-block:: python 63 | 64 | from elote import EloCompetitor 65 | 66 | # Create two competitors 67 | player1 = EloCompetitor(initial_rating=1500) 68 | player2 = EloCompetitor(initial_rating=1600) 69 | 70 | # Calculate expected score 71 | expected = player2.expected_score(player1) 72 | print(f"Installation successful! Expected score: {expected:.2%}") 73 | 74 | If this runs without errors, Elote is installed correctly. 75 | 76 | Installing Optional Dependencies 77 | -------------------------------- 78 | 79 | Elote has optional dependency groups that can be installed based on your needs: 80 | 81 | .. code-block:: bash 82 | 83 | # Install with dataset dependencies (chess and college football data) 84 | pip install "elote[datasets]" 85 | 86 | # Install development dependencies 87 | pip install "elote[dev]" 88 | 89 | # Install both groups at once 90 | pip install "elote[dev,datasets]" 91 | 92 | Troubleshooting 93 | --------------- 94 | 95 | Common installation issues and their solutions: 96 | 97 | NumPy Installation Errors 98 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 99 | 100 | If you encounter errors related to NumPy installation: 101 | 102 | .. code-block:: bash 103 | 104 | # Install NumPy separately first 105 | pip install numpy 106 | pip install elote 107 | 108 | Version Conflicts 109 | ^^^^^^^^^^^^^^^^^ 110 | 111 | If you have version conflicts with other packages: 112 | 113 | ..
code-block:: bash 114 | 115 | # Create a virtual environment 116 | python -m venv elote-env 117 | source elote-env/bin/activate # On Windows: elote-env\Scripts\activate 118 | pip install elote 119 | 120 | Permission Errors 121 | ^^^^^^^^^^^^^^^^^ 122 | 123 | If you encounter permission errors during installation: 124 | 125 | .. code-block:: bash 126 | 127 | # Install for the current user only 128 | pip install --user elote 129 | 130 | # Or use a virtual environment (recommended) 131 | python -m venv elote-env 132 | source elote-env/bin/activate 133 | pip install elote 134 | 135 | Getting Help 136 | ------------ 137 | 138 | If you continue to experience installation issues: 139 | 140 | 1. Check the `GitHub Issues <https://github.com/wdm0006/elote/issues>`_ to see if others have encountered the same problem 141 | 2. Open a new issue with details about your environment and the error messages 142 | 3. Reach out to the community for help -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=42", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "elote" 7 | version = "1.1.0" 8 | description = "Python module for rating bouts (like with Elo Rating)" 9 | readme = "README.md" 10 | authors = [ 11 | {name = "Will McGinnis", email = "will@helton.io"}, 12 | ] 13 | license = {text = "MIT"} 14 | classifiers = [ 15 | "Development Status :: 3 - Alpha", 16 | "Intended Audience :: Developers", 17 | "Programming Language :: Python :: 3", 18 | "Programming Language :: Python :: 3.10", 19 | "Programming Language :: Python :: 3.11", 20 | "Programming Language :: Python :: 3.12", 21 | ] 22 | keywords = ["elo", "scoring", "rating"] 23 | dependencies = [ 24 | "tqdm==4.66.3", 25 | "numpy>=1.20.0", 26 | "scipy>=1.7.0", 27 | "pandas>=1.3.0", 28 | "requests>=2.25.0", 29 | "setuptools>=42.0.0", 30 | "matplotlib>=3.5.0" 31 | ] 32 | requires-python = ">=3.10" 33 | 34 | [project.urls] 35 | Homepage = "https://github.com/wdm0006/elote" 36 | "Bug Tracker" = "https://github.com/wdm0006/elote/issues" 37 | 38 | [project.optional-dependencies] 39 | dev = [ 40 | "pytest", 41 | "pytest-cov", 42 | "sphinx>=6.1.3", 43 | "docutils>=0.19", 44 | "sphinx_rtd_theme", 45 | "ruff", 46 | "wheel", 47 | "build", 48 | "tox", 49 | "tox-uv", 50 | "pytest-benchmark", 51 | "mypy>=1.8.0", 52 | "types-tqdm", 53 | "types-requests", 54 | "pandas-stubs", 55 | "scipy-stubs", 56 | ] 57 | datasets = [ 58 | "sportsdataverse[all]", 59 | "pyzstd>=0.15.0", 60 | "python-chess>=1.9.0", 61 | "setuptools>=42.0.0", 62 | ] 63 | 64 | [tool.setuptools] 65 | packages = ["elote", "elote.competitors", "elote.arenas", "elote.datasets"] 66 | package-data = {"elote" = ["py.typed"]} 67 | 68 | [tool.setuptools.exclude-package-data] 69 | "*" = ["*.pyc", "*.pyo", "*.pyd", "*.so", "*.dylib", "*~"] 70 | 71 | [tool.pytest.ini_options] 72 | testpaths = ["tests"] 73 | python_files = "test_*.py" 74 | 75 | [tool.ruff] 76 | # Same as Black. 77 | line-length = 120 78 | indent-width = 4 79 | 80 | # Assume Python 3.10 81 | target-version = "py310" 82 | 83 | [tool.ruff.lint] 84 | # Enable Pyflakes (`F`), pycodestyle (`E`), and flake8-bugbear (`B`) rules 85 | select = ["E", "F", "B"] 86 | ignore = ["E501"] # Ignore line length errors since we'll fix them gradually 87 | 88 | # Allow fix for all enabled rules (when `--fix`) is provided. 89 | fixable = ["ALL"] 90 | unfixable = [] 91 | 92 | # Allow unused variables when underscore-prefixed.
93 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 94 | 95 | [tool.ruff.format] 96 | # Use double quotes for strings. 97 | quote-style = "double" 98 | 99 | # Indent with spaces, rather than tabs. 100 | indent-style = "space" 101 | 102 | # Respect magic trailing commas. 103 | skip-magic-trailing-comma = false 104 | 105 | # Automatically detect the appropriate line ending. 106 | line-ending = "auto" 107 | 108 | [tool.mypy] 109 | python_version = "3.10" 110 | warn_return_any = true 111 | disallow_untyped_defs = true 112 | check_untyped_defs = true 113 | disallow_incomplete_defs = true 114 | disallow_untyped_decorators = false 115 | no_implicit_optional = false 116 | warn_redundant_casts = false 117 | warn_unused_ignores = false 118 | warn_no_return = true 119 | warn_unreachable = false 120 | strict_optional = false 121 | show_error_codes = true 122 | show_column_numbers = true 123 | pretty = true 124 | ignore_missing_imports = true 125 | disallow_any_unimported = true 126 | disallow_untyped_calls = true 127 | disable_error_code = ["attr-defined", "assignment", "index", "call-arg", "arg-type", "valid-type", "misc", "override", "union-attr", "safe-super", "dict-item", "call-overload", "no-any-unimported"] 128 | files = ["elote"] 129 | exclude = ["tests/.*", "examples/.*", "scripts/.*", "docs/.*"] 130 | 131 | # Per-module options: 132 | [[tool.mypy.overrides]] 133 | module = [ 134 | "numpy.*", 135 | "matplotlib.*", 136 | "tqdm.*", 137 | "sportsdataverse.*", 138 | "scipy.*", 139 | ] 140 | ignore_missing_imports = true 141 | 142 | [[tool.mypy.overrides]] 143 | module = ["pandas"] 144 | ignore_missing_imports = false -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help setup install install-dev install-datasets test test-cov lint format clean build docs lint-fix test-all benchmark run-example typecheck 2 | 3 | # Default target 4 | help: 5 | @echo "Available commands:" 6 | @echo " make setup - Install uv and other required tools" 7 | @echo " make install - Install the package" 8 | @echo " make install-dev - Install the package with development dependencies" 9 | @echo " make install-datasets - Install the package with dataset dependencies" 10 | @echo " make test - Run tests" 11 | @echo " make test-cov - Run tests with coverage" 12 | @echo " make test-all - Run tests on all supported Python versions using tox" 13 | @echo " make benchmark - Run performance benchmarks" 14 | @echo " make lint - Run linting checks" 15 | @echo " make lint-fix - Run linting checks and fix auto-fixable issues" 16 | @echo " make typecheck - Run mypy type checking" 17 | @echo " make typecheck [FILE=path] - Run mypy type checking (optionally on a specific file)" 18 | @echo " make format - Format code with ruff" 19 | @echo " make clean - Clean build artifacts" 20 | @echo " make build - Build package distributions" 21 | @echo " make docs - Build documentation" 22 | @echo " make run-example EXAMPLE=filename - Run an example (e.g., make run-example EXAMPLE=trueskill_example.py)" 23 | 24 | # Setup development environment 25 | setup: 26 | pip install uv 27 | uv venv --python=3.11 28 | brew install libomp 29 | 30 | # Install the package 31 | install: 32 | uv pip install -e . 
33 | 34 | # Install the package with development dependencies 35 | install-dev: 36 | uv pip install -e ".[dev]" 37 | 38 | # Install the package with dataset dependencies 39 | install-datasets: 40 | uv pip install -e ".[datasets]" 41 | 42 | # Run tests 43 | test: 44 | uv run pytest $(PYTEST_ARGS) 45 | 46 | # Run tests with coverage 47 | test-cov: 48 | uv run pytest --cov=elote --cov-report=term --cov-report=html $(PYTEST_ARGS) 49 | 50 | # Run linting 51 | lint: 52 | uv run ruff check . 53 | 54 | # Run linting and fix auto-fixable issues 55 | lint-fix: 56 | uv run ruff check --fix --unsafe-fixes . 57 | 58 | # Run mypy type checking 59 | typecheck: 60 | @if [ -z "$(FILE)" ]; then \ 61 | echo "Running mypy on the entire elote package..."; \ 62 | uv run mypy elote; \ 63 | echo mypy elote; \ 64 | else \ 65 | echo "Running mypy on $(FILE)..."; \ 66 | uv run mypy $(FILE); \ 67 | echo mypy $(FILE); \ 68 | fi 69 | 70 | # Format code 71 | format: 72 | uv run ruff format . 73 | 74 | # Clean build artifacts 75 | clean: 76 | rm -rf build/ 77 | rm -rf dist/ 78 | rm -rf *.egg-info/ 79 | rm -rf .coverage 80 | rm -rf htmlcov/ 81 | rm -rf .pytest_cache/ 82 | rm -rf .ruff_cache/ 83 | find . -type d -name __pycache__ -exec rm -rf {} + 84 | find . -type f -name "*.pyc" -delete 85 | 86 | # Build package distributions 87 | build: clean 88 | uv run python -m build 89 | 90 | # Build documentation 91 | docs: 92 | cd docs && uv run $(MAKE) html SPHINXBUILD="python -m sphinx" 93 | @echo "Opening documentation in Google Chrome..." 94 | @if [ "$(shell uname)" = "Darwin" ]; then \ 95 | open -a "Google Chrome" docs/build/html/index.html; \ 96 | else \ 97 | if command -v google-chrome > /dev/null; then \ 98 | google-chrome docs/build/html/index.html; \ 99 | elif command -v google-chrome-stable > /dev/null; then \ 100 | google-chrome-stable docs/build/html/index.html; \ 101 | elif command -v chromium > /dev/null; then \ 102 | chromium docs/build/html/index.html; \ 103 | else \ 104 | echo "Could not find Google Chrome. Please open docs/build/html/index.html manually."; \ 105 | fi; \ 106 | fi 107 | 108 | # Run tests on all supported Python versions 109 | test-all: 110 | uv run tox 111 | 112 | # Run benchmarks 113 | benchmark: 114 | uv run pytest tests/test_benchmarks.py -v --benchmark-enable $(PYTEST_ARGS) 115 | 116 | # Run an example 117 | run-example: 118 | @if [ -z "$(EXAMPLE)" ]; then \ 119 | echo "Please specify an example file with EXAMPLE=filename.py"; \ 120 | echo "Available examples:"; \ 121 | ls examples/*.py | xargs -n1 basename; \ 122 | else \ 123 | uv run python examples/$(EXAMPLE); \ 124 | fi 125 | -------------------------------------------------------------------------------- /tests/test_BlendedCompetitor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from elote import BlendedCompetitor, GlickoCompetitor 3 | from elote.competitors.base import MissMatchedCompetitorTypesException 4 | 5 | 6 | class TestBlendedCompetitor(unittest.TestCase): 7 | def test_Improvement(self): 8 | player1 = BlendedCompetitor( 9 | competitors=[ 10 | {"type": "EloCompetitor", "competitor_kwargs": {}}, 11 | {"type": "GlickoCompetitor", "competitor_kwargs": {}}, 12 | {"type": "DWZCompetitor", "competitor_kwargs": {}}, 13 | {"type": "ECFCompetitor", "competitor_kwargs": {}}, 14 | ] 15 | ) 16 | initial_rating = player1.rating 17 | # if player1 beats someone with a high rating, their rating should go up. 
18 | for _ in range(10): 19 | player2 = BlendedCompetitor( 20 | competitors=[ 21 | { 22 | "type": "EloCompetitor", 23 | "competitor_kwargs": {"initial_rating": 1000}, 24 | }, 25 | {"type": "GlickoCompetitor", "competitor_kwargs": {}}, 26 | {"type": "DWZCompetitor", "competitor_kwargs": {}}, 27 | {"type": "ECFCompetitor", "competitor_kwargs": {}}, 28 | ] 29 | ) 30 | player1.beat(player2) 31 | self.assertGreater(player1.rating, initial_rating) 32 | initial_rating = player1.rating 33 | 34 | def test_Decay(self): 35 | player1 = BlendedCompetitor( 36 | competitors=[ 37 | {"type": "EloCompetitor", "competitor_kwargs": {}}, 38 | {"type": "GlickoCompetitor", "competitor_kwargs": {}}, 39 | {"type": "DWZCompetitor", "competitor_kwargs": {}}, 40 | {"type": "ECFCompetitor", "competitor_kwargs": {}}, 41 | ] 42 | ) 43 | initial_rating = player1.rating 44 | # if player1 beats someone with a high rating, their rating should go up. 45 | for _ in range(10): 46 | player2 = BlendedCompetitor( 47 | competitors=[ 48 | { 49 | "type": "EloCompetitor", 50 | "competitor_kwargs": {"initial_rating": 1000}, 51 | }, 52 | {"type": "GlickoCompetitor", "competitor_kwargs": {}}, 53 | {"type": "DWZCompetitor", "competitor_kwargs": {}}, 54 | {"type": "ECFCompetitor", "competitor_kwargs": {}}, 55 | ] 56 | ) 57 | player2.beat(player1) 58 | self.assertLess(player1.rating, initial_rating) 59 | initial_rating = player1.rating 60 | 61 | def test_Expectation(self): 62 | player1 = BlendedCompetitor( 63 | competitors=[ 64 | { 65 | "type": "EloCompetitor", 66 | "competitor_kwargs": {"initial_rating": 1000}, 67 | }, 68 | {"type": "GlickoCompetitor", "competitor_kwargs": {}}, 69 | {"type": "DWZCompetitor", "competitor_kwargs": {}}, 70 | {"type": "ECFCompetitor", "competitor_kwargs": {}}, 71 | ] 72 | ) 73 | player2 = BlendedCompetitor( 74 | competitors=[ 75 | {"type": "EloCompetitor", "competitor_kwargs": {"initial_rating": 100}}, 76 | {"type": "GlickoCompetitor", "competitor_kwargs": {}}, 77 | {"type": "DWZCompetitor", "competitor_kwargs": {}}, 78 | {"type": "ECFCompetitor", "competitor_kwargs": {}}, 79 | ] 80 | ) 81 | self.assertGreater(player1.expected_score(player2), player2.expected_score(player1)) 82 | 83 | def test_Exceptions(self): 84 | player1 = BlendedCompetitor( 85 | competitors=[ 86 | { 87 | "type": "EloCompetitor", 88 | "competitor_kwargs": {"initial_rating": 1000}, 89 | }, 90 | {"type": "GlickoCompetitor", "competitor_kwargs": {}}, 91 | {"type": "DWZCompetitor", "competitor_kwargs": {}}, 92 | {"type": "ECFCompetitor", "competitor_kwargs": {}}, 93 | ] 94 | ) 95 | player2 = GlickoCompetitor(initial_rating=100) 96 | 97 | with self.assertRaises(MissMatchedCompetitorTypesException): 98 | player1.verify_competitor_types(player2) 99 | -------------------------------------------------------------------------------- /examples/colley_matrix_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Colley Matrix Method example using Elote. 3 | 4 | The Colley Matrix Method is a least-squares rating system that solves a system of linear 5 | equations to obtain rankings. It's widely used in sports rankings, particularly college 6 | football. 7 | 8 | This example demonstrates: 9 | 1. Creating ColleyMatrixCompetitor instances 10 | 2. Recording match results 11 | 3. Examining how ratings change with match outcomes 12 | 4. 
Visualizing the rating changes over time 13 | """ 14 | 15 | import os 16 | import matplotlib.pyplot as plt 17 | from elote import ColleyMatrixCompetitor 18 | 19 | 20 | def main(): 21 | # Create competitors with default initial rating of 0.5 22 | team_a = ColleyMatrixCompetitor() 23 | team_b = ColleyMatrixCompetitor() 24 | team_c = ColleyMatrixCompetitor() 25 | team_d = ColleyMatrixCompetitor() 26 | 27 | # Initial ratings and expectations 28 | print("Initial ratings:") 29 | print(f"Team A: {team_a.rating:.3f}") 30 | print(f"Team B: {team_b.rating:.3f}") 31 | print(f"Team C: {team_c.rating:.3f}") 32 | print(f"Team D: {team_d.rating:.3f}") 33 | 34 | print("\nInitial win probabilities:") 35 | print(f"Team A vs Team B: {team_a.expected_score(team_b):.2%}") 36 | print(f"Team A vs Team C: {team_a.expected_score(team_c):.2%}") 37 | 38 | # Record match results in a tournament 39 | print("\nSimulating a small tournament...") 40 | 41 | # Track rating history 42 | a_ratings = [team_a.rating] 43 | b_ratings = [team_b.rating] 44 | c_ratings = [team_c.rating] 45 | d_ratings = [team_d.rating] 46 | 47 | # Round 1 48 | team_a.beat(team_b) # A beats B 49 | team_c.beat(team_d) # C beats D 50 | 51 | a_ratings.append(team_a.rating) 52 | b_ratings.append(team_b.rating) 53 | c_ratings.append(team_c.rating) 54 | d_ratings.append(team_d.rating) 55 | 56 | # Round 2 - simplified to avoid network issues 57 | team_b.beat(team_d) # B beats D 58 | 59 | a_ratings.append(team_a.rating) 60 | b_ratings.append(team_b.rating) 61 | c_ratings.append(team_c.rating) 62 | d_ratings.append(team_d.rating) 63 | 64 | # Round 3 - simplified to avoid network issues 65 | team_c.beat(team_b) # C beats B 66 | 67 | a_ratings.append(team_a.rating) 68 | b_ratings.append(team_b.rating) 69 | c_ratings.append(team_c.rating) 70 | d_ratings.append(team_d.rating) 71 | 72 | # Final ratings 73 | print("\nFinal ratings:") 74 | print(f"Team A: {team_a.rating:.3f} (won 1, lost 0)") 75 | print(f"Team B: {team_b.rating:.3f} (won 1, lost 2)") 76 | print(f"Team C: {team_c.rating:.3f} (won 2, lost 0)") 77 | print(f"Team D: {team_d.rating:.3f} (won 0, lost 2)") 78 | 79 | # Final win probabilities 80 | print("\nFinal win probabilities:") 81 | print(f"Team A vs Team B: {team_a.expected_score(team_b):.2%}") 82 | print(f"Team A vs Team C: {team_a.expected_score(team_c):.2%}") 83 | print(f"Team B vs Team C: {team_b.expected_score(team_c):.2%}") 84 | print(f"Team B vs Team D: {team_b.expected_score(team_d):.2%}") 85 | 86 | # Verify a key property of Colley Matrix ratings: sum of ratings equals n/2 87 | total_rating = team_a.rating + team_b.rating + team_c.rating + team_d.rating 88 | print(f"\nSum of all ratings: {total_rating:.3f}") 89 | print(f"Expected sum (n/2): {4 / 2}") 90 | 91 | # Demonstrate a tie 92 | print("\nSimulating a tie between Team B and Team D...") 93 | team_b.tied(team_d) 94 | print(f"Team B rating after tie: {team_b.rating:.3f}") 95 | print(f"Team D rating after tie: {team_d.rating:.3f}") 96 | 97 | # Plot rating changes over time 98 | plt.figure(figsize=(10, 6)) 99 | rounds = range(4) # Initial + 3 rounds 100 | 101 | plt.plot(rounds, a_ratings, "o-", label="Team A") 102 | plt.plot(rounds, b_ratings, "s-", label="Team B") 103 | plt.plot(rounds, c_ratings, "^-", label="Team C") 104 | plt.plot(rounds, d_ratings, "x-", label="Team D") 105 | 106 | plt.xlabel("Round") 107 | plt.ylabel("Rating") 108 | plt.title("Colley Matrix Ratings Over Tournament Rounds") 109 | plt.legend() 110 | plt.grid(True) 111 | plt.ylim(0, 1) 112 | plt.xticks(rounds) 113 | 114 | 
# Save the plot 115 | plt.savefig(os.path.join("images", "colley_matrix_ratings.png")) 116 | print("\nRating history plot saved as 'colley_matrix_ratings.png'") 117 | 118 | # Show the plot if running interactively 119 | # plt.show() 120 | 121 | 122 | if __name__ == "__main__": 123 | main() 124 | -------------------------------------------------------------------------------- /examples/glicko2_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Example demonstrating the use of the Glicko-2 rating system. 4 | 5 | This example shows how to create Glicko-2 competitors, calculate win probabilities, 6 | update ratings after matches, and how ratings change over time due to inactivity. 7 | """ 8 | 9 | from elote import Glicko2Competitor 10 | from datetime import datetime, timedelta 11 | 12 | 13 | def main(): 14 | """Run the Glicko-2 example.""" 15 | # Create initial time and competitors 16 | initial_time = datetime(2024, 1, 1) 17 | player1 = Glicko2Competitor(initial_rating=1500, initial_rd=350, initial_volatility=0.06, initial_time=initial_time) 18 | player2 = Glicko2Competitor(initial_rating=1700, initial_rd=300, initial_volatility=0.06, initial_time=initial_time) 19 | player3 = Glicko2Competitor(initial_rating=1800, initial_rd=200, initial_volatility=0.05, initial_time=initial_time) 20 | 21 | # Print initial ratings 22 | print("Initial ratings (January 1st, 2024):") 23 | print(f"Player 1: Rating={player1.rating}, RD={player1.rd}, Volatility={player1.volatility:.6f}") 24 | print(f"Player 2: Rating={player2.rating}, RD={player2.rd}, Volatility={player2.volatility:.6f}") 25 | print(f"Player 3: Rating={player3.rating}, RD={player3.rd}, Volatility={player3.volatility:.6f}") 26 | print() 27 | 28 | # Calculate initial win probabilities 29 | print("Initial win probabilities:") 30 | print(f"Player 1 vs Player 2: {player1.expected_score(player2):.4f}") 31 | print(f"Player 1 vs Player 3: {player1.expected_score(player3):.4f}") 32 | print(f"Player 2 vs Player 3: {player2.expected_score(player3):.4f}") 33 | print() 34 | 35 | # Simulate some matches with time gaps 36 | print("Simulating matches over time...") 37 | 38 | # First match after 5 days 39 | match1_time = initial_time + timedelta(days=5) 40 | print("\nMatch 1 (January 6th): Player 1 beats Player 2 (upset!)") 41 | print("RDs before match due to 5 days inactivity:") 42 | print(f"Player 1 RD: {player1.rd:.1f}") 43 | print(f"Player 2 RD: {player2.rd:.1f}") 44 | player1.beat(player2, match_time=match1_time) 45 | print("RDs after match:") 46 | print(f"Player 1 RD: {player1.rd:.1f}") 47 | print(f"Player 2 RD: {player2.rd:.1f}") 48 | 49 | # Second match after another 10 days 50 | match2_time = match1_time + timedelta(days=10) 51 | print("\nMatch 2 (January 16th): Player 3 beats Player 1") 52 | print("RDs before match due to 10 days inactivity:") 53 | print(f"Player 1 RD: {player1.rd:.1f}") 54 | print(f"Player 3 RD: {player3.rd:.1f}") 55 | player3.beat(player1, match_time=match2_time) 56 | print("RDs after match:") 57 | print(f"Player 1 RD: {player1.rd:.1f}") 58 | print(f"Player 3 RD: {player3.rd:.1f}") 59 | 60 | # Third match after another 15 days 61 | match3_time = match2_time + timedelta(days=15) 62 | print("\nMatch 3 (January 31st): Player 2 and Player 3 tie") 63 | print("RDs before match due to inactivity:") 64 | print(f"Player 2 RD: {player2.rd:.1f} (25 days inactive)") 65 | print(f"Player 3 RD: {player3.rd:.1f} (15 days inactive)") 66 | player2.tied(player3, 
match_time=match3_time) 67 | print("RDs after match:") 68 | print(f"Player 2 RD: {player2.rd:.1f}") 69 | print(f"Player 3 RD: {player3.rd:.1f}") 70 | print() 71 | 72 | # Print final ratings 73 | print("Final ratings (January 31st, 2024):") 74 | print(f"Player 1: Rating={player1.rating:.1f}, RD={player1.rd:.1f}, Volatility={player1.volatility:.6f}") 75 | print(f"Player 2: Rating={player2.rating:.1f}, RD={player2.rd:.1f}, Volatility={player2.volatility:.6f}") 76 | print(f"Player 3: Rating={player3.rating:.1f}, RD={player3.rd:.1f}, Volatility={player3.volatility:.6f}") 77 | print() 78 | 79 | # Calculate final win probabilities 80 | print("Final win probabilities:") 81 | print(f"Player 1 vs Player 2: {player1.expected_score(player2):.4f}") 82 | print(f"Player 1 vs Player 3: {player1.expected_score(player3):.4f}") 83 | print(f"Player 2 vs Player 3: {player2.expected_score(player3):.4f}") 84 | print() 85 | 86 | # Demonstrate serialization and deserialization 87 | print("Demonstrating serialization and deserialization...") 88 | state = player1.export_state() 89 | player1_copy = Glicko2Competitor.from_state(state) 90 | 91 | print(f"Original player: {player1}") 92 | print(f"Deserialized player: {player1_copy}") 93 | print( 94 | f"Are they equal? {player1.rating == player1_copy.rating and player1.rd == player1_copy.rd and player1.volatility == player1_copy.volatility}" 95 | ) 96 | 97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /docs/source/rating_systems/elo.rst: -------------------------------------------------------------------------------- 1 | Elo Rating System 2 | ================ 3 | 4 | Overview 5 | -------- 6 | 7 | The Elo rating system is one of the most widely used rating systems in the world. Developed by Hungarian-American physics professor Arpad Elo, it was originally designed for chess but has since been adapted for many other competitive domains including video games, basketball, football, and baseball. 8 | 9 | The Elo system is named after its creator and was first introduced as the official rating system for the United States Chess Federation in 1960, and later adopted by the World Chess Federation (FIDE) in 1970. 10 | 11 | How It Works 12 | ----------- 13 | 14 | The Elo rating system is based on the following principles: 15 | 16 | 1. Each player has a rating that represents their skill level 17 | 2. The difference between ratings determines the expected outcome of a match 18 | 3. After each match, ratings are adjusted based on the actual outcome compared to the expected outcome 19 | 20 | The core formula for calculating the expected score (probability of winning) is: 21 | 22 | .. math:: 23 | 24 | E_A = \frac{1}{1 + 10^{(R_B - R_A) / 400}} 25 | 26 | Where: 27 | - :math:`E_A` is the expected score for player A 28 | - :math:`R_A` is the rating of player A 29 | - :math:`R_B` is the rating of player B 30 | 31 | After a match, the ratings are updated using: 32 | 33 | .. 
math:: 34 | 35 | R'_A = R_A + K \times (S_A - E_A) 36 | 37 | Where: 38 | - :math:`R'_A` is the new rating for player A 39 | - :math:`K` is the K-factor (determines how quickly ratings change) 40 | - :math:`S_A` is the actual score (1 for win, 0.5 for draw, 0 for loss) 41 | - :math:`E_A` is the expected score 42 | 43 | Advantages 44 | --------- 45 | 46 | - **Simplicity**: The Elo system is easy to understand and implement 47 | - **Transparency**: Players can easily see how their rating changes after each match 48 | - **Proven Track Record**: Used successfully for decades in various competitive domains 49 | - **Zero-Sum**: In a two-player game, the rating points one player gains are exactly what the other player loses 50 | - **Self-Correcting**: Ratings naturally adjust over time as more matches are played 51 | 52 | Limitations 53 | ---------- 54 | 55 | - **Requires Many Matches**: Needs a significant number of matches to reach an accurate rating 56 | - **No Confidence Intervals**: Unlike Glicko, Elo doesn't account for rating reliability 57 | - **Assumes Stable Performance**: Doesn't account for player improvement or decline over time 58 | - **K-Factor Sensitivity**: Results are highly dependent on the chosen K-factor 59 | - **No Team Dynamics**: In team sports, doesn't account for individual contributions 60 | 61 | Implementation in Elote 62 | ---------------------- 63 | 64 | Elote provides a straightforward implementation of the Elo rating system through the ``EloCompetitor`` class: 65 | 66 | .. code-block:: python 67 | 68 | from elote import EloCompetitor 69 | 70 | # Create two competitors with different initial ratings 71 | player1 = EloCompetitor(initial_rating=1500) 72 | player2 = EloCompetitor(initial_rating=1600) 73 | 74 | # Get win probability 75 | win_probability = player2.expected_score(player1) 76 | print(f"Player 2 win probability: {win_probability:.2%}") 77 | 78 | # Record a match result 79 | player1.beat(player2) # Player 1 won! 80 | 81 | # Ratings are automatically updated 82 | print(f"Player 1 new rating: {player1.rating}") 83 | print(f"Player 2 new rating: {player2.rating}") 84 | 85 | Customization 86 | ------------ 87 | 88 | The ``EloCompetitor`` class allows for customization of the K-factor: 89 | 90 | .. code-block:: python 91 | 92 | # Create a competitor with a custom K-factor 93 | player = EloCompetitor(initial_rating=1500, k_factor=32) 94 | 95 | A higher K-factor makes ratings change more quickly, while a lower K-factor makes them more stable. Common K-factor values: 96 | 97 | - 40: For new players with fewer than 30 games (FIDE standard) 98 | - 20: For players with ratings under 2400 (FIDE standard) 99 | - 10: For elite players with ratings over 2400 (FIDE standard) 100 | 101 | Real-World Applications 102 | --------------------- 103 | 104 | The Elo rating system is used in many domains: 105 | 106 | - **Chess**: FIDE and national chess federations 107 | - **Video Games**: League of Legends, DOTA 2, and many other competitive games 108 | - **Sports**: Used for international football rankings 109 | - **Online Matchmaking**: Many platforms use Elo or Elo-derived systems to match players of similar skill 110 | 111 | References 112 | --------- 113 | 114 | 1. Elo, Arpad (1978). *The Rating of Chessplayers, Past and Present*. Arco. ISBN 0-668-04721-6. 115 | 2. Glickman, Mark E. (1995). "A Comprehensive Guide to Chess Ratings". American Chess Journal, 3, 59-102. 116 | 3. Silver, Nate (2015). "How We Calculate NBA Elo Ratings". FiveThirtyEight. 
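Worked Example
--------------

To make the formulas above concrete, here is a small standalone sketch (independent of Elote's ``EloCompetitor`` implementation, and using an illustrative K-factor of 32) that computes the expected score for a 100-point rating gap and then applies a single update by hand:

.. code-block:: python

    # Hand-rolled illustration of the Elo formulas described above.
    # This is a sketch for intuition only, not Elote's internal code.

    def expected_score(rating_a: float, rating_b: float) -> float:
        """Probability that player A beats player B."""
        return 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400))

    def updated_rating(rating: float, expected: float, actual: float, k: float = 32) -> float:
        """New rating after a game, given the expected and actual scores."""
        return rating + k * (actual - expected)

    r_a, r_b = 1500, 1600

    e_a = expected_score(r_a, r_b)
    print(f"Expected score for A: {e_a:.3f}")  # roughly 0.36 for a 100-point deficit

    # Suppose A wins (actual score 1.0) despite being the underdog:
    new_a = updated_rating(r_a, e_a, 1.0)
    new_b = updated_rating(r_b, 1.0 - e_a, 0.0)
    print(f"A: {r_a} -> {new_a:.1f}, B: {r_b} -> {new_b:.1f}")

Because both players share the same K-factor and their expected scores sum to one, the points gained by A exactly equal the points lost by B, illustrating the zero-sum property noted above.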
-------------------------------------------------------------------------------- /scripts/run_benchmarks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Benchmark runner for elote. 4 | 5 | This script runs benchmarks and generates a report with performance metrics. 6 | It can be used to track performance changes over time. 7 | 8 | Usage: 9 | python scripts/run_benchmarks.py [--compare BASELINE] 10 | 11 | Options: 12 | --compare BASELINE Compare results with a baseline JSON file 13 | """ 14 | 15 | import json 16 | import argparse 17 | import subprocess 18 | from datetime import datetime 19 | from pathlib import Path 20 | 21 | 22 | def run_benchmarks(): 23 | """Run benchmarks and return the JSON output.""" 24 | benchmark_dir = Path("benchmark_results") 25 | benchmark_dir.mkdir(exist_ok=True) 26 | 27 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 28 | json_output = benchmark_dir / f"benchmark_{timestamp}.json" 29 | 30 | cmd = [ 31 | "uv", 32 | "run", 33 | "pytest", 34 | "tests/test_benchmarks.py", 35 | "--benchmark-json", 36 | str(json_output), 37 | "--benchmark-enable", 38 | "-v", 39 | ] 40 | 41 | print(f"Running benchmarks: {' '.join(cmd)}") 42 | subprocess.run(cmd, check=True) 43 | 44 | return json_output 45 | 46 | 47 | def load_json(file_path): 48 | """Load JSON data from a file.""" 49 | with open(file_path, "r") as f: 50 | return json.load(f) 51 | 52 | 53 | def generate_report(current_file, baseline_file=None): 54 | """Generate a performance report.""" 55 | current_data = load_json(current_file) 56 | 57 | # Print summary 58 | print("\n" + "=" * 80) 59 | print(f"BENCHMARK RESULTS: {current_file}") 60 | print("=" * 80) 61 | 62 | # Extract benchmark data 63 | benchmarks = current_data.get("benchmarks", []) 64 | 65 | # Group by test name 66 | grouped = {} 67 | for bench in benchmarks: 68 | name = bench["name"] 69 | group = name.split("[")[0] if "[" in name else name 70 | if group not in grouped: 71 | grouped[group] = [] 72 | grouped[group].append(bench) 73 | 74 | # Print results by group 75 | for group, benches in sorted(grouped.items()): 76 | print(f"\n## {group}") 77 | print("-" * 80) 78 | print(f"{'Test':<50} {'Min (ms)':<12} {'Mean (ms)':<12} {'Max (ms)':<12} {'StdDev':<12}") 79 | print("-" * 80) 80 | 81 | for bench in sorted(benches, key=lambda x: x["name"]): 82 | name = bench["name"] 83 | if "[" in name: 84 | name = name.split("[")[1].rstrip("]") 85 | else: 86 | name = "default" 87 | 88 | min_time = bench["stats"]["min"] * 1000 # Convert to ms 89 | mean_time = bench["stats"]["mean"] * 1000 90 | max_time = bench["stats"]["max"] * 1000 91 | stddev = bench["stats"]["stddev"] * 1000 92 | 93 | print(f"{name:<50} {min_time:<12.3f} {mean_time:<12.3f} {max_time:<12.3f} {stddev:<12.3f}") 94 | 95 | # Compare with baseline if provided 96 | if baseline_file: 97 | print("\n" + "=" * 80) 98 | print(f"COMPARISON WITH BASELINE: {baseline_file}") 99 | print("=" * 80) 100 | 101 | baseline_data = load_json(baseline_file) 102 | baseline_benchmarks = {b["name"]: b for b in baseline_data.get("benchmarks", [])} 103 | 104 | print(f"{'Test':<50} {'Current (ms)':<12} {'Baseline (ms)':<12} {'Change %':<12}") 105 | print("-" * 80) 106 | 107 | for bench in benchmarks: 108 | name = bench["name"] 109 | if name in baseline_benchmarks: 110 | current_mean = bench["stats"]["mean"] * 1000 111 | baseline_mean = baseline_benchmarks[name]["stats"]["mean"] * 1000 112 | change_pct = ((current_mean - baseline_mean) / baseline_mean) * 100 113 | 
114 | # Use color indicators for performance changes 115 | if change_pct > 5: # Worse performance 116 | change_str = f"\033[91m{change_pct:+.2f}%\033[0m" # Red 117 | elif change_pct < -5: # Better performance 118 | change_str = f"\033[92m{change_pct:+.2f}%\033[0m" # Green 119 | else: # Similar performance 120 | change_str = f"{change_pct:+.2f}%" 121 | 122 | print(f"{name:<50} {current_mean:<12.3f} {baseline_mean:<12.3f} {change_str:<12}") 123 | else: 124 | print(f"{name:<50} {bench['stats']['mean'] * 1000:<12.3f} {'N/A':<12} {'N/A':<12}") 125 | 126 | print("\n" + "=" * 80) 127 | return current_file 128 | 129 | 130 | def main(): 131 | parser = argparse.ArgumentParser(description="Run benchmarks for elote") 132 | parser.add_argument("--compare", help="Compare with baseline JSON file") 133 | args = parser.parse_args() 134 | 135 | # Run benchmarks 136 | result_file = run_benchmarks() 137 | 138 | # Generate report 139 | baseline_file = args.compare 140 | generate_report(result_file, baseline_file) 141 | 142 | print(f"\nBenchmark results saved to: {result_file}") 143 | print("To compare with these results in the future, run:") 144 | print(f" python scripts/run_benchmarks.py --compare {result_file}") 145 | 146 | 147 | if __name__ == "__main__": 148 | main() 149 | -------------------------------------------------------------------------------- /docs/source/rating_systems/ecf.rst: -------------------------------------------------------------------------------- 1 | ECF Rating System 2 | =============== 3 | 4 | Overview 5 | -------- 6 | 7 | The ECF (English Chess Federation) rating system is the official rating system used for chess players in England. It was developed as an alternative to the Elo system and has been in use since the 1950s, though it has undergone several revisions over the years. 8 | 9 | Unlike Elo and Glicko, which use a logistic curve to calculate expected outcomes, the ECF system uses a linear relationship between rating differences and expected game outcomes. This makes it somewhat simpler to calculate by hand, which was an advantage in the pre-computer era. 10 | 11 | How It Works 12 | ----------- 13 | 14 | The ECF rating system is based on the following principles: 15 | 16 | 1. Each player has a grade (rating) that represents their playing strength 17 | 2. The difference between grades determines the expected outcome of a match 18 | 3. After each match, grades are adjusted based on the actual outcome compared to the expected outcome 19 | 20 | In the ECF system, a difference of 40 grade points is expected to yield approximately a 67% win rate for the stronger player. This is different from Elo, where a 100-point difference corresponds to a 64% win expectancy. 21 | 22 | The expected outcome calculation is: 23 | 24 | .. math:: 25 | 26 | E_A = 0.5 + \frac{R_A - R_B}{F} 27 | 28 | Where: 29 | - :math:`E_A` is the expected score for player A 30 | - :math:`R_A` is the grade of player A 31 | - :math:`R_B` is the grade of player B 32 | - :math:`F` is a conversion factor (typically 120) 33 | 34 | After a match, the grades are updated using: 35 | 36 | .. 
math:: 37 | 38 | R'_A = R_A + K \times (S_A - E_A) 39 | 40 | Where: 41 | - :math:`R'_A` is the new grade for player A 42 | - :math:`K` is the K-factor (determines how quickly grades change) 43 | - :math:`S_A` is the actual score (1 for win, 0.5 for draw, 0 for loss) 44 | - :math:`E_A` is the expected score 45 | 46 | Advantages 47 | --------- 48 | 49 | - **Simplicity**: The linear relationship is easier to understand and calculate 50 | - **Local Optimization**: Designed specifically for the English chess community 51 | - **Historical Data**: Long history of use provides extensive comparative data 52 | - **Regular Updates**: The ECF publishes updated ratings multiple times per year 53 | - **Transparency**: Clear calculation methods that players can verify 54 | 55 | Limitations 56 | ---------- 57 | 58 | - **Limited Range**: Works best within a certain range of skill differences 59 | - **Less Theoretical Basis**: The linear relationship is less theoretically justified than Elo's logistic curve 60 | - **Regional Focus**: Primarily used in England, limiting international comparability 61 | - **No Uncertainty Measure**: Unlike Glicko, doesn't account for rating reliability 62 | - **Fixed Parameters**: Less flexibility in parameter adjustment compared to other systems 63 | 64 | Implementation in Elote 65 | ---------------------- 66 | 67 | Elote provides an implementation of the ECF rating system through the ``ECFCompetitor`` class: 68 | 69 | .. code-block:: python 70 | 71 | from elote import ECFCompetitor 72 | 73 | # Create two competitors with different initial grades 74 | player1 = ECFCompetitor(initial_rating=120) 75 | player2 = ECFCompetitor(initial_rating=150) 76 | 77 | # Get win probability 78 | win_probability = player2.expected_score(player1) 79 | print(f"Player 2 win probability: {win_probability:.2%}") 80 | 81 | # Record a match result 82 | player1.beat(player2) # Player 1 won! 83 | 84 | # Grades are automatically updated 85 | print(f"Player 1 new grade: {player1.rating}") 86 | print(f"Player 2 new grade: {player2.rating}") 87 | 88 | Customization 89 | ------------ 90 | 91 | The ``ECFCompetitor`` class allows for customization of the K-factor and the conversion factor: 92 | 93 | .. code-block:: python 94 | 95 | # Create a competitor with custom parameters 96 | player = ECFCompetitor( 97 | initial_rating=120, 98 | k_factor=20, 99 | f_factor=120 100 | ) 101 | 102 | Key parameters: 103 | - **initial_rating**: Starting grade value 104 | - **k_factor**: Determines how quickly grades change (default: 16) 105 | - **f_factor**: Conversion factor for expected score calculation (default: 120) 106 | 107 | ECF to Elo Conversion 108 | -------------------- 109 | 110 | For those familiar with Elo ratings, ECF grades can be approximately converted to Elo ratings using the formula: 111 | 112 | .. math:: 113 | 114 | \text{Elo} = 7.5 \times \text{ECF} + 700 115 | 116 | This means an ECF grade of 100 is roughly equivalent to an Elo rating of 1450. 117 | 118 | Real-World Applications 119 | --------------------- 120 | 121 | The ECF rating system is primarily used in England for: 122 | 123 | - **Chess Tournaments**: Official ECF-rated events throughout England 124 | - **Club Play**: Local chess clubs use ECF grades for team selection and pairing 125 | - **Junior Development**: Tracking progress of young players 126 | - **National Rankings**: Determining England's top players 127 | 128 | References 129 | --------- 130 | 131 | 1. 
[ECF Grading System](http://www.ecfgrading.org.uk/new/help.php#elo) - Official documentation 132 | 2. Clarke, P.H. (1982). "The Theory of Grading". British Chess Magazine. 133 | 3. Elo, Arpad (1978). *The Rating of Chessplayers, Past and Present*. Arco. ISBN 0-668-04721-6. 134 | 4. Sonas, Jeff (2002). "The Sonas Rating Formula - Better than Elo?". ChessBase News. -------------------------------------------------------------------------------- /docs/source/rating_systems/dwz.rst: -------------------------------------------------------------------------------- 1 | DWZ Rating System 2 | ============== 3 | 4 | Overview 5 | -------- 6 | 7 | The Deutsche Wertungszahl (DWZ), or German Evaluation Number, is the official chess rating system of the German Chess Federation (Deutscher Schachbund). Developed in the 1990s as a replacement for the previously used Ingo system, DWZ is similar to the Elo rating system but with some important modifications to better handle tournament play and player development. 8 | 9 | The DWZ system is particularly notable for its sophisticated handling of youth players, whose ratings tend to change more rapidly as they improve, and for its detailed approach to calculating expected outcomes based on rating differences. 10 | 11 | How It Works 12 | ----------- 13 | 14 | The DWZ system uses the following key components: 15 | 16 | 1. **Rating (R)**: Represents the player's skill level 17 | 2. **Development Coefficient (E)**: Determines how quickly ratings change, with higher values for younger and less experienced players 18 | 3. **Performance Rating (P)**: The rating that would exactly match a player's tournament results 19 | 20 | The expected outcome calculation is similar to Elo: 21 | 22 | .. math:: 23 | 24 | W_e = \frac{1}{1 + 10^{-(R_A - R_B) / 400}} 25 | 26 | Where: 27 | - :math:`W_e` is the expected score for player A 28 | - :math:`R_A` is the rating of player A 29 | - :math:`R_B` is the rating of player B 30 | 31 | After a tournament, the rating is updated using: 32 | 33 | .. math:: 34 | 35 | R' = R + E \times (W - W_e) 36 | 37 | Where: 38 | - :math:`R'` is the new rating 39 | - :math:`E` is the development coefficient 40 | - :math:`W` is the actual score 41 | - :math:`W_e` is the expected score 42 | 43 | The development coefficient is calculated based on: 44 | 45 | .. 
math:: 46 | 47 | E = E_0 \times f(A) \times f(n) 48 | 49 | Where: 50 | - :math:`E_0` is the base coefficient (typically 30) 51 | - :math:`f(A)` is an age factor (higher for younger players) 52 | - :math:`f(n)` is an experience factor based on number of rated games played 53 | 54 | Advantages 55 | --------- 56 | 57 | - **Age Sensitivity**: Better handles rating changes for youth players 58 | - **Experience Factor**: Accounts for player experience level 59 | - **Tournament Focus**: Designed for batch updates after tournaments 60 | - **National Standardization**: Consistent application across German chess events 61 | - **Detailed Documentation**: Well-documented methodology with regular updates 62 | 63 | Limitations 64 | ---------- 65 | 66 | - **Complexity**: More complex to calculate than basic Elo 67 | - **Regional Focus**: Primarily used in Germany and some neighboring countries 68 | - **No Uncertainty Measure**: Unlike Glicko, doesn't explicitly track rating reliability 69 | - **Parameter Sensitivity**: Results depend on proper calibration of multiple factors 70 | - **Less International Recognition**: Not as widely recognized as FIDE Elo ratings 71 | 72 | Implementation in Elote 73 | ---------------------- 74 | 75 | Elote provides an implementation of the DWZ rating system through the ``DWZCompetitor`` class: 76 | 77 | .. code-block:: python 78 | 79 | from elote import DWZCompetitor 80 | 81 | # Create two competitors with different initial ratings 82 | player1 = DWZCompetitor(initial_rating=1600) 83 | player2 = DWZCompetitor(initial_rating=1800) 84 | 85 | # Get win probability 86 | win_probability = player2.expected_score(player1) 87 | print(f"Player 2 win probability: {win_probability:.2%}") 88 | 89 | # Record a match result 90 | player1.beat(player2) # Player 1 won! 91 | 92 | # Ratings are automatically updated 93 | print(f"Player 1 new rating: {player1.rating}") 94 | print(f"Player 2 new rating: {player2.rating}") 95 | 96 | Customization 97 | ------------ 98 | 99 | The ``DWZCompetitor`` class allows for customization of several parameters: 100 | 101 | .. code-block:: python 102 | 103 | # Create a competitor with custom parameters 104 | player = DWZCompetitor( 105 | initial_rating=1600, 106 | initial_development_coeff=30, 107 | base_development_coeff=30 108 | ) 109 | 110 | Key parameters: 111 | - **initial_rating**: Starting rating value 112 | - **initial_development_coeff**: Starting development coefficient 113 | - **base_development_coeff**: Base value for development coefficient calculation 114 | 115 | DWZ to Elo Conversion 116 | ------------------- 117 | 118 | While DWZ and Elo use different calculation methods, the numerical values are designed to be roughly comparable. For practical purposes: 119 | 120 | .. math:: 121 | 122 | \text{DWZ} \approx \text{Elo} 123 | 124 | However, due to different update mechanisms, the ratings may diverge over time for the same player. 125 | 126 | Real-World Applications 127 | --------------------- 128 | 129 | The DWZ rating system is used primarily in: 130 | 131 | - **German Chess Federation**: Official rating system for all German chess events 132 | - **Youth Development**: Specially calibrated for tracking youth player development 133 | - **Club Championships**: Used for local and regional tournaments in Germany 134 | - **National Rankings**: Determining Germany's top players 135 | 136 | References 137 | --------- 138 | 139 | 1. [Deutsche Wertungszahl](https://en.wikipedia.org/wiki/Deutsche_Wertungszahl) - Wikipedia article 140 | 2. 
[Deutscher Schachbund](https://www.schachbund.de/dwz.html) - Official German Chess Federation site 141 | 3. Hechenberger, A. (2001). "Die Deutsche Wertungszahl". Schach-Journal. 142 | 4. Glickman, Mark E. (1995). "A Comprehensive Guide to Chess Ratings". American Chess Journal, 3, 59-102. -------------------------------------------------------------------------------- /tests/test_ECFCompetitor_known_values.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from elote import ECFCompetitor 3 | 4 | 5 | class TestECFKnownValues(unittest.TestCase): 6 | """Tests for ECFCompetitor with known values to verify correctness after optimization.""" 7 | 8 | def test_initial_rating(self): 9 | """Test that initial rating is set correctly.""" 10 | player = ECFCompetitor(initial_rating=100) 11 | self.assertEqual(player.rating, 100) 12 | 13 | player = ECFCompetitor(initial_rating=120) 14 | self.assertEqual(player.rating, 120) 15 | 16 | def test_elo_conversion(self): 17 | """Test that elo_conversion property returns the correct value.""" 18 | player = ECFCompetitor(initial_rating=100) 19 | self.assertEqual(player.elo_conversion, 100 * 7.5 + 700) 20 | 21 | player = ECFCompetitor(initial_rating=120) 22 | self.assertEqual(player.elo_conversion, 120 * 7.5 + 700) 23 | 24 | def test_transformed_elo_rating(self): 25 | """Test that transformed_elo_rating property returns the correct value.""" 26 | player = ECFCompetitor(initial_rating=100) 27 | expected = 10 ** ((100 * 7.5 + 700) / 400) 28 | self.assertAlmostEqual(player.transformed_elo_rating, expected) 29 | 30 | # Test caching - should return the same value without recalculating 31 | self.assertAlmostEqual(player.transformed_elo_rating, expected) 32 | 33 | def test_expected_score(self): 34 | """Test expected_score with known values.""" 35 | player1 = ECFCompetitor(initial_rating=100) 36 | player2 = ECFCompetitor(initial_rating=120) 37 | 38 | # Calculate expected values manually 39 | p1_transformed = 10 ** ((100 * 7.5 + 700) / 400) 40 | p2_transformed = 10 ** ((120 * 7.5 + 700) / 400) 41 | expected = p1_transformed / (p1_transformed + p2_transformed) 42 | 43 | self.assertAlmostEqual(player1.expected_score(player2), expected) 44 | 45 | def test_beat_with_known_values(self): 46 | """Test beat method with known values.""" 47 | player1 = ECFCompetitor(initial_rating=100) 48 | player2 = ECFCompetitor(initial_rating=120) 49 | 50 | # Player1 beats player2 51 | player1.beat(player2) 52 | 53 | # After player1 beats player2, player1's rating should be updated 54 | # The new rating is the mean of the scores in the deque 55 | # Since we just initialized the deque, it contains [100] initially 56 | # After beat, it contains [100, 120+50] = [100, 170] 57 | # So the mean is (100 + 170) / 2 = 135 58 | self.assertEqual(player1.rating, 135) 59 | 60 | # After player1 beats player2, player2's rating should be updated 61 | # The new rating is the mean of the scores in the deque 62 | # Since we just initialized the deque, it contains [120] initially 63 | # After beat, it contains [120, 100-50] = [120, 50] 64 | # The minimum rating check is applied when adding to the deque, not when calculating the mean 65 | # So the mean is (120 + 50) / 2 = 85 66 | self.assertEqual(player2.rating, 85) 67 | 68 | def test_tied_with_known_values(self): 69 | """Test tied method with known values.""" 70 | player1 = ECFCompetitor(initial_rating=100) 71 | player2 = ECFCompetitor(initial_rating=120) 72 | 73 | # Players tie 74 | player1.tied(player2) 75 | 
76 | # After tie, player1's rating should be updated 77 | # The new rating is the mean of the scores in the deque 78 | # Since we just initialized the deque, it contains [100] initially 79 | # After tie, it contains [100, 120] = [100, 120] 80 | # So the mean is (100 + 120) / 2 = 110 81 | self.assertEqual(player1.rating, 110) 82 | 83 | # After tie, player2's rating should be updated 84 | # The new rating is the mean of the scores in the deque 85 | # Since we just initialized the deque, it contains [120] initially 86 | # After tie, it contains [120, 100] = [120, 100] 87 | # So the mean is (120 + 100) / 2 = 110 88 | self.assertEqual(player2.rating, 110) 89 | 90 | def test_delta_limit(self): 91 | """Test that the delta limit is applied correctly.""" 92 | player1 = ECFCompetitor(initial_rating=100) 93 | player2 = ECFCompetitor(initial_rating=200) # Rating difference > delta (50) 94 | 95 | # Player1 beats player2 96 | player1.beat(player2) 97 | 98 | # Since difference > delta, player2's effective rating should be limited 99 | # The effective rating of player2 is limited to player1's rating + delta = 100 + 50 = 150 100 | # After beat, player1's scores deque contains [100, 150+50] = [100, 200] 101 | # So the mean is (100 + 200) / 2 = 150 102 | self.assertEqual(player1.rating, 150) 103 | 104 | def test_scores_deque_behavior(self): 105 | """Test that the scores deque behaves correctly with maxlen.""" 106 | player = ECFCompetitor(initial_rating=100) 107 | 108 | # Initialize scores 109 | if player.scores is None: 110 | player._ECFCompetitor__initialize_ratings() 111 | 112 | # Add more than _n_periods scores 113 | for i in range(player._n_periods + 10): 114 | player._update(i) 115 | 116 | # Check that only the last _n_periods scores are kept 117 | self.assertEqual(len(player.scores), player._n_periods) 118 | 119 | # Check that the oldest scores were dropped 120 | self.assertEqual(min(player.scores), player._n_periods + 10 - player._n_periods) 121 | 122 | 123 | if __name__ == "__main__": 124 | unittest.main() 125 | -------------------------------------------------------------------------------- /docs/source/rating_systems/glicko.rst: -------------------------------------------------------------------------------- 1 | Glicko Rating System 2 | ================== 3 | 4 | Overview 5 | -------- 6 | 7 | The Glicko rating system was developed by Mark Glickman in 1995 as an improvement over the Elo rating system. The key innovation of Glicko is the introduction of a "rating deviation" (RD) parameter that measures the uncertainty in a player's rating. This addresses one of the main limitations of the Elo system, which doesn't account for rating reliability. 8 | 9 | The name "Glicko" is derived from the creator's surname, Glickman. The system has since been further refined into Glicko-2, though Elote currently implements the original Glicko-1 system. 10 | 11 | How It Works 12 | ----------- 13 | 14 | The Glicko system uses three key parameters: 15 | 16 | 1. **Rating (r)**: Represents the player's skill level, similar to Elo 17 | 2. **Rating Deviation (RD)**: Represents the uncertainty in the rating (higher RD = more uncertainty) 18 | 3. **Time Factor (c)**: Controls how much the RD increases over time without playing 19 | 20 | The expected outcome calculation is similar to Elo but incorporates the rating deviations: 21 | 22 | .. 
math:: 23 | 24 | E(A, B) = \frac{1}{1 + 10^{-g(RD_B) \times (r_A - r_B) / 400}} 25 | 26 | Where: 27 | - :math:`g(RD) = \frac{1}{\sqrt{1 + 3 \times RD^2 / \pi^2}}` 28 | - :math:`r_A` and :math:`r_B` are the ratings of players A and B 29 | - :math:`RD_A` and :math:`RD_B` are their rating deviations 30 | 31 | After a match, both the rating and rating deviation are updated: 32 | 33 | .. math:: 34 | 35 | r'_A = r_A + \frac{q}{1/RD_A^2 + 1/d^2} \times g(RD_B) \times (S_A - E(A, B)) 36 | 37 | .. math:: 38 | 39 | RD'_A = \sqrt{\frac{1}{1/RD_A^2 + 1/d^2}} 40 | 41 | Where: 42 | - :math:`q = \ln(10) / 400` 43 | - :math:`d^2 = 1 / (q^2 \times g(RD_B)^2 \times E(A, B) \times (1 - E(A, B)))` 44 | - :math:`S_A` is the actual score (1 for win, 0.5 for draw, 0 for loss) 45 | 46 | When a player doesn't compete for a period, their RD increases: 47 | 48 | .. math:: 49 | 50 | RD'_A = \min(\sqrt{RD_A^2 + c^2 \times t}, RD_{max}) 51 | 52 | Where: 53 | - :math:`t` is the time since last competition 54 | - :math:`c` is the volatility constant 55 | - :math:`RD_{max}` is the maximum allowed rating deviation 56 | 57 | Advantages 58 | --------- 59 | 60 | - **Uncertainty Measurement**: Accounts for the reliability of a player's rating 61 | - **Inactivity Handling**: Automatically increases uncertainty for inactive players 62 | - **More Accurate Matchmaking**: Can match players with similar ratings but different uncertainties 63 | - **Faster Convergence**: New players can reach their true skill level faster 64 | - **Better for Sparse Data**: Works well when players don't compete frequently 65 | 66 | Limitations 67 | ---------- 68 | 69 | - **Complexity**: More complex to understand and implement than Elo 70 | - **Parameter Sensitivity**: Results depend on proper tuning of multiple parameters 71 | - **Computational Overhead**: Requires more calculations than Elo 72 | - **No Volatility Tracking**: Unlike Glicko-2, doesn't track how volatile a player's performance is 73 | - **Batch Updates**: Originally designed for updating ratings in batches rather than after each game 74 | 75 | Implementation in Elote 76 | ---------------------- 77 | 78 | Elote provides an implementation of the Glicko-1 rating system through the ``GlickoCompetitor`` class: 79 | 80 | .. code-block:: python 81 | 82 | from elote import GlickoCompetitor 83 | 84 | # Create two competitors with different initial ratings and RDs 85 | player1 = GlickoCompetitor(initial_rating=1500, initial_rd=350) 86 | player2 = GlickoCompetitor(initial_rating=1700, initial_rd=300) 87 | 88 | # Get win probability 89 | win_probability = player2.expected_score(player1) 90 | print(f"Player 2 win probability: {win_probability:.2%}") 91 | 92 | # Record a match result 93 | player1.beat(player2) # Player 1 won! 94 | 95 | # Ratings and RDs are automatically updated 96 | print(f"Player 1 new rating: {player1.rating}, RD: {player1.rd}") 97 | print(f"Player 2 new rating: {player2.rating}, RD: {player2.rd}") 98 | 99 | Customization 100 | ------------ 101 | 102 | The ``GlickoCompetitor`` class allows for customization of several parameters: 103 | 104 | .. 
code-block:: python 105 | 106 | # Create a competitor with custom parameters 107 | player = GlickoCompetitor( 108 | initial_rating=1500, 109 | initial_rd=350, 110 | volatility=0.06, 111 | tau=0.5 112 | ) 113 | 114 | Key parameters: 115 | - **initial_rating**: Starting rating value (default: 1500) 116 | - **initial_rd**: Starting rating deviation (default: 350) 117 | - **volatility**: How much RD increases over time (default: 0.06) 118 | - **tau**: System constant affecting rating changes (default: 0.5) 119 | 120 | Real-World Applications 121 | --------------------- 122 | 123 | The Glicko rating system is used in various competitive domains: 124 | 125 | - **Chess**: Used by the Australian Chess Federation and Free Internet Chess Server 126 | - **Video Games**: Used in modified form by many competitive games 127 | - **Online Platforms**: Used by lichess.org and other competitive platforms 128 | - **Sports Analytics**: Used for player performance analysis in various sports 129 | 130 | References 131 | --------- 132 | 133 | 1. Glickman, Mark E. (1995). "A Comprehensive Guide to Chess Ratings". American Chess Journal, 3, 59-102. 134 | 2. Glickman, Mark E. (1999). "Parameter estimation in large dynamic paired comparison experiments". Applied Statistics, 48, 377-394. 135 | 3. Glickman, Mark E. (2001). "Dynamic paired comparison models with stochastic variances". Journal of Applied Statistics, 28, 673-689. 136 | 4. [The Glicko System](http://www.glicko.net/glicko/glicko.pdf) - Original paper by Mark Glickman -------------------------------------------------------------------------------- /examples/dataset_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of using datasets with different rating algorithms. 3 | 4 | This example demonstrates how to use the datasets module to evaluate different rating algorithms. 5 | """ 6 | 7 | import time 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | 11 | from elote import ( 12 | LambdaArena, 13 | EloCompetitor, 14 | GlickoCompetitor, 15 | Glicko2Competitor, 16 | TrueSkillCompetitor, 17 | SyntheticDataset, 18 | ChessDataset, 19 | CollegeFootballDataset, 20 | train_and_evaluate_arena, 21 | ) 22 | 23 | 24 | def progress_callback(phase, current, total): 25 | """Callback function for reporting progress.""" 26 | if current == 0: 27 | print(f"\nStarting {phase} phase...") 28 | elif current == total: 29 | print(f"\nCompleted {phase} phase.") 30 | 31 | 32 | def evaluate_algorithms_on_dataset(dataset_name, dataset, test_ratio=0.2, seed=42): 33 | """ 34 | Evaluate different rating algorithms on a dataset. 
35 | 36 | Args: 37 | dataset_name: Name of the dataset 38 | dataset: Dataset object 39 | test_ratio: Ratio of data to use for testing 40 | seed: Random seed for reproducibility 41 | """ 42 | print(f"\n=== Evaluating algorithms on {dataset_name} dataset ===") 43 | 44 | # Split the dataset into train and test sets 45 | print(f"Splitting dataset with test_ratio={test_ratio}...") 46 | data_split = dataset.time_split(test_ratio=test_ratio) 47 | print(f"Split complete: {len(data_split.train)} train samples, {len(data_split.test)} test samples") 48 | 49 | # Define the algorithms to evaluate 50 | algorithms = [ 51 | ("Elo", EloCompetitor, {"initial_rating": 1500}), 52 | ("Glicko", GlickoCompetitor, {"initial_rating": 1500}), 53 | ("Glicko-2", Glicko2Competitor, {"initial_rating": 1500}), 54 | ("TrueSkill", TrueSkillCompetitor, {}), 55 | ] 56 | 57 | # Evaluate each algorithm 58 | results = [] 59 | 60 | for algo_name, competitor_class, competitor_kwargs in algorithms: 61 | print(f"\nEvaluating {algo_name}...") 62 | start_time = time.time() 63 | 64 | # Create an arena with the algorithm 65 | arena = LambdaArena( 66 | lambda a, b, attributes=None: True, # Dummy function, not used in this example 67 | base_competitor=competitor_class, 68 | base_competitor_kwargs=competitor_kwargs, 69 | ) 70 | 71 | # Train and evaluate the arena 72 | _, history = train_and_evaluate_arena( 73 | arena, 74 | data_split, 75 | batch_size=1000, 76 | progress_callback=progress_callback, 77 | ) 78 | 79 | # Calculate metrics 80 | metrics = history.calculate_metrics() 81 | accuracy = metrics["accuracy"] 82 | precision = metrics["precision"] 83 | recall = metrics["recall"] 84 | f1 = metrics["f1"] 85 | 86 | end_time = time.time() 87 | elapsed_time = end_time - start_time 88 | 89 | # Store results 90 | results.append( 91 | { 92 | "Algorithm": algo_name, 93 | "Accuracy": accuracy, 94 | "Precision": precision, 95 | "Recall": recall, 96 | "F1 Score": f1, 97 | "Time (s)": elapsed_time, 98 | } 99 | ) 100 | 101 | print(f"{algo_name} evaluation complete in {elapsed_time:.2f} seconds") 102 | print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}") 103 | 104 | # Convert results to DataFrame 105 | results_df = pd.DataFrame(results) 106 | 107 | # Print results table 108 | print("\nResults:") 109 | print(results_df.to_string(index=False)) 110 | 111 | # Plot results 112 | plt.figure(figsize=(12, 6)) 113 | 114 | # Plot accuracy, precision, recall, F1 115 | metrics = ["Accuracy", "Precision", "Recall", "F1 Score"] 116 | for i, metric in enumerate(metrics): 117 | plt.subplot(1, 2, 1) 118 | plt.bar([x + i * 0.2 for x in range(len(algorithms))], results_df[metric], width=0.2, label=metric) 119 | 120 | plt.xlabel("Algorithm") 121 | plt.ylabel("Score") 122 | plt.title(f"Performance Metrics on {dataset_name} Dataset") 123 | plt.xticks([i + 0.3 for i in range(len(algorithms))], results_df["Algorithm"]) 124 | plt.legend() 125 | plt.grid(axis="y", linestyle="--", alpha=0.7) 126 | 127 | # Plot time 128 | plt.subplot(1, 2, 2) 129 | plt.bar(results_df["Algorithm"], results_df["Time (s)"]) 130 | plt.xlabel("Algorithm") 131 | plt.ylabel("Time (s)") 132 | plt.title(f"Execution Time on {dataset_name} Dataset") 133 | plt.grid(axis="y", linestyle="--", alpha=0.7) 134 | 135 | plt.tight_layout() 136 | plt.savefig(f"{dataset_name.lower().replace(' ', '_')}_results.png") 137 | plt.close() 138 | 139 | 140 | def main(): 141 | """Main function.""" 142 | # Evaluate on synthetic dataset 143 | print("Generating synthetic dataset...") 
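    # Descriptive note: 100 competitors with normally distributed true skills (mean 1500, std 300),
    # 5000 matchups with per-game noise (std 100), ~10% draws, spread over one simulated year;
    # the fixed seed (42) keeps the generated matchups reproducible across runs.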
144 | synthetic_dataset = SyntheticDataset( 145 | num_competitors=100, 146 | num_matchups=5000, 147 | skill_distribution="normal", 148 | skill_mean=1500, 149 | skill_std=300, 150 | noise_std=100, 151 | draw_probability=0.1, 152 | time_span_days=365, 153 | seed=42, 154 | ) 155 | evaluate_algorithms_on_dataset("Synthetic", synthetic_dataset, test_ratio=0.2, seed=42) 156 | 157 | # Evaluate on chess dataset 158 | print("\nLoading chess dataset...") 159 | chess_dataset = ChessDataset(max_games=5000, year=2013, month=1) 160 | evaluate_algorithms_on_dataset("Chess", chess_dataset, test_ratio=0.2, seed=42) 161 | 162 | # Evaluate on college football dataset 163 | print("\nLoading college football dataset...") 164 | football_dataset = CollegeFootballDataset(start_year=2015, end_year=2022) 165 | evaluate_algorithms_on_dataset("College Football", football_dataset, test_ratio=0.2, seed=42) 166 | 167 | 168 | if __name__ == "__main__": 169 | main() 170 | -------------------------------------------------------------------------------- /tests/test_examples.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import subprocess 4 | import sys 5 | from pathlib import Path 6 | 7 | 8 | class TestExamples(unittest.TestCase): 9 | """Tests that verify all example scripts run without errors.""" 10 | 11 | def setUp(self): 12 | # Get the root directory of the project 13 | self.root_dir = Path(__file__).parent.parent 14 | self.examples_dir = self.root_dir / "examples" 15 | 16 | # Skip examples that require external API access or take too long to run in tests 17 | self.skip_examples = [ 18 | "use_cases/cfb_w_lib.py", # Requires external API access 19 | "use_cases/chess_w_lib.py", # Takes too long to run in tests 20 | "dataset_example.py", # Takes too long to run in tests 21 | # "persist_state_arena.py", # Times out in tests 22 | # "sample_bout.py", # Times out in tests 23 | # "colley_matrix_example.py", # Times out in tests 24 | # "bout_with_initialization.py", # Added to prevent timeout 25 | # "prediction.py", # Added to prevent timeout 26 | ] 27 | 28 | def test_example_scripts(self): 29 | """Test that all example scripts run without errors.""" 30 | # Get all Python files in the examples directory 31 | example_files = [] 32 | for root, _, files in os.walk(self.examples_dir): 33 | for file in files: 34 | if file.endswith(".py") and file != "__init__.py": 35 | rel_path = os.path.relpath(os.path.join(root, file), self.examples_dir) 36 | if rel_path not in self.skip_examples: 37 | example_files.append(rel_path) 38 | 39 | # Make sure we found some example files 40 | self.assertGreater(len(example_files), 0, "No example files found") 41 | 42 | # Run each example script and check for errors 43 | for example_file in example_files: 44 | with self.subTest(example=example_file): 45 | script_path = os.path.join(self.examples_dir, example_file) 46 | 47 | # Run the script with a timeout to prevent hanging 48 | try: 49 | result = subprocess.run( 50 | [sys.executable, script_path], 51 | capture_output=True, 52 | text=True, 53 | timeout=60, 54 | ) 55 | 56 | # Check if the script ran successfully 57 | self.assertEqual( 58 | result.returncode, 0, f"Example {example_file} failed with error:\n{result.stderr}" 59 | ) 60 | except subprocess.TimeoutExpired: 61 | self.fail(f"Example {example_file} timed out after 10 seconds") 62 | 63 | def test_individual_examples(self): 64 | """Test each example script individually with specific assertions.""" 65 | # Test sample_bout.py - 
skip if in skip_examples 66 | if "sample_bout.py" not in self.skip_examples: 67 | self._test_specific_example( 68 | "sample_bout.py", expected_output_contains=["Starting ratings:", "After matches"] 69 | ) 70 | 71 | # Test prediction.py 72 | self._test_specific_example("prediction.py", expected_output_contains=["probability of better beating good"]) 73 | 74 | # Test bout_with_ties.py 75 | self._test_specific_example( 76 | "bout_with_ties.py", expected_output_contains=["Starting ratings:", "After matches with ties"] 77 | ) 78 | 79 | # Test sample_arena.py 80 | self._test_specific_example("sample_arena.py", expected_output_contains=["Arena results"]) 81 | 82 | # Test dwz_arena.py 83 | self._test_specific_example("dwz_arena.py", expected_output_contains=["Arena results"]) 84 | 85 | # Test ecf_arena.py 86 | self._test_specific_example("ecf_arena.py", expected_output_contains=["Arena results"]) 87 | 88 | # Test glicko_arena.py 89 | self._test_specific_example("glicko_arena.py", expected_output_contains=["Arena results"]) 90 | 91 | # Test persist_state_arena.py - skip if in skip_examples 92 | if "persist_state_arena.py" not in self.skip_examples: 93 | self._test_specific_example("persist_state_arena.py", expected_output_contains=["Arena results"]) 94 | 95 | # Test bout_with_initialization.py 96 | self._test_specific_example( 97 | "bout_with_initialization.py", expected_output_contains=["Starting ratings:", "After matches"] 98 | ) 99 | 100 | # Test colley_matrix_example.py 101 | self._test_specific_example( 102 | "colley_matrix_example.py", 103 | expected_output_contains=["Initial ratings:", "Final ratings:", "Sum of all ratings"], 104 | ) 105 | 106 | # Test colley_matrix_comparison.py 107 | self._test_specific_example( 108 | "colley_matrix_comparison.py", 109 | expected_output_contains=["Simulating tournament", "Colley Matrix Method is not sensitive to match order"], 110 | ) 111 | 112 | def _test_specific_example(self, example_file, expected_output_contains): 113 | """Helper method to test a specific example with expected output.""" 114 | script_path = os.path.join(self.examples_dir, example_file) 115 | 116 | # Run the script 117 | try: 118 | result = subprocess.run([sys.executable, script_path], capture_output=True, text=True, timeout=10) 119 | 120 | # Check if the script ran successfully 121 | self.assertEqual(result.returncode, 0, f"Example {example_file} failed with error:\n{result.stderr}") 122 | 123 | # Check if the output contains expected strings 124 | for expected in expected_output_contains: 125 | self.assertIn(expected, result.stdout, f"Example {example_file} output did not contain '{expected}'") 126 | except subprocess.TimeoutExpired: 127 | self.fail(f"Example {example_file} timed out after 10 seconds") 128 | 129 | 130 | if __name__ == "__main__": 131 | unittest.main() 132 | -------------------------------------------------------------------------------- /docs/source/rating_systems/ensemble.rst: -------------------------------------------------------------------------------- 1 | Ensemble Rating System 2 | ==================== 3 | 4 | Overview 5 | -------- 6 | 7 | The Ensemble rating system in Elote is a meta-rating approach that combines multiple rating systems to leverage their individual strengths while mitigating their weaknesses. By aggregating predictions from different rating algorithms, the Ensemble system can potentially provide more robust and accurate predictions than any single rating system alone. 
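As a rough illustration of the underlying idea (not the ``EnsembleCompetitor`` API itself, which is shown below), the combined prediction is simply a weighted average of the component systems' expected scores. The weights used here are arbitrary illustrative values:

.. code-block:: python

    from elote import EloCompetitor, GlickoCompetitor

    # Component ratings for the same two players under two different systems
    elo_a, elo_b = EloCompetitor(initial_rating=1500), EloCompetitor(initial_rating=1600)
    glicko_a, glicko_b = GlickoCompetitor(initial_rating=1500), GlickoCompetitor(initial_rating=1600)

    # Weighted average of the per-system win probabilities for player A
    weights = [0.7, 0.3]  # illustrative values; they should sum to 1.0
    ensemble_expected = (
        weights[0] * elo_a.expected_score(elo_b)
        + weights[1] * glicko_a.expected_score(glicko_b)
    )
    print(f"Ensemble expected score for player A: {ensemble_expected:.2%}")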
8 | 9 | This approach is inspired by ensemble methods in machine learning, where combining multiple models often leads to better performance than any individual model. The Ensemble competitor in Elote allows you to combine any of the implemented rating systems (Elo, Glicko, ECF, DWZ) with customizable weights. 10 | 11 | How It Works 12 | ----------- 13 | 14 | The Ensemble rating system works by: 15 | 16 | 1. Maintaining multiple rating systems for each competitor 17 | 2. Calculating expected outcomes from each system 18 | 3. Combining these predictions using a weighted average 19 | 4. Updating each underlying rating system after matches 20 | 21 | The expected outcome calculation is: 22 | 23 | .. math:: 24 | 25 | E_{ensemble} = \sum_{i=1}^{n} w_i \times E_i 26 | 27 | Where: 28 | - :math:`E_{ensemble}` is the ensemble expected score 29 | - :math:`E_i` is the expected score from rating system i 30 | - :math:`w_i` is the weight assigned to rating system i 31 | - :math:`n` is the number of rating systems in the ensemble 32 | 33 | After a match, each underlying rating system is updated according to its own update rules, and the ensemble prediction is recalculated. 34 | 35 | Advantages 36 | --------- 37 | 38 | - **Robustness**: Less sensitive to the quirks of any single rating system 39 | - **Accuracy**: Can achieve better predictive performance by combining complementary systems 40 | - **Flexibility**: Can be customized with different component systems and weights 41 | - **Adaptability**: Works well across different domains and competition structures 42 | - **Graceful Degradation**: If one system performs poorly in a specific scenario, others can compensate 43 | 44 | Limitations 45 | ---------- 46 | 47 | - **Complexity**: More complex to implement and understand than single rating systems 48 | - **Computational Overhead**: Requires calculating and updating multiple rating systems 49 | - **Parameter Tuning**: Finding optimal weights may require experimentation 50 | - **Black Box Nature**: The combined prediction may be harder to interpret 51 | - **Cold Start**: Requires sufficient data to properly calibrate all component systems 52 | 53 | Implementation in Elote 54 | ---------------------- 55 | 56 | Elote provides an implementation of the Ensemble rating system through the ``EnsembleCompetitor`` class: 57 | 58 | .. code-block:: python 59 | 60 | from elote import EnsembleCompetitor 61 | from elote import EloCompetitor, GlickoCompetitor 62 | 63 | # Create an ensemble with Elo and Glicko components 64 | player1 = EnsembleCompetitor( 65 | rating_systems=[ 66 | (EloCompetitor(initial_rating=1500), 0.7), 67 | (GlickoCompetitor(initial_rating=1500, initial_rd=350), 0.3) 68 | ] 69 | ) 70 | 71 | player2 = EnsembleCompetitor( 72 | rating_systems=[ 73 | (EloCompetitor(initial_rating=1600), 0.7), 74 | (GlickoCompetitor(initial_rating=1600, initial_rd=350), 0.3) 75 | ] 76 | ) 77 | 78 | # Get win probability 79 | win_probability = player2.expected_score(player1) 80 | print(f"Player 2 win probability: {win_probability:.2%}") 81 | 82 | # Record a match result 83 | player1.beat(player2) # Player 1 won! 84 | 85 | # All underlying ratings are automatically updated 86 | print(f"Player 1 ensemble expected score vs Player 2: {player1.expected_score(player2):.2%}") 87 | 88 | Customization 89 | ------------ 90 | 91 | The ``EnsembleCompetitor`` class allows for extensive customization: 92 | 93 | .. 
code-block:: python 94 | 95 | from elote import EnsembleCompetitor, EloCompetitor, GlickoCompetitor, ECFCompetitor, DWZCompetitor 96 | 97 | # Create an ensemble with all available rating systems 98 | player = EnsembleCompetitor( 99 | rating_systems=[ 100 | (EloCompetitor(initial_rating=1500), 0.4), 101 | (GlickoCompetitor(initial_rating=1500), 0.3), 102 | (ECFCompetitor(initial_rating=120), 0.2), 103 | (DWZCompetitor(initial_rating=1500), 0.1) 104 | ] 105 | ) 106 | 107 | Key considerations: 108 | - The weights should sum to 1.0 for proper probabilistic interpretation 109 | - Higher weights give more influence to that rating system 110 | - You can include any combination of rating systems 111 | - Each component can be customized with its own parameters 112 | 113 | Choosing Weights 114 | -------------- 115 | 116 | There are several approaches to choosing weights for your ensemble: 117 | 118 | 1. **Equal Weights**: Start with equal weights for all systems 119 | 2. **Domain Knowledge**: Assign weights based on known performance in your domain 120 | 3. **Cross-Validation**: Use historical data to find optimal weights 121 | 4. **Adaptive Weights**: Dynamically adjust weights based on each system's performance 122 | 123 | For most applications, starting with equal weights and then adjusting based on observed performance is a practical approach. 124 | 125 | Real-World Applications 126 | --------------------- 127 | 128 | Ensemble rating systems are valuable in: 129 | 130 | - **Sports Analytics**: Combining multiple models for more accurate predictions 131 | - **Game Matchmaking**: Creating balanced matches in competitive games 132 | - **Recommendation Systems**: Ranking items based on multiple criteria 133 | - **Tournament Design**: Seeding players based on robust ratings 134 | - **Decision Making**: Aggregating multiple ranking methods for group decisions 135 | 136 | References 137 | --------- 138 | 139 | 1. Dietterich, T. G. (2000). "Ensemble Methods in Machine Learning". Multiple Classifier Systems, 1-15. 140 | 2. Seni, G., & Elder, J. F. (2010). "Ensemble Methods in Data Mining: Improving Accuracy Through Combining Predictions". Synthesis Lectures on Data Mining and Knowledge Discovery, 2(1), 1-126. 141 | 3. Graepel, T., Herbrich, R., & Gold, J. (2004). "Learning to Fight". Proceedings of the International Conference on Computer Games: Artificial Intelligence, Design and Education. 
-------------------------------------------------------------------------------- /tests/test_EloCompetitor_known_values.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from elote import EloCompetitor 3 | 4 | 5 | class TestEloKnownValues(unittest.TestCase): 6 | """Tests for EloCompetitor with known values to verify correctness after optimization.""" 7 | 8 | def test_initial_rating(self): 9 | """Test that initial rating is set correctly.""" 10 | player = EloCompetitor(initial_rating=400) 11 | self.assertEqual(player.rating, 400) 12 | 13 | player = EloCompetitor(initial_rating=1000) 14 | self.assertEqual(player.rating, 1000) 15 | 16 | def test_transformed_rating(self): 17 | """Test that transformed_rating property returns the correct value.""" 18 | player = EloCompetitor(initial_rating=400) 19 | expected = 10 ** (400 / 400) # Should be 10^1 = 10 20 | self.assertEqual(player.transformed_rating, expected) 21 | 22 | player = EloCompetitor(initial_rating=800) 23 | expected = 10 ** (800 / 400) # Should be 10^2 = 100 24 | self.assertEqual(player.transformed_rating, expected) 25 | 26 | def test_expected_score(self): 27 | """Test expected_score with known values.""" 28 | player1 = EloCompetitor(initial_rating=400) 29 | player2 = EloCompetitor(initial_rating=400) 30 | 31 | # Equal ratings should give 0.5 expected score 32 | self.assertEqual(player1.expected_score(player2), 0.5) 33 | 34 | player1 = EloCompetitor(initial_rating=400) 35 | player2 = EloCompetitor(initial_rating=600) 36 | 37 | # Calculate expected values manually 38 | p1_transformed = 10 ** (400 / 400) # 10 39 | p2_transformed = 10 ** (600 / 400) # 10^1.5 ≈ 31.6228 40 | expected = p1_transformed / (p1_transformed + p2_transformed) 41 | 42 | self.assertAlmostEqual(player1.expected_score(player2), expected) 43 | self.assertAlmostEqual(player2.expected_score(player1), 1 - expected) 44 | 45 | def test_beat_with_known_values(self): 46 | """Test beat method with known values.""" 47 | player1 = EloCompetitor(initial_rating=400, k_factor=32) 48 | player2 = EloCompetitor(initial_rating=400, k_factor=32) 49 | 50 | # Store original ratings 51 | p1_original = player1.rating 52 | p2_original = player2.rating 53 | 54 | # Calculate expected scores 55 | win_es = player1.expected_score(player2) # Should be 0.5 56 | lose_es = player2.expected_score(player1) # Should be 0.5 57 | 58 | # Calculate expected new ratings 59 | p1_new_rating = p1_original + 32 * (1 - win_es) # 400 + 32 * 0.5 = 416 60 | p2_new_rating = p2_original + 32 * (0 - lose_es) # 400 + 32 * -0.5 = 384 61 | 62 | # Player1 beats player2 63 | player1.beat(player2) 64 | 65 | # Check new ratings 66 | self.assertAlmostEqual(player1.rating, p1_new_rating) 67 | self.assertAlmostEqual(player2.rating, p2_new_rating) 68 | 69 | # Test with different ratings 70 | player1 = EloCompetitor(initial_rating=500, k_factor=32) 71 | player2 = EloCompetitor(initial_rating=400, k_factor=32) 72 | 73 | # Store original ratings 74 | p1_original = player1.rating 75 | p2_original = player2.rating 76 | 77 | # Calculate expected scores 78 | win_es = player1.expected_score(player2) 79 | lose_es = player2.expected_score(player1) 80 | 81 | # Calculate expected new ratings 82 | p1_new_rating = p1_original + 32 * (1 - win_es) 83 | p2_new_rating = p2_original + 32 * (0 - lose_es) 84 | 85 | # Player1 beats player2 86 | player1.beat(player2) 87 | 88 | # Check new ratings 89 | self.assertAlmostEqual(player1.rating, p1_new_rating) 90 | 
self.assertAlmostEqual(player2.rating, p2_new_rating) 91 | 92 | def test_tied_with_known_values(self): 93 | """Test tied method with known values.""" 94 | player1 = EloCompetitor(initial_rating=400, k_factor=32) 95 | player2 = EloCompetitor(initial_rating=400, k_factor=32) 96 | 97 | # Store original ratings 98 | p1_original = player1.rating 99 | p2_original = player2.rating 100 | 101 | # Calculate expected scores 102 | win_es = player1.expected_score(player2) # Should be 0.5 103 | lose_es = player2.expected_score(player1) # Should be 0.5 104 | 105 | # Calculate expected new ratings 106 | p1_new_rating = p1_original + 32 * (0.5 - win_es) # 400 + 32 * 0 = 400 107 | p2_new_rating = p2_original + 32 * (0.5 - lose_es) # 400 + 32 * 0 = 400 108 | 109 | # Players tie 110 | player1.tied(player2) 111 | 112 | # Check new ratings - should be unchanged for equal ratings 113 | self.assertAlmostEqual(player1.rating, p1_new_rating) 114 | self.assertAlmostEqual(player2.rating, p2_new_rating) 115 | 116 | # Test with different ratings 117 | player1 = EloCompetitor(initial_rating=500, k_factor=32) 118 | player2 = EloCompetitor(initial_rating=400, k_factor=32) 119 | 120 | # Store original ratings 121 | p1_original = player1.rating 122 | p2_original = player2.rating 123 | 124 | # Calculate expected scores 125 | win_es = player1.expected_score(player2) 126 | lose_es = player2.expected_score(player1) 127 | 128 | # Calculate expected new ratings 129 | p1_new_rating = p1_original + 32 * (0.5 - win_es) # Should decrease for higher rated player 130 | p2_new_rating = p2_original + 32 * (0.5 - lose_es) # Should increase for lower rated player 131 | 132 | # Players tie 133 | player1.tied(player2) 134 | 135 | # Check new ratings 136 | self.assertAlmostEqual(player1.rating, p1_new_rating) 137 | self.assertAlmostEqual(player2.rating, p2_new_rating) 138 | 139 | def test_k_factor_effect(self): 140 | """Test that k_factor affects the rating change magnitude.""" 141 | # With k_factor = 32 142 | player1 = EloCompetitor(initial_rating=400, k_factor=32) 143 | player2 = EloCompetitor(initial_rating=400, k_factor=32) 144 | player1.beat(player2) 145 | rating_change_32 = abs(player1.rating - 400) 146 | 147 | # With k_factor = 16 148 | player1 = EloCompetitor(initial_rating=400, k_factor=16) 149 | player2 = EloCompetitor(initial_rating=400, k_factor=16) 150 | player1.beat(player2) 151 | rating_change_16 = abs(player1.rating - 400) 152 | 153 | # The rating change with k_factor=32 should be twice the change with k_factor=16 154 | self.assertAlmostEqual(rating_change_32, 2 * rating_change_16) 155 | 156 | 157 | if __name__ == "__main__": 158 | unittest.main() 159 | -------------------------------------------------------------------------------- /docs/source/contributing.rst: -------------------------------------------------------------------------------- 1 | Contributing to Elote 2 | ================== 3 | 4 | Thank you for your interest in contributing to Elote! This guide will help you get started with contributing to the project. 5 | 6 | Setting Up Your Development Environment 7 | ------------------------------------- 8 | 9 | 1. **Fork the Repository** 10 | 11 | Start by forking the Elote repository on GitHub. 12 | 13 | 2. **Clone Your Fork** 14 | 15 | .. code-block:: bash 16 | 17 | git clone https://github.com/your-username/elote.git 18 | cd elote 19 | 20 | 3. **Set Up Development Environment** 21 | 22 | .. 
code-block:: bash 23 | 24 | # Using Make (recommended) 25 | make install-dev 26 | 27 | # Or using pip 28 | pip install -e ".[dev]" 29 | 30 | # Or using uv 31 | uv pip install -e ".[dev]" 32 | 33 | 4. **Set Up Pre-commit Hooks** 34 | 35 | .. code-block:: bash 36 | 37 | pre-commit install 38 | 39 | Development Workflow 40 | ------------------ 41 | 42 | 1. **Create a Branch** 43 | 44 | Create a branch for your feature or bugfix: 45 | 46 | .. code-block:: bash 47 | 48 | git checkout -b feature/your-feature-name 49 | # or 50 | git checkout -b fix/your-bugfix-name 51 | 52 | 2. **Make Your Changes** 53 | 54 | Implement your changes, following the code style guidelines. 55 | 56 | 3. **Run Tests** 57 | 58 | Make sure your changes pass all tests: 59 | 60 | .. code-block:: bash 61 | 62 | # Run all tests 63 | make test 64 | 65 | # Run tests with coverage 66 | make test-cov 67 | 68 | 4. **Lint Your Code** 69 | 70 | Ensure your code follows the project's style guidelines: 71 | 72 | .. code-block:: bash 73 | 74 | # Check code style 75 | make lint 76 | 77 | # Auto-fix some linting issues 78 | make lint-fix 79 | 80 | # Format code 81 | make format 82 | 83 | 5. **Commit Your Changes** 84 | 85 | Follow the conventional commits format for your commit messages: 86 | 87 | .. code-block:: bash 88 | 89 | git commit -m "feat: add new feature" 90 | # or 91 | git commit -m "fix: resolve issue with X" 92 | 93 | 6. **Push Your Changes** 94 | 95 | Push your changes to your fork: 96 | 97 | .. code-block:: bash 98 | 99 | git push origin feature/your-feature-name 100 | 101 | 7. **Create a Pull Request** 102 | 103 | Open a pull request from your fork to the main Elote repository. 104 | 105 | Code Style Guidelines 106 | ------------------- 107 | 108 | Elote follows these code style guidelines: 109 | 110 | - Use PEP 8 for Python code style 111 | - Use docstrings for all public functions, classes, and methods 112 | - Write clear, descriptive variable and function names 113 | - Include type hints where appropriate 114 | - Keep functions focused on a single responsibility 115 | - Write unit tests for new functionality 116 | 117 | Adding a New Rating System 118 | ------------------------ 119 | 120 | To add a new rating system to Elote: 121 | 122 | 1. Create a new file in the `elote/competitors` directory 123 | 2. Implement a new class that inherits from `BaseCompetitor` 124 | 3. Implement the required methods: 125 | - `expected_score(competitor)` 126 | - `update_rating(competitor, score)` 127 | 4. Add tests for your rating system in the `tests` directory 128 | 5. Update the documentation to include your new rating system 129 | 130 | Here's a template for a new rating system: 131 | 132 | .. code-block:: python 133 | 134 | from elote.competitors.base import BaseCompetitor 135 | 136 | class NewRatingCompetitor(BaseCompetitor): 137 | def __init__(self, initial_rating=1500, **kwargs): 138 | self.rating = initial_rating 139 | # Initialize other parameters 140 | 141 | def expected_score(self, competitor): 142 | """ 143 | Calculate the expected score (probability of winning) against another competitor. 144 | 145 | Args: 146 | competitor: The opponent NewRatingCompetitor 147 | 148 | Returns: 149 | float: The probability of winning (between 0 and 1) 150 | """ 151 | # Implement the expected score calculation 152 | pass 153 | 154 | def update_rating(self, competitor, score): 155 | """ 156 | Update the rating based on a match result. 
157 | 158 | Args: 159 | competitor: The opponent NewRatingCompetitor 160 | score: The actual score (1 for win, 0.5 for draw, 0 for loss) 161 | """ 162 | # Implement the rating update logic 163 | pass 164 | 165 | Documentation 166 | ------------ 167 | 168 | When adding new features or making significant changes, please update the documentation: 169 | 170 | 1. Add or update docstrings for all public functions, classes, and methods 171 | 2. Update the relevant RST files in the `docs/source` directory 172 | 3. If adding a new rating system, create a new RST file in `docs/source/rating_systems` 173 | 4. Build and check the documentation locally: 174 | 175 | .. code-block:: bash 176 | 177 | make docs 178 | # Open docs/build/html/index.html in your browser 179 | 180 | Testing 181 | ------ 182 | 183 | Elote uses pytest for testing. When adding new features: 184 | 185 | 1. Write unit tests for your new code 186 | 2. Ensure all existing tests pass 187 | 3. Aim for high test coverage 188 | 189 | .. code-block:: bash 190 | 191 | # Run tests 192 | make test 193 | 194 | # Run tests with coverage 195 | make test-cov 196 | 197 | Reporting Issues 198 | -------------- 199 | 200 | If you find a bug or have a feature request: 201 | 202 | 1. Check if the issue already exists in the GitHub issues 203 | 2. If not, create a new issue with: 204 | - A clear title and description 205 | - Steps to reproduce (for bugs) 206 | - Expected and actual behavior (for bugs) 207 | - Any relevant code snippets or error messages 208 | 209 | Pull Request Process 210 | ------------------ 211 | 212 | 1. Ensure your code passes all tests and linting checks 213 | 2. Update the documentation if needed 214 | 3. Add an entry to the CHANGELOG.md file 215 | 4. Submit your pull request with a clear description of the changes 216 | 5. Wait for review and address any feedback 217 | 218 | Code of Conduct 219 | ------------- 220 | 221 | Please note that Elote has a Code of Conduct. By participating in this project, you agree to abide by its terms. 222 | 223 | Thank You! 224 | --------- 225 | 226 | Your contributions are what make the open-source community such an amazing place to learn, inspire, and create. Any contribution you make is greatly appreciated! 
-------------------------------------------------------------------------------- /tests/test_visualization.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import tempfile 4 | import matplotlib.pyplot as plt 5 | from elote.visualization import ( 6 | plot_rating_system_comparison, 7 | plot_optimized_accuracy_comparison, 8 | plot_accuracy_by_prior_bouts, 9 | ) 10 | 11 | 12 | class TestVisualization(unittest.TestCase): 13 | def setUp(self): 14 | """Set up test data for visualization functions.""" 15 | # Sample results for rating system comparison 16 | self.results = [ 17 | { 18 | "name": "System A", 19 | "accuracy": 0.75, 20 | "precision": 0.80, 21 | "recall": 0.70, 22 | "f1": 0.75, 23 | "optimized_accuracy": 0.78, 24 | }, 25 | { 26 | "name": "System B", 27 | "accuracy": 0.65, 28 | "precision": 0.70, 29 | "recall": 0.60, 30 | "f1": 0.65, 31 | "optimized_accuracy": 0.68, 32 | }, 33 | { 34 | "name": "System C", 35 | "accuracy": 0.85, 36 | "precision": 0.90, 37 | "recall": 0.80, 38 | "f1": 0.85, 39 | "optimized_accuracy": 0.88, 40 | }, 41 | ] 42 | 43 | # Sample data for accuracy by prior bouts 44 | self.accuracy_by_prior_bouts = { 45 | "System A": { 46 | "binned": { 47 | 0: {"accuracy": 0.60, "total": 100, "min_bouts": 0, "max_bouts": 4}, 48 | 1: {"accuracy": 0.70, "total": 100, "min_bouts": 5, "max_bouts": 9}, 49 | 2: {"accuracy": 0.80, "total": 100, "min_bouts": 10, "max_bouts": 14}, 50 | } 51 | }, 52 | "System B": { 53 | "binned": { 54 | 0: {"accuracy": 0.55, "total": 100, "min_bouts": 0, "max_bouts": 4}, 55 | 1: {"accuracy": 0.65, "total": 100, "min_bouts": 5, "max_bouts": 9}, 56 | 2: {"accuracy": 0.75, "total": 100, "min_bouts": 10, "max_bouts": 14}, 57 | } 58 | }, 59 | } 60 | 61 | # Create a temporary directory for saving plots 62 | self.temp_dir = tempfile.mkdtemp() 63 | 64 | def tearDown(self): 65 | """Clean up after tests.""" 66 | # Close all matplotlib figures 67 | plt.close("all") 68 | 69 | # Remove temporary files 70 | for filename in os.listdir(self.temp_dir): 71 | os.remove(os.path.join(self.temp_dir, filename)) 72 | os.rmdir(self.temp_dir) 73 | 74 | def test_plot_rating_system_comparison(self): 75 | """Test that plot_rating_system_comparison creates a figure.""" 76 | # Test without saving 77 | fig = plot_rating_system_comparison(self.results) 78 | self.assertIsNotNone(fig) 79 | self.assertEqual(len(fig.axes), 4) # 2x2 grid of subplots 80 | 81 | # Test with saving 82 | save_path = os.path.join(self.temp_dir, "rating_comparison.png") 83 | fig = plot_rating_system_comparison(self.results, save_path=save_path) 84 | self.assertTrue(os.path.exists(save_path)) 85 | 86 | # Test with custom figsize and title 87 | fig = plot_rating_system_comparison(self.results, figsize=(10, 8), title="Custom Title") 88 | self.assertEqual(fig.get_figwidth(), 10) 89 | self.assertEqual(fig.get_figheight(), 8) 90 | self.assertEqual(fig._suptitle.get_text(), "Custom Title") 91 | 92 | def test_plot_optimized_accuracy_comparison(self): 93 | """Test that plot_optimized_accuracy_comparison creates a figure.""" 94 | # Test without saving 95 | fig = plot_optimized_accuracy_comparison(self.results) 96 | self.assertIsNotNone(fig) 97 | 98 | # Test with saving 99 | save_path = os.path.join(self.temp_dir, "optimized_accuracy.png") 100 | fig = plot_optimized_accuracy_comparison(self.results, save_path=save_path) 101 | self.assertTrue(os.path.exists(save_path)) 102 | 103 | # Test with custom figsize and title 104 | fig = 
plot_optimized_accuracy_comparison(self.results, figsize=(8, 6), title="Custom Title") 105 | self.assertEqual(fig.get_figwidth(), 8) 106 | self.assertEqual(fig.get_figheight(), 6) 107 | self.assertEqual(fig.axes[0].get_title(), "Custom Title") 108 | 109 | def test_plot_accuracy_by_prior_bouts_with_binned_data(self): 110 | """Test that plot_accuracy_by_prior_bouts works with binned data.""" 111 | # Test without saving 112 | fig = plot_accuracy_by_prior_bouts(self.accuracy_by_prior_bouts) 113 | self.assertIsNotNone(fig) 114 | 115 | # Test with saving 116 | save_path = os.path.join(self.temp_dir, "accuracy_by_bouts.png") 117 | fig = plot_accuracy_by_prior_bouts(self.accuracy_by_prior_bouts, save_path=save_path) 118 | self.assertTrue(os.path.exists(save_path)) 119 | 120 | # Test with custom parameters 121 | fig = plot_accuracy_by_prior_bouts(self.accuracy_by_prior_bouts, figsize=(12, 8), title="Custom Title") 122 | self.assertEqual(fig.get_figwidth(), 12) 123 | self.assertEqual(fig.get_figheight(), 8) 124 | self.assertEqual(fig.axes[0].get_title(), "Custom Title") 125 | 126 | def test_plot_accuracy_by_prior_bouts_with_empty_data(self): 127 | """Test that plot_accuracy_by_prior_bouts handles empty data gracefully.""" 128 | # Empty data 129 | empty_data = {} 130 | fig = plot_accuracy_by_prior_bouts(empty_data) 131 | self.assertIsNotNone(fig) 132 | 133 | # Data with empty bins 134 | data_with_empty_bins = {"System A": {"binned": {}}} 135 | fig = plot_accuracy_by_prior_bouts(data_with_empty_bins) 136 | self.assertIsNotNone(fig) 137 | 138 | def test_plot_functions_with_invalid_data(self): 139 | """Test that visualization functions handle invalid data gracefully.""" 140 | # Invalid results for rating system comparison 141 | invalid_results = [ 142 | {"name": "System A"} # Missing metrics 143 | ] 144 | 145 | # Should not raise an error, but might not plot anything 146 | fig = plot_rating_system_comparison(invalid_results) 147 | self.assertIsNotNone(fig) 148 | 149 | # Invalid results for optimized accuracy comparison 150 | fig = plot_optimized_accuracy_comparison(invalid_results) 151 | self.assertIsNotNone(fig) 152 | 153 | # Invalid data for accuracy by prior bouts 154 | invalid_bout_data = { 155 | "System A": {} # Missing binned data 156 | } 157 | fig = plot_accuracy_by_prior_bouts(invalid_bout_data) 158 | self.assertIsNotNone(fig) 159 | 160 | 161 | if __name__ == "__main__": 162 | unittest.main() 163 | -------------------------------------------------------------------------------- /elote/datasets/synthetic.py: -------------------------------------------------------------------------------- 1 | """ 2 | Synthetic data generator for elote. 3 | 4 | This module provides a synthetic data generator for testing and evaluating different rating algorithms. 5 | """ 6 | 7 | import datetime 8 | import random 9 | import numpy as np 10 | from typing import List, Tuple, Dict, Any, Optional 11 | 12 | from elote.datasets.base import BaseDataset 13 | 14 | 15 | class SyntheticDataset(BaseDataset): 16 | """ 17 | Synthetic data generator for testing and evaluating different rating algorithms. 18 | 19 | This dataset generates random matchups between competitors with configurable parameters. 20 | The outcome of each matchup is determined by the true skill of each competitor plus some noise. 
21 | """ 22 | 23 | def __init__( 24 | self, 25 | num_competitors: int = 100, 26 | num_matchups: int = 1000, 27 | skill_distribution: str = "normal", 28 | skill_mean: float = 1500, 29 | skill_std: float = 300, 30 | noise_std: float = 100, 31 | draw_probability: float = 0.1, 32 | time_span_days: int = 365, 33 | seed: Optional[int] = None, 34 | cache_dir: Optional[str] = None, 35 | max_memory_mb: int = 1024, 36 | ): 37 | """ 38 | Initialize a synthetic dataset generator. 39 | 40 | Args: 41 | num_competitors: Number of competitors to generate 42 | num_matchups: Number of matchups to generate 43 | skill_distribution: Distribution of true skills ("normal", "uniform", or "pareto") 44 | skill_mean: Mean of the skill distribution (for normal distribution) 45 | skill_std: Standard deviation of the skill distribution (for normal distribution) 46 | noise_std: Standard deviation of the noise added to skills during matchups 47 | draw_probability: Probability of a draw when competitors are closely matched 48 | time_span_days: Number of days to spread the matchups over 49 | seed: Random seed for reproducibility 50 | cache_dir: Directory to cache data (not used for synthetic data) 51 | max_memory_mb: Maximum memory usage in MB for dataset operations 52 | """ 53 | super().__init__(cache_dir=cache_dir, max_memory_mb=max_memory_mb) 54 | self.num_competitors = num_competitors 55 | self.num_matchups = num_matchups 56 | self.skill_distribution = skill_distribution 57 | self.skill_mean = skill_mean 58 | self.skill_std = skill_std 59 | self.noise_std = noise_std 60 | self.draw_probability = draw_probability 61 | self.time_span_days = time_span_days 62 | self.seed = seed 63 | 64 | # Set random seed if provided 65 | if seed is not None: 66 | random.seed(seed) 67 | np.random.seed(seed) 68 | 69 | def download(self) -> None: 70 | """ 71 | No download needed for synthetic data. 72 | """ 73 | pass 74 | 75 | def _generate_skills(self) -> Dict[str, float]: 76 | """ 77 | Generate true skills for all competitors. 78 | 79 | Returns: 80 | Dictionary mapping competitor IDs to their true skills 81 | """ 82 | skills = {} 83 | 84 | for i in range(self.num_competitors): 85 | competitor_id = f"competitor_{i}" 86 | 87 | if self.skill_distribution == "normal": 88 | skill = np.random.normal(self.skill_mean, self.skill_std) 89 | elif self.skill_distribution == "uniform": 90 | skill = np.random.uniform( 91 | self.skill_mean - self.skill_std * 1.73, # Matching variance of normal 92 | self.skill_mean + self.skill_std * 1.73, 93 | ) 94 | elif self.skill_distribution == "pareto": 95 | # Pareto distribution for more realistic skill distribution with few very skilled competitors 96 | skill = np.random.pareto(3) * self.skill_std + self.skill_mean - self.skill_std 97 | else: 98 | raise ValueError(f"Unknown skill distribution: {self.skill_distribution}") 99 | 100 | skills[competitor_id] = skill 101 | 102 | return skills 103 | 104 | def _generate_matchups( 105 | self, skills: Dict[str, float] 106 | ) -> List[Tuple[str, str, float, datetime.datetime, Dict[str, Any]]]: 107 | """ 108 | Generate random matchups between competitors. 
109 | 110 | Args: 111 | skills: Dictionary mapping competitor IDs to their true skills 112 | 113 | Returns: 114 | List of matchup tuples (competitor_a, competitor_b, outcome, timestamp, attributes) 115 | """ 116 | matchups = [] 117 | competitors = list(skills.keys()) 118 | 119 | # Generate timestamps spanning the time_span_days 120 | start_date = datetime.datetime.now() - datetime.timedelta(days=self.time_span_days) 121 | 122 | for _i in range(self.num_matchups): 123 | # Select two random competitors 124 | a, b = random.sample(competitors, 2) 125 | 126 | # Generate a timestamp 127 | days_offset = random.uniform(0, self.time_span_days) 128 | timestamp = start_date + datetime.timedelta(days=days_offset) 129 | 130 | # Determine the outcome based on true skills plus noise 131 | skill_a = skills[a] + np.random.normal(0, self.noise_std) 132 | skill_b = skills[b] + np.random.normal(0, self.noise_std) 133 | 134 | # Calculate skill difference and normalize 135 | skill_diff = skill_a - skill_b 136 | 137 | # Determine if it's a draw 138 | if abs(skill_diff) < self.noise_std and random.random() < self.draw_probability: 139 | outcome = 0.5 # Draw 140 | else: 141 | outcome = 1.0 if skill_diff > 0 else 0.0 142 | 143 | # Add attributes with true skills for evaluation 144 | attributes = { 145 | "true_skill_a": skills[a], 146 | "true_skill_b": skills[b], 147 | "skill_diff": skill_diff, 148 | } 149 | 150 | matchups.append((a, b, outcome, timestamp, attributes)) 151 | 152 | # Sort by timestamp 153 | matchups.sort(key=lambda x: x[3]) 154 | 155 | return matchups 156 | 157 | def load(self) -> List[Tuple[str, str, float, datetime.datetime, Dict[str, Any]]]: 158 | """ 159 | Generate and load the synthetic dataset. 160 | 161 | Returns: 162 | List of matchup tuples (competitor_a, competitor_b, outcome, timestamp, attributes) 163 | """ 164 | # Generate true skills for all competitors 165 | skills = self._generate_skills() 166 | 167 | # Generate random matchups 168 | matchups = self._generate_matchups(skills) 169 | 170 | return matchups 171 | -------------------------------------------------------------------------------- /tests/test_Arenas.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from elote import LambdaArena, EloCompetitor 3 | 4 | 5 | class TestArenas(unittest.TestCase): 6 | def test_lambda_arena_initialization(self): 7 | """Test that the LambdaArena initializes correctly with different parameters.""" 8 | # Test with default parameters 9 | arena = LambdaArena(lambda a, b: a > b) 10 | self.assertEqual(len(arena.competitors), 0) 11 | self.assertEqual(arena.base_competitor, EloCompetitor) 12 | self.assertEqual(arena.base_competitor_kwargs, {}) 13 | 14 | # Test with custom competitor class parameters 15 | arena = LambdaArena(lambda a, b: a > b, base_competitor_kwargs={"initial_rating": 1000}) 16 | self.assertEqual(arena.base_competitor_kwargs, {"initial_rating": 1000}) 17 | 18 | # Test with initial state 19 | initial_state = {"A": {"initial_rating": 1200}, "B": {"initial_rating": 800}} 20 | arena = LambdaArena(lambda a, b: a > b, initial_state=initial_state) 21 | self.assertEqual(len(arena.competitors), 2) 22 | self.assertIn("A", arena.competitors) 23 | self.assertIn("B", arena.competitors) 24 | self.assertEqual(arena.competitors["A"].rating, 1200) 25 | self.assertEqual(arena.competitors["B"].rating, 800) 26 | 27 | def test_lambda_arena_matchup(self): 28 | """Test that matchups work correctly in the LambdaArena.""" 29 | 30 | # Create a simple comparison 
function 31 | def compare(a, b, attributes=None): 32 | if attributes and "force_winner" in attributes: 33 | return attributes["force_winner"] == a 34 | return a > b 35 | 36 | arena = LambdaArena(compare) 37 | 38 | # Test a simple matchup where a > b 39 | arena.matchup(10, 5) 40 | self.assertEqual(len(arena.competitors), 2) 41 | self.assertIn(10, arena.competitors) 42 | self.assertIn(5, arena.competitors) 43 | 44 | # The winner's rating should be higher than the initial rating 45 | initial_rating = EloCompetitor().rating 46 | self.assertGreater(arena.competitors[10].rating, initial_rating) 47 | 48 | # Test a matchup with attributes 49 | arena.matchup("X", "Y", attributes={"force_winner": "Y"}) 50 | self.assertEqual(len(arena.competitors), 4) 51 | self.assertIn("X", arena.competitors) 52 | self.assertIn("Y", arena.competitors) 53 | 54 | # Y should have won despite X normally being greater alphabetically 55 | self.assertGreater(arena.competitors["Y"].rating, initial_rating) 56 | 57 | def test_lambda_arena_tournament(self): 58 | """Test that tournaments work correctly in the LambdaArena.""" 59 | arena = LambdaArena(lambda a, b: a > b) 60 | 61 | # Create a tournament with multiple matchups 62 | matchups = [(10, 5), (15, 8), (5, 3), (8, 10)] 63 | 64 | arena.tournament(matchups) 65 | 66 | # Check that all competitors are in the arena 67 | self.assertEqual(len(arena.competitors), 5) 68 | for competitor in [3, 5, 8, 10, 15]: 69 | self.assertIn(competitor, arena.competitors) 70 | 71 | # Check that the history has recorded all bouts 72 | self.assertEqual(len(arena.history.bouts), 4) 73 | 74 | def test_lambda_arena_expected_score(self): 75 | """Test that expected scores are calculated correctly.""" 76 | arena = LambdaArena(lambda a, b: a > b) 77 | 78 | # Add some competitors with different ratings 79 | initial_state = {"A": {"initial_rating": 1200}, "B": {"initial_rating": 800}} 80 | arena = LambdaArena(lambda a, b: a > b, initial_state=initial_state) 81 | 82 | # A should have a higher expected score against B 83 | self.assertGreater(arena.expected_score("A", "B"), 0.5) 84 | self.assertLess(arena.expected_score("B", "A"), 0.5) 85 | 86 | # Test with new competitors that aren't in the arena yet 87 | score_c_d = arena.expected_score("C", "D") 88 | self.assertAlmostEqual(score_c_d, 0.5, places=2) 89 | 90 | # Now they should be in the arena with default ratings 91 | self.assertIn("C", arena.competitors) 92 | self.assertIn("D", arena.competitors) 93 | 94 | def test_lambda_arena_export_state(self): 95 | """Test that the arena state can be exported correctly.""" 96 | arena = LambdaArena(lambda a, b: a > b) 97 | 98 | # Add some competitors and run some matchups 99 | arena.matchup("A", "B") 100 | arena.matchup("B", "C") 101 | arena.matchup("A", "C") 102 | 103 | # Export the state 104 | state = arena.export_state() 105 | 106 | # Check that all competitors are in the state 107 | self.assertEqual(len(state), 3) 108 | self.assertIn("A", state) 109 | self.assertIn("B", state) 110 | self.assertIn("C", state) 111 | 112 | # Check that the state contains the correct information 113 | for competitor in ["A", "B", "C"]: 114 | self.assertIn("initial_rating", state[competitor]) 115 | self.assertIn("class_vars", state[competitor]) 116 | 117 | def test_lambda_arena_leaderboard(self): 118 | """Test that the leaderboard is generated correctly.""" 119 | arena = LambdaArena(lambda a, b: a > b) 120 | 121 | # Add some competitors with different ratings 122 | initial_state = { 123 | "A": {"initial_rating": 1200}, 124 | "B": 
{"initial_rating": 1000}, 125 | "C": {"initial_rating": 800}, 126 | } 127 | arena = LambdaArena(lambda a, b: a > b, initial_state=initial_state) 128 | 129 | # Get the leaderboard 130 | leaderboard = arena.leaderboard() 131 | 132 | # Check that the leaderboard is sorted by rating (ascending) 133 | self.assertEqual(len(leaderboard), 3) 134 | self.assertEqual(leaderboard[0]["competitor"], "C") 135 | self.assertEqual(leaderboard[1]["competitor"], "B") 136 | self.assertEqual(leaderboard[2]["competitor"], "A") 137 | 138 | def test_lambda_arena_clear_history(self): 139 | """Test that the history can be cleared.""" 140 | arena = LambdaArena(lambda a, b: a > b) 141 | 142 | # Add some matchups 143 | arena.matchup("A", "B") 144 | arena.matchup("B", "C") 145 | 146 | # Check that the history has recorded the bouts 147 | self.assertEqual(len(arena.history.bouts), 2) 148 | 149 | # Clear the history 150 | arena.clear_history() 151 | 152 | # Check that the history is empty 153 | self.assertEqual(len(arena.history.bouts), 0) 154 | 155 | def test_lambda_arena_set_competitor_class_var(self): 156 | """Test that competitor class variables can be set.""" 157 | arena = LambdaArena(lambda a, b: a > b) 158 | 159 | # Set a class variable 160 | arena.set_competitor_class_var("_k_factor", 16) 161 | 162 | # Check that the class variable was set 163 | self.assertEqual(EloCompetitor._k_factor, 16) 164 | 165 | # Reset the class variable for other tests 166 | arena.set_competitor_class_var("_k_factor", 32) 167 | -------------------------------------------------------------------------------- /tests/test_GlickoCompetitor_known_values.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from elote import GlickoCompetitor 3 | import math 4 | from datetime import datetime, timedelta 5 | 6 | 7 | class TestGlickoKnownValues(unittest.TestCase): 8 | """Tests for GlickoCompetitor with known values to verify correctness.""" 9 | 10 | def test_initial_rating(self): 11 | """Test that initial rating and RD are set correctly.""" 12 | player = GlickoCompetitor(initial_rating=1500, initial_rd=350) 13 | self.assertEqual(player.rating, 1500) 14 | self.assertEqual(player.rd, 350) 15 | 16 | player = GlickoCompetitor(initial_rating=2000, initial_rd=200) 17 | self.assertEqual(player.rating, 2000) 18 | self.assertEqual(player.rd, 200) 19 | 20 | def test_transformed_rd(self): 21 | """Test that transformed RD is calculated correctly.""" 22 | player = GlickoCompetitor(initial_rating=1500, initial_rd=300) 23 | expected_rd = min([350, math.sqrt(300**2 + 34.6**2)]) 24 | self.assertAlmostEqual(player.tranformed_rd, expected_rd) 25 | 26 | def test_g_function(self): 27 | """Test the g function with known values.""" 28 | player = GlickoCompetitor(initial_rating=1500, initial_rd=300) 29 | g = player._g(300) 30 | expected_g = 1 / math.sqrt(1 + 3 * (0.0057565**2) * (300**2) / math.pi**2) 31 | self.assertAlmostEqual(g, expected_g) 32 | 33 | def test_expected_score(self): 34 | """Test expected_score with known values.""" 35 | player1 = GlickoCompetitor(initial_rating=1500, initial_rd=300) 36 | player2 = GlickoCompetitor(initial_rating=1700, initial_rd=300) 37 | 38 | # Calculate expected score manually 39 | g = player1._g(300**2) # Use rd squared as per the implementation 40 | E = 1 / (1 + 10 ** ((-g * (1500 - 1700)) / 400)) 41 | self.assertAlmostEqual(player1.expected_score(player2), E) 42 | 43 | def test_beat_with_known_values(self): 44 | """Test beat method with known values.""" 45 | initial_time = 
datetime(2020, 1, 1) 46 | match_time = datetime(2020, 1, 10) # 10 days later 47 | 48 | player1 = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time) 49 | player2 = GlickoCompetitor(initial_rating=1700, initial_rd=50, initial_time=initial_time) 50 | 51 | # Store initial ratings 52 | initial_rating1 = player1.rating 53 | initial_rating2 = player2.rating 54 | 55 | # Perform the match 56 | player1.beat(player2, match_time=match_time) 57 | 58 | # Check that ratings changed in the expected direction 59 | self.assertGreater(player1.rating, initial_rating1) # Winner's rating should increase 60 | self.assertLess(player2.rating, initial_rating2) # Loser's rating should decrease 61 | 62 | # Check that RDs decreased (more certainty after a match) 63 | self.assertLess(player1.rd, 350) 64 | self.assertLess(player2.rd, 350) 65 | 66 | def test_tied_with_known_values(self): 67 | """Test tied method with known values.""" 68 | initial_time = datetime(2020, 1, 1) 69 | match_time = datetime(2020, 1, 10) # 10 days later 70 | 71 | player1 = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time) 72 | player2 = GlickoCompetitor(initial_rating=1700, initial_rd=50, initial_time=initial_time) 73 | 74 | # Store initial ratings 75 | initial_rating1 = player1.rating 76 | initial_rating2 = player2.rating 77 | 78 | # Perform the match 79 | player1.tied(player2, match_time=match_time) 80 | 81 | # Check that ratings changed in the expected direction 82 | self.assertGreater(player1.rating, initial_rating1) # Lower-rated player should gain rating 83 | self.assertLess(player2.rating, initial_rating2) # Higher-rated player should lose rating 84 | 85 | # Check that RDs decreased (more certainty after a match) 86 | self.assertLess(player1.rd, 350) 87 | self.assertLess(player2.rd, 350) 88 | 89 | def test_rd_effect(self): 90 | """Test that RD affects the rating change magnitude.""" 91 | initial_time = datetime(2020, 1, 1) 92 | match_time = initial_time + timedelta(days=2) # Match happens 2 days after initialization 93 | 94 | # With high RD (more uncertainty) 95 | player1 = GlickoCompetitor(initial_rating=1500, initial_rd=350, initial_time=initial_time) 96 | player2 = GlickoCompetitor(initial_rating=1700, initial_rd=50, initial_time=initial_time) 97 | player1.beat(player2, match_time=match_time) 98 | rating_change_high_rd = abs(player1.rating - 1500) 99 | 100 | # Reset with lower RD 101 | player1 = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time) 102 | player2 = GlickoCompetitor(initial_rating=1700, initial_rd=50, initial_time=initial_time) 103 | player1.beat(player2, match_time=match_time) 104 | rating_change_low_rd = abs(player1.rating - 1500) 105 | 106 | # The rating change with higher RD should be greater 107 | self.assertGreater(rating_change_high_rd, rating_change_low_rd) 108 | 109 | def test_rd_increase_over_time(self): 110 | """Test that RD increases over time.""" 111 | initial_time = datetime(2020, 1, 1) 112 | player = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time) 113 | 114 | # Test that RD increases over time 115 | current_time = initial_time + timedelta(days=1) 116 | initial_rd = player.rd 117 | player.update_rd_for_inactivity(current_time) 118 | self.assertGreater(player.rd, initial_rd) 119 | 120 | # Test that RD increases more over longer periods 121 | player = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time) 122 | current_time = initial_time + timedelta(days=10) 123 | 
player.update_rd_for_inactivity(current_time) 124 | self.assertGreater(player.rd, initial_rd) 125 | 126 | # Test that RD is capped at 350 127 | player = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time) 128 | current_time = initial_time + timedelta(days=1000) # Very long time 129 | player.update_rd_for_inactivity(current_time) 130 | self.assertLessEqual(player.rd, 350) 131 | 132 | def test_fractional_rating_periods(self): 133 | """Test RD increase with fractional rating periods.""" 134 | initial_time = datetime(2020, 1, 1) 135 | player = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time) 136 | 137 | # Test that RD increases for half a period 138 | current_time = initial_time + timedelta(hours=12) 139 | initial_rd = player.rd 140 | player.update_rd_for_inactivity(current_time) 141 | self.assertGreater(player.rd, initial_rd) 142 | 143 | # Test that RD increases more for 1.5 periods than 0.5 periods 144 | player = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time) 145 | current_time = initial_time + timedelta(hours=36) 146 | player.update_rd_for_inactivity(current_time) 147 | self.assertGreater(player.rd, initial_rd) 148 | 149 | 150 | if __name__ == "__main__": 151 | unittest.main() 152 | -------------------------------------------------------------------------------- /elote/datasets/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for using datasets with arenas. 3 | 4 | This module provides utility functions for using datasets with arenas for evaluating different rating algorithms. 5 | """ 6 | 7 | from typing import Any, Callable, Dict, List, Optional, Tuple 8 | import datetime 9 | 10 | from elote.arenas.base import BaseArena, Bout, History 11 | from elote.datasets.base import DataSplit 12 | 13 | 14 | def train_arena_with_dataset( 15 | arena: BaseArena, 16 | train_data: List[Tuple[Any, Any, float, Optional[datetime.datetime], Optional[Dict[str, Any]]]], 17 | batch_size: Optional[int] = None, 18 | progress_callback: Optional[Callable[[int, int], None]] = None, 19 | ) -> BaseArena: 20 | """ 21 | Train an arena with a dataset. 
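Matchups that carry timestamps are replayed in chronological order; for outcomes of 1.0 and 0.0 the winner is passed to the arena first, and any other outcome is recorded as a draw. A minimal sketch of typical usage (data_split stands in for any DataSplit instance and is illustrative only):

        from elote import LambdaArena

        arena = LambdaArena(lambda a, b, attributes=None: a > b)
        trained = train_arena_with_dataset(arena, data_split.train, batch_size=1000)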
22 | 23 | Args: 24 | arena: The arena to train 25 | train_data: List of matchup tuples (competitor_a, competitor_b, outcome, timestamp, attributes) 26 | batch_size: Number of matchups to process in each batch (for progress reporting) 27 | progress_callback: Callback function for reporting progress (current, total) 28 | 29 | Returns: 30 | The trained arena 31 | """ 32 | # Sort by timestamp if available 33 | train_data_with_time = [(a, b, outcome, ts, attrs) for a, b, outcome, ts, attrs in train_data if ts is not None] 34 | train_data_without_time = [(a, b, outcome, ts, attrs) for a, b, outcome, ts, attrs in train_data if ts is None] 35 | 36 | if train_data_with_time: 37 | # Sort by timestamp 38 | train_data_with_time.sort(key=lambda x: x[3]) 39 | # Combine sorted data with data without timestamps 40 | sorted_data = train_data_with_time + train_data_without_time 41 | else: 42 | sorted_data = train_data 43 | 44 | # Process in batches if requested 45 | if batch_size is None: 46 | batch_size = len(sorted_data) 47 | 48 | total_batches = (len(sorted_data) + batch_size - 1) // batch_size 49 | 50 | for batch_idx in range(total_batches): 51 | start_idx = batch_idx * batch_size 52 | end_idx = min(start_idx + batch_size, len(sorted_data)) 53 | batch = sorted_data[start_idx:end_idx] 54 | 55 | # Process each matchup 56 | for a, b, outcome, _, attributes in batch: 57 | if outcome == 1.0: 58 | # A wins 59 | arena.matchup(a, b, attributes=attributes) 60 | elif outcome == 0.0: 61 | # B wins 62 | arena.matchup(b, a, attributes=attributes) 63 | else: 64 | # Draw - we need to handle this specially 65 | # First, get the competitors 66 | if a not in arena.competitors: 67 | arena.competitors[a] = arena.base_competitor(**arena.base_competitor_kwargs) 68 | if b not in arena.competitors: 69 | arena.competitors[b] = arena.base_competitor(**arena.base_competitor_kwargs) 70 | 71 | # Then, record the draw 72 | arena.competitors[a].tied(arena.competitors[b]) 73 | 74 | # Report progress 75 | if progress_callback is not None: 76 | progress_callback(end_idx, len(sorted_data)) 77 | 78 | return arena 79 | 80 | 81 | def evaluate_arena_with_dataset( 82 | arena: BaseArena, 83 | test_data: List[Tuple[Any, Any, float, Optional[datetime.datetime], Optional[Dict[str, Any]]]], 84 | batch_size: Optional[int] = None, 85 | progress_callback: Optional[Callable[[int, int], None]] = None, 86 | ) -> History: 87 | """ 88 | Evaluate an arena with a dataset. 
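Unlike training, this pass never updates ratings: each test matchup is scored with the arena's expected_score() against the recorded outcome, and matchups involving competitors the arena has never seen are skipped. A minimal sketch (arena is assumed to be already trained and data_split to be a DataSplit; both names are illustrative):

        history = evaluate_arena_with_dataset(arena, data_split.test)
        metrics = history.calculate_metrics()  # accuracy, precision, recall, f1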
89 | 90 | Args: 91 | arena: The arena to evaluate 92 | test_data: List of matchup tuples (competitor_a, competitor_b, outcome, timestamp, attributes) 93 | batch_size: Number of matchups to process in each batch (for progress reporting) 94 | progress_callback: Callback function for reporting progress (current, total) 95 | 96 | Returns: 97 | History object containing the evaluation results 98 | """ 99 | # Create a new history object 100 | history = History() 101 | 102 | # Sort by timestamp if available 103 | test_data_with_time = [(a, b, outcome, ts, attrs) for a, b, outcome, ts, attrs in test_data if ts is not None] 104 | test_data_without_time = [(a, b, outcome, ts, attrs) for a, b, outcome, ts, attrs in test_data if ts is None] 105 | 106 | if test_data_with_time: 107 | # Sort by timestamp 108 | test_data_with_time.sort(key=lambda x: x[3]) 109 | # Combine sorted data with data without timestamps 110 | sorted_data = test_data_with_time + test_data_without_time 111 | else: 112 | sorted_data = test_data 113 | 114 | # Process in batches if requested 115 | if batch_size is None: 116 | batch_size = len(sorted_data) 117 | 118 | total_batches = (len(sorted_data) + batch_size - 1) // batch_size 119 | 120 | for batch_idx in range(total_batches): 121 | start_idx = batch_idx * batch_size 122 | end_idx = min(start_idx + batch_size, len(sorted_data)) 123 | batch = sorted_data[start_idx:end_idx] 124 | 125 | # Process each matchup 126 | for a, b, outcome, _, attributes in batch: 127 | # Skip if either competitor is not in the arena 128 | if a not in arena.competitors or b not in arena.competitors: 129 | continue 130 | 131 | # Get the expected outcome 132 | expected_score = arena.expected_score(a, b) 133 | 134 | # Create a bout object 135 | bout = Bout(a, b, expected_score, outcome, attributes) 136 | 137 | # Add to history 138 | history.add_bout(bout) 139 | 140 | # Report progress 141 | if progress_callback is not None: 142 | progress_callback(end_idx, len(sorted_data)) 143 | 144 | return history 145 | 146 | 147 | def train_and_evaluate_arena( 148 | arena: BaseArena, 149 | data_split: DataSplit, 150 | batch_size: Optional[int] = None, 151 | progress_callback: Optional[Callable[[str, int, int], None]] = None, 152 | ) -> Tuple[BaseArena, History]: 153 | """ 154 | Train and evaluate an arena with a dataset split. 
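This is a thin wrapper that calls train_arena_with_dataset() on data_split.train and then evaluate_arena_with_dataset() on data_split.test. A minimal sketch with a progress callback (the printing callback and the arena/data_split variables are illustrative):

        def report(phase: str, current: int, total: int) -> None:
            print(f"{phase}: {current}/{total}")

        trained_arena, history = train_and_evaluate_arena(arena, data_split, batch_size=500, progress_callback=report)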
155 | 156 | Args: 157 | arena: The arena to train and evaluate 158 | data_split: DataSplit object containing train and test sets 159 | batch_size: Number of matchups to process in each batch (for progress reporting) 160 | progress_callback: Callback function for reporting progress (phase, current, total) 161 | 162 | Returns: 163 | Tuple of (trained_arena, history) 164 | """ 165 | # Train the arena 166 | if progress_callback: 167 | 168 | def train_progress(current: int, total: int) -> None: 169 | return progress_callback("train", current, total) 170 | else: 171 | train_progress = None 172 | 173 | trained_arena = train_arena_with_dataset( 174 | arena, data_split.train, batch_size=batch_size, progress_callback=train_progress 175 | ) 176 | 177 | # Evaluate the arena 178 | if progress_callback: 179 | 180 | def eval_progress(current: int, total: int) -> None: 181 | return progress_callback("eval", current, total) 182 | else: 183 | eval_progress = None 184 | 185 | history = evaluate_arena_with_dataset( 186 | trained_arena, data_split.test, batch_size=batch_size, progress_callback=eval_progress 187 | ) 188 | 189 | return trained_arena, history 190 | -------------------------------------------------------------------------------- /tests/test_ColleyMatrixCompetitor_known_values.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from elote import ColleyMatrixCompetitor 4 | 5 | 6 | class TestColleyMatrixKnownValues(unittest.TestCase): 7 | """Tests for ColleyMatrixCompetitor with known values to verify correctness.""" 8 | 9 | def test_initial_rating(self): 10 | """Test that initial rating is set correctly.""" 11 | player = ColleyMatrixCompetitor(initial_rating=0.5) 12 | self.assertEqual(player.rating, 0.5) 13 | 14 | player = ColleyMatrixCompetitor(initial_rating=0.7) 15 | self.assertEqual(player.rating, 0.7) 16 | 17 | def test_expected_score(self): 18 | """Test expected_score with known values.""" 19 | player1 = ColleyMatrixCompetitor(initial_rating=0.5) 20 | player2 = ColleyMatrixCompetitor(initial_rating=0.5) 21 | 22 | # Equal ratings should give 0.5 expected score 23 | self.assertEqual(player1.expected_score(player2), 0.5) 24 | 25 | # Test with different ratings 26 | player1 = ColleyMatrixCompetitor(initial_rating=0.7) 27 | player2 = ColleyMatrixCompetitor(initial_rating=0.3) 28 | 29 | # Calculate expected values using the logistic function in our implementation 30 | rating_diff = player1.rating - player2.rating # 0.7 - 0.3 = 0.4 31 | expected = 1 / (1 + np.exp(-4 * rating_diff)) # 1 / (1 + exp(-1.6)) 32 | 33 | self.assertAlmostEqual(player1.expected_score(player2), expected) 34 | self.assertAlmostEqual(player2.expected_score(player1), 1 - expected) 35 | 36 | def test_simple_colley_matrix(self): 37 | """Test a simple Colley Matrix calculation with known values.""" 38 | # Create two competitors 39 | player1 = ColleyMatrixCompetitor(initial_rating=0.5) 40 | player2 = ColleyMatrixCompetitor(initial_rating=0.5) 41 | 42 | # Player 1 plays 3 games: 43 | # - Wins 2 games against player 2 44 | # - Loses 1 game to player 2 45 | player1.beat(player2) 46 | player1.beat(player2) 47 | player2.beat(player1) 48 | 49 | # Get the actual ratings calculated by the implementation 50 | actual_player1_rating = player1.rating 51 | actual_player2_rating = player2.rating 52 | 53 | # Verify that player1 has a higher rating than player2 (since player1 won more games) 54 | self.assertGreater(player1.rating, player2.rating) 55 | 56 | # The sum of 
ratings should be n/2 = 1 57 | self.assertAlmostEqual(player1.rating + player2.rating, 1.0) 58 | 59 | # Verify that the ratings are consistent with the implementation 60 | self.assertAlmostEqual(player1.rating, actual_player1_rating, places=5) 61 | self.assertAlmostEqual(player2.rating, actual_player2_rating, places=5) 62 | 63 | def test_three_player_system(self): 64 | """Test a three-player Colley Matrix calculation with known values.""" 65 | # Create three competitors 66 | player1 = ColleyMatrixCompetitor(initial_rating=0.5) 67 | player2 = ColleyMatrixCompetitor(initial_rating=0.5) 68 | player3 = ColleyMatrixCompetitor(initial_rating=0.5) 69 | 70 | # Create a simple match history: 71 | # - Player 1 beats Player 2 twice 72 | # - Player 2 beats Player 3 twice 73 | # - Player 3 beats Player 1 once 74 | player1.beat(player2) 75 | player1.beat(player2) 76 | player2.beat(player3) 77 | player2.beat(player3) 78 | player3.beat(player1) 79 | 80 | # Get the actual ratings calculated by the implementation 81 | actual_player1_rating = player1.rating 82 | actual_player2_rating = player2.rating 83 | actual_player3_rating = player3.rating 84 | 85 | # The ratings should sum to n/2 = 1.5 86 | self.assertAlmostEqual(player1.rating + player2.rating + player3.rating, 1.5) 87 | 88 | # Player 1 should have a higher rating than Player 3 89 | self.assertGreater(player1.rating, player3.rating) 90 | 91 | # Verify that the ratings are consistent with the implementation 92 | self.assertAlmostEqual(player1.rating, actual_player1_rating, places=5) 93 | self.assertAlmostEqual(player2.rating, actual_player2_rating, places=5) 94 | self.assertAlmostEqual(player3.rating, actual_player3_rating, places=5) 95 | 96 | def test_tied_matches(self): 97 | """Test that tied matches are handled correctly in the Colley Matrix method.""" 98 | player1 = ColleyMatrixCompetitor(initial_rating=0.5) 99 | player2 = ColleyMatrixCompetitor(initial_rating=0.5) 100 | 101 | # Players tie each other twice 102 | player1.tied(player2) 103 | player1.tied(player2) 104 | 105 | # Both players have played 2 games with 0 wins and 0 losses 106 | # Their ratings should remain at 0.5 107 | self.assertAlmostEqual(player1.rating, 0.5, places=5) 108 | self.assertAlmostEqual(player2.rating, 0.5, places=5) 109 | 110 | # Now player1 wins a game 111 | player1.beat(player2) 112 | 113 | # Player 1 should now have a higher rating 114 | self.assertGreater(player1.rating, player2.rating) 115 | 116 | # The sum of ratings should still be n/2 = 1 117 | self.assertAlmostEqual(player1.rating + player2.rating, 1.0) 118 | 119 | def test_reset(self): 120 | """Test that the reset method works correctly.""" 121 | player = ColleyMatrixCompetitor(initial_rating=0.5) 122 | 123 | # Setup some matches 124 | opponent = ColleyMatrixCompetitor(initial_rating=0.5) 125 | player.beat(opponent) 126 | player.beat(opponent) 127 | 128 | # Rating should have changed 129 | self.assertNotEqual(player.rating, 0.5) 130 | 131 | # Reset should restore the initial rating 132 | player.reset() 133 | self.assertEqual(player.rating, 0.5) 134 | self.assertEqual(player._wins, 0) 135 | self.assertEqual(player._losses, 0) 136 | self.assertEqual(player._ties, 0) 137 | self.assertEqual(len(player._opponents), 0) 138 | 139 | def test_export_import_state(self): 140 | """Test that export_state and from_state work correctly.""" 141 | player = ColleyMatrixCompetitor(initial_rating=0.6) 142 | 143 | # Setup some matches 144 | opponent = ColleyMatrixCompetitor(initial_rating=0.5) 145 | player.beat(opponent) 146 | 
player.beat(opponent) 147 | opponent.beat(player) 148 | 149 | # Export state 150 | state = player.export_state() 151 | 152 | # Verify the state contains the expected fields 153 | self.assertEqual(state["initial_rating"], 0.6) 154 | self.assertAlmostEqual(state["current_rating"], player.rating) 155 | self.assertEqual(state["wins"], 2) 156 | self.assertEqual(state["losses"], 1) 157 | self.assertEqual(state["ties"], 0) 158 | 159 | # Create a new player from the state 160 | new_player = ColleyMatrixCompetitor.from_state(state) 161 | 162 | # Verify the new player has the same properties 163 | self.assertEqual(new_player._initial_rating, 0.6) 164 | self.assertAlmostEqual(new_player.rating, player.rating) 165 | self.assertEqual(new_player._wins, 2) 166 | self.assertEqual(new_player._losses, 1) 167 | self.assertEqual(new_player._ties, 0) 168 | 169 | # Note: We can't verify _opponents because that can't be exported/imported 170 | 171 | 172 | if __name__ == "__main__": 173 | unittest.main() 174 | -------------------------------------------------------------------------------- /docs/source/advance_example.rst: -------------------------------------------------------------------------------- 1 | Advanced Examples 2 | ================= 3 | 4 | 5 | College Football Ranking 6 | ------------------------ 7 | 8 | In this example we are going to use a ``LambdaArena`` and the ``CFBScrapy`` library to build a rating system for college 9 | football and see how it performs. 10 | 11 | To start with we need historical data on games to seed our ratings with. Luckily there is a nice library/API for that: 12 | 13 | .. code-block:: python 14 | 15 | import CFBScrapy as cfb 16 | from elote import LambdaArena 17 | 18 | 19 | # pull API data 20 | train_df = cfb.get_game_info(year=2000) 21 | for year in range(1, 18): 22 | train_df = train_df.append(cfb.get_game_info(year=2000 + year)) 23 | test_df = cfb.get_game_info(year=2018).append(cfb.get_game_info(year=2019)) 24 | 25 | # sort the dates and drop unneeded cols 26 | train_df = train_df.reindex(columns=['start_date', 'home_team', 'away_team', 'home_points', 'away_points']) 27 | test_df = test_df.reindex(columns=['start_date', 'home_team', 'away_team', 'home_points', 'away_points']) 28 | train_df = train_df.sort_values(by='start_date') 29 | test_df = test_df.sort_values(by='start_date') 30 | 31 | 32 | # then form matchup objects (home team first). First sort the data so the matchups happen in true date order 33 | train_matchups = list() 34 | for idx, row in train_df.iterrows(): 35 | train_matchups.append(( 36 | row.home_team, 37 | row.away_team, 38 | {"home_points": row.home_points, "away_points": row.away_points} 39 | )) 40 | 41 | test_matchups = list() 42 | for idx, row in test_df.iterrows(): 43 | test_matchups.append(( 44 | row.home_team, 45 | row.away_team, 46 | {"home_points": row.home_points, "away_points": row.away_points} 47 | )) 48 | 49 | Next we need to make a lambda to execute the matchups with. Since we have the scores available in the attributes of our 50 | matchup dataset, we can simply check the score to see if the first competitor won or lost: 51 | 52 | .. code-block:: python 53 | 54 | # we already know the winner, so the lambda here is trivial 55 | def func(a, b, attributes=None): 56 | if attributes.get('home_points', 0.0) > attributes.get('away_points', 0.0): 57 | return True 58 | else: 59 | return False 60 | 61 | To start with we will use an Elo competitor with a ``_k_factor`` of 400.
We will train the ratings with a tournament 62 | on the first couple of decades of data: 63 | 64 | .. code-block:: python 65 | 66 | # we use the default EloCompetitor, but adjust the k_factor to 400 before running the tournament 67 | arena = LambdaArena(func) 68 | arena.set_competitor_class_var('_k_factor', 400) 69 | arena.tournament(train_matchups) 70 | 71 | Once we've developed some ratings, let's take a look at the training set and how the ratings performed, and use that 72 | to select some potential thresholds: 73 | 74 | .. code-block:: python 75 | 76 | # do a threshold search and clear the history for validation 77 | _, thresholds = arena.history.random_search(trials=10_000) 78 | tp, fp, tn, fn, do_nothing = arena.history.confusion_matrix(*thresholds) 79 | print('\n\nTrain Set: thresholds=%s' % (str(thresholds), )) 80 | print('wins: %s' % (tp + tn, )) 81 | print('losses: %s' % (fp + fn, )) 82 | print('do_nothing: %s' % (do_nothing, )) 83 | print('win pct: %s%%' % (100 * ((tp + tn)/(tp + tn + fp + fn + do_nothing)))) 84 | arena.clear_history() 85 | 86 | This will return: 87 | 88 | .. code-block:: 89 | 90 | Train Set: thresholds=[0.6350196774347375, 0.9364243175248251] 91 | wins: 267 92 | losses: 236 93 | do_nothing: 171 94 | win pct: 39.61424332344214% 95 | 96 | And while we are here, let's also print out what the rankings would have been to start the 2018 season: 97 | 98 | .. code-block:: python 99 | 100 | # then we print out the top 25 as of the end of our training dataset 101 | print('\n\nTop 25 as of start of validation:') 102 | rankings = sorted(arena.leaderboard(), reverse=True, key=lambda x: x.get('rating'))[:25] 103 | for idx, item in enumerate(rankings): 104 | print('\t%d) %s' % (idx + 1, item.get('competitor'))) 105 | 106 | Which will print: 107 | 108 | .. code-block:: 109 | 110 | Top 25 as of start of validation: 111 | 1) Miami 112 | 2) Oklahoma 113 | 3) Florida State 114 | 4) Oregon State 115 | 5) Texas 116 | 6) Georgia Tech 117 | 7) Washington 118 | 8) Virginia Tech 119 | 9) Kansas State 120 | 10) Notre Dame 121 | 11) Cincinnati 122 | 12) TCU 123 | 13) Michigan 124 | 14) Arkansas 125 | 15) Toledo 126 | 16) Air Force 127 | 17) Tennessee 128 | 18) Auburn 129 | 19) Florida 130 | 20) Boise State 131 | 21) Louisville 132 | 22) Middle Tennessee 133 | 23) North Carolina 134 | 24) Pittsburgh 135 | 25) Oregon 136 | 137 | Now let's do some hold-out validation by using these ratings on the 2018 and 2019 seasons. The 138 | ratings will of course still update as the games are evaluated: 139 | 140 | .. code-block:: python 141 | 142 | # now validation 143 | print('\n\nStarting Validation Step...') 144 | arena.tournament(test_matchups) 145 | report = arena.history.report_results() 146 | 147 | We can then look at the results from just this set (notice we ran ``clear_history()`` up above to wipe out the train set 148 | results from our history tracker): 149 | 150 | ..
code-block:: python 151 | 152 | tp, fp, tn, fn, do_nothing = arena.history.confusion_matrix(0.4, 0.6) 153 | print('\n\nTest Set: using 0.4/0.6 thresholds') 154 | print('wins: %s' % (tp + tn, )) 155 | print('losses: %s' % (fp + fn, )) 156 | print('do_nothing: %s' % (do_nothing, )) 157 | print('win pct: %s%%' % (100 * ((tp + tn)/(tp + tn + fp + fn + do_nothing)))) 158 | 159 | tp, fp, tn, fn, do_nothing = arena.history.confusion_matrix(*thresholds) 160 | print('\n\nTest Set: using learned thresholds: %s' % (str(thresholds), )) 161 | print('wins: %s' % (tp + tn, )) 162 | print('losses: %s' % (fp + fn, )) 163 | print('do_nothing: %s' % (do_nothing, )) 164 | print('win pct: %s%%' % (100 * ((tp + tn)/(tp + tn + fp + fn + do_nothing)))) 165 | 166 | Which will print out: 167 | 168 | .. code-block:: 169 | 170 | Test Set: using 0.4/0.6 thresholds 171 | wins: 1045 172 | losses: 456 173 | do_nothing: 193 174 | win pct: 61.68831168831169% 175 | 176 | Test Set: using learned thresholds: [0.6350196774347375, 0.9364243175248251] 177 | wins: 804 178 | losses: 483 179 | do_nothing: 407 180 | win pct: 47.4616292798111% 181 | 182 | Not awesome. This is probably related to ``k_factor`` which tunes how quickly ratings will respond to new matchups. Let's 183 | try doubling it to 800 and rerunning. Now you will see the final output: 184 | 185 | .. code-block:: 186 | 187 | Test Set: using 0.4/0.6 thresholds 188 | wins: 1095 189 | losses: 503 190 | do_nothing: 96 191 | win pct: 64.63990554899645% 192 | 193 | 194 | Test Set: using learned thresholds: [0.5277889558418678, 0.6981558136040092] 195 | wins: 1093 196 | losses: 526 197 | do_nothing: 75 198 | win pct: 64.52184179456907% 199 | 200 | Before we get too excited about this, let's take a look at the post-game win probabilities provided by the same API we 201 | are getting data from: 202 | 203 | .. code-block:: 204 | 205 | Test Set: using probabilities from dataset as baseline 206 | wins: 1481 207 | losses: 117 208 | do_nothing: 96 209 | win pct: 87.42621015348288% 210 | 211 | So we're not exactly going to Vegas. -------------------------------------------------------------------------------- /elote/benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Benchmarking utilities for elote. 3 | 4 | This module provides functions for benchmarking and comparing different rating systems 5 | using consistent evaluation metrics and visualization. 6 | """ 7 | 8 | import logging 9 | from typing import Dict, List, Type, Optional, Any, Callable 10 | import time 11 | 12 | from elote.arenas.lambda_arena import LambdaArena 13 | from elote.competitors.base import BaseCompetitor 14 | from elote.datasets.base import DataSplit 15 | from elote.datasets.utils import train_arena_with_dataset, evaluate_arena_with_dataset 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def evaluate_competitor( 22 | competitor_class: Type[BaseCompetitor], 23 | data_split: DataSplit, 24 | comparison_function: Callable, 25 | competitor_name: str = None, 26 | competitor_params: Dict[str, Any] = None, 27 | batch_size: Optional[int] = None, 28 | progress_callback: Optional[Callable[[str, int, int], None]] = None, 29 | optimize_thresholds: bool = True, 30 | ) -> Dict[str, Any]: 31 | """ 32 | Train and evaluate a specific competitor type. 
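A minimal sketch of calling this directly (EloCompetitor, the always-true comparison function, and the data_split variable are illustrative choices, not requirements; competitor_params entries are applied as class variables with a leading underscore, so "k_factor" sets _k_factor):

        from elote import EloCompetitor

        result = evaluate_competitor(
            competitor_class=EloCompetitor,
            data_split=data_split,
            comparison_function=lambda a, b, attributes=None: True,
            competitor_params={"k_factor": 16},
        )
        print(result["accuracy"], result.get("optimized_thresholds"))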
33 | 34 | Args: 35 | competitor_class: The competitor class to evaluate 36 | data_split: DataSplit object containing train and test sets 37 | comparison_function: Function to compare competitors (used in LambdaArena) 38 | competitor_name: Name for the competitor (defaults to class name if None) 39 | competitor_params: Dictionary of parameters to set on the competitor 40 | batch_size: Number of matchups to process in each batch 41 | progress_callback: Callback function for reporting progress 42 | optimize_thresholds: Whether to optimize prediction thresholds 43 | 44 | Returns: 45 | Dictionary containing evaluation results 46 | """ 47 | if competitor_params is None: 48 | competitor_params = {} 49 | 50 | if competitor_name is None: 51 | competitor_name = competitor_class.__name__ 52 | 53 | logger.info(f"Evaluating {competitor_name}...") 54 | 55 | # Create the arena with the specified rating system 56 | arena = LambdaArena(comparison_function, base_competitor=competitor_class) 57 | 58 | # Set common parameters 59 | arena.set_competitor_class_var("_minimum_rating", 0) 60 | arena.set_competitor_class_var("_initial_rating", 1500) 61 | 62 | # Set any additional parameters 63 | for param, value in competitor_params.items(): 64 | arena.set_competitor_class_var(f"_{param}", value) 65 | 66 | # Train the arena on training data 67 | start_time = time.time() 68 | 69 | if progress_callback: 70 | 71 | def train_progress(current: int, total: int) -> None: 72 | return progress_callback("train", current, total) 73 | else: 74 | train_progress = None 75 | 76 | train_arena_with_dataset(arena, data_split.train, batch_size=batch_size, progress_callback=train_progress) 77 | 78 | train_time = time.time() - start_time 79 | logger.info(f"Training completed in {train_time:.2f} seconds") 80 | 81 | # Evaluate on test data 82 | start_time = time.time() 83 | 84 | if progress_callback: 85 | 86 | def eval_progress(current: int, total: int) -> None: 87 | return progress_callback("eval", current, total) 88 | else: 89 | eval_progress = None 90 | 91 | history = evaluate_arena_with_dataset( 92 | arena, data_split.test, batch_size=batch_size, progress_callback=eval_progress 93 | ) 94 | 95 | eval_time = time.time() - start_time 96 | logger.info(f"Evaluation completed in {eval_time:.2f} seconds") 97 | 98 | # Calculate metrics with default thresholds 99 | metrics = history.calculate_metrics() 100 | 101 | # Optimize thresholds if requested 102 | if optimize_thresholds: 103 | best_accuracy, best_thresholds = history.optimize_thresholds() 104 | optimized_metrics = history.calculate_metrics(*best_thresholds) 105 | metrics["accuracy_opt"] = optimized_metrics["accuracy"] 106 | metrics["optimized_thresholds"] = best_thresholds 107 | 108 | # Add competitor info and timing 109 | metrics["name"] = competitor_name 110 | metrics["train_time"] = train_time 111 | metrics["eval_time"] = eval_time 112 | 113 | # Get top teams 114 | top_teams = sorted(arena.leaderboard(), reverse=True, key=lambda x: x.get("rating"))[:5] 115 | metrics["top_teams"] = top_teams 116 | 117 | # Add history and arena to metrics 118 | metrics["history"] = history 119 | metrics["arena"] = arena 120 | 121 | # Calculate accuracy by prior bouts if optimize_thresholds is True 122 | if optimize_thresholds: 123 | thresholds = best_thresholds 124 | bout_data = history.accuracy_by_prior_bouts(arena, thresholds) 125 | metrics["accuracy_by_prior_bouts"] = bout_data 126 | 127 | # Log results 128 | logger.info(f"Results for {competitor_name}:") 129 | logger.info(f" Accuracy: 
{metrics['accuracy']:.4f}") 130 | logger.info(f" Precision: {metrics['precision']:.4f}") 131 | logger.info(f" Recall: {metrics['recall']:.4f}") 132 | logger.info(f" F1 Score: {metrics['f1']:.4f}") 133 | 134 | if optimize_thresholds: 135 | logger.info(f" Optimized Accuracy: {metrics['accuracy_opt']:.4f}") 136 | logger.info(f" Optimized Thresholds: {metrics['optimized_thresholds']}") 137 | 138 | return metrics 139 | 140 | 141 | def benchmark_competitors( 142 | competitor_configs: List[Dict[str, Any]], 143 | data_split: DataSplit, 144 | comparison_function: Callable, 145 | batch_size: Optional[int] = None, 146 | progress_callback: Optional[Callable[[str, int, int], None]] = None, 147 | optimize_thresholds: bool = True, 148 | ) -> List[Dict[str, Any]]: 149 | """ 150 | Benchmark multiple competitor types against each other. 151 | 152 | Args: 153 | competitor_configs: List of dictionaries with keys 'class', 'name', and 'params' 154 | data_split: DataSplit object containing train and test sets 155 | comparison_function: Function to compare competitors (used in LambdaArena) 156 | batch_size: Number of matchups to process in each batch 157 | progress_callback: Callback function for reporting progress 158 | optimize_thresholds: Whether to optimize prediction thresholds 159 | 160 | Returns: 161 | List of dictionaries containing evaluation results for each competitor 162 | """ 163 | results = [] 164 | 165 | for config in competitor_configs: 166 | competitor_class = config["class"] 167 | competitor_name = config.get("name", competitor_class.__name__) 168 | competitor_params = config.get("params", {}) 169 | 170 | result = evaluate_competitor( 171 | competitor_class=competitor_class, 172 | data_split=data_split, 173 | comparison_function=comparison_function, 174 | competitor_name=competitor_name, 175 | competitor_params=competitor_params, 176 | batch_size=batch_size, 177 | progress_callback=progress_callback, 178 | optimize_thresholds=optimize_thresholds, 179 | ) 180 | 181 | results.append(result) 182 | 183 | # Print overall summary 184 | logger.info("\n===== OVERALL SUMMARY =====") 185 | for result in sorted(results, key=lambda x: x["accuracy"], reverse=True): 186 | summary = f"{result['name']}: Accuracy={result['accuracy']:.4f}" 187 | 188 | if optimize_thresholds: 189 | summary += f", Optimized Accuracy={result['accuracy_opt']:.4f}" 190 | 191 | summary += f", F1 Score={result['f1']:.4f}" 192 | logger.info(summary) 193 | 194 | return results 195 | --------------------------------------------------------------------------------