├── elote
│   ├── py.typed
│   ├── arenas
│   │   └── __init__.py
│   ├── competitors
│   │   └── __init__.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── synthetic.py
│   │   └── utils.py
│   ├── logging.py
│   └── benchmark.py
├── tests
│   ├── __init__.py
│   ├── test_DWZCompetitor.py
│   ├── test_ECFCompetitor.py
│   ├── test_EloCompetitor.py
│   ├── test_ColleyMatrixCompetitor.py
│   ├── test_BlendedCompetitor.py
│   ├── test_ECFCompetitor_known_values.py
│   ├── test_examples.py
│   ├── test_EloCompetitor_known_values.py
│   ├── test_visualization.py
│   ├── test_Arenas.py
│   ├── test_GlickoCompetitor_known_values.py
│   └── test_ColleyMatrixCompetitor_known_values.py
├── examples
│   ├── __init__.py
│   ├── use_cases
│   │   └── __init__.py
│   ├── dwz_arena.py
│   ├── ecf_arena.py
│   ├── prediction.py
│   ├── sample_bout.py
│   ├── bout_with_initialization.py
│   ├── bout_with_ties.py
│   ├── sample_arena.py
│   ├── persist_state_arena.py
│   ├── trueskill_tournament.py
│   ├── glicko_arena.py
│   ├── trueskill_example.py
│   ├── colley_matrix_example.py
│   ├── glicko2_example.py
│   └── dataset_example.py
├── docs
│   ├── source
│   │   ├── CNAME
│   │   ├── arenas.rst
│   │   ├── api
│   │   │   ├── arenas.rst
│   │   │   └── competitors.rst
│   │   ├── blog_posts.rst
│   │   ├── competitors.rst
│   │   ├── index.rst
│   │   ├── getting_started.rst
│   │   ├── conf.py
│   │   ├── installation.rst
│   │   ├── rating_systems
│   │   │   ├── elo.rst
│   │   │   ├── ecf.rst
│   │   │   ├── dwz.rst
│   │   │   ├── glicko.rst
│   │   │   └── ensemble.rst
│   │   ├── contributing.rst
│   │   └── advance_example.rst
│   ├── requirements.txt
│   ├── Makefile
│   └── make.bat
├── .github
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── ISSUE_TEMPLATE.md
│   └── workflows
│       ├── test-docs-build.yml
│       ├── docs.yml
│       ├── test-suite.yml
│       └── pypi-publish.yml
├── images
│   ├── colley_matrix_ratings.png
│   ├── cfb
│   │   ├── calibration_curves.png
│   │   ├── calibration_comparison.png
│   │   ├── accuracy_by_prior_bouts.png
│   │   ├── rating_systems_comparison.png
│   │   └── optimized_accuracy_comparison.png
│   └── chess
│       ├── calibration_curves.png
│       ├── calibration_comparison.png
│       ├── accuracy_by_prior_bouts.png
│       ├── rating_systems_comparison.png
│       └── optimized_accuracy_comparison.png
├── .gitignore
├── CONTRIBUTING.md
├── tox.ini
├── LICENSE.md
├── CHANGELOG.md
├── .cursor
│   └── rules
│       ├── elote_testing.mdc
│       ├── pytest_standards.mdc
│       ├── python_standards.mdc
│       └── sphinx_docs_standards.mdc
├── CODE_OF_CONDUCT.md
├── pyproject.toml
├── Makefile
└── scripts
    └── run_benchmarks.py
/elote/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/elote/arenas/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/elote/competitors/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/use_cases/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/source/CNAME:
--------------------------------------------------------------------------------
1 | elote.mcginniscommawill.com
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Fixes #
2 |
3 | ## Proposed Changes
4 |
5 | -
6 | -
7 | -
--------------------------------------------------------------------------------
/images/colley_matrix_ratings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/colley_matrix_ratings.png
--------------------------------------------------------------------------------
/images/cfb/calibration_curves.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/cfb/calibration_curves.png
--------------------------------------------------------------------------------
/images/cfb/calibration_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/cfb/calibration_comparison.png
--------------------------------------------------------------------------------
/images/chess/calibration_curves.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/chess/calibration_curves.png
--------------------------------------------------------------------------------
/images/cfb/accuracy_by_prior_bouts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/cfb/accuracy_by_prior_bouts.png
--------------------------------------------------------------------------------
/images/chess/calibration_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/chess/calibration_comparison.png
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx>=6.1.3
2 | docutils>=0.19
3 | sphinx_rtd_theme
4 | sphinx-rtd-dark-mode
5 | sphinxcontrib-googleanalytics
--------------------------------------------------------------------------------
/images/cfb/rating_systems_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/cfb/rating_systems_comparison.png
--------------------------------------------------------------------------------
/images/chess/accuracy_by_prior_bouts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/chess/accuracy_by_prior_bouts.png
--------------------------------------------------------------------------------
/images/chess/rating_systems_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/chess/rating_systems_comparison.png
--------------------------------------------------------------------------------
/images/cfb/optimized_accuracy_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/cfb/optimized_accuracy_comparison.png
--------------------------------------------------------------------------------
/images/chess/optimized_accuracy_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdm0006/elote/HEAD/images/chess/optimized_accuracy_comparison.png
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## Expected Behavior
2 |
3 |
4 | ## Actual Behavior
5 |
6 |
7 | ## Steps to Reproduce the Problem
8 |
9 | 1.
10 | 1.
11 | 1.
12 |
13 | ## Specifications
14 |
15 | - Version:
16 | - Platform:
17 | - Subsystem:
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | docs/build/*
2 | *~
3 | .DS_Store
4 | .idea
5 | *.pyc
6 | *.sublime-project
7 | *.sublime-workspace
8 | docs/build/
9 | benchmark_results/
10 | .tox/
11 | .coverage
12 | htmlcov/
13 | .pytest_cache/
14 | .ruff_cache/
15 | *.egg-info/
16 | .venv/
17 | .benchmarks/
18 | .coverage*
19 | .uv.lock
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Contributing
2 | ============
3 |
4 | The idea is to add more and more rating methods first, then work on being a little smarter about arenas, and
5 | maybe even ensemble arenas.
6 |
7 | Then gather some good datasets to evaluate the different rating systems, figure out which one would have made Auburn win
8 | the BCS title in 2004, and call that the best one.
9 |
10 |
--------------------------------------------------------------------------------
/docs/source/arenas.rst:
--------------------------------------------------------------------------------
1 | Arenas
2 | ======
3 |
4 | Arenas are objects that manage populations of competitors and their matchups. Currently only one
5 | type of arena is implemented: the LambdaArena.
6 |
7 | Lambda Arena
8 | ------------
9 |
10 | .. autoclass:: elote.arenas.lambda_arena.LambdaArena
11 | :members:
12 |
13 |
14 | Helpers
15 | -------
16 |
17 | .. autoclass:: elote.arenas.base.History
18 | :members:
19 |
20 | .. autoclass:: elote.arenas.base.Bout
21 | :members:
22 |
23 |
24 |
--------------------------------------------------------------------------------
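For orientation, here is a minimal usage sketch of the LambdaArena flow described in arenas.rst above, condensed from examples/sample_arena.py (the matchup values and initial rating are only illustrative):

```python
# Minimal LambdaArena sketch, condensed from examples/sample_arena.py.
from elote import LambdaArena, EloCompetitor
import random


# Bout function: True means the first argument won, None signals a draw.
def func(a, b):
    if a == b:
        return None
    return a > b


matchups = [(random.randint(1, 10), random.randint(1, 10)) for _ in range(100)]
arena = LambdaArena(func, base_competitor=EloCompetitor, base_competitor_kwargs={"initial_rating": 1200})
arena.tournament(matchups)
print(arena.leaderboard())
```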
/docs/source/api/arenas.rst:
--------------------------------------------------------------------------------
1 | Arenas API Reference
2 | ====================
3 |
4 | This page provides detailed API documentation for all arena classes in Elote.
5 |
6 | Base Arena
7 | ----------
8 |
9 | .. automodule:: elote.arenas.base
10 | :members:
11 | :undoc-members:
12 | :show-inheritance:
13 | :special-members: __init__
14 |
15 | Lambda Arena
16 | ------------
17 |
18 | .. automodule:: elote.arenas.lambda_arena
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 | :special-members: __init__
--------------------------------------------------------------------------------
/examples/dwz_arena.py:
--------------------------------------------------------------------------------
1 | from elote import LambdaArena, DWZCompetitor
2 | import json
3 | import random
4 |
5 |
6 | # sample bout function which just compares the two inputs
7 | def func(a, b):
8 | if a == b:
9 | return None
10 | else:
11 | return a > b
12 |
13 |
14 | matchups = [(random.randint(1, 10), random.randint(1, 10)) for _ in range(1000)]
15 |
16 | arena = LambdaArena(func, base_competitor=DWZCompetitor)
17 | arena.tournament(matchups)
18 |
19 | print("Arena results:")
20 | print(json.dumps(arena.leaderboard(), indent=4))
21 |
--------------------------------------------------------------------------------
/examples/ecf_arena.py:
--------------------------------------------------------------------------------
1 | from elote import LambdaArena, ECFCompetitor
2 | import json
3 | import random
4 |
5 |
6 | # sample bout function which just compares the two inputs
7 | def func(a, b):
8 | if a == b:
9 | return None
10 | else:
11 | return a > b
12 |
13 |
14 | matchups = [(random.randint(1, 10), random.randint(1, 10)) for _ in range(1000)]
15 |
16 | arena = LambdaArena(func, base_competitor=ECFCompetitor)
17 | arena.tournament(matchups)
18 |
19 | print("Arena results:")
20 | print(json.dumps(arena.leaderboard(), indent=4))
21 |
--------------------------------------------------------------------------------
/examples/prediction.py:
--------------------------------------------------------------------------------
1 | from elote import EloCompetitor
2 |
3 | good = EloCompetitor(initial_rating=400)
4 | better = EloCompetitor(initial_rating=500)
5 |
6 | print("probability of better beating good: %5.2f%%" % (better.expected_score(good) * 100,))
7 | print("probability of good beating better: %5.2f%%" % (good.expected_score(better) * 100,))
8 |
9 | good.beat(better)
10 |
11 | print("probability of better beating good: %5.2f%%" % (better.expected_score(good) * 100,))
12 | print("probability of good beating better: %5.2f%%" % (good.expected_score(better) * 100,))
13 |
--------------------------------------------------------------------------------
/examples/sample_bout.py:
--------------------------------------------------------------------------------
1 | from elote import EloCompetitor
2 |
3 | good = EloCompetitor()
4 | better = EloCompetitor()
5 | best = EloCompetitor()
6 |
7 | print("Starting ratings:")
8 | print(
9 | "%7.2f, %7.2f, %7.2f"
10 | % (
11 | good.rating,
12 | better.rating,
13 | best.rating,
14 | )
15 | )
16 |
17 | print("\nAfter matches")
18 |
19 | for _ in range(10):
20 | better.beat(good)
21 | best.beat(better)
22 | print(
23 | "%7.2f, %7.2f, %7.2f"
24 | % (
25 | good.rating,
26 | better.rating,
27 | best.rating,
28 | )
29 | )
30 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py310, py311, py312
3 | isolated_build = True
4 | requires =
5 | tox-uv
6 |
7 | [gh-actions]
8 | python =
9 | 3.10: py310
10 | 3.11: py311
11 | 3.12: py312, lint
12 |
13 | [testenv]
14 | deps =
15 | pytest
16 | pytest-cov
17 | pytest-benchmark
18 | allowlist_externals =
19 | uv
20 | commands =
21 | uv pip install -e ".[datasets]"
22 | pytest {posargs:tests} --cov=elote --cov-report=term
23 |
24 | [testenv:lint]
25 | deps =
26 | ruff
27 | commands =
28 | ruff check .
29 |
30 | [testenv:format]
31 | deps =
32 | ruff
33 | commands =
34 | ruff format .
--------------------------------------------------------------------------------
/examples/bout_with_initialization.py:
--------------------------------------------------------------------------------
1 | from elote import EloCompetitor
2 |
3 | good = EloCompetitor(initial_rating=500)
4 | better = EloCompetitor(initial_rating=450)
5 | best = EloCompetitor(initial_rating=400)
6 |
7 | print("Starting ratings:")
8 | print(
9 | "%7.2f, %7.2f, %7.2f"
10 | % (
11 | good.rating,
12 | better.rating,
13 | best.rating,
14 | )
15 | )
16 |
17 | print("\nAfter matches")
18 |
19 | for _ in range(20):
20 | better.beat(good)
21 | best.beat(better)
22 | print(
23 | "%7.2f, %7.2f, %7.2f"
24 | % (
25 | good.rating,
26 | better.rating,
27 | best.rating,
28 | )
29 | )
30 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS =
7 | SPHINXBUILD = sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/.github/workflows/test-docs-build.yml:
--------------------------------------------------------------------------------
1 | name: "Pull Request Docs Check"
2 | on:
3 | - pull_request
4 |
5 | jobs:
6 | docs:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - name: Checkout
10 | uses: actions/checkout@v1
11 |
12 | - name: Set up Python
13 | uses: actions/setup-python@v4
14 | with:
15 | python-version: '3.10'
16 |
17 | - name: Install uv
18 | run: |
19 | curl -LsSf https://astral.sh/uv/install.sh | sh
20 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH
21 |
22 | - name: Install Dependencies
23 | run: |
24 | uv pip install --system -e ".[dev]"
25 |
26 | - name: Build Docs
27 | uses: ammaraskar/sphinx-action@master
28 | with:
29 | docs-folder: "docs/"
--------------------------------------------------------------------------------
/docs/source/blog_posts.rst:
--------------------------------------------------------------------------------
1 | Blog Posts
2 | ==========
3 |
4 | Here are some blog posts about Elote written by the author:
5 |
6 | - `Elote: A Python Package for Rating Systems `_ - Introduction to the library and its initial design
7 | - `Using Cursor for Library Maintenance `_ - How Cursor helps maintain Elote and other open source libraries
8 | - `Year's End: Looking Back at 2017 `_ - Reflections including Elote development
9 |
10 | These posts provide additional context about the development and maintenance of Elote, as well as real-world use cases and insights from the author.
--------------------------------------------------------------------------------
/examples/bout_with_ties.py:
--------------------------------------------------------------------------------
1 | from elote import EloCompetitor
2 |
3 | good = EloCompetitor(initial_rating=500)
4 | better = EloCompetitor(initial_rating=450)
5 | best = EloCompetitor(initial_rating=400)
6 | also_best = EloCompetitor(initial_rating=400)
7 |
8 | print("Starting ratings:")
9 | print(
10 | "%7.2f, %7.2f, %7.2f, %7.2f"
11 | % (
12 | good.rating,
13 | better.rating,
14 | best.rating,
15 | also_best.rating,
16 | )
17 | )
18 |
19 | print("\nAfter matches with ties")
20 |
21 | for _ in range(20):
22 | better.beat(good)
23 | better.lost_to(best)
24 | best.tied(also_best)
25 | print(
26 | "%7.2f, %7.2f, %7.2f, %7.2f"
27 | % (
28 | good.rating,
29 | better.rating,
30 | best.rating,
31 | also_best.rating,
32 | )
33 | )
34 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/examples/sample_arena.py:
--------------------------------------------------------------------------------
1 | from elote import LambdaArena, EloCompetitor
2 | import json
3 | import random
4 |
5 |
6 | # sample bout function which just compares the two inputs
7 | def func(a, b):
8 | if a == b:
9 | return None
10 | else:
11 | return a > b
12 |
13 |
14 | # Configure the EloCompetitor class with a moderate k_factor
15 | # Note: Using a more moderate k_factor (20) to prevent ratings from changing too drastically
16 | EloCompetitor.configure_class(k_factor=20)
17 |
18 | # Create arena with a higher initial rating for all competitors
19 | # Using 1200 as initial rating (standard chess starting rating) to prevent negative ratings
20 | matchups = [(random.randint(1, 10), random.randint(1, 10)) for _ in range(1000)]
21 | arena = LambdaArena(func, base_competitor=EloCompetitor, base_competitor_kwargs={"initial_rating": 1200})
22 | arena.tournament(matchups)
23 |
24 | print("Arena results:")
25 | print(json.dumps(arena.leaderboard(), indent=4))
26 |
--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
1 | name: "Master Docs Publication"
2 | on:
3 | push:
4 | branches: [ master ]
5 |
6 | jobs:
7 | docs:
8 | runs-on: ubuntu-latest
9 | steps:
10 | - name: Clone
11 | uses: actions/checkout@v1
12 |
13 | - name: Set up Python
14 | uses: actions/setup-python@v4
15 | with:
16 | python-version: '3.10'
17 |
18 | - name: Install uv
19 | run: |
20 | curl -LsSf https://astral.sh/uv/install.sh | sh
21 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH
22 |
23 | - name: Install Dependencies
24 | run: |
25 | uv pip install --system -e ".[dev]"
26 |
27 | - name: Build Docs
28 | uses: ammaraskar/sphinx-action@master
29 | with:
30 | docs-folder: "./docs/"
31 |
32 | - name: Deploy Docs
33 | uses: peaceiris/actions-gh-pages@v3
34 | with:
35 | github_token: ${{ secrets.GITHUB_TOKEN }}
36 | publish_dir: "./docs/build/html/"
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | Copyright © 2024 Will McGinnis
3 |
4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5 |
6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7 |
8 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/.github/workflows/test-suite.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: "Test Suite and Linting"
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | pull_request:
10 | branches: [ master ]
11 |
12 | jobs:
13 | test:
14 | runs-on: ubuntu-latest
15 |
16 | steps:
17 | - name: Checkout code
18 | uses: actions/checkout@v3
19 |
20 | - name: Set up Python
21 | uses: actions/setup-python@v4
22 | with:
23 | python-version: '3.10'
24 |
25 | - name: Install uv
26 | run: |
27 | curl -LsSf https://astral.sh/uv/install.sh | sh
28 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH
29 |
30 | - name: Install dependencies
31 | run: |
32 | uv pip install --system -e ".[dev,datasets]"
33 |
34 | - name: Run tests
35 | run: |
36 | make test-all
37 |
38 | - name: Run linting
39 | run: |
40 | make lint
41 |
--------------------------------------------------------------------------------
/tests/test_DWZCompetitor.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from elote import DWZCompetitor
3 |
4 |
5 | class TestDWZ(unittest.TestCase):
6 | def test_Improvement(self):
7 | initial_rating = 100
8 | player1 = DWZCompetitor(initial_rating=initial_rating)
9 |
10 | # if player1 beats someone with a high rating, their rating should go up.
11 | for _ in range(10):
12 | player2 = DWZCompetitor(initial_rating=800)
13 | player1.beat(player2)
14 | self.assertGreater(player1.rating, initial_rating)
15 | initial_rating = player1.rating
16 |
17 | def test_Decay(self):
18 | initial_rating = 800
19 | player1 = DWZCompetitor(initial_rating=initial_rating)
20 |
21 | # if player1 beats someone with a high rating, their rating should go up.
22 | for _ in range(10):
23 | player2 = DWZCompetitor(initial_rating=100)
24 | player2.beat(player1)
25 | self.assertLess(player1.rating, initial_rating)
26 | initial_rating = player1.rating
27 |
28 | def test_Expectation(self):
29 | player1 = DWZCompetitor(initial_rating=1000)
30 | player2 = DWZCompetitor(initial_rating=100)
31 | self.assertGreater(player1.expected_score(player2), player2.expected_score(player1))
32 |
--------------------------------------------------------------------------------
/tests/test_ECFCompetitor.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from elote import ECFCompetitor
3 |
4 |
5 | class TestECF(unittest.TestCase):
6 | def test_Improvement(self):
7 | initial_rating = 100
8 | player1 = ECFCompetitor(initial_rating=initial_rating)
9 |
10 | # if player1 beats someone with a high rating, their rating should go up.
11 | for _ in range(10):
12 | player2 = ECFCompetitor(initial_rating=800)
13 | player1.beat(player2)
14 | self.assertGreater(player1.rating, initial_rating)
15 | initial_rating = player1.rating
16 |
17 | def test_Decay(self):
18 | initial_rating = 800
19 | player1 = ECFCompetitor(initial_rating=initial_rating)
20 |
21 | # if player1 beats someone with a high rating, their rating should go up.
22 | for _ in range(10):
23 | player2 = ECFCompetitor(initial_rating=100)
24 | player2.beat(player1)
25 | self.assertLess(player1.rating, initial_rating)
26 | initial_rating = player1.rating
27 |
28 | def test_Expectation(self):
29 | player1 = ECFCompetitor(initial_rating=1000)
30 | player2 = ECFCompetitor(initial_rating=100)
31 | self.assertGreater(player1.expected_score(player2), player2.expected_score(player1))
32 |
--------------------------------------------------------------------------------
/examples/persist_state_arena.py:
--------------------------------------------------------------------------------
1 | from elote import LambdaArena, GlickoCompetitor
2 | import json
3 | import random
4 |
5 |
6 | # sample bout function which just compares the two inputs
7 | def func(a, b):
8 | if a == b:
9 | return None
10 | else:
11 | return a > b
12 |
13 |
14 | # start scoring, stop and save state
15 | matchups = [(random.randint(1, 10), random.randint(1, 10)) for _ in range(10)]
16 | arena = LambdaArena(func, base_competitor=GlickoCompetitor)
17 | arena.tournament(matchups)
18 | print("Arena results:")
19 | print(json.dumps(arena.leaderboard(), indent=4))
20 |
21 | # Export the state and take a copy of the mapping so the snapshot is detached from the arena
22 | # (a plain dict comprehension avoids deepcopy issues with non-serializable types)
23 | saved_state = {k: v for k, v in arena.export_state().items()}
24 |
25 | # Create a new arena with the saved state
26 | matchups = [(random.randint(1, 10), random.randint(1, 10)) for _ in range(100)]
27 | new_arena = LambdaArena(func, base_competitor=GlickoCompetitor)
28 |
29 | # Use from_state to recreate competitors
30 | for k, v in saved_state.items():
31 | new_arena.competitors[k] = GlickoCompetitor.from_state(v)
32 |
33 | # Run more matches
34 | new_arena.tournament(matchups)
35 | print("Arena results:")
36 | print(json.dumps(new_arena.leaderboard(), indent=4))
37 |
--------------------------------------------------------------------------------
/docs/source/api/competitors.rst:
--------------------------------------------------------------------------------
1 | Competitors API Reference
2 | =========================
3 |
4 | This page provides detailed API documentation for all competitor classes in Elote.
5 |
6 | Base Competitor
7 | ---------------
8 |
9 | .. automodule:: elote.competitors.base
10 | :members:
11 | :undoc-members:
12 | :show-inheritance:
13 | :special-members: __init__
14 |
15 | Elo Competitor
16 | --------------
17 |
18 | .. automodule:: elote.competitors.elo
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 | :special-members: __init__
23 |
24 | Glicko Competitor
25 | -----------------
26 |
27 | .. automodule:: elote.competitors.glicko
28 | :members:
29 | :undoc-members:
30 | :show-inheritance:
31 | :special-members: __init__
32 |
33 | DWZ Competitor
34 | --------------
35 |
36 | .. automodule:: elote.competitors.dwz
37 | :members:
38 | :undoc-members:
39 | :show-inheritance:
40 | :special-members: __init__
41 |
42 | ECF Competitor
43 | --------------
44 |
45 | .. automodule:: elote.competitors.ecf
46 | :members:
47 | :undoc-members:
48 | :show-inheritance:
49 | :special-members: __init__
50 |
51 | Blended Competitor
52 | ------------------
53 |
54 | .. automodule:: elote.competitors.ensemble
55 | :members:
56 | :undoc-members:
57 | :show-inheritance:
58 | :special-members: __init__
--------------------------------------------------------------------------------
/tests/test_EloCompetitor.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from elote import EloCompetitor, GlickoCompetitor
3 | from elote.competitors.base import MissMatchedCompetitorTypesException
4 |
5 |
6 | class TestElo(unittest.TestCase):
7 | def test_Improvement(self):
8 | initial_rating = 100
9 | player1 = EloCompetitor(initial_rating=initial_rating)
10 |
11 | # if player1 beats someone with a high rating, their rating should go up.
12 | for _ in range(10):
13 | player2 = EloCompetitor(initial_rating=800)
14 | player1.beat(player2)
15 | self.assertGreater(player1.rating, initial_rating)
16 | initial_rating = player1.rating
17 |
18 | def test_Decay(self):
19 | initial_rating = 800
20 | player1 = EloCompetitor(initial_rating=initial_rating)
21 |
22 | # if player1 beats someone with a high rating, their rating should go up.
23 | for _ in range(10):
24 | player2 = EloCompetitor(initial_rating=100)
25 | player2.beat(player1)
26 | self.assertLess(player1.rating, initial_rating)
27 | initial_rating = player1.rating
28 |
29 | def test_Expectation(self):
30 | player1 = EloCompetitor(initial_rating=1000)
31 | player2 = EloCompetitor(initial_rating=100)
32 | self.assertGreater(player1.expected_score(player2), player2.expected_score(player1))
33 |
34 | def test_Exceptions(self):
35 | player1 = EloCompetitor(initial_rating=1000)
36 | player2 = GlickoCompetitor(initial_rating=100)
37 |
38 | with self.assertRaises(MissMatchedCompetitorTypesException):
39 | player1.verify_competitor_types(player2)
40 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | v1.2.0
2 | ======
3 |
4 | * Began adding type hints
5 | * Fixed DWZ to correctly calculate the development coefficient based on the competitor's age at the time of the match.
6 |
7 | v1.1.0
8 | ======
9 |
10 | * Glicko and Glicko-2 now properly handle time since last match
11 | * Bugfix in evaluation of draws in benchmarking
12 |
13 | v1.0.0
14 | ======
15 |
16 | * [] Added end to end examples using the chess and cfb datasets
17 | * [] Added Glicko-2 Competitor
18 | * [] Added TrueSkill Competitor
19 | * [] Added datasets module to read sample data for development
20 | * [] Added a visualization module to plot rating systems performance
21 | * [] Added a benchmark module to compare rating systems
22 | * [] Added scipy optimization to find optimal thresholds for rating systems
23 | * [CORE-3] Standardized the `Competitor` serialization formats
24 | * [CORE-1] Fixed minimum rating enforcement across all competitor classes
25 | * [CORE-1] Updated documentation examples to use higher initial ratings
26 | * [CORE-1] Made `reset` method abstract in `BaseCompetitor` class
27 | * [CORE-1] Updated ECFCompetitor default initial rating from 40 to 100
28 | * [CORE-1] Fixed benchmark tests to prevent negative ratings
29 |
30 | v0.1.0
31 | ======
32 |
33 | * Many bugfixes
34 | * Improved testing and documentation
35 | * Added notion of history object and bout objects for arenas to track progress
36 |
37 | v0.0.3,4 and 5
38 | ==============
39 |
40 | * No change, debugging CI
41 |
42 | v0.0.2
43 | ======
44 |
45 | * bugfixes in glicko expected score
46 | * bugfixes in elo score that wouldn't allow ratings to drop properly
47 | * added some testing and CI
48 |
49 | v0.0.1
50 | ======
51 |
52 | * initial release
53 | * lambda arena added
54 | * elo competitor added
55 | * glicko competitor added
--------------------------------------------------------------------------------
/.github/workflows/pypi-publish.yml:
--------------------------------------------------------------------------------
1 | name: "PyPI Packaging"
2 | on:
3 | release:
4 | types: created
5 |
6 | jobs:
7 | build_and_publish:
8 | runs-on: ubuntu-latest
9 | permissions:
10 | # IMPORTANT: this permission is mandatory for Trusted Publishing
11 | id-token: write
12 | steps:
13 | - name: Clone
14 | uses: actions/checkout@v3
15 |
16 | - name: Set up Python
17 | uses: actions/setup-python@v4
18 | with:
19 | python-version: '3.11'
20 |
21 | - name: Install uv
22 | run: |
23 | python -m pip install --upgrade pip
24 | pip install uv
25 |
26 | - name: Setup virtual environment
27 | run: |
28 | uv venv
29 | echo "VIRTUAL_ENV=$(pwd)/.venv" >> $GITHUB_ENV
30 | echo "$(pwd)/.venv/bin" >> $GITHUB_PATH
31 |
32 | - name: Install dependencies
33 | run: |
34 | uv pip install -e ".[dev]"
35 | uv pip install build
36 |
37 | - name: Build source distribution
38 | run: |
39 | # Clean the build directories first
40 | rm -rf build/
41 | rm -rf dist/
42 | rm -rf .eggs/
43 |
44 | # Find and remove egg-info directories, excluding the virtual environment
45 | find . -path ./.venv -prune -o -name '*.egg-info' -type d -exec rm -rf {} \; 2>/dev/null || true
46 | find . -path ./.venv -prune -o -name '*.egg' -type f -exec rm -f {} \; 2>/dev/null || true
47 | find . -path ./.venv -prune -o -name '__pycache__' -type d -exec rm -rf {} \; 2>/dev/null || true
48 |
49 | # Build only the source distribution
50 | python -m build --sdist
51 |
52 | - name: Publish package to PyPI
53 | uses: pypa/gh-action-pypi-publish@release/v1
54 | with:
55 | packages-dir: dist/
--------------------------------------------------------------------------------
/.cursor/rules/elote_testing.mdc:
--------------------------------------------------------------------------------
1 | ---
2 | description: how to use the makefile to run tests
3 | globs: *
4 | ---
5 | # Elote Testing Standards
6 |
7 | ## Running Tests
8 |
9 | ### Using Make Commands
10 | - Always use `make test` to run the test suite
11 | - Use `make test-cov` to run tests with coverage reports
12 | - Use `make test-all` to run tests on all supported Python versions using tox
13 |
14 | ### Test Commands
15 | ```bash
16 | # Run the standard test suite
17 | make test
18 |
19 | # Run tests with coverage
20 | make test-cov
21 |
22 | # Run tests on all supported Python versions
23 | make test-all
24 |
25 | # Run a specific test file
26 | make test PYTEST_ARGS="tests/test_unified_interface.py"
27 |
28 | # Run a specific test class
29 | make test PYTEST_ARGS="tests/test_unified_interface.py::TestUnifiedInterface"
30 |
31 | # Run a specific test method
32 | make test PYTEST_ARGS="tests/test_unified_interface.py::TestUnifiedInterface::test_base_methods_elo"
33 |
34 | # Run a specific example:
35 | make run-example EXAMPLE=use_cases/chess_w_lib.py
36 | ```
37 |
38 | ### Test Environment
39 | - Tests are run using `uv run pytest` through the Makefile
40 | - The test environment is automatically set up with the correct dependencies
41 | - Always run tests in a clean environment to avoid dependency conflicts
42 |
43 | ### Continuous Integration
44 | - All tests must pass in CI before merging
45 | - Coverage should not decrease with new code
46 | - New features should include corresponding tests
47 |
48 | ### Benchmarks
49 | - Use `make benchmark` to run performance benchmarks
50 | - Benchmark results are stored in the `benchmark_results` directory
51 | - Performance regressions should be addressed before merging
52 |
53 | ### Linting and Formatting
54 | - Use `make lint` to check code quality
55 | - Use `make lint-fix` to automatically fix linting issues
56 | - Use `make format` to format code according to project standards
--------------------------------------------------------------------------------
/.cursor/rules/pytest_standards.mdc:
--------------------------------------------------------------------------------
1 | ---
2 | description:
3 | globs: tests/*
4 | ---
5 | # Pytest Standards
6 |
7 | ## Test Organization
8 | - All test files should be named with the prefix `test_` (e.g., `test_module.py`)
9 | - Test classes should be named with the prefix `Test` (e.g., `TestClassName`)
10 | - Test methods should be named with the prefix `test_` (e.g., `test_function_name`)
11 | - Tests should be organized in the same directory structure as the source code they test
12 |
13 | ## Test Structure
14 | - Each test should focus on testing a single functionality
15 | - Use descriptive test names that explain what is being tested
16 | - Group related tests in test classes
17 | - Use appropriate fixtures for test setup and teardown
18 | - Avoid test interdependence - tests should be able to run in any order
19 |
20 | ## Assertions
21 | - Use pytest's built-in assertions instead of Python's `assert` statement when possible
22 | - Use appropriate assertion methods for the type being tested (e.g., `assert_almost_equal` for floats)
23 | - Include descriptive error messages in assertions to aid debugging
24 |
25 | ## Fixtures
26 | - Use fixtures for common setup and teardown operations
27 | - Define fixtures at the appropriate scope (function, class, module, or session)
28 | - Use parameterized fixtures for testing multiple inputs
29 | - Use conftest.py for sharing fixtures across multiple test files
30 |
31 | ## Coverage
32 | - Aim for at least 80% code coverage
33 | - Write tests for both success and failure cases
34 | - Test edge cases and boundary conditions
35 | - Use `pytest-cov` to generate coverage reports
36 |
37 | ## Best Practices
38 | - Keep tests fast and independent
39 | - Avoid mocking unless necessary
40 | - Use markers to categorize tests (e.g., `@pytest.mark.slow`)
41 | - Use parametrize for testing multiple inputs
42 | - Write tests before or alongside code (TDD approach)
43 | - Run the full test suite before committing changes
44 |
45 | ## Commands
46 | - Run tests with `pytest`
47 | - Generate coverage reports with `pytest --cov=elote`
48 | - Run specific tests with `pytest path/to/test_file.py::TestClass::test_method`
--------------------------------------------------------------------------------
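A hypothetical test sketch illustrating the parametrize guidance in pytest_standards.mdc above; this file is not part of the repository, it simply widens the expectation check from tests/test_EloCompetitor.py to several rating gaps:

```python
import pytest
from elote import EloCompetitor


@pytest.mark.parametrize("high, low", [(1000, 100), (1500, 1200), (800, 400)])
def test_expected_score_favors_higher_rating(high, low):
    """Higher-rated competitors should have the larger expected score."""
    stronger = EloCompetitor(initial_rating=high)
    weaker = EloCompetitor(initial_rating=low)
    assert stronger.expected_score(weaker) > weaker.expected_score(stronger)
```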
/docs/source/competitors.rst:
--------------------------------------------------------------------------------
1 | Competitors
2 | ===========
3 |
4 | Elo Competitor
5 | --------------
6 |
7 | .. autoclass:: elote.competitors.elo.EloCompetitor
8 | :members: export_state,expected_score,beat,tied,rating,to_json,from_json
9 |
10 | Glicko Competitor
11 | -----------------
12 |
13 | .. autoclass:: elote.competitors.glicko.GlickoCompetitor
14 | :members: export_state,expected_score,beat,tied,rating,to_json,from_json
15 |
16 | DWZ Competitor
17 | --------------
18 |
19 | .. autoclass:: elote.competitors.dwz.DWZCompetitor
20 | :members: export_state,expected_score,beat,tied,rating,to_json,from_json
21 |
22 | ECF Competitor
23 | --------------
24 |
25 | .. autoclass:: elote.competitors.ecf.ECFCompetitor
26 | :members: export_state,expected_score,beat,tied,rating,to_json,from_json
27 |
28 | BlendedCompetitor
29 | -----------------
30 |
31 | .. autoclass:: elote.competitors.ensemble.BlendedCompetitor
32 | :members: export_state,expected_score,beat,tied,rating,to_json,from_json
33 |
34 | Serialization
35 | -------------
36 |
37 | All competitor types in Elote support a standardized serialization format that allows for saving and loading competitor states.
38 | The serialization format includes the following fields:
39 |
40 | - **type**: The class name of the competitor
41 | - **version**: The version of the serialization format
42 | - **created_at**: Timestamp when the state was exported
43 | - **id**: A unique identifier for this state export
44 | - **parameters**: The parameters used to initialize the competitor
45 | - **state**: The current state variables of the competitor
46 | - **class_vars**: Class variables for backward compatibility
47 |
48 | To serialize a competitor to JSON:
49 |
50 | .. code-block:: python
51 |
52 | # Create a competitor
53 | competitor = EloCompetitor(initial_rating=1500)
54 |
55 | # Serialize to JSON
56 | json_str = competitor.to_json()
57 |
58 | To deserialize a competitor from JSON:
59 |
60 | .. code-block:: python
61 |
62 | # Deserialize from JSON
63 | competitor = EloCompetitor.from_json(json_str)
64 |
65 | For backward compatibility, the serialized format also includes flattened parameters and state variables at the top level of the dictionary.
66 |
--------------------------------------------------------------------------------
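A short round-trip sketch of the serialization described in competitors.rst above; the exact field values depend on the competitor, but to_json, from_json, and export_state are the documented entry points:

```python
from elote import EloCompetitor

competitor = EloCompetitor(initial_rating=1500)
competitor.beat(EloCompetitor(initial_rating=1500))

# Serialize to the standardized JSON format and restore an equivalent competitor.
json_str = competitor.to_json()
restored = EloCompetitor.from_json(json_str)
print(restored.rating)

# export_state() exposes the same information as a plain dictionary, which is how
# examples/persist_state_arena.py snapshots every competitor in an arena.
state = competitor.export_state()
```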
/examples/trueskill_tournament.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Example demonstrating the use of TrueSkill in a tournament setting.
4 |
5 | This example shows how to use the TrueSkill rating system with the LambdaArena
6 | to run a tournament and rank competitors.
7 | """
8 |
9 | import random
10 | import json
11 | from elote import TrueSkillCompetitor, LambdaArena
12 |
13 |
14 | def main():
15 | """Run the TrueSkill tournament example."""
16 |
17 | # Create a comparison function that compares two numbers
18 | # Returns True if a beats b (i.e., a > b)
19 | def comparison_func(a, b):
20 | return a > b
21 |
22 | # Create a LambdaArena with TrueSkill competitors
23 | arena = LambdaArena(
24 | comparison_func,
25 | base_competitor=TrueSkillCompetitor,
26 | base_competitor_kwargs={"initial_mu": 25.0, "initial_sigma": 8.333},
27 | )
28 |
29 | # Generate 1000 random matchups between numbers 1-10
30 | matchups = [(random.randint(1, 10), random.randint(1, 10)) for _ in range(1000)]
31 |
32 | # Run the tournament
33 | print("Running tournament with 1000 matchups...")
34 | arena.tournament(matchups)
35 |
36 | # Display the leaderboard
37 | print("\nFinal rankings:")
38 | leaderboard = arena.leaderboard()
39 | print(json.dumps(leaderboard, indent=4))
40 |
41 | # Display detailed competitor information
42 | print("\nDetailed competitor information:")
43 | for entry in leaderboard:
44 | competitor_id = entry["competitor"]
45 | rating = entry["rating"]
46 | competitor = arena.competitors[competitor_id]
47 | print(f"Competitor {competitor_id}: rating={rating:.2f}, mu={competitor.mu:.2f}, sigma={competitor.sigma:.2f}")
48 |
49 | # Calculate match quality between top competitors
50 | if len(leaderboard) >= 2:
51 | top1_id = leaderboard[0]["competitor"]
52 | top2_id = leaderboard[1]["competitor"]
53 | top1 = arena.competitors[top1_id]
54 | top2 = arena.competitors[top2_id]
55 | match_quality = TrueSkillCompetitor.match_quality(top1, top2)
56 | print(f"\nMatch quality between top two competitors ({top1_id} vs {top2_id}): {match_quality:.4f}")
57 |
58 |
59 | if __name__ == "__main__":
60 | main()
61 |
--------------------------------------------------------------------------------
/examples/glicko_arena.py:
--------------------------------------------------------------------------------
1 | from elote import LambdaArena, GlickoCompetitor
2 | import json
3 | import random
4 | from datetime import datetime, timedelta
5 |
6 |
7 | # sample bout function which just compares the two inputs
8 | def func(a, b):
9 | if a == b:
10 | return None
11 | else:
12 | return a > b
13 |
14 |
15 | # Create initial time and a list of matchups with timestamps spread over a month
16 | initial_time = datetime(2024, 1, 1)
17 | matchups_with_time = []
18 | for _i in range(1000):
19 | # Random matchup
20 | a = random.randint(1, 10)
21 | b = random.randint(1, 10)
22 | # Random time within the month (0-30 days from initial time)
23 | match_time = initial_time + timedelta(
24 | days=random.randint(0, 30), hours=random.randint(0, 23), minutes=random.randint(0, 59)
25 | )
26 | matchups_with_time.append((a, b, match_time))
27 |
28 | # Sort matchups by time to ensure chronological order
29 | matchups_with_time.sort(key=lambda x: x[2])
30 |
31 | # Create arena with GlickoCompetitor and set initial time
32 | arena = LambdaArena(
33 | func,
34 | base_competitor=GlickoCompetitor,
35 | base_competitor_kwargs={"initial_rating": 1500, "initial_rd": 350, "initial_time": initial_time},
36 | )
37 |
38 | # Process matches in chronological order
39 | for a, b, match_time in matchups_with_time:
40 | # Use matchup() instead of tournament() to handle match times
41 | if func(a, b): # If a wins
42 | arena.matchup(a, b, match_time=match_time)
43 | else: # If b wins
44 | arena.matchup(b, a, match_time=match_time)
45 |
46 | print("\nArena results after one month of matches:")
47 | print("(Notice how less active players have higher RD values)")
48 | leaderboard = arena.leaderboard()
49 |
50 | # Convert leaderboard list to a dictionary and add RD values and last activity times
51 | leaderboard_dict = {}
52 | for entry in leaderboard:
53 | player_id = entry["competitor"]
54 | leaderboard_dict[player_id] = entry
55 | competitor = arena.competitors.get(player_id)
56 | if competitor:
57 | leaderboard_dict[player_id]["rd"] = round(competitor.rd, 2)
58 | leaderboard_dict[player_id]["last_activity"] = competitor._last_activity.strftime("%Y-%m-%d %H:%M")
59 |
60 | print(json.dumps(leaderboard_dict, indent=4))
61 |
--------------------------------------------------------------------------------
/elote/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Datasets module for elote.
3 |
4 | This module provides a common interface for getting datasets and splitting them into train and test sets
5 | for evaluating different rating algorithms.
6 | """
7 |
8 | # Core datasets - always available
9 | from elote.datasets.base import BaseDataset, DataSplit
10 | from elote.datasets.synthetic import SyntheticDataset
11 | from typing import Any
12 |
13 | # Base __all__ list with always-available exports
14 | __all__ = [
15 | "BaseDataset",
16 | "DataSplit",
17 | "SyntheticDataset",
18 | ]
19 |
20 | # Optional datasets - only import if dependencies are available
21 | _optional_imports = {}
22 |
23 | # Try to import ChessDataset
24 | try:
25 | from elote.datasets.chess import ChessDataset
26 | _optional_imports["ChessDataset"] = ChessDataset
27 | __all__.append("ChessDataset")
28 | except ImportError as e:
29 | _optional_imports["ChessDataset"] = e
30 |
31 | # Try to import CollegeFootballDataset
32 | try:
33 | from elote.datasets.football import CollegeFootballDataset
34 | _optional_imports["CollegeFootballDataset"] = CollegeFootballDataset
35 | __all__.append("CollegeFootballDataset")
36 | except ImportError as e:
37 | _optional_imports["CollegeFootballDataset"] = e
38 |
39 |
40 | def __getattr__(name: str) -> Any:
41 | """Handle access to optional imports with helpful error messages."""
42 | if name in _optional_imports:
43 | obj = _optional_imports[name]
44 | if isinstance(obj, ImportError):
45 | if name == "ChessDataset":
46 | raise ImportError(
47 | f"ChessDataset requires optional dependencies that are not installed.\n"
48 | f"Install them with: pip install 'elote[datasets]' or pip install python-chess pyzstd\n"
49 | f"Original error: {obj}"
50 | )
51 | elif name == "CollegeFootballDataset":
52 | raise ImportError(
53 | f"CollegeFootballDataset requires optional dependencies that are not installed.\n"
54 | f"Install them with: pip install 'elote[datasets]' or pip install 'sportsdataverse[all]'\n"
55 | f"Original error: {obj}"
56 | )
57 | return obj
58 | raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
59 |
--------------------------------------------------------------------------------
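A small sketch of how the optional-import fallback above looks from the caller's side, assuming the optional datasets extras are not installed:

```python
# SyntheticDataset is always importable; the optional datasets are guarded.
from elote.datasets import SyntheticDataset  # noqa: F401

try:
    from elote.datasets import ChessDataset  # noqa: F401
except ImportError as err:
    # Raised by the module-level __getattr__ above when python-chess/pyzstd
    # are missing, pointing at `pip install 'elote[datasets]'`.
    print(err)
```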
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. elote documentation master file, created by
2 | sphinx-quickstart on Sat Mar 21 13:38:36 2020.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Elote: Elegant Rating Systems in Python
7 | =======================================
8 |
9 | **Elote** is a powerful Python library for implementing and comparing rating systems. Whether you're ranking chess players, sports teams, or prioritizing features in your product backlog, Elote provides a simple, elegant API for all your competitive ranking needs.
10 |
11 | Rating systems allow you to rank competitors based on their performance in head-to-head matchups. The most famous example is the Elo rating system used in chess, but these systems have applications far beyond sports:
12 |
13 | - Ranking products based on A/B comparisons
14 | - Prioritizing features through pairwise voting
15 | - Creating recommendation systems
16 | - Matchmaking in games and competitions
17 | - Collaborative filtering and ranking
18 |
19 | Elote makes implementing these systems simple and intuitive, with a clean API that handles all the mathematical complexity for you.
20 |
21 | .. toctree::
22 | :maxdepth: 1
23 | :caption: Getting Started
24 |
25 | getting_started
26 | installation
27 | quickstart
28 |
29 | .. toctree::
30 | :maxdepth: 1
31 | :caption: Core Concepts
32 |
33 | competitors
34 | arenas
35 | serialization
36 |
37 | .. toctree::
38 | :maxdepth: 1
39 | :caption: Rating Systems
40 |
41 | rating_systems/elo
42 | rating_systems/glicko
43 | rating_systems/ecf
44 | rating_systems/dwz
45 | rating_systems/ensemble
46 | rating_systems/comparison
47 |
48 | .. toctree::
49 | :maxdepth: 1
50 | :caption: Examples
51 |
52 | examples
53 | advance_example
54 | use_cases/product_ranking
55 | use_cases/matchmaking
56 | use_cases/feature_prioritization
57 |
58 | .. toctree::
59 | :maxdepth: 1
60 | :caption: API Reference
61 |
62 | api/competitors
63 | api/arenas
64 |
65 | .. toctree::
66 | :maxdepth: 1
67 | :caption: Resources
68 |
69 | blog_posts
70 |
71 | .. toctree::
72 | :maxdepth: 1
73 | :caption: Development
74 |
75 | contributing
76 |
77 | Indices and tables
78 | ==================
79 |
80 | * :ref:`genindex`
81 | * :ref:`modindex`
82 | * :ref:`search`
83 |
--------------------------------------------------------------------------------
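As a quick taste of the head-to-head workflow index.rst describes, a condensed variant of examples/prediction.py (the ratings are chosen arbitrarily):

```python
from elote import EloCompetitor

challenger = EloCompetitor(initial_rating=1200)
incumbent = EloCompetitor(initial_rating=1300)

# Win probability before and after a single head-to-head result.
print(incumbent.expected_score(challenger))
challenger.beat(incumbent)
print(incumbent.expected_score(challenger))
```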
/.cursor/rules/python_standards.mdc:
--------------------------------------------------------------------------------
1 | ---
2 | description:
3 | globs: *.py
4 | ---
5 | # Python Coding Standards
6 |
7 | Never include test-specific code in the library implementation. We should never check whether we are running in a test context to modify the output; the library code should behave exactly the same in test and non-test use cases.
8 |
9 | ## Code Style
10 | - Follow PEP 8 style guide for Python code
11 | - Use 4 spaces for indentation (no tabs)
12 | - Maximum line length of 88 characters (Black default)
13 | - Use snake_case for variables, functions, and methods
14 | - Use CamelCase for classes
15 | - Use UPPER_CASE for constants
16 | - Add a blank line at the end of each file
17 |
18 | ## Imports
19 | - Group imports in the following order:
20 | 1. Standard library imports
21 | 2. Related third-party imports
22 | 3. Local application/library specific imports
23 | - Use absolute imports when possible
24 | - Avoid wildcard imports (`from module import *`)
25 | - Use import aliases for long module names
26 |
27 | ## Documentation
28 | - Document all public modules, classes, methods, and functions
29 | - Use docstrings that follow the Google style guide
30 | - Include type hints for function parameters and return values
31 | - Document parameters, return values, and exceptions raised
32 |
33 | ## Error Handling
34 | - Use specific exception types instead of generic ones
35 | - Handle exceptions at the appropriate level
36 | - Use context managers (`with` statements) for resource management
37 | - Avoid catching exceptions without proper handling
38 |
39 | ## Code Organization
40 | - Keep functions and methods short and focused
41 | - Follow the Single Responsibility Principle
42 | - Use classes to encapsulate related functionality
43 | - Separate concerns into different modules
44 |
45 | ## Testing
46 | - Write unit tests for all code
47 | - Use meaningful test names
48 | - Test both normal and edge cases
49 | - Mock external dependencies in tests
50 |
51 | ## Performance
52 | - Prefer list/dict/set comprehensions over loops when appropriate
53 | - Use generators for large data sets
54 | - Profile code before optimizing
55 | - Consider using NumPy/Pandas for numerical operations
56 |
57 | ## Tools
58 | - Use Black for code formatting
59 | - Use Ruff for linting and static analysis
60 | - Use mypy for type checking
61 | - Use isort for import sorting
62 |
63 | ## Version Control
64 | - Write meaningful commit messages
65 | - Keep commits focused on a single change
66 | - Use feature branches for development
67 | - Review code before merging
--------------------------------------------------------------------------------
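A hypothetical helper (not part of elote) showing the Google-style docstring and type-hint conventions listed in python_standards.mdc above:

```python
def upset_rate(upsets: int, total_matches: int) -> float:
    """Return the fraction of matches won by the lower-rated competitor.

    Args:
        upsets: Number of matches won by the lower-rated competitor.
        total_matches: Total number of matches played.

    Returns:
        The upset rate as a float in the range [0, 1].

    Raises:
        ValueError: If no matches have been played.
    """
    if total_matches == 0:
        raise ValueError("total_matches must be positive")
    return upsets / total_matches
```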
/docs/source/getting_started.rst:
--------------------------------------------------------------------------------
1 | Getting Started
2 | ===============
3 |
4 | To install the latest release:
5 |
6 | .. code-block::
7 |
8 | pip install elote
9 |
10 | To install the bleeding edge version, clone the repository and run:
11 |
12 | .. code-block::
13 |
14 | pip install -e .
15 |
16 |
17 | Basic Usage
18 | -----------
19 |
20 | The most basic object in ``elote`` is a competitor. To start with, let's take a look at ``EloCompetitor``. Let's make 3
21 | objects, one for each of 3 players in a game:
22 |
23 | .. code-block:: python
24 |
25 | from elote import EloCompetitor
26 |
27 | good_player = EloCompetitor(initial_rating=1200)
28 | better_player = EloCompetitor(initial_rating=1200)
29 | best_player = EloCompetitor(initial_rating=1200)
30 |
31 | print('Starting ratings:')
32 | print('%7.2f, %7.2f, %7.2f' % (good_player.rating, better_player.rating, best_player.rating, ))
33 |
34 | All we do is initialize them, and print out their starting ratings. Rating is our measure of how good we think a
35 | competitor is with the information at hand. Here we don't really have any information, so they are all rated the same:
36 |
37 | .. code-block::
38 |
39 | Starting ratings:
40 | 1200.00, 1200.00, 1200.00
41 |
42 | To make things a little more interesting, let's do 20 ``matches``. A ``match`` is an instance where two players compete,
43 | and one of them wins. This gives us some new information to update our ratings with. For each of the matches we simulate
44 | we will have ``better_player`` beat ``good_player`` or ``best_player`` beat ``better_player``. At each iteration, we will
45 | print out the ratings to get an idea of how they change over time.
46 |
47 |
48 | .. code-block:: python
49 |
50 | print('\nAfter matches')
51 | for _ in range(10):
52 | better_player.beat(good_player)
53 | best_player.beat(better_player)
54 | print('%7.2f, %7.2f, %7.2f' % (good_player.rating, better_player.rating, best_player.rating, ))
55 |
56 | .. code-block::
57 |
58 | After matches
59 | good, better, best
60 | 1184.00, 1199.26, 1216.74
61 | 1168.70, 1198.66, 1232.64
62 | 1154.08, 1198.18, 1247.75
63 | 1140.10, 1197.79, 1262.11
64 | 1126.73, 1197.49, 1275.78
65 | 1113.95, 1197.25, 1288.80
66 | 1101.71, 1197.08, 1301.21
67 | 1089.99, 1196.95, 1313.05
68 | 1078.77, 1196.87, 1324.36
69 | 1068.01, 1196.81, 1335.18
70 |
71 | So as you can see, over time, the scores gradually update to reflect our hierarchy.
72 |
73 | For more information on the types of competitors available, or the different configuration options, please see the detailed API
74 | docs on the competitors page.
--------------------------------------------------------------------------------
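One of the configuration options mentioned at the end of getting_started.rst, sketched from examples/sample_arena.py; k_factor controls how far Elo ratings move after each match, and the value shown is only an example:

```python
from elote import EloCompetitor

# Class-level configuration, as used in examples/sample_arena.py.
EloCompetitor.configure_class(k_factor=20)

a = EloCompetitor(initial_rating=1200)
b = EloCompetitor(initial_rating=1200)
a.beat(b)
print(a.rating, b.rating)
```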
/.cursor/rules/sphinx_docs_standards.mdc:
--------------------------------------------------------------------------------
1 | ---
2 | description:
3 | globs: docs/*
4 | ---
5 | # Sphinx Documentation Standards
6 |
7 | ## Project Setup
8 | - Use `sphinx-quickstart` to initialize the documentation structure
9 | - Configure `conf.py` with appropriate project information
10 | - Use the ReadTheDocs theme for consistent styling
11 | - Enable necessary extensions (e.g., `autodoc`, `napoleon`, `viewcode`)
12 |
13 | ## Directory Structure
14 | - Keep documentation source files in the `docs/` directory
15 | - Organize documentation into logical sections (e.g., user guide, API reference)
16 | - Use a clear and consistent file naming convention
17 | - Include an `index.rst` file as the documentation entry point
18 |
19 | ## Documentation Style
20 | - Write in clear, concise language
21 | - Use present tense and active voice
22 | - Be consistent with terminology
23 | - Include examples where appropriate
24 | - Target the appropriate audience (users, developers, etc.)
25 |
26 | ## reStructuredText Formatting
27 | - Use proper heading hierarchy (=, -, ~, ^, ")
28 | - Use bullet lists for unordered items and numbered lists for sequences
29 | - Use code blocks with appropriate language for syntax highlighting
30 | - Use cross-references to link between documentation sections
31 | - Include images and diagrams where they add value
32 |
33 | ## API Documentation
34 | - Use autodoc to generate API documentation from docstrings
35 | - Document all public modules, classes, methods, and functions
36 | - Follow Google or NumPy docstring style consistently
37 | - Include type information for parameters and return values
38 | - Document exceptions that may be raised
39 |
40 | ## Examples and Tutorials
41 | - Include practical examples for common use cases
42 | - Provide step-by-step tutorials for complex operations
43 | - Ensure all examples are tested and working
44 | - Use `literalinclude` to include code examples from actual source files (see the example after this list)
45 |
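For example, a `literalinclude` directive that pulls a script into the docs might look like this (the path is illustrative):

```rst
.. literalinclude:: ../../examples/sample_bout.py
   :language: python
```
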
46 | ## Building and Testing
47 | - Build documentation locally before committing changes
48 | - Check for and fix all warnings during the build process
49 | - Verify that cross-references work correctly
50 | - Test documentation on different screen sizes
51 |
52 | ## Deployment
53 | - Configure automatic documentation builds on ReadTheDocs or GitHub Pages
54 | - Include a link to the documentation in the project README
55 | - Version documentation to match software releases
56 | - Provide a changelog or release notes section
57 |
58 | ## Maintenance
59 | - Keep documentation up-to-date with code changes
60 | - Review and update documentation during each release cycle
61 | - Address user feedback and questions in the documentation
62 | - Remove outdated or deprecated information
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 |
13 | import os
14 | import sys
15 |
16 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath("."))))
17 |
18 |
19 | # -- Project information -----------------------------------------------------
20 |
21 | project = "elote"
22 | copyright = "2020, Will McGinnis"
23 | author = "Will McGinnis"
24 |
25 | # The full version, including alpha/beta/rc tags
26 | try:
27 | # Try to get version from importlib.metadata (Python 3.8+)
28 | from importlib.metadata import version as get_version
29 |
30 | release = get_version("elote")
31 | except ImportError:
32 | # Fallback for older Python versions
33 | try:
34 | import pkg_resources
35 |
36 | release = pkg_resources.get_distribution("elote").version
37 | except Exception: # Replace bare except with specific exception type
38 | # Hardcoded fallback
39 | release = "0.1.0"
40 |
41 |
42 | # -- General configuration ---------------------------------------------------
43 |
44 | # Add any Sphinx extension modules here, as strings. They can be
45 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
46 | # ones.
47 | extensions = [
48 | "sphinx.ext.autodoc",
49 | "sphinx.ext.viewcode",
50 | "sphinx_rtd_dark_mode",
51 | "sphinxcontrib.googleanalytics",
52 | ]
53 |
54 | # Add any paths that contain templates here, relative to this directory.
55 | templates_path = ["_templates"]
56 |
57 | # List of patterns, relative to source directory, that match files and
58 | # directories to ignore when looking for source files.
59 | # This pattern also affects html_static_path and html_extra_path.
60 | exclude_patterns = []
61 | html_extra_path = ["CNAME"]
62 |
63 | # -- Options for HTML output -------------------------------------------------
64 |
65 | # The theme to use for HTML and HTML Help pages. See the documentation for
66 | # a list of builtin themes.
67 | #
68 | html_theme = "sphinx_rtd_theme"
69 |
70 | # Default to dark theme
71 | default_dark_mode = True
72 |
73 | # Google Analytics configuration
74 | googleanalytics_id = "G-Z43R9PWW0B"
75 | googleanalytics_enabled = True
76 |
77 | # Add any paths that contain custom static files (such as style sheets) here,
78 | # relative to this directory. They are copied after the builtin static files,
79 | # so a file named "default.css" will overwrite the builtin "default.css".
80 | html_static_path = ["_static"]
81 |
82 | autoclass_content = "both"
83 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
6 |
7 | ## Our Standards
8 |
9 | Examples of behavior that contributes to creating a positive environment include:
10 |
11 | * Using welcoming and inclusive language
12 | * Being respectful of differing viewpoints and experiences
13 | * Gracefully accepting constructive criticism
14 | * Focusing on what is best for the community
15 | * Showing empathy towards other community members
16 |
17 | Examples of unacceptable behavior by participants include:
18 |
19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances
20 | * Trolling, insulting/derogatory comments, and personal or political attacks
21 | * Public or private harassment
22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission
23 | * Other conduct which could reasonably be considered inappropriate in a professional setting
24 |
25 | ## Our Responsibilities
26 |
27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
28 |
29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
30 |
31 | ## Scope
32 |
33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
34 |
35 | ## Enforcement
36 |
37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at will@pedalwrencher.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
38 |
39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
40 |
41 | ## Attribution
42 |
43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
44 |
45 | [homepage]: http://contributor-covenant.org
46 | [version]: http://contributor-covenant.org/version/1/4/
47 |
--------------------------------------------------------------------------------
/elote/logging.py:
--------------------------------------------------------------------------------
1 | """Centralized logging configuration for the elote library."""
2 |
3 | import logging
4 | import sys
5 | from typing import Union, Optional, TextIO
6 |
7 | # The main logger for the elote library
8 | # Users can configure this logger using standard logging methods
9 | # or the helper functions below.
10 | logger = logging.getLogger("elote")
11 |
12 | # Add a NullHandler by default to prevent logs from propagating
13 | # unless the user configures logging.
14 | logger.addHandler(logging.NullHandler())
15 |
16 | # Set a reasonable default level to avoid excessive debug logging
17 | logger.setLevel(logging.WARNING)
18 |
19 | # Default log format
20 | DEFAULT_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
21 |
22 |
23 | def set_level(level: Union[int, str]) -> None:
24 | """Set the logging level for the elote logger.
25 |
26 | Args:
27 | level: The logging level (e.g., logging.DEBUG, logging.INFO, 'DEBUG', 'INFO').
28 | """
29 | if isinstance(level, str):
30 | level = getattr(logging, level.upper())
31 | logger.setLevel(level)
32 |
33 |
34 | def add_handler(handler: logging.Handler) -> None:
35 | """Add a handler to the elote logger.
36 |
37 | Args:
38 | handler: A logging handler to add.
39 | """
40 | # Remove existing handlers of the same type to avoid duplicates
41 | for existing_handler in logger.handlers[:]:
42 | if isinstance(existing_handler, type(handler)):
43 | logger.removeHandler(existing_handler)
44 |
45 | logger.addHandler(handler)
46 |
47 |
48 | def basic_config(
49 | level: Union[int, str] = logging.WARNING,
50 | stream: Optional[TextIO] = None,
51 | format: str = DEFAULT_FORMAT,
52 | force: bool = False
53 | ) -> None:
54 | """Configure basic logging for elote.
55 |
56 | Sets the level and adds a StreamHandler (defaults to stderr)
57 | with the specified format.
58 |
59 | Args:
60 | level: The minimum logging level to output.
61 | stream: The stream to log to (e.g., sys.stdout). Defaults to sys.stderr.
62 | format: The log message format string.
63 | force: If True, remove existing handlers before adding new one.
64 | """
65 | if force:
66 | # Remove all existing handlers
67 | for handler in logger.handlers[:]:
68 | logger.removeHandler(handler)
69 |
70 | set_level(level)
71 | handler = logging.StreamHandler(stream or sys.stderr)
72 | formatter = logging.Formatter(format)
73 | handler.setFormatter(formatter)
74 | add_handler(handler)
75 |
76 |
77 | def get_logger(name: Optional[str] = None) -> logging.Logger:
78 | """Get a logger instance.
79 |
80 | Args:
81 | name: Optional name for the logger. If None, returns the main elote logger.
82 |
83 | Returns:
84 | A logger instance.
85 | """
86 | if name is None:
87 | return logger
88 | return logging.getLogger(f"elote.{name}")
89 |
90 |
91 | def disable_debug_logging() -> None:
92 | """Disable debug logging for performance in production environments."""
93 | if logger.level <= logging.DEBUG:
94 | logger.setLevel(logging.INFO)
95 |
96 |
97 | def is_debug_enabled() -> bool:
98 | """Check if debug logging is enabled.
99 |
100 | Returns:
101 | True if debug logging is enabled, False otherwise.
102 | """
103 | return logger.isEnabledFor(logging.DEBUG)
104 |
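# Example usage (a minimal sketch; ``elote.logging`` is imported directly here):
#
#     import elote.logging as elote_logging
#
#     elote_logging.basic_config(level="DEBUG")      # log to stderr at DEBUG level
#     log = elote_logging.get_logger("my_module")    # child logger "elote.my_module"
#     log.debug("rating update complete")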
--------------------------------------------------------------------------------
/tests/test_ColleyMatrixCompetitor.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import numpy as np
3 | from elote import ColleyMatrixCompetitor, EloCompetitor
4 | from elote.competitors.base import MissMatchedCompetitorTypesException
5 |
6 |
7 | class TestColleyMatrix(unittest.TestCase):
8 | def test_improvement(self):
9 | """Test that beating stronger opponents improves rating."""
10 | initial_rating = 0.5
11 | player1 = ColleyMatrixCompetitor(initial_rating=initial_rating)
12 |
13 | # If player1 beats someone with a higher rating, their rating should go up
14 | for _ in range(5):
15 | player2 = ColleyMatrixCompetitor(initial_rating=0.8)
16 | player1.beat(player2)
17 | self.assertGreater(player1.rating, initial_rating)
18 | initial_rating = player1.rating
19 |
20 | def test_decay(self):
21 | """Test that losing to weaker opponents decreases rating."""
22 | initial_rating = 0.8
23 | player1 = ColleyMatrixCompetitor(initial_rating=initial_rating)
24 |
25 | # If player1 loses to someone with a lower rating, their rating should go down
26 | for _ in range(5):
27 | player2 = ColleyMatrixCompetitor(initial_rating=0.2)
28 | player2.beat(player1)
29 | self.assertLess(player1.rating, initial_rating)
30 | initial_rating = player1.rating
31 |
32 | def test_expectation(self):
33 | """Test that expected scores are calculated correctly."""
34 | player1 = ColleyMatrixCompetitor(initial_rating=0.8)
35 | player2 = ColleyMatrixCompetitor(initial_rating=0.2)
36 |
37 | # Higher rated player should have higher expected score
38 | self.assertGreater(player1.expected_score(player2), player2.expected_score(player1))
39 |
40 | def test_network_recalculation(self):
41 | """Test that ratings are recalculated across the network of connected competitors."""
42 | # Create a network of 5 competitors
43 | competitors = [ColleyMatrixCompetitor(initial_rating=0.5) for _ in range(5)]
44 |
45 | # Create some matches to establish a network
46 | # 0 beats 1, 1 beats 2, 2 beats 3, 3 beats 4, 4 beats 0 (circular)
47 | competitors[0].beat(competitors[1])
48 | competitors[1].beat(competitors[2])
49 | competitors[2].beat(competitors[3])
50 | competitors[3].beat(competitors[4])
51 | competitors[4].beat(competitors[0])
52 |
53 | # All ratings should be different after this circular pattern
54 | ratings = [c.rating for c in competitors]
55 | self.assertEqual(
56 | len(set(ratings)), len(ratings), "Each competitor should have a unique rating after circular matches"
57 | )
58 |
59 | # Ratings should sum to n/2 = 2.5 (property of Colley Matrix Method)
60 | self.assertAlmostEqual(sum(ratings), len(competitors) / 2)
61 |
62 | # Additional test: if a new player beats the highest rated player, they should improve
63 | new_player = ColleyMatrixCompetitor(initial_rating=0.5)
64 | highest_player = competitors[np.argmax([c.rating for c in competitors])]
65 | initial_rating = new_player.rating
66 | new_player.beat(highest_player)
67 | self.assertGreater(new_player.rating, initial_rating)
68 |
69 | def test_exceptions(self):
70 | """Test that appropriate exceptions are raised."""
71 | player1 = ColleyMatrixCompetitor(initial_rating=0.5)
72 | player2 = EloCompetitor(initial_rating=1000)
73 |
74 | with self.assertRaises(MissMatchedCompetitorTypesException):
75 | player1.verify_competitor_types(player2)
76 |
77 | with self.assertRaises(MissMatchedCompetitorTypesException):
78 | player1.expected_score(player2)
79 |
80 |
81 | if __name__ == "__main__":
82 | unittest.main()
83 |
--------------------------------------------------------------------------------
/examples/trueskill_example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Example demonstrating the use of the TrueSkill rating system.
4 |
5 | This example shows how to create TrueSkill competitors, calculate win probabilities,
6 | update ratings after matches, and work with teams.
7 | """
8 |
9 | from elote import TrueSkillCompetitor
10 |
11 |
12 | def main():
13 | """Run the TrueSkill example."""
14 | # Create players with different initial skill levels
15 | player1 = TrueSkillCompetitor(initial_mu=25.0, initial_sigma=8.333)
16 | player2 = TrueSkillCompetitor(initial_mu=30.0, initial_sigma=7.0)
17 | player3 = TrueSkillCompetitor(initial_mu=20.0, initial_sigma=6.0)
18 | player4 = TrueSkillCompetitor(initial_mu=35.0, initial_sigma=5.0)
19 |
20 | # Print initial ratings
21 | print("Initial ratings:")
22 | print(f"Player 1: mu={player1.mu:.2f}, sigma={player1.sigma:.2f}, rating={player1.rating:.2f}")
23 | print(f"Player 2: mu={player2.mu:.2f}, sigma={player2.sigma:.2f}, rating={player2.rating:.2f}")
24 | print(f"Player 3: mu={player3.mu:.2f}, sigma={player3.sigma:.2f}, rating={player3.rating:.2f}")
25 | print(f"Player 4: mu={player4.mu:.2f}, sigma={player4.sigma:.2f}, rating={player4.rating:.2f}")
26 | print()
27 |
28 | # Calculate win probabilities
29 | print("Win probabilities:")
30 | print(f"Player 1 vs Player 2: {player1.expected_score(player2):.4f}")
31 | print(f"Player 1 vs Player 3: {player1.expected_score(player3):.4f}")
32 | print(f"Player 2 vs Player 4: {player2.expected_score(player4):.4f}")
33 | print()
34 |
35 | # Calculate match quality
36 | print("Match quality:")
37 | print(f"Player 1 vs Player 2: {TrueSkillCompetitor.match_quality(player1, player2):.4f}")
38 | print(f"Player 1 vs Player 3: {TrueSkillCompetitor.match_quality(player1, player3):.4f}")
39 | print(f"Player 2 vs Player 4: {TrueSkillCompetitor.match_quality(player2, player4):.4f}")
40 | print()
41 |
42 | # Simulate some matches
43 | print("Simulating matches...")
44 | print("Match 1: Player 1 beats Player 2 (upset!)")
45 | player1.beat(player2)
46 |
47 | print("Match 2: Player 3 beats Player 1 (another upset!)")
48 | player3.beat(player1)
49 |
50 | print("Match 3: Player 2 and Player 4 tie")
51 | player2.tied(player4)
52 | print()
53 |
54 | # Print updated ratings
55 | print("Updated ratings after matches:")
56 | print(f"Player 1: mu={player1.mu:.2f}, sigma={player1.sigma:.2f}, rating={player1.rating:.2f}")
57 | print(f"Player 2: mu={player2.mu:.2f}, sigma={player2.sigma:.2f}, rating={player2.rating:.2f}")
58 | print(f"Player 3: mu={player3.mu:.2f}, sigma={player3.sigma:.2f}, rating={player3.rating:.2f}")
59 | print(f"Player 4: mu={player4.mu:.2f}, sigma={player4.sigma:.2f}, rating={player4.rating:.2f}")
60 | print()
61 |
62 | # Calculate new win probabilities
63 | print("New win probabilities:")
64 | print(f"Player 1 vs Player 2: {player1.expected_score(player2):.4f}")
65 | print(f"Player 1 vs Player 3: {player1.expected_score(player3):.4f}")
66 | print(f"Player 2 vs Player 4: {player2.expected_score(player4):.4f}")
67 | print()
68 |
69 | # Demonstrate team creation
70 | print("Team creation:")
71 | team1_mu, team1_sigma = TrueSkillCompetitor.create_team([player1, player3])
72 | team2_mu, team2_sigma = TrueSkillCompetitor.create_team([player2, player4])
73 | print(f"Team 1 (Players 1 & 3): mu={team1_mu:.2f}, sigma={team1_sigma:.2f}")
74 | print(f"Team 2 (Players 2 & 4): mu={team2_mu:.2f}, sigma={team2_sigma:.2f}")
75 | print()
76 |
77 | # Demonstrate serialization and deserialization
78 | print("Demonstrating serialization and deserialization...")
79 | state = player1.export_state()
80 | player1_copy = TrueSkillCompetitor.from_state(state)
81 |
82 | print(f"Original player: {player1}")
83 | print(f"Deserialized player: {player1_copy}")
84 | print(f"Are they equal? {player1.mu == player1_copy.mu and player1.sigma == player1_copy.sigma}")
85 |
86 |
87 | if __name__ == "__main__":
88 | main()
89 |
--------------------------------------------------------------------------------
/docs/source/installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ============
3 |
4 | This guide covers different ways to install Elote for both users and developers.
5 |
6 | Requirements
7 | -----------
8 |
9 | Elote requires:
10 |
11 | - Python 3.10 or higher
12 | - NumPy (automatically installed as a dependency)
13 |
14 | Basic Installation
15 | ----------------
16 |
17 | For most users, the simplest way to install Elote is via pip:
18 |
19 | .. code-block:: bash
20 |
21 | pip install elote
22 |
23 | This will install the latest stable release from PyPI along with all required dependencies.
24 |
25 | If you prefer using Conda, you can install Elote via pip within your Conda environment:
26 |
27 | .. code-block:: bash
28 |
29 |     conda create -n elote-env python=3.10
30 | conda activate elote-env
31 | pip install elote
32 |
33 | Development Installation
34 | ----------------------
35 |
36 | If you want to contribute to Elote or need the latest development version, you can install directly from the GitHub repository:
37 |
38 | .. code-block:: bash
39 |
40 | # Using Make (recommended)
41 |     git clone https://github.com/wdm0006/elote.git
42 | cd elote
43 | make install-dev
44 |
45 | # Or using pip
46 |     git clone https://github.com/wdm0006/elote.git
47 | cd elote
48 | pip install -e ".[dev]"
49 |
50 | # Or using uv
51 |     git clone https://github.com/wdm0006/elote.git
52 | cd elote
53 | uv pip install -e ".[dev]"
54 |
55 | The development installation includes additional dependencies needed for testing, linting, and documentation.
56 |
57 | Verifying Installation
58 | --------------------
59 |
60 | To verify that Elote is installed correctly, you can run a simple test in Python:
61 |
62 | .. code-block:: python
63 |
64 | from elote import EloCompetitor
65 |
66 | # Create two competitors
67 | player1 = EloCompetitor(initial_rating=1500)
68 | player2 = EloCompetitor(initial_rating=1600)
69 |
70 | # Calculate expected score
71 | expected = player2.expected_score(player1)
72 | print(f"Installation successful! Expected score: {expected:.2%}")
73 |
74 | If this runs without errors, Elote is installed correctly.
75 |
76 | Installing Optional Dependencies
77 | ------------------------------
78 |
79 | Elote defines optional dependency groups in ``pyproject.toml`` that can be installed based on your needs:
80 |
81 | .. code-block:: bash
82 |
83 |     # Install with dataset dependencies (sports and chess data loaders)
84 |     pip install "elote[datasets]"
85 |
86 |     # Install development dependencies (testing, linting, docs)
87 |     pip install "elote[dev]"
88 |
89 |     # Or install both groups at once
90 |     pip install "elote[datasets,dev]"
91 |
92 | Troubleshooting
93 | --------------
94 |
95 | Common installation issues and their solutions:
96 |
97 | NumPy Installation Errors
98 | ^^^^^^^^^^^^^^^^^^^^^^^^
99 |
100 | If you encounter errors related to NumPy installation:
101 |
102 | .. code-block:: bash
103 |
104 | # Install NumPy separately first
105 | pip install numpy
106 | pip install elote
107 |
108 | Version Conflicts
109 | ^^^^^^^^^^^^^^^
110 |
111 | If you have version conflicts with other packages:
112 |
113 | .. code-block:: bash
114 |
115 | # Create a virtual environment
116 | python -m venv elote-env
117 | source elote-env/bin/activate # On Windows: elote-env\Scripts\activate
118 | pip install elote
119 |
120 | Permission Errors
121 | ^^^^^^^^^^^^^^^
122 |
123 | If you encounter permission errors during installation:
124 |
125 | .. code-block:: bash
126 |
127 | # Install for the current user only
128 | pip install --user elote
129 |
130 | # Or use a virtual environment (recommended)
131 | python -m venv elote-env
132 | source elote-env/bin/activate
133 | pip install elote
134 |
135 | Getting Help
136 | -----------
137 |
138 | If you continue to experience installation issues:
139 |
140 | 1. Check the `GitHub Issues <https://github.com/wdm0006/elote/issues>`_ to see if others have encountered the same problem
141 | 2. Open a new issue with details about your environment and the error messages
142 | 3. Reach out to the community for help
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=42", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "elote"
7 | version = "1.1.0"
8 | description = "Python module for rating bouts (like with Elo Rating)"
9 | readme = "README.md"
10 | authors = [
11 | {name = "Will McGinnis", email = "will@helton.io"},
12 | ]
13 | license = {text = "MIT"}
14 | classifiers = [
15 | "Development Status :: 3 - Alpha",
16 | "Intended Audience :: Developers",
17 | "Programming Language :: Python :: 3",
18 | "Programming Language :: Python :: 3.10",
19 | "Programming Language :: Python :: 3.11",
20 | "Programming Language :: Python :: 3.12",
21 | ]
22 | keywords = ["elo", "scoring", "rating"]
23 | dependencies = [
24 | "tqdm==4.66.3",
25 | "numpy>=1.20.0",
26 | "scipy>=1.7.0",
27 | "pandas>=1.3.0",
28 | "requests>=2.25.0",
29 | "setuptools>=42.0.0",
30 | "matplotlib>=3.5.0"
31 | ]
32 | requires-python = ">=3.10"
33 |
34 | [project.urls]
35 | Homepage = "https://github.com/wdm0006/elote"
36 | "Bug Tracker" = "https://github.com/wdm0006/elote/issues"
37 |
38 | [project.optional-dependencies]
39 | dev = [
40 | "pytest",
41 | "pytest-cov",
42 | "sphinx>=6.1.3",
43 | "docutils>=0.19",
44 | "sphinx_rtd_theme",
45 | "ruff",
46 | "wheel",
47 | "build",
48 | "tox",
49 | "tox-uv",
50 | "pytest-benchmark",
51 | "mypy>=1.8.0",
52 | "types-tqdm",
53 | "types-requests",
54 | "pandas-stubs",
55 | "scipy-stubs",
56 | ]
57 | datasets = [
58 | "sportsdataverse[all]",
59 | "pyzstd>=0.15.0",
60 | "python-chess>=1.9.0",
61 | "setuptools>=42.0.0",
62 | ]
63 |
64 | [tool.setuptools]
65 | packages = ["elote", "elote.competitors", "elote.arenas", "elote.datasets"]
66 | package-data = {"elote" = ["py.typed"]}
67 |
68 | [tool.setuptools.exclude-package-data]
69 | "*" = ["*.pyc", "*.pyo", "*.pyd", "*.so", "*.dylib", "*~"]
70 |
71 | [tool.pytest]
72 | testpaths = ["tests"]
73 | python_files = "test_*.py"
74 |
75 | [tool.ruff]
76 | # Same as Black.
77 | line-length = 120
78 | indent-width = 4
79 |
80 | # Assume Python 3.10
81 | target-version = "py310"
82 |
83 | [tool.ruff.lint]
84 | # Enable Pyflakes (`F`), pycodestyle (`E`), and flake8-bugbear (`B`) rules
85 | select = ["E", "F", "B"]
86 | ignore = ["E501"] # Ignore line length errors since we'll fix them gradually
87 |
88 | # Allow fix for all enabled rules (when `--fix`) is provided.
89 | fixable = ["ALL"]
90 | unfixable = []
91 |
92 | # Allow unused variables when underscore-prefixed.
93 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
94 |
95 | [tool.ruff.format]
96 | # Use double quotes for strings.
97 | quote-style = "double"
98 |
99 | # Indent with spaces, rather than tabs.
100 | indent-style = "space"
101 |
102 | # Respect magic trailing commas.
103 | skip-magic-trailing-comma = false
104 |
105 | # Automatically detect the appropriate line ending.
106 | line-ending = "auto"
107 |
108 | [tool.mypy]
109 | python_version = "3.10"
110 | warn_return_any = true
111 | disallow_untyped_defs = true
112 | check_untyped_defs = true
113 | disallow_incomplete_defs = true
114 | disallow_untyped_decorators = false
115 | no_implicit_optional = false
116 | warn_redundant_casts = false
117 | warn_unused_ignores = false
118 | warn_no_return = true
119 | warn_unreachable = false
120 | strict_optional = false
121 | show_error_codes = true
122 | show_column_numbers = true
123 | pretty = true
124 | ignore_missing_imports = true
125 | disallow_any_unimported = true
126 | disallow_untyped_calls = true
127 | disable_error_code = ["attr-defined", "assignment", "index", "call-arg", "arg-type", "valid-type", "misc", "override", "union-attr", "safe-super", "dict-item", "call-overload", "no-any-unimported"]
128 | files = ["elote"]
129 | exclude = ["tests/.*", "examples/.*", "scripts/.*", "docs/.*"]
130 |
131 | # Per-module options:
132 | [[tool.mypy.overrides]]
133 | module = [
134 | "numpy.*",
135 | "matplotlib.*",
136 | "tqdm.*",
137 | "sportsdataverse.*",
138 | "scipy.*",
139 | ]
140 | ignore_missing_imports = true
141 |
142 | [[tool.mypy.overrides]]
143 | module = ["pandas"]
144 | ignore_missing_imports = false
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: help setup install install-dev install-datasets test test-cov lint format clean build docs lint-fix test-all benchmark run-example typecheck
2 |
3 | # Default target
4 | help:
5 | @echo "Available commands:"
6 | @echo " make setup - Install uv and other required tools"
7 | @echo " make install - Install the package"
8 | @echo " make install-dev - Install the package with development dependencies"
9 | @echo " make install-datasets - Install the package with dataset dependencies"
10 | @echo " make test - Run tests"
11 | @echo " make test-cov - Run tests with coverage"
12 | @echo " make test-all - Run tests on all supported Python versions using tox"
13 | @echo " make benchmark - Run performance benchmarks"
14 | @echo " make lint - Run linting checks"
15 | @echo " make lint-fix - Run linting checks and fix auto-fixable issues"
16 | @echo " make typecheck - Run mypy type checking"
17 | @echo " make typecheck [FILE=path] - Run mypy type checking (optionally on a specific file)"
18 | @echo " make format - Format code with ruff"
19 | @echo " make clean - Clean build artifacts"
20 | @echo " make build - Build package distributions"
21 | @echo " make docs - Build documentation"
22 | @echo " make run-example EXAMPLE=filename - Run an example (e.g., make run-example EXAMPLE=trueskill_example.py)"
23 |
24 | # Setup development environment
25 | setup:
26 | pip install uv
27 | uv venv --python=3.11
28 | brew install libomp
29 |
30 | # Install the package
31 | install:
32 | uv pip install -e .
33 |
34 | # Install the package with development dependencies
35 | install-dev:
36 | uv pip install -e ".[dev]"
37 |
38 | # Install the package with dataset dependencies
39 | install-datasets:
40 | uv pip install -e ".[datasets]"
41 |
42 | # Run tests
43 | test:
44 | uv run pytest $(PYTEST_ARGS)
45 |
46 | # Run tests with coverage
47 | test-cov:
48 | uv run pytest --cov=elote --cov-report=term --cov-report=html $(PYTEST_ARGS)
49 |
50 | # Run linting
51 | lint:
52 | uv run ruff check .
53 |
54 | # Run linting and fix auto-fixable issues
55 | lint-fix:
56 | uv run ruff check --fix --unsafe-fixes .
57 |
58 | # Run mypy type checking
59 | typecheck:
60 | @if [ -z "$(FILE)" ]; then \
61 | echo "Running mypy on the entire elote package..."; \
62 | uv run mypy elote; \
63 | echo mypy elote; \
64 | else \
65 | echo "Running mypy on $(FILE)..."; \
66 | uv run mypy $(FILE); \
67 | echo mypy $(FILE); \
68 | fi
69 |
70 | # Format code
71 | format:
72 | uv run ruff format .
73 |
74 | # Clean build artifacts
75 | clean:
76 | rm -rf build/
77 | rm -rf dist/
78 | rm -rf *.egg-info/
79 | rm -rf .coverage
80 | rm -rf htmlcov/
81 | rm -rf .pytest_cache/
82 | rm -rf .ruff_cache/
83 | find . -type d -name __pycache__ -exec rm -rf {} +
84 | find . -type f -name "*.pyc" -delete
85 |
86 | # Build package distributions
87 | build: clean
88 | uv run python -m build
89 |
90 | # Build documentation
91 | docs:
92 | cd docs && uv run $(MAKE) html SPHINXBUILD="python -m sphinx"
93 | @echo "Opening documentation in Google Chrome..."
94 | @if [ "$(shell uname)" = "Darwin" ]; then \
95 | open -a "Google Chrome" docs/build/html/index.html; \
96 | else \
97 | if command -v google-chrome > /dev/null; then \
98 | google-chrome docs/build/html/index.html; \
99 | elif command -v google-chrome-stable > /dev/null; then \
100 | google-chrome-stable docs/build/html/index.html; \
101 | elif command -v chromium > /dev/null; then \
102 | chromium docs/build/html/index.html; \
103 | else \
104 | echo "Could not find Google Chrome. Please open docs/build/html/index.html manually."; \
105 | fi; \
106 | fi
107 |
108 | # Run tests on all supported Python versions
109 | test-all:
110 | uv run tox
111 |
112 | # Run benchmarks
113 | benchmark:
114 | uv run pytest tests/test_benchmarks.py -v --benchmark-enable $(PYTEST_ARGS)
115 |
116 | # Run an example
117 | run-example:
118 | @if [ -z "$(EXAMPLE)" ]; then \
119 | echo "Please specify an example file with EXAMPLE=filename.py"; \
120 | echo "Available examples:"; \
121 | ls examples/*.py | xargs -n1 basename; \
122 | else \
123 | uv run python examples/$(EXAMPLE); \
124 | fi
125 |
--------------------------------------------------------------------------------
/tests/test_BlendedCompetitor.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from elote import BlendedCompetitor, GlickoCompetitor
3 | from elote.competitors.base import MissMatchedCompetitorTypesException
4 |
5 |
6 | class TestBlendedCompetitor(unittest.TestCase):
7 | def test_Improvement(self):
8 | player1 = BlendedCompetitor(
9 | competitors=[
10 | {"type": "EloCompetitor", "competitor_kwargs": {}},
11 | {"type": "GlickoCompetitor", "competitor_kwargs": {}},
12 | {"type": "DWZCompetitor", "competitor_kwargs": {}},
13 | {"type": "ECFCompetitor", "competitor_kwargs": {}},
14 | ]
15 | )
16 | initial_rating = player1.rating
17 | # if player1 beats someone with a high rating, their rating should go up.
18 | for _ in range(10):
19 | player2 = BlendedCompetitor(
20 | competitors=[
21 | {
22 | "type": "EloCompetitor",
23 | "competitor_kwargs": {"initial_rating": 1000},
24 | },
25 | {"type": "GlickoCompetitor", "competitor_kwargs": {}},
26 | {"type": "DWZCompetitor", "competitor_kwargs": {}},
27 | {"type": "ECFCompetitor", "competitor_kwargs": {}},
28 | ]
29 | )
30 | player1.beat(player2)
31 | self.assertGreater(player1.rating, initial_rating)
32 | initial_rating = player1.rating
33 |
34 | def test_Decay(self):
35 | player1 = BlendedCompetitor(
36 | competitors=[
37 | {"type": "EloCompetitor", "competitor_kwargs": {}},
38 | {"type": "GlickoCompetitor", "competitor_kwargs": {}},
39 | {"type": "DWZCompetitor", "competitor_kwargs": {}},
40 | {"type": "ECFCompetitor", "competitor_kwargs": {}},
41 | ]
42 | )
43 | initial_rating = player1.rating
44 |         # if player1 loses to someone repeatedly, their rating should go down.
45 | for _ in range(10):
46 | player2 = BlendedCompetitor(
47 | competitors=[
48 | {
49 | "type": "EloCompetitor",
50 | "competitor_kwargs": {"initial_rating": 1000},
51 | },
52 | {"type": "GlickoCompetitor", "competitor_kwargs": {}},
53 | {"type": "DWZCompetitor", "competitor_kwargs": {}},
54 | {"type": "ECFCompetitor", "competitor_kwargs": {}},
55 | ]
56 | )
57 | player2.beat(player1)
58 | self.assertLess(player1.rating, initial_rating)
59 | initial_rating = player1.rating
60 |
61 | def test_Expectation(self):
62 | player1 = BlendedCompetitor(
63 | competitors=[
64 | {
65 | "type": "EloCompetitor",
66 | "competitor_kwargs": {"initial_rating": 1000},
67 | },
68 | {"type": "GlickoCompetitor", "competitor_kwargs": {}},
69 | {"type": "DWZCompetitor", "competitor_kwargs": {}},
70 | {"type": "ECFCompetitor", "competitor_kwargs": {}},
71 | ]
72 | )
73 | player2 = BlendedCompetitor(
74 | competitors=[
75 | {"type": "EloCompetitor", "competitor_kwargs": {"initial_rating": 100}},
76 | {"type": "GlickoCompetitor", "competitor_kwargs": {}},
77 | {"type": "DWZCompetitor", "competitor_kwargs": {}},
78 | {"type": "ECFCompetitor", "competitor_kwargs": {}},
79 | ]
80 | )
81 | self.assertGreater(player1.expected_score(player2), player2.expected_score(player1))
82 |
83 | def test_Exceptions(self):
84 | player1 = BlendedCompetitor(
85 | competitors=[
86 | {
87 | "type": "EloCompetitor",
88 | "competitor_kwargs": {"initial_rating": 1000},
89 | },
90 | {"type": "GlickoCompetitor", "competitor_kwargs": {}},
91 | {"type": "DWZCompetitor", "competitor_kwargs": {}},
92 | {"type": "ECFCompetitor", "competitor_kwargs": {}},
93 | ]
94 | )
95 | player2 = GlickoCompetitor(initial_rating=100)
96 |
97 | with self.assertRaises(MissMatchedCompetitorTypesException):
98 | player1.verify_competitor_types(player2)
99 |
--------------------------------------------------------------------------------
/examples/colley_matrix_example.py:
--------------------------------------------------------------------------------
1 | """
2 | Colley Matrix Method example using Elote.
3 |
4 | The Colley Matrix Method is a least-squares rating system that solves a system of linear
5 | equations to obtain rankings. It's widely used in sports rankings, particularly college
6 | football.
7 |
8 | This example demonstrates:
9 | 1. Creating ColleyMatrixCompetitor instances
10 | 2. Recording match results
11 | 3. Examining how ratings change with match outcomes
12 | 4. Visualizing the rating changes over time
13 | """
14 |
15 | import os
16 | import matplotlib.pyplot as plt
17 | from elote import ColleyMatrixCompetitor
18 |
19 |
20 | def main():
21 | # Create competitors with default initial rating of 0.5
22 | team_a = ColleyMatrixCompetitor()
23 | team_b = ColleyMatrixCompetitor()
24 | team_c = ColleyMatrixCompetitor()
25 | team_d = ColleyMatrixCompetitor()
26 |
27 | # Initial ratings and expectations
28 | print("Initial ratings:")
29 | print(f"Team A: {team_a.rating:.3f}")
30 | print(f"Team B: {team_b.rating:.3f}")
31 | print(f"Team C: {team_c.rating:.3f}")
32 | print(f"Team D: {team_d.rating:.3f}")
33 |
34 | print("\nInitial win probabilities:")
35 | print(f"Team A vs Team B: {team_a.expected_score(team_b):.2%}")
36 | print(f"Team A vs Team C: {team_a.expected_score(team_c):.2%}")
37 |
38 | # Record match results in a tournament
39 | print("\nSimulating a small tournament...")
40 |
41 | # Track rating history
42 | a_ratings = [team_a.rating]
43 | b_ratings = [team_b.rating]
44 | c_ratings = [team_c.rating]
45 | d_ratings = [team_d.rating]
46 |
47 | # Round 1
48 | team_a.beat(team_b) # A beats B
49 | team_c.beat(team_d) # C beats D
50 |
51 | a_ratings.append(team_a.rating)
52 | b_ratings.append(team_b.rating)
53 | c_ratings.append(team_c.rating)
54 | d_ratings.append(team_d.rating)
55 |
56 | # Round 2 - simplified to avoid network issues
57 | team_b.beat(team_d) # B beats D
58 |
59 | a_ratings.append(team_a.rating)
60 | b_ratings.append(team_b.rating)
61 | c_ratings.append(team_c.rating)
62 | d_ratings.append(team_d.rating)
63 |
64 | # Round 3 - simplified to avoid network issues
65 | team_c.beat(team_b) # C beats B
66 |
67 | a_ratings.append(team_a.rating)
68 | b_ratings.append(team_b.rating)
69 | c_ratings.append(team_c.rating)
70 | d_ratings.append(team_d.rating)
71 |
72 | # Final ratings
73 | print("\nFinal ratings:")
74 | print(f"Team A: {team_a.rating:.3f} (won 1, lost 0)")
75 | print(f"Team B: {team_b.rating:.3f} (won 1, lost 2)")
76 | print(f"Team C: {team_c.rating:.3f} (won 2, lost 0)")
77 | print(f"Team D: {team_d.rating:.3f} (won 0, lost 2)")
78 |
79 | # Final win probabilities
80 | print("\nFinal win probabilities:")
81 | print(f"Team A vs Team B: {team_a.expected_score(team_b):.2%}")
82 | print(f"Team A vs Team C: {team_a.expected_score(team_c):.2%}")
83 | print(f"Team B vs Team C: {team_b.expected_score(team_c):.2%}")
84 | print(f"Team B vs Team D: {team_b.expected_score(team_d):.2%}")
85 |
86 | # Verify a key property of Colley Matrix ratings: sum of ratings equals n/2
87 | total_rating = team_a.rating + team_b.rating + team_c.rating + team_d.rating
88 | print(f"\nSum of all ratings: {total_rating:.3f}")
89 | print(f"Expected sum (n/2): {4 / 2}")
90 |
91 | # Demonstrate a tie
92 | print("\nSimulating a tie between Team B and Team D...")
93 | team_b.tied(team_d)
94 | print(f"Team B rating after tie: {team_b.rating:.3f}")
95 | print(f"Team D rating after tie: {team_d.rating:.3f}")
96 |
97 | # Plot rating changes over time
98 | plt.figure(figsize=(10, 6))
99 | rounds = range(4) # Initial + 3 rounds
100 |
101 | plt.plot(rounds, a_ratings, "o-", label="Team A")
102 | plt.plot(rounds, b_ratings, "s-", label="Team B")
103 | plt.plot(rounds, c_ratings, "^-", label="Team C")
104 | plt.plot(rounds, d_ratings, "x-", label="Team D")
105 |
106 | plt.xlabel("Round")
107 | plt.ylabel("Rating")
108 | plt.title("Colley Matrix Ratings Over Tournament Rounds")
109 | plt.legend()
110 | plt.grid(True)
111 | plt.ylim(0, 1)
112 | plt.xticks(rounds)
113 |
114 | # Save the plot
115 | plt.savefig(os.path.join("images", "colley_matrix_ratings.png"))
116 | print("\nRating history plot saved as 'colley_matrix_ratings.png'")
117 |
118 | # Show the plot if running interactively
119 | # plt.show()
120 |
121 |
122 | if __name__ == "__main__":
123 | main()
124 |
--------------------------------------------------------------------------------
/examples/glicko2_example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Example demonstrating the use of the Glicko-2 rating system.
4 |
5 | This example shows how to create Glicko-2 competitors, calculate win probabilities,
6 | update ratings after matches, and how ratings change over time due to inactivity.
7 | """
8 |
9 | from elote import Glicko2Competitor
10 | from datetime import datetime, timedelta
11 |
12 |
13 | def main():
14 | """Run the Glicko-2 example."""
15 | # Create initial time and competitors
16 | initial_time = datetime(2024, 1, 1)
17 | player1 = Glicko2Competitor(initial_rating=1500, initial_rd=350, initial_volatility=0.06, initial_time=initial_time)
18 | player2 = Glicko2Competitor(initial_rating=1700, initial_rd=300, initial_volatility=0.06, initial_time=initial_time)
19 | player3 = Glicko2Competitor(initial_rating=1800, initial_rd=200, initial_volatility=0.05, initial_time=initial_time)
20 |
21 | # Print initial ratings
22 | print("Initial ratings (January 1st, 2024):")
23 | print(f"Player 1: Rating={player1.rating}, RD={player1.rd}, Volatility={player1.volatility:.6f}")
24 | print(f"Player 2: Rating={player2.rating}, RD={player2.rd}, Volatility={player2.volatility:.6f}")
25 | print(f"Player 3: Rating={player3.rating}, RD={player3.rd}, Volatility={player3.volatility:.6f}")
26 | print()
27 |
28 | # Calculate initial win probabilities
29 | print("Initial win probabilities:")
30 | print(f"Player 1 vs Player 2: {player1.expected_score(player2):.4f}")
31 | print(f"Player 1 vs Player 3: {player1.expected_score(player3):.4f}")
32 | print(f"Player 2 vs Player 3: {player2.expected_score(player3):.4f}")
33 | print()
34 |
35 | # Simulate some matches with time gaps
36 | print("Simulating matches over time...")
37 |
38 | # First match after 5 days
39 | match1_time = initial_time + timedelta(days=5)
40 | print("\nMatch 1 (January 6th): Player 1 beats Player 2 (upset!)")
41 | print("RDs before match due to 5 days inactivity:")
42 | print(f"Player 1 RD: {player1.rd:.1f}")
43 | print(f"Player 2 RD: {player2.rd:.1f}")
44 | player1.beat(player2, match_time=match1_time)
45 | print("RDs after match:")
46 | print(f"Player 1 RD: {player1.rd:.1f}")
47 | print(f"Player 2 RD: {player2.rd:.1f}")
48 |
49 | # Second match after another 10 days
50 | match2_time = match1_time + timedelta(days=10)
51 | print("\nMatch 2 (January 16th): Player 3 beats Player 1")
52 | print("RDs before match due to 10 days inactivity:")
53 | print(f"Player 1 RD: {player1.rd:.1f}")
54 | print(f"Player 3 RD: {player3.rd:.1f}")
55 | player3.beat(player1, match_time=match2_time)
56 | print("RDs after match:")
57 | print(f"Player 1 RD: {player1.rd:.1f}")
58 | print(f"Player 3 RD: {player3.rd:.1f}")
59 |
60 | # Third match after another 15 days
61 | match3_time = match2_time + timedelta(days=15)
62 | print("\nMatch 3 (January 31st): Player 2 and Player 3 tie")
63 | print("RDs before match due to inactivity:")
64 | print(f"Player 2 RD: {player2.rd:.1f} (25 days inactive)")
65 | print(f"Player 3 RD: {player3.rd:.1f} (15 days inactive)")
66 | player2.tied(player3, match_time=match3_time)
67 | print("RDs after match:")
68 | print(f"Player 2 RD: {player2.rd:.1f}")
69 | print(f"Player 3 RD: {player3.rd:.1f}")
70 | print()
71 |
72 | # Print final ratings
73 | print("Final ratings (January 31st, 2024):")
74 | print(f"Player 1: Rating={player1.rating:.1f}, RD={player1.rd:.1f}, Volatility={player1.volatility:.6f}")
75 | print(f"Player 2: Rating={player2.rating:.1f}, RD={player2.rd:.1f}, Volatility={player2.volatility:.6f}")
76 | print(f"Player 3: Rating={player3.rating:.1f}, RD={player3.rd:.1f}, Volatility={player3.volatility:.6f}")
77 | print()
78 |
79 | # Calculate final win probabilities
80 | print("Final win probabilities:")
81 | print(f"Player 1 vs Player 2: {player1.expected_score(player2):.4f}")
82 | print(f"Player 1 vs Player 3: {player1.expected_score(player3):.4f}")
83 | print(f"Player 2 vs Player 3: {player2.expected_score(player3):.4f}")
84 | print()
85 |
86 | # Demonstrate serialization and deserialization
87 | print("Demonstrating serialization and deserialization...")
88 | state = player1.export_state()
89 | player1_copy = Glicko2Competitor.from_state(state)
90 |
91 | print(f"Original player: {player1}")
92 | print(f"Deserialized player: {player1_copy}")
93 | print(
94 | f"Are they equal? {player1.rating == player1_copy.rating and player1.rd == player1_copy.rd and player1.volatility == player1_copy.volatility}"
95 | )
96 |
97 |
98 | if __name__ == "__main__":
99 | main()
100 |
--------------------------------------------------------------------------------
/docs/source/rating_systems/elo.rst:
--------------------------------------------------------------------------------
1 | Elo Rating System
2 | ================
3 |
4 | Overview
5 | --------
6 |
7 | The Elo rating system is one of the most widely used rating systems in the world. Developed by Hungarian-American physics professor Arpad Elo, it was originally designed for chess but has since been adapted for many other competitive domains including video games, basketball, football, and baseball.
8 |
9 | The Elo system is named after its creator; it was first introduced as the official rating system of the United States Chess Federation in 1960 and was later adopted by the World Chess Federation (FIDE) in 1970.
10 |
11 | How It Works
12 | -----------
13 |
14 | The Elo rating system is based on the following principles:
15 |
16 | 1. Each player has a rating that represents their skill level
17 | 2. The difference between ratings determines the expected outcome of a match
18 | 3. After each match, ratings are adjusted based on the actual outcome compared to the expected outcome
19 |
20 | The core formula for calculating the expected score (probability of winning) is:
21 |
22 | .. math::
23 |
24 | E_A = \frac{1}{1 + 10^{(R_B - R_A) / 400}}
25 |
26 | Where:
27 | - :math:`E_A` is the expected score for player A
28 | - :math:`R_A` is the rating of player A
29 | - :math:`R_B` is the rating of player B
30 |
31 | After a match, the ratings are updated using:
32 |
33 | .. math::
34 |
35 | R'_A = R_A + K \times (S_A - E_A)
36 |
37 | Where:
38 | - :math:`R'_A` is the new rating for player A
39 | - :math:`K` is the K-factor (determines how quickly ratings change)
40 | - :math:`S_A` is the actual score (1 for win, 0.5 for draw, 0 for loss)
41 | - :math:`E_A` is the expected score
42 |
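To make these formulas concrete, here is a small standalone sketch in plain Python (independent of Elote's internals) for a
1500-rated player who upsets a 1600-rated opponent with :math:`K = 32`:

.. code-block:: python

    def expected_score(r_a, r_b):
        # Logistic expectation: probability that player A beats player B
        return 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0))

    def updated_rating(r_a, e_a, s_a, k=32):
        # New rating for A from the expected score, actual score, and K-factor
        return r_a + k * (s_a - e_a)

    e_a = expected_score(1500, 1600)
    print(f"Expected score for A: {e_a:.3f}")                               # about 0.36
    print(f"A's rating after a win: {updated_rating(1500, e_a, 1.0):.1f}")  # about 1520.5
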
43 | Advantages
44 | ---------
45 |
46 | - **Simplicity**: The Elo system is easy to understand and implement
47 | - **Transparency**: Players can easily see how their rating changes after each match
48 | - **Proven Track Record**: Used successfully for decades in various competitive domains
49 | - **Zero-Sum**: In a two-player game, the rating points one player gains are exactly what the other player loses
50 | - **Self-Correcting**: Ratings naturally adjust over time as more matches are played
51 |
52 | Limitations
53 | ----------
54 |
55 | - **Requires Many Matches**: Needs a significant number of matches to reach an accurate rating
56 | - **No Confidence Intervals**: Unlike Glicko, Elo doesn't account for rating reliability
57 | - **Assumes Stable Performance**: Doesn't account for player improvement or decline over time
58 | - **K-Factor Sensitivity**: Results are highly dependent on the chosen K-factor
59 | - **No Team Dynamics**: In team sports, doesn't account for individual contributions
60 |
61 | Implementation in Elote
62 | ----------------------
63 |
64 | Elote provides a straightforward implementation of the Elo rating system through the ``EloCompetitor`` class:
65 |
66 | .. code-block:: python
67 |
68 | from elote import EloCompetitor
69 |
70 | # Create two competitors with different initial ratings
71 | player1 = EloCompetitor(initial_rating=1500)
72 | player2 = EloCompetitor(initial_rating=1600)
73 |
74 | # Get win probability
75 | win_probability = player2.expected_score(player1)
76 | print(f"Player 2 win probability: {win_probability:.2%}")
77 |
78 | # Record a match result
79 | player1.beat(player2) # Player 1 won!
80 |
81 | # Ratings are automatically updated
82 | print(f"Player 1 new rating: {player1.rating}")
83 | print(f"Player 2 new rating: {player2.rating}")
84 |
85 | Customization
86 | ------------
87 |
88 | The ``EloCompetitor`` class allows for customization of the K-factor:
89 |
90 | .. code-block:: python
91 |
92 | # Create a competitor with a custom K-factor
93 | player = EloCompetitor(initial_rating=1500, k_factor=32)
94 |
95 | A higher K-factor makes ratings change more quickly, while a lower K-factor makes them more stable. Common K-factor values:
96 |
97 | - 40: For new players with fewer than 30 games (FIDE standard)
98 | - 20: For players with ratings under 2400 (FIDE standard)
99 | - 10: For elite players with ratings over 2400 (FIDE standard)
100 |
101 | Real-World Applications
102 | ---------------------
103 |
104 | The Elo rating system is used in many domains:
105 |
106 | - **Chess**: FIDE and national chess federations
107 | - **Video Games**: League of Legends, DOTA 2, and many other competitive games
108 | - **Sports**: Used for international football rankings
109 | - **Online Matchmaking**: Many platforms use Elo or Elo-derived systems to match players of similar skill
110 |
111 | References
112 | ---------
113 |
114 | 1. Elo, Arpad (1978). *The Rating of Chessplayers, Past and Present*. Arco. ISBN 0-668-04721-6.
115 | 2. Glickman, Mark E. (1995). "A Comprehensive Guide to Chess Ratings". American Chess Journal, 3, 59-102.
116 | 3. Silver, Nate (2015). "How We Calculate NBA Elo Ratings". FiveThirtyEight.
--------------------------------------------------------------------------------
/scripts/run_benchmarks.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Benchmark runner for elote.
4 |
5 | This script runs benchmarks and generates a report with performance metrics.
6 | It can be used to track performance changes over time.
7 |
8 | Usage:
9 | python scripts/run_benchmarks.py [--compare BASELINE]
10 |
11 | Options:
12 | --compare BASELINE Compare results with a baseline JSON file
13 | """
14 |
15 | import json
16 | import argparse
17 | import subprocess
18 | from datetime import datetime
19 | from pathlib import Path
20 |
21 |
22 | def run_benchmarks():
23 | """Run benchmarks and return the JSON output."""
24 | benchmark_dir = Path("benchmark_results")
25 | benchmark_dir.mkdir(exist_ok=True)
26 |
27 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
28 | json_output = benchmark_dir / f"benchmark_{timestamp}.json"
29 |
30 | cmd = [
31 | "uv",
32 | "run",
33 | "pytest",
34 | "tests/test_benchmarks.py",
35 | "--benchmark-json",
36 | str(json_output),
37 | "--benchmark-enable",
38 | "-v",
39 | ]
40 |
41 | print(f"Running benchmarks: {' '.join(cmd)}")
42 | subprocess.run(cmd, check=True)
43 |
44 | return json_output
45 |
46 |
47 | def load_json(file_path):
48 | """Load JSON data from a file."""
49 | with open(file_path, "r") as f:
50 | return json.load(f)
51 |
52 |
53 | def generate_report(current_file, baseline_file=None):
54 | """Generate a performance report."""
55 | current_data = load_json(current_file)
56 |
57 | # Print summary
58 | print("\n" + "=" * 80)
59 | print(f"BENCHMARK RESULTS: {current_file}")
60 | print("=" * 80)
61 |
62 | # Extract benchmark data
63 | benchmarks = current_data.get("benchmarks", [])
64 |
65 | # Group by test name
66 | grouped = {}
67 | for bench in benchmarks:
68 | name = bench["name"]
69 | group = name.split("[")[0] if "[" in name else name
70 | if group not in grouped:
71 | grouped[group] = []
72 | grouped[group].append(bench)
73 |
74 | # Print results by group
75 | for group, benches in sorted(grouped.items()):
76 | print(f"\n## {group}")
77 | print("-" * 80)
78 | print(f"{'Test':<50} {'Min (ms)':<12} {'Mean (ms)':<12} {'Max (ms)':<12} {'StdDev':<12}")
79 | print("-" * 80)
80 |
81 | for bench in sorted(benches, key=lambda x: x["name"]):
82 | name = bench["name"]
83 | if "[" in name:
84 | name = name.split("[")[1].rstrip("]")
85 | else:
86 | name = "default"
87 |
88 | min_time = bench["stats"]["min"] * 1000 # Convert to ms
89 | mean_time = bench["stats"]["mean"] * 1000
90 | max_time = bench["stats"]["max"] * 1000
91 | stddev = bench["stats"]["stddev"] * 1000
92 |
93 | print(f"{name:<50} {min_time:<12.3f} {mean_time:<12.3f} {max_time:<12.3f} {stddev:<12.3f}")
94 |
95 | # Compare with baseline if provided
96 | if baseline_file:
97 | print("\n" + "=" * 80)
98 | print(f"COMPARISON WITH BASELINE: {baseline_file}")
99 | print("=" * 80)
100 |
101 | baseline_data = load_json(baseline_file)
102 | baseline_benchmarks = {b["name"]: b for b in baseline_data.get("benchmarks", [])}
103 |
104 | print(f"{'Test':<50} {'Current (ms)':<12} {'Baseline (ms)':<12} {'Change %':<12}")
105 | print("-" * 80)
106 |
107 | for bench in benchmarks:
108 | name = bench["name"]
109 | if name in baseline_benchmarks:
110 | current_mean = bench["stats"]["mean"] * 1000
111 | baseline_mean = baseline_benchmarks[name]["stats"]["mean"] * 1000
112 | change_pct = ((current_mean - baseline_mean) / baseline_mean) * 100
113 |
114 | # Use color indicators for performance changes
115 | if change_pct > 5: # Worse performance
116 | change_str = f"\033[91m{change_pct:+.2f}%\033[0m" # Red
117 | elif change_pct < -5: # Better performance
118 | change_str = f"\033[92m{change_pct:+.2f}%\033[0m" # Green
119 | else: # Similar performance
120 | change_str = f"{change_pct:+.2f}%"
121 |
122 | print(f"{name:<50} {current_mean:<12.3f} {baseline_mean:<12.3f} {change_str:<12}")
123 | else:
124 | print(f"{name:<50} {bench['stats']['mean'] * 1000:<12.3f} {'N/A':<12} {'N/A':<12}")
125 |
126 | print("\n" + "=" * 80)
127 | return current_file
128 |
129 |
130 | def main():
131 | parser = argparse.ArgumentParser(description="Run benchmarks for elote")
132 | parser.add_argument("--compare", help="Compare with baseline JSON file")
133 | args = parser.parse_args()
134 |
135 | # Run benchmarks
136 | result_file = run_benchmarks()
137 |
138 | # Generate report
139 | baseline_file = args.compare
140 | generate_report(result_file, baseline_file)
141 |
142 | print(f"\nBenchmark results saved to: {result_file}")
143 | print("To compare with these results in the future, run:")
144 | print(f" python scripts/run_benchmarks.py --compare {result_file}")
145 |
146 |
147 | if __name__ == "__main__":
148 | main()
149 |
--------------------------------------------------------------------------------
/docs/source/rating_systems/ecf.rst:
--------------------------------------------------------------------------------
1 | ECF Rating System
2 | ===============
3 |
4 | Overview
5 | --------
6 |
7 | The ECF (English Chess Federation) rating system is the official rating system used for chess players in England. It was developed as an alternative to the Elo system and has been in use since the 1950s, though it has undergone several revisions over the years.
8 |
9 | Unlike Elo and Glicko, which use a logistic curve to calculate expected outcomes, the ECF system uses a linear relationship between rating differences and expected game outcomes. This makes it somewhat simpler to calculate by hand, which was an advantage in the pre-computer era.
10 |
11 | How It Works
12 | -----------
13 |
14 | The ECF rating system is based on the following principles:
15 |
16 | 1. Each player has a grade (rating) that represents their playing strength
17 | 2. The difference between grades determines the expected outcome of a match
18 | 3. After each match, grades are adjusted based on the actual outcome compared to the expected outcome
19 |
20 | In the ECF system, a difference of 40 grade points is expected to yield approximately a 67% win rate for the stronger player. This is different from Elo, where a 100-point difference corresponds to a 64% win expectancy.
21 |
22 | The expected outcome calculation is:
23 |
24 | .. math::
25 |
26 | E_A = 0.5 + \frac{R_A - R_B}{F}
27 |
28 | Where:
29 | - :math:`E_A` is the expected score for player A
30 | - :math:`R_A` is the grade of player A
31 | - :math:`R_B` is the grade of player B
32 | - :math:`F` is a conversion factor (typically 120)
33 |
34 | After a match, the grades are updated using:
35 |
36 | .. math::
37 |
38 | R'_A = R_A + K \times (S_A - E_A)
39 |
40 | Where:
41 | - :math:`R'_A` is the new grade for player A
42 | - :math:`K` is the K-factor (determines how quickly grades change)
43 | - :math:`S_A` is the actual score (1 for win, 0.5 for draw, 0 for loss)
44 | - :math:`E_A` is the expected score
45 |
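A quick worked example of these two formulas, applied by hand with the defaults mentioned later on this page (F = 120, K = 16). This is a standalone sketch of the equations above, not a call into Elote's ``ECFCompetitor``:

.. code-block:: python

    F = 120  # conversion factor
    K = 16   # K-factor

    r_a, r_b = 100, 120

    # Expected score for the lower-graded player A
    expected_a = 0.5 + (r_a - r_b) / F    # 0.5 - 20/120 ≈ 0.333

    # Grade update if A wins the game (actual score S = 1)
    new_r_a = r_a + K * (1 - expected_a)  # ≈ 110.67

    print(round(expected_a, 3), round(new_r_a, 2))
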
46 | Advantages
47 | ------------
48 |
49 | - **Simplicity**: The linear relationship is easier to understand and calculate
50 | - **Local Optimization**: Designed specifically for the English chess community
51 | - **Historical Data**: Long history of use provides extensive comparative data
52 | - **Regular Updates**: The ECF publishes updated ratings multiple times per year
53 | - **Transparency**: Clear calculation methods that players can verify
54 |
55 | Limitations
56 | ------------
57 |
58 | - **Limited Range**: Works best within a certain range of skill differences
59 | - **Less Theoretical Basis**: The linear relationship is less theoretically justified than Elo's logistic curve
60 | - **Regional Focus**: Primarily used in England, limiting international comparability
61 | - **No Uncertainty Measure**: Unlike Glicko, doesn't account for rating reliability
62 | - **Fixed Parameters**: Less flexibility in parameter adjustment compared to other systems
63 |
64 | Implementation in Elote
65 | -------------------------
66 |
67 | Elote provides an implementation of the ECF rating system through the ``ECFCompetitor`` class:
68 |
69 | .. code-block:: python
70 |
71 | from elote import ECFCompetitor
72 |
73 | # Create two competitors with different initial grades
74 | player1 = ECFCompetitor(initial_rating=120)
75 | player2 = ECFCompetitor(initial_rating=150)
76 |
77 | # Get win probability
78 | win_probability = player2.expected_score(player1)
79 | print(f"Player 2 win probability: {win_probability:.2%}")
80 |
81 | # Record a match result
82 | player1.beat(player2) # Player 1 won!
83 |
84 | # Grades are automatically updated
85 | print(f"Player 1 new grade: {player1.rating}")
86 | print(f"Player 2 new grade: {player2.rating}")
87 |
88 | Customization
89 | ---------------
90 |
91 | The ``ECFCompetitor`` class allows for customization of the K-factor and the conversion factor:
92 |
93 | .. code-block:: python
94 |
95 | # Create a competitor with custom parameters
96 | player = ECFCompetitor(
97 | initial_rating=120,
98 | k_factor=20,
99 | f_factor=120
100 | )
101 |
102 | Key parameters:
103 | - **initial_rating**: Starting grade value
104 | - **k_factor**: Determines how quickly grades change (default: 16)
105 | - **f_factor**: Conversion factor for expected score calculation (default: 120)
106 |
107 | ECF to Elo Conversion
108 | -----------------------
109 |
110 | For those familiar with Elo ratings, ECF grades can be approximately converted to Elo ratings using the formula:
111 |
112 | .. math::
113 |
114 | \text{Elo} = 7.5 \times \text{ECF} + 700
115 |
116 | This means an ECF grade of 100 is roughly equivalent to an Elo rating of 1450.
117 |
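As a quick numeric check of this conversion, the formula can be written as a one-line helper (an illustrative sketch; ``ecf_to_elo`` is not part of Elote's API):

.. code-block:: python

    def ecf_to_elo(ecf_grade: float) -> float:
        """Approximate Elo rating for a given ECF grade."""
        return 7.5 * ecf_grade + 700

    print(ecf_to_elo(100))  # 1450.0, matching the example above
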
118 | Real-World Applications
119 | -------------------------
120 |
121 | The ECF rating system is primarily used in England for:
122 |
123 | - **Chess Tournaments**: Official ECF-rated events throughout England
124 | - **Club Play**: Local chess clubs use ECF grades for team selection and pairing
125 | - **Junior Development**: Tracking progress of young players
126 | - **National Rankings**: Determining England's top players
127 |
128 | References
129 | ------------
130 |
131 | 1. `ECF Grading System <http://www.ecfgrading.org.uk/new/help.php#elo>`_ - Official documentation
132 | 2. Clarke, P.H. (1982). "The Theory of Grading". British Chess Magazine.
133 | 3. Elo, Arpad (1978). *The Rating of Chessplayers, Past and Present*. Arco. ISBN 0-668-04721-6.
134 | 4. Sonas, Jeff (2002). "The Sonas Rating Formula - Better than Elo?". ChessBase News.
--------------------------------------------------------------------------------
/docs/source/rating_systems/dwz.rst:
--------------------------------------------------------------------------------
1 | DWZ Rating System
2 | ==================
3 |
4 | Overview
5 | --------
6 |
7 | The Deutsche Wertungszahl (DWZ), or German Evaluation Number, is the official chess rating system of the German Chess Federation (Deutscher Schachbund). Developed in the 1990s as a replacement for the previously used Ingo system, DWZ is similar to the Elo rating system but with some important modifications to better handle tournament play and player development.
8 |
9 | The DWZ system is particularly notable for its sophisticated handling of youth players, whose ratings tend to change more rapidly as they improve, and for its detailed approach to calculating expected outcomes based on rating differences.
10 |
11 | How It Works
12 | --------------
13 |
14 | The DWZ system uses the following key components:
15 |
16 | 1. **Rating (R)**: Represents the player's skill level
17 | 2. **Development Coefficient (E)**: Determines how quickly ratings change, with higher values for younger and less experienced players
18 | 3. **Performance Rating (P)**: The rating that would exactly match a player's tournament results
19 |
20 | The expected outcome calculation is similar to Elo:
21 |
22 | .. math::
23 |
24 | W_e = \frac{1}{1 + 10^{-(R_A - R_B) / 400}}
25 |
26 | Where:
27 | - :math:`W_e` is the expected score for player A
28 | - :math:`R_A` is the rating of player A
29 | - :math:`R_B` is the rating of player B
30 |
31 | After a tournament, the rating is updated using:
32 |
33 | .. math::
34 |
35 | R' = R + E \times (W - W_e)
36 |
37 | Where:
38 | - :math:`R'` is the new rating
39 | - :math:`E` is the development coefficient
40 | - :math:`W` is the actual score
41 | - :math:`W_e` is the expected score
42 |
43 | The development coefficient is calculated based on:
44 |
45 | .. math::
46 |
47 | E = E_0 \times f(A) \times f(n)
48 |
49 | Where:
50 | - :math:`E_0` is the base coefficient (typically 30)
51 | - :math:`f(A)` is an age factor (higher for younger players)
52 | - :math:`f(n)` is an experience factor based on number of rated games played
53 |
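To make the update concrete, here is a by-hand application of the expected-score and update formulas above for a single game (a standalone sketch using an illustrative development coefficient, not Elote's ``DWZCompetitor`` internals):

.. code-block:: python

    E = 30                  # development coefficient (illustrative value)
    r_a, r_b = 1600, 1800   # player A is the 200-point underdog

    # Expected score for player A
    w_e = 1 / (1 + 10 ** (-(r_a - r_b) / 400))   # ≈ 0.24

    # Rating update if A wins the game (actual score W = 1)
    new_r_a = r_a + E * (1 - w_e)                # ≈ 1622.8

    print(round(w_e, 3), round(new_r_a, 1))
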
54 | Advantages
55 | ------------
56 |
57 | - **Age Sensitivity**: Better handles rating changes for youth players
58 | - **Experience Factor**: Accounts for player experience level
59 | - **Tournament Focus**: Designed for batch updates after tournaments
60 | - **National Standardization**: Consistent application across German chess events
61 | - **Detailed Documentation**: Well-documented methodology with regular updates
62 |
63 | Limitations
64 | ------------
65 |
66 | - **Complexity**: More complex to calculate than basic Elo
67 | - **Regional Focus**: Primarily used in Germany and some neighboring countries
68 | - **No Uncertainty Measure**: Unlike Glicko, doesn't explicitly track rating reliability
69 | - **Parameter Sensitivity**: Results depend on proper calibration of multiple factors
70 | - **Less International Recognition**: Not as widely recognized as FIDE Elo ratings
71 |
72 | Implementation in Elote
73 | -------------------------
74 |
75 | Elote provides an implementation of the DWZ rating system through the ``DWZCompetitor`` class:
76 |
77 | .. code-block:: python
78 |
79 | from elote import DWZCompetitor
80 |
81 | # Create two competitors with different initial ratings
82 | player1 = DWZCompetitor(initial_rating=1600)
83 | player2 = DWZCompetitor(initial_rating=1800)
84 |
85 | # Get win probability
86 | win_probability = player2.expected_score(player1)
87 | print(f"Player 2 win probability: {win_probability:.2%}")
88 |
89 | # Record a match result
90 | player1.beat(player2) # Player 1 won!
91 |
92 | # Ratings are automatically updated
93 | print(f"Player 1 new rating: {player1.rating}")
94 | print(f"Player 2 new rating: {player2.rating}")
95 |
96 | Customization
97 | ---------------
98 |
99 | The ``DWZCompetitor`` class allows for customization of several parameters:
100 |
101 | .. code-block:: python
102 |
103 | # Create a competitor with custom parameters
104 | player = DWZCompetitor(
105 | initial_rating=1600,
106 | initial_development_coeff=30,
107 | base_development_coeff=30
108 | )
109 |
110 | Key parameters:
111 | - **initial_rating**: Starting rating value
112 | - **initial_development_coeff**: Starting development coefficient
113 | - **base_development_coeff**: Base value for development coefficient calculation
114 |
115 | DWZ to Elo Conversion
116 | -----------------------
117 |
118 | While DWZ and Elo use different calculation methods, the numerical values are designed to be roughly comparable. For practical purposes:
119 |
120 | .. math::
121 |
122 | \text{DWZ} \approx \text{Elo}
123 |
124 | However, due to different update mechanisms, the ratings may diverge over time for the same player.
125 |
126 | Real-World Applications
127 | -------------------------
128 |
129 | The DWZ rating system is used primarily in:
130 |
131 | - **German Chess Federation**: Official rating system for all German chess events
132 | - **Youth Development**: Specially calibrated for tracking youth player development
133 | - **Club Championships**: Used for local and regional tournaments in Germany
134 | - **National Rankings**: Determining Germany's top players
135 |
136 | References
137 | ------------
138 |
139 | 1. `Deutsche Wertungszahl <https://en.wikipedia.org/wiki/Deutsche_Wertungszahl>`_ - Wikipedia article
140 | 2. `Deutscher Schachbund <https://www.schachbund.de/dwz.html>`_ - Official German Chess Federation site
141 | 3. Hechenberger, A. (2001). "Die Deutsche Wertungszahl". Schach-Journal.
142 | 4. Glickman, Mark E. (1995). "A Comprehensive Guide to Chess Ratings". American Chess Journal, 3, 59-102.
--------------------------------------------------------------------------------
/tests/test_ECFCompetitor_known_values.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from elote import ECFCompetitor
3 |
4 |
5 | class TestECFKnownValues(unittest.TestCase):
6 | """Tests for ECFCompetitor with known values to verify correctness after optimization."""
7 |
8 | def test_initial_rating(self):
9 | """Test that initial rating is set correctly."""
10 | player = ECFCompetitor(initial_rating=100)
11 | self.assertEqual(player.rating, 100)
12 |
13 | player = ECFCompetitor(initial_rating=120)
14 | self.assertEqual(player.rating, 120)
15 |
16 | def test_elo_conversion(self):
17 | """Test that elo_conversion property returns the correct value."""
18 | player = ECFCompetitor(initial_rating=100)
19 | self.assertEqual(player.elo_conversion, 100 * 7.5 + 700)
20 |
21 | player = ECFCompetitor(initial_rating=120)
22 | self.assertEqual(player.elo_conversion, 120 * 7.5 + 700)
23 |
24 | def test_transformed_elo_rating(self):
25 | """Test that transformed_elo_rating property returns the correct value."""
26 | player = ECFCompetitor(initial_rating=100)
27 | expected = 10 ** ((100 * 7.5 + 700) / 400)
28 | self.assertAlmostEqual(player.transformed_elo_rating, expected)
29 |
30 | # Test caching - should return the same value without recalculating
31 | self.assertAlmostEqual(player.transformed_elo_rating, expected)
32 |
33 | def test_expected_score(self):
34 | """Test expected_score with known values."""
35 | player1 = ECFCompetitor(initial_rating=100)
36 | player2 = ECFCompetitor(initial_rating=120)
37 |
38 | # Calculate expected values manually
39 | p1_transformed = 10 ** ((100 * 7.5 + 700) / 400)
40 | p2_transformed = 10 ** ((120 * 7.5 + 700) / 400)
41 | expected = p1_transformed / (p1_transformed + p2_transformed)
42 |
43 | self.assertAlmostEqual(player1.expected_score(player2), expected)
44 |
45 | def test_beat_with_known_values(self):
46 | """Test beat method with known values."""
47 | player1 = ECFCompetitor(initial_rating=100)
48 | player2 = ECFCompetitor(initial_rating=120)
49 |
50 | # Player1 beats player2
51 | player1.beat(player2)
52 |
53 | # After player1 beats player2, player1's rating should be updated
54 | # The new rating is the mean of the scores in the deque
55 | # Since we just initialized the deque, it contains [100] initially
56 | # After beat, it contains [100, 120+50] = [100, 170]
57 | # So the mean is (100 + 170) / 2 = 135
58 | self.assertEqual(player1.rating, 135)
59 |
60 | # After player1 beats player2, player2's rating should be updated
61 | # The new rating is the mean of the scores in the deque
62 | # Since we just initialized the deque, it contains [120] initially
63 | # After beat, it contains [120, 100-50] = [120, 50]
64 | # The minimum rating check is applied when adding to the deque, not when calculating the mean
65 | # So the mean is (120 + 50) / 2 = 85
66 | self.assertEqual(player2.rating, 85)
67 |
68 | def test_tied_with_known_values(self):
69 | """Test tied method with known values."""
70 | player1 = ECFCompetitor(initial_rating=100)
71 | player2 = ECFCompetitor(initial_rating=120)
72 |
73 | # Players tie
74 | player1.tied(player2)
75 |
76 | # After tie, player1's rating should be updated
77 | # The new rating is the mean of the scores in the deque
78 | # Since we just initialized the deque, it contains [100] initially
79 | # After tie, it contains [100, 120] = [100, 120]
80 | # So the mean is (100 + 120) / 2 = 110
81 | self.assertEqual(player1.rating, 110)
82 |
83 | # After tie, player2's rating should be updated
84 | # The new rating is the mean of the scores in the deque
85 | # Since we just initialized the deque, it contains [120] initially
86 | # After tie, it contains [120, 100] = [120, 100]
87 | # So the mean is (120 + 100) / 2 = 110
88 | self.assertEqual(player2.rating, 110)
89 |
90 | def test_delta_limit(self):
91 | """Test that the delta limit is applied correctly."""
92 | player1 = ECFCompetitor(initial_rating=100)
93 | player2 = ECFCompetitor(initial_rating=200) # Rating difference > delta (50)
94 |
95 | # Player1 beats player2
96 | player1.beat(player2)
97 |
98 | # Since difference > delta, player2's effective rating should be limited
99 | # The effective rating of player2 is limited to player1's rating + delta = 100 + 50 = 150
100 | # After beat, player1's scores deque contains [100, 150+50] = [100, 200]
101 | # So the mean is (100 + 200) / 2 = 150
102 | self.assertEqual(player1.rating, 150)
103 |
104 | def test_scores_deque_behavior(self):
105 | """Test that the scores deque behaves correctly with maxlen."""
106 | player = ECFCompetitor(initial_rating=100)
107 |
108 | # Initialize scores
109 | if player.scores is None:
110 | player._ECFCompetitor__initialize_ratings()
111 |
112 | # Add more than _n_periods scores
113 | for i in range(player._n_periods + 10):
114 | player._update(i)
115 |
116 | # Check that only the last _n_periods scores are kept
117 | self.assertEqual(len(player.scores), player._n_periods)
118 |
119 | # Check that the oldest scores were dropped
120 | self.assertEqual(min(player.scores), player._n_periods + 10 - player._n_periods)
121 |
122 |
123 | if __name__ == "__main__":
124 | unittest.main()
125 |
--------------------------------------------------------------------------------
/docs/source/rating_systems/glicko.rst:
--------------------------------------------------------------------------------
1 | Glicko Rating System
2 | ======================
3 |
4 | Overview
5 | --------
6 |
7 | The Glicko rating system was developed by Mark Glickman in 1995 as an improvement over the Elo rating system. The key innovation of Glicko is the introduction of a "rating deviation" (RD) parameter that measures the uncertainty in a player's rating. This addresses one of the main limitations of the Elo system, which doesn't account for rating reliability.
8 |
9 | The name "Glicko" is derived from the creator's surname, Glickman. The system has since been further refined into Glicko-2, though Elote currently implements the original Glicko-1 system.
10 |
11 | How It Works
12 | --------------
13 |
14 | The Glicko system uses three key parameters:
15 |
16 | 1. **Rating (r)**: Represents the player's skill level, similar to Elo
17 | 2. **Rating Deviation (RD)**: Represents the uncertainty in the rating (higher RD = more uncertainty)
18 | 3. **Time Factor (c)**: Controls how much the RD increases over time without playing
19 |
20 | The expected outcome calculation is similar to Elo but incorporates the rating deviations:
21 |
22 | .. math::
23 |
24 | E(A, B) = \frac{1}{1 + 10^{-g(RD_B) \times (r_A - r_B) / 400}}
25 |
26 | Where:
27 | - :math:`g(RD) = \frac{1}{\sqrt{1 + 3 q^2 RD^2 / \pi^2}}`, where :math:`q = \ln(10) / 400`
28 | - :math:`r_A` and :math:`r_B` are the ratings of players A and B
29 | - :math:`RD_A` and :math:`RD_B` are their rating deviations
30 |
31 | After a match, both the rating and rating deviation are updated:
32 |
33 | .. math::
34 |
35 | r'_A = r_A + \frac{q}{1/RD_A^2 + 1/d^2} \times g(RD_B) \times (S_A - E(A, B))
36 |
37 | .. math::
38 |
39 | RD'_A = \sqrt{\frac{1}{1/RD_A^2 + 1/d^2}}
40 |
41 | Where:
42 | - :math:`q = \ln(10) / 400`
43 | - :math:`d^2 = 1 / (q^2 \times g(RD_B)^2 \times E(A, B) \times (1 - E(A, B)))`
44 | - :math:`S_A` is the actual score (1 for win, 0.5 for draw, 0 for loss)
45 |
46 | When a player doesn't compete for a period, their RD increases:
47 |
48 | .. math::
49 |
50 | RD'_A = \min(\sqrt{RD_A^2 + c^2 \times t}, RD_{max})
51 |
52 | Where:
53 | - :math:`t` is the time since last competition
54 | - :math:`c` is the volatility constant
55 | - :math:`RD_{max}` is the maximum allowed rating deviation
56 |
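The attenuation factor and expected score above can be computed directly from these definitions. The snippet below is a standalone sketch of those two formulas only (it is not Elote's ``GlickoCompetitor`` implementation):

.. code-block:: python

    import math

    q = math.log(10) / 400

    def g(rd: float) -> float:
        """Attenuation factor g(RD) from the expected-score formula."""
        return 1 / math.sqrt(1 + 3 * q**2 * rd**2 / math.pi**2)

    def expected_score(r_a: float, r_b: float, rd_b: float) -> float:
        return 1 / (1 + 10 ** (-g(rd_b) * (r_a - r_b) / 400))

    # A 200-point underdog facing an opponent with RD = 300
    print(round(expected_score(1500, 1700, 300), 3))  # ≈ 0.30
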
57 | Advantages
58 | ------------
59 |
60 | - **Uncertainty Measurement**: Accounts for the reliability of a player's rating
61 | - **Inactivity Handling**: Automatically increases uncertainty for inactive players
62 | - **More Accurate Matchmaking**: Can match players with similar ratings but different uncertainties
63 | - **Faster Convergence**: New players can reach their true skill level faster
64 | - **Better for Sparse Data**: Works well when players don't compete frequently
65 |
66 | Limitations
67 | ------------
68 |
69 | - **Complexity**: More complex to understand and implement than Elo
70 | - **Parameter Sensitivity**: Results depend on proper tuning of multiple parameters
71 | - **Computational Overhead**: Requires more calculations than Elo
72 | - **No Volatility Tracking**: Unlike Glicko-2, doesn't track how volatile a player's performance is
73 | - **Batch Updates**: Originally designed for updating ratings in batches rather than after each game
74 |
75 | Implementation in Elote
76 | -------------------------
77 |
78 | Elote provides an implementation of the Glicko-1 rating system through the ``GlickoCompetitor`` class:
79 |
80 | .. code-block:: python
81 |
82 | from elote import GlickoCompetitor
83 |
84 | # Create two competitors with different initial ratings and RDs
85 | player1 = GlickoCompetitor(initial_rating=1500, initial_rd=350)
86 | player2 = GlickoCompetitor(initial_rating=1700, initial_rd=300)
87 |
88 | # Get win probability
89 | win_probability = player2.expected_score(player1)
90 | print(f"Player 2 win probability: {win_probability:.2%}")
91 |
92 | # Record a match result
93 | player1.beat(player2) # Player 1 won!
94 |
95 | # Ratings and RDs are automatically updated
96 | print(f"Player 1 new rating: {player1.rating}, RD: {player1.rd}")
97 | print(f"Player 2 new rating: {player2.rating}, RD: {player2.rd}")
98 |
99 | Customization
100 | ---------------
101 |
102 | The ``GlickoCompetitor`` class allows for customization of several parameters:
103 |
104 | .. code-block:: python
105 |
106 | # Create a competitor with custom parameters
107 | player = GlickoCompetitor(
108 | initial_rating=1500,
109 | initial_rd=350,
110 | volatility=0.06,
111 | tau=0.5
112 | )
113 |
114 | Key parameters:
115 | - **initial_rating**: Starting rating value (default: 1500)
116 | - **initial_rd**: Starting rating deviation (default: 350)
117 | - **volatility**: How much RD increases over time (default: 0.06)
118 | - **tau**: System constant affecting rating changes (default: 0.5)
119 |
120 | Real-World Applications
121 | -------------------------
122 |
123 | The Glicko rating system is used in various competitive domains:
124 |
125 | - **Chess**: Used by the Australian Chess Federation and Free Internet Chess Server
126 | - **Video Games**: Used in modified form by many competitive games
127 | - **Online Platforms**: Used by lichess.org and other competitive platforms
128 | - **Sports Analytics**: Used for player performance analysis in various sports
129 |
130 | References
131 | ------------
132 |
133 | 1. Glickman, Mark E. (1995). "A Comprehensive Guide to Chess Ratings". American Chess Journal, 3, 59-102.
134 | 2. Glickman, Mark E. (1999). "Parameter estimation in large dynamic paired comparison experiments". Applied Statistics, 48, 377-394.
135 | 3. Glickman, Mark E. (2001). "Dynamic paired comparison models with stochastic variances". Journal of Applied Statistics, 28, 673-689.
136 | 4. `The Glicko System <http://www.glicko.net/glicko/glicko.pdf>`_ - Original paper by Mark Glickman
--------------------------------------------------------------------------------
/examples/dataset_example.py:
--------------------------------------------------------------------------------
1 | """
2 | Example of using datasets with different rating algorithms.
3 |
4 | This example demonstrates how to use the datasets module to evaluate different rating algorithms.
5 | """
6 |
7 | import time
8 | import pandas as pd
9 | import matplotlib.pyplot as plt
10 |
11 | from elote import (
12 | LambdaArena,
13 | EloCompetitor,
14 | GlickoCompetitor,
15 | Glicko2Competitor,
16 | TrueSkillCompetitor,
17 | SyntheticDataset,
18 | ChessDataset,
19 | CollegeFootballDataset,
20 | train_and_evaluate_arena,
21 | )
22 |
23 |
24 | def progress_callback(phase, current, total):
25 | """Callback function for reporting progress."""
26 | if current == 0:
27 | print(f"\nStarting {phase} phase...")
28 | elif current == total:
29 | print(f"\nCompleted {phase} phase.")
30 |
31 |
32 | def evaluate_algorithms_on_dataset(dataset_name, dataset, test_ratio=0.2, seed=42):
33 | """
34 | Evaluate different rating algorithms on a dataset.
35 |
36 | Args:
37 | dataset_name: Name of the dataset
38 | dataset: Dataset object
39 | test_ratio: Ratio of data to use for testing
40 | seed: Random seed for reproducibility
41 | """
42 | print(f"\n=== Evaluating algorithms on {dataset_name} dataset ===")
43 |
44 | # Split the dataset into train and test sets
45 | print(f"Splitting dataset with test_ratio={test_ratio}...")
46 | data_split = dataset.time_split(test_ratio=test_ratio)
47 | print(f"Split complete: {len(data_split.train)} train samples, {len(data_split.test)} test samples")
48 |
49 | # Define the algorithms to evaluate
50 | algorithms = [
51 | ("Elo", EloCompetitor, {"initial_rating": 1500}),
52 | ("Glicko", GlickoCompetitor, {"initial_rating": 1500}),
53 | ("Glicko-2", Glicko2Competitor, {"initial_rating": 1500}),
54 | ("TrueSkill", TrueSkillCompetitor, {}),
55 | ]
56 |
57 | # Evaluate each algorithm
58 | results = []
59 |
60 | for algo_name, competitor_class, competitor_kwargs in algorithms:
61 | print(f"\nEvaluating {algo_name}...")
62 | start_time = time.time()
63 |
64 | # Create an arena with the algorithm
65 | arena = LambdaArena(
66 | lambda a, b, attributes=None: True, # Dummy function, not used in this example
67 | base_competitor=competitor_class,
68 | base_competitor_kwargs=competitor_kwargs,
69 | )
70 |
71 | # Train and evaluate the arena
72 | _, history = train_and_evaluate_arena(
73 | arena,
74 | data_split,
75 | batch_size=1000,
76 | progress_callback=progress_callback,
77 | )
78 |
79 | # Calculate metrics
80 | metrics = history.calculate_metrics()
81 | accuracy = metrics["accuracy"]
82 | precision = metrics["precision"]
83 | recall = metrics["recall"]
84 | f1 = metrics["f1"]
85 |
86 | end_time = time.time()
87 | elapsed_time = end_time - start_time
88 |
89 | # Store results
90 | results.append(
91 | {
92 | "Algorithm": algo_name,
93 | "Accuracy": accuracy,
94 | "Precision": precision,
95 | "Recall": recall,
96 | "F1 Score": f1,
97 | "Time (s)": elapsed_time,
98 | }
99 | )
100 |
101 | print(f"{algo_name} evaluation complete in {elapsed_time:.2f} seconds")
102 | print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
103 |
104 | # Convert results to DataFrame
105 | results_df = pd.DataFrame(results)
106 |
107 | # Print results table
108 | print("\nResults:")
109 | print(results_df.to_string(index=False))
110 |
111 | # Plot results
112 | plt.figure(figsize=(12, 6))
113 |
114 | # Plot accuracy, precision, recall, F1
115 | metrics = ["Accuracy", "Precision", "Recall", "F1 Score"]
116 | for i, metric in enumerate(metrics):
117 | plt.subplot(1, 2, 1)
118 | plt.bar([x + i * 0.2 for x in range(len(algorithms))], results_df[metric], width=0.2, label=metric)
119 |
120 | plt.xlabel("Algorithm")
121 | plt.ylabel("Score")
122 | plt.title(f"Performance Metrics on {dataset_name} Dataset")
123 | plt.xticks([i + 0.3 for i in range(len(algorithms))], results_df["Algorithm"])
124 | plt.legend()
125 | plt.grid(axis="y", linestyle="--", alpha=0.7)
126 |
127 | # Plot time
128 | plt.subplot(1, 2, 2)
129 | plt.bar(results_df["Algorithm"], results_df["Time (s)"])
130 | plt.xlabel("Algorithm")
131 | plt.ylabel("Time (s)")
132 | plt.title(f"Execution Time on {dataset_name} Dataset")
133 | plt.grid(axis="y", linestyle="--", alpha=0.7)
134 |
135 | plt.tight_layout()
136 | plt.savefig(f"{dataset_name.lower().replace(' ', '_')}_results.png")
137 | plt.close()
138 |
139 |
140 | def main():
141 | """Main function."""
142 | # Evaluate on synthetic dataset
143 | print("Generating synthetic dataset...")
144 | synthetic_dataset = SyntheticDataset(
145 | num_competitors=100,
146 | num_matchups=5000,
147 | skill_distribution="normal",
148 | skill_mean=1500,
149 | skill_std=300,
150 | noise_std=100,
151 | draw_probability=0.1,
152 | time_span_days=365,
153 | seed=42,
154 | )
155 | evaluate_algorithms_on_dataset("Synthetic", synthetic_dataset, test_ratio=0.2, seed=42)
156 |
157 | # Evaluate on chess dataset
158 | print("\nLoading chess dataset...")
159 | chess_dataset = ChessDataset(max_games=5000, year=2013, month=1)
160 | evaluate_algorithms_on_dataset("Chess", chess_dataset, test_ratio=0.2, seed=42)
161 |
162 | # Evaluate on college football dataset
163 | print("\nLoading college football dataset...")
164 | football_dataset = CollegeFootballDataset(start_year=2015, end_year=2022)
165 | evaluate_algorithms_on_dataset("College Football", football_dataset, test_ratio=0.2, seed=42)
166 |
167 |
168 | if __name__ == "__main__":
169 | main()
170 |
--------------------------------------------------------------------------------
/tests/test_examples.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import subprocess
4 | import sys
5 | from pathlib import Path
6 |
7 |
8 | class TestExamples(unittest.TestCase):
9 | """Tests that verify all example scripts run without errors."""
10 |
11 | def setUp(self):
12 | # Get the root directory of the project
13 | self.root_dir = Path(__file__).parent.parent
14 | self.examples_dir = self.root_dir / "examples"
15 |
16 | # Skip examples that require external API access or take too long to run in tests
17 | self.skip_examples = [
18 | "use_cases/cfb_w_lib.py", # Requires external API access
19 | "use_cases/chess_w_lib.py", # Takes too long to run in tests
20 | "dataset_example.py", # Takes too long to run in tests
21 | # "persist_state_arena.py", # Times out in tests
22 | # "sample_bout.py", # Times out in tests
23 | # "colley_matrix_example.py", # Times out in tests
24 | # "bout_with_initialization.py", # Added to prevent timeout
25 | # "prediction.py", # Added to prevent timeout
26 | ]
27 |
28 | def test_example_scripts(self):
29 | """Test that all example scripts run without errors."""
30 | # Get all Python files in the examples directory
31 | example_files = []
32 | for root, _, files in os.walk(self.examples_dir):
33 | for file in files:
34 | if file.endswith(".py") and file != "__init__.py":
35 | rel_path = os.path.relpath(os.path.join(root, file), self.examples_dir)
36 | if rel_path not in self.skip_examples:
37 | example_files.append(rel_path)
38 |
39 | # Make sure we found some example files
40 | self.assertGreater(len(example_files), 0, "No example files found")
41 |
42 | # Run each example script and check for errors
43 | for example_file in example_files:
44 | with self.subTest(example=example_file):
45 | script_path = os.path.join(self.examples_dir, example_file)
46 |
47 | # Run the script with a timeout to prevent hanging
48 | try:
49 | result = subprocess.run(
50 | [sys.executable, script_path],
51 | capture_output=True,
52 | text=True,
53 | timeout=60,
54 | )
55 |
56 | # Check if the script ran successfully
57 | self.assertEqual(
58 | result.returncode, 0, f"Example {example_file} failed with error:\n{result.stderr}"
59 | )
60 | except subprocess.TimeoutExpired:
61 |                     self.fail(f"Example {example_file} timed out after 60 seconds")
62 |
63 | def test_individual_examples(self):
64 | """Test each example script individually with specific assertions."""
65 | # Test sample_bout.py - skip if in skip_examples
66 | if "sample_bout.py" not in self.skip_examples:
67 | self._test_specific_example(
68 | "sample_bout.py", expected_output_contains=["Starting ratings:", "After matches"]
69 | )
70 |
71 | # Test prediction.py
72 | self._test_specific_example("prediction.py", expected_output_contains=["probability of better beating good"])
73 |
74 | # Test bout_with_ties.py
75 | self._test_specific_example(
76 | "bout_with_ties.py", expected_output_contains=["Starting ratings:", "After matches with ties"]
77 | )
78 |
79 | # Test sample_arena.py
80 | self._test_specific_example("sample_arena.py", expected_output_contains=["Arena results"])
81 |
82 | # Test dwz_arena.py
83 | self._test_specific_example("dwz_arena.py", expected_output_contains=["Arena results"])
84 |
85 | # Test ecf_arena.py
86 | self._test_specific_example("ecf_arena.py", expected_output_contains=["Arena results"])
87 |
88 | # Test glicko_arena.py
89 | self._test_specific_example("glicko_arena.py", expected_output_contains=["Arena results"])
90 |
91 | # Test persist_state_arena.py - skip if in skip_examples
92 | if "persist_state_arena.py" not in self.skip_examples:
93 | self._test_specific_example("persist_state_arena.py", expected_output_contains=["Arena results"])
94 |
95 | # Test bout_with_initialization.py
96 | self._test_specific_example(
97 | "bout_with_initialization.py", expected_output_contains=["Starting ratings:", "After matches"]
98 | )
99 |
100 | # Test colley_matrix_example.py
101 | self._test_specific_example(
102 | "colley_matrix_example.py",
103 | expected_output_contains=["Initial ratings:", "Final ratings:", "Sum of all ratings"],
104 | )
105 |
106 | # Test colley_matrix_comparison.py
107 | self._test_specific_example(
108 | "colley_matrix_comparison.py",
109 | expected_output_contains=["Simulating tournament", "Colley Matrix Method is not sensitive to match order"],
110 | )
111 |
112 | def _test_specific_example(self, example_file, expected_output_contains):
113 | """Helper method to test a specific example with expected output."""
114 | script_path = os.path.join(self.examples_dir, example_file)
115 |
116 | # Run the script
117 | try:
118 | result = subprocess.run([sys.executable, script_path], capture_output=True, text=True, timeout=10)
119 |
120 | # Check if the script ran successfully
121 | self.assertEqual(result.returncode, 0, f"Example {example_file} failed with error:\n{result.stderr}")
122 |
123 | # Check if the output contains expected strings
124 | for expected in expected_output_contains:
125 | self.assertIn(expected, result.stdout, f"Example {example_file} output did not contain '{expected}'")
126 | except subprocess.TimeoutExpired:
127 | self.fail(f"Example {example_file} timed out after 10 seconds")
128 |
129 |
130 | if __name__ == "__main__":
131 | unittest.main()
132 |
--------------------------------------------------------------------------------
/docs/source/rating_systems/ensemble.rst:
--------------------------------------------------------------------------------
1 | Ensemble Rating System
2 | ========================
3 |
4 | Overview
5 | --------
6 |
7 | The Ensemble rating system in Elote is a meta-rating approach that combines multiple rating systems to leverage their individual strengths while mitigating their weaknesses. By aggregating predictions from different rating algorithms, the Ensemble system can potentially provide more robust and accurate predictions than any single rating system alone.
8 |
9 | This approach is inspired by ensemble methods in machine learning, where combining multiple models often leads to better performance than any individual model. The Ensemble competitor in Elote allows you to combine any of the implemented rating systems (Elo, Glicko, ECF, DWZ) with customizable weights.
10 |
11 | How It Works
12 | --------------
13 |
14 | The Ensemble rating system works by:
15 |
16 | 1. Maintaining multiple rating systems for each competitor
17 | 2. Calculating expected outcomes from each system
18 | 3. Combining these predictions using a weighted average
19 | 4. Updating each underlying rating system after matches
20 |
21 | The expected outcome calculation is:
22 |
23 | .. math::
24 |
25 | E_{ensemble} = \sum_{i=1}^{n} w_i \times E_i
26 |
27 | Where:
28 | - :math:`E_{ensemble}` is the ensemble expected score
29 | - :math:`E_i` is the expected score from rating system i
30 | - :math:`w_i` is the weight assigned to rating system i
31 | - :math:`n` is the number of rating systems in the ensemble
32 |
33 | After a match, each underlying rating system is updated according to its own update rules, and the ensemble prediction is recalculated.
34 |
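The weighted combination itself is a plain weighted average. The helper below illustrates the arithmetic with made-up expected scores and weights (it is not the ``EnsembleCompetitor`` API):

.. code-block:: python

    def ensemble_expected(expected_scores, weights):
        """Weighted average of per-system expected scores; weights assumed to sum to 1."""
        return sum(w * e for e, w in zip(expected_scores, weights))

    # e.g. Elo predicts 0.64, Glicko predicts 0.58, weighted 0.7 / 0.3
    print(round(ensemble_expected([0.64, 0.58], [0.7, 0.3]), 3))  # 0.622
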
35 | Advantages
36 | ------------
37 |
38 | - **Robustness**: Less sensitive to the quirks of any single rating system
39 | - **Accuracy**: Can achieve better predictive performance by combining complementary systems
40 | - **Flexibility**: Can be customized with different component systems and weights
41 | - **Adaptability**: Works well across different domains and competition structures
42 | - **Graceful Degradation**: If one system performs poorly in a specific scenario, others can compensate
43 |
44 | Limitations
45 | ------------
46 |
47 | - **Complexity**: More complex to implement and understand than single rating systems
48 | - **Computational Overhead**: Requires calculating and updating multiple rating systems
49 | - **Parameter Tuning**: Finding optimal weights may require experimentation
50 | - **Black Box Nature**: The combined prediction may be harder to interpret
51 | - **Cold Start**: Requires sufficient data to properly calibrate all component systems
52 |
53 | Implementation in Elote
54 | -------------------------
55 |
56 | Elote provides an implementation of the Ensemble rating system through the ``EnsembleCompetitor`` class:
57 |
58 | .. code-block:: python
59 |
60 | from elote import EnsembleCompetitor
61 | from elote import EloCompetitor, GlickoCompetitor
62 |
63 | # Create an ensemble with Elo and Glicko components
64 | player1 = EnsembleCompetitor(
65 | rating_systems=[
66 | (EloCompetitor(initial_rating=1500), 0.7),
67 | (GlickoCompetitor(initial_rating=1500, initial_rd=350), 0.3)
68 | ]
69 | )
70 |
71 | player2 = EnsembleCompetitor(
72 | rating_systems=[
73 | (EloCompetitor(initial_rating=1600), 0.7),
74 | (GlickoCompetitor(initial_rating=1600, initial_rd=350), 0.3)
75 | ]
76 | )
77 |
78 | # Get win probability
79 | win_probability = player2.expected_score(player1)
80 | print(f"Player 2 win probability: {win_probability:.2%}")
81 |
82 | # Record a match result
83 | player1.beat(player2) # Player 1 won!
84 |
85 | # All underlying ratings are automatically updated
86 | print(f"Player 1 ensemble expected score vs Player 2: {player1.expected_score(player2):.2%}")
87 |
88 | Customization
89 | ---------------
90 |
91 | The ``EnsembleCompetitor`` class allows for extensive customization:
92 |
93 | .. code-block:: python
94 |
95 | from elote import EnsembleCompetitor, EloCompetitor, GlickoCompetitor, ECFCompetitor, DWZCompetitor
96 |
97 | # Create an ensemble with all available rating systems
98 | player = EnsembleCompetitor(
99 | rating_systems=[
100 | (EloCompetitor(initial_rating=1500), 0.4),
101 | (GlickoCompetitor(initial_rating=1500), 0.3),
102 | (ECFCompetitor(initial_rating=120), 0.2),
103 | (DWZCompetitor(initial_rating=1500), 0.1)
104 | ]
105 | )
106 |
107 | Key considerations:
108 | - The weights should sum to 1.0 for proper probabilistic interpretation (a normalization sketch follows this list)
109 | - Higher weights give more influence to that rating system
110 | - You can include any combination of rating systems
111 | - Each component can be customized with its own parameters
112 |
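If your raw weights do not already sum to 1.0, they can be normalized before building the ensemble (an illustrative snippet, independent of Elote's API):

.. code-block:: python

    raw = {"elo": 2.0, "glicko": 1.5, "ecf": 0.5}
    total = sum(raw.values())
    weights = {name: w / total for name, w in raw.items()}
    print(weights)  # {'elo': 0.5, 'glicko': 0.375, 'ecf': 0.125}
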
113 | Choosing Weights
114 | ------------------
115 |
116 | There are several approaches to choosing weights for your ensemble:
117 |
118 | 1. **Equal Weights**: Start with equal weights for all systems
119 | 2. **Domain Knowledge**: Assign weights based on known performance in your domain
120 | 3. **Cross-Validation**: Use historical data to find optimal weights
121 | 4. **Adaptive Weights**: Dynamically adjust weights based on each system's performance
122 |
123 | For most applications, starting with equal weights and then adjusting based on observed performance is a practical approach.
124 |
125 | Real-World Applications
126 | -------------------------
127 |
128 | Ensemble rating systems are valuable in:
129 |
130 | - **Sports Analytics**: Combining multiple models for more accurate predictions
131 | - **Game Matchmaking**: Creating balanced matches in competitive games
132 | - **Recommendation Systems**: Ranking items based on multiple criteria
133 | - **Tournament Design**: Seeding players based on robust ratings
134 | - **Decision Making**: Aggregating multiple ranking methods for group decisions
135 |
136 | References
137 | ------------
138 |
139 | 1. Dietterich, T. G. (2000). "Ensemble Methods in Machine Learning". Multiple Classifier Systems, 1-15.
140 | 2. Seni, G., & Elder, J. F. (2010). "Ensemble Methods in Data Mining: Improving Accuracy Through Combining Predictions". Synthesis Lectures on Data Mining and Knowledge Discovery, 2(1), 1-126.
141 | 3. Graepel, T., Herbrich, R., & Gold, J. (2004). "Learning to Fight". Proceedings of the International Conference on Computer Games: Artificial Intelligence, Design and Education.
--------------------------------------------------------------------------------
/tests/test_EloCompetitor_known_values.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from elote import EloCompetitor
3 |
4 |
5 | class TestEloKnownValues(unittest.TestCase):
6 | """Tests for EloCompetitor with known values to verify correctness after optimization."""
7 |
8 | def test_initial_rating(self):
9 | """Test that initial rating is set correctly."""
10 | player = EloCompetitor(initial_rating=400)
11 | self.assertEqual(player.rating, 400)
12 |
13 | player = EloCompetitor(initial_rating=1000)
14 | self.assertEqual(player.rating, 1000)
15 |
16 | def test_transformed_rating(self):
17 | """Test that transformed_rating property returns the correct value."""
18 | player = EloCompetitor(initial_rating=400)
19 | expected = 10 ** (400 / 400) # Should be 10^1 = 10
20 | self.assertEqual(player.transformed_rating, expected)
21 |
22 | player = EloCompetitor(initial_rating=800)
23 | expected = 10 ** (800 / 400) # Should be 10^2 = 100
24 | self.assertEqual(player.transformed_rating, expected)
25 |
26 | def test_expected_score(self):
27 | """Test expected_score with known values."""
28 | player1 = EloCompetitor(initial_rating=400)
29 | player2 = EloCompetitor(initial_rating=400)
30 |
31 | # Equal ratings should give 0.5 expected score
32 | self.assertEqual(player1.expected_score(player2), 0.5)
33 |
34 | player1 = EloCompetitor(initial_rating=400)
35 | player2 = EloCompetitor(initial_rating=600)
36 |
37 | # Calculate expected values manually
38 | p1_transformed = 10 ** (400 / 400) # 10
39 | p2_transformed = 10 ** (600 / 400) # 10^1.5 ≈ 31.6228
40 | expected = p1_transformed / (p1_transformed + p2_transformed)
41 |
42 | self.assertAlmostEqual(player1.expected_score(player2), expected)
43 | self.assertAlmostEqual(player2.expected_score(player1), 1 - expected)
44 |
45 | def test_beat_with_known_values(self):
46 | """Test beat method with known values."""
47 | player1 = EloCompetitor(initial_rating=400, k_factor=32)
48 | player2 = EloCompetitor(initial_rating=400, k_factor=32)
49 |
50 | # Store original ratings
51 | p1_original = player1.rating
52 | p2_original = player2.rating
53 |
54 | # Calculate expected scores
55 | win_es = player1.expected_score(player2) # Should be 0.5
56 | lose_es = player2.expected_score(player1) # Should be 0.5
57 |
58 | # Calculate expected new ratings
59 | p1_new_rating = p1_original + 32 * (1 - win_es) # 400 + 32 * 0.5 = 416
60 | p2_new_rating = p2_original + 32 * (0 - lose_es) # 400 + 32 * -0.5 = 384
61 |
62 | # Player1 beats player2
63 | player1.beat(player2)
64 |
65 | # Check new ratings
66 | self.assertAlmostEqual(player1.rating, p1_new_rating)
67 | self.assertAlmostEqual(player2.rating, p2_new_rating)
68 |
69 | # Test with different ratings
70 | player1 = EloCompetitor(initial_rating=500, k_factor=32)
71 | player2 = EloCompetitor(initial_rating=400, k_factor=32)
72 |
73 | # Store original ratings
74 | p1_original = player1.rating
75 | p2_original = player2.rating
76 |
77 | # Calculate expected scores
78 | win_es = player1.expected_score(player2)
79 | lose_es = player2.expected_score(player1)
80 |
81 | # Calculate expected new ratings
82 | p1_new_rating = p1_original + 32 * (1 - win_es)
83 | p2_new_rating = p2_original + 32 * (0 - lose_es)
84 |
85 | # Player1 beats player2
86 | player1.beat(player2)
87 |
88 | # Check new ratings
89 | self.assertAlmostEqual(player1.rating, p1_new_rating)
90 | self.assertAlmostEqual(player2.rating, p2_new_rating)
91 |
92 | def test_tied_with_known_values(self):
93 | """Test tied method with known values."""
94 | player1 = EloCompetitor(initial_rating=400, k_factor=32)
95 | player2 = EloCompetitor(initial_rating=400, k_factor=32)
96 |
97 | # Store original ratings
98 | p1_original = player1.rating
99 | p2_original = player2.rating
100 |
101 | # Calculate expected scores
102 | win_es = player1.expected_score(player2) # Should be 0.5
103 | lose_es = player2.expected_score(player1) # Should be 0.5
104 |
105 | # Calculate expected new ratings
106 | p1_new_rating = p1_original + 32 * (0.5 - win_es) # 400 + 32 * 0 = 400
107 | p2_new_rating = p2_original + 32 * (0.5 - lose_es) # 400 + 32 * 0 = 400
108 |
109 | # Players tie
110 | player1.tied(player2)
111 |
112 | # Check new ratings - should be unchanged for equal ratings
113 | self.assertAlmostEqual(player1.rating, p1_new_rating)
114 | self.assertAlmostEqual(player2.rating, p2_new_rating)
115 |
116 | # Test with different ratings
117 | player1 = EloCompetitor(initial_rating=500, k_factor=32)
118 | player2 = EloCompetitor(initial_rating=400, k_factor=32)
119 |
120 | # Store original ratings
121 | p1_original = player1.rating
122 | p2_original = player2.rating
123 |
124 | # Calculate expected scores
125 | win_es = player1.expected_score(player2)
126 | lose_es = player2.expected_score(player1)
127 |
128 | # Calculate expected new ratings
129 | p1_new_rating = p1_original + 32 * (0.5 - win_es) # Should decrease for higher rated player
130 | p2_new_rating = p2_original + 32 * (0.5 - lose_es) # Should increase for lower rated player
131 |
132 | # Players tie
133 | player1.tied(player2)
134 |
135 | # Check new ratings
136 | self.assertAlmostEqual(player1.rating, p1_new_rating)
137 | self.assertAlmostEqual(player2.rating, p2_new_rating)
138 |
139 | def test_k_factor_effect(self):
140 | """Test that k_factor affects the rating change magnitude."""
141 | # With k_factor = 32
142 | player1 = EloCompetitor(initial_rating=400, k_factor=32)
143 | player2 = EloCompetitor(initial_rating=400, k_factor=32)
144 | player1.beat(player2)
145 | rating_change_32 = abs(player1.rating - 400)
146 |
147 | # With k_factor = 16
148 | player1 = EloCompetitor(initial_rating=400, k_factor=16)
149 | player2 = EloCompetitor(initial_rating=400, k_factor=16)
150 | player1.beat(player2)
151 | rating_change_16 = abs(player1.rating - 400)
152 |
153 | # The rating change with k_factor=32 should be twice the change with k_factor=16
154 | self.assertAlmostEqual(rating_change_32, 2 * rating_change_16)
155 |
156 |
157 | if __name__ == "__main__":
158 | unittest.main()
159 |
--------------------------------------------------------------------------------
/docs/source/contributing.rst:
--------------------------------------------------------------------------------
1 | Contributing to Elote
2 | =======================
3 |
4 | Thank you for your interest in contributing to Elote! This guide will help you get started with contributing to the project.
5 |
6 | Setting Up Your Development Environment
7 | ------------------------------------------
8 |
9 | 1. **Fork the Repository**
10 |
11 | Start by forking the Elote repository on GitHub.
12 |
13 | 2. **Clone Your Fork**
14 |
15 | .. code-block:: bash
16 |
17 | git clone https://github.com/your-username/elote.git
18 | cd elote
19 |
20 | 3. **Set Up Development Environment**
21 |
22 | .. code-block:: bash
23 |
24 | # Using Make (recommended)
25 | make install-dev
26 |
27 | # Or using pip
28 | pip install -e ".[dev]"
29 |
30 | # Or using uv
31 | uv pip install -e ".[dev]"
32 |
33 | 4. **Set Up Pre-commit Hooks**
34 |
35 | .. code-block:: bash
36 |
37 | pre-commit install
38 |
39 | Development Workflow
40 | ----------------------
41 |
42 | 1. **Create a Branch**
43 |
44 | Create a branch for your feature or bugfix:
45 |
46 | .. code-block:: bash
47 |
48 | git checkout -b feature/your-feature-name
49 | # or
50 | git checkout -b fix/your-bugfix-name
51 |
52 | 2. **Make Your Changes**
53 |
54 | Implement your changes, following the code style guidelines.
55 |
56 | 3. **Run Tests**
57 |
58 | Make sure your changes pass all tests:
59 |
60 | .. code-block:: bash
61 |
62 | # Run all tests
63 | make test
64 |
65 | # Run tests with coverage
66 | make test-cov
67 |
68 | 4. **Lint Your Code**
69 |
70 | Ensure your code follows the project's style guidelines:
71 |
72 | .. code-block:: bash
73 |
74 | # Check code style
75 | make lint
76 |
77 | # Auto-fix some linting issues
78 | make lint-fix
79 |
80 | # Format code
81 | make format
82 |
83 | 5. **Commit Your Changes**
84 |
85 | Follow the conventional commits format for your commit messages:
86 |
87 | .. code-block:: bash
88 |
89 | git commit -m "feat: add new feature"
90 | # or
91 | git commit -m "fix: resolve issue with X"
92 |
93 | 6. **Push Your Changes**
94 |
95 | Push your changes to your fork:
96 |
97 | .. code-block:: bash
98 |
99 | git push origin feature/your-feature-name
100 |
101 | 7. **Create a Pull Request**
102 |
103 | Open a pull request from your fork to the main Elote repository.
104 |
105 | Code Style Guidelines
106 | -----------------------
107 |
108 | Elote follows these code style guidelines:
109 |
110 | - Use PEP 8 for Python code style
111 | - Use docstrings for all public functions, classes, and methods
112 | - Write clear, descriptive variable and function names
113 | - Include type hints where appropriate
114 | - Keep functions focused on a single responsibility
115 | - Write unit tests for new functionality
116 |
117 | Adding a New Rating System
118 | ----------------------------
119 |
120 | To add a new rating system to Elote:
121 |
122 | 1. Create a new file in the ``elote/competitors`` directory
123 | 2. Implement a new class that inherits from ``BaseCompetitor``
124 | 3. Implement the required methods:
125 |    - ``expected_score(competitor)``
126 |    - ``update_rating(competitor, score)``
127 | 4. Add tests for your rating system in the ``tests`` directory
128 | 5. Update the documentation to include your new rating system
129 |
130 | Here's a template for a new rating system:
131 |
132 | .. code-block:: python
133 |
134 | from elote.competitors.base import BaseCompetitor
135 |
136 | class NewRatingCompetitor(BaseCompetitor):
137 | def __init__(self, initial_rating=1500, **kwargs):
138 | self.rating = initial_rating
139 | # Initialize other parameters
140 |
141 | def expected_score(self, competitor):
142 | """
143 | Calculate the expected score (probability of winning) against another competitor.
144 |
145 | Args:
146 | competitor: The opponent NewRatingCompetitor
147 |
148 | Returns:
149 | float: The probability of winning (between 0 and 1)
150 | """
151 | # Implement the expected score calculation
152 | pass
153 |
154 | def update_rating(self, competitor, score):
155 | """
156 | Update the rating based on a match result.
157 |
158 | Args:
159 | competitor: The opponent NewRatingCompetitor
160 | score: The actual score (1 for win, 0.5 for draw, 0 for loss)
161 | """
162 | # Implement the rating update logic
163 | pass
164 |
165 | Documentation
166 | ---------------
167 |
168 | When adding new features or making significant changes, please update the documentation:
169 |
170 | 1. Add or update docstrings for all public functions, classes, and methods
171 | 2. Update the relevant RST files in the ``docs/source`` directory
172 | 3. If adding a new rating system, create a new RST file in ``docs/source/rating_systems``
173 | 4. Build and check the documentation locally:
174 |
175 | .. code-block:: bash
176 |
177 | make docs
178 | # Open docs/build/html/index.html in your browser
179 |
180 | Testing
181 | ---------
182 |
183 | Elote uses pytest for testing. When adding new features:
184 |
185 | 1. Write unit tests for your new code
186 | 2. Ensure all existing tests pass
187 | 3. Aim for high test coverage
188 |
189 | .. code-block:: bash
190 |
191 | # Run tests
192 | make test
193 |
194 | # Run tests with coverage
195 | make test-cov
196 |
197 | Reporting Issues
198 | ------------------
199 |
200 | If you find a bug or have a feature request:
201 |
202 | 1. Check if the issue already exists in the GitHub issues
203 | 2. If not, create a new issue with:
204 | - A clear title and description
205 | - Steps to reproduce (for bugs)
206 | - Expected and actual behavior (for bugs)
207 | - Any relevant code snippets or error messages
208 |
209 | Pull Request Process
210 | ----------------------
211 |
212 | 1. Ensure your code passes all tests and linting checks
213 | 2. Update the documentation if needed
214 | 3. Add an entry to the CHANGELOG.md file
215 | 4. Submit your pull request with a clear description of the changes
216 | 5. Wait for review and address any feedback
217 |
218 | Code of Conduct
219 | -----------------
220 |
221 | Please note that Elote has a Code of Conduct. By participating in this project, you agree to abide by its terms.
222 |
223 | Thank You!
224 | ------------
225 |
226 | Your contributions are what make the open-source community such an amazing place to learn, inspire, and create. Any contribution you make is greatly appreciated!
--------------------------------------------------------------------------------
/tests/test_visualization.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import tempfile
4 | import matplotlib.pyplot as plt
5 | from elote.visualization import (
6 | plot_rating_system_comparison,
7 | plot_optimized_accuracy_comparison,
8 | plot_accuracy_by_prior_bouts,
9 | )
10 |
11 |
12 | class TestVisualization(unittest.TestCase):
13 | def setUp(self):
14 | """Set up test data for visualization functions."""
15 | # Sample results for rating system comparison
16 | self.results = [
17 | {
18 | "name": "System A",
19 | "accuracy": 0.75,
20 | "precision": 0.80,
21 | "recall": 0.70,
22 | "f1": 0.75,
23 | "optimized_accuracy": 0.78,
24 | },
25 | {
26 | "name": "System B",
27 | "accuracy": 0.65,
28 | "precision": 0.70,
29 | "recall": 0.60,
30 | "f1": 0.65,
31 | "optimized_accuracy": 0.68,
32 | },
33 | {
34 | "name": "System C",
35 | "accuracy": 0.85,
36 | "precision": 0.90,
37 | "recall": 0.80,
38 | "f1": 0.85,
39 | "optimized_accuracy": 0.88,
40 | },
41 | ]
42 |
43 | # Sample data for accuracy by prior bouts
44 | self.accuracy_by_prior_bouts = {
45 | "System A": {
46 | "binned": {
47 | 0: {"accuracy": 0.60, "total": 100, "min_bouts": 0, "max_bouts": 4},
48 | 1: {"accuracy": 0.70, "total": 100, "min_bouts": 5, "max_bouts": 9},
49 | 2: {"accuracy": 0.80, "total": 100, "min_bouts": 10, "max_bouts": 14},
50 | }
51 | },
52 | "System B": {
53 | "binned": {
54 | 0: {"accuracy": 0.55, "total": 100, "min_bouts": 0, "max_bouts": 4},
55 | 1: {"accuracy": 0.65, "total": 100, "min_bouts": 5, "max_bouts": 9},
56 | 2: {"accuracy": 0.75, "total": 100, "min_bouts": 10, "max_bouts": 14},
57 | }
58 | },
59 | }
60 |
61 | # Create a temporary directory for saving plots
62 | self.temp_dir = tempfile.mkdtemp()
63 |
64 | def tearDown(self):
65 | """Clean up after tests."""
66 | # Close all matplotlib figures
67 | plt.close("all")
68 |
69 | # Remove temporary files
70 | for filename in os.listdir(self.temp_dir):
71 | os.remove(os.path.join(self.temp_dir, filename))
72 | os.rmdir(self.temp_dir)
73 |
74 | def test_plot_rating_system_comparison(self):
75 | """Test that plot_rating_system_comparison creates a figure."""
76 | # Test without saving
77 | fig = plot_rating_system_comparison(self.results)
78 | self.assertIsNotNone(fig)
79 | self.assertEqual(len(fig.axes), 4) # 2x2 grid of subplots
80 |
81 | # Test with saving
82 | save_path = os.path.join(self.temp_dir, "rating_comparison.png")
83 | fig = plot_rating_system_comparison(self.results, save_path=save_path)
84 | self.assertTrue(os.path.exists(save_path))
85 |
86 | # Test with custom figsize and title
87 | fig = plot_rating_system_comparison(self.results, figsize=(10, 8), title="Custom Title")
88 | self.assertEqual(fig.get_figwidth(), 10)
89 | self.assertEqual(fig.get_figheight(), 8)
90 | self.assertEqual(fig._suptitle.get_text(), "Custom Title")
91 |
92 | def test_plot_optimized_accuracy_comparison(self):
93 | """Test that plot_optimized_accuracy_comparison creates a figure."""
94 | # Test without saving
95 | fig = plot_optimized_accuracy_comparison(self.results)
96 | self.assertIsNotNone(fig)
97 |
98 | # Test with saving
99 | save_path = os.path.join(self.temp_dir, "optimized_accuracy.png")
100 | fig = plot_optimized_accuracy_comparison(self.results, save_path=save_path)
101 | self.assertTrue(os.path.exists(save_path))
102 |
103 | # Test with custom figsize and title
104 | fig = plot_optimized_accuracy_comparison(self.results, figsize=(8, 6), title="Custom Title")
105 | self.assertEqual(fig.get_figwidth(), 8)
106 | self.assertEqual(fig.get_figheight(), 6)
107 | self.assertEqual(fig.axes[0].get_title(), "Custom Title")
108 |
109 | def test_plot_accuracy_by_prior_bouts_with_binned_data(self):
110 | """Test that plot_accuracy_by_prior_bouts works with binned data."""
111 | # Test without saving
112 | fig = plot_accuracy_by_prior_bouts(self.accuracy_by_prior_bouts)
113 | self.assertIsNotNone(fig)
114 |
115 | # Test with saving
116 | save_path = os.path.join(self.temp_dir, "accuracy_by_bouts.png")
117 | fig = plot_accuracy_by_prior_bouts(self.accuracy_by_prior_bouts, save_path=save_path)
118 | self.assertTrue(os.path.exists(save_path))
119 |
120 | # Test with custom parameters
121 | fig = plot_accuracy_by_prior_bouts(self.accuracy_by_prior_bouts, figsize=(12, 8), title="Custom Title")
122 | self.assertEqual(fig.get_figwidth(), 12)
123 | self.assertEqual(fig.get_figheight(), 8)
124 | self.assertEqual(fig.axes[0].get_title(), "Custom Title")
125 |
126 | def test_plot_accuracy_by_prior_bouts_with_empty_data(self):
127 | """Test that plot_accuracy_by_prior_bouts handles empty data gracefully."""
128 | # Empty data
129 | empty_data = {}
130 | fig = plot_accuracy_by_prior_bouts(empty_data)
131 | self.assertIsNotNone(fig)
132 |
133 | # Data with empty bins
134 | data_with_empty_bins = {"System A": {"binned": {}}}
135 | fig = plot_accuracy_by_prior_bouts(data_with_empty_bins)
136 | self.assertIsNotNone(fig)
137 |
138 | def test_plot_functions_with_invalid_data(self):
139 | """Test that visualization functions handle invalid data gracefully."""
140 | # Invalid results for rating system comparison
141 | invalid_results = [
142 | {"name": "System A"} # Missing metrics
143 | ]
144 |
145 | # Should not raise an error, but might not plot anything
146 | fig = plot_rating_system_comparison(invalid_results)
147 | self.assertIsNotNone(fig)
148 |
149 | # Invalid results for optimized accuracy comparison
150 | fig = plot_optimized_accuracy_comparison(invalid_results)
151 | self.assertIsNotNone(fig)
152 |
153 | # Invalid data for accuracy by prior bouts
154 | invalid_bout_data = {
155 | "System A": {} # Missing binned data
156 | }
157 | fig = plot_accuracy_by_prior_bouts(invalid_bout_data)
158 | self.assertIsNotNone(fig)
159 |
160 |
161 | if __name__ == "__main__":
162 | unittest.main()
163 |
--------------------------------------------------------------------------------
/elote/datasets/synthetic.py:
--------------------------------------------------------------------------------
1 | """
2 | Synthetic data generator for elote.
3 |
4 | This module provides a synthetic data generator for testing and evaluating different rating algorithms.
5 | """
6 |
7 | import datetime
8 | import random
9 | import numpy as np
10 | from typing import List, Tuple, Dict, Any, Optional
11 |
12 | from elote.datasets.base import BaseDataset
13 |
14 |
15 | class SyntheticDataset(BaseDataset):
16 | """
17 | Synthetic data generator for testing and evaluating different rating algorithms.
18 |
19 | This dataset generates random matchups between competitors with configurable parameters.
20 | The outcome of each matchup is determined by the true skill of each competitor plus some noise.
21 | """
22 |
23 | def __init__(
24 | self,
25 | num_competitors: int = 100,
26 | num_matchups: int = 1000,
27 | skill_distribution: str = "normal",
28 | skill_mean: float = 1500,
29 | skill_std: float = 300,
30 | noise_std: float = 100,
31 | draw_probability: float = 0.1,
32 | time_span_days: int = 365,
33 | seed: Optional[int] = None,
34 | cache_dir: Optional[str] = None,
35 | max_memory_mb: int = 1024,
36 | ):
37 | """
38 | Initialize a synthetic dataset generator.
39 |
40 | Args:
41 | num_competitors: Number of competitors to generate
42 | num_matchups: Number of matchups to generate
43 | skill_distribution: Distribution of true skills ("normal", "uniform", or "pareto")
44 | skill_mean: Mean of the skill distribution (for normal distribution)
45 | skill_std: Standard deviation of the skill distribution (for normal distribution)
46 | noise_std: Standard deviation of the noise added to skills during matchups
47 | draw_probability: Probability of a draw when competitors are closely matched
48 | time_span_days: Number of days to spread the matchups over
49 | seed: Random seed for reproducibility
50 | cache_dir: Directory to cache data (not used for synthetic data)
51 | max_memory_mb: Maximum memory usage in MB for dataset operations
52 | """
53 | super().__init__(cache_dir=cache_dir, max_memory_mb=max_memory_mb)
54 | self.num_competitors = num_competitors
55 | self.num_matchups = num_matchups
56 | self.skill_distribution = skill_distribution
57 | self.skill_mean = skill_mean
58 | self.skill_std = skill_std
59 | self.noise_std = noise_std
60 | self.draw_probability = draw_probability
61 | self.time_span_days = time_span_days
62 | self.seed = seed
63 |
64 | # Set random seed if provided
65 | if seed is not None:
66 | random.seed(seed)
67 | np.random.seed(seed)
68 |
69 | def download(self) -> None:
70 | """
71 | No download needed for synthetic data.
72 | """
73 | pass
74 |
75 | def _generate_skills(self) -> Dict[str, float]:
76 | """
77 | Generate true skills for all competitors.
78 |
79 | Returns:
80 | Dictionary mapping competitor IDs to their true skills
81 | """
82 | skills = {}
83 |
84 | for i in range(self.num_competitors):
85 | competitor_id = f"competitor_{i}"
86 |
87 | if self.skill_distribution == "normal":
88 | skill = np.random.normal(self.skill_mean, self.skill_std)
89 | elif self.skill_distribution == "uniform":
90 | skill = np.random.uniform(
91 | self.skill_mean - self.skill_std * 1.73, # Matching variance of normal
92 | self.skill_mean + self.skill_std * 1.73,
93 | )
94 | elif self.skill_distribution == "pareto":
95 |                 # Pareto distribution gives a more realistic skill profile, with a few very highly skilled competitors
96 | skill = np.random.pareto(3) * self.skill_std + self.skill_mean - self.skill_std
97 | else:
98 | raise ValueError(f"Unknown skill distribution: {self.skill_distribution}")
99 |
100 | skills[competitor_id] = skill
101 |
102 | return skills
103 |
104 | def _generate_matchups(
105 | self, skills: Dict[str, float]
106 | ) -> List[Tuple[str, str, float, datetime.datetime, Dict[str, Any]]]:
107 | """
108 | Generate random matchups between competitors.
109 |
110 | Args:
111 | skills: Dictionary mapping competitor IDs to their true skills
112 |
113 | Returns:
114 | List of matchup tuples (competitor_a, competitor_b, outcome, timestamp, attributes)
115 | """
116 | matchups = []
117 | competitors = list(skills.keys())
118 |
119 | # Generate timestamps spanning the time_span_days
120 | start_date = datetime.datetime.now() - datetime.timedelta(days=self.time_span_days)
121 |
122 | for _i in range(self.num_matchups):
123 | # Select two random competitors
124 | a, b = random.sample(competitors, 2)
125 |
126 | # Generate a timestamp
127 | days_offset = random.uniform(0, self.time_span_days)
128 | timestamp = start_date + datetime.timedelta(days=days_offset)
129 |
130 | # Determine the outcome based on true skills plus noise
131 | skill_a = skills[a] + np.random.normal(0, self.noise_std)
132 | skill_b = skills[b] + np.random.normal(0, self.noise_std)
133 |
134 |             # Calculate the skill difference between the two competitors
135 | skill_diff = skill_a - skill_b
136 |
137 | # Determine if it's a draw
138 | if abs(skill_diff) < self.noise_std and random.random() < self.draw_probability:
139 | outcome = 0.5 # Draw
140 | else:
141 | outcome = 1.0 if skill_diff > 0 else 0.0
142 |
143 | # Add attributes with true skills for evaluation
144 | attributes = {
145 | "true_skill_a": skills[a],
146 | "true_skill_b": skills[b],
147 | "skill_diff": skill_diff,
148 | }
149 |
150 | matchups.append((a, b, outcome, timestamp, attributes))
151 |
152 | # Sort by timestamp
153 | matchups.sort(key=lambda x: x[3])
154 |
155 | return matchups
156 |
157 | def load(self) -> List[Tuple[str, str, float, datetime.datetime, Dict[str, Any]]]:
158 | """
159 | Generate and load the synthetic dataset.
160 |
161 | Returns:
162 | List of matchup tuples (competitor_a, competitor_b, outcome, timestamp, attributes)
163 | """
164 | # Generate true skills for all competitors
165 | skills = self._generate_skills()
166 |
167 | # Generate random matchups
168 | matchups = self._generate_matchups(skills)
169 |
170 | return matchups
171 |
--------------------------------------------------------------------------------
/tests/test_Arenas.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from elote import LambdaArena, EloCompetitor
3 |
4 |
5 | class TestArenas(unittest.TestCase):
6 | def test_lambda_arena_initialization(self):
7 | """Test that the LambdaArena initializes correctly with different parameters."""
8 | # Test with default parameters
9 | arena = LambdaArena(lambda a, b: a > b)
10 | self.assertEqual(len(arena.competitors), 0)
11 | self.assertEqual(arena.base_competitor, EloCompetitor)
12 | self.assertEqual(arena.base_competitor_kwargs, {})
13 |
14 | # Test with custom competitor class parameters
15 | arena = LambdaArena(lambda a, b: a > b, base_competitor_kwargs={"initial_rating": 1000})
16 | self.assertEqual(arena.base_competitor_kwargs, {"initial_rating": 1000})
17 |
18 | # Test with initial state
19 | initial_state = {"A": {"initial_rating": 1200}, "B": {"initial_rating": 800}}
20 | arena = LambdaArena(lambda a, b: a > b, initial_state=initial_state)
21 | self.assertEqual(len(arena.competitors), 2)
22 | self.assertIn("A", arena.competitors)
23 | self.assertIn("B", arena.competitors)
24 | self.assertEqual(arena.competitors["A"].rating, 1200)
25 | self.assertEqual(arena.competitors["B"].rating, 800)
26 |
27 | def test_lambda_arena_matchup(self):
28 | """Test that matchups work correctly in the LambdaArena."""
29 |
30 | # Create a simple comparison function
31 | def compare(a, b, attributes=None):
32 | if attributes and "force_winner" in attributes:
33 | return attributes["force_winner"] == a
34 | return a > b
35 |
36 | arena = LambdaArena(compare)
37 |
38 | # Test a simple matchup where a > b
39 | arena.matchup(10, 5)
40 | self.assertEqual(len(arena.competitors), 2)
41 | self.assertIn(10, arena.competitors)
42 | self.assertIn(5, arena.competitors)
43 |
44 | # The winner's rating should be higher than the initial rating
45 | initial_rating = EloCompetitor().rating
46 | self.assertGreater(arena.competitors[10].rating, initial_rating)
47 |
48 | # Test a matchup with attributes
49 | arena.matchup("X", "Y", attributes={"force_winner": "Y"})
50 | self.assertEqual(len(arena.competitors), 4)
51 | self.assertIn("X", arena.competitors)
52 | self.assertIn("Y", arena.competitors)
53 |
54 |         # Y should have won because the force_winner attribute points to it
55 | self.assertGreater(arena.competitors["Y"].rating, initial_rating)
56 |
57 | def test_lambda_arena_tournament(self):
58 | """Test that tournaments work correctly in the LambdaArena."""
59 | arena = LambdaArena(lambda a, b: a > b)
60 |
61 | # Create a tournament with multiple matchups
62 | matchups = [(10, 5), (15, 8), (5, 3), (8, 10)]
63 |
64 | arena.tournament(matchups)
65 |
66 | # Check that all competitors are in the arena
67 | self.assertEqual(len(arena.competitors), 5)
68 | for competitor in [3, 5, 8, 10, 15]:
69 | self.assertIn(competitor, arena.competitors)
70 |
71 | # Check that the history has recorded all bouts
72 | self.assertEqual(len(arena.history.bouts), 4)
73 |
74 | def test_lambda_arena_expected_score(self):
75 | """Test that expected scores are calculated correctly."""
76 | arena = LambdaArena(lambda a, b: a > b)
77 |
78 | # Add some competitors with different ratings
79 | initial_state = {"A": {"initial_rating": 1200}, "B": {"initial_rating": 800}}
80 | arena = LambdaArena(lambda a, b: a > b, initial_state=initial_state)
81 |
82 | # A should have a higher expected score against B
83 | self.assertGreater(arena.expected_score("A", "B"), 0.5)
84 | self.assertLess(arena.expected_score("B", "A"), 0.5)
85 |
86 | # Test with new competitors that aren't in the arena yet
87 | score_c_d = arena.expected_score("C", "D")
88 | self.assertAlmostEqual(score_c_d, 0.5, places=2)
89 |
90 | # Now they should be in the arena with default ratings
91 | self.assertIn("C", arena.competitors)
92 | self.assertIn("D", arena.competitors)
93 |
94 | def test_lambda_arena_export_state(self):
95 | """Test that the arena state can be exported correctly."""
96 | arena = LambdaArena(lambda a, b: a > b)
97 |
98 | # Add some competitors and run some matchups
99 | arena.matchup("A", "B")
100 | arena.matchup("B", "C")
101 | arena.matchup("A", "C")
102 |
103 | # Export the state
104 | state = arena.export_state()
105 |
106 | # Check that all competitors are in the state
107 | self.assertEqual(len(state), 3)
108 | self.assertIn("A", state)
109 | self.assertIn("B", state)
110 | self.assertIn("C", state)
111 |
112 | # Check that the state contains the correct information
113 | for competitor in ["A", "B", "C"]:
114 | self.assertIn("initial_rating", state[competitor])
115 | self.assertIn("class_vars", state[competitor])
116 |
117 | def test_lambda_arena_leaderboard(self):
118 | """Test that the leaderboard is generated correctly."""
119 | arena = LambdaArena(lambda a, b: a > b)
120 |
121 | # Add some competitors with different ratings
122 | initial_state = {
123 | "A": {"initial_rating": 1200},
124 | "B": {"initial_rating": 1000},
125 | "C": {"initial_rating": 800},
126 | }
127 | arena = LambdaArena(lambda a, b: a > b, initial_state=initial_state)
128 |
129 | # Get the leaderboard
130 | leaderboard = arena.leaderboard()
131 |
132 | # Check that the leaderboard is sorted by rating (ascending)
133 | self.assertEqual(len(leaderboard), 3)
134 | self.assertEqual(leaderboard[0]["competitor"], "C")
135 | self.assertEqual(leaderboard[1]["competitor"], "B")
136 | self.assertEqual(leaderboard[2]["competitor"], "A")
137 |
138 | def test_lambda_arena_clear_history(self):
139 | """Test that the history can be cleared."""
140 | arena = LambdaArena(lambda a, b: a > b)
141 |
142 | # Add some matchups
143 | arena.matchup("A", "B")
144 | arena.matchup("B", "C")
145 |
146 | # Check that the history has recorded the bouts
147 | self.assertEqual(len(arena.history.bouts), 2)
148 |
149 | # Clear the history
150 | arena.clear_history()
151 |
152 | # Check that the history is empty
153 | self.assertEqual(len(arena.history.bouts), 0)
154 |
155 | def test_lambda_arena_set_competitor_class_var(self):
156 | """Test that competitor class variables can be set."""
157 | arena = LambdaArena(lambda a, b: a > b)
158 |
159 | # Set a class variable
160 | arena.set_competitor_class_var("_k_factor", 16)
161 |
162 | # Check that the class variable was set
163 | self.assertEqual(EloCompetitor._k_factor, 16)
164 |
165 | # Reset the class variable for other tests
166 | arena.set_competitor_class_var("_k_factor", 32)
167 |
--------------------------------------------------------------------------------
/tests/test_GlickoCompetitor_known_values.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from elote import GlickoCompetitor
3 | import math
4 | from datetime import datetime, timedelta
5 |
6 |
7 | class TestGlickoKnownValues(unittest.TestCase):
8 | """Tests for GlickoCompetitor with known values to verify correctness."""
9 |
10 | def test_initial_rating(self):
11 | """Test that initial rating and RD are set correctly."""
12 | player = GlickoCompetitor(initial_rating=1500, initial_rd=350)
13 | self.assertEqual(player.rating, 1500)
14 | self.assertEqual(player.rd, 350)
15 |
16 | player = GlickoCompetitor(initial_rating=2000, initial_rd=200)
17 | self.assertEqual(player.rating, 2000)
18 | self.assertEqual(player.rd, 200)
19 |
20 | def test_transformed_rd(self):
21 | """Test that transformed RD is calculated correctly."""
22 | player = GlickoCompetitor(initial_rating=1500, initial_rd=300)
23 | expected_rd = min([350, math.sqrt(300**2 + 34.6**2)])
24 | self.assertAlmostEqual(player.tranformed_rd, expected_rd)
25 |
26 | def test_g_function(self):
27 | """Test the g function with known values."""
28 | player = GlickoCompetitor(initial_rating=1500, initial_rd=300)
29 | g = player._g(300)
30 | expected_g = 1 / math.sqrt(1 + 3 * (0.0057565**2) * (300**2) / math.pi**2)
31 | self.assertAlmostEqual(g, expected_g)
32 |
33 | def test_expected_score(self):
34 | """Test expected_score with known values."""
35 | player1 = GlickoCompetitor(initial_rating=1500, initial_rd=300)
36 | player2 = GlickoCompetitor(initial_rating=1700, initial_rd=300)
37 |
38 | # Calculate expected score manually
39 | g = player1._g(300**2) # Use rd squared as per the implementation
40 | E = 1 / (1 + 10 ** ((-g * (1500 - 1700)) / 400))
41 | self.assertAlmostEqual(player1.expected_score(player2), E)
42 |
43 | def test_beat_with_known_values(self):
44 | """Test beat method with known values."""
45 | initial_time = datetime(2020, 1, 1)
46 | match_time = datetime(2020, 1, 10) # 10 days later
47 |
48 | player1 = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time)
49 | player2 = GlickoCompetitor(initial_rating=1700, initial_rd=50, initial_time=initial_time)
50 |
51 | # Store initial ratings
52 | initial_rating1 = player1.rating
53 | initial_rating2 = player2.rating
54 |
55 | # Perform the match
56 | player1.beat(player2, match_time=match_time)
57 |
58 | # Check that ratings changed in the expected direction
59 | self.assertGreater(player1.rating, initial_rating1) # Winner's rating should increase
60 | self.assertLess(player2.rating, initial_rating2) # Loser's rating should decrease
61 |
62 | # Check that RDs decreased (more certainty after a match)
63 | self.assertLess(player1.rd, 350)
64 | self.assertLess(player2.rd, 350)
65 |
66 | def test_tied_with_known_values(self):
67 | """Test tied method with known values."""
68 | initial_time = datetime(2020, 1, 1)
69 | match_time = datetime(2020, 1, 10) # 10 days later
70 |
71 | player1 = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time)
72 | player2 = GlickoCompetitor(initial_rating=1700, initial_rd=50, initial_time=initial_time)
73 |
74 | # Store initial ratings
75 | initial_rating1 = player1.rating
76 | initial_rating2 = player2.rating
77 |
78 | # Perform the match
79 | player1.tied(player2, match_time=match_time)
80 |
81 | # Check that ratings changed in the expected direction
82 | self.assertGreater(player1.rating, initial_rating1) # Lower-rated player should gain rating
83 | self.assertLess(player2.rating, initial_rating2) # Higher-rated player should lose rating
84 |
85 | # Check that RDs decreased (more certainty after a match)
86 | self.assertLess(player1.rd, 350)
87 | self.assertLess(player2.rd, 350)
88 |
89 | def test_rd_effect(self):
90 | """Test that RD affects the rating change magnitude."""
91 | initial_time = datetime(2020, 1, 1)
92 | match_time = initial_time + timedelta(days=2) # Match happens 2 days after initialization
93 |
94 | # With high RD (more uncertainty)
95 | player1 = GlickoCompetitor(initial_rating=1500, initial_rd=350, initial_time=initial_time)
96 | player2 = GlickoCompetitor(initial_rating=1700, initial_rd=50, initial_time=initial_time)
97 | player1.beat(player2, match_time=match_time)
98 | rating_change_high_rd = abs(player1.rating - 1500)
99 |
100 | # Reset with lower RD
101 | player1 = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time)
102 | player2 = GlickoCompetitor(initial_rating=1700, initial_rd=50, initial_time=initial_time)
103 | player1.beat(player2, match_time=match_time)
104 | rating_change_low_rd = abs(player1.rating - 1500)
105 |
106 | # The rating change with higher RD should be greater
107 | self.assertGreater(rating_change_high_rd, rating_change_low_rd)
108 |
109 | def test_rd_increase_over_time(self):
110 | """Test that RD increases over time."""
111 | initial_time = datetime(2020, 1, 1)
112 | player = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time)
113 |
114 | # Test that RD increases over time
115 | current_time = initial_time + timedelta(days=1)
116 | initial_rd = player.rd
117 | player.update_rd_for_inactivity(current_time)
118 | self.assertGreater(player.rd, initial_rd)
119 |
120 | # Test that RD increases more over longer periods
121 | player = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time)
122 | current_time = initial_time + timedelta(days=10)
123 | player.update_rd_for_inactivity(current_time)
124 | self.assertGreater(player.rd, initial_rd)
125 |
126 | # Test that RD is capped at 350
127 | player = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time)
128 | current_time = initial_time + timedelta(days=1000) # Very long time
129 | player.update_rd_for_inactivity(current_time)
130 | self.assertLessEqual(player.rd, 350)
131 |
132 | def test_fractional_rating_periods(self):
133 | """Test RD increase with fractional rating periods."""
134 | initial_time = datetime(2020, 1, 1)
135 | player = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time)
136 |
137 | # Test that RD increases for half a period
138 | current_time = initial_time + timedelta(hours=12)
139 | initial_rd = player.rd
140 | player.update_rd_for_inactivity(current_time)
141 | self.assertGreater(player.rd, initial_rd)
142 |
143 | # Test that RD increases more for 1.5 periods than 0.5 periods
144 | player = GlickoCompetitor(initial_rating=1500, initial_rd=50, initial_time=initial_time)
145 | current_time = initial_time + timedelta(hours=36)
146 | player.update_rd_for_inactivity(current_time)
147 | self.assertGreater(player.rd, initial_rd)
148 |
149 |
150 | if __name__ == "__main__":
151 | unittest.main()
152 |
--------------------------------------------------------------------------------
/elote/datasets/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility functions for using datasets with arenas.
3 |
4 | This module provides utility functions for using datasets with arenas for evaluating different rating algorithms.
5 | """
6 |
7 | from typing import Any, Callable, Dict, List, Optional, Tuple
8 | import datetime
9 |
10 | from elote.arenas.base import BaseArena, Bout, History
11 | from elote.datasets.base import DataSplit
12 |
13 |
14 | def train_arena_with_dataset(
15 | arena: BaseArena,
16 | train_data: List[Tuple[Any, Any, float, Optional[datetime.datetime], Optional[Dict[str, Any]]]],
17 | batch_size: Optional[int] = None,
18 | progress_callback: Optional[Callable[[int, int], None]] = None,
19 | ) -> BaseArena:
20 | """
21 | Train an arena with a dataset.
22 |
23 | Args:
24 | arena: The arena to train
25 | train_data: List of matchup tuples (competitor_a, competitor_b, outcome, timestamp, attributes)
26 | batch_size: Number of matchups to process in each batch (for progress reporting)
27 | progress_callback: Callback function for reporting progress (current, total)
28 |
29 | Returns:
30 | The trained arena
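
    Example (illustrative only; the trivial lambda and hand-made matchups are
    assumptions, not part of this module). Because this helper already passes the
    winner as the first argument for decisive outcomes, an always-True comparison
    function records the dataset outcomes directly:

        from elote import LambdaArena

        arena = LambdaArena(lambda a, b: True)
        train_data = [
            ("A", "B", 1.0, None, None),  # A beat B
            ("C", "A", 0.0, None, None),  # A beat C
        ]
        arena = train_arena_with_dataset(arena, train_data)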
31 | """
32 | # Sort by timestamp if available
33 | train_data_with_time = [(a, b, outcome, ts, attrs) for a, b, outcome, ts, attrs in train_data if ts is not None]
34 | train_data_without_time = [(a, b, outcome, ts, attrs) for a, b, outcome, ts, attrs in train_data if ts is None]
35 |
36 | if train_data_with_time:
37 | # Sort by timestamp
38 | train_data_with_time.sort(key=lambda x: x[3])
39 | # Combine sorted data with data without timestamps
40 | sorted_data = train_data_with_time + train_data_without_time
41 | else:
42 | sorted_data = train_data
43 |
44 | # Process in batches if requested
45 | if batch_size is None:
46 | batch_size = len(sorted_data)
47 |
48 | total_batches = (len(sorted_data) + batch_size - 1) // batch_size
49 |
50 | for batch_idx in range(total_batches):
51 | start_idx = batch_idx * batch_size
52 | end_idx = min(start_idx + batch_size, len(sorted_data))
53 | batch = sorted_data[start_idx:end_idx]
54 |
55 | # Process each matchup
56 | for a, b, outcome, _, attributes in batch:
57 | if outcome == 1.0:
58 | # A wins
59 | arena.matchup(a, b, attributes=attributes)
60 | elif outcome == 0.0:
61 | # B wins
62 | arena.matchup(b, a, attributes=attributes)
63 | else:
64 | # Draw - we need to handle this specially
65 | # First, get the competitors
66 | if a not in arena.competitors:
67 | arena.competitors[a] = arena.base_competitor(**arena.base_competitor_kwargs)
68 | if b not in arena.competitors:
69 | arena.competitors[b] = arena.base_competitor(**arena.base_competitor_kwargs)
70 |
71 | # Then, record the draw
72 | arena.competitors[a].tied(arena.competitors[b])
73 |
74 | # Report progress
75 | if progress_callback is not None:
76 | progress_callback(end_idx, len(sorted_data))
77 |
78 | return arena
79 |
80 |
81 | def evaluate_arena_with_dataset(
82 | arena: BaseArena,
83 | test_data: List[Tuple[Any, Any, float, Optional[datetime.datetime], Optional[Dict[str, Any]]]],
84 | batch_size: Optional[int] = None,
85 | progress_callback: Optional[Callable[[int, int], None]] = None,
86 | ) -> History:
87 | """
88 | Evaluate an arena with a dataset.
89 |
90 | Args:
91 | arena: The arena to evaluate
92 | test_data: List of matchup tuples (competitor_a, competitor_b, outcome, timestamp, attributes)
93 | batch_size: Number of matchups to process in each batch (for progress reporting)
94 | progress_callback: Callback function for reporting progress (current, total)
95 |
96 | Returns:
97 | History object containing the evaluation results
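
    Example (a sketch; assumes ``arena`` was already trained, e.g. with
    train_arena_with_dataset, on the same competitor ids that appear in the
    test data):

        history = evaluate_arena_with_dataset(arena, test_data)
        metrics = history.calculate_metrics()
        print(metrics["accuracy"])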
98 | """
99 | # Create a new history object
100 | history = History()
101 |
102 | # Sort by timestamp if available
103 | test_data_with_time = [(a, b, outcome, ts, attrs) for a, b, outcome, ts, attrs in test_data if ts is not None]
104 | test_data_without_time = [(a, b, outcome, ts, attrs) for a, b, outcome, ts, attrs in test_data if ts is None]
105 |
106 | if test_data_with_time:
107 | # Sort by timestamp
108 | test_data_with_time.sort(key=lambda x: x[3])
109 | # Combine sorted data with data without timestamps
110 | sorted_data = test_data_with_time + test_data_without_time
111 | else:
112 | sorted_data = test_data
113 |
114 | # Process in batches if requested
115 | if batch_size is None:
116 | batch_size = len(sorted_data)
117 |
118 | total_batches = (len(sorted_data) + batch_size - 1) // batch_size
119 |
120 | for batch_idx in range(total_batches):
121 | start_idx = batch_idx * batch_size
122 | end_idx = min(start_idx + batch_size, len(sorted_data))
123 | batch = sorted_data[start_idx:end_idx]
124 |
125 | # Process each matchup
126 | for a, b, outcome, _, attributes in batch:
127 | # Skip if either competitor is not in the arena
128 | if a not in arena.competitors or b not in arena.competitors:
129 | continue
130 |
131 | # Get the expected outcome
132 | expected_score = arena.expected_score(a, b)
133 |
134 | # Create a bout object
135 | bout = Bout(a, b, expected_score, outcome, attributes)
136 |
137 | # Add to history
138 | history.add_bout(bout)
139 |
140 | # Report progress
141 | if progress_callback is not None:
142 | progress_callback(end_idx, len(sorted_data))
143 |
144 | return history
145 |
146 |
147 | def train_and_evaluate_arena(
148 | arena: BaseArena,
149 | data_split: DataSplit,
150 | batch_size: Optional[int] = None,
151 | progress_callback: Optional[Callable[[str, int, int], None]] = None,
152 | ) -> Tuple[BaseArena, History]:
153 | """
154 | Train and evaluate an arena with a dataset split.
155 |
156 | Args:
157 | arena: The arena to train and evaluate
158 | data_split: DataSplit object containing train and test sets
159 | batch_size: Number of matchups to process in each batch (for progress reporting)
160 | progress_callback: Callback function for reporting progress (phase, current, total)
161 |
162 | Returns:
163 | Tuple of (trained_arena, history)
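
    Example (illustrative; ``arena`` and ``data_split`` are assumed to be built
    elsewhere, e.g. an Elo-based LambdaArena and a split produced by a dataset
    object):

        trained_arena, history = train_and_evaluate_arena(arena, data_split)
        print(history.calculate_metrics()["accuracy"])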
164 | """
165 | # Train the arena
166 | if progress_callback:
167 |
168 | def train_progress(current: int, total: int) -> None:
169 | return progress_callback("train", current, total)
170 | else:
171 | train_progress = None
172 |
173 | trained_arena = train_arena_with_dataset(
174 | arena, data_split.train, batch_size=batch_size, progress_callback=train_progress
175 | )
176 |
177 | # Evaluate the arena
178 | if progress_callback:
179 |
180 | def eval_progress(current: int, total: int) -> None:
181 | return progress_callback("eval", current, total)
182 | else:
183 | eval_progress = None
184 |
185 | history = evaluate_arena_with_dataset(
186 | trained_arena, data_split.test, batch_size=batch_size, progress_callback=eval_progress
187 | )
188 |
189 | return trained_arena, history
190 |
--------------------------------------------------------------------------------
/tests/test_ColleyMatrixCompetitor_known_values.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import numpy as np
3 | from elote import ColleyMatrixCompetitor
4 |
5 |
6 | class TestColleyMatrixKnownValues(unittest.TestCase):
7 | """Tests for ColleyMatrixCompetitor with known values to verify correctness."""
8 |
9 | def test_initial_rating(self):
10 | """Test that initial rating is set correctly."""
11 | player = ColleyMatrixCompetitor(initial_rating=0.5)
12 | self.assertEqual(player.rating, 0.5)
13 |
14 | player = ColleyMatrixCompetitor(initial_rating=0.7)
15 | self.assertEqual(player.rating, 0.7)
16 |
17 | def test_expected_score(self):
18 | """Test expected_score with known values."""
19 | player1 = ColleyMatrixCompetitor(initial_rating=0.5)
20 | player2 = ColleyMatrixCompetitor(initial_rating=0.5)
21 |
22 | # Equal ratings should give 0.5 expected score
23 | self.assertEqual(player1.expected_score(player2), 0.5)
24 |
25 | # Test with different ratings
26 | player1 = ColleyMatrixCompetitor(initial_rating=0.7)
27 | player2 = ColleyMatrixCompetitor(initial_rating=0.3)
28 |
29 | # Calculate expected values using the logistic function in our implementation
30 | rating_diff = player1.rating - player2.rating # 0.7 - 0.3 = 0.4
31 | expected = 1 / (1 + np.exp(-4 * rating_diff)) # 1 / (1 + exp(-1.6))
32 |
33 | self.assertAlmostEqual(player1.expected_score(player2), expected)
34 | self.assertAlmostEqual(player2.expected_score(player1), 1 - expected)
35 |
36 | def test_simple_colley_matrix(self):
37 | """Test a simple Colley Matrix calculation with known values."""
38 | # Create two competitors
39 | player1 = ColleyMatrixCompetitor(initial_rating=0.5)
40 | player2 = ColleyMatrixCompetitor(initial_rating=0.5)
41 |
42 | # Player 1 plays 3 games:
43 | # - Wins 2 games against player 2
44 | # - Loses 1 game to player 2
45 | player1.beat(player2)
46 | player1.beat(player2)
47 | player2.beat(player1)
48 |
49 | # Get the actual ratings calculated by the implementation
50 | actual_player1_rating = player1.rating
51 | actual_player2_rating = player2.rating
52 |
53 | # Verify that player1 has a higher rating than player2 (since player1 won more games)
54 | self.assertGreater(player1.rating, player2.rating)
55 |
56 | # The sum of ratings should be n/2 = 1
57 | self.assertAlmostEqual(player1.rating + player2.rating, 1.0)
58 |
59 | # Verify that the ratings are consistent with the implementation
60 | self.assertAlmostEqual(player1.rating, actual_player1_rating, places=5)
61 | self.assertAlmostEqual(player2.rating, actual_player2_rating, places=5)
62 |
63 | def test_three_player_system(self):
64 | """Test a three-player Colley Matrix calculation with known values."""
65 | # Create three competitors
66 | player1 = ColleyMatrixCompetitor(initial_rating=0.5)
67 | player2 = ColleyMatrixCompetitor(initial_rating=0.5)
68 | player3 = ColleyMatrixCompetitor(initial_rating=0.5)
69 |
70 | # Create a simple match history:
71 | # - Player 1 beats Player 2 twice
72 | # - Player 2 beats Player 3 twice
73 | # - Player 3 beats Player 1 once
74 | player1.beat(player2)
75 | player1.beat(player2)
76 | player2.beat(player3)
77 | player2.beat(player3)
78 | player3.beat(player1)
79 |
80 | # Get the actual ratings calculated by the implementation
81 | actual_player1_rating = player1.rating
82 | actual_player2_rating = player2.rating
83 | actual_player3_rating = player3.rating
84 |
85 | # The ratings should sum to n/2 = 1.5
86 | self.assertAlmostEqual(player1.rating + player2.rating + player3.rating, 1.5)
87 |
88 | # Player 1 should have a higher rating than Player 3
89 | self.assertGreater(player1.rating, player3.rating)
90 |
91 | # Verify that the ratings are consistent with the implementation
92 | self.assertAlmostEqual(player1.rating, actual_player1_rating, places=5)
93 | self.assertAlmostEqual(player2.rating, actual_player2_rating, places=5)
94 | self.assertAlmostEqual(player3.rating, actual_player3_rating, places=5)
95 |
96 | def test_tied_matches(self):
97 | """Test that tied matches are handled correctly in the Colley Matrix method."""
98 | player1 = ColleyMatrixCompetitor(initial_rating=0.5)
99 | player2 = ColleyMatrixCompetitor(initial_rating=0.5)
100 |
101 | # Players tie each other twice
102 | player1.tied(player2)
103 | player1.tied(player2)
104 |
105 | # Both players have played 2 games with 0 wins and 0 losses
106 | # Their ratings should remain at 0.5
107 | self.assertAlmostEqual(player1.rating, 0.5, places=5)
108 | self.assertAlmostEqual(player2.rating, 0.5, places=5)
109 |
110 | # Now player1 wins a game
111 | player1.beat(player2)
112 |
113 | # Player 1 should now have a higher rating
114 | self.assertGreater(player1.rating, player2.rating)
115 |
116 | # The sum of ratings should still be n/2 = 1
117 | self.assertAlmostEqual(player1.rating + player2.rating, 1.0)
118 |
119 | def test_reset(self):
120 | """Test that the reset method works correctly."""
121 | player = ColleyMatrixCompetitor(initial_rating=0.5)
122 |
123 | # Setup some matches
124 | opponent = ColleyMatrixCompetitor(initial_rating=0.5)
125 | player.beat(opponent)
126 | player.beat(opponent)
127 |
128 | # Rating should have changed
129 | self.assertNotEqual(player.rating, 0.5)
130 |
131 | # Reset should restore the initial rating
132 | player.reset()
133 | self.assertEqual(player.rating, 0.5)
134 | self.assertEqual(player._wins, 0)
135 | self.assertEqual(player._losses, 0)
136 | self.assertEqual(player._ties, 0)
137 | self.assertEqual(len(player._opponents), 0)
138 |
139 | def test_export_import_state(self):
140 | """Test that export_state and from_state work correctly."""
141 | player = ColleyMatrixCompetitor(initial_rating=0.6)
142 |
143 | # Setup some matches
144 | opponent = ColleyMatrixCompetitor(initial_rating=0.5)
145 | player.beat(opponent)
146 | player.beat(opponent)
147 | opponent.beat(player)
148 |
149 | # Export state
150 | state = player.export_state()
151 |
152 | # Verify the state contains the expected fields
153 | self.assertEqual(state["initial_rating"], 0.6)
154 | self.assertAlmostEqual(state["current_rating"], player.rating)
155 | self.assertEqual(state["wins"], 2)
156 | self.assertEqual(state["losses"], 1)
157 | self.assertEqual(state["ties"], 0)
158 |
159 | # Create a new player from the state
160 | new_player = ColleyMatrixCompetitor.from_state(state)
161 |
162 | # Verify the new player has the same properties
163 | self.assertEqual(new_player._initial_rating, 0.6)
164 | self.assertAlmostEqual(new_player.rating, player.rating)
165 | self.assertEqual(new_player._wins, 2)
166 | self.assertEqual(new_player._losses, 1)
167 | self.assertEqual(new_player._ties, 0)
168 |
169 | # Note: We can't verify _opponents because that can't be exported/imported
170 |
171 |
172 | if __name__ == "__main__":
173 | unittest.main()
174 |
--------------------------------------------------------------------------------
/docs/source/advance_example.rst:
--------------------------------------------------------------------------------
1 | Advanced Examples
2 | =================
3 |
4 |
5 | College Football Ranking
6 | ------------------------
7 |
8 | In this example we are going to use a ``LambdaArena`` and the ``CFBScrapy`` library to build a rating system for college
9 | football and see how it performs.
10 |
11 | To start with we need historical data on games to seed our ratings with. Luckily there is a nice library/API for that:
12 |
13 | .. code-block:: python
14 |
15 | import CFBScrapy as cfb
16 | from elote import LambdaArena
17 |
18 |
19 | # pull API data
20 | train_df = cfb.get_game_info(year=2000)
21 | for year in range(1, 18):
22 |         train_df = train_df.append(cfb.get_game_info(year=2000 + year))
23 | test_df = cfb.get_game_info(year=2018).append(cfb.get_game_info(year=2019))
24 |
25 | # sort the dates and drop unneeded cols
26 | train_df = train_df.reindex(columns=['start_date', 'home_team', 'away_team', 'home_points', 'away_points'])
27 | test_df = test_df.reindex(columns=['start_date', 'home_team', 'away_team', 'home_points', 'away_points'])
28 | train_df = train_df.sort_values(by='start_date')
29 | test_df = test_df.sort_values(by='start_date')
30 |
31 |
32 |     # then form matchup tuples (home team first, scores in the attributes); the data is already sorted by date above
33 | train_matchups = list()
34 | for idx, row in train_df.iterrows():
35 | train_matchups.append((
36 | row.home_team,
37 | row.away_team,
38 | {"home_points": row.home_points, "away_points": row.away_points}
39 | ))
40 |
41 | test_matchups = list()
42 | for idx, row in test_df.iterrows():
43 | test_matchups.append((
44 | row.home_team,
45 | row.away_team,
46 | {"home_points": row.home_points, "away_points": row.away_points}
47 | ))
48 |
49 | Next we need to make a lambda to execute the matchups with. Since we have the scores available in the attributes of our
50 | matchup dataset, we can simply check the score to see if the first competitor won or lost:
51 |
52 | .. code-block:: python
53 |
54 | # we already know the winner, so the lambda here is trivial
55 | def func(a, b, attributes=None):
56 | if attributes.get('home_points', 0.0) > attributes.get('away_points', 0.0):
57 | return True
58 | else:
59 | return False
60 |
61 | To start with we will use an Elo competitor with a ``_k_factor`` of 400. We will train the ratings with a tournament
62 | on the first couple of decades of data:
63 |
64 | .. code-block:: python
65 |
66 | # we use the default EloCompetitor, but adjust the k_factor to 400 before running the tournament
67 | arena = LambdaArena(func)
68 | arena.set_competitor_class_var('_k_factor', 400)
69 | arena.tournament(train_matchups)
70 |
71 | Once we've developed some ratings, let's take a look at the training set and how the ratings performed, and use that
72 | to select some potential thresholds:
73 |
74 | .. code-block:: python
75 |
76 | # do a threshold search and clear the history for validation
77 | _, thresholds = arena.history.random_search(trials=10_000)
78 | tp, fp, tn, fn, do_nothing = arena.history.confusion_matrix(*thresholds)
79 | print('\n\nTrain Set: thresholds=%s' % (str(thresholds), ))
80 | print('wins: %s' % (tp + tn, ))
81 | print('losses: %s' % (fp + fn, ))
82 | print('do_nothing: %s' % (do_nothing, ))
83 | print('win pct: %s%%' % (100 * ((tp + tn)/(tp + tn + fp + fn + do_nothing))))
84 | arena.clear_history()
85 |
86 | This will return:
87 |
88 | .. code-block::
89 |
90 | Train Set: thresholds=[0.6350196774347375, 0.9364243175248251]
91 | wins: 267
92 | losses: 236
93 | do_nothing: 171
94 | win pct: 39.61424332344214%
95 |
96 | And while we are here let's also print out what the rankings would have been to start the 2018 season:
97 |
98 | .. code-block:: python
99 |
100 | # then we print out the top 25 as of the end of our training dataset
101 | print('\n\nTop 25 as of start of validation:')
102 | rankings = sorted(arena.leaderboard(), reverse=True, key=lambda x: x.get('rating'))[:25]
103 | for idx, item in enumerate(rankings):
104 | print('\t%d) %s' % (idx + 1, item.get('competitor')))
105 |
106 | Which will print:
107 |
108 | .. code-block::
109 |
110 | Top 25 as of start of validation:
111 | 1) Miami
112 | 2) Oklahoma
113 | 3) Florida State
114 | 4) Oregon State
115 | 5) Texas
116 | 6) Georgia Tech
117 | 7) Washington
118 | 8) Virginia Tech
119 | 9) Kansas State
120 | 10) Notre Dame
121 | 11) Cincinnati
122 | 12) TCU
123 | 13) Michigan
124 | 14) Arkansas
125 | 15) Toledo
126 | 16) Air Force
127 | 17) Tennessee
128 | 18) Auburn
129 | 19) Florida
130 | 20) Boise State
131 | 21) Louisville
132 | 22) Middle Tennessee
133 | 23) North Carolina
134 | 24) Pittsburgh
135 | 25) Oregon
136 |
137 | Now let's do some hold-out validation by using these ratings on the 2018 and 2019 seasons. The
138 | ratings will of course still update as the games are evaluated:
139 |
140 | .. code-block:: python
141 |
142 | # now validation
143 | print('\n\nStarting Validation Step...')
144 | arena.tournament(test_matchups)
145 | report = arena.history.report_results()
146 |
147 | We can then look at the results from just this set (notice we ran ``clear_history()`` up above to wipe out the train set
148 | results from our history tracker):
149 |
150 | .. code-block:: python
151 |
152 | tp, fp, tn, fn, do_nothing = arena.history.confusion_matrix(0.4, 0.6)
153 | print('\n\nTest Set: using 0.4/0.6 thresholds')
154 | print('wins: %s' % (tp + tn, ))
155 | print('losses: %s' % (fp + fn, ))
156 | print('do_nothing: %s' % (do_nothing, ))
157 | print('win pct: %s%%' % (100 * ((tp + tn)/(tp + tn + fp + fn + do_nothing))))
158 |
159 | tp, fp, tn, fn, do_nothing = arena.history.confusion_matrix(*thresholds)
160 | print('\n\nTest Set: using learned thresholds: %s' % (str(thresholds), ))
161 | print('wins: %s' % (tp + tn, ))
162 | print('losses: %s' % (fp + fn, ))
163 | print('do_nothing: %s' % (do_nothing, ))
164 | print('win pct: %s%%' % (100 * ((tp + tn)/(tp + tn + fp + fn + do_nothing))))
165 |
166 | Which will print out:
167 |
168 | .. code-block::
169 |
170 | Test Set: using 0.4/0.6 thresholds
171 | wins: 1045
172 | losses: 456
173 | do_nothing: 193
174 | win pct: 61.68831168831169%
175 |
176 | Test Set: using learned thresholds: [0.6350196774347375, 0.9364243175248251]
177 | wins: 804
178 | losses: 483
179 | do_nothing: 407
180 | win pct: 47.4616292798111%
181 |
182 | Not awesome. This is probably related to ``k_factor``, which tunes how quickly the ratings respond to new matchups. Let's
183 | try doubling it to 800 and rerunning.
184 |
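The only change from the training code above is the ``k_factor`` value; everything else,
including the threshold search, is rerun exactly as before. A minimal sketch of the change:

.. code-block:: python

    # same arena and comparison function as before, just a more aggressive k_factor
    arena = LambdaArena(func)
    arena.set_competitor_class_var('_k_factor', 800)
    arena.tournament(train_matchups)

With the higher ``k_factor`` you will see the final output:
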
185 | .. code-block::
186 |
187 | Test Set: using 0.4/0.6 thresholds
188 | wins: 1095
189 | losses: 503
190 | do_nothing: 96
191 | win pct: 64.63990554899645%
192 |
193 |
194 | Test Set: using learned thresholds: [0.5277889558418678, 0.6981558136040092]
195 | wins: 1093
196 | losses: 526
197 | do_nothing: 75
198 | win pct: 64.52184179456907%
199 |
200 | Before we get too excited about this, let's take a look at the post-game win probabilities provided by the same API we
201 | are getting data from:
202 |
203 | .. code-block::
204 |
205 | Test Set: using probabilities from dataset as baseline
206 | wins: 1481
207 | losses: 117
208 | do_nothing: 96
209 | win pct: 87.42621015348288%
210 |
211 | So we're not exactly going to Vegas.
--------------------------------------------------------------------------------
/elote/benchmark.py:
--------------------------------------------------------------------------------
1 | """
2 | Benchmarking utilities for elote.
3 |
4 | This module provides functions for benchmarking and comparing different rating systems
5 | using consistent evaluation metrics and visualization.
6 | """
7 |
8 | import logging
9 | from typing import Dict, List, Type, Optional, Any, Callable
10 | import time
11 |
12 | from elote.arenas.lambda_arena import LambdaArena
13 | from elote.competitors.base import BaseCompetitor
14 | from elote.datasets.base import DataSplit
15 | from elote.datasets.utils import train_arena_with_dataset, evaluate_arena_with_dataset
16 |
17 |
18 | logger = logging.getLogger(__name__)
19 |
20 |
21 | def evaluate_competitor(
22 | competitor_class: Type[BaseCompetitor],
23 | data_split: DataSplit,
24 | comparison_function: Callable,
25 |     competitor_name: Optional[str] = None,
26 |     competitor_params: Optional[Dict[str, Any]] = None,
27 | batch_size: Optional[int] = None,
28 | progress_callback: Optional[Callable[[str, int, int], None]] = None,
29 | optimize_thresholds: bool = True,
30 | ) -> Dict[str, Any]:
31 | """
32 | Train and evaluate a specific competitor type.
33 |
34 | Args:
35 | competitor_class: The competitor class to evaluate
36 | data_split: DataSplit object containing train and test sets
37 | comparison_function: Function to compare competitors (used in LambdaArena)
38 | competitor_name: Name for the competitor (defaults to class name if None)
39 | competitor_params: Dictionary of parameters to set on the competitor
40 | batch_size: Number of matchups to process in each batch
41 | progress_callback: Callback function for reporting progress
42 | optimize_thresholds: Whether to optimize prediction thresholds
43 |
44 | Returns:
45 | Dictionary containing evaluation results
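
    Example (a sketch; ``data_split`` and ``comparison_fn`` are assumed to be
    prepared by the caller, as in the benchmarking scripts):

        from elote import EloCompetitor

        # comparison_fn is whatever LambdaArena-style comparison function the
        # caller uses to decide matchups from their attributes
        result = evaluate_competitor(EloCompetitor, data_split, comparison_fn)
        print(result["accuracy"], result.get("accuracy_opt"))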
46 | """
47 | if competitor_params is None:
48 | competitor_params = {}
49 |
50 | if competitor_name is None:
51 | competitor_name = competitor_class.__name__
52 |
53 | logger.info(f"Evaluating {competitor_name}...")
54 |
55 | # Create the arena with the specified rating system
56 | arena = LambdaArena(comparison_function, base_competitor=competitor_class)
57 |
58 | # Set common parameters
59 | arena.set_competitor_class_var("_minimum_rating", 0)
60 | arena.set_competitor_class_var("_initial_rating", 1500)
61 |
62 | # Set any additional parameters
63 | for param, value in competitor_params.items():
64 | arena.set_competitor_class_var(f"_{param}", value)
65 |
66 | # Train the arena on training data
67 | start_time = time.time()
68 |
69 | if progress_callback:
70 |
71 | def train_progress(current: int, total: int) -> None:
72 | return progress_callback("train", current, total)
73 | else:
74 | train_progress = None
75 |
76 | train_arena_with_dataset(arena, data_split.train, batch_size=batch_size, progress_callback=train_progress)
77 |
78 | train_time = time.time() - start_time
79 | logger.info(f"Training completed in {train_time:.2f} seconds")
80 |
81 | # Evaluate on test data
82 | start_time = time.time()
83 |
84 | if progress_callback:
85 |
86 | def eval_progress(current: int, total: int) -> None:
87 | return progress_callback("eval", current, total)
88 | else:
89 | eval_progress = None
90 |
91 | history = evaluate_arena_with_dataset(
92 | arena, data_split.test, batch_size=batch_size, progress_callback=eval_progress
93 | )
94 |
95 | eval_time = time.time() - start_time
96 | logger.info(f"Evaluation completed in {eval_time:.2f} seconds")
97 |
98 | # Calculate metrics with default thresholds
99 | metrics = history.calculate_metrics()
100 |
101 | # Optimize thresholds if requested
102 | if optimize_thresholds:
103 | best_accuracy, best_thresholds = history.optimize_thresholds()
104 | optimized_metrics = history.calculate_metrics(*best_thresholds)
105 | metrics["accuracy_opt"] = optimized_metrics["accuracy"]
106 | metrics["optimized_thresholds"] = best_thresholds
107 |
108 | # Add competitor info and timing
109 | metrics["name"] = competitor_name
110 | metrics["train_time"] = train_time
111 | metrics["eval_time"] = eval_time
112 |
113 | # Get top teams
114 | top_teams = sorted(arena.leaderboard(), reverse=True, key=lambda x: x.get("rating"))[:5]
115 | metrics["top_teams"] = top_teams
116 |
117 | # Add history and arena to metrics
118 | metrics["history"] = history
119 | metrics["arena"] = arena
120 |
121 | # Calculate accuracy by prior bouts if optimize_thresholds is True
122 | if optimize_thresholds:
123 | thresholds = best_thresholds
124 | bout_data = history.accuracy_by_prior_bouts(arena, thresholds)
125 | metrics["accuracy_by_prior_bouts"] = bout_data
126 |
127 | # Log results
128 | logger.info(f"Results for {competitor_name}:")
129 | logger.info(f" Accuracy: {metrics['accuracy']:.4f}")
130 | logger.info(f" Precision: {metrics['precision']:.4f}")
131 | logger.info(f" Recall: {metrics['recall']:.4f}")
132 | logger.info(f" F1 Score: {metrics['f1']:.4f}")
133 |
134 | if optimize_thresholds:
135 | logger.info(f" Optimized Accuracy: {metrics['accuracy_opt']:.4f}")
136 | logger.info(f" Optimized Thresholds: {metrics['optimized_thresholds']}")
137 |
138 | return metrics
139 |
140 |
141 | def benchmark_competitors(
142 | competitor_configs: List[Dict[str, Any]],
143 | data_split: DataSplit,
144 | comparison_function: Callable,
145 | batch_size: Optional[int] = None,
146 | progress_callback: Optional[Callable[[str, int, int], None]] = None,
147 | optimize_thresholds: bool = True,
148 | ) -> List[Dict[str, Any]]:
149 | """
150 | Benchmark multiple competitor types against each other.
151 |
152 | Args:
153 | competitor_configs: List of dictionaries with keys 'class', 'name', and 'params'
154 | data_split: DataSplit object containing train and test sets
155 | comparison_function: Function to compare competitors (used in LambdaArena)
156 | batch_size: Number of matchups to process in each batch
157 | progress_callback: Callback function for reporting progress
158 | optimize_thresholds: Whether to optimize prediction thresholds
159 |
160 | Returns:
161 | List of dictionaries containing evaluation results for each competitor
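
    Example (illustrative config shapes only; the parameter values are arbitrary
    and ``data_split``/``comparison_fn`` are assumed to exist):

        from elote import EloCompetitor, GlickoCompetitor

        configs = [
            {"class": EloCompetitor, "name": "Elo (k=32)", "params": {"k_factor": 32}},
            {"class": GlickoCompetitor},  # name and params are optional
        ]
        results = benchmark_competitors(configs, data_split, comparison_fn)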
162 | """
163 | results = []
164 |
165 | for config in competitor_configs:
166 | competitor_class = config["class"]
167 | competitor_name = config.get("name", competitor_class.__name__)
168 | competitor_params = config.get("params", {})
169 |
170 | result = evaluate_competitor(
171 | competitor_class=competitor_class,
172 | data_split=data_split,
173 | comparison_function=comparison_function,
174 | competitor_name=competitor_name,
175 | competitor_params=competitor_params,
176 | batch_size=batch_size,
177 | progress_callback=progress_callback,
178 | optimize_thresholds=optimize_thresholds,
179 | )
180 |
181 | results.append(result)
182 |
183 | # Print overall summary
184 | logger.info("\n===== OVERALL SUMMARY =====")
185 | for result in sorted(results, key=lambda x: x["accuracy"], reverse=True):
186 | summary = f"{result['name']}: Accuracy={result['accuracy']:.4f}"
187 |
188 | if optimize_thresholds:
189 | summary += f", Optimized Accuracy={result['accuracy_opt']:.4f}"
190 |
191 | summary += f", F1 Score={result['f1']:.4f}"
192 | logger.info(summary)
193 |
194 | return results
195 |
--------------------------------------------------------------------------------