├── gym_soccer
│   ├── utils
│   │   ├── __init__.py
│   │   ├── policies.py
│   │   └── planners.py
│   ├── envs
│   │   ├── __init__.py
│   │   ├── soccer_alternating_env.py
│   │   └── soccer_simultaneous_env.py
│   ├── __init__.py
│   └── tests
│       ├── test_slip_soccer_simultaneous_env.py
│       ├── test_deterministic_soccer_simultaneous_env.py
│       └── test_general.py
├── setup.py
├── README.md
├── LICENSE
├── .github
│   └── workflows
│       └── test.yml
└── .gitignore

/gym_soccer/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/gym_soccer/envs/__init__.py:
--------------------------------------------------------------------------------
 1 | from gym_soccer.envs.soccer_simultaneous_env import SoccerSimultaneousEnv
 2 | 
--------------------------------------------------------------------------------
/gym_soccer/__init__.py:
--------------------------------------------------------------------------------
 1 | from gym.envs.registration import register
 2 | 
 3 | # NO REGISTRATION JUST YET
 4 | # classics
 5 | # register(
 6 | #     id='SoccerSimultaneous-v0',
 7 | #     entry_point='gym_soccer.envs:SoccerSimultaneousEnv',
 8 | #     kwargs={'width': 5, 'height': 4, 'slip_prob': 0.2, 'player_a_policy': None, 'player_b_policy': None},
 9 | #     max_episode_steps=100,
10 | #     reward_threshold=1.0,
11 | #     nondeterministic=True,
12 | # )
13 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name='gym_soccer',
 5 |     version='0.0.1',
 6 |     description='Gym soccer environment - useful to replicate soccer experiments from Littman 94',
 7 |     url='https://github.com/mimoralea/gym-soccer-littman94',
 8 |     author='Miguel Morales',
 9 |     author_email='mimoralea@gmail.com',
10 |     packages=find_packages(),  # Automatically find and include packages in the directory
11 |     license='MIT License',
12 |     install_requires=[
13 |         'numpy==1.26.4',
14 |         'gym>=0.26.2'
15 |     ],
16 | )
17 | 
--------------------------------------------------------------------------------
/gym_soccer/utils/policies.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from gym_soccer.envs.soccer_simultaneous_env import SoccerSimultaneousEnv
 3 | 
 4 | def get_random_policy(n_states=761, n_actions=5, seed=0):
 5 |     random_policy = {}
 6 |     random_state = np.random.RandomState(seed)
 7 |     for s in range(n_states):
 8 |         random_policy[s] = random_state.randint(0, n_actions)
 9 |     return random_policy
10 | 
11 | def get_stand_policy(n_states=761):
12 |     stand_policy = {}
13 |     for s in range(n_states):
14 |         stand_policy[s] = SoccerSimultaneousEnv.NOOP
15 |     return stand_policy
16 | 
17 | def save_policy(policy, filename, mode='wb'):
18 |     import pickle
19 |     assert isinstance(policy, dict), "Policy must be a dictionary"
20 |     # Save dictionary to a file
21 |     with open(filename, mode) as f:
22 |         pickle.dump(policy, f)
23 | 
24 | def load_policy(filename, mode='rb'):
25 |     import pickle
26 |     with open(filename, mode) as f:
27 |         return pickle.load(f)
28 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # gym-soccer-littman94
 2 | 
 3 | ## Installation
 4 | 
 5 | ```bash
 6 | git clone https://github.com/mimoralea/gym-soccer-littman94.git
 7 | cd gym-soccer-littman94
 8 | pip install .
 9 | ```
10 | 
11 | or:
12 | 
13 | ```bash
14 | pip install git+https://github.com/mimoralea/gym-soccer-littman94#egg=gym-soccer-littman94
15 | ```
16 | 
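17 | ## Quick start
18 | 
19 | A minimal two-player sketch with random moves for both sides. The per-player dictionary interface below (dict observations, rewards, and termination flags keyed by `player_a`/`player_b`) mirrors what the package's test suite exercises.
20 | 
21 | ```python
22 | from gym_soccer.envs import SoccerSimultaneousEnv
23 | 
24 | # 5x4 pitch plus two goal columns; each move slips sideways with probability 0.2
25 | env = SoccerSimultaneousEnv(width=5, height=4, slip_prob=0.2)
26 | obs, info = env.reset()
27 | terminated = {'player_a': False, 'player_b': False}
28 | truncated = {'player_a': False, 'player_b': False}
29 | while not (terminated['player_a'] or truncated['player_a']):
30 |     action = {
31 |         'player_a': env.action_space['player_a'].sample(),  # Discrete(5): NORTH/SOUTH/EAST/WEST/NOOP
32 |         'player_b': env.action_space['player_b'].sample(),
33 |     }
34 |     obs, reward, terminated, truncated, info = env.step(action)
35 | env.render()  # prints both player positions and ball possession
36 | ```
37 | 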
38 | ## Use
39 | 
40 | The environment can also be used as a single-agent problem by passing a scripted policy for the opponent, in which case only `player_a` is exposed. A rough sketch of tabular TD(0) policy evaluation against a random opponent, using the helpers in `gym_soccer.utils.policies`:
41 | 
42 | ```python
43 | import numpy as np
44 | from gym_soccer.envs import SoccerSimultaneousEnv
45 | from gym_soccer.utils.policies import get_random_policy
46 | 
47 | env = SoccerSimultaneousEnv(width=5, height=4, slip_prob=0.2,
48 |                             player_b_policy=get_random_policy())
49 | n_states = env.observation_space['player_a'].n
50 | n_actions = env.action_space['player_a'].n
51 | pi = lambda s: np.random.randint(n_actions)
52 | 
53 | def td(pi, env, gamma=1.0, alpha=0.01, n_episodes=10000):
54 |     V = np.zeros(n_states)
55 |     for _ in range(n_episodes):
56 |         obs, _ = env.reset()
57 |         state, done = obs['player_a'], False
58 |         while not done:
59 |             obs, reward, terminated, truncated, _ = env.step({'player_a': pi(state)})
60 |             next_state = obs['player_a']
61 |             done = terminated['player_a'] or truncated['player_a']
62 |             td_target = reward['player_a'] + gamma * V[next_state] * (not done)
63 |             V[state] = V[state] + alpha * (td_target - V[state])
64 |             state = next_state
65 |     return V
66 | 
67 | V = td(pi, env)
68 | ```
69 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 GT RLDM
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 3 | 
 4 | name: Python application
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ "main" ]
 9 |   pull_request:
10 |     branches: [ "main" ]
11 | 
12 | permissions:
13 |   contents: read
14 | 
15 | jobs:
16 |   build:
17 | 
18 |     runs-on: ubuntu-latest
19 | 
20 |     steps:
21 |     - uses: actions/checkout@v4
22 |     - name: Set up Python 3.10
23 |       uses: actions/setup-python@v3
24 |       with:
25 |         python-version: "3.10"
26 | 
27 |     - name: Install dependencies
28 |       run: |
29 |         python -m pip install --upgrade pip
30 |         pip install flake8 pytest
31 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32 |         if [ -f setup.py ]; then pip install .; fi  # Install package if setup.py exists
33 | 
34 |     - name: Lint with flake8
35 |       run: |
36 |         # stop the build if there are Python syntax errors or undefined names
37 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
38 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
39 |         flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 40 | 41 | - name: Test with pytest 42 | run: | 43 | pytest 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # pycharm 104 | .idea 105 | 106 | -------------------------------------------------------------------------------- /gym_soccer/utils/planners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import linalg as LA 3 | 4 | def value_iteration(env, theta, discount_factor): 5 | cc, P = 0, env.P 6 | V = np.zeros(len(P), dtype=np.float64) 7 | while True: 8 | Q = np.zeros((len(P), len(P[0])), dtype=np.float64) 9 | for s in range(len(P)): 10 | for a in range(len(P[s])): 11 | for prob, next_state, reward, done in P[s][a]: 12 | Q[s][a] += prob * (reward + discount_factor * V[next_state] * (not done)) 13 | cc += 1 14 | if np.max(np.abs(V - np.max(Q, axis=1))) < theta: 15 | break 16 | V = np.max(Q, axis=1) 17 | pi = np.argmax(Q, axis=1) 18 | return pi, V, Q, cc 19 | 20 | def policy_evaluation(pi, env, theta, discount_factor): 21 | P = env.P 22 | prev_V = np.zeros(len(P), dtype=np.float64) 23 | while True: 24 | V = np.zeros(len(P), dtype=np.float64) 25 | for s in range(len(P)): 26 | for prob, next_state, reward, done in P[s][pi[s]]: 27 | V[s] += prob * (reward + discount_factor * prev_V[next_state] * (not done)) 28 | if np.max(np.abs(prev_V - V)) < theta: 29 | break 30 | prev_V = V.copy() 31 | return V 32 | 33 | def policy_improvement(V, env, discount_factor): 34 | P = env.P 35 | Q = np.zeros((len(P), len(P[0])), dtype=np.float64) 36 | for s in range(len(P)): 37 | for a in range(len(P[s])): 38 | for prob, next_state, reward, done in P[s][a]: 39 | Q[s][a] += prob * (reward + discount_factor * V[next_state] * (not 
done)) 40 | new_pi = np.argmax(Q, axis=1) 41 | return new_pi, Q 42 | 43 | def policy_iteration(env, theta, discount_factor): 44 | cc, P = 0, env.P 45 | pi = np.random.choice(tuple(P[0].keys()), len(P)) 46 | while True: 47 | old_pi = pi.copy() 48 | V = policy_evaluation(pi, env, theta, discount_factor) 49 | pi, Q = policy_improvement(V, env, discount_factor) 50 | cc += 1 51 | if np.all(old_pi == pi): 52 | break 53 | return pi, V, Q, cc 54 | 55 | def policy_eval(env, policy, theta, discount_factor, k=10000000, init=None): 56 | v = np.zeros(env.nS) if init is None else init 57 | cc = 0 58 | for i in range(k): 59 | value_fc = np.zeros(env.nS) 60 | for s in range(env.nS): 61 | r_pi = np.dot(policy[s, :], env.Rmat[s, :]) 62 | pv = np.dot(env.Pmat[s, :, :].T, v) 63 | p_pi = np.dot(pv, policy[s, :]) 64 | value_fc[s] = r_pi + discount_factor * p_pi 65 | delta = LA.norm(value_fc - v, np.inf) 66 | v[:] = value_fc 67 | cc += 1 68 | if delta < theta: 69 | break 70 | return v, cc 71 | 72 | 73 | def modified_policy_iteration(env, k, theta, discount_factor): 74 | v = np.zeros(env.nS) 75 | threshold = (theta * (1 - discount_factor))/(2 * discount_factor) 76 | counter = 0 77 | while True: 78 | q = np.zeros([env.nS, env.nA]) 79 | for a in range(env.nA): 80 | q[:, a] = env.Rmat[:, a] + discount_factor * np.dot(env.Pmat[:, :, a], v) 81 | greedy_v = np.max(q, -1) 82 | best_action = np.argmax(q, -1) 83 | policy = np.eye(env.nA)[best_action] 84 | if LA.norm(v - greedy_v, np.inf) <= threshold: 85 | return policy.argmax(axis=1), greedy_v, q, counter 86 | else: 87 | v, cc = policy_eval(env, policy, theta=theta, discount_factor=discount_factor, k=k, init=greedy_v) 88 | counter += 1 -------------------------------------------------------------------------------- /gym_soccer/tests/test_slip_soccer_simultaneous_env.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from gym_soccer.envs import SoccerSimultaneousEnv 4 | 5 | @pytest.fixture 6 | def env(): 7 | return SoccerSimultaneousEnv(width=5, height=4, slip_prob=0.2) 8 | 9 | @pytest.fixture(autouse=True) 10 | def reset_env(env): 11 | env.reset() 12 | yield 13 | 14 | def test_initialization(env): 15 | env.reset() 16 | assert env.width == 7 # 5 + 2 for goal columns 17 | assert env.height == 4 18 | assert env.slip_prob == 0.2 19 | assert env.action_space['player_a'].n == 5 20 | assert env.action_space['player_b'].n == 5 21 | 22 | def test_reset(env): 23 | obs, info = env.reset() 24 | assert isinstance(obs, dict) 25 | assert 'player_a' in obs and 'player_b' in obs 26 | assert isinstance(info, dict) 27 | assert 'player_a' in info and 'player_b' in info 28 | 29 | def test_step(env): 30 | env.reset() 31 | action = {'player_a': env.NOOP, 'player_b': env.NOOP} 32 | obs, reward, terminated, truncated, info = env.step(action) 33 | assert isinstance(obs, dict) 34 | assert isinstance(reward, dict) 35 | assert isinstance(terminated, dict) 36 | assert isinstance(truncated, dict) 37 | assert isinstance(info, dict) 38 | 39 | def test_scoring(env): 40 | def run_scoring_test(initial_state, action_a, action_b, iterations=100000): 41 | score_count = 0 42 | for _ in range(iterations): 43 | env.reset() 44 | env.state = initial_state 45 | action = {'player_a': action_a, 'player_b': action_b} 46 | obs, reward, terminated, truncated, info = env.step(action) 47 | if terminated['player_a'] or terminated['player_b']: 48 | assert abs(reward['player_a']) == 1 and abs(reward['player_b']) == 1, "Both players must receive a 
reward/penalty for a goal" 49 | score_count += 1 50 | 51 | score_ratio = score_count / iterations 52 | print(f"Score ratio: {score_ratio:.2f}") 53 | assert 0.75 <= score_ratio <= 0.85, f"Score ratio: {score_ratio:.2f}, expected close to 0.8" 54 | 55 | # Test Player A scoring 56 | run_scoring_test((1, 5, 3, 1, 0), env.EAST, env.NOOP) 57 | 58 | # Test Player B scoring 59 | run_scoring_test((3, 5, 1, 1, 1), env.NOOP, env.WEST) 60 | 61 | def test_render(env, capsys): 62 | env.reset() 63 | env.render() 64 | captured = capsys.readouterr() 65 | assert "Player A position" in captured.out 66 | assert "Player B position" in captured.out 67 | assert "Ball possession" in captured.out 68 | 69 | def test_possession_change_non_collision(env): 70 | # Test that possession doesn't change when players move without colliding 71 | env.reset() 72 | env.state = (1, 1, 3, 3, 0) # Player A has possession 73 | action = {'player_a': env.EAST, 'player_b': env.WEST} 74 | obs, reward, terminated, truncated, info = env.step(action) 75 | assert env.state[4] == 0, "Possession should not change without collision" 76 | 77 | env.reset() 78 | env.state = (1, 1, 3, 3, 1) # Player B has possession 79 | action = {'player_a': env.EAST, 'player_b': env.WEST} 80 | obs, reward, terminated, truncated, info = env.step(action) 81 | assert env.state[4] == 1, "Possession should not change without collision" 82 | 83 | def test_slip_into_goal(env): 84 | def run_slip_goal_test(initial_state, action_a, action_b, iterations=100000): 85 | goal_count = 0 86 | for _ in range(iterations): 87 | env.reset() 88 | env.state = initial_state 89 | action = {'player_a': action_a, 'player_b': action_b} 90 | obs, reward, terminated, truncated, info = env.step(action) 91 | if terminated['player_a'] or terminated['player_b']: 92 | goal_count += 1 93 | 94 | goal_ratio = goal_count / iterations 95 | assert 0.09 <= goal_ratio <= 0.11, f"Goal ratio: {goal_ratio:.2f}, expected close to 0.1" 96 | 97 | # Test A slipping into own goal 98 | run_slip_goal_test((1, 1, 3, 3, 0), env.NORTH, env.NOOP) 99 | run_slip_goal_test((2, 1, 3, 3, 0), env.NORTH, env.NOOP) 100 | run_slip_goal_test((1, 1, 3, 3, 0), env.SOUTH, env.NOOP) 101 | run_slip_goal_test((2, 1, 3, 3, 0), env.SOUTH, env.NOOP) 102 | 103 | # Test A slipping into B's goal 104 | run_slip_goal_test((1, 5, 3, 3, 0), env.NORTH, env.NOOP) 105 | run_slip_goal_test((2, 5, 3, 3, 0), env.NORTH, env.NOOP) 106 | run_slip_goal_test((1, 5, 3, 3, 0), env.SOUTH, env.NOOP) 107 | run_slip_goal_test((2, 5, 3, 3, 0), env.SOUTH, env.NOOP) 108 | 109 | # Test B slipping into A's goal 110 | run_slip_goal_test((3, 3, 1, 1, 1), env.NOOP, env.NORTH) 111 | run_slip_goal_test((3, 3, 2, 1, 1), env.NOOP, env.NORTH) 112 | run_slip_goal_test((3, 3, 1, 1, 1), env.NOOP, env.SOUTH) 113 | run_slip_goal_test((3, 3, 2, 1, 1), env.NOOP, env.SOUTH) 114 | 115 | # Test B slipping into own goal 116 | run_slip_goal_test((3, 3, 1, 5, 1), env.NOOP, env.NORTH) 117 | run_slip_goal_test((3, 3, 2, 5, 1), env.NOOP, env.NORTH) 118 | run_slip_goal_test((3, 3, 1, 5, 1), env.NOOP, env.SOUTH) 119 | run_slip_goal_test((3, 3, 2, 5, 1), env.NOOP, env.SOUTH) 120 | 121 | def test_bounce_off_horizontal_edges(env): 122 | def run_bounce_test(initial_state, action_a, action_b, iterations=100000): 123 | bounce_count = 0 124 | slip_count = 0 125 | for _ in range(iterations): 126 | env.reset() 127 | env.state = initial_state 128 | obs, reward, done, truncated, info = env.step({'player_a': action_a, 'player_b': action_b}) 129 | if env.state == initial_state: 130 | bounce_count += 1 
131 | elif env.state != initial_state: 132 | slip_count += 1 133 | 134 | bounce_ratio = bounce_count / iterations 135 | slip_ratio = slip_count / iterations 136 | assert 0.79 <= bounce_ratio <= 0.81, f"Bounce ratio: {bounce_ratio:.2f}, expected close to 0.8" 137 | assert 0.19 <= slip_ratio <= 0.21, f"Slip ratio: {slip_ratio:.2f}, expected close to 0.2" 138 | 139 | # Test bouncing off top edge 140 | run_bounce_test((0, 2, 3, 3, 0), env.NORTH, env.NOOP) 141 | run_bounce_test((0, 3, 3, 3, 0), env.NORTH, env.NOOP) 142 | run_bounce_test((3, 3, 0, 2, 1), env.NOOP, env.NORTH) 143 | run_bounce_test((3, 3, 0, 3, 1), env.NOOP, env.NORTH) 144 | 145 | # Test bouncing off bottom edge 146 | run_bounce_test((3, 2, 0, 3, 0), env.SOUTH, env.NOOP) 147 | run_bounce_test((3, 3, 0, 3, 0), env.SOUTH, env.NOOP) 148 | run_bounce_test((0, 3, 3, 2, 0), env.NOOP, env.SOUTH) 149 | run_bounce_test((0, 3, 3, 3, 0), env.NOOP, env.SOUTH) 150 | 151 | def test_bounce_off_corner_edges(env): 152 | def run_bounce_test(initial_state, action, iterations=100000): 153 | bounce_count = 0 154 | slip_count = 0 155 | for _ in range(iterations): 156 | env.reset() 157 | env.state = initial_state 158 | obs, reward, done, truncated, info = env.step({'player_a': action, 'player_b': env.NOOP}) 159 | if env.state == initial_state: 160 | bounce_count += 1 161 | elif env.state != initial_state: 162 | slip_count += 1 163 | 164 | bounce_ratio = bounce_count / iterations 165 | slip_ratio = slip_count / iterations 166 | assert 0.89 <= bounce_ratio <= 0.91, f"Bounce ratio: {bounce_ratio:.2f}, expected close to 0.9" 167 | assert 0.09 <= slip_ratio <= 0.11, f"Slip ratio: {slip_ratio:.2f}, expected close to 0.1" 168 | 169 | # Test bouncing off left edge (non-goal row) 170 | run_bounce_test((0, 1, 3, 3, 1), env.WEST) 171 | 172 | # Test bouncing off right edge (non-goal row) 173 | run_bounce_test((3, 5, 0, 3, 1), env.EAST) 174 | 175 | def test_collision_through_slip(env): 176 | def run_slip_collision_test(initial_state, action_a, action_b, iterations=100000): 177 | collision_count = 0 178 | for _ in range(iterations): 179 | env.reset() 180 | env.state = initial_state 181 | obs, reward, done, truncated, info = env.step({'player_a': action_a, 'player_b': action_b}) 182 | if env.state[0] == initial_state[0] and env.state[1] == initial_state[1] and \ 183 | env.state[2] == initial_state[2] and env.state[3] == initial_state[3]: 184 | collision_count += 1 185 | 186 | collision_ratio = collision_count / iterations 187 | expected_ratio = 0.1 # 10% chance of slip for one player, other player moves as intended 188 | assert np.isclose(collision_ratio, expected_ratio, atol=0.02), f"Collision ratio: {collision_ratio:.2f}, expected close to {expected_ratio:.2f}" 189 | 190 | # Test A slipping into B's cell 191 | run_slip_collision_test((2, 2, 2, 3, 0), env.NORTH, env.NOOP) 192 | run_slip_collision_test((2, 2, 2, 3, 1), env.NORTH, env.NOOP) 193 | 194 | # Test B slipping into A's cell 195 | run_slip_collision_test((2, 3, 2, 2, 0), env.NOOP, env.NORTH) 196 | run_slip_collision_test((2, 3, 2, 2, 1), env.NOOP, env.NORTH) 197 | 198 | def test_no_slip_on_stand(env): 199 | initial_state = (1, 2, 3, 4, 0) 200 | iterations = 100000 201 | slip_count = 0 202 | 203 | for _ in range(iterations): 204 | env.reset() 205 | env.state = initial_state 206 | obs, reward, done, truncated, info = env.step({'player_a': env.NOOP, 'player_b': env.NOOP}) 207 | if env.state != initial_state: 208 | slip_count += 1 209 | 210 | assert slip_count == 0, f"Expected no slips on STAND action, got 
{slip_count} slips" 211 | -------------------------------------------------------------------------------- /gym_soccer/envs/soccer_alternating_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from gym.envs.toy_text.utils import categorical_sample 4 | 5 | class SoccerGridWorld: 6 | # Define constants for actions 7 | UP = 0 8 | DOWN = 1 9 | LEFT = 2 10 | RIGHT = 3 11 | STAND = 4 12 | 13 | def __init__(self, width=5, height=4, slip_prob=0.2, isd_possession_a=0.5, simultaneous_action=True, player_a_policy=None, player_b_policy=None): 14 | assert width >= 5, "Width must be at least 5 columns." 15 | assert height >= 4, "Height must be at least 4 rows." 16 | 17 | self.width = width + 2 # +2 for the columns where goals are located 18 | self.height = height 19 | self.slip_prob = slip_prob 20 | self.isd_possession_a = isd_possession_a 21 | self.simultaneous_action = simultaneous_action 22 | self.player_a_policy = player_a_policy 23 | self.player_b_policy = player_b_policy 24 | self.np_random = np.random.RandomState() 25 | 26 | # Initialize the state space and action space 27 | self.n_states = self.width * self.height * 2 # width * height * 2 (possession) 28 | self.n_actions = 5 # Actions: UP, DOWN, LEFT, RIGHT, STAND 29 | 30 | # Define the initial state distribution 31 | self.isd = self.generate_isd() 32 | 33 | # Define transition dynamics 34 | self.P = self._initialize_transition_dynamics() 35 | 36 | # Initialize current state 37 | self.s = self.reset()[0] 38 | self.lastaction = None 39 | 40 | # For alternating case 41 | if not self.simultaneous_action: 42 | self.current_player = None # Will be set in reset() 43 | 44 | def generate_isd(self): 45 | distribution = [] 46 | col_a = 2 # Player A starts 2 columns from their goal 47 | col_b = self.width - 3 # Player B starts 2 columns from their goal 48 | 49 | if self.height % 2 == 1: 50 | # Odd height: both players start in the middle row 51 | middle_row = self.height // 2 52 | for possession in range(2): # 0: A, 1: B 53 | if self.simultaneous_action: 54 | state = (middle_row, col_a, middle_row, col_b, possession) 55 | else: 56 | for who_moves_first in range(2): # 0: A, 1: B 57 | state = (middle_row, col_a, middle_row, col_b, possession, who_moves_first) 58 | distribution.append((0.25, state)) 59 | if self.simultaneous_action: 60 | distribution.append((0.5, state)) 61 | else: 62 | # Even height: players start in different rows around the middle 63 | row_a_options = [self.height // 2 - 1, self.height // 2] 64 | row_b_options = [self.height // 2, self.height // 2 - 1] 65 | for i in range(2): 66 | row_a = row_a_options[i] 67 | row_b = row_b_options[i] 68 | for possession in range(2): # 0: A, 1: B 69 | if self.simultaneous_action: 70 | state = (row_a, col_a, row_b, col_b, possession) 71 | distribution.append((0.25, state)) 72 | else: 73 | for who_moves_first in range(2): # 0: A, 1: B 74 | state = (row_a, col_a, row_b, col_b, possession, who_moves_first) 75 | distribution.append((0.125, state)) 76 | 77 | return distribution 78 | 79 | def _initialize_transition_dynamics(self): 80 | P = {} 81 | self.directions = [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)] # UP, DOWN, LEFT, RIGHT, STAND 82 | 83 | for row_a in range(self.height): 84 | for col_a in range(self.width): 85 | for row_b in range(self.height): 86 | for col_b in range(self.width): 87 | for possession in range(2): # 0: A, 1: B 88 | if self.simultaneous_action: 89 | state = (row_a, col_a, row_b, col_b, possession) 90 | 
P[state] = {} 91 | # Simultaneous action dynamics 92 | for action_a in range(self.n_actions): 93 | for action_b in range(self.n_actions): 94 | transitions = [] 95 | next_state, reward, done = self._get_next_state(state, action_a, action_b) 96 | transitions.append((1 - self.slip_prob, next_state, reward, done)) 97 | 98 | # Handle slips in orthogonal directions 99 | orthogonal_moves_a = [(-self.directions[action_a][1], self.directions[action_a][0]), (self.directions[action_a][1], -self.directions[action_a][0])] 100 | orthogonal_moves_b = [(-self.directions[action_b][1], self.directions[action_b][0]), (self.directions[action_b][1], -self.directions[action_b][0])] 101 | for orth_move_a in orthogonal_moves_a: 102 | for orth_move_b in orthogonal_moves_b: 103 | slip_state, _, _ = self._get_next_state(state, 104 | self._action_from_direction(orth_move_a), 105 | self._action_from_direction(orth_move_b)) 106 | transitions.append((self.slip_prob / 4, slip_state, reward, done)) 107 | P[state][(action_a, action_b)] = transitions 108 | else: 109 | # Alternating action dynamics 110 | for who_moves_next in [0, 1]: # 0: Player A, 1: Player B 111 | state = (row_a, col_a, row_b, col_b, possession, who_moves_next) 112 | P[state] = {} 113 | for action in range(self.n_actions): 114 | transitions = [] 115 | next_state, reward, done = self._get_next_state(state, action, None) 116 | next_state = (*next_state[:5], 1 - who_moves_next) # Switch to other player's turn 117 | transitions.append((1 - self.slip_prob, next_state, reward, done)) 118 | 119 | # Handle slips in orthogonal directions 120 | orthogonal_moves = [(-self.directions[action][1], self.directions[action][0]), (self.directions[action][1], -self.directions[action][0])] 121 | for orth_move in orthogonal_moves: 122 | slip_action = self._action_from_direction(orth_move) 123 | slip_state, slip_reward, slip_done = self._get_next_state(state, slip_action, None) 124 | slip_state = (*slip_state[:5], 1 - who_moves_next) # Switch to other player's turn 125 | transitions.append((self.slip_prob / 2, slip_state, slip_reward, slip_done)) 126 | P[state][action] = transitions 127 | 128 | return P 129 | 130 | def _get_next_state(self, state, action_a, action_b): 131 | if self.simultaneous_action: 132 | row_a, col_a, row_b, col_b, possession = state 133 | else: 134 | row_a, col_a, row_b, col_b, possession, who_moves_next = state 135 | 136 | # Handle actions and slip probability 137 | def move(row, col, action): 138 | if action is None: 139 | return row, col 140 | intended_move = self.directions[action] 141 | new_row = max(0, min(self.height - 1, row + intended_move[0])) 142 | new_col = max(0, min(self.width - 1, col + intended_move[1])) 143 | return new_row, new_col 144 | 145 | # Update positions based on actions 146 | if self.simultaneous_action or who_moves_next == 0: 147 | next_row_a, next_col_a = move(row_a, col_a, action_a) 148 | else: 149 | next_row_a, next_col_a = row_a, col_a 150 | 151 | if self.simultaneous_action or who_moves_next == 1: 152 | next_row_b, next_col_b = move(row_b, col_b, action_b) 153 | else: 154 | next_row_b, next_col_b = row_b, col_b 155 | 156 | # Handle STAND action properly in alternating action case 157 | if not self.simultaneous_action: 158 | if who_moves_next == 0: # Player A's turn 159 | if action_a == self.STAND: 160 | next_row_a, next_col_a = row_a, col_a 161 | elif (next_row_a, next_col_a) == (row_b, col_b): 162 | possession = 1 # Player B gains possession 163 | next_row_a, next_col_a = row_a, col_a 164 | else: # Player B's turn 165 | if 
action_b == self.STAND: 166 | next_row_b, next_col_b = row_b, col_b 167 | elif (next_row_b, next_col_b) == (row_a, col_a): 168 | possession = 0 # Player A gains possession 169 | next_row_b, next_col_b = row_b, col_b 170 | 171 | # Check for goals and terminal state 172 | done = False 173 | reward = 0 174 | if possession == 0: # Player A has the ball 175 | if next_col_a == 0: # Player A scores in its own goal (own goal) 176 | done = True 177 | reward = -1 # Negative reward for own goal 178 | elif next_col_a == self.width - 1: # Player A scores in opponent's goal 179 | done = True 180 | reward = 1 181 | elif next_row_a == row_b and next_col_a == col_b: # Player B steals 182 | possession = 1 183 | else: # Player B has the ball 184 | if next_col_b == self.width - 1: # Player B scores in its own goal (own goal) 185 | done = True 186 | reward = 1 # Positive reward for player A when B scores own goal 187 | elif next_col_b == 0: # Player B scores in opponent's goal 188 | done = True 189 | reward = -1 190 | elif next_row_b == row_a and next_col_b == col_a: # Player A steals 191 | possession = 0 192 | 193 | # Handle simultaneous action collision 194 | if self.simultaneous_action and (next_row_a == next_row_b and next_col_a == next_col_b): 195 | if action_a == self.STAND and action_b != self.STAND: 196 | possession = 0 # Player A gains possession 197 | elif action_b == self.STAND and action_a != self.STAND: 198 | possession = 1 # Player B gains possession 199 | elif action_a != self.STAND and action_b != self.STAND: 200 | possession = self.np_random.choice([0, 1]) # Randomly decide who gets possession 201 | 202 | next_state = (next_row_a, next_col_a, next_row_b, next_col_b, possession) 203 | 204 | return next_state, reward, done 205 | 206 | def step(self, action): 207 | if self.simultaneous_action: 208 | action_a, action_b = action 209 | transitions = self.P[self.s][(action_a, action_b)] 210 | i = categorical_sample([t[0] for t in transitions], self.np_random) 211 | prob, next_state, reward, done = transitions[i] 212 | self.s = next_state 213 | self.lastaction = action 214 | obs = { 215 | "player_a": (next_state[0], next_state[1], next_state[2], next_state[3], 1 if next_state[4] == 0 else 0), 216 | "player_b": (next_state[2], next_state[3], next_state[0], next_state[1], 1 if next_state[4] == 1 else 0) 217 | } 218 | rewards = {"player_a": reward, "player_b": -reward} 219 | dones = {"player_a": done, "player_b": done} 220 | truncateds = {"player_a": False, "player_b": False} 221 | infos = {"player_a": {"prob": prob}, "player_b": {"prob": prob}} 222 | return obs, rewards, dones, truncateds, infos 223 | else: 224 | transitions = self.P[self.s][action] 225 | i = categorical_sample([t[0] for t in transitions], self.np_random) 226 | prob, next_state, reward, done = transitions[i] 227 | self.s = next_state 228 | self.lastaction = action 229 | self.current_player = next_state[5] # Update current player 230 | 231 | current_player_name = "player_a" if self.current_player == 0 else "player_b" 232 | obs = (next_state[0], next_state[1], next_state[2], next_state[3], 1 if next_state[4] == self.current_player else 0) 233 | reward = reward if self.current_player == 0 else -reward 234 | 235 | return obs, reward, done, False, {"prob": prob} 236 | 237 | def reset(self, seed=None, options=None): 238 | if seed is not None: 239 | self.np_random.seed(seed) 240 | i = categorical_sample([is_[0] for is_ in self.isd], self.np_random) 241 | p, self.s = self.isd[i] 242 | self.lastaction = None 243 | if self.simultaneous_action: 
244 | obs = { 245 | "player_a": (self.s[0], self.s[1], self.s[2], self.s[3], 1 if self.s[4] == 0 else 0), 246 | "player_b": (self.s[2], self.s[3], self.s[0], self.s[1], 1 if self.s[4] == 1 else 0) 247 | } 248 | else: 249 | self.current_player = self.s[5] 250 | obs = (self.s[0], self.s[1], self.s[2], self.s[3], 1 if self.s[4] == self.current_player else 0) 251 | return obs, {"prob": p} 252 | 253 | def _action_from_direction(self, direction): 254 | return self.directions.index(direction) 255 | -------------------------------------------------------------------------------- /gym_soccer/tests/test_deterministic_soccer_simultaneous_env.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from gym_soccer.envs import SoccerSimultaneousEnv 4 | 5 | @pytest.fixture 6 | def env(): 7 | return SoccerSimultaneousEnv(width=5, height=4, slip_prob=0.0) 8 | 9 | @pytest.fixture(autouse=True) 10 | def reset_env(env): 11 | env.reset() 12 | yield 13 | 14 | def test_initialization(env): 15 | env.reset() 16 | assert env.width == 7 # 5 + 2 for goal columns 17 | assert env.height == 4 18 | assert env.slip_prob == 0.0 19 | assert env.action_space['player_a'].n == 5 20 | assert env.action_space['player_b'].n == 5 21 | 22 | def test_reset(env): 23 | obs, info = env.reset() 24 | assert isinstance(obs, dict) 25 | assert 'player_a' in obs and 'player_b' in obs 26 | assert isinstance(info, dict) 27 | assert 'player_a' in info and 'player_b' in info 28 | 29 | def test_step(env): 30 | env.reset() 31 | action = {'player_a': env.NOOP, 'player_b': env.NOOP} 32 | obs, reward, terminated, truncated, info = env.step(action) 33 | assert isinstance(obs, dict) 34 | assert isinstance(reward, dict) 35 | assert isinstance(terminated, dict) 36 | assert isinstance(truncated, dict) 37 | assert isinstance(info, dict) 38 | 39 | def test_scoring(env): 40 | def run_scoring_test(initial_state, action_a, action_b): 41 | env.reset() 42 | env.state = initial_state 43 | action = {'player_a': action_a, 'player_b': action_b} 44 | obs, reward, terminated, truncated, info = env.step(action) 45 | assert terminated['player_a'] and terminated['player_b'], "Game should end" 46 | assert abs(reward['player_a']) == 1 and abs(reward['player_b']) == 1, "Both players must receive a reward/penalty for a goal" 47 | 48 | # Test Player A scoring 49 | run_scoring_test((1, 5, 3, 1, 0), env.EAST, env.NOOP) 50 | 51 | # Test Player B scoring 52 | run_scoring_test((3, 5, 1, 1, 1), env.NOOP, env.WEST) 53 | 54 | def test_own_goals(env): 55 | # Test Player A scoring an own goal (row 1) 56 | env.state = (1, 1, 3, 5, 0) # Player A with ball, near own goal 57 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.NOOP}) 58 | assert done['player_a'] and done['player_b'] 59 | assert reward['player_a'] == -1 60 | assert reward['player_b'] == 1 61 | 62 | # Test Player A scoring an own goal (row 2) 63 | env.reset() 64 | env.state = (2, 1, 3, 5, 0) # Player A with ball, near own goal 65 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.NOOP}) 66 | assert done['player_a'] and done['player_b'] 67 | assert reward['player_a'] == -1 68 | assert reward['player_b'] == 1 69 | 70 | # Test Player B scoring an own goal (row 1) 71 | env.reset() 72 | env.state = (3, 1, 1, 5, 1) # Player B with ball, near own goal 73 | obs, reward, done, truncated, info = env.step({'player_a': env.NOOP, 'player_b': env.EAST}) 74 | assert done['player_a'] and 
done['player_b'] 75 | assert reward['player_a'] == 1 76 | assert reward['player_b'] == -1 77 | 78 | # Test Player B scoring an own goal (row 2) 79 | env.reset() 80 | env.state = (3, 1, 2, 5, 1) # Player B with ball, near own goal 81 | obs, reward, done, truncated, info = env.step({'player_a': env.NOOP, 'player_b': env.EAST}) 82 | assert done['player_a'] and done['player_b'] 83 | assert reward['player_a'] == 1 84 | assert reward['player_b'] == -1 85 | 86 | def test_both_players_moving_collision(env): 87 | # Test collision when both players are moving 88 | env.reset() 89 | env.state = (1, 2, 1, 3, 0) # Player A has possession 90 | action = {'player_a': env.EAST, 'player_b': env.WEST} 91 | obs, reward, terminated, truncated, info = env.step(action) 92 | assert env.state[1] == 2 and env.state[3] == 3, "Players should bounce back to original positions" 93 | assert env.state[4] in [0, 1], "Possession should be randomly assigned" 94 | 95 | env.reset() 96 | env.state = (1, 2, 1, 3, 1) # Player B has possession 97 | action = {'player_a': env.EAST, 'player_b': env.WEST} 98 | obs, reward, terminated, truncated, info = env.step(action) 99 | assert env.state[1] == 2 and env.state[3] == 3, "Players should bounce back to original positions" 100 | assert env.state[4] in [0, 1], "Possession should be randomly assigned" 101 | 102 | def test_one_player_standing_collision(env): 103 | # Test collision when one player is standing still 104 | env.reset() 105 | env.state = (1, 2, 1, 3, 0) # Player A has possession 106 | action = {'player_a': env.EAST, 'player_b': env.NOOP} 107 | obs, reward, terminated, truncated, info = env.step(action) 108 | assert env.state[1] == 2 and env.state[3] == 3, "Players should remain in original positions" 109 | assert env.state[4] in [0, 1], "Possession should be randomly assigned" 110 | 111 | env.reset() 112 | env.state = (1, 2, 1, 3, 1) # Player B has possession 113 | action = {'player_a': env.NOOP, 'player_b': env.WEST} 114 | obs, reward, terminated, truncated, info = env.step(action) 115 | assert env.state[1] == 2 and env.state[3] == 3, "Players should remain in original positions" 116 | assert env.state[4] in [0, 1], "Possession should be randomly assigned" 117 | 118 | def test_move_to_same_cell_collision(env): 119 | def run_move_to_same_cell_collision_test(initial_state, action_a, action_b, iterations=1000): 120 | move_success_counts = {'A': 0, 'B': 0} 121 | possession_switch_count = 0 122 | initial_possession = initial_state[4] 123 | 124 | for _ in range(iterations): 125 | env.reset() 126 | env.state = initial_state 127 | action = {'player_a': action_a, 'player_b': action_b} 128 | obs, reward, terminated, truncated, info = env.step(action) 129 | 130 | if env.state[0] != initial_state[0] or env.state[1] != initial_state[1]: 131 | move_success_counts['A'] += 1 132 | elif env.state[2] != initial_state[2] or env.state[3] != initial_state[3]: 133 | move_success_counts['B'] += 1 134 | 135 | if env.state[4] != initial_possession: 136 | possession_switch_count += 1 137 | 138 | for player, count in move_success_counts.items(): 139 | success_ratio = count / iterations 140 | assert 0.45 <= success_ratio <= 0.55, f"Move success ratio for Player {player}: {success_ratio:.2f}, expected close to 0.5" 141 | 142 | possession_switch_ratio = possession_switch_count / iterations 143 | assert 0.45 <= possession_switch_ratio <= 0.55, f"Possession switch ratio: {possession_switch_ratio:.2f}, expected close to 0.5" 144 | 145 | # Diagonal movements 146 | run_move_to_same_cell_collision_test((1, 1, 2, 
2, 0), env.EAST, env.NORTH) # A: right, B: up 147 | run_move_to_same_cell_collision_test((1, 1, 2, 2, 1), env.EAST, env.NORTH) # Same, but B has initial possession 148 | run_move_to_same_cell_collision_test((1, 2, 2, 1, 0), env.WEST, env.NORTH) # A: left, B: up 149 | run_move_to_same_cell_collision_test((1, 2, 2, 1, 1), env.WEST, env.NORTH) # Same, but B has initial possession 150 | run_move_to_same_cell_collision_test((2, 1, 1, 2, 0), env.EAST, env.SOUTH) # A: right, B: down 151 | run_move_to_same_cell_collision_test((2, 1, 1, 2, 1), env.EAST, env.SOUTH) # Same, but B has initial possession 152 | run_move_to_same_cell_collision_test((2, 2, 1, 1, 0), env.WEST, env.SOUTH) # A: left, B: down 153 | run_move_to_same_cell_collision_test((2, 2, 1, 1, 1), env.WEST, env.SOUTH) # Same, but B has initial possession 154 | 155 | # Horizontal movements 156 | run_move_to_same_cell_collision_test((1, 1, 1, 3, 0), env.EAST, env.WEST) # A: right, B: left 157 | run_move_to_same_cell_collision_test((1, 1, 1, 3, 1), env.EAST, env.WEST) # Same, but B has initial possession 158 | run_move_to_same_cell_collision_test((1, 3, 1, 1, 0), env.WEST, env.EAST) # A: left, B: right 159 | run_move_to_same_cell_collision_test((1, 3, 1, 1, 1), env.WEST, env.EAST) # Same, but B has initial possession 160 | 161 | # Vertical movements 162 | run_move_to_same_cell_collision_test((1, 1, 3, 1, 0), env.SOUTH, env.NORTH) # A: down, B: up 163 | run_move_to_same_cell_collision_test((1, 1, 3, 1, 1), env.SOUTH, env.NORTH) # Same, but B has initial possession 164 | run_move_to_same_cell_collision_test((3, 1, 1, 1, 0), env.NORTH, env.SOUTH) # A: up, B: down 165 | run_move_to_same_cell_collision_test((3, 1, 1, 1, 1), env.NORTH, env.SOUTH) # Same, but B has initial possession 166 | 167 | def test_all_edges(env): 168 | # Test Player A at top edge, B at right edge 169 | # Case 1: A has possession, A moves UP, B moves RIGHT 170 | initial_state = (0, 1, 3, 5, 0) 171 | env.state = initial_state 172 | obs, reward, done, truncated, info = env.step({'player_a': env.NORTH, 'player_b': env.EAST}) 173 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 174 | 175 | # Case 2: B has possession, A moves UP, B moves RIGHT 176 | initial_state = (0, 1, 3, 5, 1) 177 | env.state = initial_state 178 | obs, reward, done, truncated, info = env.step({'player_a': env.NORTH, 'player_b': env.EAST}) 179 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 180 | 181 | # Case 3: A has possession, A moves LEFT, B moves RIGHT 182 | initial_state = (0, 1, 3, 5, 0) 183 | env.state = initial_state 184 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.EAST}) 185 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 186 | 187 | # Case 4: B has possession, A moves LEFT, B moves RIGHT 188 | initial_state = (0, 1, 3, 5, 1) 189 | env.state = initial_state 190 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.EAST}) 191 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 192 | 193 | # Case 5: A has possession, A moves UP, B moves DOWN 194 | initial_state = (0, 1, 3, 5, 0) 195 | env.state = initial_state 196 | obs, reward, done, truncated, info = env.step({'player_a': env.NORTH, 'player_b': env.SOUTH}) 197 | assert env.state == initial_state, "State should not change when Player A attempts to move out 
of bounds" 198 | 199 | # Case 6: B has possession, A moves UP, B moves DOWN 200 | initial_state = (0, 1, 3, 5, 1) 201 | env.state = initial_state 202 | obs, reward, done, truncated, info = env.step({'player_a': env.NORTH, 'player_b': env.SOUTH}) 203 | assert env.state == initial_state, "State should not change when Player A attempts to move out of bounds" 204 | 205 | # Case 7: A has possession, A moves LEFT, B moves DOWN 206 | initial_state = (0, 1, 3, 5, 0) 207 | env.state = initial_state 208 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.SOUTH}) 209 | assert env.state == initial_state, "State should not change when Player A attempts to move out of bounds" 210 | 211 | # Case 8: B has possession, A moves LEFT, B moves DOWN 212 | initial_state = (0, 1, 3, 5, 1) 213 | env.state = initial_state 214 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.SOUTH}) 215 | assert env.state == initial_state, "State should not change when Player A attempts to move out of bounds" 216 | 217 | # Swap positions: A at right edge, B at top edge 218 | # Case 9: A has possession, A moves RIGHT, B moves UP 219 | initial_state = (3, 5, 0, 1, 0) 220 | env.state = initial_state 221 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.NORTH}) 222 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 223 | 224 | # Case 10: B has possession, A moves RIGHT, B moves UP 225 | initial_state = (3, 5, 0, 1, 1) 226 | env.state = initial_state 227 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.NORTH}) 228 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 229 | 230 | # Case 11: A has possession, A moves RIGHT, B moves LEFT 231 | initial_state = (3, 5, 0, 1, 0) 232 | env.state = initial_state 233 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.WEST}) 234 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 235 | 236 | # Case 12: B has possession, A moves RIGHT, B moves LEFT 237 | initial_state = (3, 5, 0, 1, 1) 238 | env.state = initial_state 239 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.WEST}) 240 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 241 | 242 | # Case 13: A has possession, A moves DOWN, B moves UP 243 | initial_state = (3, 5, 0, 1, 0) 244 | env.state = initial_state 245 | obs, reward, done, truncated, info = env.step({'player_a': env.SOUTH, 'player_b': env.NORTH}) 246 | assert env.state == initial_state, "State should not change when Player B attempts to move out of bounds" 247 | 248 | # Case 14: B has possession, A moves DOWN, B moves UP 249 | initial_state = (3, 5, 0, 1, 1) 250 | env.state = initial_state 251 | obs, reward, done, truncated, info = env.step({'player_a': env.SOUTH, 'player_b': env.NORTH}) 252 | assert env.state == initial_state, "State should not change when Player B attempts to move out of bounds" 253 | 254 | # Case 15: A has possession, A moves DOWN, B moves LEFT 255 | initial_state = (3, 5, 0, 1, 0) 256 | env.state = initial_state 257 | obs, reward, done, truncated, info = env.step({'player_a': env.SOUTH, 'player_b': env.WEST}) 258 | assert env.state == initial_state, "State should not change when Player B attempts to move out of bounds" 259 | 260 | # 
Case 16: B has possession, A moves DOWN, B moves LEFT 261 | initial_state = (3, 5, 0, 1, 1) 262 | env.state = initial_state 263 | obs, reward, done, truncated, info = env.step({'player_a': env.SOUTH, 'player_b': env.WEST}) 264 | assert env.state == initial_state, "State should not change when Player B attempts to move out of bounds" 265 | 266 | # GOAL BOUNDARIES WITHOUT POSSESSION 267 | # Test Player A at left goal boundary without possession (row 1) 268 | env.reset() 269 | initial_state = (1, 1, 3, 3, 1) # B has possession 270 | env.state = initial_state 271 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.NOOP}) 272 | assert env.state == initial_state, "Player A should not enter left goal area without possession (row 1)" 273 | 274 | # Test Player A at left goal boundary without possession (row 2) 275 | env.reset() 276 | initial_state = (2, 1, 3, 3, 1) # B has possession 277 | env.state = initial_state 278 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.NOOP}) 279 | assert env.state == initial_state, "Player A should not enter left goal area without possession (row 2)" 280 | 281 | # Test Player B at right goal boundary without possession (row 1) 282 | env.reset() 283 | initial_state = (3, 3, 1, 5, 0) # A has possession 284 | env.state = initial_state 285 | obs, reward, done, truncated, info = env.step({'player_a': env.NOOP, 'player_b': env.EAST}) 286 | assert env.state == initial_state, "Player B should not enter right goal area without possession (row 1)" 287 | 288 | # Test Player B at right goal boundary without possession (row 2) 289 | env.reset() 290 | initial_state = (3, 3, 2, 5, 0) # A has possession 291 | env.state = initial_state 292 | obs, reward, done, truncated, info = env.step({'player_a': env.NOOP, 'player_b': env.EAST}) 293 | assert env.state == initial_state, "Player B should not enter right goal area without possession (row 2)" 294 | 295 | # Test Player B at left goal boundary without possession (row 1) 296 | env.reset() 297 | initial_state = (3, 3, 1, 1, 0) # A has possession 298 | env.state = initial_state 299 | obs, reward, done, truncated, info = env.step({'player_a': env.NOOP, 'player_b': env.WEST}) 300 | assert env.state == initial_state, "Player B should not move beyond left goal boundary without possession (row 1)" 301 | 302 | # Test Player B at left goal boundary without possession (row 2) 303 | env.reset() 304 | initial_state = (3, 3, 2, 1, 0) # A has possession 305 | env.state = initial_state 306 | obs, reward, done, truncated, info = env.step({'player_a': env.NOOP, 'player_b': env.WEST}) 307 | assert env.state == initial_state, "Player B should not move beyond left goal boundary without possession (row 2)" 308 | 309 | # Test Player A at right goal boundary without possession (row 1) 310 | env.reset() 311 | initial_state = (1, 5, 3, 3, 1) # B has possession 312 | env.state = initial_state 313 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.NOOP}) 314 | assert env.state == initial_state, "Player A should not move beyond right goal boundary without possession (row 1)" 315 | 316 | # Test Player A at right goal boundary without possession (row 2) 317 | env.reset() 318 | initial_state = (2, 5, 3, 3, 1) # B has possession 319 | env.state = initial_state 320 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.NOOP}) 321 | assert env.state == initial_state, "Player A should not move beyond right goal boundary without 
possession (row 2)" 322 | 323 | def test_render(env, capsys): 324 | env.reset() 325 | env.render() 326 | captured = capsys.readouterr() 327 | assert "Player A position" in captured.out 328 | assert "Player B position" in captured.out 329 | assert "Ball possession" in captured.out 330 | 331 | def test_possession_change_non_collision(env): 332 | # Test that possession doesn't change when players move without colliding 333 | env.state = (1, 1, 3, 3, 0) # Player A has possession 334 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.WEST}) 335 | assert env.state[4] == 0, "Possession should not change without collision" 336 | 337 | env.state = (1, 1, 3, 3, 1) # Player B has possession 338 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.WEST}) 339 | assert env.state[4] == 1, "Possession should not change without collision" 340 | 341 | def test_simultaneous_goal_attempts(env): 342 | # Both players attempt to score simultaneously 343 | env.state = (1, 5, 1, 1, 0) # A with ball near B's goal, B near A's goal 344 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.WEST}) 345 | assert done['player_a'] and done['player_b'], "Game should end" 346 | assert reward['player_a'] == 1 and reward['player_b'] == -1, "Only A should score" 347 | 348 | env.reset() 349 | env.state = (1, 5, 1, 1, 1) # B with ball near A's goal, A near B's goal 350 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.WEST}) 351 | assert done['player_a'] and done['player_b'], "Game should end" 352 | assert reward['player_a'] == -1 and reward['player_b'] == 1, "Only B should score" 353 | 354 | def test_edge_case_possession(env): 355 | # Test possession change when moving to the same cell from different distances 356 | env.state = (1, 1, 1, 2, 0) # A has ball, both move right 357 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.EAST}) 358 | assert env.state[4] == 0, "A should keep possession as it's closer" 359 | 360 | env.state = (1, 1, 1, 2, 1) # B has ball, both move right 361 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.EAST}) 362 | assert env.state[4] == 1, "B should keep possession even though A moves to the same cell" 363 | 364 | # Test possession change when moving to the same cell from different distances 365 | env.state = (1, 1, 1, 3, 0) # A has ball, both move right 366 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.EAST}) 367 | assert env.state[4] == 0, "A should keep possession as it's closer" 368 | 369 | env.state = (1, 1, 1, 3, 1) # B has ball, both move right 370 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.EAST}) 371 | assert env.state[4] == 1, "B should keep possession even though A moves to the same cell" 372 | 373 | def test_multiple_consecutive_collisions(env): 374 | initial_state = (1, 2, 1, 3, 0) # A has ball, players adjacent 375 | n_samples = 1000 376 | collision_count = 0 377 | possession_changes = 0 378 | last_possession = 0 379 | 380 | for _ in range(n_samples): 381 | env.state = initial_state 382 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.WEST}) 383 | 384 | if env.state[1] == initial_state[1] and env.state[3] == initial_state[3]: 385 | collision_count += 1 386 | 387 | if env.state[4] != last_possession: 388 | possession_changes += 1 389 | 390 | last_possession = 
env.state[4] 391 | 392 | assert collision_count == n_samples, f"All steps should result in collision, got {collision_count}" 393 | possession_ratio = possession_changes / n_samples 394 | assert 0.45 <= possession_ratio <= 0.55, f"Possession should change roughly half the time, got {possession_ratio:.2f}" 395 | 396 | def test_simultaneous_out_of_bounds(env): 397 | # Both players try to move out of bounds simultaneously 398 | env.state = (0, 1, 3, 5, 0) # A at top edge, B at right edge 399 | initial_state = env.state 400 | obs, reward, done, truncated, info = env.step({'player_a': env.NORTH, 'player_b': env.EAST}) 401 | assert env.state == initial_state, "State should not change when both players attempt to move out of bounds" 402 | 403 | # One player tries to move out of bounds, the other moves validly 404 | env.state = (0, 1, 3, 4, 1) # A at top edge, B has possession 405 | obs, reward, done, truncated, info = env.step({'player_a': env.NORTH, 'player_b': env.EAST}) 406 | assert env.state[3] == 5, "B should move right" 407 | assert env.state[0] == 0 and env.state[1] == 1, "A should not move" 408 | 409 | def test_edge_case_goal_scoring(env): 410 | # Test scoring from the edge of the goal area 411 | env.state = (1, 5, 3, 3, 0) # A with ball, at edge of B's goal 412 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.NOOP}) 413 | assert done['player_a'] and done['player_b'], "Game should end" 414 | assert reward['player_a'] == 1 and reward['player_b'] == -1, "A should score" 415 | 416 | # Test scoring from the edge of own goal area 417 | env.reset() 418 | env.state = (2, 1, 3, 3, 0) # A with ball, at edge of own goal 419 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.NOOP}) 420 | assert done['player_a'] and done['player_b'], "Game should end" 421 | assert reward['player_a'] == -1 and reward['player_b'] == 1, "A should score an own goal" -------------------------------------------------------------------------------- /gym_soccer/tests/test_general.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from gym_soccer.envs.soccer_simultaneous_env import SoccerSimultaneousEnv 4 | 5 | @pytest.mark.parametrize("width,height", [ 6 | (5, 4), # Minimum size, even height 7 | (6, 4), # even height 8 | (7, 5), # Odd height 9 | (9, 6), # Even height 10 | (11, 7), # Odd height 11 | ]) 12 | def test_initial_state_distribution(width, height): 13 | env = SoccerSimultaneousEnv(width=width, height=height) 14 | 15 | # Check that the total probability sums to 1 16 | total_prob = sum(prob for prob, _ in env.isd) 17 | assert abs(total_prob - 1.0) < 1e-6, f"Total probability should be 1, but is {total_prob}" 18 | 19 | # Check that all probabilities are equal 20 | first_prob = env.isd[0][0] 21 | assert all(abs(prob - first_prob) < 1e-6 for prob, _ in env.isd), "All probabilities should be equal" 22 | 23 | # Check starting positions 24 | for _, state in env.isd: 25 | row_a, col_a, row_b, col_b, possession = state 26 | 27 | # Check columns 28 | assert col_a == 2, f"Player A should start in column 2, but starts in column {col_a}" 29 | assert col_b == env.width - 3, f"Player B should start in column {env.width - 3}, but starts in column {col_b}" 30 | 31 | # Check rows 32 | if len(env.goal_rows) % 2 == 0: # Even number of goal rows 33 | middle_index = len(env.goal_rows) // 2 34 | valid_rows = [env.goal_rows[middle_index - 1], env.goal_rows[middle_index]] 35 | assert row_a in 
valid_rows, f"Player A should start in row {valid_rows[0]} or {valid_rows[1]}, but starts in row {row_a}" 36 | assert row_b in valid_rows, f"Player B should start in row {valid_rows[0]} or {valid_rows[1]}, but starts in row {row_b}" 37 | assert row_a != row_b, f"Players should not start in the same row, but both start in row {row_a}" 38 | else: # Odd number of goal rows 39 | middle_row = env.goal_rows[len(env.goal_rows) // 2] 40 | assert row_a == middle_row, f"Player A should start in middle row {middle_row}, but starts in row {row_a}" 41 | assert row_b == middle_row, f"Player B should start in middle row {middle_row}, but starts in row {row_b}" 42 | 43 | # Check possession 44 | assert possession in [0, 1], f"Possession should be 0 or 1, but is {possession}" 45 | 46 | # Check number of initial states 47 | if len(env.goal_rows) % 2 == 0: 48 | expected_states = 4 # Two row combinations, two possession states 49 | else: 50 | expected_states = 2 # One middle row, two possession states 51 | 52 | assert len(env.isd) == expected_states, f"Expected {expected_states} initial states, but got {len(env.isd)}" 53 | 54 | @pytest.mark.parametrize("width,height", [ 55 | (5, 4), # Minimum size, even height 56 | (6, 4), # even height 57 | (7, 5), # Odd height 58 | (9, 6), # Even height 59 | (11, 7), # Odd height 60 | ]) 61 | def test_env_P_structure(width, height): 62 | env = SoccerSimultaneousEnv(width=width, height=height) 63 | 64 | # Check that env.P is a dictionary 65 | assert isinstance(env.P, dict), "env.P should be a dictionary" 66 | 67 | # Check that all keys in env.P are integers from 0 to len(env.P) - 1 68 | expected_keys = set(range(len(env.P))) 69 | actual_keys = set(env.P.keys()) 70 | assert actual_keys == expected_keys, f"env.P keys should be integers from 0 to {len(env.P) - 1}" 71 | 72 | # Check that all values in env.P are dictionaries 73 | for state, actions in env.P.items(): 74 | assert isinstance(actions, dict), f"env.P[{state}] should be a dictionary" 75 | 76 | # Check that all action keys are valid 77 | valid_actions = set(env.P[0].keys()) 78 | assert set(actions.keys()) == valid_actions, f"Invalid action keys in env.P[{state}]" 79 | 80 | # Check the structure of each action's transitions 81 | for action, transitions in actions.items(): 82 | assert isinstance(transitions, list), f"env.P[{state}][{action}] should be a list" 83 | for transition in transitions: 84 | assert len(transition) == 4, f"Each transition in env.P[{state}][{action}] should have 4 elements" 85 | prob, next_state, reward, done = transition 86 | assert 0 <= prob <= 1, f"Transition probability should be between 0 and 1, got {prob}" 87 | assert isinstance(next_state, int) and 0 <= next_state < len(env.P), f"Invalid next state: {next_state}" 88 | assert isinstance(reward, (int, float)), f"Reward should be a number, got {type(reward)}" 89 | assert isinstance(done, bool), f"Done flag should be a boolean, got {type(done)}" 90 | 91 | print(f"env.P structure test passed for width={width}, height={height}") 92 | 93 | @pytest.mark.parametrize("width,height", [ 94 | (5, 4), # Minimum size, even height 95 | (6, 4), # even height 96 | (7, 5), # Odd height 97 | (9, 6), # Even height 98 | (11, 7), # Odd height 99 | ]) 100 | def test_initial_state_sampling(width, height): 101 | env = SoccerSimultaneousEnv(width=width, height=height) 102 | n_samples = 10000 103 | state_counts = {} 104 | 105 | for _ in range(n_samples): 106 | env.reset() 107 | state_counts[env.state] = state_counts.get(env.state, 0) + 1 108 | 109 | total_states = 
len(state_counts) 110 | expected_prob = 1 / total_states 111 | expected_count = n_samples / total_states 112 | rtol = 0.1 # 10% relative tolerance 113 | 114 | for state, count in state_counts.items(): 115 | row_a, col_a, row_b, col_b, possession = state 116 | 117 | # Check columns 118 | assert col_a == 2, f"Player A should start in column 2, but starts in column {col_a}" 119 | assert col_b == env.width - 3, f"Player B should start in column {env.width - 3}, but starts in column {col_b}" 120 | 121 | # Check rows 122 | if len(env.goal_rows) % 2 == 0: # Even number of goal rows 123 | middle_index = len(env.goal_rows) // 2 124 | valid_rows = [env.goal_rows[middle_index - 1], env.goal_rows[middle_index]] 125 | assert row_a in valid_rows, f"Player A should start in row {valid_rows[0]} or {valid_rows[1]}, but starts in row {row_a}" 126 | assert row_b in valid_rows, f"Player B should start in row {valid_rows[0]} or {valid_rows[1]}, but starts in row {row_b}" 127 | assert row_a != row_b, f"Players should not start in the same row, but both start in row {row_a}" 128 | else: # Odd number of goal rows 129 | middle_row = env.goal_rows[len(env.goal_rows) // 2] 130 | assert row_a == middle_row, f"Player A should start in middle row {middle_row}, but starts in row {row_a}" 131 | assert row_b == middle_row, f"Player B should start in middle row {middle_row}, but starts in row {row_b}" 132 | 133 | # Check possession 134 | assert possession in [0, 1], f"Possession should be 0 or 1, but is {possession}" 135 | 136 | # Check if the count is approximately equal to the expected count 137 | assert np.isclose(count, expected_count, rtol=rtol), \ 138 | f"State {state} appeared {count} times, expected close to {expected_count}" 139 | 140 | # Check number of initial states 141 | if len(env.goal_rows) % 2 == 0: 142 | expected_states = 4 # Two row combinations, two possession states 143 | else: 144 | expected_states = 2 # One middle row, two possession states 145 | 146 | assert total_states == expected_states, f"Expected {expected_states} initial states, but got {total_states}" 147 | 148 | # Check that the empirical probabilities are close to the expected probability 149 | observed = np.array(list(state_counts.values())) 150 | empirical_probs = observed / n_samples 151 | assert np.allclose(empirical_probs, expected_prob, rtol=rtol), \ 152 | f"Empirical probabilities {empirical_probs} not close to expected {expected_prob}" 153 | 154 | # Check for uniformity using coefficient of variation 155 | cv = np.std(observed) / np.mean(observed) 156 | assert cv < 0.05, f"Distribution not uniform enough. Coefficient of variation: {cv:.3f}" 157 | 158 | 159 | def test_singleagent_a(): 160 | from gym import spaces 161 | width = 5 162 | height = 4 163 | slip_prob = 0.2 164 | n_states = 761 # 4x5 field 165 | n_actions = 5 166 | random_policy = {} 167 | for s in range(n_states): 168 | random_policy[s] = np.random.randint(0, n_actions) 169 | 170 | env = SoccerSimultaneousEnv(width=width, height=height, slip_prob=slip_prob, player_a_policy=None, player_b_policy=random_policy) 171 | assert not env.multiagent, "Environment should not be multiagent, one policy was provided." 172 | 173 | # Check that the observation space is Dict 174 | assert isinstance(env.observation_space, spaces.Dict), "Observation space should be a dictionary." 175 | assert env.observation_space['player_a'].n == n_states, "Observation space should have the correct number of states." 
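    # Extra sanity check (illustrative addition): n_states (761 for the 5x4 pitch) counts
    # only the reachable position/possession combinations plus the single terminal state,
    # since unreachable combinations are pruned when the environment builds its state
    # space, so any sampled observation index should fall inside that range.
    sampled_obs = env.observation_space['player_a'].sample()
    assert 0 <= sampled_obs < n_states, "Sampled observations should be valid state indices."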
176 | assert 'player_b' not in env.observation_space, "Observation space should not contain player_b." 177 | 178 | # Check that the action space is Dict 179 | assert isinstance(env.action_space, spaces.Dict), "Action space should be a dictionary." 180 | assert env.action_space['player_a'].n == n_actions, "Action space should have the correct number of actions." 181 | assert 'player_b' not in env.action_space, "Action space should not contain player_b." 182 | 183 | obs, info = env.reset() 184 | assert isinstance(obs, dict), "Observation should be a dictionary, single agent mode." 185 | assert 'player_a' in obs, "Observation should contain player_a." 186 | assert 'player_b' not in obs, "Observation should not contain player_b." 187 | assert 0 <= obs['player_a'] < n_states, "Observation should be a state index." 188 | assert isinstance(info, dict), "Info should be a dictionary." 189 | assert 'player_a' in info, "Info should contain player_a." 190 | assert 'player_b' not in info, "Info should not contain player_b." 191 | 192 | random_action = np.random.randint(0, n_actions) 193 | obs, reward, terminated, truncated, info = env.step({'player_a': random_action}) 194 | assert isinstance(obs, dict), "Observation should be a dictionary, single agent mode." 195 | assert 'player_a' in obs, "Observation should contain player_a." 196 | assert 'player_b' not in obs, "Observation should not contain player_b." 197 | assert 0 <= obs['player_a'] < n_states, "Observation should be a state index." 198 | assert isinstance(reward, dict), "Reward should be a dictionary." 199 | assert 'player_a' in reward, "Reward should contain player_a." 200 | assert 'player_b' not in reward, "Reward should not contain player_b." 201 | assert isinstance(terminated, dict), "Terminated should be a dictionary." 202 | assert 'player_a' in terminated, "Terminated should contain player_a." 203 | assert 'player_b' not in terminated, "Terminated should not contain player_b." 204 | assert isinstance(truncated, dict), "Truncated should be a dictionary." 205 | assert 'player_a' in truncated, "Truncated should contain player_a." 206 | assert 'player_b' not in truncated, "Truncated should not contain player_b." 207 | assert isinstance(info, dict), "Info should be a dictionary." 208 | assert 'player_a' in info, "Info should contain player_a." 209 | assert 'player_b' not in info, "Info should not contain player_b." 210 | 211 | def test_singleagent_b(): 212 | from gym import spaces 213 | width = 5 214 | height = 4 215 | slip_prob = 0.2 216 | n_states = 761 # 4x5 field 217 | n_actions = 5 218 | random_policy = {} 219 | for s in range(n_states): 220 | random_policy[s] = np.random.randint(0, n_actions) 221 | 222 | env = SoccerSimultaneousEnv(width=width, height=height, slip_prob=slip_prob, player_a_policy=random_policy, player_b_policy=None) 223 | assert not env.multiagent, "Environment should not be multiagent, one policy was provided." 224 | 225 | # Check that the observation space is Dict 226 | assert isinstance(env.observation_space, spaces.Dict), "Observation space should be a dictionary." 227 | assert env.observation_space['player_b'].n == n_states, "Observation space should have the correct number of states." 228 | assert 'player_a' not in env.observation_space, "Observation space should not contain player_a." 229 | 230 | # Check that the action space is Dict 231 | assert isinstance(env.action_space, spaces.Dict), "Action space should be a dictionary." 
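    # The five discrete actions are NOOP, NORTH, SOUTH, EAST and WEST
    # (indices 0-4, see SoccerSimultaneousEnv.ACTION_STRING).
    assert env.action_space['player_b'].contains(SoccerSimultaneousEnv.NOOP), "NOOP should be a valid action."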
232 | assert env.action_space['player_b'].n == n_actions, "Action space should have the correct number of actions." 233 | assert 'player_a' not in env.action_space, "Action space should not contain player_a." 234 | 235 | obs, info = env.reset() 236 | assert isinstance(obs, dict), "Observation should be a dictionary, single agent mode." 237 | assert 'player_b' in obs, "Observation should contain player_b." 238 | assert 'player_a' not in obs, "Observation should not contain player_a." 239 | assert 0 <= obs['player_b'] < n_states, "Observation should be a state index." 240 | assert isinstance(info, dict), "Info should be a dictionary." 241 | assert 'player_b' in info, "Info should contain player_b." 242 | assert 'player_a' not in info, "Info should not contain player_a." 243 | 244 | random_action = np.random.randint(0, n_actions) 245 | obs, reward, terminated, truncated, info = env.step({'player_b': random_action}) 246 | assert isinstance(obs, dict), "Observation should be a dictionary, single agent mode." 247 | assert 'player_b' in obs, "Observation should contain player_b." 248 | assert 'player_a' not in obs, "Observation should not contain player_a." 249 | assert 0 <= obs['player_b'] < n_states, "Observation should be a state index." 250 | assert isinstance(reward, dict), "Reward should be a dictionary." 251 | assert 'player_b' in reward, "Reward should contain player_b." 252 | assert 'player_a' not in reward, "Reward should not contain player_a." 253 | assert isinstance(terminated, dict), "Terminated should be a dictionary." 254 | assert 'player_b' in terminated, "Terminated should contain player_b." 255 | assert 'player_a' not in terminated, "Terminated should not contain player_a." 256 | assert isinstance(truncated, dict), "Truncated should be a dictionary." 257 | assert 'player_b' in truncated, "Truncated should contain player_b." 258 | assert 'player_a' not in truncated, "Truncated should not contain player_a." 259 | assert isinstance(info, dict), "Info should be a dictionary." 260 | assert 'player_b' in info, "Info should contain player_b." 261 | assert 'player_a' not in info, "Info should not contain player_a." 262 | 263 | def test_multiagent(): 264 | from gym import spaces 265 | width = 5 266 | height = 4 267 | slip_prob = 0.2 268 | n_states = 761 # 4x5 field 269 | n_actions = 5 270 | 271 | env = SoccerSimultaneousEnv(width=width, height=height, slip_prob=slip_prob, player_a_policy=None, player_b_policy=None) 272 | assert env.multiagent, "Environment should be multiagent, no policies were provided." 273 | 274 | # Check that the observation space is Dict 275 | assert isinstance(env.observation_space, spaces.Dict), "Observation space should be a dictionary." 276 | assert env.observation_space['player_a'].n == n_states, "Observation space should have the correct number of states for player_a." 277 | assert env.observation_space['player_b'].n == n_states, "Observation space should have the correct number of states for player_b." 278 | 279 | # Check that the action space is Dict 280 | assert isinstance(env.action_space, spaces.Dict), "Action space should be a dictionary." 281 | assert env.action_space['player_a'].n == n_actions, "Action space should have the correct number of actions for player_a." 282 | assert env.action_space['player_b'].n == n_actions, "Action space should have the correct number of actions for player_b." 283 | 284 | obs, info = env.reset() 285 | assert isinstance(obs, dict), "Observation should be a dictionary, multiagent mode." 
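    # Note: in the current implementation both players receive the same global state
    # index as their observation; per-player rotation of the perspective is not
    # implemented yet (see SoccerSimultaneousEnv._state_to_observation).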
286 | assert 'player_a' in obs and 'player_b' in obs, "Observation should contain both player_a and player_b." 287 | assert 0 <= obs['player_a'] < n_states and 0 <= obs['player_b'] < n_states, "Observations should be state indices." 288 | assert isinstance(info, dict), "Info should be a dictionary." 289 | assert 'player_a' in info and 'player_b' in info, "Info should contain both player_a and player_b." 290 | 291 | random_action_a = np.random.randint(0, n_actions) 292 | random_action_b = np.random.randint(0, n_actions) 293 | obs, reward, terminated, truncated, info = env.step({'player_a': random_action_a, 'player_b': random_action_b}) 294 | assert isinstance(obs, dict), "Observation should be a dictionary, multiagent mode." 295 | assert 'player_a' in obs and 'player_b' in obs, "Observation should contain both player_a and player_b." 296 | assert 0 <= obs['player_a'] < n_states and 0 <= obs['player_b'] < n_states, "Observations should be state indices." 297 | assert isinstance(reward, dict), "Reward should be a dictionary." 298 | assert 'player_a' in reward and 'player_b' in reward, "Reward should contain both player_a and player_b." 299 | assert isinstance(reward['player_a'], float) and isinstance(reward['player_b'], float), "Rewards should be floats." 300 | assert isinstance(terminated, dict) and isinstance(terminated['player_a'], bool) and isinstance(terminated['player_b'], bool), "Terminated should be a dictionary with boolean values." 301 | assert isinstance(truncated, dict) and isinstance(truncated['player_a'], bool) and isinstance(truncated['player_b'], bool), "Truncated should be a dictionary with boolean values." 302 | assert isinstance(info, dict) and 'player_a' in info and 'player_b' in info, "Info should be a dictionary with both player_a and player_b." 
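# The value-iteration tests below import value_iteration from gym_soccer.utils.planners,
# which is not shown in this listing. For reference, a minimal sketch that is consistent
# with how these tests call it -- value_iteration(env, theta, discount_factor) returning
# (policy, V, Q, iteration_count) and reading env.P[s][a] as a list of
# (prob, next_state, reward, done) tuples with integer action keys (the single-agent
# form exercised here) -- could look like the helper below. It is an illustrative
# assumption, not the repository's actual planner; it reuses the numpy import at the
# top of this module.
def _value_iteration_sketch(env, theta=1e-10, discount_factor=0.99):
    n_states = len(env.P)
    n_actions = len(env.P[0])
    V = np.zeros(n_states)
    Q = np.zeros((n_states, n_actions))
    iterations = 0
    while True:
        iterations += 1
        for s in range(n_states):
            for a in range(n_actions):
                # Expected return of action a in state s; bootstrap only from
                # non-terminal successors.
                Q[s, a] = sum(prob * (reward + discount_factor * V[next_state] * (not done))
                              for prob, next_state, reward, done in env.P[s][a])
        new_V = Q.max(axis=1)
        if np.max(np.abs(new_V - V)) < theta:
            break
        V = new_V
    policy = np.argmax(Q, axis=1)  # greedy policy, indexable by integer observations
    return policy, new_V, Q, iterations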
303 | 304 | def test_value_iteration_against_stand_policy_for_player_a(): 305 | from gym_soccer.utils.policies import get_stand_policy 306 | from gym_soccer.utils.planners import value_iteration 307 | 308 | 309 | width = 5 310 | height = 4 311 | slip_prob = 0.2 312 | n_states = 761 # 4x5 field 313 | 314 | # Create stand policy for player B 315 | stand_policy = get_stand_policy(n_states) 316 | 317 | # Create the environment with player B using the stand policy 318 | env = SoccerSimultaneousEnv( 319 | width=width, height=height, slip_prob=slip_prob, 320 | player_a_policy=None, player_b_policy=stand_policy 321 | ) 322 | 323 | # Run value iteration to get the optimal policy for player A 324 | optimal_policy, optimal_V, optimal_Q, cc = value_iteration(env, theta=1e-10, discount_factor=0.99) 325 | 326 | # Test the optimal policy against the stand policy 327 | n_episodes = 1000 328 | wins = 0 329 | 330 | for _ in range(n_episodes): 331 | obs, _ = env.reset() 332 | done = False 333 | while not done: 334 | action = optimal_policy[obs['player_a']] 335 | obs, reward, terminated, truncated, _ = env.step({'player_a': action}) 336 | done = terminated['player_a'] or truncated['player_a'] 337 | if terminated['player_a'] and reward['player_a'] > 0: 338 | wins += 1 339 | 340 | win_rate = wins / n_episodes 341 | assert win_rate == 1.0, f"Expected 100% win rate, but got {win_rate * 100}%" 342 | 343 | 344 | def test_value_iteration_against_random_policy_for_player_a(): 345 | from gym_soccer.utils.policies import get_random_policy 346 | from gym_soccer.utils.planners import value_iteration 347 | 348 | width = 5 349 | height = 4 350 | slip_prob = 0.2 351 | n_states = 761 # 4x5 field 352 | n_actions = 5 353 | 354 | # Create random policy for player B 355 | random_policy = get_random_policy(n_states, n_actions, seed=42) 356 | 357 | # Create the environment with player B using the random policy 358 | env = SoccerSimultaneousEnv( 359 | width=width, height=height, slip_prob=slip_prob, 360 | player_a_policy=None, player_b_policy=random_policy 361 | ) 362 | 363 | # Run value iteration to get the optimal policy for player A 364 | optimal_policy, optimal_V, optimal_Q, cc = value_iteration(env, theta=1e-10, discount_factor=0.99) 365 | 366 | # Test the optimal policy against the random policy 367 | n_episodes = 1000 368 | wins = 0 369 | 370 | for _ in range(n_episodes): 371 | obs, _ = env.reset() 372 | done = False 373 | while not done: 374 | action = optimal_policy[obs['player_a']] 375 | obs, reward, terminated, truncated, _ = env.step({'player_a': action}) 376 | done = terminated['player_a'] or truncated['player_a'] 377 | if terminated['player_a'] and reward['player_a'] > 0: 378 | wins += 1 379 | 380 | win_rate = wins / n_episodes 381 | assert win_rate > 0.95, f"Expected win rate > 95%, but got {win_rate * 100}%" 382 | 383 | def test_value_iteration_against_stand_policy_for_player_b(): 384 | from gym_soccer.utils.policies import get_stand_policy 385 | from gym_soccer.utils.planners import value_iteration 386 | 387 | width = 5 388 | height = 4 389 | slip_prob = 0.2 390 | n_states = 761 # 4x5 field 391 | 392 | # Create stand policy for player A 393 | stand_policy = get_stand_policy(n_states) 394 | 395 | # Create the environment with player A using the stand policy 396 | env = SoccerSimultaneousEnv( 397 | width=width, height=height, slip_prob=slip_prob, 398 | player_a_policy=stand_policy, player_b_policy=None 399 | ) 400 | 401 | # Run value iteration to get the optimal policy for player B 402 | optimal_policy, optimal_V, 
optimal_Q, cc = value_iteration(env, theta=1e-10, discount_factor=0.99) 403 | 404 | # Test the optimal policy against the stand policy 405 | n_episodes = 1000 406 | wins = 0 407 | 408 | for _ in range(n_episodes): 409 | obs, _ = env.reset() 410 | done = False 411 | while not done: 412 | action = optimal_policy[obs['player_b']] 413 | obs, reward, terminated, truncated, _ = env.step({'player_b': action}) 414 | done = terminated['player_b'] or truncated['player_b'] 415 | if terminated['player_b'] and reward['player_b'] > 0: 416 | wins += 1 417 | 418 | win_rate = wins / n_episodes 419 | assert win_rate == 1.0, f"Expected 100% win rate, but got {win_rate * 100}%" 420 | 421 | def test_value_iteration_against_random_policy_for_player_b(): 422 | from gym_soccer.utils.policies import get_random_policy 423 | from gym_soccer.utils.planners import value_iteration 424 | 425 | width = 5 426 | height = 4 427 | slip_prob = 0.2 428 | n_states = 761 # 4x5 field 429 | n_actions = 5 430 | 431 | # Create random policy for player A 432 | random_policy = get_random_policy(n_states, n_actions, seed=42) 433 | 434 | # Create the environment with player A using the random policy 435 | env = SoccerSimultaneousEnv( 436 | width=width, height=height, slip_prob=slip_prob, 437 | player_a_policy=random_policy, player_b_policy=None 438 | ) 439 | 440 | # Run value iteration to get the optimal policy for player B 441 | optimal_policy, optimal_V, optimal_Q, cc = value_iteration(env, theta=1e-10, discount_factor=0.99) 442 | 443 | # Test the optimal policy against the random policy 444 | n_episodes = 1000 445 | wins = 0 446 | 447 | for _ in range(n_episodes): 448 | obs, _ = env.reset() 449 | done = False 450 | while not done: 451 | action = optimal_policy[obs['player_b']] 452 | obs, reward, terminated, truncated, _ = env.step({'player_b': action}) 453 | done = terminated['player_b'] or truncated['player_b'] 454 | if terminated['player_b'] and reward['player_b'] > 0: 455 | wins += 1 456 | 457 | win_rate = wins / n_episodes 458 | assert win_rate > 0.95, f"Expected win rate > 95%, but got {win_rate * 100}%" 459 | -------------------------------------------------------------------------------- /gym_soccer/envs/soccer_simultaneous_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.toy_text.utils import categorical_sample 3 | from gym import spaces 4 | 5 | class SoccerSimultaneousEnv: 6 | # Define constants for actions 7 | 8 | NOOP = 0 9 | NORTH = 1 10 | SOUTH = 2 11 | EAST = 3 12 | WEST = 4 13 | ACTION_STRING = ['NOOP', 'NORTH', 'SOUTH', 'EAST', 'WEST'] 14 | ACTION_STRING_TO_INT = {k: v for v, k in enumerate(ACTION_STRING)} 15 | 16 | ACTION_STRING_TO_MOVE = { 17 | ACTION_STRING[NOOP]: (0, 0), 18 | ACTION_STRING[NORTH]: (0, -1), 19 | ACTION_STRING[SOUTH]: (0, 1), 20 | ACTION_STRING[EAST]: (1, 0), 21 | ACTION_STRING[WEST]: (-1, 0), 22 | } 23 | MOVE_TO_ACTION_STRING = {v: k for k, v in ACTION_STRING_TO_MOVE.items()} 24 | ACTION_INT_TO_MOVE = { 25 | NOOP: (0, 0), 26 | NORTH: (0, -1), 27 | SOUTH: (0, 1), 28 | EAST: (1, 0), 29 | WEST: (-1, 0), 30 | } 31 | MOVE_TO_ACTION_INT = {v: k for k, v in ACTION_INT_TO_MOVE.items()} 32 | TERMINAL_STATE = (-1, -1, -1, -1, -1) 33 | 34 | 35 | def __init__(self, width=5, height=4, slip_prob=0.0, player_a_policy=None, player_b_policy=None, seed=0): 36 | 37 | # Assert that both policies cannot be set simultaneously 38 | assert not (player_a_policy is not None and player_b_policy is not None), "Both players cannot have a policy. 
At least one must be None." 39 | # if player_a_policy is not None: 40 | # assert isinstance(player_a_policy, dict), "Player A policy must be a dictionary." 41 | # if player_b_policy is not None: 42 | # assert isinstance(player_b_policy, dict), "Player B policy must be a dictionary." 43 | 44 | # Minimum pitch size is 5x4 45 | assert width >= 5, "Width must be at least 5 columns." 46 | assert height >= 4, "Height must be at least 4 rows." 47 | 48 | self.width = width + 2 # +2 for the columns where goals are located 49 | self.height = height 50 | self.slip_prob = slip_prob 51 | self.seed = seed 52 | self.player_a_policy = player_a_policy 53 | self.player_b_policy = player_b_policy 54 | self.multiagent = player_a_policy is None and player_b_policy is None 55 | self.return_agent = ['player_a', 'player_b'] if self.multiagent else ['player_a'] \ 56 | if player_a_policy is None else ['player_b'] 57 | self.np_random = np.random.RandomState() 58 | self.np_random.seed(self.seed) 59 | 60 | self.goal_rows = ((self.height - 1) // 2, self.height // 2) if self.height % 2 == 0 else (self.height // 2 - 1, self.height // 2, self.height // 2 + 1) 61 | self.goal_cols = (0, self.width - 1) 62 | 63 | self.unreachable_states, self.goal_states = [], {} # containing rewards for player A 64 | self.state_space, self.nS = {}, 1 65 | self.state_space[self.TERMINAL_STATE] = 0 # initialize the terminal state 66 | for xa in range(self.height): 67 | for ya in range(self.width): 68 | for xb in range(self.height): 69 | for yb in range(self.width): 70 | for p in range(2): 71 | state_tuple = (xa, ya, xb, yb, p) 72 | 73 | # Top/bottom left/right corners (goal columns but not goal) 74 | if ya in self.goal_cols and xa not in self.goal_rows or \ 75 | yb in self.goal_cols and xb not in self.goal_rows: 76 | self.unreachable_states.append(state_tuple) 77 | continue 78 | 79 | # Goals without possession 80 | if xa in self.goal_rows and ya in self.goal_cols and p != 0 or \ 81 | xb in self.goal_rows and yb in self.goal_cols and p != 1: 82 | self.unreachable_states.append(state_tuple) 83 | continue 84 | 85 | # Players occupy the same cell 86 | if xa == xb and ya == yb: 87 | self.unreachable_states.append(state_tuple) 88 | continue 89 | 90 | # Terminal states, goals (with possession) 91 | if xa in self.goal_rows and ya in self.goal_cols and p == 0 or \ 92 | xb in self.goal_rows and yb in self.goal_cols and p == 1: 93 | # Goal for player A, or player B own goal 94 | ga = p == 0 and xa in self.goal_rows and ya == self.width - 1 or \ 95 | p == 1 and xb in self.goal_rows and yb == self.width - 1 96 | # Goal for player B, or player A own goal 97 | gb = p == 1 and xb in self.goal_rows and yb == 0 or \ 98 | p == 0 and xa in self.goal_rows and ya == 0 99 | 100 | assert ga or gb, "At least one goal must have been scored to be here" 101 | assert not (ga and gb), "We cannot have both goals scored" 102 | self.goal_states[state_tuple] = 1.0 if ga else -1.0 if gb else 0.0 103 | continue 104 | 105 | self.state_space[state_tuple] = self.nS 106 | self.nS += 1 107 | 108 | assert self.nS == len(self.state_space), "State space should be the same length as the number of states" 109 | self._reverse_state_space = {v: k for k, v in self.state_space.items()} 110 | # Initialize the state space and action space 111 | # self.n_states = self.width * self.height * self.width * self.height * 2 # width * height * width * height * 2 (possession) 112 | self.nA = len(self.ACTION_STRING) # Actions: UP, DOWN, LEFT, RIGHT, STAND 113 | 114 | # # TODO: this is a test, remove it 
115 | # # Generate a random policy for player B 116 | # import pickle 117 | # self.player_b_policy = {} 118 | # for s in range(self.n_states): 119 | # self.player_b_policy[s] = self.np_random.randint(0, self.n_actions) 120 | # self.multiagent = False 121 | # # Save dictionary to a file 122 | # with open('random_policy_5x4.pkl', 'wb') as f: 123 | # pickle.dump(self.player_b_policy, f) 124 | 125 | # Update observation space to be Discrete 126 | self.observation_space = spaces.Dict({ 127 | a: spaces.Discrete(self.nS) for a in self.return_agent 128 | }) 129 | self.action_space = spaces.Dict({ 130 | a: spaces.Discrete(self.nA) for a in self.return_agent 131 | }) 132 | 133 | # Define the initial state distribution 134 | self.isd = self._generate_isd() 135 | 136 | # Define transition dynamics and create observation cache 137 | self.P, self.P_readable, self.Pmat, self.Rmat = self._initialize_transition_dynamics() 138 | 139 | # Add a flag to track if reset has been called 140 | self.needs_reset = True 141 | 142 | # Initialize self.state and self.observations 143 | self.state = None 144 | self.observations = None 145 | 146 | def _generate_isd(self): 147 | distribution = [] 148 | col_a = 2 # Player A starts 2 columns from their goal 149 | col_b = self.width - 3 # Player B starts 2 columns from their goal 150 | 151 | if len(self.goal_rows) % 2 == 0: # Even number of goal rows 152 | middle_index = len(self.goal_rows) // 2 153 | row_options = [self.goal_rows[middle_index - 1], self.goal_rows[middle_index]] 154 | for row_a in row_options: 155 | row_b = row_options[1] if row_a == row_options[0] else row_options[0] 156 | for possession in range(2): # 0: A, 1: B 157 | state = (row_a, col_a, row_b, col_b, possession) 158 | distribution.append((0.25, state)) 159 | else: # Odd number of goal rows 160 | middle_row = self.goal_rows[len(self.goal_rows) // 2] 161 | for possession in range(2): # 0: A, 1: B 162 | state = (middle_row, col_a, middle_row, col_b, possession) 163 | distribution.append((0.5, state)) 164 | 165 | return distribution 166 | 167 | def _initialize_transition_dynamics(self): 168 | P = {} 169 | P_readable = {} 170 | Pmat = np.zeros([self.nS, self.nS, self.nA, self.nA]) if self.multiagent else np.zeros([self.nS, self.nS, self.nA]) 171 | Rmat = np.zeros([self.nS, self.nA, self.nA]) if self.multiagent else np.zeros([self.nS, self.nA]) 172 | 173 | for xa in range(self.height): 174 | for ya in range(self.width): 175 | for xb in range(self.height): 176 | for yb in range(self.width): 177 | for p in range(2): # 0: A, 1: B 178 | st = (xa, ya, xb, yb, p) 179 | if st in self.unreachable_states: 180 | continue # skip unreachable states 181 | 182 | s = self._state_to_observation(st) 183 | P[s] = {} 184 | P_readable[st] = {} 185 | 186 | # All actions integer for a and b, sample a policy if provided 187 | aaa = list(range(self.nA)) if self.player_a_policy is None else [self.player_a_policy[s]] 188 | aab = list(range(self.nA)) if self.player_b_policy is None else [self.player_b_policy[s]] 189 | for aa in aaa: 190 | asa = self.ACTION_STRING[aa] 191 | 192 | for ab in aab: 193 | asb = self.ACTION_STRING[ab] 194 | 195 | # Original joint action, integer and string 196 | ja = (aa, ab) 197 | jas = (asa, asb) 198 | 199 | transitions = [] 200 | transitions_readable = [] 201 | 202 | # Calculate intended moves for a and b, as well as orthogonal slips 203 | ma = self.ACTION_INT_TO_MOVE[aa] 204 | mb = self.ACTION_INT_TO_MOVE[ab] 205 | mas = [(-ma[1], ma[0]), (ma[1], -ma[0])] 206 | mbs = [(-mb[1], mb[0]), (mb[1], -mb[0])] 207 
| 208 | # All move combinations to consider 209 | amc = [ 210 | # No slip 211 | (ma, mb, (1 - self.slip_prob) * (1 - self.slip_prob)), 212 | # B slips, A does not 213 | (ma, mbs[0], (1 - self.slip_prob) * self.slip_prob * 0.5), 214 | (ma, mbs[1], (1 - self.slip_prob) * self.slip_prob * 0.5), 215 | # A slips, B does not 216 | (mas[0], mb, self.slip_prob * (1 - self.slip_prob) * 0.5), 217 | (mas[1], mb, self.slip_prob * (1 - self.slip_prob) * 0.5), 218 | # Both slip 219 | (mas[0], mbs[0], self.slip_prob * self.slip_prob * 0.25), 220 | (mas[0], mbs[1], self.slip_prob * self.slip_prob * 0.25), 221 | (mas[1], mbs[0], self.slip_prob * self.slip_prob * 0.25), 222 | (mas[1], mbs[1], self.slip_prob * self.slip_prob * 0.25), 223 | ] 224 | 225 | for ma, mb, mp in amc: 226 | if mp == 0: 227 | continue # remove zero probability transitions 228 | 229 | # Joint move action 230 | jma = (ma, mb) 231 | 232 | # Get all next state possible outcomes for the action, and move (slip) 233 | nso = self._get_next_state(st, ja, jma) 234 | for nsp, ns in nso: 235 | if st == ns and st in self.goal_states: 236 | d, r = True, 0.0 237 | elif st != ns and ns in self.goal_states: 238 | d, r = True, self.goal_states[ns] 239 | else: 240 | d, r = False, 0.0 241 | p = mp * nsp 242 | # flip reward for player B in single agent case 243 | if not self.multiagent and 'player_b' in self.return_agent: 244 | r = -1 * r 245 | transitions.append(( 246 | p, # probability of the move (slip), and next_state 247 | self._state_to_observation(ns), # next state 248 | r, # reward 249 | d # done 250 | )) 251 | transitions_readable.append(( 252 | p, # probability of the move (slip), and next_state 253 | ns, # next state 254 | r, # reward 255 | d # done 256 | )) 257 | # if we need to account for joint actions 258 | if self.multiagent: 259 | P[s][ja] = transitions 260 | Rmat[s][ja[0]][ja[1]] = 0 # Initialize reward to 0 261 | for prob, next_state, reward, done in transitions: 262 | Pmat[s][next_state][ja[0]][ja[1]] += prob 263 | Rmat[s][ja[0]][ja[1]] += prob * reward # Weighted sum of rewards 264 | P_readable[st][jas] = transitions_readable 265 | # if we need to account for individual actions a and b 266 | elif self.player_a_policy is None and self.player_b_policy is not None: 267 | P[s][aa] = transitions 268 | Rmat[s][aa] = 0 # Initialize reward to 0 269 | for prob, next_state, reward, done in transitions: 270 | Pmat[s][next_state][aa] += prob 271 | Rmat[s][aa] += prob * reward # Weighted sum of rewards 272 | P_readable[st][asa] = transitions_readable 273 | elif self.player_b_policy is None and self.player_a_policy is not None: 274 | P[s][ab] = transitions 275 | Rmat[s][ab] = 0 # Initialize reward to 0 276 | for prob, next_state, reward, done in transitions: 277 | Pmat[s][next_state][ab] += prob 278 | Rmat[s][ab] += prob * reward # Weighted sum of rewards 279 | P_readable[st][asb] = transitions_readable 280 | # error case 281 | else: 282 | raise ValueError("No policy provided for both players, but action is an integer") 283 | 284 | # Assert that probabilities sum to 1 285 | tp = sum(t[0] for t in transitions) 286 | assert abs(tp - 1.0) < 1e-6, \ 287 | f"Probabilities do not sum to 1 for state {st}, actions {aa}, {ab}. 
Sum: {tp}" 288 | 289 | # P is the compact representation of the transition dynamics 290 | # P_readable is the same but with the states represented as tuples 291 | # P_readable terminal states are the tuples that are in goal_states 292 | # P has 0 as the terminal states 293 | return P, P_readable, Pmat, Rmat 294 | 295 | 296 | def _get_next_state(self, st, ja, jma): 297 | xa, ya, xb, yb, p = st 298 | 299 | # terminal states 300 | if st in self.goal_states: 301 | return [(1.0, st)] 302 | 303 | # original action integers and move action (including slips) 304 | aa, ab = ja 305 | maa, mab = jma 306 | 307 | # Get potential next positions based on move actions and ball possession 308 | nxa, nya = self._next_cell(xa, ya, maa, p == 0) 309 | nxb, nyb = self._next_cell(xb, yb, mab, p == 1) 310 | 311 | # Handle collisions and possession changes 312 | nso = [] 313 | 314 | # Collision case 1: Players moving through each other 315 | if (xa == xb and 316 | abs(ya - yb) == 1 and 317 | nya == yb and 318 | nyb == ya) or \ 319 | (ya == yb and 320 | abs(xa - xb) == 1 and 321 | nxa == xb and 322 | nxb == xa): 323 | 324 | # Players stay in their original positions, possession changes randomly 325 | assert not (xa == xb and ya == yb), "Players should not be in the same cell" 326 | nso.append((0.5, (xa, ya, xb, yb, 0))) # A gets possession 327 | nso.append((0.5, (xa, ya, xb, yb, 1))) # B gets possession 328 | 329 | # Collision case 2: One player moves into the opponent's cell, the opponent stands 330 | elif (nxa == xb and nya == yb and ab == self.NOOP) or \ 331 | (nxb == xa and nyb == ya and aa == self.NOOP): 332 | 333 | # Nobody moves, they bounce back to their original location. Possession is changed. 334 | assert not (xa == xb and ya == yb), "Players should not be in the same cell" 335 | nso.append((1.0, (xa, ya, xb, yb, 1 - p))) 336 | 337 | # Collision case 3: Players moving to the same cell through a bounce 338 | elif (xa == nxa and ya == nya and aa != self.NOOP and nxb == xa and nyb == ya) or \ 339 | (xb == nxb and yb == nyb and ab != self.NOOP and nxa == xb and nya == yb): 340 | 341 | # Bounce back both players, random possession 342 | assert not (xa == xb and ya == yb), "Players should not be in the same cell" 343 | nso.append((0.5, (xa, ya, xb, yb, 0))) 344 | nso.append((0.5, (xa, ya, xb, yb, 1))) 345 | 346 | # Collision case 4: Players moving to the same empty cell 347 | elif nxa == nxb and nya == nyb: 348 | assert not (xa == nxb and ya == nyb), "Players should not be in the same cell" 349 | assert not (nxa == xb and nya == yb), "Players should not be in the same cell" 350 | 351 | # Bounce back player a, player b moves, random possession 352 | nso.append((0.25, (xa, ya, nxb, nyb, 0))) 353 | nso.append((0.25, (xa, ya, nxb, nyb, 1))) 354 | # Bounce back player b, player a moves, random possession 355 | nso.append((0.25, (nxa, nya, xb, yb, 0))) 356 | nso.append((0.25, (nxa, nya, xb, yb, 1))) 357 | else: 358 | # No collision: players move to their new positions 359 | assert not (nxa == nxb and nya == nyb), "Players should not be in the same cell" 360 | nso.append((1.0, (nxa, nya, nxb, nyb, p))) 361 | 362 | return nso 363 | 364 | def _next_cell(self, x, y, ma, p): 365 | nx = max(0, min(self.height - 1, x + ma[1])) # Clamp to pitch height boundaries 366 | ny = y + ma[0] # assume the move in y 367 | 368 | # Revert x edges unless there is a goal (and not out of bounds) 369 | xoob = ny == 0 or ny == self.width - 1 370 | goal = xoob and nx in self.goal_rows and p # has possession 371 | if xoob and not goal: 372 | 
ny = y # Bounce back 373 | return nx, ny 374 | 375 | def step(self, action): 376 | assert not self.needs_reset, "Please reset the environment before taking a step" 377 | assert isinstance(action, dict), "Action must be a dictionary" 378 | assert len(action) == 1 or len(action) == 2, "Action must be a dictionary of length 1 or 2" 379 | assert self.multiagent or self.player_a_policy is not None or self.player_b_policy is not None, "Multiagent environment or policy for one player must be provided" 380 | assert self.player_a_policy is not None or 'player_a' in action, "A policy for player_a must be provided" 381 | assert self.player_b_policy is not None or 'player_b' in action, "A policy for player_b must be provided" 382 | 383 | only_agent = None 384 | if self.multiagent: 385 | assert (isinstance(action, dict) and len(action) == 2), "Action must be a dictionary of length 2 for multiagent case" 386 | assert 'player_a' in action and 'player_b' in action, "Action must contain both 'player_a' and 'player_b'" 387 | else: 388 | assert (isinstance(action, dict) and len(action) == 1), "Action must be a dictionary of length 1 for single agent case" 389 | assert 'player_a' in action or 'player_b' in action, "Action must contain either 'player_a' or 'player_b'" 390 | assert not ('player_a' in action and 'player_b' in action), "Action must contain only one of 'player_a' or 'player_b'" 391 | only_agent = 'player_a' if self.player_a_policy is None else 'player_b' 392 | 393 | action_readable = (self.ACTION_STRING[action['player_a']], self.ACTION_STRING[action['player_b']]) if self.multiagent else self.ACTION_STRING[action[only_agent]] 394 | transitions = self.P_readable[self.state][action_readable] 395 | i = categorical_sample([t[0] for t in transitions], self.np_random) 396 | prob, self.state, reward, done = transitions[i] 397 | self.observations = {a: self._state_to_observation(self.state) for a in self.return_agent} 398 | self.lastaction = action 399 | self.timestep += 1 400 | rewards = {a: reward for a in self.return_agent} 401 | if self.multiagent: 402 | rewards['player_b'] *= -1 403 | dones = {a: done for a in self.return_agent} 404 | truncateds = {a: self.timestep >= 100 for a in self.return_agent} 405 | infos = {a: {"p": np.round(prob, 2)} for a in self.return_agent} 406 | self.needs_reset = any(dones.values()) or any(truncateds.values()) 407 | 408 | return self.observations, rewards, dones, truncateds, infos 409 | 410 | def reset(self, seed=None, options=None): 411 | if seed is not None: 412 | self.np_random.seed(seed) 413 | 414 | i = categorical_sample([is_[0] for is_ in self.isd], self.np_random) 415 | p, self.state = self.isd[i] 416 | # currently the integer representation of the state 417 | # later we need the rotation, then integer (both player "see" the same perspective) 418 | # also this observation is the same integer for both, later it won't 419 | self.observations = {a: self._state_to_observation(self.state) for a in self.return_agent} 420 | infos = {a: {"p": np.round(p, 2)} for a in self.return_agent} 421 | self.lastaction = None 422 | self.needs_reset = False 423 | self.timestep = 0 424 | return self.observations, infos 425 | 426 | def render(self): 427 | # Use self.state directly (it's already a dictionary) 428 | print(self.state) 429 | xa, ya, xb, yb, p = self.state 430 | 431 | # Print player positions 432 | print(f"Player A position: x={xa}, y={ya}, possession={p==0}") 433 | print(f"Player B position: x={xb}, y={yb}, possession={p==1}") 434 | 435 | # Create the pitch 436 | pitch = 
[[' ' for _ in range(self.width)] for _ in range(self.height)] 437 | 438 | # Add players and ball possession 439 | pitch[xa][ya] = 'A' + ('*' if p == 0 else ' ') 440 | pitch[xb][yb] = 'B' + ('*' if p == 1 else ' ') 441 | 442 | rendered_pitch = [] 443 | rendered_pitch.append(' ' + '-' * (self.width * 2 - 4)) 444 | for ri, r in enumerate(pitch): 445 | if ri in self.goal_rows: 446 | if '*' in r[0]: 447 | rendered_pitch.append(''.join(f'{cell:<2}' for cell in r[0:-1]) + '||') 448 | elif '*' in r[-1]: 449 | rendered_pitch.append('||' + ''.join(f'{cell:<2}' for cell in r[1:])) 450 | else: 451 | rendered_pitch.append('||' + ''.join(f'{cell:<2}' for cell in r[1:-1]) + '||') 452 | else: 453 | rendered_pitch.append(' |' + ''.join(f'{cell:<2}' for cell in r[1:-1]) + '| ') 454 | rendered_pitch.append(' ' + '-' * (self.width * 2 - 4)) 455 | 456 | # Print the entire pitch 457 | for r in rendered_pitch: 458 | print(r) 459 | 460 | # Print additional information 461 | print(f"Ball possession: {'A' if p == 0 else 'B'}") 462 | if self.lastaction and self.multiagent: 463 | action_a, action_b = self.lastaction.values() 464 | print(f"Last actions: A: {self.ACTION_STRING[action_a]}, B: {self.ACTION_STRING[action_b]}") 465 | elif self.lastaction and not self.multiagent: 466 | if self.player_a_policy is None: 467 | action_a = self.lastaction['player_a'] 468 | print(f"Last action: A: {self.ACTION_STRING[action_a]}") 469 | elif self.player_b_policy is None: 470 | action_b = self.lastaction['player_b'] 471 | print(f"Last action: B: {self.ACTION_STRING[action_b]}") 472 | else: 473 | raise ValueError("No policy provided for both players, but action is an integer") 474 | 475 | # Check for goal or own goal 476 | if p == 0: # Player A has the ball 477 | if ya == 0 and xa in self.goal_rows: 478 | print("OWN GOAL! Player A scored in their own goal!") 479 | elif ya == self.width - 1 and xa in self.goal_rows: 480 | print("GOAL! Player A scored!") 481 | else: # Player B has the ball 482 | if yb == 0 and xb in self.goal_rows: 483 | print("GOAL! Player B scored!") 484 | elif yb == self.width - 1 and xb in self.goal_rows: 485 | print("OWN GOAL! 
Player B scored in their own goal!") 486 | 487 | def _state_to_observation(self, state): 488 | # This function later should rotate the observations 489 | # so that both players see the same perspective 490 | # currently it's they see the global game state 491 | # the problem is the a players trained to solve player a's perspective 492 | # cannot perform on player b's perspective 493 | state = self.TERMINAL_STATE if state in self.goal_states else state 494 | return self.state_space[state] 495 | 496 | def _observation_to_state(self, observation): 497 | return self._reverse_state_space[observation] 498 | 499 | def main(): 500 | n_states = 761 # 5x4 field 501 | # n_states = 11705 # 11x7 field 502 | n_actions = 5 503 | import time 504 | from gym_soccer.utils.policies import get_random_policy, get_stand_policy 505 | from gym_soccer.utils.planners import value_iteration, policy_iteration, modified_policy_iteration 506 | 507 | random_policy = get_random_policy(n_states, n_actions, seed=0) 508 | stand_policy = get_stand_policy(n_states) 509 | player_b_policy = random_policy 510 | 511 | # Create the environment 512 | # env = SoccerSimultaneousEnv( 513 | # width=5, height=4, slip_prob=0.2, 514 | # player_a_policy=None, player_b_policy=None) 515 | # env = SoccerSimultaneousEnv( 516 | # width=11, height=7, slip_prob=0.2, 517 | # player_a_policy=None, player_b_policy=player_b_policy) 518 | env = SoccerSimultaneousEnv( 519 | width=5, height=4, slip_prob=0.2, 520 | player_a_policy=None, player_b_policy=player_b_policy) 521 | # env = SoccerSimultaneousEnv( 522 | # width=5, height=4, slip_prob=0.2, 523 | # player_a_policy=player_b_policy, player_b_policy=None) 524 | 525 | k_1 = 1 526 | k_2 = 10000000 527 | theta = 1e-10 528 | discount_factor = 0.99 529 | # Value iteration 530 | vi_time = time.time() 531 | vi_br_pi, vi_br_V, vi_br_Q, vi_cc = value_iteration(env, theta=theta, discount_factor=discount_factor) 532 | vi_time = time.time() - vi_time 533 | print("Value iteration converged in {} iterations in {:.2f} seconds".format(vi_cc, vi_time)) 534 | 535 | # Policy iteration 536 | pi_time = time.time() 537 | pi_br_pi, pi_br_V, pi_br_Q, pi_cc = policy_iteration(env, theta=theta, discount_factor=discount_factor) 538 | pi_time = time.time() - pi_time 539 | print("Policy iteration converged in {} iterations in {:.2f} seconds".format(pi_cc, pi_time)) 540 | 541 | # Modified policy iteration, 1 pass for each policy evaluation 542 | mpi_1_time = time.time() 543 | mpi_1_br_pi, mpi_1_br_V, mpi_1_br_Q, mpi_1_cc = modified_policy_iteration(env, k=k_1, theta=theta, discount_factor=discount_factor) 544 | mpi_1_time = time.time() - mpi_1_time 545 | print("Modified policy iteration (k={}) converged in {} iterations in {:.2f} seconds".format(k_1, mpi_1_cc, mpi_1_time)) 546 | 547 | # Modified policy iteration, infinite passes for each policy evaluation 548 | mpi_2_time = time.time() 549 | mpi_2_br_pi, mpi_2_br_V, mpi_2_br_Q, mpi_2_cc = modified_policy_iteration(env, k=k_2, theta=theta, discount_factor=discount_factor) 550 | mpi_2_time = time.time() - mpi_2_time 551 | print("Modified policy iteration (k={}) converged in {} iterations in {:.2f} seconds".format(k_2, mpi_2_cc, mpi_2_time)) 552 | 553 | # Check if all policies are the same 554 | assert np.all(vi_br_pi == pi_br_pi), "Value iteration and policy iteration should converge to the same policy" 555 | assert np.all(vi_br_pi == mpi_1_br_pi), "Value iteration and modified policy iteration should converge to the same policy" 556 | assert np.all(vi_br_pi == mpi_2_br_pi), "Value 
iteration and modified policy iteration should converge to the same policy" 557 | 558 | # Check if all value functions are the same 559 | assert np.allclose(vi_br_V, pi_br_V), "Value iteration and policy iteration should converge to the same value function" 560 | assert np.allclose(vi_br_V, mpi_1_br_V), "Value iteration and modified policy iteration should converge to the same value function" 561 | assert np.allclose(vi_br_V, mpi_2_br_V), "Value iteration and modified policy iteration should converge to the same value function" 562 | 563 | # Check if all Q-functions are the same 564 | assert np.allclose(vi_br_Q, pi_br_Q), "Value iteration and policy iteration should converge to the same Q-function" 565 | assert np.allclose(vi_br_Q, mpi_1_br_Q), "Value iteration and modified policy iteration should converge to the same Q-function" 566 | assert np.allclose(vi_br_Q, mpi_2_br_Q), "Value iteration and modified policy iteration should converge to the same Q-function" 567 | print("All algorithms converged to the same result.") 568 | 569 | n_episodes = 1000 570 | rewards, steps = [], [] 571 | for i in range(n_episodes): 572 | 573 | # Reset the environment 574 | os, fs = env.reset() 575 | rewards.append(0) 576 | steps.append(0) 577 | all_done = False 578 | while not all_done: 579 | 580 | # Render the environment 581 | if i == n_episodes - 1: 582 | env.render() 583 | 584 | # Select random actions for both players 585 | # action_a = env.action_space['player_a'].sample() 586 | # action_b = env.action_space.sample() 587 | action_a = vi_br_pi[os['player_a']] 588 | # action_a = vi_br_pi[os['player_b']] 589 | # action_a = env.EAST 590 | 591 | # Take a step in the environment 592 | # observation, reward, done, truncated, info = env.step({'player_a': action_a, 'player_b': action_b}) 593 | os, rs, ds, ts, fs = env.step({'player_a': action_a}) 594 | rewards[-1] += rs['player_a'] 595 | 596 | all_done = any(ds.values()) or any(ts.values()) 597 | if i == n_episodes - 1: 598 | print(f"Values after step {steps[-1]}:") 599 | for k, po in os.items(): 600 | print(f"{po}:") 601 | print(f"\tobservation: {os[k]}") 602 | print(f"\treward: {rs[k]}") 603 | print(f"\tdone: {ds[k]}") 604 | print(f"\ttruncated: {ts[k]}") 605 | print(f"\tinfo: {fs[k]}") 606 | 607 | steps[-1] += 1 608 | 609 | if i == n_episodes - 1: 610 | # Render the final state 611 | env.render() 612 | 613 | print(f"All {n_episodes} episodes finished with average reward {np.mean(rewards)} and average steps {np.mean(steps)}.") 614 | 615 | 616 | if __name__ == "__main__": 617 | main() --------------------------------------------------------------------------------
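A closing note on the dense matrices built in _initialize_transition_dynamics: in the single-agent configuration Pmat has shape [nS, nS, nA] (indexed [s, s', a]) and Rmat has shape [nS, nA] with expected immediate rewards already folded in, so a planner can run the Bellman optimality backup in vectorized form. The sketch below illustrates that idea; full_backup is a hypothetical helper written for this note, not part of the repository, and the iteration budget is arbitrary.

```python
import numpy as np

from gym_soccer.envs.soccer_simultaneous_env import SoccerSimultaneousEnv
from gym_soccer.utils.policies import get_random_policy


def full_backup(env, V, discount_factor=0.99):
    """One synchronous Bellman optimality backup over the dense matrices.

    Assumes the single-agent configuration: env.Pmat is [nS, nS, nA] with
    Pmat[s, s', a] = P(s' | s, a) and env.Rmat is [nS, nA] holding the
    expected immediate reward of (s, a).
    """
    # Q[s, a] = R[s, a] + gamma * sum_s' P[s, s', a] * V[s']
    Q = env.Rmat + discount_factor * np.einsum('sta,t->sa', env.Pmat, V)
    return Q.max(axis=1), Q


# Hypothetical usage: best response of player A against a fixed random opponent.
env = SoccerSimultaneousEnv(width=5, height=4, slip_prob=0.2,
                            player_b_policy=get_random_policy(seed=0))
V = np.zeros(env.nS)
for _ in range(5000):
    new_V, Q = full_backup(env, V)
    if np.max(np.abs(new_V - V)) < 1e-10:
        break
    V = new_V
greedy_policy = np.argmax(Q, axis=1)
```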