├── gym_soccer
│   ├── utils
│   │   ├── __init__.py
│   │   ├── policies.py
│   │   └── planners.py
│   ├── envs
│   │   ├── __init__.py
│   │   ├── soccer_alternating_env.py
│   │   └── soccer_simultaneous_env.py
│   ├── __init__.py
│   └── tests
│       ├── test_slip_soccer_simultaneous_env.py
│       ├── test_deterministic_soccer_simultaneous_env.py
│       └── test_general.py
├── setup.py
├── README.md
├── LICENSE
├── .github
│   └── workflows
│       └── test.yml
└── .gitignore

/gym_soccer/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/gym_soccer/envs/__init__.py:
--------------------------------------------------------------------------------
 1 | from gym_soccer.envs.soccer_simultaneous_env import SoccerSimultaneousEnv
 2 | 
--------------------------------------------------------------------------------
/gym_soccer/__init__.py:
--------------------------------------------------------------------------------
 1 | from gym.envs.registration import register
 2 | 
 3 | # NO REGISTRATION JUST YET
 4 | # classics
 5 | # register(
 6 | #     id='SoccerSimultaneous-v0',
 7 | #     entry_point='gym_soccer.envs:SoccerSimultaneousEnv',
 8 | #     kwargs={'width': 5, 'height': 4, 'slip_prob': 0.2, 'player_a_policy': None, 'player_b_policy': None},
 9 | #     max_episode_steps=100,
10 | #     reward_threshold=1.0,
11 | #     nondeterministic=True,
12 | # )
13 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name='gym_soccer',
 5 |     version='0.0.1',
 6 |     description='Gym soccer environment - useful to replicate soccer experiments from Littman 94',
 7 |     url='https://github.com/mimoralea/gym-soccer-littman94',
 8 |     author='Miguel Morales',
 9 |     author_email='mimoralea@gmail.com',
10 |     packages=find_packages(),  # Automatically find and include packages in the directory
11 |     license='MIT License',
12 |     install_requires=[
13 |         'numpy==1.26.4',
14 |         'gym>=0.26.2'
15 |     ],
16 | )
17 | 
--------------------------------------------------------------------------------
/gym_soccer/utils/policies.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from gym_soccer.envs.soccer_simultaneous_env import SoccerSimultaneousEnv
 3 | 
 4 | def get_random_policy(n_states=761, n_actions=5, seed=0):
 5 |     random_policy = {}
 6 |     random_state = np.random.RandomState(seed)
 7 |     for s in range(n_states):
 8 |         random_policy[s] = random_state.randint(0, n_actions)
 9 |     return random_policy
10 | 
11 | def get_stand_policy(n_states=761):
12 |     stand_policy = {}
13 |     for s in range(n_states):
14 |         stand_policy[s] = SoccerSimultaneousEnv.NOOP
15 |     return stand_policy
16 | 
17 | def save_policy(policy, filename, mode='wb'):
18 |     import pickle
19 |     assert isinstance(policy, dict), "Policy must be a dictionary"
20 |     # Save dictionary to a file
21 |     with open(filename, mode) as f:
22 |         pickle.dump(policy, f)
23 | 
24 | def load_policy(filename, mode='rb'):
25 |     import pickle
26 |     with open(filename, mode) as f:
27 |         return pickle.load(f)
28 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # gym-soccer-littman94
 2 | 
 3 | ## Installation
 4 | 
 5 | ```bash
 6 | git clone https://github.com/mimoralea/gym-soccer-littman94.git
 7 | cd gym-soccer-littman94
 8 | pip install .
 9 | ```
10 | 
11 | or:
12 | 
13 | ```bash
14 | pip install git+https://github.com/mimoralea/gym-soccer-littman94#egg=gym-soccer-littman94
15 | ```
16 | 
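17 | ## Quick start
18 | 
19 | A minimal two-player sketch with random moves for both sides. The per-player dictionary interface below (dict observations, rewards, and termination flags keyed by `player_a`/`player_b`) mirrors what the package's test suite exercises.
20 | 
21 | ```python
22 | from gym_soccer.envs import SoccerSimultaneousEnv
23 | 
24 | # 5x4 pitch plus two goal columns; each move slips sideways with probability 0.2
25 | env = SoccerSimultaneousEnv(width=5, height=4, slip_prob=0.2)
26 | obs, info = env.reset()
27 | terminated = {'player_a': False, 'player_b': False}
28 | truncated = {'player_a': False, 'player_b': False}
29 | while not (terminated['player_a'] or truncated['player_a']):
30 |     action = {
31 |         'player_a': env.action_space['player_a'].sample(),  # Discrete(5): NORTH/SOUTH/EAST/WEST/NOOP
32 |         'player_b': env.action_space['player_b'].sample(),
33 |     }
34 |     obs, reward, terminated, truncated, info = env.step(action)
35 | env.render()  # prints both player positions and ball possession
36 | ```
37 | 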
38 | ## Use
39 | 
40 | The environment can also be used as a single-agent problem by passing a scripted policy for the opponent, in which case only `player_a` is exposed. A rough sketch of tabular TD(0) policy evaluation against a random opponent, using the helpers in `gym_soccer.utils.policies`:
41 | 
42 | ```python
43 | import numpy as np
44 | from gym_soccer.envs import SoccerSimultaneousEnv
45 | from gym_soccer.utils.policies import get_random_policy
46 | 
47 | env = SoccerSimultaneousEnv(width=5, height=4, slip_prob=0.2,
48 |                             player_b_policy=get_random_policy())
49 | n_states = env.observation_space['player_a'].n
50 | n_actions = env.action_space['player_a'].n
51 | pi = lambda s: np.random.randint(n_actions)
52 | 
53 | def td(pi, env, gamma=1.0, alpha=0.01, n_episodes=10000):
54 |     V = np.zeros(n_states)
55 |     for _ in range(n_episodes):
56 |         obs, _ = env.reset()
57 |         state, done = obs['player_a'], False
58 |         while not done:
59 |             obs, reward, terminated, truncated, _ = env.step({'player_a': pi(state)})
60 |             next_state = obs['player_a']
61 |             done = terminated['player_a'] or truncated['player_a']
62 |             td_target = reward['player_a'] + gamma * V[next_state] * (not done)
63 |             V[state] = V[state] + alpha * (td_target - V[state])
64 |             state = next_state
65 |     return V
66 | 
67 | V = td(pi, env)
68 | ```
69 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 GT RLDM
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 3 | 
 4 | name: Python application
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ "main" ]
 9 |   pull_request:
10 |     branches: [ "main" ]
11 | 
12 | permissions:
13 |   contents: read
14 | 
15 | jobs:
16 |   build:
17 | 
18 |     runs-on: ubuntu-latest
19 | 
20 |     steps:
21 |     - uses: actions/checkout@v4
22 |     - name: Set up Python 3.10
23 |       uses: actions/setup-python@v3
24 |       with:
25 |         python-version: "3.10"
26 | 
27 |     - name: Install dependencies
28 |       run: |
29 |         python -m pip install --upgrade pip
30 |         pip install flake8 pytest
31 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32 |         if [ -f setup.py ]; then pip install .; fi  # Install package if setup.py exists
33 | 
34 |     - name: Lint with flake8
35 |       run: |
36 |         # stop the build if there are Python syntax errors or undefined names
37 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
38 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
39 |         flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 40 | 41 | - name: Test with pytest 42 | run: | 43 | pytest 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # pycharm 104 | .idea 105 | 106 | -------------------------------------------------------------------------------- /gym_soccer/utils/planners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import linalg as LA 3 | 4 | def value_iteration(env, theta, discount_factor): 5 | cc, P = 0, env.P 6 | V = np.zeros(len(P), dtype=np.float64) 7 | while True: 8 | Q = np.zeros((len(P), len(P[0])), dtype=np.float64) 9 | for s in range(len(P)): 10 | for a in range(len(P[s])): 11 | for prob, next_state, reward, done in P[s][a]: 12 | Q[s][a] += prob * (reward + discount_factor * V[next_state] * (not done)) 13 | cc += 1 14 | if np.max(np.abs(V - np.max(Q, axis=1))) < theta: 15 | break 16 | V = np.max(Q, axis=1) 17 | pi = np.argmax(Q, axis=1) 18 | return pi, V, Q, cc 19 | 20 | def policy_evaluation(pi, env, theta, discount_factor): 21 | P = env.P 22 | prev_V = np.zeros(len(P), dtype=np.float64) 23 | while True: 24 | V = np.zeros(len(P), dtype=np.float64) 25 | for s in range(len(P)): 26 | for prob, next_state, reward, done in P[s][pi[s]]: 27 | V[s] += prob * (reward + discount_factor * prev_V[next_state] * (not done)) 28 | if np.max(np.abs(prev_V - V)) < theta: 29 | break 30 | prev_V = V.copy() 31 | return V 32 | 33 | def policy_improvement(V, env, discount_factor): 34 | P = env.P 35 | Q = np.zeros((len(P), len(P[0])), dtype=np.float64) 36 | for s in range(len(P)): 37 | for a in range(len(P[s])): 38 | for prob, next_state, reward, done in P[s][a]: 39 | Q[s][a] += prob * (reward + discount_factor * V[next_state] * (not 
done)) 40 | new_pi = np.argmax(Q, axis=1) 41 | return new_pi, Q 42 | 43 | def policy_iteration(env, theta, discount_factor): 44 | cc, P = 0, env.P 45 | pi = np.random.choice(tuple(P[0].keys()), len(P)) 46 | while True: 47 | old_pi = pi.copy() 48 | V = policy_evaluation(pi, env, theta, discount_factor) 49 | pi, Q = policy_improvement(V, env, discount_factor) 50 | cc += 1 51 | if np.all(old_pi == pi): 52 | break 53 | return pi, V, Q, cc 54 | 55 | def policy_eval(env, policy, theta, discount_factor, k=10000000, init=None): 56 | v = np.zeros(env.nS) if init is None else init 57 | cc = 0 58 | for i in range(k): 59 | value_fc = np.zeros(env.nS) 60 | for s in range(env.nS): 61 | r_pi = np.dot(policy[s, :], env.Rmat[s, :]) 62 | pv = np.dot(env.Pmat[s, :, :].T, v) 63 | p_pi = np.dot(pv, policy[s, :]) 64 | value_fc[s] = r_pi + discount_factor * p_pi 65 | delta = LA.norm(value_fc - v, np.inf) 66 | v[:] = value_fc 67 | cc += 1 68 | if delta < theta: 69 | break 70 | return v, cc 71 | 72 | 73 | def modified_policy_iteration(env, k, theta, discount_factor): 74 | v = np.zeros(env.nS) 75 | threshold = (theta * (1 - discount_factor))/(2 * discount_factor) 76 | counter = 0 77 | while True: 78 | q = np.zeros([env.nS, env.nA]) 79 | for a in range(env.nA): 80 | q[:, a] = env.Rmat[:, a] + discount_factor * np.dot(env.Pmat[:, :, a], v) 81 | greedy_v = np.max(q, -1) 82 | best_action = np.argmax(q, -1) 83 | policy = np.eye(env.nA)[best_action] 84 | if LA.norm(v - greedy_v, np.inf) <= threshold: 85 | return policy.argmax(axis=1), greedy_v, q, counter 86 | else: 87 | v, cc = policy_eval(env, policy, theta=theta, discount_factor=discount_factor, k=k, init=greedy_v) 88 | counter += 1 -------------------------------------------------------------------------------- /gym_soccer/tests/test_slip_soccer_simultaneous_env.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from gym_soccer.envs import SoccerSimultaneousEnv 4 | 5 | @pytest.fixture 6 | def env(): 7 | return SoccerSimultaneousEnv(width=5, height=4, slip_prob=0.2) 8 | 9 | @pytest.fixture(autouse=True) 10 | def reset_env(env): 11 | env.reset() 12 | yield 13 | 14 | def test_initialization(env): 15 | env.reset() 16 | assert env.width == 7 # 5 + 2 for goal columns 17 | assert env.height == 4 18 | assert env.slip_prob == 0.2 19 | assert env.action_space['player_a'].n == 5 20 | assert env.action_space['player_b'].n == 5 21 | 22 | def test_reset(env): 23 | obs, info = env.reset() 24 | assert isinstance(obs, dict) 25 | assert 'player_a' in obs and 'player_b' in obs 26 | assert isinstance(info, dict) 27 | assert 'player_a' in info and 'player_b' in info 28 | 29 | def test_step(env): 30 | env.reset() 31 | action = {'player_a': env.NOOP, 'player_b': env.NOOP} 32 | obs, reward, terminated, truncated, info = env.step(action) 33 | assert isinstance(obs, dict) 34 | assert isinstance(reward, dict) 35 | assert isinstance(terminated, dict) 36 | assert isinstance(truncated, dict) 37 | assert isinstance(info, dict) 38 | 39 | def test_scoring(env): 40 | def run_scoring_test(initial_state, action_a, action_b, iterations=100000): 41 | score_count = 0 42 | for _ in range(iterations): 43 | env.reset() 44 | env.state = initial_state 45 | action = {'player_a': action_a, 'player_b': action_b} 46 | obs, reward, terminated, truncated, info = env.step(action) 47 | if terminated['player_a'] or terminated['player_b']: 48 | assert abs(reward['player_a']) == 1 and abs(reward['player_b']) == 1, "Both players must receive a 
reward/penalty for a goal" 49 | score_count += 1 50 | 51 | score_ratio = score_count / iterations 52 | print(f"Score ratio: {score_ratio:.2f}") 53 | assert 0.75 <= score_ratio <= 0.85, f"Score ratio: {score_ratio:.2f}, expected close to 0.8" 54 | 55 | # Test Player A scoring 56 | run_scoring_test((1, 5, 3, 1, 0), env.EAST, env.NOOP) 57 | 58 | # Test Player B scoring 59 | run_scoring_test((3, 5, 1, 1, 1), env.NOOP, env.WEST) 60 | 61 | def test_render(env, capsys): 62 | env.reset() 63 | env.render() 64 | captured = capsys.readouterr() 65 | assert "Player A position" in captured.out 66 | assert "Player B position" in captured.out 67 | assert "Ball possession" in captured.out 68 | 69 | def test_possession_change_non_collision(env): 70 | # Test that possession doesn't change when players move without colliding 71 | env.reset() 72 | env.state = (1, 1, 3, 3, 0) # Player A has possession 73 | action = {'player_a': env.EAST, 'player_b': env.WEST} 74 | obs, reward, terminated, truncated, info = env.step(action) 75 | assert env.state[4] == 0, "Possession should not change without collision" 76 | 77 | env.reset() 78 | env.state = (1, 1, 3, 3, 1) # Player B has possession 79 | action = {'player_a': env.EAST, 'player_b': env.WEST} 80 | obs, reward, terminated, truncated, info = env.step(action) 81 | assert env.state[4] == 1, "Possession should not change without collision" 82 | 83 | def test_slip_into_goal(env): 84 | def run_slip_goal_test(initial_state, action_a, action_b, iterations=100000): 85 | goal_count = 0 86 | for _ in range(iterations): 87 | env.reset() 88 | env.state = initial_state 89 | action = {'player_a': action_a, 'player_b': action_b} 90 | obs, reward, terminated, truncated, info = env.step(action) 91 | if terminated['player_a'] or terminated['player_b']: 92 | goal_count += 1 93 | 94 | goal_ratio = goal_count / iterations 95 | assert 0.09 <= goal_ratio <= 0.11, f"Goal ratio: {goal_ratio:.2f}, expected close to 0.1" 96 | 97 | # Test A slipping into own goal 98 | run_slip_goal_test((1, 1, 3, 3, 0), env.NORTH, env.NOOP) 99 | run_slip_goal_test((2, 1, 3, 3, 0), env.NORTH, env.NOOP) 100 | run_slip_goal_test((1, 1, 3, 3, 0), env.SOUTH, env.NOOP) 101 | run_slip_goal_test((2, 1, 3, 3, 0), env.SOUTH, env.NOOP) 102 | 103 | # Test A slipping into B's goal 104 | run_slip_goal_test((1, 5, 3, 3, 0), env.NORTH, env.NOOP) 105 | run_slip_goal_test((2, 5, 3, 3, 0), env.NORTH, env.NOOP) 106 | run_slip_goal_test((1, 5, 3, 3, 0), env.SOUTH, env.NOOP) 107 | run_slip_goal_test((2, 5, 3, 3, 0), env.SOUTH, env.NOOP) 108 | 109 | # Test B slipping into A's goal 110 | run_slip_goal_test((3, 3, 1, 1, 1), env.NOOP, env.NORTH) 111 | run_slip_goal_test((3, 3, 2, 1, 1), env.NOOP, env.NORTH) 112 | run_slip_goal_test((3, 3, 1, 1, 1), env.NOOP, env.SOUTH) 113 | run_slip_goal_test((3, 3, 2, 1, 1), env.NOOP, env.SOUTH) 114 | 115 | # Test B slipping into own goal 116 | run_slip_goal_test((3, 3, 1, 5, 1), env.NOOP, env.NORTH) 117 | run_slip_goal_test((3, 3, 2, 5, 1), env.NOOP, env.NORTH) 118 | run_slip_goal_test((3, 3, 1, 5, 1), env.NOOP, env.SOUTH) 119 | run_slip_goal_test((3, 3, 2, 5, 1), env.NOOP, env.SOUTH) 120 | 121 | def test_bounce_off_horizontal_edges(env): 122 | def run_bounce_test(initial_state, action_a, action_b, iterations=100000): 123 | bounce_count = 0 124 | slip_count = 0 125 | for _ in range(iterations): 126 | env.reset() 127 | env.state = initial_state 128 | obs, reward, done, truncated, info = env.step({'player_a': action_a, 'player_b': action_b}) 129 | if env.state == initial_state: 130 | bounce_count += 1 
131 | elif env.state != initial_state: 132 | slip_count += 1 133 | 134 | bounce_ratio = bounce_count / iterations 135 | slip_ratio = slip_count / iterations 136 | assert 0.79 <= bounce_ratio <= 0.81, f"Bounce ratio: {bounce_ratio:.2f}, expected close to 0.8" 137 | assert 0.19 <= slip_ratio <= 0.21, f"Slip ratio: {slip_ratio:.2f}, expected close to 0.2" 138 | 139 | # Test bouncing off top edge 140 | run_bounce_test((0, 2, 3, 3, 0), env.NORTH, env.NOOP) 141 | run_bounce_test((0, 3, 3, 3, 0), env.NORTH, env.NOOP) 142 | run_bounce_test((3, 3, 0, 2, 1), env.NOOP, env.NORTH) 143 | run_bounce_test((3, 3, 0, 3, 1), env.NOOP, env.NORTH) 144 | 145 | # Test bouncing off bottom edge 146 | run_bounce_test((3, 2, 0, 3, 0), env.SOUTH, env.NOOP) 147 | run_bounce_test((3, 3, 0, 3, 0), env.SOUTH, env.NOOP) 148 | run_bounce_test((0, 3, 3, 2, 0), env.NOOP, env.SOUTH) 149 | run_bounce_test((0, 3, 3, 3, 0), env.NOOP, env.SOUTH) 150 | 151 | def test_bounce_off_corner_edges(env): 152 | def run_bounce_test(initial_state, action, iterations=100000): 153 | bounce_count = 0 154 | slip_count = 0 155 | for _ in range(iterations): 156 | env.reset() 157 | env.state = initial_state 158 | obs, reward, done, truncated, info = env.step({'player_a': action, 'player_b': env.NOOP}) 159 | if env.state == initial_state: 160 | bounce_count += 1 161 | elif env.state != initial_state: 162 | slip_count += 1 163 | 164 | bounce_ratio = bounce_count / iterations 165 | slip_ratio = slip_count / iterations 166 | assert 0.89 <= bounce_ratio <= 0.91, f"Bounce ratio: {bounce_ratio:.2f}, expected close to 0.9" 167 | assert 0.09 <= slip_ratio <= 0.11, f"Slip ratio: {slip_ratio:.2f}, expected close to 0.1" 168 | 169 | # Test bouncing off left edge (non-goal row) 170 | run_bounce_test((0, 1, 3, 3, 1), env.WEST) 171 | 172 | # Test bouncing off right edge (non-goal row) 173 | run_bounce_test((3, 5, 0, 3, 1), env.EAST) 174 | 175 | def test_collision_through_slip(env): 176 | def run_slip_collision_test(initial_state, action_a, action_b, iterations=100000): 177 | collision_count = 0 178 | for _ in range(iterations): 179 | env.reset() 180 | env.state = initial_state 181 | obs, reward, done, truncated, info = env.step({'player_a': action_a, 'player_b': action_b}) 182 | if env.state[0] == initial_state[0] and env.state[1] == initial_state[1] and \ 183 | env.state[2] == initial_state[2] and env.state[3] == initial_state[3]: 184 | collision_count += 1 185 | 186 | collision_ratio = collision_count / iterations 187 | expected_ratio = 0.1 # 10% chance of slip for one player, other player moves as intended 188 | assert np.isclose(collision_ratio, expected_ratio, atol=0.02), f"Collision ratio: {collision_ratio:.2f}, expected close to {expected_ratio:.2f}" 189 | 190 | # Test A slipping into B's cell 191 | run_slip_collision_test((2, 2, 2, 3, 0), env.NORTH, env.NOOP) 192 | run_slip_collision_test((2, 2, 2, 3, 1), env.NORTH, env.NOOP) 193 | 194 | # Test B slipping into A's cell 195 | run_slip_collision_test((2, 3, 2, 2, 0), env.NOOP, env.NORTH) 196 | run_slip_collision_test((2, 3, 2, 2, 1), env.NOOP, env.NORTH) 197 | 198 | def test_no_slip_on_stand(env): 199 | initial_state = (1, 2, 3, 4, 0) 200 | iterations = 100000 201 | slip_count = 0 202 | 203 | for _ in range(iterations): 204 | env.reset() 205 | env.state = initial_state 206 | obs, reward, done, truncated, info = env.step({'player_a': env.NOOP, 'player_b': env.NOOP}) 207 | if env.state != initial_state: 208 | slip_count += 1 209 | 210 | assert slip_count == 0, f"Expected no slips on STAND action, got 
{slip_count} slips" 211 | -------------------------------------------------------------------------------- /gym_soccer/envs/soccer_alternating_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from gym.envs.toy_text.utils import categorical_sample 4 | 5 | class SoccerGridWorld: 6 | # Define constants for actions 7 | UP = 0 8 | DOWN = 1 9 | LEFT = 2 10 | RIGHT = 3 11 | STAND = 4 12 | 13 | def __init__(self, width=5, height=4, slip_prob=0.2, isd_possession_a=0.5, simultaneous_action=True, player_a_policy=None, player_b_policy=None): 14 | assert width >= 5, "Width must be at least 5 columns." 15 | assert height >= 4, "Height must be at least 4 rows." 16 | 17 | self.width = width + 2 # +2 for the columns where goals are located 18 | self.height = height 19 | self.slip_prob = slip_prob 20 | self.isd_possession_a = isd_possession_a 21 | self.simultaneous_action = simultaneous_action 22 | self.player_a_policy = player_a_policy 23 | self.player_b_policy = player_b_policy 24 | self.np_random = np.random.RandomState() 25 | 26 | # Initialize the state space and action space 27 | self.n_states = self.width * self.height * 2 # width * height * 2 (possession) 28 | self.n_actions = 5 # Actions: UP, DOWN, LEFT, RIGHT, STAND 29 | 30 | # Define the initial state distribution 31 | self.isd = self.generate_isd() 32 | 33 | # Define transition dynamics 34 | self.P = self._initialize_transition_dynamics() 35 | 36 | # Initialize current state 37 | self.s = self.reset()[0] 38 | self.lastaction = None 39 | 40 | # For alternating case 41 | if not self.simultaneous_action: 42 | self.current_player = None # Will be set in reset() 43 | 44 | def generate_isd(self): 45 | distribution = [] 46 | col_a = 2 # Player A starts 2 columns from their goal 47 | col_b = self.width - 3 # Player B starts 2 columns from their goal 48 | 49 | if self.height % 2 == 1: 50 | # Odd height: both players start in the middle row 51 | middle_row = self.height // 2 52 | for possession in range(2): # 0: A, 1: B 53 | if self.simultaneous_action: 54 | state = (middle_row, col_a, middle_row, col_b, possession) 55 | else: 56 | for who_moves_first in range(2): # 0: A, 1: B 57 | state = (middle_row, col_a, middle_row, col_b, possession, who_moves_first) 58 | distribution.append((0.25, state)) 59 | if self.simultaneous_action: 60 | distribution.append((0.5, state)) 61 | else: 62 | # Even height: players start in different rows around the middle 63 | row_a_options = [self.height // 2 - 1, self.height // 2] 64 | row_b_options = [self.height // 2, self.height // 2 - 1] 65 | for i in range(2): 66 | row_a = row_a_options[i] 67 | row_b = row_b_options[i] 68 | for possession in range(2): # 0: A, 1: B 69 | if self.simultaneous_action: 70 | state = (row_a, col_a, row_b, col_b, possession) 71 | distribution.append((0.25, state)) 72 | else: 73 | for who_moves_first in range(2): # 0: A, 1: B 74 | state = (row_a, col_a, row_b, col_b, possession, who_moves_first) 75 | distribution.append((0.125, state)) 76 | 77 | return distribution 78 | 79 | def _initialize_transition_dynamics(self): 80 | P = {} 81 | self.directions = [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)] # UP, DOWN, LEFT, RIGHT, STAND 82 | 83 | for row_a in range(self.height): 84 | for col_a in range(self.width): 85 | for row_b in range(self.height): 86 | for col_b in range(self.width): 87 | for possession in range(2): # 0: A, 1: B 88 | if self.simultaneous_action: 89 | state = (row_a, col_a, row_b, col_b, possession) 90 | 
P[state] = {} 91 | # Simultaneous action dynamics 92 | for action_a in range(self.n_actions): 93 | for action_b in range(self.n_actions): 94 | transitions = [] 95 | next_state, reward, done = self._get_next_state(state, action_a, action_b) 96 | transitions.append((1 - self.slip_prob, next_state, reward, done)) 97 | 98 | # Handle slips in orthogonal directions 99 | orthogonal_moves_a = [(-self.directions[action_a][1], self.directions[action_a][0]), (self.directions[action_a][1], -self.directions[action_a][0])] 100 | orthogonal_moves_b = [(-self.directions[action_b][1], self.directions[action_b][0]), (self.directions[action_b][1], -self.directions[action_b][0])] 101 | for orth_move_a in orthogonal_moves_a: 102 | for orth_move_b in orthogonal_moves_b: 103 | slip_state, _, _ = self._get_next_state(state, 104 | self._action_from_direction(orth_move_a), 105 | self._action_from_direction(orth_move_b)) 106 | transitions.append((self.slip_prob / 4, slip_state, reward, done)) 107 | P[state][(action_a, action_b)] = transitions 108 | else: 109 | # Alternating action dynamics 110 | for who_moves_next in [0, 1]: # 0: Player A, 1: Player B 111 | state = (row_a, col_a, row_b, col_b, possession, who_moves_next) 112 | P[state] = {} 113 | for action in range(self.n_actions): 114 | transitions = [] 115 | next_state, reward, done = self._get_next_state(state, action, None) 116 | next_state = (*next_state[:5], 1 - who_moves_next) # Switch to other player's turn 117 | transitions.append((1 - self.slip_prob, next_state, reward, done)) 118 | 119 | # Handle slips in orthogonal directions 120 | orthogonal_moves = [(-self.directions[action][1], self.directions[action][0]), (self.directions[action][1], -self.directions[action][0])] 121 | for orth_move in orthogonal_moves: 122 | slip_action = self._action_from_direction(orth_move) 123 | slip_state, slip_reward, slip_done = self._get_next_state(state, slip_action, None) 124 | slip_state = (*slip_state[:5], 1 - who_moves_next) # Switch to other player's turn 125 | transitions.append((self.slip_prob / 2, slip_state, slip_reward, slip_done)) 126 | P[state][action] = transitions 127 | 128 | return P 129 | 130 | def _get_next_state(self, state, action_a, action_b): 131 | if self.simultaneous_action: 132 | row_a, col_a, row_b, col_b, possession = state 133 | else: 134 | row_a, col_a, row_b, col_b, possession, who_moves_next = state 135 | 136 | # Handle actions and slip probability 137 | def move(row, col, action): 138 | if action is None: 139 | return row, col 140 | intended_move = self.directions[action] 141 | new_row = max(0, min(self.height - 1, row + intended_move[0])) 142 | new_col = max(0, min(self.width - 1, col + intended_move[1])) 143 | return new_row, new_col 144 | 145 | # Update positions based on actions 146 | if self.simultaneous_action or who_moves_next == 0: 147 | next_row_a, next_col_a = move(row_a, col_a, action_a) 148 | else: 149 | next_row_a, next_col_a = row_a, col_a 150 | 151 | if self.simultaneous_action or who_moves_next == 1: 152 | next_row_b, next_col_b = move(row_b, col_b, action_b) 153 | else: 154 | next_row_b, next_col_b = row_b, col_b 155 | 156 | # Handle STAND action properly in alternating action case 157 | if not self.simultaneous_action: 158 | if who_moves_next == 0: # Player A's turn 159 | if action_a == self.STAND: 160 | next_row_a, next_col_a = row_a, col_a 161 | elif (next_row_a, next_col_a) == (row_b, col_b): 162 | possession = 1 # Player B gains possession 163 | next_row_a, next_col_a = row_a, col_a 164 | else: # Player B's turn 165 | if 
action_b == self.STAND: 166 | next_row_b, next_col_b = row_b, col_b 167 | elif (next_row_b, next_col_b) == (row_a, col_a): 168 | possession = 0 # Player A gains possession 169 | next_row_b, next_col_b = row_b, col_b 170 | 171 | # Check for goals and terminal state 172 | done = False 173 | reward = 0 174 | if possession == 0: # Player A has the ball 175 | if next_col_a == 0: # Player A scores in its own goal (own goal) 176 | done = True 177 | reward = -1 # Negative reward for own goal 178 | elif next_col_a == self.width - 1: # Player A scores in opponent's goal 179 | done = True 180 | reward = 1 181 | elif next_row_a == row_b and next_col_a == col_b: # Player B steals 182 | possession = 1 183 | else: # Player B has the ball 184 | if next_col_b == self.width - 1: # Player B scores in its own goal (own goal) 185 | done = True 186 | reward = 1 # Positive reward for player A when B scores own goal 187 | elif next_col_b == 0: # Player B scores in opponent's goal 188 | done = True 189 | reward = -1 190 | elif next_row_b == row_a and next_col_b == col_a: # Player A steals 191 | possession = 0 192 | 193 | # Handle simultaneous action collision 194 | if self.simultaneous_action and (next_row_a == next_row_b and next_col_a == next_col_b): 195 | if action_a == self.STAND and action_b != self.STAND: 196 | possession = 0 # Player A gains possession 197 | elif action_b == self.STAND and action_a != self.STAND: 198 | possession = 1 # Player B gains possession 199 | elif action_a != self.STAND and action_b != self.STAND: 200 | possession = self.np_random.choice([0, 1]) # Randomly decide who gets possession 201 | 202 | next_state = (next_row_a, next_col_a, next_row_b, next_col_b, possession) 203 | 204 | return next_state, reward, done 205 | 206 | def step(self, action): 207 | if self.simultaneous_action: 208 | action_a, action_b = action 209 | transitions = self.P[self.s][(action_a, action_b)] 210 | i = categorical_sample([t[0] for t in transitions], self.np_random) 211 | prob, next_state, reward, done = transitions[i] 212 | self.s = next_state 213 | self.lastaction = action 214 | obs = { 215 | "player_a": (next_state[0], next_state[1], next_state[2], next_state[3], 1 if next_state[4] == 0 else 0), 216 | "player_b": (next_state[2], next_state[3], next_state[0], next_state[1], 1 if next_state[4] == 1 else 0) 217 | } 218 | rewards = {"player_a": reward, "player_b": -reward} 219 | dones = {"player_a": done, "player_b": done} 220 | truncateds = {"player_a": False, "player_b": False} 221 | infos = {"player_a": {"prob": prob}, "player_b": {"prob": prob}} 222 | return obs, rewards, dones, truncateds, infos 223 | else: 224 | transitions = self.P[self.s][action] 225 | i = categorical_sample([t[0] for t in transitions], self.np_random) 226 | prob, next_state, reward, done = transitions[i] 227 | self.s = next_state 228 | self.lastaction = action 229 | self.current_player = next_state[5] # Update current player 230 | 231 | current_player_name = "player_a" if self.current_player == 0 else "player_b" 232 | obs = (next_state[0], next_state[1], next_state[2], next_state[3], 1 if next_state[4] == self.current_player else 0) 233 | reward = reward if self.current_player == 0 else -reward 234 | 235 | return obs, reward, done, False, {"prob": prob} 236 | 237 | def reset(self, seed=None, options=None): 238 | if seed is not None: 239 | self.np_random.seed(seed) 240 | i = categorical_sample([is_[0] for is_ in self.isd], self.np_random) 241 | p, self.s = self.isd[i] 242 | self.lastaction = None 243 | if self.simultaneous_action: 
244 | obs = { 245 | "player_a": (self.s[0], self.s[1], self.s[2], self.s[3], 1 if self.s[4] == 0 else 0), 246 | "player_b": (self.s[2], self.s[3], self.s[0], self.s[1], 1 if self.s[4] == 1 else 0) 247 | } 248 | else: 249 | self.current_player = self.s[5] 250 | obs = (self.s[0], self.s[1], self.s[2], self.s[3], 1 if self.s[4] == self.current_player else 0) 251 | return obs, {"prob": p} 252 | 253 | def _action_from_direction(self, direction): 254 | return self.directions.index(direction) 255 | -------------------------------------------------------------------------------- /gym_soccer/tests/test_deterministic_soccer_simultaneous_env.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from gym_soccer.envs import SoccerSimultaneousEnv 4 | 5 | @pytest.fixture 6 | def env(): 7 | return SoccerSimultaneousEnv(width=5, height=4, slip_prob=0.0) 8 | 9 | @pytest.fixture(autouse=True) 10 | def reset_env(env): 11 | env.reset() 12 | yield 13 | 14 | def test_initialization(env): 15 | env.reset() 16 | assert env.width == 7 # 5 + 2 for goal columns 17 | assert env.height == 4 18 | assert env.slip_prob == 0.0 19 | assert env.action_space['player_a'].n == 5 20 | assert env.action_space['player_b'].n == 5 21 | 22 | def test_reset(env): 23 | obs, info = env.reset() 24 | assert isinstance(obs, dict) 25 | assert 'player_a' in obs and 'player_b' in obs 26 | assert isinstance(info, dict) 27 | assert 'player_a' in info and 'player_b' in info 28 | 29 | def test_step(env): 30 | env.reset() 31 | action = {'player_a': env.NOOP, 'player_b': env.NOOP} 32 | obs, reward, terminated, truncated, info = env.step(action) 33 | assert isinstance(obs, dict) 34 | assert isinstance(reward, dict) 35 | assert isinstance(terminated, dict) 36 | assert isinstance(truncated, dict) 37 | assert isinstance(info, dict) 38 | 39 | def test_scoring(env): 40 | def run_scoring_test(initial_state, action_a, action_b): 41 | env.reset() 42 | env.state = initial_state 43 | action = {'player_a': action_a, 'player_b': action_b} 44 | obs, reward, terminated, truncated, info = env.step(action) 45 | assert terminated['player_a'] and terminated['player_b'], "Game should end" 46 | assert abs(reward['player_a']) == 1 and abs(reward['player_b']) == 1, "Both players must receive a reward/penalty for a goal" 47 | 48 | # Test Player A scoring 49 | run_scoring_test((1, 5, 3, 1, 0), env.EAST, env.NOOP) 50 | 51 | # Test Player B scoring 52 | run_scoring_test((3, 5, 1, 1, 1), env.NOOP, env.WEST) 53 | 54 | def test_own_goals(env): 55 | # Test Player A scoring an own goal (row 1) 56 | env.state = (1, 1, 3, 5, 0) # Player A with ball, near own goal 57 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.NOOP}) 58 | assert done['player_a'] and done['player_b'] 59 | assert reward['player_a'] == -1 60 | assert reward['player_b'] == 1 61 | 62 | # Test Player A scoring an own goal (row 2) 63 | env.reset() 64 | env.state = (2, 1, 3, 5, 0) # Player A with ball, near own goal 65 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.NOOP}) 66 | assert done['player_a'] and done['player_b'] 67 | assert reward['player_a'] == -1 68 | assert reward['player_b'] == 1 69 | 70 | # Test Player B scoring an own goal (row 1) 71 | env.reset() 72 | env.state = (3, 1, 1, 5, 1) # Player B with ball, near own goal 73 | obs, reward, done, truncated, info = env.step({'player_a': env.NOOP, 'player_b': env.EAST}) 74 | assert done['player_a'] and 
done['player_b'] 75 | assert reward['player_a'] == 1 76 | assert reward['player_b'] == -1 77 | 78 | # Test Player B scoring an own goal (row 2) 79 | env.reset() 80 | env.state = (3, 1, 2, 5, 1) # Player B with ball, near own goal 81 | obs, reward, done, truncated, info = env.step({'player_a': env.NOOP, 'player_b': env.EAST}) 82 | assert done['player_a'] and done['player_b'] 83 | assert reward['player_a'] == 1 84 | assert reward['player_b'] == -1 85 | 86 | def test_both_players_moving_collision(env): 87 | # Test collision when both players are moving 88 | env.reset() 89 | env.state = (1, 2, 1, 3, 0) # Player A has possession 90 | action = {'player_a': env.EAST, 'player_b': env.WEST} 91 | obs, reward, terminated, truncated, info = env.step(action) 92 | assert env.state[1] == 2 and env.state[3] == 3, "Players should bounce back to original positions" 93 | assert env.state[4] in [0, 1], "Possession should be randomly assigned" 94 | 95 | env.reset() 96 | env.state = (1, 2, 1, 3, 1) # Player B has possession 97 | action = {'player_a': env.EAST, 'player_b': env.WEST} 98 | obs, reward, terminated, truncated, info = env.step(action) 99 | assert env.state[1] == 2 and env.state[3] == 3, "Players should bounce back to original positions" 100 | assert env.state[4] in [0, 1], "Possession should be randomly assigned" 101 | 102 | def test_one_player_standing_collision(env): 103 | # Test collision when one player is standing still 104 | env.reset() 105 | env.state = (1, 2, 1, 3, 0) # Player A has possession 106 | action = {'player_a': env.EAST, 'player_b': env.NOOP} 107 | obs, reward, terminated, truncated, info = env.step(action) 108 | assert env.state[1] == 2 and env.state[3] == 3, "Players should remain in original positions" 109 | assert env.state[4] in [0, 1], "Possession should be randomly assigned" 110 | 111 | env.reset() 112 | env.state = (1, 2, 1, 3, 1) # Player B has possession 113 | action = {'player_a': env.NOOP, 'player_b': env.WEST} 114 | obs, reward, terminated, truncated, info = env.step(action) 115 | assert env.state[1] == 2 and env.state[3] == 3, "Players should remain in original positions" 116 | assert env.state[4] in [0, 1], "Possession should be randomly assigned" 117 | 118 | def test_move_to_same_cell_collision(env): 119 | def run_move_to_same_cell_collision_test(initial_state, action_a, action_b, iterations=1000): 120 | move_success_counts = {'A': 0, 'B': 0} 121 | possession_switch_count = 0 122 | initial_possession = initial_state[4] 123 | 124 | for _ in range(iterations): 125 | env.reset() 126 | env.state = initial_state 127 | action = {'player_a': action_a, 'player_b': action_b} 128 | obs, reward, terminated, truncated, info = env.step(action) 129 | 130 | if env.state[0] != initial_state[0] or env.state[1] != initial_state[1]: 131 | move_success_counts['A'] += 1 132 | elif env.state[2] != initial_state[2] or env.state[3] != initial_state[3]: 133 | move_success_counts['B'] += 1 134 | 135 | if env.state[4] != initial_possession: 136 | possession_switch_count += 1 137 | 138 | for player, count in move_success_counts.items(): 139 | success_ratio = count / iterations 140 | assert 0.45 <= success_ratio <= 0.55, f"Move success ratio for Player {player}: {success_ratio:.2f}, expected close to 0.5" 141 | 142 | possession_switch_ratio = possession_switch_count / iterations 143 | assert 0.45 <= possession_switch_ratio <= 0.55, f"Possession switch ratio: {possession_switch_ratio:.2f}, expected close to 0.5" 144 | 145 | # Diagonal movements 146 | run_move_to_same_cell_collision_test((1, 1, 2, 
2, 0), env.EAST, env.NORTH) # A: right, B: up 147 | run_move_to_same_cell_collision_test((1, 1, 2, 2, 1), env.EAST, env.NORTH) # Same, but B has initial possession 148 | run_move_to_same_cell_collision_test((1, 2, 2, 1, 0), env.WEST, env.NORTH) # A: left, B: up 149 | run_move_to_same_cell_collision_test((1, 2, 2, 1, 1), env.WEST, env.NORTH) # Same, but B has initial possession 150 | run_move_to_same_cell_collision_test((2, 1, 1, 2, 0), env.EAST, env.SOUTH) # A: right, B: down 151 | run_move_to_same_cell_collision_test((2, 1, 1, 2, 1), env.EAST, env.SOUTH) # Same, but B has initial possession 152 | run_move_to_same_cell_collision_test((2, 2, 1, 1, 0), env.WEST, env.SOUTH) # A: left, B: down 153 | run_move_to_same_cell_collision_test((2, 2, 1, 1, 1), env.WEST, env.SOUTH) # Same, but B has initial possession 154 | 155 | # Horizontal movements 156 | run_move_to_same_cell_collision_test((1, 1, 1, 3, 0), env.EAST, env.WEST) # A: right, B: left 157 | run_move_to_same_cell_collision_test((1, 1, 1, 3, 1), env.EAST, env.WEST) # Same, but B has initial possession 158 | run_move_to_same_cell_collision_test((1, 3, 1, 1, 0), env.WEST, env.EAST) # A: left, B: right 159 | run_move_to_same_cell_collision_test((1, 3, 1, 1, 1), env.WEST, env.EAST) # Same, but B has initial possession 160 | 161 | # Vertical movements 162 | run_move_to_same_cell_collision_test((1, 1, 3, 1, 0), env.SOUTH, env.NORTH) # A: down, B: up 163 | run_move_to_same_cell_collision_test((1, 1, 3, 1, 1), env.SOUTH, env.NORTH) # Same, but B has initial possession 164 | run_move_to_same_cell_collision_test((3, 1, 1, 1, 0), env.NORTH, env.SOUTH) # A: up, B: down 165 | run_move_to_same_cell_collision_test((3, 1, 1, 1, 1), env.NORTH, env.SOUTH) # Same, but B has initial possession 166 | 167 | def test_all_edges(env): 168 | # Test Player A at top edge, B at right edge 169 | # Case 1: A has possession, A moves UP, B moves RIGHT 170 | initial_state = (0, 1, 3, 5, 0) 171 | env.state = initial_state 172 | obs, reward, done, truncated, info = env.step({'player_a': env.NORTH, 'player_b': env.EAST}) 173 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 174 | 175 | # Case 2: B has possession, A moves UP, B moves RIGHT 176 | initial_state = (0, 1, 3, 5, 1) 177 | env.state = initial_state 178 | obs, reward, done, truncated, info = env.step({'player_a': env.NORTH, 'player_b': env.EAST}) 179 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 180 | 181 | # Case 3: A has possession, A moves LEFT, B moves RIGHT 182 | initial_state = (0, 1, 3, 5, 0) 183 | env.state = initial_state 184 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.EAST}) 185 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 186 | 187 | # Case 4: B has possession, A moves LEFT, B moves RIGHT 188 | initial_state = (0, 1, 3, 5, 1) 189 | env.state = initial_state 190 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.EAST}) 191 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 192 | 193 | # Case 5: A has possession, A moves UP, B moves DOWN 194 | initial_state = (0, 1, 3, 5, 0) 195 | env.state = initial_state 196 | obs, reward, done, truncated, info = env.step({'player_a': env.NORTH, 'player_b': env.SOUTH}) 197 | assert env.state == initial_state, "State should not change when Player A attempts to move out 
of bounds" 198 | 199 | # Case 6: B has possession, A moves UP, B moves DOWN 200 | initial_state = (0, 1, 3, 5, 1) 201 | env.state = initial_state 202 | obs, reward, done, truncated, info = env.step({'player_a': env.NORTH, 'player_b': env.SOUTH}) 203 | assert env.state == initial_state, "State should not change when Player A attempts to move out of bounds" 204 | 205 | # Case 7: A has possession, A moves LEFT, B moves DOWN 206 | initial_state = (0, 1, 3, 5, 0) 207 | env.state = initial_state 208 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.SOUTH}) 209 | assert env.state == initial_state, "State should not change when Player A attempts to move out of bounds" 210 | 211 | # Case 8: B has possession, A moves LEFT, B moves DOWN 212 | initial_state = (0, 1, 3, 5, 1) 213 | env.state = initial_state 214 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.SOUTH}) 215 | assert env.state == initial_state, "State should not change when Player A attempts to move out of bounds" 216 | 217 | # Swap positions: A at right edge, B at top edge 218 | # Case 9: A has possession, A moves RIGHT, B moves UP 219 | initial_state = (3, 5, 0, 1, 0) 220 | env.state = initial_state 221 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.NORTH}) 222 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 223 | 224 | # Case 10: B has possession, A moves RIGHT, B moves UP 225 | initial_state = (3, 5, 0, 1, 1) 226 | env.state = initial_state 227 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.NORTH}) 228 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 229 | 230 | # Case 11: A has possession, A moves RIGHT, B moves LEFT 231 | initial_state = (3, 5, 0, 1, 0) 232 | env.state = initial_state 233 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.WEST}) 234 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 235 | 236 | # Case 12: B has possession, A moves RIGHT, B moves LEFT 237 | initial_state = (3, 5, 0, 1, 1) 238 | env.state = initial_state 239 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.WEST}) 240 | assert env.state == initial_state, "State should not change when players attempt to move out of bounds" 241 | 242 | # Case 13: A has possession, A moves DOWN, B moves UP 243 | initial_state = (3, 5, 0, 1, 0) 244 | env.state = initial_state 245 | obs, reward, done, truncated, info = env.step({'player_a': env.SOUTH, 'player_b': env.NORTH}) 246 | assert env.state == initial_state, "State should not change when Player B attempts to move out of bounds" 247 | 248 | # Case 14: B has possession, A moves DOWN, B moves UP 249 | initial_state = (3, 5, 0, 1, 1) 250 | env.state = initial_state 251 | obs, reward, done, truncated, info = env.step({'player_a': env.SOUTH, 'player_b': env.NORTH}) 252 | assert env.state == initial_state, "State should not change when Player B attempts to move out of bounds" 253 | 254 | # Case 15: A has possession, A moves DOWN, B moves LEFT 255 | initial_state = (3, 5, 0, 1, 0) 256 | env.state = initial_state 257 | obs, reward, done, truncated, info = env.step({'player_a': env.SOUTH, 'player_b': env.WEST}) 258 | assert env.state == initial_state, "State should not change when Player B attempts to move out of bounds" 259 | 260 | # 
Case 16: B has possession, A moves DOWN, B moves LEFT 261 | initial_state = (3, 5, 0, 1, 1) 262 | env.state = initial_state 263 | obs, reward, done, truncated, info = env.step({'player_a': env.SOUTH, 'player_b': env.WEST}) 264 | assert env.state == initial_state, "State should not change when Player B attempts to move out of bounds" 265 | 266 | # GOAL BOUNDARIES WITHOUT POSSESSION 267 | # Test Player A at left goal boundary without possession (row 1) 268 | env.reset() 269 | initial_state = (1, 1, 3, 3, 1) # B has possession 270 | env.state = initial_state 271 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.NOOP}) 272 | assert env.state == initial_state, "Player A should not enter left goal area without possession (row 1)" 273 | 274 | # Test Player A at left goal boundary without possession (row 2) 275 | env.reset() 276 | initial_state = (2, 1, 3, 3, 1) # B has possession 277 | env.state = initial_state 278 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.NOOP}) 279 | assert env.state == initial_state, "Player A should not enter left goal area without possession (row 2)" 280 | 281 | # Test Player B at right goal boundary without possession (row 1) 282 | env.reset() 283 | initial_state = (3, 3, 1, 5, 0) # A has possession 284 | env.state = initial_state 285 | obs, reward, done, truncated, info = env.step({'player_a': env.NOOP, 'player_b': env.EAST}) 286 | assert env.state == initial_state, "Player B should not enter right goal area without possession (row 1)" 287 | 288 | # Test Player B at right goal boundary without possession (row 2) 289 | env.reset() 290 | initial_state = (3, 3, 2, 5, 0) # A has possession 291 | env.state = initial_state 292 | obs, reward, done, truncated, info = env.step({'player_a': env.NOOP, 'player_b': env.EAST}) 293 | assert env.state == initial_state, "Player B should not enter right goal area without possession (row 2)" 294 | 295 | # Test Player B at left goal boundary without possession (row 1) 296 | env.reset() 297 | initial_state = (3, 3, 1, 1, 0) # A has possession 298 | env.state = initial_state 299 | obs, reward, done, truncated, info = env.step({'player_a': env.NOOP, 'player_b': env.WEST}) 300 | assert env.state == initial_state, "Player B should not move beyond left goal boundary without possession (row 1)" 301 | 302 | # Test Player B at left goal boundary without possession (row 2) 303 | env.reset() 304 | initial_state = (3, 3, 2, 1, 0) # A has possession 305 | env.state = initial_state 306 | obs, reward, done, truncated, info = env.step({'player_a': env.NOOP, 'player_b': env.WEST}) 307 | assert env.state == initial_state, "Player B should not move beyond left goal boundary without possession (row 2)" 308 | 309 | # Test Player A at right goal boundary without possession (row 1) 310 | env.reset() 311 | initial_state = (1, 5, 3, 3, 1) # B has possession 312 | env.state = initial_state 313 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.NOOP}) 314 | assert env.state == initial_state, "Player A should not move beyond right goal boundary without possession (row 1)" 315 | 316 | # Test Player A at right goal boundary without possession (row 2) 317 | env.reset() 318 | initial_state = (2, 5, 3, 3, 1) # B has possession 319 | env.state = initial_state 320 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.NOOP}) 321 | assert env.state == initial_state, "Player A should not move beyond right goal boundary without 
possession (row 2)" 322 | 323 | def test_render(env, capsys): 324 | env.reset() 325 | env.render() 326 | captured = capsys.readouterr() 327 | assert "Player A position" in captured.out 328 | assert "Player B position" in captured.out 329 | assert "Ball possession" in captured.out 330 | 331 | def test_possession_change_non_collision(env): 332 | # Test that possession doesn't change when players move without colliding 333 | env.state = (1, 1, 3, 3, 0) # Player A has possession 334 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.WEST}) 335 | assert env.state[4] == 0, "Possession should not change without collision" 336 | 337 | env.state = (1, 1, 3, 3, 1) # Player B has possession 338 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.WEST}) 339 | assert env.state[4] == 1, "Possession should not change without collision" 340 | 341 | def test_simultaneous_goal_attempts(env): 342 | # Both players attempt to score simultaneously 343 | env.state = (1, 5, 1, 1, 0) # A with ball near B's goal, B near A's goal 344 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.WEST}) 345 | assert done['player_a'] and done['player_b'], "Game should end" 346 | assert reward['player_a'] == 1 and reward['player_b'] == -1, "Only A should score" 347 | 348 | env.reset() 349 | env.state = (1, 5, 1, 1, 1) # B with ball near A's goal, A near B's goal 350 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.WEST}) 351 | assert done['player_a'] and done['player_b'], "Game should end" 352 | assert reward['player_a'] == -1 and reward['player_b'] == 1, "Only B should score" 353 | 354 | def test_edge_case_possession(env): 355 | # Test possession change when moving to the same cell from different distances 356 | env.state = (1, 1, 1, 2, 0) # A has ball, both move right 357 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.EAST}) 358 | assert env.state[4] == 0, "A should keep possession as it's closer" 359 | 360 | env.state = (1, 1, 1, 2, 1) # B has ball, both move right 361 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.EAST}) 362 | assert env.state[4] == 1, "B should keep possession even though A moves to the same cell" 363 | 364 | # Test possession change when moving to the same cell from different distances 365 | env.state = (1, 1, 1, 3, 0) # A has ball, both move right 366 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.EAST}) 367 | assert env.state[4] == 0, "A should keep possession as it's closer" 368 | 369 | env.state = (1, 1, 1, 3, 1) # B has ball, both move right 370 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.EAST}) 371 | assert env.state[4] == 1, "B should keep possession even though A moves to the same cell" 372 | 373 | def test_multiple_consecutive_collisions(env): 374 | initial_state = (1, 2, 1, 3, 0) # A has ball, players adjacent 375 | n_samples = 1000 376 | collision_count = 0 377 | possession_changes = 0 378 | last_possession = 0 379 | 380 | for _ in range(n_samples): 381 | env.state = initial_state 382 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.WEST}) 383 | 384 | if env.state[1] == initial_state[1] and env.state[3] == initial_state[3]: 385 | collision_count += 1 386 | 387 | if env.state[4] != last_possession: 388 | possession_changes += 1 389 | 390 | last_possession = 
env.state[4] 391 | 392 | assert collision_count == n_samples, f"All steps should result in collision, got {collision_count}" 393 | possession_ratio = possession_changes / n_samples 394 | assert 0.45 <= possession_ratio <= 0.55, f"Possession should change roughly half the time, got {possession_ratio:.2f}" 395 | 396 | def test_simultaneous_out_of_bounds(env): 397 | # Both players try to move out of bounds simultaneously 398 | env.state = (0, 1, 3, 5, 0) # A at top edge, B at right edge 399 | initial_state = env.state 400 | obs, reward, done, truncated, info = env.step({'player_a': env.NORTH, 'player_b': env.EAST}) 401 | assert env.state == initial_state, "State should not change when both players attempt to move out of bounds" 402 | 403 | # One player tries to move out of bounds, the other moves validly 404 | env.state = (0, 1, 3, 4, 1) # A at top edge, B has possession 405 | obs, reward, done, truncated, info = env.step({'player_a': env.NORTH, 'player_b': env.EAST}) 406 | assert env.state[3] == 5, "B should move right" 407 | assert env.state[0] == 0 and env.state[1] == 1, "A should not move" 408 | 409 | def test_edge_case_goal_scoring(env): 410 | # Test scoring from the edge of the goal area 411 | env.state = (1, 5, 3, 3, 0) # A with ball, at edge of B's goal 412 | obs, reward, done, truncated, info = env.step({'player_a': env.EAST, 'player_b': env.NOOP}) 413 | assert done['player_a'] and done['player_b'], "Game should end" 414 | assert reward['player_a'] == 1 and reward['player_b'] == -1, "A should score" 415 | 416 | # Test scoring from the edge of own goal area 417 | env.reset() 418 | env.state = (2, 1, 3, 3, 0) # A with ball, at edge of own goal 419 | obs, reward, done, truncated, info = env.step({'player_a': env.WEST, 'player_b': env.NOOP}) 420 | assert done['player_a'] and done['player_b'], "Game should end" 421 | assert reward['player_a'] == -1 and reward['player_b'] == 1, "A should score an own goal" -------------------------------------------------------------------------------- /gym_soccer/tests/test_general.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from gym_soccer.envs.soccer_simultaneous_env import SoccerSimultaneousEnv 4 | 5 | @pytest.mark.parametrize("width,height", [ 6 | (5, 4), # Minimum size, even height 7 | (6, 4), # even height 8 | (7, 5), # Odd height 9 | (9, 6), # Even height 10 | (11, 7), # Odd height 11 | ]) 12 | def test_initial_state_distribution(width, height): 13 | env = SoccerSimultaneousEnv(width=width, height=height) 14 | 15 | # Check that the total probability sums to 1 16 | total_prob = sum(prob for prob, _ in env.isd) 17 | assert abs(total_prob - 1.0) < 1e-6, f"Total probability should be 1, but is {total_prob}" 18 | 19 | # Check that all probabilities are equal 20 | first_prob = env.isd[0][0] 21 | assert all(abs(prob - first_prob) < 1e-6 for prob, _ in env.isd), "All probabilities should be equal" 22 | 23 | # Check starting positions 24 | for _, state in env.isd: 25 | row_a, col_a, row_b, col_b, possession = state 26 | 27 | # Check columns 28 | assert col_a == 2, f"Player A should start in column 2, but starts in column {col_a}" 29 | assert col_b == env.width - 3, f"Player B should start in column {env.width - 3}, but starts in column {col_b}" 30 | 31 | # Check rows 32 | if len(env.goal_rows) % 2 == 0: # Even number of goal rows 33 | middle_index = len(env.goal_rows) // 2 34 | valid_rows = [env.goal_rows[middle_index - 1], env.goal_rows[middle_index]] 35 | assert row_a in 
valid_rows, f"Player A should start in row {valid_rows[0]} or {valid_rows[1]}, but starts in row {row_a}" 36 | assert row_b in valid_rows, f"Player B should start in row {valid_rows[0]} or {valid_rows[1]}, but starts in row {row_b}" 37 | assert row_a != row_b, f"Players should not start in the same row, but both start in row {row_a}" 38 | else: # Odd number of goal rows 39 | middle_row = env.goal_rows[len(env.goal_rows) // 2] 40 | assert row_a == middle_row, f"Player A should start in middle row {middle_row}, but starts in row {row_a}" 41 | assert row_b == middle_row, f"Player B should start in middle row {middle_row}, but starts in row {row_b}" 42 | 43 | # Check possession 44 | assert possession in [0, 1], f"Possession should be 0 or 1, but is {possession}" 45 | 46 | # Check number of initial states 47 | if len(env.goal_rows) % 2 == 0: 48 | expected_states = 4 # Two row combinations, two possession states 49 | else: 50 | expected_states = 2 # One middle row, two possession states 51 | 52 | assert len(env.isd) == expected_states, f"Expected {expected_states} initial states, but got {len(env.isd)}" 53 | 54 | @pytest.mark.parametrize("width,height", [ 55 | (5, 4), # Minimum size, even height 56 | (6, 4), # even height 57 | (7, 5), # Odd height 58 | (9, 6), # Even height 59 | (11, 7), # Odd height 60 | ]) 61 | def test_env_P_structure(width, height): 62 | env = SoccerSimultaneousEnv(width=width, height=height) 63 | 64 | # Check that env.P is a dictionary 65 | assert isinstance(env.P, dict), "env.P should be a dictionary" 66 | 67 | # Check that all keys in env.P are integers from 0 to len(env.P) - 1 68 | expected_keys = set(range(len(env.P))) 69 | actual_keys = set(env.P.keys()) 70 | assert actual_keys == expected_keys, f"env.P keys should be integers from 0 to {len(env.P) - 1}" 71 | 72 | # Check that all values in env.P are dictionaries 73 | for state, actions in env.P.items(): 74 | assert isinstance(actions, dict), f"env.P[{state}] should be a dictionary" 75 | 76 | # Check that all action keys are valid 77 | valid_actions = set(env.P[0].keys()) 78 | assert set(actions.keys()) == valid_actions, f"Invalid action keys in env.P[{state}]" 79 | 80 | # Check the structure of each action's transitions 81 | for action, transitions in actions.items(): 82 | assert isinstance(transitions, list), f"env.P[{state}][{action}] should be a list" 83 | for transition in transitions: 84 | assert len(transition) == 4, f"Each transition in env.P[{state}][{action}] should have 4 elements" 85 | prob, next_state, reward, done = transition 86 | assert 0 <= prob <= 1, f"Transition probability should be between 0 and 1, got {prob}" 87 | assert isinstance(next_state, int) and 0 <= next_state < len(env.P), f"Invalid next state: {next_state}" 88 | assert isinstance(reward, (int, float)), f"Reward should be a number, got {type(reward)}" 89 | assert isinstance(done, bool), f"Done flag should be a boolean, got {type(done)}" 90 | 91 | print(f"env.P structure test passed for width={width}, height={height}") 92 | 93 | @pytest.mark.parametrize("width,height", [ 94 | (5, 4), # Minimum size, even height 95 | (6, 4), # even height 96 | (7, 5), # Odd height 97 | (9, 6), # Even height 98 | (11, 7), # Odd height 99 | ]) 100 | def test_initial_state_sampling(width, height): 101 | env = SoccerSimultaneousEnv(width=width, height=height) 102 | n_samples = 10000 103 | state_counts = {} 104 | 105 | for _ in range(n_samples): 106 | env.reset() 107 | state_counts[env.state] = state_counts.get(env.state, 0) + 1 108 | 109 | total_states = 
len(state_counts) 110 | expected_prob = 1 / total_states 111 | expected_count = n_samples / total_states 112 | rtol = 0.1 # 10% relative tolerance 113 | 114 | for state, count in state_counts.items(): 115 | row_a, col_a, row_b, col_b, possession = state 116 | 117 | # Check columns 118 | assert col_a == 2, f"Player A should start in column 2, but starts in column {col_a}" 119 | assert col_b == env.width - 3, f"Player B should start in column {env.width - 3}, but starts in column {col_b}" 120 | 121 | # Check rows 122 | if len(env.goal_rows) % 2 == 0: # Even number of goal rows 123 | middle_index = len(env.goal_rows) // 2 124 | valid_rows = [env.goal_rows[middle_index - 1], env.goal_rows[middle_index]] 125 | assert row_a in valid_rows, f"Player A should start in row {valid_rows[0]} or {valid_rows[1]}, but starts in row {row_a}" 126 | assert row_b in valid_rows, f"Player B should start in row {valid_rows[0]} or {valid_rows[1]}, but starts in row {row_b}" 127 | assert row_a != row_b, f"Players should not start in the same row, but both start in row {row_a}" 128 | else: # Odd number of goal rows 129 | middle_row = env.goal_rows[len(env.goal_rows) // 2] 130 | assert row_a == middle_row, f"Player A should start in middle row {middle_row}, but starts in row {row_a}" 131 | assert row_b == middle_row, f"Player B should start in middle row {middle_row}, but starts in row {row_b}" 132 | 133 | # Check possession 134 | assert possession in [0, 1], f"Possession should be 0 or 1, but is {possession}" 135 | 136 | # Check if the count is approximately equal to the expected count 137 | assert np.isclose(count, expected_count, rtol=rtol), \ 138 | f"State {state} appeared {count} times, expected close to {expected_count}" 139 | 140 | # Check number of initial states 141 | if len(env.goal_rows) % 2 == 0: 142 | expected_states = 4 # Two row combinations, two possession states 143 | else: 144 | expected_states = 2 # One middle row, two possession states 145 | 146 | assert total_states == expected_states, f"Expected {expected_states} initial states, but got {total_states}" 147 | 148 | # Check that the empirical probabilities are close to the expected probability 149 | observed = np.array(list(state_counts.values())) 150 | empirical_probs = observed / n_samples 151 | assert np.allclose(empirical_probs, expected_prob, rtol=rtol), \ 152 | f"Empirical probabilities {empirical_probs} not close to expected {expected_prob}" 153 | 154 | # Check for uniformity using coefficient of variation 155 | cv = np.std(observed) / np.mean(observed) 156 | assert cv < 0.05, f"Distribution not uniform enough. Coefficient of variation: {cv:.3f}" 157 | 158 | 159 | def test_singleagent_a(): 160 | from gym import spaces 161 | width = 5 162 | height = 4 163 | slip_prob = 0.2 164 | n_states = 761 # 4x5 field 165 | n_actions = 5 166 | random_policy = {} 167 | for s in range(n_states): 168 | random_policy[s] = np.random.randint(0, n_actions) 169 | 170 | env = SoccerSimultaneousEnv(width=width, height=height, slip_prob=slip_prob, player_a_policy=None, player_b_policy=random_policy) 171 | assert not env.multiagent, "Environment should not be multiagent, one policy was provided." 172 | 173 | # Check that the observation space is Dict 174 | assert isinstance(env.observation_space, spaces.Dict), "Observation space should be a dictionary." 175 | assert env.observation_space['player_a'].n == n_states, "Observation space should have the correct number of states." 
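    # Extra sanity check (illustrative addition): n_states (761 for the 5x4 pitch) counts
    # only the reachable position/possession combinations plus the single terminal state,
    # since unreachable combinations are pruned when the environment builds its state
    # space, so any sampled observation index should fall inside that range.
    sampled_obs = env.observation_space['player_a'].sample()
    assert 0 <= sampled_obs < n_states, "Sampled observations should be valid state indices."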
176 | assert 'player_b' not in env.observation_space, "Observation space should not contain player_b." 177 | 178 | # Check that the action space is Dict 179 | assert isinstance(env.action_space, spaces.Dict), "Action space should be a dictionary." 180 | assert env.action_space['player_a'].n == n_actions, "Action space should have the correct number of actions." 181 | assert 'player_b' not in env.action_space, "Action space should not contain player_b." 182 | 183 | obs, info = env.reset() 184 | assert isinstance(obs, dict), "Observation should be a dictionary, single agent mode." 185 | assert 'player_a' in obs, "Observation should contain player_a." 186 | assert 'player_b' not in obs, "Observation should not contain player_b." 187 | assert 0 <= obs['player_a'] < n_states, "Observation should be a state index." 188 | assert isinstance(info, dict), "Info should be a dictionary." 189 | assert 'player_a' in info, "Info should contain player_a." 190 | assert 'player_b' not in info, "Info should not contain player_b." 191 | 192 | random_action = np.random.randint(0, n_actions) 193 | obs, reward, terminated, truncated, info = env.step({'player_a': random_action}) 194 | assert isinstance(obs, dict), "Observation should be a dictionary, single agent mode." 195 | assert 'player_a' in obs, "Observation should contain player_a." 196 | assert 'player_b' not in obs, "Observation should not contain player_b." 197 | assert 0 <= obs['player_a'] < n_states, "Observation should be a state index." 198 | assert isinstance(reward, dict), "Reward should be a dictionary." 199 | assert 'player_a' in reward, "Reward should contain player_a." 200 | assert 'player_b' not in reward, "Reward should not contain player_b." 201 | assert isinstance(terminated, dict), "Terminated should be a dictionary." 202 | assert 'player_a' in terminated, "Terminated should contain player_a." 203 | assert 'player_b' not in terminated, "Terminated should not contain player_b." 204 | assert isinstance(truncated, dict), "Truncated should be a dictionary." 205 | assert 'player_a' in truncated, "Truncated should contain player_a." 206 | assert 'player_b' not in truncated, "Truncated should not contain player_b." 207 | assert isinstance(info, dict), "Info should be a dictionary." 208 | assert 'player_a' in info, "Info should contain player_a." 209 | assert 'player_b' not in info, "Info should not contain player_b." 210 | 211 | def test_singleagent_b(): 212 | from gym import spaces 213 | width = 5 214 | height = 4 215 | slip_prob = 0.2 216 | n_states = 761 # 4x5 field 217 | n_actions = 5 218 | random_policy = {} 219 | for s in range(n_states): 220 | random_policy[s] = np.random.randint(0, n_actions) 221 | 222 | env = SoccerSimultaneousEnv(width=width, height=height, slip_prob=slip_prob, player_a_policy=random_policy, player_b_policy=None) 223 | assert not env.multiagent, "Environment should not be multiagent, one policy was provided." 224 | 225 | # Check that the observation space is Dict 226 | assert isinstance(env.observation_space, spaces.Dict), "Observation space should be a dictionary." 227 | assert env.observation_space['player_b'].n == n_states, "Observation space should have the correct number of states." 228 | assert 'player_a' not in env.observation_space, "Observation space should not contain player_a." 229 | 230 | # Check that the action space is Dict 231 | assert isinstance(env.action_space, spaces.Dict), "Action space should be a dictionary." 
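    # The five discrete actions are NOOP, NORTH, SOUTH, EAST and WEST
    # (indices 0-4, see SoccerSimultaneousEnv.ACTION_STRING).
    assert env.action_space['player_b'].contains(SoccerSimultaneousEnv.NOOP), "NOOP should be a valid action."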
232 | assert env.action_space['player_b'].n == n_actions, "Action space should have the correct number of actions." 233 | assert 'player_a' not in env.action_space, "Action space should not contain player_a." 234 | 235 | obs, info = env.reset() 236 | assert isinstance(obs, dict), "Observation should be a dictionary, single agent mode." 237 | assert 'player_b' in obs, "Observation should contain player_b." 238 | assert 'player_a' not in obs, "Observation should not contain player_a." 239 | assert 0 <= obs['player_b'] < n_states, "Observation should be a state index." 240 | assert isinstance(info, dict), "Info should be a dictionary." 241 | assert 'player_b' in info, "Info should contain player_b." 242 | assert 'player_a' not in info, "Info should not contain player_a." 243 | 244 | random_action = np.random.randint(0, n_actions) 245 | obs, reward, terminated, truncated, info = env.step({'player_b': random_action}) 246 | assert isinstance(obs, dict), "Observation should be a dictionary, single agent mode." 247 | assert 'player_b' in obs, "Observation should contain player_b." 248 | assert 'player_a' not in obs, "Observation should not contain player_a." 249 | assert 0 <= obs['player_b'] < n_states, "Observation should be a state index." 250 | assert isinstance(reward, dict), "Reward should be a dictionary." 251 | assert 'player_b' in reward, "Reward should contain player_b." 252 | assert 'player_a' not in reward, "Reward should not contain player_a." 253 | assert isinstance(terminated, dict), "Terminated should be a dictionary." 254 | assert 'player_b' in terminated, "Terminated should contain player_b." 255 | assert 'player_a' not in terminated, "Terminated should not contain player_a." 256 | assert isinstance(truncated, dict), "Truncated should be a dictionary." 257 | assert 'player_b' in truncated, "Truncated should contain player_b." 258 | assert 'player_a' not in truncated, "Truncated should not contain player_a." 259 | assert isinstance(info, dict), "Info should be a dictionary." 260 | assert 'player_b' in info, "Info should contain player_b." 261 | assert 'player_a' not in info, "Info should not contain player_a." 262 | 263 | def test_multiagent(): 264 | from gym import spaces 265 | width = 5 266 | height = 4 267 | slip_prob = 0.2 268 | n_states = 761 # 4x5 field 269 | n_actions = 5 270 | 271 | env = SoccerSimultaneousEnv(width=width, height=height, slip_prob=slip_prob, player_a_policy=None, player_b_policy=None) 272 | assert env.multiagent, "Environment should be multiagent, no policies were provided." 273 | 274 | # Check that the observation space is Dict 275 | assert isinstance(env.observation_space, spaces.Dict), "Observation space should be a dictionary." 276 | assert env.observation_space['player_a'].n == n_states, "Observation space should have the correct number of states for player_a." 277 | assert env.observation_space['player_b'].n == n_states, "Observation space should have the correct number of states for player_b." 278 | 279 | # Check that the action space is Dict 280 | assert isinstance(env.action_space, spaces.Dict), "Action space should be a dictionary." 281 | assert env.action_space['player_a'].n == n_actions, "Action space should have the correct number of actions for player_a." 282 | assert env.action_space['player_b'].n == n_actions, "Action space should have the correct number of actions for player_b." 283 | 284 | obs, info = env.reset() 285 | assert isinstance(obs, dict), "Observation should be a dictionary, multiagent mode." 
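    # Note: in the current implementation both players receive the same global state
    # index as their observation; per-player rotation of the perspective is not
    # implemented yet (see SoccerSimultaneousEnv._state_to_observation).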
286 | assert 'player_a' in obs and 'player_b' in obs, "Observation should contain both player_a and player_b." 287 | assert 0 <= obs['player_a'] < n_states and 0 <= obs['player_b'] < n_states, "Observations should be state indices." 288 | assert isinstance(info, dict), "Info should be a dictionary." 289 | assert 'player_a' in info and 'player_b' in info, "Info should contain both player_a and player_b." 290 | 291 | random_action_a = np.random.randint(0, n_actions) 292 | random_action_b = np.random.randint(0, n_actions) 293 | obs, reward, terminated, truncated, info = env.step({'player_a': random_action_a, 'player_b': random_action_b}) 294 | assert isinstance(obs, dict), "Observation should be a dictionary, multiagent mode." 295 | assert 'player_a' in obs and 'player_b' in obs, "Observation should contain both player_a and player_b." 296 | assert 0 <= obs['player_a'] < n_states and 0 <= obs['player_b'] < n_states, "Observations should be state indices." 297 | assert isinstance(reward, dict), "Reward should be a dictionary." 298 | assert 'player_a' in reward and 'player_b' in reward, "Reward should contain both player_a and player_b." 299 | assert isinstance(reward['player_a'], float) and isinstance(reward['player_b'], float), "Rewards should be floats." 300 | assert isinstance(terminated, dict) and isinstance(terminated['player_a'], bool) and isinstance(terminated['player_b'], bool), "Terminated should be a dictionary with boolean values." 301 | assert isinstance(truncated, dict) and isinstance(truncated['player_a'], bool) and isinstance(truncated['player_b'], bool), "Truncated should be a dictionary with boolean values." 302 | assert isinstance(info, dict) and 'player_a' in info and 'player_b' in info, "Info should be a dictionary with both player_a and player_b." 
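# The value-iteration tests below import value_iteration from gym_soccer.utils.planners,
# which is not shown in this listing. For reference, a minimal sketch that is consistent
# with how these tests call it -- value_iteration(env, theta, discount_factor) returning
# (policy, V, Q, iteration_count) and reading env.P[s][a] as a list of
# (prob, next_state, reward, done) tuples with integer action keys (the single-agent
# form exercised here) -- could look like the helper below. It is an illustrative
# assumption, not the repository's actual planner; it reuses the numpy import at the
# top of this module.
def _value_iteration_sketch(env, theta=1e-10, discount_factor=0.99):
    n_states = len(env.P)
    n_actions = len(env.P[0])
    V = np.zeros(n_states)
    Q = np.zeros((n_states, n_actions))
    iterations = 0
    while True:
        iterations += 1
        for s in range(n_states):
            for a in range(n_actions):
                # Expected return of action a in state s; bootstrap only from
                # non-terminal successors.
                Q[s, a] = sum(prob * (reward + discount_factor * V[next_state] * (not done))
                              for prob, next_state, reward, done in env.P[s][a])
        new_V = Q.max(axis=1)
        if np.max(np.abs(new_V - V)) < theta:
            break
        V = new_V
    policy = np.argmax(Q, axis=1)  # greedy policy, indexable by integer observations
    return policy, new_V, Q, iterations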
303 | 304 | def test_value_iteration_against_stand_policy_for_player_a(): 305 | from gym_soccer.utils.policies import get_stand_policy 306 | from gym_soccer.utils.planners import value_iteration 307 | 308 | 309 | width = 5 310 | height = 4 311 | slip_prob = 0.2 312 | n_states = 761 # 4x5 field 313 | 314 | # Create stand policy for player B 315 | stand_policy = get_stand_policy(n_states) 316 | 317 | # Create the environment with player B using the stand policy 318 | env = SoccerSimultaneousEnv( 319 | width=width, height=height, slip_prob=slip_prob, 320 | player_a_policy=None, player_b_policy=stand_policy 321 | ) 322 | 323 | # Run value iteration to get the optimal policy for player A 324 | optimal_policy, optimal_V, optimal_Q, cc = value_iteration(env, theta=1e-10, discount_factor=0.99) 325 | 326 | # Test the optimal policy against the stand policy 327 | n_episodes = 1000 328 | wins = 0 329 | 330 | for _ in range(n_episodes): 331 | obs, _ = env.reset() 332 | done = False 333 | while not done: 334 | action = optimal_policy[obs['player_a']] 335 | obs, reward, terminated, truncated, _ = env.step({'player_a': action}) 336 | done = terminated['player_a'] or truncated['player_a'] 337 | if terminated['player_a'] and reward['player_a'] > 0: 338 | wins += 1 339 | 340 | win_rate = wins / n_episodes 341 | assert win_rate == 1.0, f"Expected 100% win rate, but got {win_rate * 100}%" 342 | 343 | 344 | def test_value_iteration_against_random_policy_for_player_a(): 345 | from gym_soccer.utils.policies import get_random_policy 346 | from gym_soccer.utils.planners import value_iteration 347 | 348 | width = 5 349 | height = 4 350 | slip_prob = 0.2 351 | n_states = 761 # 4x5 field 352 | n_actions = 5 353 | 354 | # Create random policy for player B 355 | random_policy = get_random_policy(n_states, n_actions, seed=42) 356 | 357 | # Create the environment with player B using the random policy 358 | env = SoccerSimultaneousEnv( 359 | width=width, height=height, slip_prob=slip_prob, 360 | player_a_policy=None, player_b_policy=random_policy 361 | ) 362 | 363 | # Run value iteration to get the optimal policy for player A 364 | optimal_policy, optimal_V, optimal_Q, cc = value_iteration(env, theta=1e-10, discount_factor=0.99) 365 | 366 | # Test the optimal policy against the random policy 367 | n_episodes = 1000 368 | wins = 0 369 | 370 | for _ in range(n_episodes): 371 | obs, _ = env.reset() 372 | done = False 373 | while not done: 374 | action = optimal_policy[obs['player_a']] 375 | obs, reward, terminated, truncated, _ = env.step({'player_a': action}) 376 | done = terminated['player_a'] or truncated['player_a'] 377 | if terminated['player_a'] and reward['player_a'] > 0: 378 | wins += 1 379 | 380 | win_rate = wins / n_episodes 381 | assert win_rate > 0.95, f"Expected win rate > 95%, but got {win_rate * 100}%" 382 | 383 | def test_value_iteration_against_stand_policy_for_player_b(): 384 | from gym_soccer.utils.policies import get_stand_policy 385 | from gym_soccer.utils.planners import value_iteration 386 | 387 | width = 5 388 | height = 4 389 | slip_prob = 0.2 390 | n_states = 761 # 4x5 field 391 | 392 | # Create stand policy for player A 393 | stand_policy = get_stand_policy(n_states) 394 | 395 | # Create the environment with player A using the stand policy 396 | env = SoccerSimultaneousEnv( 397 | width=width, height=height, slip_prob=slip_prob, 398 | player_a_policy=stand_policy, player_b_policy=None 399 | ) 400 | 401 | # Run value iteration to get the optimal policy for player B 402 | optimal_policy, optimal_V, 
optimal_Q, cc = value_iteration(env, theta=1e-10, discount_factor=0.99) 403 | 404 | # Test the optimal policy against the stand policy 405 | n_episodes = 1000 406 | wins = 0 407 | 408 | for _ in range(n_episodes): 409 | obs, _ = env.reset() 410 | done = False 411 | while not done: 412 | action = optimal_policy[obs['player_b']] 413 | obs, reward, terminated, truncated, _ = env.step({'player_b': action}) 414 | done = terminated['player_b'] or truncated['player_b'] 415 | if terminated['player_b'] and reward['player_b'] > 0: 416 | wins += 1 417 | 418 | win_rate = wins / n_episodes 419 | assert win_rate == 1.0, f"Expected 100% win rate, but got {win_rate * 100}%" 420 | 421 | def test_value_iteration_against_random_policy_for_player_b(): 422 | from gym_soccer.utils.policies import get_random_policy 423 | from gym_soccer.utils.planners import value_iteration 424 | 425 | width = 5 426 | height = 4 427 | slip_prob = 0.2 428 | n_states = 761 # 4x5 field 429 | n_actions = 5 430 | 431 | # Create random policy for player A 432 | random_policy = get_random_policy(n_states, n_actions, seed=42) 433 | 434 | # Create the environment with player A using the random policy 435 | env = SoccerSimultaneousEnv( 436 | width=width, height=height, slip_prob=slip_prob, 437 | player_a_policy=random_policy, player_b_policy=None 438 | ) 439 | 440 | # Run value iteration to get the optimal policy for player B 441 | optimal_policy, optimal_V, optimal_Q, cc = value_iteration(env, theta=1e-10, discount_factor=0.99) 442 | 443 | # Test the optimal policy against the random policy 444 | n_episodes = 1000 445 | wins = 0 446 | 447 | for _ in range(n_episodes): 448 | obs, _ = env.reset() 449 | done = False 450 | while not done: 451 | action = optimal_policy[obs['player_b']] 452 | obs, reward, terminated, truncated, _ = env.step({'player_b': action}) 453 | done = terminated['player_b'] or truncated['player_b'] 454 | if terminated['player_b'] and reward['player_b'] > 0: 455 | wins += 1 456 | 457 | win_rate = wins / n_episodes 458 | assert win_rate > 0.95, f"Expected win rate > 95%, but got {win_rate * 100}%" 459 | -------------------------------------------------------------------------------- /gym_soccer/envs/soccer_simultaneous_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.toy_text.utils import categorical_sample 3 | from gym import spaces 4 | 5 | class SoccerSimultaneousEnv: 6 | # Define constants for actions 7 | 8 | NOOP = 0 9 | NORTH = 1 10 | SOUTH = 2 11 | EAST = 3 12 | WEST = 4 13 | ACTION_STRING = ['NOOP', 'NORTH', 'SOUTH', 'EAST', 'WEST'] 14 | ACTION_STRING_TO_INT = {k: v for v, k in enumerate(ACTION_STRING)} 15 | 16 | ACTION_STRING_TO_MOVE = { 17 | ACTION_STRING[NOOP]: (0, 0), 18 | ACTION_STRING[NORTH]: (0, -1), 19 | ACTION_STRING[SOUTH]: (0, 1), 20 | ACTION_STRING[EAST]: (1, 0), 21 | ACTION_STRING[WEST]: (-1, 0), 22 | } 23 | MOVE_TO_ACTION_STRING = {v: k for k, v in ACTION_STRING_TO_MOVE.items()} 24 | ACTION_INT_TO_MOVE = { 25 | NOOP: (0, 0), 26 | NORTH: (0, -1), 27 | SOUTH: (0, 1), 28 | EAST: (1, 0), 29 | WEST: (-1, 0), 30 | } 31 | MOVE_TO_ACTION_INT = {v: k for k, v in ACTION_INT_TO_MOVE.items()} 32 | TERMINAL_STATE = (-1, -1, -1, -1, -1) 33 | 34 | 35 | def __init__(self, width=5, height=4, slip_prob=0.0, player_a_policy=None, player_b_policy=None, seed=0): 36 | 37 | # Assert that both policies cannot be set simultaneously 38 | assert not (player_a_policy is not None and player_b_policy is not None), "Both players cannot have a policy. 
At least one must be None." 39 | # if player_a_policy is not None: 40 | # assert isinstance(player_a_policy, dict), "Player A policy must be a dictionary." 41 | # if player_b_policy is not None: 42 | # assert isinstance(player_b_policy, dict), "Player B policy must be a dictionary." 43 | 44 | # Minimum pitch size is 5x4 45 | assert width >= 5, "Width must be at least 5 columns." 46 | assert height >= 4, "Height must be at least 4 rows." 47 | 48 | self.width = width + 2 # +2 for the columns where goals are located 49 | self.height = height 50 | self.slip_prob = slip_prob 51 | self.seed = seed 52 | self.player_a_policy = player_a_policy 53 | self.player_b_policy = player_b_policy 54 | self.multiagent = player_a_policy is None and player_b_policy is None 55 | self.return_agent = ['player_a', 'player_b'] if self.multiagent else ['player_a'] \ 56 | if player_a_policy is None else ['player_b'] 57 | self.np_random = np.random.RandomState() 58 | self.np_random.seed(self.seed) 59 | 60 | self.goal_rows = ((self.height - 1) // 2, self.height // 2) if self.height % 2 == 0 else (self.height // 2 - 1, self.height // 2, self.height // 2 + 1) 61 | self.goal_cols = (0, self.width - 1) 62 | 63 | self.unreachable_states, self.goal_states = [], {} # containing rewards for player A 64 | self.state_space, self.nS = {}, 1 65 | self.state_space[self.TERMINAL_STATE] = 0 # initialize the terminal state 66 | for xa in range(self.height): 67 | for ya in range(self.width): 68 | for xb in range(self.height): 69 | for yb in range(self.width): 70 | for p in range(2): 71 | state_tuple = (xa, ya, xb, yb, p) 72 | 73 | # Top/bottom left/right corners (goal columns but not goal) 74 | if ya in self.goal_cols and xa not in self.goal_rows or \ 75 | yb in self.goal_cols and xb not in self.goal_rows: 76 | self.unreachable_states.append(state_tuple) 77 | continue 78 | 79 | # Goals without possession 80 | if xa in self.goal_rows and ya in self.goal_cols and p != 0 or \ 81 | xb in self.goal_rows and yb in self.goal_cols and p != 1: 82 | self.unreachable_states.append(state_tuple) 83 | continue 84 | 85 | # Players occupy the same cell 86 | if xa == xb and ya == yb: 87 | self.unreachable_states.append(state_tuple) 88 | continue 89 | 90 | # Terminal states, goals (with possession) 91 | if xa in self.goal_rows and ya in self.goal_cols and p == 0 or \ 92 | xb in self.goal_rows and yb in self.goal_cols and p == 1: 93 | # Goal for player A, or player B own goal 94 | ga = p == 0 and xa in self.goal_rows and ya == self.width - 1 or \ 95 | p == 1 and xb in self.goal_rows and yb == self.width - 1 96 | # Goal for player B, or player A own goal 97 | gb = p == 1 and xb in self.goal_rows and yb == 0 or \ 98 | p == 0 and xa in self.goal_rows and ya == 0 99 | 100 | assert ga or gb, "At least one goal must have been scored to be here" 101 | assert not (ga and gb), "We cannot have both goals scored" 102 | self.goal_states[state_tuple] = 1.0 if ga else -1.0 if gb else 0.0 103 | continue 104 | 105 | self.state_space[state_tuple] = self.nS 106 | self.nS += 1 107 | 108 | assert self.nS == len(self.state_space), "State space should be the same length as the number of states" 109 | self._reverse_state_space = {v: k for k, v in self.state_space.items()} 110 | # Initialize the state space and action space 111 | # self.n_states = self.width * self.height * self.width * self.height * 2 # width * height * width * height * 2 (possession) 112 | self.nA = len(self.ACTION_STRING) # Actions: UP, DOWN, LEFT, RIGHT, STAND 113 | 114 | # # TODO: this is a test, remove it 
115 | # # Generate a random policy for player B 116 | # import pickle 117 | # self.player_b_policy = {} 118 | # for s in range(self.n_states): 119 | # self.player_b_policy[s] = self.np_random.randint(0, self.n_actions) 120 | # self.multiagent = False 121 | # # Save dictionary to a file 122 | # with open('random_policy_5x4.pkl', 'wb') as f: 123 | # pickle.dump(self.player_b_policy, f) 124 | 125 | # Update observation space to be Discrete 126 | self.observation_space = spaces.Dict({ 127 | a: spaces.Discrete(self.nS) for a in self.return_agent 128 | }) 129 | self.action_space = spaces.Dict({ 130 | a: spaces.Discrete(self.nA) for a in self.return_agent 131 | }) 132 | 133 | # Define the initial state distribution 134 | self.isd = self._generate_isd() 135 | 136 | # Define transition dynamics and create observation cache 137 | self.P, self.P_readable, self.Pmat, self.Rmat = self._initialize_transition_dynamics() 138 | 139 | # Add a flag to track if reset has been called 140 | self.needs_reset = True 141 | 142 | # Initialize self.state and self.observations 143 | self.state = None 144 | self.observations = None 145 | 146 | def _generate_isd(self): 147 | distribution = [] 148 | col_a = 2 # Player A starts 2 columns from their goal 149 | col_b = self.width - 3 # Player B starts 2 columns from their goal 150 | 151 | if len(self.goal_rows) % 2 == 0: # Even number of goal rows 152 | middle_index = len(self.goal_rows) // 2 153 | row_options = [self.goal_rows[middle_index - 1], self.goal_rows[middle_index]] 154 | for row_a in row_options: 155 | row_b = row_options[1] if row_a == row_options[0] else row_options[0] 156 | for possession in range(2): # 0: A, 1: B 157 | state = (row_a, col_a, row_b, col_b, possession) 158 | distribution.append((0.25, state)) 159 | else: # Odd number of goal rows 160 | middle_row = self.goal_rows[len(self.goal_rows) // 2] 161 | for possession in range(2): # 0: A, 1: B 162 | state = (middle_row, col_a, middle_row, col_b, possession) 163 | distribution.append((0.5, state)) 164 | 165 | return distribution 166 | 167 | def _initialize_transition_dynamics(self): 168 | P = {} 169 | P_readable = {} 170 | Pmat = np.zeros([self.nS, self.nS, self.nA, self.nA]) if self.multiagent else np.zeros([self.nS, self.nS, self.nA]) 171 | Rmat = np.zeros([self.nS, self.nA, self.nA]) if self.multiagent else np.zeros([self.nS, self.nA]) 172 | 173 | for xa in range(self.height): 174 | for ya in range(self.width): 175 | for xb in range(self.height): 176 | for yb in range(self.width): 177 | for p in range(2): # 0: A, 1: B 178 | st = (xa, ya, xb, yb, p) 179 | if st in self.unreachable_states: 180 | continue # skip unreachable states 181 | 182 | s = self._state_to_observation(st) 183 | P[s] = {} 184 | P_readable[st] = {} 185 | 186 | # All actions integer for a and b, sample a policy if provided 187 | aaa = list(range(self.nA)) if self.player_a_policy is None else [self.player_a_policy[s]] 188 | aab = list(range(self.nA)) if self.player_b_policy is None else [self.player_b_policy[s]] 189 | for aa in aaa: 190 | asa = self.ACTION_STRING[aa] 191 | 192 | for ab in aab: 193 | asb = self.ACTION_STRING[ab] 194 | 195 | # Original joint action, integer and string 196 | ja = (aa, ab) 197 | jas = (asa, asb) 198 | 199 | transitions = [] 200 | transitions_readable = [] 201 | 202 | # Calculate intended moves for a and b, as well as orthogonal slips 203 | ma = self.ACTION_INT_TO_MOVE[aa] 204 | mb = self.ACTION_INT_TO_MOVE[ab] 205 | mas = [(-ma[1], ma[0]), (ma[1], -ma[0])] 206 | mbs = [(-mb[1], mb[0]), (mb[1], -mb[0])] 207 
| 208 | # All move combinations to consider 209 | amc = [ 210 | # No slip 211 | (ma, mb, (1 - self.slip_prob) * (1 - self.slip_prob)), 212 | # B slips, A does not 213 | (ma, mbs[0], (1 - self.slip_prob) * self.slip_prob * 0.5), 214 | (ma, mbs[1], (1 - self.slip_prob) * self.slip_prob * 0.5), 215 | # A slips, B does not 216 | (mas[0], mb, self.slip_prob * (1 - self.slip_prob) * 0.5), 217 | (mas[1], mb, self.slip_prob * (1 - self.slip_prob) * 0.5), 218 | # Both slip 219 | (mas[0], mbs[0], self.slip_prob * self.slip_prob * 0.25), 220 | (mas[0], mbs[1], self.slip_prob * self.slip_prob * 0.25), 221 | (mas[1], mbs[0], self.slip_prob * self.slip_prob * 0.25), 222 | (mas[1], mbs[1], self.slip_prob * self.slip_prob * 0.25), 223 | ] 224 | 225 | for ma, mb, mp in amc: 226 | if mp == 0: 227 | continue # remove zero probability transitions 228 | 229 | # Joint move action 230 | jma = (ma, mb) 231 | 232 | # Get all next state possible outcomes for the action, and move (slip) 233 | nso = self._get_next_state(st, ja, jma) 234 | for nsp, ns in nso: 235 | if st == ns and st in self.goal_states: 236 | d, r = True, 0.0 237 | elif st != ns and ns in self.goal_states: 238 | d, r = True, self.goal_states[ns] 239 | else: 240 | d, r = False, 0.0 241 | p = mp * nsp 242 | # flip reward for player B in single agent case 243 | if not self.multiagent and 'player_b' in self.return_agent: 244 | r = -1 * r 245 | transitions.append(( 246 | p, # probability of the move (slip), and next_state 247 | self._state_to_observation(ns), # next state 248 | r, # reward 249 | d # done 250 | )) 251 | transitions_readable.append(( 252 | p, # probability of the move (slip), and next_state 253 | ns, # next state 254 | r, # reward 255 | d # done 256 | )) 257 | # if we need to account for joint actions 258 | if self.multiagent: 259 | P[s][ja] = transitions 260 | Rmat[s][ja[0]][ja[1]] = 0 # Initialize reward to 0 261 | for prob, next_state, reward, done in transitions: 262 | Pmat[s][next_state][ja[0]][ja[1]] += prob 263 | Rmat[s][ja[0]][ja[1]] += prob * reward # Weighted sum of rewards 264 | P_readable[st][jas] = transitions_readable 265 | # if we need to account for individual actions a and b 266 | elif self.player_a_policy is None and self.player_b_policy is not None: 267 | P[s][aa] = transitions 268 | Rmat[s][aa] = 0 # Initialize reward to 0 269 | for prob, next_state, reward, done in transitions: 270 | Pmat[s][next_state][aa] += prob 271 | Rmat[s][aa] += prob * reward # Weighted sum of rewards 272 | P_readable[st][asa] = transitions_readable 273 | elif self.player_b_policy is None and self.player_a_policy is not None: 274 | P[s][ab] = transitions 275 | Rmat[s][ab] = 0 # Initialize reward to 0 276 | for prob, next_state, reward, done in transitions: 277 | Pmat[s][next_state][ab] += prob 278 | Rmat[s][ab] += prob * reward # Weighted sum of rewards 279 | P_readable[st][asb] = transitions_readable 280 | # error case 281 | else: 282 | raise ValueError("No policy provided for both players, but action is an integer") 283 | 284 | # Assert that probabilities sum to 1 285 | tp = sum(t[0] for t in transitions) 286 | assert abs(tp - 1.0) < 1e-6, \ 287 | f"Probabilities do not sum to 1 for state {st}, actions {aa}, {ab}. 
Sum: {tp}" 288 | 289 | # P is the compact representation of the transition dynamics 290 | # P_readable is the same but with the states represented as tuples 291 | # P_readable terminal states are the tuples that are in goal_states 292 | # P has 0 as the terminal states 293 | return P, P_readable, Pmat, Rmat 294 | 295 | 296 | def _get_next_state(self, st, ja, jma): 297 | xa, ya, xb, yb, p = st 298 | 299 | # terminal states 300 | if st in self.goal_states: 301 | return [(1.0, st)] 302 | 303 | # original action integers and move action (including slips) 304 | aa, ab = ja 305 | maa, mab = jma 306 | 307 | # Get potential next positions based on move actions and ball possession 308 | nxa, nya = self._next_cell(xa, ya, maa, p == 0) 309 | nxb, nyb = self._next_cell(xb, yb, mab, p == 1) 310 | 311 | # Handle collisions and possession changes 312 | nso = [] 313 | 314 | # Collision case 1: Players moving through each other 315 | if (xa == xb and 316 | abs(ya - yb) == 1 and 317 | nya == yb and 318 | nyb == ya) or \ 319 | (ya == yb and 320 | abs(xa - xb) == 1 and 321 | nxa == xb and 322 | nxb == xa): 323 | 324 | # Players stay in their original positions, possession changes randomly 325 | assert not (xa == xb and ya == yb), "Players should not be in the same cell" 326 | nso.append((0.5, (xa, ya, xb, yb, 0))) # A gets possession 327 | nso.append((0.5, (xa, ya, xb, yb, 1))) # B gets possession 328 | 329 | # Collision case 2: One player moves into the opponent's cell, the opponent stands 330 | elif (nxa == xb and nya == yb and ab == self.NOOP) or \ 331 | (nxb == xa and nyb == ya and aa == self.NOOP): 332 | 333 | # Nobody moves, they bounce back to their original location. Possession is changed. 334 | assert not (xa == xb and ya == yb), "Players should not be in the same cell" 335 | nso.append((1.0, (xa, ya, xb, yb, 1 - p))) 336 | 337 | # Collision case 3: Players moving to the same cell through a bounce 338 | elif (xa == nxa and ya == nya and aa != self.NOOP and nxb == xa and nyb == ya) or \ 339 | (xb == nxb and yb == nyb and ab != self.NOOP and nxa == xb and nya == yb): 340 | 341 | # Bounce back both players, random possession 342 | assert not (xa == xb and ya == yb), "Players should not be in the same cell" 343 | nso.append((0.5, (xa, ya, xb, yb, 0))) 344 | nso.append((0.5, (xa, ya, xb, yb, 1))) 345 | 346 | # Collision case 4: Players moving to the same empty cell 347 | elif nxa == nxb and nya == nyb: 348 | assert not (xa == nxb and ya == nyb), "Players should not be in the same cell" 349 | assert not (nxa == xb and nya == yb), "Players should not be in the same cell" 350 | 351 | # Bounce back player a, player b moves, random possession 352 | nso.append((0.25, (xa, ya, nxb, nyb, 0))) 353 | nso.append((0.25, (xa, ya, nxb, nyb, 1))) 354 | # Bounce back player b, player a moves, random possession 355 | nso.append((0.25, (nxa, nya, xb, yb, 0))) 356 | nso.append((0.25, (nxa, nya, xb, yb, 1))) 357 | else: 358 | # No collision: players move to their new positions 359 | assert not (nxa == nxb and nya == nyb), "Players should not be in the same cell" 360 | nso.append((1.0, (nxa, nya, nxb, nyb, p))) 361 | 362 | return nso 363 | 364 | def _next_cell(self, x, y, ma, p): 365 | nx = max(0, min(self.height - 1, x + ma[1])) # Clamp to pitch height boundaries 366 | ny = y + ma[0] # assume the move in y 367 | 368 | # Revert x edges unless there is a goal (and not out of bounds) 369 | xoob = ny == 0 or ny == self.width - 1 370 | goal = xoob and nx in self.goal_rows and p # has possession 371 | if xoob and not goal: 372 | 
ny = y # Bounce back 373 | return nx, ny 374 | 375 | def step(self, action): 376 | assert not self.needs_reset, "Please reset the environment before taking a step" 377 | assert isinstance(action, dict), "Action must be a dictionary" 378 | assert len(action) == 1 or len(action) == 2, "Action must be a dictionary of length 1 or 2" 379 | assert self.multiagent or self.player_a_policy is not None or self.player_b_policy is not None, "Multiagent environment or policy for one player must be provided" 380 | assert self.player_a_policy is not None or 'player_a' in action, "A policy for player_a must be provided" 381 | assert self.player_b_policy is not None or 'player_b' in action, "A policy for player_b must be provided" 382 | 383 | only_agent = None 384 | if self.multiagent: 385 | assert (isinstance(action, dict) and len(action) == 2), "Action must be a dictionary of length 2 for multiagent case" 386 | assert 'player_a' in action and 'player_b' in action, "Action must contain both 'player_a' and 'player_b'" 387 | else: 388 | assert (isinstance(action, dict) and len(action) == 1), "Action must be a dictionary of length 1 for single agent case" 389 | assert 'player_a' in action or 'player_b' in action, "Action must contain either 'player_a' or 'player_b'" 390 | assert not ('player_a' in action and 'player_b' in action), "Action must contain only one of 'player_a' or 'player_b'" 391 | only_agent = 'player_a' if self.player_a_policy is None else 'player_b' 392 | 393 | action_readable = (self.ACTION_STRING[action['player_a']], self.ACTION_STRING[action['player_b']]) if self.multiagent else self.ACTION_STRING[action[only_agent]] 394 | transitions = self.P_readable[self.state][action_readable] 395 | i = categorical_sample([t[0] for t in transitions], self.np_random) 396 | prob, self.state, reward, done = transitions[i] 397 | self.observations = {a: self._state_to_observation(self.state) for a in self.return_agent} 398 | self.lastaction = action 399 | self.timestep += 1 400 | rewards = {a: reward for a in self.return_agent} 401 | if self.multiagent: 402 | rewards['player_b'] *= -1 403 | dones = {a: done for a in self.return_agent} 404 | truncateds = {a: self.timestep >= 100 for a in self.return_agent} 405 | infos = {a: {"p": np.round(prob, 2)} for a in self.return_agent} 406 | self.needs_reset = any(dones.values()) or any(truncateds.values()) 407 | 408 | return self.observations, rewards, dones, truncateds, infos 409 | 410 | def reset(self, seed=None, options=None): 411 | if seed is not None: 412 | self.np_random.seed(seed) 413 | 414 | i = categorical_sample([is_[0] for is_ in self.isd], self.np_random) 415 | p, self.state = self.isd[i] 416 | # currently the integer representation of the state 417 | # later we need the rotation, then integer (both player "see" the same perspective) 418 | # also this observation is the same integer for both, later it won't 419 | self.observations = {a: self._state_to_observation(self.state) for a in self.return_agent} 420 | infos = {a: {"p": np.round(p, 2)} for a in self.return_agent} 421 | self.lastaction = None 422 | self.needs_reset = False 423 | self.timestep = 0 424 | return self.observations, infos 425 | 426 | def render(self): 427 | # Use self.state directly (it's already a dictionary) 428 | print(self.state) 429 | xa, ya, xb, yb, p = self.state 430 | 431 | # Print player positions 432 | print(f"Player A position: x={xa}, y={ya}, possession={p==0}") 433 | print(f"Player B position: x={xb}, y={yb}, possession={p==1}") 434 | 435 | # Create the pitch 436 | pitch = 
[[' ' for _ in range(self.width)] for _ in range(self.height)] 437 | 438 | # Add players and ball possession 439 | pitch[xa][ya] = 'A' + ('*' if p == 0 else ' ') 440 | pitch[xb][yb] = 'B' + ('*' if p == 1 else ' ') 441 | 442 | rendered_pitch = [] 443 | rendered_pitch.append(' ' + '-' * (self.width * 2 - 4)) 444 | for ri, r in enumerate(pitch): 445 | if ri in self.goal_rows: 446 | if '*' in r[0]: 447 | rendered_pitch.append(''.join(f'{cell:<2}' for cell in r[0:-1]) + '||') 448 | elif '*' in r[-1]: 449 | rendered_pitch.append('||' + ''.join(f'{cell:<2}' for cell in r[1:])) 450 | else: 451 | rendered_pitch.append('||' + ''.join(f'{cell:<2}' for cell in r[1:-1]) + '||') 452 | else: 453 | rendered_pitch.append(' |' + ''.join(f'{cell:<2}' for cell in r[1:-1]) + '| ') 454 | rendered_pitch.append(' ' + '-' * (self.width * 2 - 4)) 455 | 456 | # Print the entire pitch 457 | for r in rendered_pitch: 458 | print(r) 459 | 460 | # Print additional information 461 | print(f"Ball possession: {'A' if p == 0 else 'B'}") 462 | if self.lastaction and self.multiagent: 463 | action_a, action_b = self.lastaction.values() 464 | print(f"Last actions: A: {self.ACTION_STRING[action_a]}, B: {self.ACTION_STRING[action_b]}") 465 | elif self.lastaction and not self.multiagent: 466 | if self.player_a_policy is None: 467 | action_a = self.lastaction['player_a'] 468 | print(f"Last action: A: {self.ACTION_STRING[action_a]}") 469 | elif self.player_b_policy is None: 470 | action_b = self.lastaction['player_b'] 471 | print(f"Last action: B: {self.ACTION_STRING[action_b]}") 472 | else: 473 | raise ValueError("No policy provided for both players, but action is an integer") 474 | 475 | # Check for goal or own goal 476 | if p == 0: # Player A has the ball 477 | if ya == 0 and xa in self.goal_rows: 478 | print("OWN GOAL! Player A scored in their own goal!") 479 | elif ya == self.width - 1 and xa in self.goal_rows: 480 | print("GOAL! Player A scored!") 481 | else: # Player B has the ball 482 | if yb == 0 and xb in self.goal_rows: 483 | print("GOAL! Player B scored!") 484 | elif yb == self.width - 1 and xb in self.goal_rows: 485 | print("OWN GOAL! 
Player B scored in their own goal!") 486 | 487 | def _state_to_observation(self, state): 488 | # This function later should rotate the observations 489 | # so that both players see the same perspective 490 | # currently it's they see the global game state 491 | # the problem is the a players trained to solve player a's perspective 492 | # cannot perform on player b's perspective 493 | state = self.TERMINAL_STATE if state in self.goal_states else state 494 | return self.state_space[state] 495 | 496 | def _observation_to_state(self, observation): 497 | return self._reverse_state_space[observation] 498 | 499 | def main(): 500 | n_states = 761 # 5x4 field 501 | # n_states = 11705 # 11x7 field 502 | n_actions = 5 503 | import time 504 | from gym_soccer.utils.policies import get_random_policy, get_stand_policy 505 | from gym_soccer.utils.planners import value_iteration, policy_iteration, modified_policy_iteration 506 | 507 | random_policy = get_random_policy(n_states, n_actions, seed=0) 508 | stand_policy = get_stand_policy(n_states) 509 | player_b_policy = random_policy 510 | 511 | # Create the environment 512 | # env = SoccerSimultaneousEnv( 513 | # width=5, height=4, slip_prob=0.2, 514 | # player_a_policy=None, player_b_policy=None) 515 | # env = SoccerSimultaneousEnv( 516 | # width=11, height=7, slip_prob=0.2, 517 | # player_a_policy=None, player_b_policy=player_b_policy) 518 | env = SoccerSimultaneousEnv( 519 | width=5, height=4, slip_prob=0.2, 520 | player_a_policy=None, player_b_policy=player_b_policy) 521 | # env = SoccerSimultaneousEnv( 522 | # width=5, height=4, slip_prob=0.2, 523 | # player_a_policy=player_b_policy, player_b_policy=None) 524 | 525 | k_1 = 1 526 | k_2 = 10000000 527 | theta = 1e-10 528 | discount_factor = 0.99 529 | # Value iteration 530 | vi_time = time.time() 531 | vi_br_pi, vi_br_V, vi_br_Q, vi_cc = value_iteration(env, theta=theta, discount_factor=discount_factor) 532 | vi_time = time.time() - vi_time 533 | print("Value iteration converged in {} iterations in {:.2f} seconds".format(vi_cc, vi_time)) 534 | 535 | # Policy iteration 536 | pi_time = time.time() 537 | pi_br_pi, pi_br_V, pi_br_Q, pi_cc = policy_iteration(env, theta=theta, discount_factor=discount_factor) 538 | pi_time = time.time() - pi_time 539 | print("Policy iteration converged in {} iterations in {:.2f} seconds".format(pi_cc, pi_time)) 540 | 541 | # Modified policy iteration, 1 pass for each policy evaluation 542 | mpi_1_time = time.time() 543 | mpi_1_br_pi, mpi_1_br_V, mpi_1_br_Q, mpi_1_cc = modified_policy_iteration(env, k=k_1, theta=theta, discount_factor=discount_factor) 544 | mpi_1_time = time.time() - mpi_1_time 545 | print("Modified policy iteration (k={}) converged in {} iterations in {:.2f} seconds".format(k_1, mpi_1_cc, mpi_1_time)) 546 | 547 | # Modified policy iteration, infinite passes for each policy evaluation 548 | mpi_2_time = time.time() 549 | mpi_2_br_pi, mpi_2_br_V, mpi_2_br_Q, mpi_2_cc = modified_policy_iteration(env, k=k_2, theta=theta, discount_factor=discount_factor) 550 | mpi_2_time = time.time() - mpi_2_time 551 | print("Modified policy iteration (k={}) converged in {} iterations in {:.2f} seconds".format(k_2, mpi_2_cc, mpi_2_time)) 552 | 553 | # Check if all policies are the same 554 | assert np.all(vi_br_pi == pi_br_pi), "Value iteration and policy iteration should converge to the same policy" 555 | assert np.all(vi_br_pi == mpi_1_br_pi), "Value iteration and modified policy iteration should converge to the same policy" 556 | assert np.all(vi_br_pi == mpi_2_br_pi), "Value 
iteration and modified policy iteration should converge to the same policy" 557 | 558 | # Check if all value functions are the same 559 | assert np.allclose(vi_br_V, pi_br_V), "Value iteration and policy iteration should converge to the same value function" 560 | assert np.allclose(vi_br_V, mpi_1_br_V), "Value iteration and modified policy iteration should converge to the same value function" 561 | assert np.allclose(vi_br_V, mpi_2_br_V), "Value iteration and modified policy iteration should converge to the same value function" 562 | 563 | # Check if all Q-functions are the same 564 | assert np.allclose(vi_br_Q, pi_br_Q), "Value iteration and policy iteration should converge to the same Q-function" 565 | assert np.allclose(vi_br_Q, mpi_1_br_Q), "Value iteration and modified policy iteration should converge to the same Q-function" 566 | assert np.allclose(vi_br_Q, mpi_2_br_Q), "Value iteration and modified policy iteration should converge to the same Q-function" 567 | print("All algorithms converged to the same result.") 568 | 569 | n_episodes = 1000 570 | rewards, steps = [], [] 571 | for i in range(n_episodes): 572 | 573 | # Reset the environment 574 | os, fs = env.reset() 575 | rewards.append(0) 576 | steps.append(0) 577 | all_done = False 578 | while not all_done: 579 | 580 | # Render the environment 581 | if i == n_episodes - 1: 582 | env.render() 583 | 584 | # Select random actions for both players 585 | # action_a = env.action_space['player_a'].sample() 586 | # action_b = env.action_space.sample() 587 | action_a = vi_br_pi[os['player_a']] 588 | # action_a = vi_br_pi[os['player_b']] 589 | # action_a = env.EAST 590 | 591 | # Take a step in the environment 592 | # observation, reward, done, truncated, info = env.step({'player_a': action_a, 'player_b': action_b}) 593 | os, rs, ds, ts, fs = env.step({'player_a': action_a}) 594 | rewards[-1] += rs['player_a'] 595 | 596 | all_done = any(ds.values()) or any(ts.values()) 597 | if i == n_episodes - 1: 598 | print(f"Values after step {steps[-1]}:") 599 | for k, po in os.items(): 600 | print(f"{po}:") 601 | print(f"\tobservation: {os[k]}") 602 | print(f"\treward: {rs[k]}") 603 | print(f"\tdone: {ds[k]}") 604 | print(f"\ttruncated: {ts[k]}") 605 | print(f"\tinfo: {fs[k]}") 606 | 607 | steps[-1] += 1 608 | 609 | if i == n_episodes - 1: 610 | # Render the final state 611 | env.render() 612 | 613 | print(f"All {n_episodes} episodes finished with average reward {np.mean(rewards)} and average steps {np.mean(steps)}.") 614 | 615 | 616 | if __name__ == "__main__": 617 | main() --------------------------------------------------------------------------------
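A closing note on the dense matrices built in _initialize_transition_dynamics: in the single-agent configuration Pmat has shape [nS, nS, nA] (indexed [s, s', a]) and Rmat has shape [nS, nA] with expected immediate rewards already folded in, so a planner can run the Bellman optimality backup in vectorized form. The sketch below illustrates that idea; full_backup is a hypothetical helper written for this note, not part of the repository, and the iteration budget is arbitrary.

```python
import numpy as np

from gym_soccer.envs.soccer_simultaneous_env import SoccerSimultaneousEnv
from gym_soccer.utils.policies import get_random_policy


def full_backup(env, V, discount_factor=0.99):
    """One synchronous Bellman optimality backup over the dense matrices.

    Assumes the single-agent configuration: env.Pmat is [nS, nS, nA] with
    Pmat[s, s', a] = P(s' | s, a) and env.Rmat is [nS, nA] holding the
    expected immediate reward of (s, a).
    """
    # Q[s, a] = R[s, a] + gamma * sum_s' P[s, s', a] * V[s']
    Q = env.Rmat + discount_factor * np.einsum('sta,t->sa', env.Pmat, V)
    return Q.max(axis=1), Q


# Hypothetical usage: best response of player A against a fixed random opponent.
env = SoccerSimultaneousEnv(width=5, height=4, slip_prob=0.2,
                            player_b_policy=get_random_policy(seed=0))
V = np.zeros(env.nS)
for _ in range(5000):
    new_V, Q = full_backup(env, V)
    if np.max(np.abs(new_V - V)) < 1e-10:
        break
    V = new_V
greedy_policy = np.argmax(Q, axis=1)
```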