├── elsciRL ├── agents │ ├── __init__.py │ ├── clean_rl │ │ └── __init__.py │ ├── LLM_agents │ │ └── agent_modelfiles │ │ │ └── llama3_2.modelfile │ ├── random_agent.py │ ├── agent_abstract.py │ ├── stable_baselines │ │ ├── SB3_DQN.py │ │ ├── SB3_PPO.py │ │ └── SB3_A2C.py │ └── DQN.py ├── examples │ ├── placeholder.png │ ├── sailing_setup.png │ ├── local_configs │ │ ├── gym_frozenlake_config_local.py │ │ └── sailing_config_local.py │ ├── Readme.md │ ├── experiment_config.py │ ├── adapters │ │ ├── gym_frozenlake_default.py │ │ ├── gym_frozenlake_language.py │ │ ├── elsciRL_sailing_default.py │ │ └── elsciRL_sailing_language.py │ ├── environments │ │ ├── gym_frozenlake.py │ │ └── elsciRL_sailing.py │ └── DemoExperiment.py ├── analysis │ ├── tabular_output.py │ ├── combined_tabular_results.py │ └── convergence_measure.py ├── experiments │ ├── experiment_utils │ │ ├── config_utils.py │ │ ├── render_current_results.py │ │ ├── policy_agent_factory.py │ │ ├── result_manager.py │ │ ├── env_manager.py │ │ └── agent_factory.py │ └── training_procedures │ │ ├── policy_gradient.py │ │ └── default_exp_training.py ├── adapters │ ├── __init__.py │ ├── LLM_state_generators │ │ ├── base_prompt.py │ │ └── text_gpt-4.1.py │ └── LLM_logic_generators │ │ ├── adapter_prompt.py │ │ └── ollama_adapter_generator.py ├── application_suite │ ├── search_agent.py │ ├── experiment_agent.py │ └── CACHE_README.md ├── encoders │ ├── encoder_abstract.py │ ├── __init__.py │ ├── poss_actions_encoded.py │ ├── observable_objects_encoded.py │ ├── prior_actions_encoded.py │ ├── poss_state_encoded.py │ └── language_transformers │ │ └── MiniLM_L6v2.py ├── environment_setup │ ├── instruction_reward_wrapper.py │ ├── imports.py │ ├── results_table.py │ ├── elsciRL_info.py │ └── gym_translator.py ├── GUI │ ├── templates │ │ └── _generic_agent_param_form.html │ ├── prerender_encoder.py │ ├── LLM_tools │ │ └── LLM_utils.py │ └── static │ │ └── app_setup.md ├── instruction_following │ ├── instr_utils │ │ └── elsciRL_instr_input.py │ └── LLM_instr_planner │ │ └── LLM_instr_validator.py ├── __init__.py ├── config_local.py ├── config.py └── interaction_loops │ ├── state_search.py │ ├── policy_gradient.py │ └── standard_gym.py ├── .github └── FUNDING.yml ├── requirements.txt ├── pyelsciRL.toml ├── setup.py ├── .gitignore └── tests └── test_policy_gradient_classroom.py /elsciRL/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: pdfosborne 4 | -------------------------------------------------------------------------------- /elsciRL/agents/clean_rl/__init__.py: -------------------------------------------------------------------------------- 1 | from .ppo import CleanRLPPO 2 | 3 | __all__ = ["CleanRLPPO"] 4 | -------------------------------------------------------------------------------- /elsciRL/examples/placeholder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pdfosborne/elsciRL/HEAD/elsciRL/examples/placeholder.png -------------------------------------------------------------------------------- /elsciRL/examples/sailing_setup.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pdfosborne/elsciRL/HEAD/elsciRL/examples/sailing_setup.png
--------------------------------------------------------------------------------
/elsciRL/agents/LLM_agents/agent_modelfiles/llama3_2.modelfile:
--------------------------------------------------------------------------------
1 | FROM llama3.2
2 | 
3 | # Set temperature to 0 for deterministic responses
4 | PARAMETER temperature 0
5 | 
6 | # Set context length to 4000 tokens
7 | PARAMETER num_ctx 4000
8 | 
9 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | flask
2 | numpy
3 | pandas
4 | matplotlib
5 | seaborn
6 | scipy>=1.10.1
7 | torch
8 | tqdm
9 | httpimport
10 | sentence-transformers
11 | gymnasium
12 | stable-baselines3
13 | ollama
14 | openai
15 | markdown
16 | pyboy
--------------------------------------------------------------------------------
/elsciRL/examples/local_configs/gym_frozenlake_config_local.py:
--------------------------------------------------------------------------------
1 | LocalConfigData ={
2 |     "adapter_select": ["Default", "Language"],
3 |     "training_action_cap": 100,
4 |     "testing_action_cap":100,
5 |     "reward_signal": [1,-0.01,-0.1],
6 |     "sub_goal": "None"
7 | }
--------------------------------------------------------------------------------
/elsciRL/examples/Readme.md:
--------------------------------------------------------------------------------
1 | # elsciRL Examples
2 | 
3 | These are designed to be run quickly to test your installation.
4 | 
5 | After installing elsciRL, simply use the following Python commands:
6 | 
7 | ```python
8 | from elsciRL.examples.DemoExperiment import DemoExperiment
9 | 
10 | exp = DemoExperiment()
11 | 
12 | exp.run()
13 | exp.evaluate()
14 | ```
--------------------------------------------------------------------------------
/elsciRL/examples/local_configs/sailing_config_local.py:
--------------------------------------------------------------------------------
1 | LocalConfigData = {
2 |     "env_select":"simple_river",
3 |     "adapter_select": ["Default", "Language"],
4 |     "training_action_cap": 100,
5 |     "testing_action_cap":100,
6 |     "reward_signal": [0.5,0,-0.1],
7 |     "sub_goal": "None",
8 |     "supervised_rewards":"False",
9 |     "y_limit":25,
10 |     "obs_precision":2
11 | }
--------------------------------------------------------------------------------
/elsciRL/analysis/tabular_output.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | 
4 | class TabularOutput:
5 |     def __init__(self, results_data, save_dir):
6 |         self.results_data = results_data
7 |         self.save_dir = save_dir
8 |         self.num_episodes = np.max(results_data['episode'])
9 | 
10 |     def save_results(self):
11 |         pd.DataFrame(self.results_data).to_csv(self.save_dir+'/results.csv')
--------------------------------------------------------------------------------
/pyelsciRL.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "elscirl"
3 | version = "1.0.0"
4 | authors = [
5 |   { name="Philip Osborne", email="pdfosborne@gmail.com" },
6 | ]
7 | description = "Applying the elsciRL architecture to Reinforcement Learning problems."
8 | readme = "README.md" 9 | requires-python = ">=3.11" 10 | classifiers = [ 11 | "Programming Language :: Python :: 3", 12 | "License :: OSI Approved :: Apache-2.0 license", 13 | "Operating System :: OS Independent", 14 | ] 15 | 16 | [project.urls] 17 | "Homepage" = "https://github.com/pdfosborne/elscirl" 18 | "Bug Tracker" = "https://github.com/pdfosborne/elscirl/issues" -------------------------------------------------------------------------------- /elsciRL/examples/experiment_config.py: -------------------------------------------------------------------------------- 1 | ExperimentConfigData = { 2 | "name": "Example Experiment", 3 | "problem_type": "Examples", 4 | 5 | "number_training_episodes": 100, 6 | "number_training_repeats": 5, 7 | "number_training_seeds": 1, 8 | 9 | "test_agent_type":"best", 10 | "number_test_episodes": 25, 11 | "number_test_repeats": 5, 12 | 13 | "agent_select": ["Qlearntab", "Qlearntab"], 14 | "agent_parameters":{ 15 | "Qlearntab":{ 16 | "alpha": 0.1, 17 | "gamma": 0.95, 18 | "epsilon": 0.2, 19 | "epsilon_step":0.01 20 | } 21 | } 22 | } -------------------------------------------------------------------------------- /elsciRL/experiments/experiment_utils/config_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | def ensure_dir(path): 5 | if not os.path.exists(path): 6 | os.makedirs(path) 7 | 8 | 9 | def load_config(config_path): 10 | with open(config_path, 'r') as f: 11 | if config_path.endswith('.json'): 12 | return json.load(f) 13 | # Add more config formats if needed 14 | raise ValueError("Unsupported config file format.") 15 | 16 | 17 | def merge_configs(config1, config2): 18 | # Simple dict merge, can be improved for deep merge 19 | merged = config1.copy() 20 | merged.update(config2) 21 | return merged 22 | -------------------------------------------------------------------------------- /elsciRL/agents/random_agent.py: -------------------------------------------------------------------------------- 1 | import random 2 | from elsciRL.agents.agent_abstract import Agent 3 | import torch 4 | from torch import Tensor 5 | 6 | class RandomAgent(Agent): 7 | """This is simply a random decision maker, does not learn.""" 8 | def __init__(self): 9 | super().__init__() 10 | 11 | def policy(self, state: Tensor, legal_actions: list) -> str: 12 | action = random.choice(legal_actions) 13 | return action 14 | 15 | def learn(self, state: Tensor, next_state: Tensor, r_p: float, 16 | action_code: str) -> float: 17 | # Do nothing. 18 | return None 19 | 20 | def q_result(self): 21 | """Random agent has no knowledge.""" 22 | total_q = 0 23 | mean_q = 0 24 | return total_q, mean_q 25 | -------------------------------------------------------------------------------- /elsciRL/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, Any 3 | 4 | class StateAdapter(ABC): 5 | def __init__(self, raw_state): 6 | super().__init__() 7 | # Define the fields that describe the state features: 8 | self.state: list = self._read(raw_state) 9 | 10 | @abstractmethod 11 | def _read(raw_state) -> list: 12 | # Read the data. 13 | # fill in the feature fields 14 | raise NotImplementedError 15 | 16 | def adapter(self): 17 | "Returns the adapted form, may require input flag for encoded or non-encoded output." 
18 | 19 | 20 | def sample(self): 21 | """Returns a sample of an adapted state form (typically initial position of the environment).""" 22 | 23 | 24 | -------------------------------------------------------------------------------- /elsciRL/application_suite/search_agent.py: -------------------------------------------------------------------------------- 1 | class DefaultAgentConfig: 2 | def __init__(self): 3 | self.data ={ 4 | "name": "Default", 5 | "problem_type": "Default", 6 | 7 | "number_training_episodes": 1000, 8 | "number_training_repeats": 5, 9 | "number_training_seeds": 1, 10 | 11 | "test_agent_type":"best", 12 | "number_test_episodes": 200, 13 | "number_test_repeats": 10, 14 | 15 | "agent_select": ["Qlearntab"], 16 | "adapter_select": ["default"], 17 | "agent_parameters":{ 18 | "Qlearntab":{ 19 | "alpha": 0.1, 20 | "gamma": 0.95, 21 | "epsilon": 1, 22 | "epsilon_step":0 23 | }, 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /elsciRL/agents/agent_abstract.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Iterable, Hashable, Any 3 | from torch import Tensor 4 | 5 | class Agent(ABC): 6 | @abstractmethod 7 | def policy(self, **kwargs) -> str: 8 | pass 9 | 10 | def learn(self, **kwargs) -> str: 11 | pass 12 | 13 | class QLearningAgent(Agent): 14 | def policy(self, state:Tensor, game_over:bool, 15 | legal_actions:list, **kwargs) -> Hashable: 16 | pass 17 | 18 | def learn(self, state:Tensor, action:Hashable, next_state:Iterable[Any], 19 | immediate_reward:float, **kwargs): 20 | pass 21 | 22 | 23 | class LLMAgentAbstract(Agent): 24 | def policy(self, state:str, legal_actions:list, **kwargs) -> str: 25 | pass 26 | 27 | def learn(self, state:str, action:str, next_state:str, reward:float, **kwargs) -> str: 28 | pass 29 | 30 | -------------------------------------------------------------------------------- /elsciRL/encoders/encoder_abstract.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from typing import List, Dict 4 | from abc import ABC, abstractmethod 5 | from torch import Tensor 6 | 7 | class Encoder(ABC): 8 | @abstractmethod 9 | def encode(self, *args, **kwargs) -> Tensor: 10 | pass 11 | 12 | class StateEncoder(Encoder): 13 | tensor_cache: Dict[int, Tensor] = dict() 14 | tensor_cache_index: int = 0 15 | 16 | @staticmethod 17 | def cache_insert(t: Tensor): 18 | StateEncoder.tensor_cache[StateEncoder.tensor_cache_index] = t 19 | StateEncoder.tensor_cache_index += 1 20 | 21 | @staticmethod 22 | def cache_retrieve(offset: int, index: int): 23 | return StateEncoder.tensor_cache[offset + index] 24 | 25 | 26 | def encode(self, state:list = None, legal_actions:list = None, episode_action_history:str = None) -> Tensor: 27 | pass 28 | -------------------------------------------------------------------------------- /elsciRL/adapters/LLM_state_generators/base_prompt.py: -------------------------------------------------------------------------------- 1 | elsciRL_base_prompt = """ 2 | You are a helpful assistant that needs to describe the current state of a reinforcement learning environment to help an agent understand the context of the problem and how to act optimally. 3 | 4 | The state can be text but is typically a list of numbers, you will be provided with prior actions and their outcome states and should use this information to describe the current state. 
5 | 6 | If no actions are provided, you should still describe the current state as best as you can. 7 | 8 | You will be provided with a list of legal actions that the agent can take in the current state, you should describe these actions in a way that is useful for the agent to understand what it can do. 9 | 10 | You do not need to provide any details about what the agent should do, just describe the current state and the legal actions available to the agent in a single paragraph with less than 200 words. 11 | 12 | 13 | """ -------------------------------------------------------------------------------- /elsciRL/environment_setup/instruction_reward_wrapper.py: -------------------------------------------------------------------------------- 1 | """Gym wrapper utilities for instruction-following reward shaping.""" 2 | from __future__ import annotations 3 | 4 | from typing import Callable, Dict, Optional 5 | 6 | import numpy as np 7 | 8 | from elsciRL.environment_setup.gym_wrapper_abstract import RewardWrapper 9 | 10 | 11 | class InstructionRewardWrapper(RewardWrapper): 12 | """Adds adapter-derived instruction rewards to a Gym environment.""" 13 | 14 | def __init__(self, env, reward_fn: Optional[Callable[[np.ndarray | None, Dict], float]] = None): 15 | super().__init__(env) 16 | self.reward_fn = reward_fn 17 | 18 | def reward(self, reward): 19 | if self.reward_fn is None: 20 | return reward 21 | obs = getattr(self.env, "last_obs", None) 22 | info = getattr(self.env, "last_info", {}) 23 | shaped_reward = self.reward_fn(obs, info) 24 | if shaped_reward is None: 25 | return reward 26 | return reward + shaped_reward 27 | -------------------------------------------------------------------------------- /elsciRL/experiments/experiment_utils/render_current_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def render_current_result(training_setup, current_environment, current_agent, local_save_dir): 4 | """Apply fixed policy to render current decision making for limited number of episodes.""" 5 | # Override input training setups with previously saved 6 | 7 | test_setup_info = training_setup.copy() 8 | 9 | test_setup_info['train'] = False # Testing Phase 10 | test_setup_info['training_results'] = False 11 | test_setup_info['observed_states'] = False 12 | test_setup_info['experience_sampling'] = False 13 | print("----------") 14 | print("Rendering trained agent's policy:") 15 | 16 | env = current_environment 17 | # --- 18 | env.number_episodes = 1 # Only render 1 episode 19 | env.agent = current_agent 20 | env.agent.epsilon = 0 # Remove random actions 21 | # --- 22 | # Render results 23 | if not os.path.exists(local_save_dir): 24 | os.mkdir(local_save_dir) 25 | env.episode_loop(render=True, render_save_dir=local_save_dir) -------------------------------------------------------------------------------- /elsciRL/experiments/experiment_utils/policy_agent_factory.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Type 2 | 3 | from elsciRL.agents.stable_baselines.SB3_PPO import SB_PPO 4 | from elsciRL.agents.stable_baselines.SB3_A2C import SB_A2C 5 | from elsciRL.agents.stable_baselines.SB3_DQN import SB_DQN 6 | from elsciRL.agents.clean_rl.ppo import CleanRLPPO 7 | 8 | 9 | class PolicyAgentFactory: 10 | """Factory for Gym/PyTorch policy-gradient agents (SB3-backed).""" 11 | 12 | def __init__(self): 13 | self.agent_types: Dict[str, Type] = { 14 | "SB3_PPO": SB_PPO, 15 | "SB3_A2C": 
SB_A2C, 16 | "SB3_DQN": SB_DQN, 17 | "PPO": CleanRLPPO, 18 | } 19 | 20 | def register_agent(self, name: str, agent_cls: Type): 21 | self.agent_types[name] = agent_cls 22 | 23 | def create(self, agent_type: str, agent_parameters: Dict, env): 24 | if agent_type not in self.agent_types: 25 | raise ValueError(f"Unknown policy agent type: {agent_type}") 26 | agent_cls = self.agent_types[agent_type] 27 | # Most SB3 wrappers accept the env kwarg directly. 28 | return agent_cls(env=env, **agent_parameters) 29 | -------------------------------------------------------------------------------- /elsciRL/GUI/templates/_generic_agent_param_form.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elsciRL/experiments/experiment_utils/result_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | class ResultManager: 5 | """Handles saving, loading, and reporting of results.""" 6 | def __init__(self, analysis): 7 | self.analysis = analysis 8 | 9 | def save_results(self, results, save_dir, filename): 10 | os.makedirs(save_dir, exist_ok=True) 11 | path = os.path.join(save_dir, filename) 12 | results.to_csv(path) 13 | 14 | def load_results(self, path): 15 | # Assumes CSV for now 16 | import pandas as pd 17 | return pd.read_csv(path) 18 | 19 | def train_report(self, training_results, save_dir, show_figures): 20 | return self.analysis.train_report(training_results, save_dir, show_figures) 21 | 22 | def test_report(self, testing_results, save_dir, show_figures): 23 | return self.analysis.test_report(testing_results, save_dir, show_figures) 24 | 25 | def training_variance_report(self, save_dir, show_figures): 26 | return self.analysis.training_variance_report(save_dir, show_figures) 27 | 28 | def testing_variance_report(self, save_dir, show_figures): 29 | return self.analysis.testing_variance_report(save_dir, show_figures) 30 | -------------------------------------------------------------------------------- /elsciRL/instruction_following/instr_utils/elsciRL_instr_input.py: -------------------------------------------------------------------------------- 1 | class elsciRLInput: 2 | def __init__(self, description_lookup:dict=None): 3 | self.description_lookup = description_lookup 4 | # New: store descriptions provided so the user doesn't need to provide multiple times 5 | self.descriptions_stored:dict={} 6 | 7 | def user_input(self): 8 | instructions = [] 9 | instruction_descriptions = [] 10 | while True: 11 | instr = input("Please provide the current instruction... 
([e/exit] to end path)")
12 |             if (instr == "e")|(instr=="exit"):
13 |                 break
14 | 
15 |             if not self.description_lookup:
16 |                 if instr not in self.descriptions_stored:
17 |                     description = input("Please provide a description of the instruction...")
18 |                 else:
19 |                     print("Instruction description provided previously.")
20 |                     description = self.descriptions_stored[instr]
21 |                 if description == "None":
22 |                     description = instr
23 |             else:
24 |                 # Lookup provided at init: use it, falling back to the instruction text itself
25 |                 description = self.description_lookup.get(instr, instr)
26 | 
27 |             instructions.append(instr)
28 |             instruction_descriptions.append(description)
29 |             self.descriptions_stored[instr] = description
30 | 
31 | 
32 |         return instructions, instruction_descriptions
--------------------------------------------------------------------------------
/elsciRL/application_suite/experiment_agent.py:
--------------------------------------------------------------------------------
1 | class DefaultAgentConfig:
2 |     def __init__(self):
3 |         self.data ={
4 |             "name": "Default",
5 |             "problem_type": "Default",
6 | 
7 |             "instruction_chain": True,
8 |             "instruction_chain_how": "continuous",
9 | 
10 |             "number_training_episodes": 1000,
11 |             "number_training_repeats": 5,
12 |             "number_training_seeds": 1,
13 | 
14 |             "test_agent_type":"best",
15 |             "number_test_episodes": 200,
16 |             "number_test_repeats": 10,
17 | 
18 |             "agent_select": ["Qlearntab"],
19 |             "adapter_select": ["default"],
20 |             "agent_parameters":{
21 |                 "Qlearntab":{
22 |                     "alpha": 0.1,
23 |                     "gamma": 0.95,
24 |                     "epsilon": 1,
25 |                     "epsilon_step":0
26 |                 },
27 |                 "DQN":{
28 |                     "learning_rate": 0.001,
29 |                     "gamma": 0.99,
30 |                     "epsilon": 1.0,
31 |                     "epsilon_min": 0.01,
32 |                     "epsilon_decay": 0.995,
33 |                     "memory_size": 10000,
34 |                     "batch_size": 64,
35 |                     "target_update": 10,
36 |                     "hidden_size": 128
37 |                 },
38 |             }
39 |         }
40 | 
--------------------------------------------------------------------------------
/elsciRL/experiments/experiment_utils/env_manager.py:
--------------------------------------------------------------------------------
1 | from elsciRL.environment_setup.gym_translator import EngineToGym
2 | 
3 | class EnvManager:
4 |     """Handles environment setup and management."""
5 |     def __init__(self, interaction_loop_class, adapters):
6 |         self.interaction_loop_class = interaction_loop_class
7 |         self.adapters = adapters
8 | 
9 |     def create_env(self, Engine, Adapters, local_setup_info):
10 |         return self.interaction_loop_class(Engine=Engine, Adapters=Adapters, local_setup_info=local_setup_info)
11 | 
12 |     def create_gym_env(self, Engine, Adapter, setup_info, wrappers=None):
13 |         """Create a Gym environment from an elsciRL Engine and Adapter using gym_translator.
14 | 
15 |         Adapter can be either the adapter class itself or the lookup key registered in
16 |         ``self.adapters``. Optional wrappers can be provided to post-process the created
17 |         environment (e.g., to add reward shaping).
18 | """ 19 | adapter_cls = Adapter 20 | if not callable(Adapter): 21 | adapter_cls = self.adapters.get(Adapter) 22 | if adapter_cls is None: 23 | raise ValueError(f"Adapter '{Adapter}' not found when creating Gym environment.") 24 | 25 | gym_env = EngineToGym() 26 | gym_env.load(Engine, Adapter=adapter_cls, setup_info=setup_info) 27 | 28 | if wrappers: 29 | for wrapper in wrappers: 30 | gym_env = wrapper(gym_env) 31 | return gym_env 32 | -------------------------------------------------------------------------------- /elsciRL/__init__.py: -------------------------------------------------------------------------------- 1 | # Try to import modules, handle missing dependencies gracefully 2 | try: 3 | from .examples.DemoExperiment import DemoExperiment as Demo 4 | except ImportError as e: 5 | print(f"Warning: Could not import DemoExperiment: {e}") 6 | Demo = None 7 | 8 | try: 9 | from .GUI.app import app as App 10 | except ImportError as e: 11 | print(f"Warning: Could not import GUI app: {e}") 12 | App = None 13 | 14 | try: 15 | from .GUI.prerender import Prerender as get_prerender_data 16 | except ImportError as e: 17 | print(f"Warning: Could not import Prerender: {e}") 18 | get_prerender_data = None 19 | 20 | try: 21 | from .experiments.standard import Experiment as STANDARD_RL 22 | except ImportError as e: 23 | print(f"Warning: Could not import STANDARD_RL: {e}") 24 | STANDARD_RL = None 25 | 26 | try: 27 | from .instruction_following.elsciRL_instruction_search import elsciRLSearch as elsciRL_SEARCH 28 | except ImportError as e: 29 | print(f"Warning: Could not import elsciRL_SEARCH: {e}") 30 | elsciRL_SEARCH = None 31 | 32 | try: 33 | from .instruction_following.elsciRL_instruction_following import elsciRLOptimize as elsciRL_OPTIMIZE 34 | except ImportError as e: 35 | print(f"Warning: Could not import elsciRL_OPTIMIZE: {e}") 36 | elsciRL_OPTIMIZE = None 37 | 38 | try: 39 | from .analysis.combined_variance_visual import combined_variance_analysis_graph as COMBINED_VARIANCE_ANALYSIS_GRAPH 40 | except ImportError as e: 41 | print(f"Warning: Could not import COMBINED_VARIANCE_ANALYSIS_GRAPH: {e}") 42 | COMBINED_VARIANCE_ANALYSIS_GRAPH = None 43 | -------------------------------------------------------------------------------- /elsciRL/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import pandas as pd 4 | from typing import List, Dict, Iterable 5 | from abc import ABC, abstractmethod 6 | from elsciRL.adapters import StateAdapter 7 | from torch import Tensor 8 | 9 | 10 | class Encoder(ABC): 11 | @abstractmethod 12 | def encode(self, *args, **kwargs) -> Tensor: 13 | pass 14 | 15 | class StateEncoder(Encoder): 16 | tensor_cache: Dict[int, Tensor] = dict() 17 | tensor_cache_index: int = 0 18 | 19 | @staticmethod 20 | def cache_insert(t: Tensor): 21 | StateEncoder.tensor_cache[StateEncoder.tensor_cache_index] = t 22 | StateEncoder.tensor_cache_index += 1 23 | 24 | @staticmethod 25 | def cache_retrieve(offset: int, index: int): 26 | return StateEncoder.tensor_cache[offset + index] 27 | 28 | # index_objects are the complete list of adapter specific elements used to define the encoder's index 29 | def encode(self, index_objects:list=None, state:list = None, legal_actions:list = None, prior_action:str = None, 30 | opponent_action:str = None, indexed: bool = False) -> Tensor: 31 | pass 32 | 33 | 34 | class EncodedState(ABC): 35 | @abstractmethod 36 | def data() -> Iterable: 37 | raise NotImplementedError 38 
| 39 |
40 | class StateConverter(ABC):
41 |     def __init__(self, adapter: StateAdapter):
42 |         super().__init__()
43 |         # Calls the conversion procedure on the adapter's state features
44 |         self.data: EncodedState = self.convert(adapter.state)
45 | 
46 | 
47 |     def convert(self, state: list) -> EncodedState:
48 |         pass
--------------------------------------------------------------------------------
/elsciRL/encoders/poss_actions_encoded.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from typing import List
3 | from torch import Tensor
4 | import numpy as np
5 | 
6 | #from elsciRL.encoders.encoder_abstract import StateEncoder
7 | class PossibleActionsEncoder():
8 |     def __init__(self, all_possible_actions):
9 |         self.all_possible_actions = all_possible_actions
10 |         device = "cuda" if torch.cuda.is_available() else "cpu" # Make this optional choice with parameter
11 |         self.vectors: Tensor = torch.cat([torch.eye(len(self.all_possible_actions)), torch.zeros(1, len(self.all_possible_actions))]).to(device)
12 | 
13 |         self.all_possible_actions_dict_init = {}
14 |         for action in self.all_possible_actions:
15 |             self.all_possible_actions_dict_init[action] = 0
16 | 
17 |         self.name = "PossibleActionsEncoder"
18 |         self.input_type = "list"
19 |         self.output_type = "tensor"
20 |         self.output_dim = len(self.all_possible_actions)**2
21 | 
22 |     def encode(self, state: List[str] = None, legal_actions:list = None, episode_action_history:list = None,
23 |                indexed: bool = False) -> Tensor:
24 |         """Vector of possible actions."""
25 |         # Binary vector over all known actions to denote whether each is currently legal
26 |         all_possible_actions = self.all_possible_actions_dict_init.copy()
27 |         for a,action in enumerate(legal_actions):
28 |             all_possible_actions[action] = int(1)
29 | 
30 |         state_encoded = torch.tensor(list(all_possible_actions.values()))
31 |         if (not indexed):
32 |             state_encoded = self.vectors[state_encoded].flatten()
33 | 
34 |         return state_encoded
--------------------------------------------------------------------------------
/elsciRL/examples/adapters/gym_frozenlake_default.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List
2 | import pandas as pd
3 | import torch
4 | from torch import Tensor
5 | # StateAdapter includes static methods for adapters
6 | from elsciRL.encoders.poss_state_encoded import StateEncoder
7 | 
8 | class DefaultAdapter:
9 |     _cached_state_idx: Dict[str, int] = dict()
10 | 
11 |     def __init__(self, setup_info:dict={}):
12 |         # NOTE: Update this based on the current problem, each requires preset
13 |         # knowledge of all possible states/actions/objects
14 |         # - Possible States
15 |         # - Possible Actions
16 |         # - Prior Actions
17 |         # - Possible Objects
18 | 
19 |         # Initialise encoder based on all possible env states
20 |         all_possible_states = [i for i in range(4*4)]
21 |         self.encoder = StateEncoder(all_possible_states)
22 | 
23 |     def adapter(self, state:any, legal_moves:list = None, episode_action_history:list = None, encode:bool = True, indexed: bool = False) -> Tensor:
24 |         """Encode the FrozenLake grid position into a tensor form for agents."""
25 | 
26 |         # Encode to Tensor for agents
27 |         if encode:
28 |             state_encoded = self.encoder.encode(state=state)
29 |         else:
30 |             state_encoded = state
31 | 
32 |         if (indexed):
33 |             state_indexed = list()
34 |             for sent in state:
35 |                 if (sent not in DefaultAdapter._cached_state_idx):
36 |                     DefaultAdapter._cached_state_idx[sent] =
len(DefaultAdapter._cached_state_idx) 37 | state_indexed.append(DefaultAdapter._cached_state_idx[sent]) 38 | 39 | state_encoded = torch.tensor(state_indexed) 40 | 41 | return state_encoded -------------------------------------------------------------------------------- /elsciRL/examples/environments/gym_frozenlake.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | 3 | class Engine: 4 | """Defines the environment function from the generator engine. 5 | Expects the following: 6 | - reset() to reset the env a start position(s) 7 | - step() to make an action and update the game state 8 | - legal_moves_generator() to generate the list of legal moves 9 | """ 10 | def __init__(self, local_setup_info:dict={}) -> None: 11 | """Initialize Engine""" 12 | self.Environment = gym.make('FrozenLake-v1', desc=None, map_name="4x4", 13 | is_slippery=True, 14 | render_mode='rgb_array') 15 | 16 | def reset(self, start_obs:str=None): 17 | """Fully reset the environment.""" 18 | obs, info = self.Environment.reset() 19 | return obs 20 | 21 | 22 | def step(self, state:any, action:any): 23 | """Enact an action.""" 24 | # In problems where the agent can choose to reset the env 25 | if (state=="ENV_RESET")|(action=="ENV_RESET"): 26 | self.reset() 27 | 28 | obs, reward, terminated, truncated, info = self.Environment.step(action) 29 | return obs, reward, terminated, info 30 | 31 | def legal_move_generator(self, obs:any=None): 32 | """Define legal moves at each position""" 33 | legal_moves = [0,1,2,3] 34 | return legal_moves 35 | 36 | def render(self): 37 | """Render an image or text of the environment.""" 38 | return self.Environment.render() 39 | 40 | def close(self): 41 | """Close/Exit the environment.""" 42 | self.Environment.close() 43 | print("Environment Closed") 44 | -------------------------------------------------------------------------------- /elsciRL/adapters/LLM_logic_generators/adapter_prompt.py: -------------------------------------------------------------------------------- 1 | adapter_prompt = """ 2 | Your role is to generate pseudocode for an adapter function that will be used to transform the state of an environment into a form that can be used by an agent. 3 | 4 | Adapters unify problems into a standard form so any agent in the elsciRL library can be used. 5 | 6 | In short, it transforms the state to a new form, optionally adding more context and then outputting a tensor. 7 | 8 | inputs: state, legal moves, action history for episode 9 | outputs: tensor for the encoded form of the adapted state 10 | 11 | # numeric adapter (numeric.py) 12 | class DefaultAdapter(setup_info): 13 | def __init__(): 14 | # Determine discrete environment size: e.g. 
"4x4" => 16 positions 15 | # Initialize a StateEncoder for these positions 16 | # Optionally define an observation space (e.g., Discrete) needed for Gym agents 17 | 18 | def adapter(state, legal_moves=[], episode_action_history=[], encode=True, indexed=False): 19 | # If encode=True, convert the numeric state to a tensor (StateEncoder) 20 | # If indexed=True, map states to integer IDs 21 | 22 | return tensor(state_encoded) 23 | 24 | # language adapter (language.py) 25 | class LanguageAdapter(setup_info): 26 | def __init__(): 27 | # Build obs_mapping dictionary describing each state as text 28 | # Initialize LanguageEncoder 29 | 30 | def adapter(state, legal_moves=[], episode_action_history=[], encode=True, indexed=False): 31 | # Convert numeric state ID to a text description (obs_mapping) 32 | # Optionally encode the text into a tensor (LanguageEncoder) 33 | # Optionally map each unique description to an indexed ID 34 | 35 | return tensor(state_encoded) 36 | 37 | """ -------------------------------------------------------------------------------- /elsciRL/encoders/observable_objects_encoded.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import List 3 | from torch import Tensor 4 | 5 | from elsciRL.encoders.encoder_abstract import StateEncoder 6 | 7 | class ObjectEncoder(): 8 | def __init__(self, local_objects): 9 | """Encoder for default state representation produced by the environment/engine.""" 10 | self.local_objects = {obj: i for i, obj in enumerate(local_objects)} 11 | device = "cuda" if torch.cuda.is_available() else "cpu" # Make this optional choice with parameter 12 | self.vectors: Tensor = torch.cat([torch.eye(len(self.local_objects)), torch.zeros(1, len(self.local_objects))]).to(device) # tensor needs to be defined to len(local_object) 13 | self.name = "ObjectEncoder" 14 | self.input_type = "list" 15 | self.output_type = "tensor" 16 | self.output_dim = len(self.local_objects)**2 17 | 18 | def encode(self, state:list = None, legal_actions:list = None, episode_action_history:list = None, 19 | indexed: bool = False) -> Tensor: 20 | """ NO CHANGE - Board itself is used as state as is and simply converted to a vector""" 21 | # Goes through every item in state and labels based on the known objects available in the environment 22 | # New vector encoded form, for Chess: 64x12 flattened into 768x1 int vector to denote object occurance 23 | # NOT BINARY vector, value is the occurance of each object type. 24 | # -> In chess this happens to be [1 or 0] because you cant have more than one piece in each position. 
25 | state_encoded: Tensor = torch.tensor([self.local_objects.get(state_pos, len(self.local_objects)) for state_pos in state], 26 | device=self.vectors.device) 27 | 28 | if (not indexed): 29 | state_encoded = self.vectors[state_encoded].flatten() 30 | 31 | return state_encoded -------------------------------------------------------------------------------- /elsciRL/encoders/prior_actions_encoded.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import List 3 | from torch import Tensor 4 | import numpy as np 5 | 6 | #from elsciRL.encoders.encoder_abstract import StateEncoder 7 | class PriorActionsEncoder(): 8 | def __init__(self, all_possible_actions): 9 | self.all_possible_actions = all_possible_actions 10 | device = "cuda" if torch.cuda.is_available() else "cpu" # Make this optional choice with parameter 11 | self.vectors: Tensor = torch.cat([torch.eye(len(self.all_possible_actions)), torch.zeros(1, len(self.all_possible_actions))]).to(device) 12 | 13 | self.all_possible_actions_dict_init = {} 14 | for action in self.all_possible_actions: 15 | self.all_possible_actions_dict_init[action] = int(0) 16 | 17 | self.name = "PriorActionsEncoder" 18 | self.input_type = "list" 19 | self.output_type = "tensor" 20 | self.output_dim = len(self.all_possible_actions)**2 21 | 22 | def encode(self, state: List[str] = None, legal_actions:list = None, episode_action_history:list = None, 23 | indexed: bool = False) -> Tensor: 24 | """Vector of prio actions in game so far, similar to blindfold chess.""" 25 | # STATE ENCODER 26 | # - Updated to use all possible actions for consistency with poss action encoder and generally more suitable 27 | # - Chess has loads of possible actions which is somewhat unique to the problem 28 | # - BUT order must be preserved in the prior action encoder 29 | all_possible_actions = self.all_possible_actions_dict_init.copy() 30 | for a,action in enumerate(episode_action_history): 31 | all_possible_actions[action] = int(a) 32 | 33 | state_encoded = torch.tensor(list(all_possible_actions.values())) 34 | if (not indexed): 35 | state_encoded = self.vectors[state_encoded].flatten() 36 | 37 | return state_encoded -------------------------------------------------------------------------------- /elsciRL/examples/adapters/gym_frozenlake_language.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | import pandas as pd 3 | import torch 4 | from torch import Tensor 5 | # StateAdapter includes static methods for adapters 6 | from elsciRL.encoders.language_transformers.MiniLM_L6v2 import LanguageEncoder 7 | 8 | class LanguageAdapter: 9 | _cached_state_idx: Dict[str, int] = dict() 10 | 11 | def __init__(self, setup_info:dict={}): 12 | # Language encoder doesn't require any preset knowledge of env to use 13 | self.encoder = LanguageEncoder() 14 | self.obs_mapping = {0:'You are at the start position.', 1:'You are on ice.', 2:'You are on ice.', 3:'You are on ice.', 15 | 4:'You are on ice.', 5:'You fell through a hole in the ice!', 6:'You are on ice.', 7:'You fell through a hole in the ice!', 16 | 8:'You are on ice.', 9:'You are on ice.', 10:'You are on ice.', 11:'You fell through a hole in the ice!', 17 | 12:'You fell through a hole in the ice!', 13:'You are on ice.', 14:'You are on ice.', 15:'You found the chest!'} 18 | self.key_found = False 19 | 20 | def adapter(self, state:any, legal_moves:list = None, episode_action_history:list = None, encode:bool = True, 
indexed: bool = False) -> Tensor: 21 | """ Use Language name for every piece name for current board position """ 22 | # --- 23 | # Convert to lanugage 24 | state = self.obs_mapping[state] 25 | # --- 26 | 27 | # Encode to Tensor for agents 28 | if encode: 29 | state_encoded = self.encoder.encode(state=state) 30 | else: 31 | state_encoded = state 32 | 33 | if (indexed): 34 | state_indexed = list() 35 | for sent in state: 36 | if (sent not in LanguageAdapter._cached_state_idx): 37 | LanguageAdapter._cached_state_idx[sent] = len(LanguageAdapter._cached_state_idx) 38 | state_indexed.append(LanguageAdapter._cached_state_idx[sent]) 39 | 40 | state_encoded = torch.tensor(state_indexed) 41 | 42 | return state_encoded -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # from distutils.core import setup 2 | from setuptools import setup, find_packages 3 | 4 | setup( 5 | name='elsciRL', 6 | version='0.4.0', 7 | packages=[ 8 | 'elsciRL', 9 | 'elsciRL.adapters', 10 | 'elsciRL.adapters.LLM_state_generators', 11 | 'elsciRL.agents', 12 | 'elsciRL.agents.LLM_agents', 13 | 'elsciRL.agents.stable_baselines', 14 | 'elsciRL.analysis', 15 | 'elsciRL.application_suite', 16 | 'elsciRL.encoders', 17 | 'elsciRL.encoders.language_transformers', 18 | 'elsciRL.environment_setup', 19 | 'elsciRL.evaluation', 20 | 'elsciRL.examples', 21 | 'elsciRL.examples.adapters', 22 | 'elsciRL.examples.environments', 23 | 'elsciRL.examples.local_configs', 24 | 'elsciRL.experiments', 25 | 'elsciRL.experiments.experiment_utils', 26 | 'elsciRL.experiments.training_procedures', 27 | 'elsciRL.GUI', 28 | 'elsciRL.GUI.static', 29 | 'elsciRL.GUI.templates', 30 | 'elsciRL.experiments', 31 | 'elsciRL.instruction_following', 32 | 'elsciRL.instruction_following.LLM_instr_planner', 33 | 'elsciRL.instruction_following.instr_utils', 34 | 'elsciRL.interaction_loops', 35 | 'elsciRL.published_experiments', 36 | ], 37 | package_data={ 38 | 'elsciRL.GUI.templates': ['index.html', '_generic_agent_param_form.html'], 39 | 'elsciRL.GUI.static': ['styles.css', 'app_setup.md'], 40 | }, 41 | include_package_data=True, 42 | url='https://github.com/pdfosborne/elsciRL', 43 | license='Apache-2.0 license', 44 | author='Philip Osborne', 45 | author_email='pdfosborne@gmail.com', 46 | description='Apply language solutions to Reinforcement Learning problems.', 47 | install_requires=[ 48 | 'numpy', 49 | 'pandas', 50 | 'matplotlib', 51 | 'seaborn', 52 | 'scipy>=1.10.1', 53 | 'torch', 54 | 'tqdm', 55 | 'httpimport', 56 | 'sentence-transformers', 57 | 'gymnasium', 58 | 'stable-baselines3', 59 | 'flask', 60 | 'ollama', 61 | 'markdown', 62 | ] 63 | ) 64 | -------------------------------------------------------------------------------- /elsciRL/encoders/poss_state_encoded.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import List, Any 3 | from torch import Tensor 4 | from tqdm import tqdm 5 | from elsciRL.encoders.encoder_abstract import StateEncoder 6 | 7 | class StateEncoder(StateEncoder): 8 | def __init__(self, num_states): 9 | """Encoder for default state representation produced by the environment/engine.""" 10 | # Create dict lookup 11 | # - get binary list that indexes the state e.g. 
0_0 -> [1,0,0,0] or 0_3 -> [0,0,0,1] 12 | # UPDATED - Now uses torch.nn.functional.one_hot for one-hot encoding 13 | # Using one-hot encoder is incredibly inefficient for large state spaces 14 | # Instead, we consider using an index-based encoding where each unique state is assigned a unique index. 15 | self.device = "cuda" if torch.cuda.is_available() else "cpu" # Make this optional choice with parameter 16 | self.vectors: Tensor = torch.cat([torch.eye(num_states), torch.zeros(1,num_states)]).to(self.device) # tensor needs to be defined to len(local_object) 17 | self.name = "StateEncoder" 18 | self.input_type = "list" 19 | self.output_type = "tensor" 20 | self.output_dim = num_states 21 | 22 | self.encoder = {} 23 | self.encoder_idx = 0 24 | self.num_states = num_states 25 | 26 | def encode(self, state:Any = None, legal_actions:list = None, episode_action_history:list = None, 27 | indexed: bool = False) -> Tensor: 28 | """ Set of all possible states are simply converted to a vector""" 29 | # One hot encode the state if it is not already indexed 30 | if state not in self.encoder: 31 | state_encoded = self.encoder_idx # Use the index as the state encoded value 32 | # Store the encoded state in the encoder dictionary 33 | self.encoder[state] = state_encoded 34 | # Increment the encoder index for the next unique state 35 | self.encoder_idx += 1 36 | else: 37 | state_encoded = self.encoder[state] 38 | 39 | # If indexed, use one-hot encoding 40 | # If not indexed, use the unique index to retrieve the vector 41 | if indexed: 42 | state_encoded = torch.nn.functional.one_hot(torch.tensor(state_encoded), num_classes=self.num_states).float().to(self.device) 43 | else: 44 | state_encoded = self.vectors[int(state_encoded)].flatten() 45 | 46 | return state_encoded -------------------------------------------------------------------------------- /elsciRL/config_local.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | # Define Agent's parameters for problem 5 | # Opponent is considered a 'local' specification as benchmarks vary between setting 6 | 7 | class LocalConfig: 8 | def __init__(self, config_file_path: str): 9 | if (config_file_path): 10 | with open(config_file_path) as config_file: 11 | self.data = json.load(config_file) 12 | self.config_file_path = config_file_path 13 | 14 | else: 15 | self.data = dict() 16 | self.config_path = "" 17 | logging.info("No arguments given, using default configuration...") 18 | 19 | def __getitem__(self, key: str): 20 | item = None 21 | 22 | if (key in self.__dict__): 23 | item = self.__dict__[key] 24 | else: 25 | item = self.data[key] 26 | 27 | return item 28 | 29 | #TODO this is not universal at all !!! 
30 | class ProblemConfig(LocalConfig): 31 | """Local Config is used to define any problem specific parameters.""" 32 | def __init__(self, config_path: str): 33 | super(ProblemConfig, self).__init__(config_path) 34 | # State form 35 | self.adapter_select = self.data.get("adapter_select", [""]) 36 | # Enabled agent to be trained against multiple opponents in order provided 37 | self.training_opponent_agent = self.data.get( 38 | "training_opponent_agent", "") 39 | self.testing_opponent_agent = self.data.get( 40 | "testing_opponent_agent", "") 41 | 42 | self.training_setup = self.data.get("training_setup",'default') 43 | self.testing_setup = self.data.get("testing_setup",'default') 44 | 45 | self.training_action_cap = self.data.get("training_action_cap",1000) # Arbitrary number to ensure games dont last forever 46 | self.testing_action_cap = self.data.get("testing_action_cap",1000) # Arbitrary number to ensure games dont last forever 47 | # Reward Signal, should be consistent between all agent being compared 48 | self.reward_signal = self.data.get("reward_signal",[1,-0.1,0,0] )# [Value of winning, Value for draw, Value for each action, Value for reaching new state] 49 | # Sub-Goal Defined 50 | self.sub_goal = self.data.get("sub_goal",None) 51 | 52 | class ConfigSetup(LocalConfig): 53 | def __init__(self, config_dir: str): 54 | super(ConfigSetup, self).__init__(config_dir) 55 | self.state_configs = ProblemConfig(os.path.join(config_dir)) -------------------------------------------------------------------------------- /elsciRL/environment_setup/imports.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from elsciRL.agents.agent_abstract import Agent, QLearningAgent, LLMAgentAbstract 3 | 4 | class ImportHelper: 5 | def __init__(self, local_setup_info:dict={}) -> None: 6 | self.setup_info = local_setup_info 7 | 8 | def agent_info(self, STATE_ADAPTER_TYPES:dict={}): 9 | agent: Agent | QLearningAgent | LLMAgentAbstract = self.setup_info['agent'] 10 | agent_type: str = self.setup_info['agent_type'] 11 | agent_name: str = self.setup_info['agent_name'] 12 | if self.setup_info['adapter_select'] in STATE_ADAPTER_TYPES: 13 | agent_state_adapter = STATE_ADAPTER_TYPES[self.setup_info['adapter_select']](setup_info=self.setup_info) 14 | else: 15 | print(f"Adapter {self.setup_info['adapter_select']} not found in STATE_ADAPTER_TYPES.") 16 | print(STATE_ADAPTER_TYPES) 17 | agent_state_adapter = '' 18 | return agent, agent_type, agent_name, agent_state_adapter 19 | 20 | def parameter_info(self): 21 | num_train_episodes: int = self.setup_info['number_training_episodes'] 22 | num_test_episodes: int = self.setup_info['number_test_episodes'] 23 | try: 24 | training_action_cap: int = self.setup_info['training_action_cap'] 25 | testing_action_cap: int = self.setup_info['testing_action_cap'] 26 | except: 27 | if 'action_limit' in self.setup_info: 28 | training_action_cap: int = self.setup_info['action_limit'] 29 | testing_action_cap: int = self.setup_info['action_limit'] 30 | elif 'action_cap' in self.setup_info: 31 | training_action_cap: int = self.setup_info['action_cap'] 32 | testing_action_cap: int = self.setup_info['action_cap'] 33 | else: 34 | print('No action cap specified, using default values') 35 | training_action_cap: int = 1000 36 | testing_action_cap: int = 1000 37 | reward_signal: List[int] = self.setup_info['reward_signal'] 38 | 39 | return num_train_episodes, num_test_episodes, training_action_cap, testing_action_cap, reward_signal 40 | 41 
| def training_flag(self): 42 | train: bool = self.setup_info['train'] 43 | return train 44 | 45 | def live_env_flag(self): 46 | live_env: bool = self.setup_info['live_env'] 47 | observed_states: bool = self.setup_info['observed_states'] 48 | #experience_sampling: bool = self.setup_info['experience_sampling'] 49 | return live_env, observed_states -------------------------------------------------------------------------------- /elsciRL/agents/stable_baselines/SB3_DQN.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import torch 3 | import numpy as np 4 | from elsciRL.agents.agent_abstract import QLearningAgent 5 | import gymnasium as gym 6 | from stable_baselines3 import DQN 7 | from stable_baselines3.common.evaluation import evaluate_policy 8 | from PIL import Image # Used to generate GIF 9 | 10 | class SB_DQN(QLearningAgent): 11 | def __init__(self, policy:str='MlpPolicy', env:gym.Env = None, learning_rate:float=0.0001, buffer_size:int=1000000): 12 | self.epsilon: int = 0 # Not used currently but required for compatibility 13 | self.device = "auto" if torch.cuda.is_available() else "cpu" 14 | self.dqn = DQN(policy, env, verbose=0, device=self.device, 15 | learning_rate=learning_rate, buffer_size=buffer_size) 16 | if torch.cuda.is_available(): 17 | print("---- Using GPU ----") 18 | print("Device:", self.dqn.device) 19 | 20 | def policy(self, state: any) -> str: 21 | return self.dqn.predict(state) 22 | 23 | def learn(self, total_steps:int=100) -> float: 24 | self.dqn.learn(total_timesteps=total_steps) 25 | 26 | def test(self, env, render:bool=False): 27 | #mean_reward, std_reward = evaluate_policy(self.a2c, env, n_eval_episodes=1) 28 | vec_env = self.dqn.get_env() 29 | obs = vec_env.reset() 30 | 31 | actions = [] 32 | states = [] 33 | 34 | done = False 35 | render_stack = [] 36 | if render: 37 | render_stack.append( 38 | Image.fromarray(vec_env.render().astype('uint8')) 39 | ) 40 | while not done: 41 | action, _state = self.dqn.predict(obs, deterministic=True) 42 | if isinstance(action, np.int64): 43 | actions.append(action.item()) 44 | else: 45 | actions.append(action[0]) 46 | #actions.append(action[0]) 47 | 48 | obs, r, done, info = vec_env.step(action) 49 | states.append(info[0]['obs']) 50 | if render: 51 | render_stack.append( 52 | Image.fromarray(vec_env.render().astype('uint8')) 53 | ) 54 | 55 | #vec_env.render("human") 56 | episode_reward = info[0]['episode']['r'] 57 | if episode_reward > 0.5: 58 | print("----> ", episode_reward) 59 | 60 | return episode_reward, actions, states, render_stack 61 | 62 | def q_result(self): 63 | results = [0,0] 64 | total_q = results[0] 65 | mean_q = results[1] 66 | return total_q, mean_q 67 | 68 | def clone(self): 69 | clone = pickle.loads(pickle.dumps(self)) 70 | return clone -------------------------------------------------------------------------------- /elsciRL/agents/stable_baselines/SB3_PPO.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import torch 3 | import numpy as np 4 | from elsciRL.agents.agent_abstract import QLearningAgent 5 | import gymnasium as gym 6 | from stable_baselines3 import PPO 7 | from stable_baselines3.common.evaluation import evaluate_policy 8 | from PIL import Image # Used to generate GIF 9 | 10 | class SB_PPO(QLearningAgent): 11 | def __init__(self, policy:str='MlpPolicy', env:gym.Env = None, learning_rate:float=0.0003, n_steps:int=2048): 12 | self.epsilon: int = 0 # Not used currently but required for 
compatibility 13 | self.device = "auto" if torch.cuda.is_available() else "cpu" 14 | self.ppo = PPO(policy, env, verbose=0, device=self.device, 15 | learning_rate=learning_rate, n_steps=n_steps) 16 | if torch.cuda.is_available(): 17 | print("---- Using GPU ----") 18 | print("Device:", self.ppo.device) 19 | 20 | def policy(self, state: any) -> str: 21 | return self.ppo.predict(state) 22 | 23 | def learn(self, total_steps:int=100) -> float: 24 | self.ppo.learn(total_timesteps=total_steps) 25 | 26 | def test(self, env, render:bool=False): 27 | #mean_reward, std_reward = evaluate_policy(self.a2c, env, n_eval_episodes=1) 28 | vec_env = self.ppo.get_env() 29 | obs = vec_env.reset() 30 | 31 | actions = [] 32 | states = [] 33 | 34 | done = False 35 | render_stack = [] 36 | if render: 37 | render_stack.append( 38 | Image.fromarray(vec_env.render().astype('uint8')) 39 | ) 40 | while not done: 41 | action, _state = self.ppo.predict(obs, deterministic=True) 42 | if isinstance(action, np.int64): 43 | actions.append(action.item()) 44 | else: 45 | actions.append(action[0]) 46 | #actions.append(action[0]) 47 | 48 | obs, r, done, info = vec_env.step(action) 49 | states.append(info[0]['obs']) 50 | if render: 51 | render_stack.append( 52 | Image.fromarray(vec_env.render().astype('uint8')) 53 | ) 54 | 55 | #vec_env.render("human") 56 | episode_reward = info[0]['episode']['r'] 57 | if episode_reward > 0.5: 58 | print("----> ", episode_reward) 59 | 60 | return episode_reward, actions, states, render_stack 61 | 62 | def q_result(self): 63 | results = [0,0] 64 | total_q = results[0] 65 | mean_q = results[1] 66 | return total_q, mean_q 67 | 68 | def clone(self): 69 | clone = pickle.loads(pickle.dumps(self)) 70 | return clone 71 | 72 | -------------------------------------------------------------------------------- /elsciRL/encoders/language_transformers/MiniLM_L6v2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from multiprocessing.spawn import import_main_path 4 | from typing import Dict, List, Tuple 5 | from collections import Counter 6 | from gymnasium.spaces import Box 7 | 8 | from torch import Tensor 9 | from elsciRL.encoders.encoder_abstract import StateEncoder 10 | 11 | # Language Encoder 12 | from sentence_transformers import SentenceTransformer 13 | 14 | 15 | 16 | class LanguageEncoder(StateEncoder): 17 | """Required Language Model included in requisite packages.""" 18 | _cached_enc: Dict[str, Tensor] = dict() 19 | _cached_freq: Counter = Counter() 20 | 21 | def __init__(self, device: str = None): 22 | autodev = "cuda" if torch.cuda.is_available() else "cpu" 23 | self.device = device if device else autodev 24 | self.sentence_model: SentenceTransformer = SentenceTransformer('all-MiniLM-L6-v2', device=self.device) 25 | low_array = [-1 for i in range(384)] 26 | high_array = [1 for i in range(384)] 27 | self.observation_space = Box(low=np.array(low_array), high=np.array(high_array), dtype=np.float32) 28 | self.name = "MiniLM_L6v2" 29 | self.input_type = "text" 30 | self.output_type = "tensor" 31 | self.output_dim = 384 32 | 33 | def encode(self, state: str|List[str], legal_actions:list = None, episode_action_history:list = None, 34 | indexed: bool = False, progress_bar:bool=False) -> Tensor: 35 | 36 | # I think typing is overriding the input type anyway -> need to ensure sentences are split up 37 | if type(state) == str: 38 | state = [state] 39 | # state = state.split(".") 40 | # state = [s for s in state if s.strip()] 41 | if 
(len(state) == 0): 42 | state = [""] 43 | to_encode = [sent for sent in state if sent not in LanguageEncoder._cached_enc] 44 | if (to_encode): 45 | # Show progress bar if state is a list of strings 46 | encoded = self.sentence_model.encode(to_encode, batch_size=256, convert_to_tensor=True, show_progress_bar=progress_bar) 47 | LanguageEncoder._cached_enc.update({to_encode[i]: encoded[i] for i in range(len(to_encode))}) 48 | 49 | LanguageEncoder._cached_freq.update(state) 50 | LanguageEncoder._cached_freq.subtract(LanguageEncoder._cached_freq.keys()) 51 | state_encoded = torch.stack([LanguageEncoder._cached_enc[sent] for sent in state]) 52 | 53 | if (len(LanguageEncoder._cached_freq) > 10000): 54 | for key, freq in list(reversed(LanguageEncoder._cached_freq.most_common()))[:2000]: 55 | del LanguageEncoder._cached_enc[key] 56 | del LanguageEncoder._cached_freq[key] 57 | 58 | return state_encoded -------------------------------------------------------------------------------- /elsciRL/agents/stable_baselines/SB3_A2C.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import torch 3 | import numpy as np 4 | from elsciRL.agents.agent_abstract import QLearningAgent 5 | import gymnasium as gym 6 | from stable_baselines3 import A2C 7 | from PIL import Image # Used to generate GIF 8 | 9 | 10 | class SB_A2C(QLearningAgent): 11 | def __init__(self, policy:str='MlpPolicy', env:gym.Env = None, learning_rate=0.0007, n_steps=500): 12 | self.epsilon: int = 0 # Not used currently but required for compatibility 13 | self.device = "auto" if torch.cuda.is_available() else "cpu" # A2C is meant to be run primarily on the CPU, especially when you are not using a CNN. 14 | self.a2c = A2C(policy, env, verbose=0, device="cpu", 15 | learning_rate=learning_rate, n_steps=n_steps) 16 | if torch.cuda.is_available(): 17 | print("---- A2C is meant to be run primarily on the CPU ----") 18 | print("Device:", self.a2c.device) 19 | 20 | def policy(self, state: any) -> str: 21 | # TODO: make sure output is int 22 | return self.a2c.predict(state) 23 | 24 | def learn(self, total_steps:int=100) -> float: 25 | self.a2c.learn(total_timesteps=total_steps) 26 | 27 | def test(self, env, render:bool=False): 28 | #mean_reward, std_reward = evaluate_policy(self.a2c, env, n_eval_episodes=1) 29 | # Using environment from agent may limit episodes based on prior experience 30 | #vec_env = self.a2c.get_env() 31 | 32 | vec_env = env 33 | obs, info = vec_env.reset() 34 | 35 | actions = [] 36 | states = [] 37 | 38 | done = False 39 | episode_reward = 0 40 | render_stack = [] 41 | if render: 42 | render_stack.append( 43 | Image.fromarray(vec_env.render().astype('uint8')) 44 | ) 45 | while not done: 46 | action, _state = self.a2c.predict(obs, deterministic=True) 47 | if isinstance(action, np.int64): 48 | actions.append(action.item()) 49 | else: 50 | actions.append(action) 51 | # actions.append(int(action)) 52 | obs, r, done, truncated, info = vec_env.step(action) 53 | episode_reward += r 54 | if render: 55 | render_stack.append(Image.fromarray(vec_env.render().astype('uint8'))) 56 | 57 | #states.append(info[0]['obs']) 58 | states.append(info['obs']) 59 | #vec_env.render("human") 60 | 61 | #episode_reward = info[0]['episode']['r'] 62 | if episode_reward > 0.5: 63 | print("----> ", episode_reward) 64 | 65 | return episode_reward, actions, states, render_stack 66 | 67 | def q_result(self): 68 | results = [0,0] 69 | total_q = results[0] 70 | mean_q = results[1] 71 | return total_q, mean_q 72 | 73 | 
def clone(self): 74 | clone = pickle.loads(pickle.dumps(self)) 75 | return clone 76 | 77 | -------------------------------------------------------------------------------- /elsciRL/experiments/experiment_utils/agent_factory.py: -------------------------------------------------------------------------------- 1 | class AgentFactory: 2 | """Factory for creating agent instances based on type name and parameters.""" 3 | def __init__(self, adapters, setup_info): 4 | from elsciRL.agents.table_q_agent import TableQLearningAgent 5 | from elsciRL.agents.DQN import DQNAgent 6 | from elsciRL.agents.LLM_agents.ollama_agent import LLMAgent as OllamaAgent 7 | self.adapters = adapters 8 | self.agent_types = { 9 | "Qlearntab": TableQLearningAgent, 10 | "DQN": DQNAgent, 11 | "LLM_Ollama": OllamaAgent, 12 | } 13 | self.setup_info = setup_info 14 | 15 | def register_agent(self, name, agent_class): 16 | self.agent_types[name] = agent_class 17 | 18 | def create(self, agent_type, agent_parameters, engine=None, adapter=None): 19 | if agent_type == "DQN": 20 | if adapter: 21 | adapter_sample = self.adapters[adapter](setup_info=self.setup_info) 22 | # Set input_size from adapter 23 | try: 24 | input_size = adapter_sample.input_dim 25 | print(f"Using input_dim from adapter {adapter}: {input_size}") 26 | except Exception: 27 | try: 28 | input_size = adapter_sample.encoder.output_dim 29 | print(f"Using encoder output_dim from encoder {adapter_sample.encoder}: {input_size}") 30 | except Exception: 31 | try: 32 | input_size = adapter_sample.LLM_adapter.encoder.output_dim 33 | print(f"Using LLM_adapter encoder output_dim from LLM adapter {adapter_sample.LLM_adapter}: {input_size}") 34 | except Exception: 35 | print(f"Adapter {adapter} does not have input_dim specified.") 36 | raise ValueError(f"No input dim size found in adapter: {adapter}") 37 | 38 | if engine: 39 | print(engine) 40 | engine_sample = engine(local_setup_info=self.setup_info) 41 | try: 42 | output_size = engine_sample.output_size 43 | except Exception: 44 | try: 45 | output_size = engine_sample.output_dim 46 | except Exception: 47 | try: 48 | output_size = engine_sample.output_dim_size 49 | except Exception: 50 | print(f"Engine {engine} does not contain output dim size for DQN agent, using default 1,000.") 51 | output_size = 1000 52 | # Order must match DQN input 53 | temp_dict = {'input_size': input_size, 'output_size': output_size} 54 | temp_dict.update(agent_parameters) 55 | else: 56 | # For other agents, we assume the parameters are already in the correct format 57 | temp_dict = agent_parameters 58 | if agent_type not in self.agent_types: 59 | raise ValueError(f"Unknown agent type: {agent_type}") 60 | return self.agent_types[agent_type](**temp_dict) 61 | -------------------------------------------------------------------------------- /elsciRL/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import json 4 | 5 | 6 | class Config: 7 | def __init__(self, config_file_path: str): 8 | if config_file_path: 9 | with open(config_file_path) as config_file: 10 | self.data = json.load(config_file) 11 | self.config_file_path = config_file_path 12 | 13 | else: 14 | self.data = dict() 15 | self.config_path = "" 16 | logging.info("No arguments given, using default configuration...") 17 | 18 | def __getitem__(self, key: str): 19 | item = None 20 | 21 | if key in self.__dict__: 22 | item = self.__dict__[key] 23 | else: 24 | item = self.data[key] 25 | 26 | return item 27 | 28 | 29 | class 
ExperimentConfig(Config): 30 | def __init__(self, config_path: str): 31 | super(ExperimentConfig, self).__init__(config_path) 32 | 33 | # Name setup 34 | self.name = self.data.get( 35 | "name", os.path.split(self.config_file_path)[-1].replace(".json", "") 36 | ) 37 | # Define Problem Type Choice 38 | self.problem_type = self.data.get("problem_type", "") 39 | # Specify local config choices to select agents of interest 40 | self.agent_select = self.data.get("agent_select", ["Qlearntab"]) 41 | 42 | # ---> We then parse these three inputs to obtain the local config setup info 43 | # ---> Ideally input is a dict input: setups = { 'Setup1':{"Adapter":"Engine", "Encoder":"Yes", "Agent":"TabQ"},... } 44 | 45 | # Training repeated 46 | self.num_training_episodes = self.data.get("num_training_episodes", 1000) 47 | self.number_training_repeats = self.data.get("number_training_repeats", 5) 48 | 49 | # Testing repeated 50 | self.number_test_episodes = self.data.get("number_test_episodes", 100) 51 | self.number_test_repeats = self.data.get("number_test_repeats", 5) 52 | self.test_agent_type = self.data.get("test_agent_type", "best") 53 | 54 | # Tab Q Agent parameters 55 | self.alpha = self.data.get("alpha", [0.05]) 56 | self.gamma = self.data.get("gamma", [0.95]) 57 | self.epsilon = self.data.get("epsilon", [0.05]) 58 | # Neural Agent Parameters 59 | self.input_type = "lm" 60 | self.input_size = self.data.get("input_size", [384]) 61 | self.sent_hidden_dim = self.data.get("sent_hidden_dim", [10]) 62 | self.hidden_dim = self.data.get("hidden_dim", [128]) 63 | self.num_hidden = self.data.get("num_hidden", [2]) 64 | self.sequence_size = self.data.get("sequence_size", [20]) 65 | self.memory_size = self.data.get("memory_size", [2000]) 66 | self.target_replace_iter = self.data.get("target_replace_iter", [100]) 67 | self.learning_rate = self.data.get("learning_rate", [0.001]) 68 | self.batch_size = self.data.get("batch_size", [1]) 69 | 70 | self.number_test_episodes = self.data.get("number_test_episodes", 250) 71 | self.number_test_repeats = self.data.get("number_test_repeats", 5) 72 | 73 | 74 | class TestingSetupConfig(Config): 75 | def __init__(self, config_dir: str): 76 | super(TestingSetupConfig, self).__init__(config_dir) 77 | self.state_configs = ExperimentConfig(os.path.join(config_dir)) 78 | -------------------------------------------------------------------------------- /elsciRL/GUI/prerender_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch import Tensor 4 | from elsciRL.encoders.language_transformers.MiniLM_L6v2 import LanguageEncoder as MiniLM_L6v2 5 | 6 | # Get search method 7 | import os 8 | import json 9 | from datetime import datetime 10 | 11 | 12 | def encode_prerender_data(observed_states:dict|str=None, 13 | save_dir:str=None, 14 | encoder:str ='MiniLM_L6v2') -> Tensor: 15 | """ Encodes the observed states using a language encoder. 16 | Args: 17 | observed_states (dict or str): The observed states to encode, can be the dictionary or the directory path string. 18 | save_dir (str): The directory where the encoded states will be saved. If None, defaults to './encoded-prerender-data'. 19 | encoder (str): The name of the encoder to use. Defaults to 'MiniLM_L6v2', options include: 20 | - 'MiniLM_L6v2': A lightweight language model suitable for encoding text. 21 | - ~~Other encoders can be added in the future.~~ 22 | Returns: 23 | Tensor: The encoded representation of the observed states. 
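        Example (illustrative sketch; assumes a JSON-formatted observed-states file saved as a .txt on disk):
            encoded_states = encode_prerender_data(
                observed_states='./observed_states.txt',
                save_dir='./encoded-prerender-data',
                encoder='MiniLM_L6v2',
            )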
24 | """ 25 | # ------------------------------------------------------------------ 26 | # Define the available encoders 27 | # Currently only MiniLM_L6v2 is available, but can be extended in the future. 28 | ENCODERS = {'MiniLM_L6v2': MiniLM_L6v2} 29 | encoder = ENCODERS[encoder]() 30 | # ------------------------------------------------------------------ 31 | if observed_states is None: 32 | print("\n ----------------------------------------------------") 33 | print(" No observed states provided. Please select a file to encode.") 34 | print(" ----------------------------------------------------\n") 35 | file_names = [file for file in os.listdir('./') if file.endswith('.txt')] 36 | for n, file in enumerate(file_names): 37 | print(f"- {n}: {file}") 38 | selection = input("\n Select the file to encode (by number): ") 39 | observed_states_filename = file_names[int(selection)] 40 | observed_states_path = os.path.join('./', observed_states_filename) 41 | with open(observed_states_path, 'r') as f: 42 | observed_states = json.loads(f.read()) 43 | save_dir = './' 44 | else: 45 | if isinstance(observed_states, str): 46 | observed_states_filename = observed_states.split('/')[-1].split('.')[0] 47 | if not save_dir: 48 | save_dir = os.path.dirname(observed_states) 49 | with open(observed_states, 'r') as f: 50 | observed_states = json.loads(f.read()) 51 | else: 52 | observed_states_filename = 'observed_states' 53 | if not save_dir: 54 | save_dir = './' 55 | 56 | # Encode the observed states 57 | print(f"\n Encoding observed state file {observed_states_filename} using {encoder.name}...") 58 | str_states = [str_state for str_state in observed_states.values()] 59 | observed_states_encoded = encoder.encode(str_states) 60 | 61 | if not os.path.exists(save_dir): 62 | os.makedirs(save_dir) 63 | file_path = os.path.join(save_dir, 'encoded_' + observed_states_filename.split('.')[0] + '.txt') 64 | np.savetxt(file_path, observed_states_encoded.numpy()) 65 | print(f"Encoded states saved to {file_path}") 66 | print(f"Number of States: {len(observed_states_encoded)}") 67 | 68 | return observed_states_encoded -------------------------------------------------------------------------------- /elsciRL/examples/adapters/elsciRL_sailing_default.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | import pandas as pd 3 | import numpy as np 4 | import torch 5 | from torch import Tensor 6 | # StateAdapter includes static methods for adapters 7 | from elsciRL.encoders.poss_state_encoded import StateEncoder 8 | from gymnasium.spaces import Text, Discrete 9 | 10 | class DefaultAdapter: 11 | 12 | # ------ Static Methods --------------------------------------- 13 | # - Defined by simulator source https://github.com/PPierzc/ai-learns-to-sail/blob/master/tasks/channel.py 14 | @staticmethod 15 | def angle_to_state(angle): 16 | return int(30 * ((angle + np.pi) / (2 * np.pi) % 1)) # Discretization of the angle space 17 | 18 | @staticmethod 19 | def x_to_state(x): 20 | return int(40 * ((x + -10) / 20)) # Discretization of the x space 21 | 22 | @staticmethod 23 | def state_discretizer(state): 24 | x = float(state.split('_')[0]) 25 | x_state = DefaultAdapter.x_to_state(x) 26 | 27 | angle = float(state.split('_')[1]) 28 | angle_state = DefaultAdapter.angle_to_state(angle) 29 | 30 | state_out = str(x_state)+'_'+str(angle_state) 31 | return state_out 32 | # ------------------------------------------------------------- 33 | 34 | _cached_state_idx: Dict[str, int] = dict() 35 
| def __init__(self, setup_info:dict={}) -> None: 36 | # ------ State Encoder --------------------------------------- 37 | # Initialise encoder based on all possible env states 38 | all_possible_x = [i*-1 for i in range(40)] 39 | all_possible_angle = [i for i in range(30)] 40 | # Need an index that preserves the identity of both the x and angle values 41 | all_possible_states = [] 42 | for x_ind in all_possible_x: 43 | for angle_ind in all_possible_angle: 44 | index = str(x_ind)+'_'+str(angle_ind) 45 | all_possible_states.append(index) 46 | # Input to pre-built possible state encoder 47 | #self.encoder = StateEncoder(all_possible_states) 48 | self.encoder = {} 49 | # Observation is string: "x_angle" 50 | # -> Then discretized and returned as string: "x_state_angle_state" 51 | # -> Before being converted to a unique numeric id (x:-10-10*2dp * angle:0-2pi*1dp) 52 | self.observation_space = Discrete(2000*30) 53 | 54 | 55 | def adapter(self, state:any, legal_moves:list = None, episode_action_history:list = None, encode:bool = True, indexed: bool = False) -> Tensor: 56 | """ Discretize the raw "x_angle" observation and encode it as a unique state id for the agent """ 57 | 58 | state = DefaultAdapter.state_discretizer(state) 59 | 60 | # Encode to Tensor for agents 61 | if encode: 62 | #state_encoded = self.encoder.encode(state=state) 63 | # elsciRL state encoder is large and not needed for tabular agents 64 | # - Won't work for neural agents 65 | if (state not in self.encoder): 66 | state_encoded = torch.tensor(len(self.encoder)) 67 | self.encoder[state] = state_encoded 68 | else: 69 | state_encoded = self.encoder[state] 70 | else: 71 | state_encoded = state 72 | 73 | if (indexed): 74 | state_indexed = list() 75 | for sent in state: 76 | if (sent not in DefaultAdapter._cached_state_idx): 77 | DefaultAdapter._cached_state_idx[sent] = len(DefaultAdapter._cached_state_idx) 78 | state_indexed.append(DefaultAdapter._cached_state_idx[sent]) 79 | 80 | state_encoded = torch.tensor(state_indexed) 81 | 82 | return state_encoded -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | .vscode/ 163 | 164 | # Dev testing problem 165 | benchmark/output/* 166 | elsciRL/benchmark/output/* 167 | elsciRL-App-output/* -------------------------------------------------------------------------------- /elsciRL/experiments/training_procedures/policy_gradient.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from elsciRL.environment_setup.results_table import ResultsTable 4 | from elsciRL.experiments.experiment_utils.config_utils import ensure_dir 5 | from elsciRL.interaction_loops.policy_gradient import PolicyGradientInteractionLoop 6 | 7 | 8 | def run_policy_gradient_training_loop( 9 | env_manager, 10 | policy_agent_factory, 11 | result_manager, 12 | training_render, 13 | training_render_save_dir, 14 | save_dir, 15 | engine_name, 16 | engine, 17 | agent_type, 18 | adapter, 19 | train_setup_info, 20 | trained_agents, 21 | num_training_seeds, 22 | test_agent_type, 23 | show_figures, 24 | number_training_repeats, 25 | wrappers=None, 26 | ): 27 | """Specialized training loop for policy-gradient agents.""" 28 | 29 | key = f"{engine_name}_{agent_type}_{adapter}" 30 | if key not in trained_agents: 31 | trained_agents[key] = {} 32 | 33 | seed_recall = {} 34 | seed_results_connection = {} 35 | observed_states_stored = {} 36 | training_results_stored = None 37 | 38 | for seed_num in range(num_training_seeds): 39 | if num_training_seeds > 1: 40 | print("------\n- Seed Num: ", seed_num) 41 | 42 | setup_num = 0 43 | temp_agent_store = {} 44 | 45 | for training_repeat in range(1, number_training_repeats + 1): 46 | setup_num += 1 47 | env = env_manager.create_gym_env(engine, adapter, train_setup_info, wrappers=wrappers) 48 | agent_parameters = train_setup_info['agent_parameters'][agent_type] 49 | agent = policy_agent_factory.create(agent_type, agent_parameters, env) 50 | 51 | total_steps = train_setup_info.get('training_action_cap', 100) * train_setup_info.get('number_training_episodes', 1) 52 | agent.learn(total_steps=total_steps) 53 | 54 | agent_name = train_setup_info.get('agent_name', f"{agent_type}_{adapter}") 55 | results_table = ResultsTable(train_setup_info) 56 | table_results = PolicyGradientInteractionLoop.policy_rollout( 57 | agent, 58 | env, 59 | agent_name, 60 | train_setup_info.get('number_training_episodes', 1), 61 | results_table, 62 | render=False, 63 | action_limit=train_setup_info.get('training_action_cap'), 64 | ) 65 | 66 | table_results['episode'] = table_results.index 67 | table_results.insert(loc=0, column='Repeat', value=setup_num) 68 | 69 | agent_save_dir = os.path.join( 70 | save_dir, 71 | f"{engine_name}_{agent_type}_{adapter}__training_results_{setup_num}" 72 | ) 73 | ensure_dir(agent_save_dir) 74 | Return = result_manager.train_report(table_results, agent_save_dir, show_figures) 75 | train_setup_info['train_save_dir'] = agent_save_dir 76 | 77 | if key not in temp_agent_store: 78 | temp_agent_store[key] = {} 79 | temp_agent_store[key][setup_num] = {'Return': Return, 'agent': agent, 'train_setup': train_setup_info.copy()} 80 | 81 | seed_recall[agent_name] = setup_num 82 | training_results_stored = table_results 83 | 84 | seed_results_connection[key] = training_results_stored 85 | 86 | def _select_training_setups(): 87 | if test_agent_type.lower() == 'best': 88 | best_repeat = max(temp_agent_store[key], key=lambda r: temp_agent_store[key][r]['Return']) 89 | return [temp_agent_store[key][best_repeat]] 90 | if test_agent_type.lower() == 'all': 91 | return list(temp_agent_store[key].values()) 
92 | return [temp_agent_store[key][setup_num]] 93 | 94 | selected_setups = _select_training_setups() 95 | trained_agents[key][agent_name] = [entry['agent'] for entry in selected_setups] if len(selected_setups) > 1 else selected_setups[0]['agent'] 96 | 97 | training_setups_for_key = {} 98 | for idx, entry in enumerate(selected_setups, start=1): 99 | training_setup = entry['train_setup'] 100 | repeat_label = entry.get('train_setup', {}).get('Repeat', idx) 101 | training_setups_for_key[f"Training_Setup_{engine_name}_{agent_type}_{adapter}_{repeat_label}"] = training_setup 102 | 103 | return trained_agents, seed_results_connection, temp_agent_store, training_results_stored, observed_states_stored 104 | -------------------------------------------------------------------------------- /tests/test_policy_gradient_classroom.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run the following to test: 3 | pytest tests/test_policy_gradient_classroom.py -k Classroom 4 | """ 5 | 6 | import pytest 7 | 8 | from elsciRL.application_suite.import_tool import PullApplications 9 | from elsciRL.experiments.policy_gradient import PolicyGradientExperiment 10 | 11 | 12 | @pytest.mark.integration 13 | @pytest.mark.slow 14 | def test_policy_gradient_runs_on_classroom(tmp_path): 15 | pytest.importorskip("stable_baselines3") 16 | 17 | puller = PullApplications() 18 | try: 19 | application_data = puller.pull(['Classroom']) 20 | except Exception as exc: 21 | pytest.skip(f"Classroom application unavailable: {exc}") 22 | 23 | classroom_data = application_data.get('Classroom') 24 | if not classroom_data: 25 | pytest.skip("Classroom application data missing") 26 | 27 | default_adapter = 'default' 28 | if default_adapter not in classroom_data['adapters']: 29 | default_adapter = list(classroom_data['adapters'].keys())[0] 30 | 31 | # Sanity-check the Classroom engine's API matches Gym expectations. 
32 | engine_cls = classroom_data['engine'] 33 | engine_instance = engine_cls(classroom_data['local_configs']['classroom_A']) 34 | initial_obs = engine_instance.reset() 35 | step_output = engine_instance.step(state=initial_obs, action=0) 36 | assert isinstance(step_output, tuple), "Engine.step should return a tuple" 37 | assert len(step_output) == 4, f"Engine.step must return 4 values, got {len(step_output)}" 38 | 39 | base_experiment_data = { 40 | "number_training_episodes": 100, 41 | "number_training_repeats": 1, 42 | "number_training_seeds": 1, 43 | "number_test_episodes": 100, 44 | "number_test_repeats": 1, 45 | "training_action_cap": 16, 46 | "testing_action_cap": 16, 47 | "test_agent_type": "best", 48 | "reward_signal": [1, 0, 0], 49 | "train": True, 50 | "live_env": True, 51 | } 52 | 53 | agent_configs = { 54 | "PPO": { 55 | "agent_parameters": { 56 | "learning_rate": 3e-4, 57 | "batch_size": 64, 58 | "minibatch_size": 16, 59 | "update_epochs": 2, 60 | "hidden_size": 64, 61 | } 62 | }, 63 | } 64 | 65 | for agent_type, config in agent_configs.items(): 66 | experiment_data = base_experiment_data.copy() 67 | experiment_data["agent_select"] = [agent_type] 68 | experiment_data["adapter_select"] = [default_adapter] 69 | experiment_data["adapter_input_dict"] = {agent_type: [default_adapter]} 70 | experiment_data["agent_parameters"] = {agent_type: config["agent_parameters"]} 71 | 72 | experiment_config = {"data": experiment_data} 73 | local_config = {"data": classroom_data['local_configs']['classroom_A']} 74 | 75 | agent_tmp_dir = tmp_path / agent_type 76 | agent_tmp_dir.mkdir(parents=True, exist_ok=True) 77 | 78 | experiment = PolicyGradientExperiment( 79 | Config=experiment_config, 80 | ProblemConfig=local_config, 81 | Engine=classroom_data['engine'], 82 | Adapters=classroom_data['adapters'], 83 | save_dir=str(agent_tmp_dir), 84 | show_figures='No', 85 | window_size=0.1, 86 | ) 87 | 88 | print(f"Training {agent_type} agent...") 89 | training_setups = experiment.train() 90 | assert training_setups, f"Policy gradient training should generate setups for {agent_type}" 91 | 92 | print(f"Testing {agent_type} agent...") 93 | evaluation = experiment.test() 94 | assert evaluation is not None 95 | 96 | print(f"Rendering {agent_type} agent...") 97 | render_dir = agent_tmp_dir / "renders" 98 | render_outputs = experiment.render_results(render_save_dir=str(render_dir)) 99 | assert render_outputs is not None 100 | print('Test complete') 101 | 102 | 103 | if __name__ == "__main__": 104 | import argparse 105 | import tempfile 106 | from pathlib import Path 107 | 108 | parser = argparse.ArgumentParser(description="Run policy-gradient classroom test") 109 | parser.add_argument( 110 | "--output-dir", 111 | type=Path, 112 | default=None, 113 | help="Directory where test artifacts should be saved. 
Uses a temp dir if omitted.", 114 | ) 115 | args = parser.parse_args() 116 | 117 | if args.output_dir is not None: 118 | args.output_dir.mkdir(parents=True, exist_ok=True) 119 | test_policy_gradient_runs_on_classroom(args.output_dir) 120 | else: 121 | with tempfile.TemporaryDirectory() as tmp_dir: 122 | test_policy_gradient_runs_on_classroom(Path(tmp_dir)) 123 | -------------------------------------------------------------------------------- /elsciRL/analysis/combined_tabular_results.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import json 5 | 6 | def combined_tabular_analysis_results(results_dir:str='', analysis_type='training'): 7 | if results_dir == '': 8 | raise ValueError("Save directory not specified.") 9 | analysis_type = analysis_type.lower() # lowercase analysis type input 10 | # Get sub-dir for each problem-experiment type 11 | instruction_folders = [os.path.join(results_dir, instr) for instr in os.listdir(results_dir) if os.path.isdir(os.path.join(results_dir, instr))] 12 | variance_results = {} 13 | for instr_folder_dir in instruction_folders: 14 | instr_id = instr_folder_dir.split('/')[-1].split('//')[-1].split('\\')[-1].split('\\\\')[-1] 15 | if instr_id not in variance_results.keys(): 16 | variance_results[instr_id] = {} 17 | print(f"Processing {instr_id} for {analysis_type} analysis.") 18 | problem_folders = [name for name in os.listdir(instr_folder_dir) if os.path.isdir(os.path.join(instr_folder_dir, name))] 19 | # Find experiment folders 20 | # - Capture case where there is only one experiment type 21 | # and so wont have sub-directory for experiments to search 22 | for experiment_dir in problem_folders: 23 | if analysis_type == 'training': 24 | experiment_name = experiment_dir+'_training' 25 | file_names = [name for name in os.listdir(instr_folder_dir+'/'+experiment_dir) if name[0:25] == 'training_variance_results'] 26 | elif analysis_type == 'testing': 27 | experiment_name = experiment_dir+'_testing' 28 | file_names = [name for name in os.listdir(instr_folder_dir+'/'+experiment_dir) if name[0:24] == 'testing_variance_results'] 29 | else: 30 | raise ValueError("Analysis type must be either 'training' or 'testing'.") 31 | 32 | if experiment_name not in variance_results[instr_id].keys(): 33 | variance_results[instr_id][experiment_name] = {} 34 | 35 | for file in file_names: 36 | results = pd.read_csv(instr_folder_dir+'/'+experiment_dir+'/'+file) 37 | agent = results['agent'].iloc[0].split('__')[0] 38 | if agent not in variance_results[instr_id][experiment_name].keys(): 39 | variance_results[instr_id][experiment_name][agent] = {} 40 | 41 | # Calculate Mean and Std Dev 42 | variance_results[instr_id][experiment_name][agent]['num_repeats'] = results['num_repeats'].iloc[0] 43 | variance_results[instr_id][experiment_name][agent]['number_episodes'] = results.index.max() + 1 44 | # - rolling avg R per episode 45 | variance_results[instr_id][experiment_name][agent]['mean'] = results['avg_R_mean'].mean() 46 | variance_results[instr_id][experiment_name][agent]['median'] = results['avg_R_mean'].median() 47 | variance_results[instr_id][experiment_name][agent]['std_error'] = results['avg_R_mean'].sem() 48 | variance_results[instr_id][experiment_name][agent]['std_dev'] = results['avg_R_mean'].std() 49 | variance_results[instr_id][experiment_name][agent]['variance'] = results['avg_R_mean'].var() 50 | # - cumulative R per episode 51 | 
variance_results[instr_id][experiment_name][agent]['cum_R_mean'] = results['cum_R_mean'].mean() 52 | variance_results[instr_id][experiment_name][agent]['cum_R_median'] = results['cum_R_mean'].median() 53 | variance_results[instr_id][experiment_name][agent]['cum_R_std_error'] = results['cum_R_mean'].sem() 54 | variance_results[instr_id][experiment_name][agent]['cum_R_std_dev'] = results['cum_R_mean'].std() 55 | variance_results[instr_id][experiment_name][agent]['cum_R_variance'] = results['cum_R_mean'].var() 56 | # - time avg per episode 57 | variance_results[instr_id][experiment_name][agent]['time_avg'] = results['time_mean'].mean() 58 | 59 | variance_results_df = pd.DataFrame.from_dict( 60 | {f"{instr}/{experiment}/{agent}": data for instr, experiments in variance_results.items() 61 | for experiment, agents in experiments.items() 62 | for agent, data in agents.items()}, 63 | orient='index' 64 | ).reset_index() 65 | variance_results_df.columns = ['Instruction/Experiment/Agent', 'Num Repeats', 'Number Episodes', 66 | 'Avg R Mean', 'Avg R Median', 'Avg R Std Error', 'Avg R Std Dev', 'Avg R Variance', 67 | 'Cumulative R Mean', 'Cumulative R Median', 'Cumulative R Std Error', 68 | 'Cumulative R Std Dev', 'Cumulative R Variance', 'Time Avg'] 69 | # Save the combined results to a CSV file 70 | combined_results_filename = f"{analysis_type}_combined_results.csv" 71 | combined_results_path = os.path.join(results_dir, combined_results_filename) 72 | variance_results_df.to_csv(combined_results_path, index=False) 73 | -------------------------------------------------------------------------------- /elsciRL/examples/adapters/elsciRL_sailing_language.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | import numpy as np 3 | import pandas as pd 4 | import torch 5 | from torch import Tensor 6 | # StateAdapter includes static methods for adapters 7 | from elsciRL.encoders.language_transformers.MiniLM_L6v2 import LanguageEncoder 8 | from gymnasium.spaces import Text, Box 9 | 10 | 11 | class LanguageAdapter: 12 | _cached_state_idx: Dict[str, int] = dict() 13 | 14 | def __init__(self, setup_info:dict={}) -> None: 15 | # Language encoder doesn't require any preset knowledge of env to use 16 | self.encoder = LanguageEncoder() 17 | # Observation is string: "x_angle" 18 | # -> encoder output is 1x384 tensor from miniLM 19 | self.observation_space = Box(low=-1, high=1, shape=(1,384), dtype=np.float32) 20 | 21 | def adapter(self, state:any, legal_moves:list = None, episode_action_history:list = None, encode:bool = True, indexed: bool = False) -> Tensor: 22 | """ Generate a natural-language description of the current sailing state and encode it for the agent """ 23 | 24 | # state = 'x_angle' 25 | # legal_moves = [0,1] 26 | # episode_action_history = [action, action, action] where action = [0,1] 27 | 28 | # Angle is relative to the goal of moving forward (i.e.
bearing) 29 | # - angle=0 is directly forward 30 | # - angle<0 is slightly left 31 | # - angle>0 is slightly right 32 | 33 | x = float(state.split('_')[0]) 34 | angle = float(state.split('_')[1]) 35 | 36 | # Horizontal position 37 | if (x>-1)&(x<1): 38 | L_x = 'in the middle' 39 | elif (x>-3)&(x<3): 40 | L_x = 'near to the center' 41 | elif (x>-5)&(x<5): 42 | L_x = 'in between the edge and the center' 43 | elif (x>-7)&(x<7): 44 | L_x = 'near to the edge' 45 | elif (x>=-10)&(x<=10): 46 | L_x = 'very close to the edge' 47 | else: 48 | L_x = 'out of bounds' 49 | 50 | # Side of river 51 | if x<0: 52 | L_x_side = 'on the harbor side of the river' 53 | elif x>0: 54 | L_x_side = 'on the beach side of the river' 55 | else: 56 | L_x_side = '' 57 | 58 | # Angle 59 | # - Defined in radians where 90deg = 1.57 60 | # - Peak velocity at 45deg = pi/4 = 0.7853... 61 | if angle==0: 62 | L_angle = 'facing directly into the wind' 63 | elif (angle>-0.1)&(angle<0.1): 64 | L_angle = 'facing into the wind' 65 | elif (angle>-0.5)&(angle<0.5): 66 | L_angle = 'close hauled with wind' 67 | elif (angle>-1)&(angle<1): 68 | L_angle = 'cutting the wind' 69 | else: 70 | L_angle = 'moving across the wind' 71 | # Wind side 72 | if angle<0: 73 | L_wind_side = 'on the starboard side' 74 | elif angle>0: 75 | L_wind_side = 'on the port side' 76 | else: 77 | L_wind_side = '' 78 | 79 | L_state = 'The boat is ' + L_x_side + ' ' + L_x + ', ' + L_angle + ' ' + L_wind_side + ', ' 80 | L_state = L_state.replace(' ', ' ').replace(' .','.').replace(' ,',',').replace(' and,','') # Remove double spaces 81 | 82 | # Last action taken and final language state output 83 | if len(episode_action_history)>0: 84 | last_action = episode_action_history[-1] 85 | # if last_action==0: 86 | # L_action = 'the last action was to turn to the left slightly.' 87 | # elif last_action==1: 88 | # L_action = 'the last action was to turn to the right slightly.' 89 | 90 | if (x<=0)&(last_action==0): 91 | L_action = 'the last action was to turn towards the harbor.' 92 | elif (x<0)&(last_action==1): 93 | L_action = 'the last action was to turn towards the center of the river.' 94 | elif (x>=0)&(last_action==1): 95 | L_action = 'the last action was to turn towards the beach.' 96 | elif (x>0)&(last_action==0): 97 | L_action = 'the last action was to turn towards the center of the river.' 
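            # Illustrative example (approximate): for state '-2.5_0.3' after last action 1, the description
            # assembled below reads roughly: 'The boat is on the harbor side of the river near to the center,
            # close hauled with wind on the port side, the last action was to turn towards the center of the river.'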
98 | 99 | state = L_state + ' ' + L_action 100 | else: 101 | state = L_state 102 | 103 | #print(state) 104 | 105 | # Encode to Tensor for agents 106 | if encode: 107 | state_encoded = self.encoder.encode(state=state) 108 | else: 109 | state_encoded = state 110 | 111 | if (indexed): 112 | state_indexed = list() 113 | for sent in state: 114 | if (sent not in LanguageAdapter._cached_state_idx): 115 | LanguageAdapter._cached_state_idx[sent] = len(LanguageAdapter._cached_state_idx) 116 | state_indexed.append(LanguageAdapter._cached_state_idx[sent]) 117 | 118 | state_encoded = torch.tensor(state_indexed) 119 | 120 | return state_encoded -------------------------------------------------------------------------------- /elsciRL/environment_setup/results_table.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | class ResultsTable: 5 | def __init__(self, local_setup_info:dict = None) -> None: 6 | if type(local_setup_info['training_results']) != type(pd.DataFrame()): 7 | self.agent:list = [] 8 | self.opponent:list =[] 9 | self.episode:list = [] 10 | self.num_actions:list = [] 11 | self.episode_reward:list = [] 12 | self.cumulative_reward:list = [] 13 | self.time_per_episode:list = [] 14 | self.action_history:list = [] 15 | self.q_total:list = [] 16 | self.q_mean:list = [] 17 | # new 18 | self.cum_r = 0 19 | else: 20 | self.agent:list = local_setup_info['training_results'].agent.tolist() 21 | self.opponent:list =local_setup_info['training_results'].opponent.tolist() 22 | self.episode:list = local_setup_info['training_results'].episode.tolist() 23 | self.num_actions:list = local_setup_info['training_results'].num_actions.tolist() 24 | self.episode_reward:list = local_setup_info['training_results'].episode_reward.tolist() 25 | self.cumulative_reward:list = local_setup_info['training_results'].cumulative_reward.tolist() 26 | self.cum_r = self.cumulative_reward[-1] 27 | self.time_per_episode:list = local_setup_info['training_results'].time_per_episode.tolist() 28 | self.action_history:list = local_setup_info['training_results'].action_history.tolist() 29 | self.q_total:list = local_setup_info['training_results'].q_total.tolist() 30 | self.q_mean:list = local_setup_info['training_results'].q_mean.tolist() 31 | 32 | def results_per_episode(self,agent_name:str='missing', opponent_name:str='None', episode_num:int=0, action_num:int=0, episode_reward:float=0, time:float=0, episode_action_history:list=[], q_total:float=0, q_mean:float=0): 33 | self.agent.append(agent_name) 34 | self.opponent.append(opponent_name) 35 | self.episode.append(episode_num) 36 | self.num_actions.append(action_num) 37 | self.episode_reward.append(episode_reward) 38 | self.cum_r +=episode_reward 39 | self.cumulative_reward.append(self.cum_r) 40 | self.time_per_episode.append(time) 41 | self.action_history.append(episode_action_history) 42 | self.q_total.append(q_total) 43 | self.q_mean.append(q_mean) 44 | 45 | 46 | def results_table_format(self): 47 | results= pd.DataFrame({ 48 | 'agent': self.agent, 49 | 'opponent': self.opponent, 50 | 'episode': self.episode, 51 | 'num_actions': self.num_actions, 52 | 'episode_reward': self.episode_reward, 53 | "cumulative_reward": self.cumulative_reward, 54 | "time_per_episode":self.time_per_episode, 55 | "action_history": self.action_history, 56 | "q_total":self.q_total, 57 | "q_mean":self.q_mean}) 58 | return results 59 | 60 | def reset(self): 61 | self.agent:list = [] 62 | self.opponent:list =[] 63 | self.episode:list = [] 64 | 
self.num_actions:list = [] 65 | self.episode_reward:list = [] 66 | self.cum_r = 0 67 | self.cumulative_reward:list = [] 68 | self.time_per_episode:list = [] 69 | self.action_history:list = [] 70 | self.q_total:list = [] 71 | self.q_mean:list = [] 72 | 73 | def copy(self): 74 | results_copy= pd.DataFrame({ 75 | 'agent': self.agent.copy(), 76 | 'opponent': self.opponent.copy(), 77 | 'episode': self.episode.copy(), 78 | 'num_actions': self.num_actions.copy(), 79 | 'episode_reward': self.episode_reward.copy(), 80 | "cumulative_reward": self.cumulative_reward.copy(), 81 | "time_per_episode":self.time_per_episode.copy(), 82 | "action_history":self.action_history.copy(), 83 | "q_total":self.q_total.copy(), 84 | "q_mean":self.q_mean.copy()}) 85 | return results_copy 86 | 87 | def load(self, results_copy): 88 | self.agent:list = results_copy.agent.tolist() 89 | self.opponent:list = results_copy.opponent.tolist() 90 | self.episode:list = results_copy.episode.tolist() 91 | self.num_actions:list = results_copy.num_actions.tolist() 92 | self.episode_reward:list = results_copy.episode_reward.tolist() 93 | self.cumulative_reward:list = results_copy.cumulative_reward.tolist() 94 | self.cum_r = self.cumulative_reward[-1] 95 | self.time_per_episode:list = results_copy.time_per_episode.tolist() 96 | self.action_history:list = results_copy.action_history.tolist() 97 | self.q_total:list = results_copy.q_total.tolist() 98 | self.q_mean:list = results_copy.q_mean.tolist() 99 | 100 | 101 | -------------------------------------------------------------------------------- /elsciRL/adapters/LLM_state_generators/text_gpt-4.1.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, Any 3 | import os 4 | 5 | from openai import OpenAI 6 | 7 | try: 8 | from torch import Tensor 9 | from elsciRL.encoders.language_transformers.MiniLM_L6v2 import LanguageEncoder 10 | except ImportError: 11 | print("Warning: torch or LanguageEncoder not found. Please ensure elsciRL is properly installed.") 12 | Tensor = None 13 | LanguageEncoder = None 14 | 15 | class LLMAdapter(ABC): 16 | """Convert a general prompt and raw text state into a description of the state.""" 17 | def __init__(self, base_prompt: str): 18 | super().__init__() 19 | # Define the fields that describe the state features: 20 | self.base_prompt = base_prompt 21 | 22 | @abstractmethod 23 | def _read(raw_state) -> list: 24 | # Read the data. 25 | # fill in the feature fields 26 | raise NotImplementedError 27 | 28 | 29 | class GPTAdapter(LLMAdapter): 30 | """Adapter for OpenAI GPT models.""" 31 | 32 | def __init__(self, base_prompt: str, model_name: str = "gpt-4"): 33 | super().__init__(base_prompt) 34 | self.model_name = model_name 35 | 36 | # Initialize the language encoder for encoding functionality 37 | if LanguageEncoder is not None: 38 | self.encoder = LanguageEncoder() 39 | else: 40 | print("Warning: LanguageEncoder not available. 
Encoding will not work.") 41 | self.encoder = None 42 | 43 | def _read(self, raw_state) -> list: 44 | """Read the data and fill in the feature fields.""" 45 | # This method should be implemented based on specific requirements 46 | # For now, returning the raw state as a list 47 | return [raw_state] if isinstance(raw_state, str) else raw_state 48 | 49 | def call_gpt_api(self, prompt: str): 50 | """Call the OpenAI GPT API with the given prompt.""" 51 | try: 52 | api_key = os.environ.get("OPENAI_API_KEY") 53 | if not api_key: 54 | raise ValueError("OPENAI_API_KEY environment variable not set") 55 | 56 | client = OpenAI(api_key=api_key) 57 | response = client.chat.completions.create( 58 | model=self.model_name, 59 | messages=[ 60 | {"role": "system", "content": self.base_prompt}, 61 | {"role": "user", "content": prompt} 62 | ], 63 | max_tokens=5000 64 | ) 65 | return response.to_dict() if hasattr(response, 'to_dict') else response 66 | except Exception as e: 67 | print(f"Error calling OpenAI API: {e}") 68 | return None 69 | 70 | def process_gpt_response(self, response): 71 | """Process the response from OpenAI API.""" 72 | if response and 'choices' in response: 73 | return response['choices'][0]['message']['content'] 74 | return None 75 | 76 | def adapter(self, state: any, legal_moves: list = None, episode_action_history: list = None, encode: bool = True, indexed: bool = False) -> Tensor: 77 | """Returns the adapted form, may require input flag for encoded or non-encoded output.""" 78 | # Build the full context prompt including legal moves and action history 79 | context_parts = [] 80 | 81 | # Add state information 82 | if state is not None: 83 | context_parts.append(f"Current state: {state}") 84 | 85 | # Add legal moves if provided 86 | if legal_moves is not None and len(legal_moves) > 0: 87 | context_parts.append(f"Legal moves: {legal_moves}") 88 | 89 | # Add action history if provided 90 | if episode_action_history is not None and len(episode_action_history) > 0: 91 | recent_actions = episode_action_history[-5:] # Last 5 actions 92 | context_parts.append(f"Recent actions: {recent_actions}") 93 | 94 | # Combine all context into a single prompt 95 | full_prompt = " | ".join(context_parts) 96 | 97 | # Get GPT response 98 | adapted_state = self.call_gpt_api(full_prompt) 99 | processed_response = self.process_gpt_response(adapted_state) 100 | 101 | if processed_response is None: 102 | processed_response = str(state) if state is not None else "No state available" 103 | 104 | # Handle encoding 105 | if encode: 106 | if self.encoder is not None: 107 | # Use the LanguageEncoder to encode the response 108 | state_encoded = self.encoder.encode( 109 | state=processed_response, 110 | legal_actions=legal_moves, 111 | episode_action_history=episode_action_history, 112 | indexed=indexed 113 | ) 114 | return state_encoded 115 | else: 116 | print("Warning: Encoder not available, returning processed response as string") 117 | return processed_response 118 | else: 119 | return processed_response 120 | 121 | def sample(self, state: any): 122 | """Returns a sample of an adapted state form (typically initial position of the environment).""" 123 | if not state: 124 | state = 'The current state is empty.' 
125 | return self.adapter(state, encode=True) 126 | 127 | 128 | -------------------------------------------------------------------------------- /elsciRL/experiments/training_procedures/default_exp_training.py: -------------------------------------------------------------------------------- 1 | import os 2 | from elsciRL.experiments.experiment_utils.config_utils import ensure_dir 3 | from elsciRL.experiments.experiment_utils.render_current_results import render_current_result 4 | 5 | 6 | def run_training_loop( 7 | env_manager, 8 | agent_factory, 9 | result_manager, 10 | training_render, 11 | training_render_save_dir, 12 | save_dir, 13 | engine_name, 14 | engine, 15 | agent_type, 16 | adapter, 17 | all_adapters, 18 | train_setup_info, 19 | trained_agents, 20 | num_training_seeds, 21 | test_agent_type, 22 | show_figures, 23 | number_training_repeats, 24 | gym_env:bool=False 25 | ): 26 | if f"{engine_name}_{agent_type}_{adapter}" not in trained_agents: 27 | trained_agents[f"{engine_name}_{agent_type}_{adapter}"] = {} 28 | seed_recall = {} 29 | seed_results_connection = {} 30 | for seed_num in range(num_training_seeds): 31 | if num_training_seeds > 1: 32 | print("------\n- Seed Num: ", seed_num) 33 | if seed_num == 0: 34 | train_setup_info['training_results'] = False 35 | train_setup_info['observed_states'] = False 36 | else: 37 | train_setup_info['training_results'] = False 38 | train_setup_info['observed_states'] = observed_states_stored.copy() 39 | setup_num = 0 40 | temp_agent_store = {} 41 | for training_repeat in range(1, number_training_repeats + 1): 42 | if number_training_repeats > 1: 43 | print("------\n- Repeat Num: ", training_repeat) 44 | setup_num += 1 45 | agent = agent_factory.create(agent_type, train_setup_info['agent_parameters'][agent_type], engine, adapter) 46 | train_setup_info['agent'] = agent 47 | # Create the environment, use gym_env if specified 48 | if gym_env: 49 | live_env = env_manager.create_gym_env(engine, adapter, train_setup_info) 50 | else: 51 | live_env = env_manager.create_env(engine, all_adapters, train_setup_info) 52 | # --- 53 | if training_repeat > 1: 54 | live_env.start_obs = env_start 55 | env_start = live_env.start_obs 56 | goal = str(env_start).split(".")[0] + "---GOAL" 57 | print("Flat agent Goal: ", goal) 58 | if goal in seed_recall: 59 | setup_num = seed_recall[goal] 60 | else: 61 | seed_recall[goal] = 1 62 | agent_save_dir = os.path.join(save_dir, 63 | f"{engine_name}_{agent_type}_{adapter}__training_results_{goal}_{setup_num}" 64 | ) if num_training_seeds > 1 else os.path.join(save_dir, 65 | f"{engine_name}_{agent_type}_{adapter}__training_results_{setup_num}" 66 | ) 67 | ensure_dir(agent_save_dir) 68 | if goal in trained_agents[f"{engine_name}_{agent_type}_{adapter}"]: 69 | live_env.agent = trained_agents[f"{engine_name}_{agent_type}_{adapter}"][goal].clone() 70 | live_env.agent.exploration_parameter_reset() 71 | if goal in seed_results_connection: 72 | live_env.results.load(seed_results_connection[goal]) 73 | training_results = live_env.episode_loop() 74 | training_results['episode'] = training_results.index 75 | training_results.insert(loc=0, column='Repeat', value=setup_num) 76 | Return = result_manager.train_report(training_results, agent_save_dir, show_figures) 77 | if goal not in temp_agent_store: 78 | temp_agent_store[goal] = {} 79 | temp_agent_store[goal][setup_num] = {'Return': Return, 'agent': live_env.agent.clone()} 80 | if training_repeat == 1: 81 | max_Return = Return 82 | best_agent = live_env.agent 83 | training_results_stored = 
live_env.results.copy() 84 | observed_states_stored = live_env.elsciRL.observed_states 85 | if Return > max_Return: 86 | max_Return = Return 87 | best_agent = live_env.agent 88 | training_results_stored = live_env.results.copy() 89 | observed_states_stored = live_env.elsciRL.observed_states 90 | seed_recall[goal] = seed_recall[goal] + 1 91 | train_setup_info['train_save_dir'] = agent_save_dir 92 | if training_render: 93 | current_render_save_dir = training_render_save_dir or agent_save_dir 94 | render_current_result( 95 | training_setup=train_setup_info, 96 | current_environment=live_env, 97 | current_agent=live_env.agent, 98 | local_save_dir=current_render_save_dir 99 | ) 100 | seed_results_connection[goal] = training_results_stored 101 | # Save trained agent(s) 102 | if test_agent_type.lower() == 'best': 103 | trained_agents[f"{engine_name}_{agent_type}_{adapter}"][goal] = best_agent.clone() 104 | elif test_agent_type.lower() == 'all': 105 | start_repeat_num = list(temp_agent_store[goal].keys())[0] 106 | end_repeat_num = list(temp_agent_store[goal].keys())[-1] 107 | all_agents = [temp_agent_store[goal][repeat]['agent'] for repeat in range(start_repeat_num, end_repeat_num + 1)] 108 | trained_agents[f"{engine_name}_{agent_type}_{adapter}"][goal] = all_agents 109 | 110 | return trained_agents, seed_results_connection, temp_agent_store, training_results_stored, observed_states_stored 111 | -------------------------------------------------------------------------------- /elsciRL/GUI/LLM_tools/LLM_utils.py: -------------------------------------------------------------------------------- 1 | # IMPORTS LLM API TOOLS 2 | # EDITED OUT FOR NOW UNTIL FULL IMPLEMENTATION READY 3 | 4 | # import os 5 | # import json 6 | 7 | # from openai import OpenAI 8 | 9 | # def call_gpt_api(prompt): 10 | # import os 11 | # api_key = os.environ.get("OPENAI_API_KEY") 12 | # client = OpenAI(api_key=api_key) 13 | # response = client.chat.completions.create( 14 | # model="gpt-4.1", 15 | # messages=[{"role": "system", "content": prompt}], 16 | # max_tokens=5000 17 | # ) 18 | # return response.to_dict() if hasattr(response, 'to_dict') else response 19 | 20 | # def process_gpt_response(response): 21 | # if response and 'choices' in response: 22 | # return response['choices'][0]['message']['content'] 23 | # return None 24 | 25 | # def generate_application(self, user_input:str=''): 26 | # # TODO: Use this in a new tab with user input to update application list 27 | # # Load the app_setup.md content as part of the system prompt 28 | 29 | # # Add requirement to system prompt for code chunk separation 30 | # system_prompt_requirement = ( 31 | # "If your response contains any code chunks, you must output them in a separate section clearly marked as 'Code Output', " 32 | # "so that the application can extract and save them to a file. Do not mix code with explanations in the same section." 33 | # ) 34 | # # Combine the app_setup.md info with the system prompt and the new requirement 35 | # system_prompt = ( 36 | # "You are a helpful assistant. " 37 | # "Below is important application setup information for elsciRL:\n" 38 | # f"{self.app_setup_info}\n" 39 | # f"{system_prompt_requirement}\n" 40 | # "Please use this information to answer user queries." 
41 | # ) 42 | 43 | # if not user_input: 44 | # return {"error": "No input provided"} 45 | 46 | # # Use the utils function to call the GPT API 47 | # response = call_gpt_api(system_prompt + "\nUser: " + user_input) 48 | # reply = process_gpt_response(response) 49 | # print(reply) 50 | # if not reply: 51 | # return {"error": "Failed to get response from GPT API"} 52 | 53 | # # Save the complete output to a .txt file 54 | # output_dir = os.path.join(os.path.dirname(__file__), 'output') 55 | # os.makedirs(output_dir, exist_ok=True) 56 | # output_path = os.path.join(output_dir, 'last_gpt_response.txt') 57 | # with open(output_path, 'w', encoding='utf-8') as f: 58 | # f.write(str(response)) 59 | 60 | # # Follow-up: Ask the AI model to extract all Python code and JSON config blocks and return a list of (filename, code) pairs 61 | # followup_prompt = ( 62 | # "Extract all Python code blocks and JSON config blocks from the following text. " 63 | # "For each code or config block, output a JSON array where each item has 'filename' and 'code' fields. " 64 | # "Choose a descriptive filename for each code block (e.g., based on class/function names or comments, use .py for Python and .json for configs). " 65 | # "Do not include any explanation, only the JSON array.\n\n" + reply 66 | # ) 67 | # code_response = call_gpt_api(followup_prompt) 68 | # code_reply = process_gpt_response(code_response) 69 | # try: 70 | # code_blocks = json.loads(code_reply) 71 | # generated_data = {} 72 | # for block in code_blocks: 73 | # fname = block.get('filename', 'extracted_code.py') 74 | # code = block.get('code', '') 75 | # generated_data[fname] = code 76 | # code_file_path = os.path.join(output_dir, fname) 77 | # with open(code_file_path, 'w', encoding='utf-8') as code_file: 78 | # code_file.write(code) 79 | # except Exception as e: 80 | # # fallback: save the raw reply if not valid JSON 81 | # code_file_path = os.path.join(output_dir, 'extracted_code.py') 82 | # with open(code_file_path, 'w', encoding='utf-8') as code_file: 83 | # code_file.write(code_reply.strip()) 84 | 85 | # for name,code in generated_data.items(): 86 | # if 'engine' in name.lower(): 87 | # generated_data['engine'] = code 88 | # elif 'analysis' in name.lower(): 89 | # generated_data['analysis'] = code 90 | # elif ('experiment' in name.lower()) | ('agent' in name.lower()): 91 | # generated_data['agent_config'] = code 92 | # elif ('local' in name.lower()) | ('env' in name.lower()): 93 | # generated_data['local_config'] = code 94 | # elif 'adapter_language' in name.lower(): 95 | # generated_data['adapter_language'] = code 96 | # elif ('numeric' in name.lower()) | ('default' in name.lower()): 97 | # generated_data['adapter_numeric'] = code 98 | 99 | # # Create the application setup dictionary 100 | # application_setup = { 101 | # 'engine':generated_data['engine'], 102 | # 'experiment_configs':{'quick_test':generated_data['agent_config']}, 103 | # 'local_configs':{'env_config':generated_data['local_config']}, 104 | # 'adapters':{'numeric_adapter':generated_data['adapter_numeric'], 105 | # 'language_adapter':generated_data['adapter_language']}, 106 | # 'local_analysis':{'blackjack_graphs':generated_data['analysis']}, 107 | # 'prerender_data':{}, 108 | # 'prerender_images':{}, 109 | # } 110 | 111 | # # Add the new application to the application data 112 | # self.pull_app_data = self.application_data.add_applicaiton( 113 | # problem=generated_data['agent_config']['name'], 114 | # application_data=application_setup 115 | # ) 116 | 117 | # return reply 
-------------------------------------------------------------------------------- /elsciRL/interaction_loops/state_search.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | # ------ Imports ----------------------------------------- 6 | # Agent Setup 7 | from elsciRL.environment_setup.imports import ImportHelper 8 | 9 | # Evaluation standards 10 | from elsciRL.environment_setup.results_table import ResultsTable 11 | from elsciRL.environment_setup.elsciRL_info import elsciRLInfo 12 | 13 | 14 | def episode_loop(Engine, Adapters: dict, local_setup_info: dict, number_episodes: int = 1000, 15 | batch_number: int = 0, observed_states: dict = {}) -> dict: 16 | # --- INIT state space from engine 17 | agent_adapter_name = local_setup_info['agent_type'] + "_" + local_setup_info['adapter_select'] 18 | engine = Engine(local_setup_info) 19 | start_obs = engine.reset() 20 | # --- PRESET elsciRL INFO 21 | # Agent 22 | Imports = ImportHelper(local_setup_info) 23 | agent, agent_type, agent_name, agent_state_adapter = ( 24 | Imports.agent_info(Adapters) 25 | ) 26 | ( 27 | num_train_episodes, 28 | num_test_episodes, 29 | training_action_cap, 30 | testing_action_cap, 31 | reward_signal, 32 | ) = Imports.parameter_info() 33 | 34 | # Training or testing phase flag 35 | train = Imports.training_flag() 36 | 37 | # Mode selection (already initialized) 38 | # --- elsciRL 39 | live_env, observed_states_flag = ( 40 | Imports.live_env_flag() 41 | ) 42 | # Results formatting 43 | results = ResultsTable(local_setup_info) 44 | # elsciRL input function 45 | # - We only want to init trackers on first batch otherwise it resets knowledge 46 | elsciRL = elsciRLInfo(observed_states) 47 | # RENDER AND SUB-GOALS REMOVED COMPLETELY SO SAVE RUN-TIME 48 | 49 | for episode in tqdm(range(0, number_episodes)): 50 | action_history = [] 51 | # --- 52 | # Start observation is used instead of .reset() fn so that this can be overridden for repeat analysis from the same start pos 53 | obs = engine.reset(start_obs=start_obs) 54 | legal_moves = engine.legal_move_generator(obs) 55 | 56 | # LLM agents need to pass the state as a string 57 | if agent_type.split("_")[0] == "LLM": 58 | state = agent_state_adapter.adapter( 59 | state=obs, 60 | legal_moves=legal_moves, 61 | episode_action_history=action_history, 62 | encode=False, 63 | ) 64 | else: 65 | state = agent_state_adapter.adapter( 66 | state=obs, 67 | legal_moves=legal_moves, 68 | episode_action_history=action_history, 69 | encode=True, 70 | ) 71 | # --- 72 | start_time = time.time() 73 | episode_reward: int = 0 74 | # --- 75 | for action in range(0, training_action_cap): 76 | if live_env: 77 | # Agent takes action 78 | legal_moves = engine.legal_move_generator(obs) 79 | agent_action = agent.policy(state, legal_moves) 80 | 81 | if isinstance(agent_action, np.int64): 82 | action_history.append(agent_action.item()) 83 | else: 84 | action_history.append(agent_action) 85 | 86 | next_obs, reward, terminated, _ = engine.step( 87 | state=obs, action=agent_action 88 | ) 89 | 90 | # Can override reward per action with small negative punishment 91 | if reward == 0: 92 | reward = reward_signal[1] 93 | 94 | # Only update observed states if not already observed 95 | if next_obs not in observed_states: 96 | legal_moves = engine.legal_move_generator(next_obs) 97 | # LLM agents need to pass the state as a string 98 | if agent_type.split("_")[0] == "LLM": 99 | next_state = agent_state_adapter.adapter( 
100 | state=next_obs, 101 | legal_moves=legal_moves, 102 | episode_action_history=action_history, 103 | encode=False, 104 | ) 105 | else: 106 | next_state = agent_state_adapter.adapter( 107 | state=next_obs, 108 | legal_moves=legal_moves, 109 | episode_action_history=action_history, 110 | encode=True, 111 | ) 112 | # elsciRL trackers 113 | # TODO: Consider adding prior action history to the tracker so that we can 114 | # transform observed data across adapters without loss of information 115 | observed_states = elsciRL.observed_state_tracker( 116 | engine_observation=next_obs, 117 | language_state=agent_state_adapter.adapter( 118 | state=next_obs, 119 | legal_moves=legal_moves, 120 | episode_action_history=action_history, 121 | encode=False, 122 | ), 123 | ) 124 | 125 | episode_reward += reward 126 | if terminated: 127 | break 128 | else: 129 | state = next_state 130 | if live_env: 131 | obs = next_obs 132 | 133 | # If action limit reached 134 | if not terminated: 135 | reward = reward_signal[2] 136 | 137 | end_time = time.time() 138 | try: 139 | agent_results = agent.q_result() 140 | except: 141 | agent_results = [0, 0] 142 | 143 | if live_env: 144 | results.results_per_episode( 145 | agent_name, 146 | None, 147 | episode, 148 | action, 149 | episode_reward, 150 | (end_time - start_time), 151 | action_history, 152 | agent_results[0], 153 | agent_results[1], 154 | ) 155 | # Output GIF image of all episode frames 156 | return observed_states 157 | -------------------------------------------------------------------------------- /elsciRL/analysis/convergence_measure.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from typing import List 3 | 4 | # Define convergence evaluation function 5 | class Convergence_Measure: 6 | def __init__(self, total_num_episodes): 7 | # --- PARAMETERS --- 8 | self.conv_threshold_perc = 5 9 | self.num_prior_epi = int(total_num_episodes/10) 10 | self.num_prior_epi_points = 5 11 | self.plot_convergence_figures = False 12 | # ------------------ 13 | # Ploy display time 14 | self.display_plot_time = 10 15 | 16 | def convergence_check(self, value_list: List[float], player_side: str, visual_save_dir: str): 17 | """ CONVERGENCE CHECK METHODOLOGY 18 | - Goes through each Q value by episode and calculates the percentage change from the previous result 19 | - Because a single point can not provide accurate results, we introduce a system in which N prior output points are checked 20 | - We set the prior check points by setting a range of episodes and evenly space N points between the current episode and the fist check point defined by the range 21 | - We accept that the output has converged if ALL the prior N outputs percentage change is less than our threshold 22 | - The episode for which the output has converged in then the first check point of this providing a systematic numeric convergence evaluation 23 | """ 24 | perc_change_tracker = [] 25 | prior_change_long_term_tracker = [] 26 | conv_met_check = [] 27 | conv_met = False 28 | for n in range(0,len(value_list)): 29 | value = value_list[n] 30 | # First row fixed value 31 | if n == 0: 32 | perc_change = 100 33 | else: 34 | perc_change = abs((value - prior_row_value)/prior_row_value)*100 35 | perc_change_tracker.append(perc_change) 36 | 37 | prior_epi_points_tracker = [] 38 | if n 0 else 1 109 | agent.learn(total_steps=total_steps) 110 | reward, actions, _, render_stack = agent.test(env, render=render) 111 | end_time = time.time() 112 | 113 | if 
actions: 114 | if isinstance(actions[0], np.int64): 115 | actions = [action.item() for action in actions] 116 | elif isinstance(actions[0], np.ndarray): 117 | actions = [action.item() for action in actions] 118 | 119 | results_table.results_per_episode( 120 | agent_name, 121 | None, 122 | episode, 123 | len(actions), 124 | reward, 125 | (end_time - start_time), 126 | actions, 127 | 0, 128 | 0, 129 | ) 130 | if render and render_stack: 131 | episode_render.extend(_normalize_render_stack(render_stack)) 132 | 133 | table_results = results_table.results_table_format() 134 | if render and episode_render: 135 | ensure_dir(render_save_dir or "renders") 136 | episode_render[0].save( 137 | f"{render_save_dir or 'renders'}/{agent_name}_policy.gif", 138 | save_all=True, 139 | append_images=episode_render[1:], 140 | optimize=False, 141 | duration=200, 142 | loop=1, 143 | ) 144 | return table_results 145 | 146 | @classmethod 147 | def policy_rollout( 148 | cls, 149 | agent, 150 | env, 151 | agent_name: str, 152 | num_episodes: int, 153 | results_table, 154 | render: bool = False, 155 | render_save_dir: Optional[str] = None, 156 | action_limit: Optional[int] = None, 157 | ): 158 | return cls._run_rollout( 159 | agent=agent, 160 | env=env, 161 | agent_name=agent_name, 162 | num_episodes=num_episodes, 163 | results_table=results_table, 164 | train=False, 165 | training_action_cap=None, 166 | testing_action_cap=action_limit, 167 | render=render, 168 | render_save_dir=render_save_dir, 169 | ) 170 | -------------------------------------------------------------------------------- /elsciRL/GUI/static/app_setup.md: -------------------------------------------------------------------------------- 1 | # New elsciRL Application Setup 2 | 3 | Each application can be added to the elsciRL library to be used within the GUI interface app. 4 | 5 | ## Add Application to elsciRL 6 | 7 | Once the following functions are specified, you can add the application to the elsciRL library by first publishing it to GitHub and then referencing this in the application suite. 8 | 9 | > elsciRL > application_suite > import_data.py 10 | 11 | ![import_data](<./_images/import_data_small.png>) 12 | 13 | ## Core Requirements 14 | 15 | ### Environment 16 | Each application is defined by a unique engine that generates the data. Variations of the same problem specified by different data engines are considered different applications. 17 | 18 | Define the MDP data engine with the following functions. 
19 | 
20 | ```python
21 | class Engine:
22 |     def __init__(self, local_setup_info:dict):
23 |         # Store optional setup info
24 |         # Initialize ledger of required & optional data
25 |         # Prepare any internal environment data structures
26 |         # Initialize histories (e.g., action_history, obs_history)
27 | 
28 |     def reset(self, start_obs = None):
29 |         # Reset environment state
30 |         # Optionally accept a specified start_obs
31 |         return start_state
32 | 
33 |     def step(self, state, action):
34 |         # Apply the chosen action
35 |         # Update state and compute reward
36 |         # Determine if episode is terminated
37 |         return next_state, reward, terminated, info
38 | 
39 |     def legal_move_generator(self, state = None):
40 |         # Return a list of valid actions given the current observation
41 |         return legal_moves
42 | 
43 |     def render(self, state = None):
44 |         # Generate and optionally display a visual representation of the environment
45 |         # Return a figure object
46 |         return fig
47 | 
48 |     def close(self):
49 |         # Close active processes/handles related to the environment
50 | ```
51 | 
52 | 
53 | ### Configs
54 | There are two types of configuration files. Note that only agent parameters can be adjusted in the GUI app interface.
55 | 
56 | Environment configurations can be varied and saved as separate inputs to change in the interface.
57 | 
58 | - *agent config* is used to specify agent parameters
59 |   - fixed by the agent methodology and hierarchy architecture
60 | - *env config* is used to specify problem-specific parameters
61 |   - Any parameters that are used by the environment
62 |   - Specify the action limit
63 |   - Specify manual sub_goal positions (exact state matching)
64 |   - Specify the reward signal
65 |     - [*sub_goal_reached, per_action, incomplete*]
66 |     - *sub_goal_reached* is used for instruction completion
67 |     - *per_action* and *incomplete* are optional if not specified by the environment already
68 | 
69 | ```json
70 | // agent config (agent_config.json)
71 | { "name": "Gym-FrozenLake",
72 |   "problem_type": "Gymnasium-ToyText",
73 |   "number_training_episodes": 1000,
74 |   "number_training_repeats": 20,
75 |   "agent_select": ["Qlearntab"],
76 |   "agent_parameters": {
77 |     "Qlearntab": {
78 |       "alpha": 0.1,
79 |       "gamma": 0.9,
80 |       "epsilon": 0.2,
81 |       "epsilon_step": 0
82 |     }
83 |   } }
84 | ```
85 | 
86 | ```json
87 | // environment config (env_config.json)
88 | { "environment_size": "4x4",
89 |   "adapter_select": ["default", "language"],
90 |   "action_limit": 100,
91 |   "reward_signal": [1, 0, -0.05],
92 |   "sub_goal": "None" }
93 | ```
94 | ### Adapters
95 | Adapters unify problems into a standard form so any agent in the elsciRL library can be used.
96 | 
97 | In short, an adapter transforms the state into a new form, optionally adding more context, and then outputs a tensor.
98 | - *inputs*: state, legal moves, action history for episode
99 | - *outputs*: tensor for the encoded form of the adapted state
100 | 
101 | 
102 | ```python
103 | # numeric adapter (numeric.py)
104 | class DefaultAdapter:
105 |     def __init__(self, setup_info):
106 |         # Determine discrete environment size: e.g. "4x4" => 16 positions
107 |         # Initialize a StateEncoder for these positions
108 |         # Optionally define an observation space (e.g., Discrete) needed for Gym agents
109 | 
110 |     def adapter(self, state, legal_moves=[], episode_action_history=[], encode=True, indexed=False):
111 |         # If encode=True, convert the numeric state to a tensor (StateEncoder)
112 |         # If indexed=True, map states to integer IDs
113 | 
114 |         return tensor(state_encoded)
115 | ```
116 | 
117 | ```python
118 | # language adapter (language.py)
119 | class LanguageAdapter:
120 |     def __init__(self, setup_info):
121 |         # Build obs_mapping dictionary describing each state as text
122 |         # Initialize LanguageEncoder
123 | 
124 |     def adapter(self, state, legal_moves=[], episode_action_history=[], encode=True, indexed=False):
125 |         # Convert numeric state ID to a text description (obs_mapping)
126 |         # Optionally encode the text into a tensor (LanguageEncoder)
127 |         # Optionally map each unique description to an indexed ID
128 | 
129 |         return tensor(state_encoded)
130 | ```
131 | 
132 | ## Analysis Scripts
133 | 
134 | You can add a script file that produces problem-specific analysis for the results tab.
135 | 
136 | The form of this is the following; note the class must be called *Analysis* and each plot function must return a dict of matplotlib figures.
137 | 
138 | ```python
139 | class Analysis:
140 |     def __init__(self, save_dir):
141 |         self.save_dir = save_dir
142 | 
143 |     def plot_1(self):
144 |         """
145 |         Extract the results data from the save_dir and create problem
146 |         specific evaluation.
147 |         Return a dict of the form:
148 |             {
149 |             'plot_name_1':matplotlib.figure,
150 |             'plot_name_2':matplotlib.figure
151 |             }
152 |         """
153 |         plot_dict = {}
154 |         n = 1
155 |         for data in self.save_dir:
156 |             ...
157 |             figure = plt.figure()
158 |             ax = figure.add_subplot(1, 1, 1)
159 |             ax.scatter(data['x'],data['y'])
160 |             ...
161 | 
162 |             plot_dict['plot'+str(n)] = figure
163 |             n+=1
164 |         return plot_dict
165 | 
166 |     def plot_2(self):
167 |         """Any number of plot functions will be used."""
168 |         ...
169 |         return plot_dict
170 | ```
171 | 
172 | 
173 | 
174 | ## Prerender Data
175 | Prerender data can be used to add an image to describe the problem.
176 | 
177 | Observed states are required to complete the unsupervised instruction following method.
178 | 
179 | Once you have added the application to the *import_data.py* library you can use the elsciRL *get_prerender_data* tool to extract the observed states data.
180 | 
181 | Run the following code and it will guide you through a prompt to select your application, the language adapter to use in instruction following, and the number of exploration episodes.
182 | 
183 | ```python
184 | from elsciRL import get_prerender_data
185 | get = get_prerender_data()
186 | get.run()
187 | ```
188 | 
189 | A fully random search agent is used to find as many states as possible. *observed_states.txt* will be saved in the directory where you run the code, and you can then add this to the prerender data.
--------------------------------------------------------------------------------
/elsciRL/application_suite/CACHE_README.md:
--------------------------------------------------------------------------------
1 | # Import Tool Caching Functionality
2 | 
3 | This document describes the caching functionality added to the `PullApplications` class in `import_tool.py`.
4 | 
5 | ## Overview
6 | 
7 | The import tool now automatically caches imported data to improve performance and reduce network requests. When you import applications, the tool:
8 | 
9 | 1. 
**Checks cache first**: Before downloading from GitHub, it checks if the data is already cached 10 | 2. **Saves to cache**: After successful imports, data is saved to a local cache file 11 | 3. **Tracks imports**: A log file records all import activities with commit IDs and timestamps 12 | 4. **Validates cache**: Cache is validated using commit IDs and source data hashes 13 | 14 | ## Cache Directory Structure 15 | 16 | The caching system creates a directory structure in `.cache`: 17 | 18 | ``` 19 | .cache/ 20 | ├── import_log.json # Import activity log 21 | ├── problem1/ # Problem-specific cache 22 | │ ├── cache_metadata.json # Cache metadata 23 | │ ├── engine/ # Engine Python files 24 | │ │ └── sailing.py 25 | │ ├── adapters/ # Adapter Python files 26 | │ │ ├── adapter1.py 27 | │ │ └── adapter2.py 28 | │ ├── experiment_configs/ # Experiment configuration files 29 | │ │ ├── config1.json 30 | │ │ └── config2.json 31 | │ ├── local_configs/ # Local configuration files 32 | │ │ ├── local_config1.json 33 | │ │ └── local_config2.json 34 | │ ├── prerender_data/ # Prerender data files 35 | │ │ ├── data1.json 36 | │ │ └── data2.json 37 | │ ├── prerender_data_encoded/ # Encoded prerender data (numpy arrays) 38 | │ │ ├── data1.npy 39 | │ │ └── data2.npy 40 | │ ├── prerender_images/ # Image files 41 | │ │ ├── image1.png 42 | │ │ └── image2.jpg 43 | │ └── instructions/ # Instruction files 44 | │ ├── instruction1.json 45 | │ └── instruction2.json 46 | └── problem2/ # Another problem's cache 47 | └── ... 48 | ``` 49 | 50 | ## Key Features 51 | 52 | ### Automatic Caching 53 | ```python 54 | from elsciRL.application_suite.import_tool import PullApplications 55 | 56 | puller = PullApplications() 57 | result = puller.pull(['sailing']) # Automatically uses cache if available 58 | ``` 59 | 60 | ### Cache Information 61 | ```python 62 | # Get information about cached data 63 | cache_info = puller.get_cache_info() 64 | print(cache_info) 65 | ``` 66 | 67 | ### Import History 68 | ```python 69 | # Get latest import information for a problem 70 | latest_info = puller.get_latest_import_info('sailing') 71 | print(latest_info) 72 | ``` 73 | 74 | ### Force Refresh 75 | ```python 76 | # Force refresh (ignores cache) 77 | result = puller.force_refresh(['sailing']) 78 | ``` 79 | 80 | ### Cache Management 81 | ```python 82 | # Clear cache for specific problem 83 | puller.clear_cache('sailing') 84 | 85 | # Clear all cache 86 | puller.clear_cache() 87 | ``` 88 | 89 | ### Main Branch Status Check 90 | ```python 91 | # Check if main branch has been updated 92 | status = puller.check_main_branch_status('sailing') 93 | if status: 94 | print(f"Needs update: {status['needs_update']}") 95 | print(f"Current main date: {status['current_main_date']}") 96 | print(f"Cached main date: {status['cached_main_date']}") 97 | ``` 98 | 99 | ### Automatic Main Branch Updates 100 | When importing with `commit_id='main'`, the system automatically: 101 | 1. Checks if the main branch has been updated since last cache 102 | 2. If updated, pulls fresh data and caches it 103 | 3. If unchanged, uses cached data 104 | 4. Logs all activities with timestamps and commit IDs 105 | 106 | ```python 107 | # This will automatically check for updates and pull fresh data if needed 108 | result = puller.pull(['sailing']) # commit_id='main' in config 109 | ``` 110 | 111 | ## Cache Validation 112 | 113 | The cache is validated using: 114 | 1. **Commit ID**: Ensures the cached data matches the requested commit 115 | 2. 
**Source Hash**: Detects changes in source configuration files 116 | 3. **Timestamp**: Records when the data was cached 117 | 4. **Main Branch Date Check**: For 'main' branch, checks if the main branch has been updated since last cache 118 | 119 | ## Log File Structure 120 | 121 | The import log (`import_log.json`) contains entries like: 122 | ```json 123 | { 124 | "sailing": [ 125 | { 126 | "timestamp": "2024-01-15T10:30:00.123456", 127 | "commit_id": "main", 128 | "source_hash": "abc123...", 129 | "cache_hit": false, 130 | "source_data": { 131 | "engine_folder": "environments", 132 | "engine_filename": "sailing.py", 133 | ... 134 | } 135 | } 136 | ] 137 | } 138 | ``` 139 | 140 | ## Cache Metadata 141 | 142 | Each cached problem includes metadata: 143 | ```python 144 | { 145 | "cache_metadata": { 146 | "commit_id": "main", 147 | "source_hash": "abc123...", 148 | "timestamp": "2024-01-15T10:30:00.123456", 149 | "main_branch_date": "2024-01-15T10:30:00Z", # Only for 'main' branch 150 | "main_branch_sha": "abc123def456..." # Only for 'main' branch 151 | }, 152 | "engine": , 153 | "adapters": {...}, 154 | "experiment_configs": {...}, 155 | ... 156 | } 157 | ``` 158 | 159 | ## Performance Benefits 160 | 161 | - **Faster imports**: Cached data loads instantly 162 | - **Reduced network usage**: Avoids re-downloading unchanged data 163 | - **Offline capability**: Can work with previously cached data 164 | - **Version tracking**: Know exactly which version of data you're using 165 | - **Smart main branch updates**: Only re-downloads when main branch has actually changed 166 | 167 | ## Engine and Adapter File Handling 168 | 169 | - **Python files**: Engine and adapter .py files are downloaded and cached as actual Python files 170 | - **Dynamic loading**: When loading from cache, Python files are dynamically imported 171 | - **Path management**: Cache directories are temporarily added to Python path for import 172 | - **Error handling**: Graceful fallback if cached Python files can't be loaded 173 | - **Version consistency**: Ensures cached Python files match the commit version 174 | 175 | ## Error Handling 176 | 177 | The caching system includes robust error handling: 178 | - Graceful fallback if cache files are corrupted 179 | - Automatic cache directory creation 180 | - Detailed logging of cache operations 181 | - Safe cache validation 182 | 183 | ## Example Usage 184 | 185 | See `cache_example.py` for a complete demonstration of the caching functionality. 186 | 187 | ## File Locations 188 | 189 | - Cache directory: `./.cache/` 190 | - Log file: `./.cache/import_log.json` 191 | - Problem cache: `./.cache/problem_name/` 192 | - Engine files: `./.cache/problem_name/engine/` 193 | - Adapter files: `./.cache/problem_name/adapters/` 194 | - Metadata file: `./.cache/problem_name/cache_metadata.json` 195 | 196 | The cache directory structure is automatically created when the `PullApplications` class is initialized. 
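
## Validation Sketch (Illustrative)

A minimal sketch of how the commit-ID plus source-hash validation described above could be implemented (illustrative only; the function and field names below are assumptions, not the actual `PullApplications` internals):

```python
import hashlib
import json


def source_hash(source_data: dict) -> str:
    """Hash the source configuration so any change invalidates the cache."""
    canonical = json.dumps(source_data, sort_keys=True).encode("utf-8")
    return hashlib.sha256(canonical).hexdigest()


def cache_is_valid(metadata: dict, requested_commit: str, source_data: dict) -> bool:
    """Cache hit only if both the commit ID and the source hash still match."""
    return (
        metadata.get("commit_id") == requested_commit
        and metadata.get("source_hash") == source_hash(source_data)
    )
```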
--------------------------------------------------------------------------------
/elsciRL/adapters/LLM_logic_generators/ollama_adapter_generator.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import ollama
3 | from elsciRL.adapters.LLM_logic_generators.adapter_prompt import adapter_prompt
4 | 
5 | # Configure logging
6 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
7 | 
8 | class OllamaAdapterGenerator:
9 |     def __init__(self, pseudocode_model: str, save_pseudocode: bool = False, pseudocode_file_path: str = None):
10 |         """
11 |         Initializes the OllamaAdapterGenerator.
12 | 
13 |         Args:
14 |             pseudocode_model: Name of the Ollama model used to generate the
15 |                               adapter pseudocode (e.g. 'llama3.2').
16 |             save_pseudocode: If True, the generated pseudocode is written to
17 |                              pseudocode_file_path.
18 |             pseudocode_file_path: Path of the file the pseudocode is saved to
19 |                                   when save_pseudocode is True.
20 |         """
21 |         logging.info("OllamaAdapterGenerator initialized.")
22 |         self.pseudocode_model = pseudocode_model
23 |         if save_pseudocode:
24 |             self.pseudocode_file_path = pseudocode_file_path
25 |         else:
26 |             self.pseudocode_file_path = None
27 | 
28 |     def _generate_pseudocode_via_ollama(self, prompt: str) -> str:
29 |         """
30 |         Calls the configured Ollama model to generate pseudocode.
31 |         """
32 |         logging.info("Generating pseudocode via Ollama LLM...")
33 | 
34 |         response = ollama.chat(
35 |             model=self.pseudocode_model, # Or another model suitable for code generation
36 |             messages=[
37 |                 {
38 |                     'role': 'user',
39 |                     'content': prompt,
40 |                 }
41 |             ],
42 |             stream=False
43 |         )
44 | 
45 |         logging.info(f"Generated pseudocode:\n{response['message']['content']}")
46 |         return response['message']['content']
47 | 
48 |     def generate_adapter_pseudocode(self, environment_states: dict, transformed_states: str) -> str:
49 |         """
50 |         Logs the environment states and the LLM-transformed states, then uses an
51 |         Ollama model to generate Python pseudocode for an adapter function that
52 |         replicates the transformation.
53 | 
54 |         Args:
55 |             environment_states: A dictionary representing states from the environment.
56 |             transformed_states: The LLM-generated states.
57 | 
58 |         Returns:
59 |             A string containing Python-like pseudocode for the adapter function.
60 |         """
61 |         logging.info(f"Generating adapter pseudocode for input text: '{transformed_states}'")
62 |         logging.info(f"Environment states: {environment_states}")
63 | 
64 | 
65 |         # 1. Prepare data for the pseudocode-generating LLM
66 |         prompt_for_pseudocode_llm = f"""
67 |         Given the following information:
68 |         1. Environment States: {environment_states}
69 |         2. Transformed Output Text: "{transformed_states}"
70 | 
71 |         Generate Python-like pseudocode for an 'adapter_function' in the form defined by {adapter_prompt}.
72 | 
73 |         The pseudocode should outline the logic rules necessary to transform the original input text
74 |         (or a similar input) into the transformed output text, considering the environment states.
75 |         The function should aim to replicate the transformation performed by the primary LLM.
76 | 
77 |         These logic rules can be defined directly by a set of functions such as:
78 |         def adapter(state, legal_moves=[], episode_action_history=[], encode=True, indexed=False):
79 |             if state[0] == 'some_value':
80 |                 return "{transformed_states[:0]}..."
# (Adjust based on logic) 81 | elif state[1] == 'some_other_value': 82 | return "{transformed_states[:0]}..." # (Adjust based on logic) 83 | else: 84 | return "some_other_transformation..." 85 | 86 | Or a lookup dictionary or table such as: 87 | obs_mapping = {{ 88 | 'some_value': 'some_other_value', 89 | 'some_other_value': 'some_other_other_value', 90 | 'some_other_other_value': 'some_other_other_other_value', 91 | }} 92 | def adapter(state, legal_moves=[], episode_action_history=[], encode=True, indexed=False): 93 | return obs_mapping[state] 94 | 95 | The logic rules can use the current state, legal moves, and action history to determine the output. 96 | 97 | Please provide only the Python pseudocode for adapter_function. 98 | """ 99 | logging.info("Constructed prompt for pseudocode generation LLM.") 100 | 101 | # 2. Pass data to another LLM to create Python pseudocode 102 | pseudocode = self._generate_pseudocode_via_ollama(prompt_for_pseudocode_llm) 103 | 104 | logging.info("Successfully generated adapter pseudocode.") 105 | if self.pseudocode_file_path: 106 | with open(self.pseudocode_file_path, 'w') as f: 107 | f.write(pseudocode) 108 | logging.info(f"Pseudocode saved to {self.pseudocode_file_path}") 109 | return pseudocode 110 | 111 | if __name__ == '__main__': 112 | # Example Usage 113 | # Initialize the generator 114 | adapter_gen = OllamaAdapterGenerator(pseudocode_model='llama3.2', save_pseudocode=True, pseudocode_file_path='./pseudocode_sample.py') 115 | 116 | # Example data 117 | sample_env_states = {'Location': 'London', 118 | 'Day': 'Monday', 119 | 'Time': 'Morning', 120 | 'Weather':{ 121 | "cloud_cover": "low", 122 | "temperature": "70 degrees", 123 | "humidity": "20%", 124 | "wind_speed": "10 mph", 125 | "wind_direction": "N" 126 | }, 127 | 'Location': 'London', 128 | 'Day': 'Monday', 129 | 'Time': 'Afternoon', 130 | 'Weather':{ 131 | "cloud_cover": "moderate", 132 | "temperature": "85 degrees", 133 | "humidity": "40%", 134 | "wind_speed": "15 mph", 135 | "wind_direction": "SW" 136 | }, 137 | 138 | } 139 | sample_output = ["The weather on Monday morning in London is sunny and dry, the temperature is 70 degrees and low humidity and a light breeze.", 140 | "The weather on Monday afternoon in London is cloudy, the temperature is 85 degrees and moderate humidity and a moderate breeze from the south-west."] 141 | 142 | # Generate pseudocode 143 | generated_code = adapter_gen.generate_adapter_pseudocode(sample_env_states, sample_output) 144 | 145 | -------------------------------------------------------------------------------- /elsciRL/environment_setup/elsciRL_info.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | import numpy.typing as npt 4 | import random 5 | 6 | 7 | class elsciRLInfo: 8 | def __init__( 9 | self, 10 | observed_states: dict | None = None, 11 | experience_sampling: dict | None = None, 12 | # tensor_index: dict | None = None, 13 | ) -> None: 14 | if not experience_sampling: 15 | self.experience_sampling = {} 16 | else: 17 | self.experience_sampling = experience_sampling 18 | 19 | if not observed_states: 20 | self.observed_states = {} 21 | else: 22 | self.observed_states = observed_states 23 | 24 | def observed_state_tracker( 25 | self, 26 | engine_observation: Tensor | npt.ArrayLike | list | None = None, 27 | language_state: Tensor | npt.ArrayLike | list | None = None, 28 | ): 29 | """Tracks adapted form of state from engine observation for unsupervised approaches.""" 30 | if 
engine_observation not in self.observed_states: 31 | self.observed_states[engine_observation] = language_state 32 | return self.observed_states 33 | 34 | def experience_sampling_add( 35 | self, 36 | engine_observation: Tensor | npt.ArrayLike | list | None = None, 37 | action: int | str | bool | None = None, 38 | next_observation: Tensor | npt.ArrayLike | list | None = None, 39 | reward: float = 0, 40 | terminated: bool = False, 41 | ): 42 | """Adds experience from interaction with the Live environment to sample from.""" 43 | # -------------------------------------------------------------------------- 44 | # Required if input observation is tensor as this cant be used for dict keys 45 | # - create tuple store transitions 46 | if type(engine_observation) is Tensor: 47 | engine_observation = tuple(engine_observation.cpu().numpy().flatten()) 48 | if type(next_observation) is Tensor: 49 | next_observation = tuple(next_observation.cpu().numpy().flatten()) 50 | # -------------------------------------------------------------------------- 51 | # Get occurrence of current observation+action 52 | if engine_observation not in self.experience_sampling: 53 | self.experience_sampling[engine_observation] = {} 54 | if action not in self.experience_sampling[engine_observation]: 55 | self.experience_sampling[engine_observation][action] = {} 56 | self.experience_sampling[engine_observation][action]["obs_a_count"] = 1 57 | obs_a_count = ( 58 | self.experience_sampling[engine_observation][action]["obs_a_count"] + 1 59 | ) 60 | self.experience_sampling[engine_observation][action][ 61 | "obs_a_count" 62 | ] = obs_a_count 63 | 64 | # Get occurrence of next obs given obs+action 65 | # - Compute prob, reward is static and set on first occurrence 66 | if next_observation in self.experience_sampling[engine_observation][action]: 67 | next_obs_count = ( 68 | self.experience_sampling[engine_observation][action][next_observation][ 69 | "next_obs_count" 70 | ] 71 | + 1 72 | ) 73 | prob = next_obs_count / obs_a_count 74 | self.experience_sampling[engine_observation][action][next_observation][ 75 | "next_obs_count" 76 | ] = next_obs_count 77 | self.experience_sampling[engine_observation][action][next_observation][ 78 | "prob" 79 | ] = prob 80 | else: 81 | self.experience_sampling[engine_observation][action][next_observation] = {} 82 | self.experience_sampling[engine_observation][action][next_observation][ 83 | "next_obs_count" 84 | ] = 1 85 | self.experience_sampling[engine_observation][action][next_observation][ 86 | "prob" 87 | ] = (1 / obs_a_count) 88 | self.experience_sampling[engine_observation][action][next_observation][ 89 | "reward" 90 | ] = reward 91 | self.experience_sampling[engine_observation][action][next_observation][ 92 | "terminated" 93 | ] = terminated 94 | 95 | def experience_sampling_legal_actions( 96 | self, engine_observation: Tensor | npt.ArrayLike | list | None = None 97 | ): 98 | """Returns a list of known actions from the experience.""" 99 | if type(engine_observation) is Tensor: 100 | engine_observation = tuple(engine_observation.cpu().numpy().flatten()) 101 | # state_tuple = tuple(engine_observation) 102 | # engine_observation = self.tensor_index.index(state_tuple) 103 | if engine_observation in self.experience_sampling: 104 | legal_actions = list(self.experience_sampling[engine_observation].keys()) 105 | else: 106 | legal_actions = None 107 | return legal_actions 108 | 109 | def experience_sampling_step( 110 | self, 111 | engine_observation: Tensor | npt.ArrayLike | list | None = None, 112 | 
action: int | str | bool | None = None, 113 | ): 114 | """Outcome of action given current observation from sampled experience.""" 115 | # If state-action has not been seen from live system 116 | engine_observation_shape = None 117 | next_obs = None 118 | if type(engine_observation) is Tensor: 119 | engine_observation_shape = engine_observation.shape 120 | engine_observation = tuple(engine_observation.cpu().numpy().flatten()) 121 | 122 | if action not in self.experience_sampling[engine_observation]: 123 | next_obs = engine_observation 124 | reward = 0 125 | terminated = False 126 | # Select action from distribution of probabilities 127 | else: 128 | cumulative = 0 129 | rng = random.random() 130 | for next_obs in self.experience_sampling[engine_observation][action]: 131 | # first key is just the count of obs+action so skip over this 132 | if next_obs != "obs_a_count": 133 | if ( 134 | self.experience_sampling[engine_observation][action][next_obs][ 135 | "prob" 136 | ] 137 | <= rng 138 | ): 139 | break 140 | else: 141 | cumulative += self.experience_sampling[engine_observation][ 142 | action 143 | ][next_obs]["prob"] 144 | 145 | reward = self.experience_sampling[engine_observation][action][next_obs][ 146 | "reward" 147 | ] 148 | terminated = self.experience_sampling[engine_observation][action][next_obs][ 149 | "terminated" 150 | ] 151 | 152 | # -------------------------------------------------------------------------- 153 | # Converts stored obs back from int to tensor to match env 154 | if (type(next_obs) is tuple) and ( 155 | engine_observation_shape is not None 156 | ): # Phil: had to fix AND if statements by separating fully 157 | next_obs = torch.tensor(next_obs).reshape(engine_observation_shape) 158 | # -------------------------------------------------------------------------- 159 | return next_obs, reward, terminated 160 | -------------------------------------------------------------------------------- /elsciRL/agents/DQN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import numpy as np 5 | from collections import deque 6 | import random 7 | import pickle 8 | from typing import List, Tuple, Dict, Any, Hashable, Iterable 9 | 10 | from elsciRL.agents.agent_abstract import QLearningAgent 11 | 12 | class DQN(nn.Module): 13 | def __init__(self, input_size: int, output_size: int, hidden_size: int = 128): 14 | super(DQN, self).__init__() 15 | self.network = nn.Sequential( 16 | nn.Linear(input_size, hidden_size), 17 | nn.ReLU(), 18 | nn.Linear(hidden_size, hidden_size), 19 | nn.ReLU(), 20 | nn.Linear(hidden_size, output_size) 21 | ) 22 | 23 | def forward(self, x: torch.Tensor) -> torch.Tensor: 24 | return self.network(x) 25 | 26 | class DQNAgent(QLearningAgent): 27 | def __init__(self, 28 | input_size: int, 29 | output_size: int, 30 | hidden_size: int = None, 31 | learning_rate: float = 0.001, 32 | gamma: float = 0.99, 33 | epsilon: float = 1.0, 34 | epsilon_min: float = 0.01, 35 | epsilon_decay: float = 0.995, 36 | memory_size: int = 10000, 37 | batch_size: int = 64, 38 | target_update: int = 10, 39 | device: str = None, 40 | ): 41 | 42 | self.input_size = input_size 43 | self.output_size = output_size 44 | 45 | 46 | self.memory = deque(maxlen=memory_size) 47 | self.batch_size = batch_size 48 | self.gamma = gamma 49 | self.epsilon_reset = epsilon 50 | self.epsilon = epsilon 51 | self.epsilon_min = epsilon_min 52 | self.epsilon_decay = epsilon_decay 53 | self.target_update = 
target_update 54 | self.update_counter = 0 55 | 56 | # Create main and target networks with optional device specification 57 | if device is None: 58 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 59 | else: 60 | self.device = torch.device(device) 61 | 62 | self.policy_net = DQN(input_size, output_size, hidden_size).to(self.device) 63 | print(f"DQN Agent initialized on device: {self.device}") 64 | print(self.policy_net) 65 | self.target_net = DQN(input_size, output_size, hidden_size).to(self.device) 66 | self.target_net.load_state_dict(self.policy_net.state_dict()) 67 | 68 | self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate) 69 | self.criterion = nn.MSELoss() 70 | 71 | def save(self) -> List[Dict]: 72 | """Save the agent's state""" 73 | return [ 74 | self.policy_net.state_dict(), 75 | self.target_net.state_dict(), 76 | self.optimizer.state_dict(), 77 | { 78 | 'epsilon': self.epsilon, 79 | 'update_counter': self.update_counter, 80 | 'memory': list(self.memory) # Convert deque to list for serialization 81 | } 82 | ] 83 | 84 | def load(self, saved_agent: List[Dict]): 85 | """Load the agent's state""" 86 | if len(saved_agent) >= 4: 87 | self.policy_net.load_state_dict(saved_agent[0]) 88 | self.target_net.load_state_dict(saved_agent[1]) 89 | self.optimizer.load_state_dict(saved_agent[2]) 90 | state_dict = saved_agent[3] 91 | self.epsilon = state_dict['epsilon'] 92 | self.update_counter = state_dict['update_counter'] 93 | if 'memory' in state_dict: 94 | self.memory = deque(state_dict['memory'], maxlen=self.memory.maxlen) 95 | 96 | def exploration_parameter_reset(self): 97 | """Reset the exploration parameter to its initial value""" 98 | self.epsilon = self.epsilon_reset 99 | 100 | def clone(self): 101 | """Create a deep copy of the agent""" 102 | clone = pickle.loads(pickle.dumps(self)) 103 | clone.epsilon = self.epsilon_reset 104 | return clone 105 | 106 | def policy(self, state: torch.Tensor, legal_actions: list, **kwargs) -> Hashable: 107 | """Select action using epsilon-greedy policy""" 108 | if random.random() < self.epsilon: 109 | # Decay epsilon 110 | if self.epsilon > self.epsilon_min: 111 | self.epsilon *= self.epsilon_decay 112 | return random.choice(legal_actions) 113 | 114 | with torch.no_grad(): 115 | state = state.to(self.device) 116 | # Ensure state has correct shape [batch_size, input_size] 117 | if len(state.shape) == 1: 118 | state = state.unsqueeze(0) 119 | q_values = self.policy_net(state) 120 | 121 | # Mask illegal actions with large negative values 122 | mask = torch.ones_like(q_values) * float('-inf') 123 | for action in legal_actions: 124 | mask[0][action] = 0 125 | q_values = q_values + mask 126 | 127 | return q_values.argmax().item() 128 | 129 | def learn(self, state: torch.Tensor, next_state: torch.Tensor, 130 | immediate_reward: float, action: Hashable, **kwargs) -> None: 131 | """Store experience and train the network""" 132 | # Detach tensors and move to CPU to save GPU memory 133 | # This prevents keeping computational graphs in replay buffer 134 | if isinstance(state, torch.Tensor): 135 | state = state.detach().cpu() 136 | if len(state.shape) == 1: 137 | state = state.unsqueeze(0) 138 | if isinstance(next_state, torch.Tensor): 139 | next_state = next_state.detach().cpu() 140 | if len(next_state.shape) == 1: 141 | next_state = next_state.unsqueeze(0) 142 | 143 | # Store experience in replay memory (on CPU to save GPU memory) 144 | self.memory.append((state, action, next_state, immediate_reward)) 145 | 146 | # 
Train if enough samples 147 | if len(self.memory) >= self.batch_size: 148 | self._train() 149 | 150 | # Update target network periodically 151 | self.update_counter += 1 152 | if self.update_counter % self.target_update == 0: 153 | self.target_net.load_state_dict(self.policy_net.state_dict()) 154 | 155 | 156 | def _train(self): 157 | """Train the network using experience replay""" 158 | batch = random.sample(self.memory, self.batch_size) 159 | states, actions, next_states, rewards = zip(*batch) 160 | 161 | # Convert to tensors and move to GPU only for training 162 | states = torch.cat(states).to(self.device) # [batch_size, input_size] 163 | next_states = torch.cat(next_states).to(self.device) # [batch_size, input_size] 164 | actions = torch.tensor(actions, device=self.device).long() # [batch_size] 165 | rewards = torch.tensor(rewards, device=self.device).float() # [batch_size] 166 | 167 | # Clear optimizer gradients 168 | self.optimizer.zero_grad() 169 | 170 | # Get current Q values 171 | current_q_values = self.policy_net(states) # [batch_size, output_size] 172 | current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1) # [batch_size] 173 | 174 | # Get next Q values from target network 175 | with torch.no_grad(): 176 | next_q_values = self.target_net(next_states).max(1)[0][:self.batch_size] # [batch_size] 177 | 178 | # Compute target Q values (detach to prevent gradient flow) 179 | target_q_values = (rewards + (self.gamma * next_q_values)).detach() 180 | 181 | # Compute loss and update 182 | loss = self.criterion(current_q_values, target_q_values) 183 | loss.backward() 184 | self.optimizer.step() 185 | 186 | # Clear intermediate tensors from GPU memory 187 | del states, next_states, actions, rewards, current_q_values, next_q_values, target_q_values, loss 188 | torch.cuda.empty_cache() if torch.cuda.is_available() else None -------------------------------------------------------------------------------- /elsciRL/environment_setup/gym_translator.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Any 2 | import gymnasium as gym 3 | from gymnasium.envs.registration import register 4 | from gymnasium import spaces 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from matplotlib.backends.backend_agg import FigureCanvasAgg 8 | 9 | 10 | def _figure_to_rgb_array(fig): 11 | """Convert a Matplotlib figure into an RGB numpy array.""" 12 | canvas = fig.canvas 13 | if not isinstance(canvas, FigureCanvasAgg): 14 | canvas = FigureCanvasAgg(fig) 15 | canvas.draw() 16 | width, height = canvas.get_width_height() 17 | # Prefer buffer RGBA for compatibility, then drop alpha channel. 
18 | if hasattr(canvas, "buffer_rgba"): 19 | buffer = np.asarray(canvas.buffer_rgba(), dtype=np.uint8) 20 | array = np.array(buffer).reshape((height, width, 4))[..., :3] 21 | else: 22 | buffer = np.frombuffer(canvas.tostring_rgb(), dtype=np.uint8) 23 | array = buffer.reshape((height, width, 3)) 24 | plt.close(fig) 25 | return array 26 | 27 | class EngineToGym(gym.Env): 28 | def __init__(self): 29 | print("elsciRL Env transformed to Gym Env.") 30 | 31 | def load(self, Engine, engine_name:str=None, Adapter:Callable[[Any], Any]=None, setup_info:dict={}): 32 | self.engine = Engine(setup_info) 33 | self.Adapter = Adapter(setup_info=setup_info) 34 | self.reward_signal = None 35 | self.reward_signal_tracker = [] 36 | # Use name if given directly, otherwise check engine ledger 37 | if engine_name is not None: 38 | self.name = engine_name 39 | elif (self.engine.ledger['id'] != 'Unique Problem ID')&(self.engine.ledger['id'] != ''): 40 | self.name = self.engine.ledger['id'] 41 | else: 42 | print("\n WARNING: Engine name not set, using default name --> set inside ledger [id] field.") 43 | self.name = "elsciRLGymEnv-v0" 44 | 45 | # -------------------------- 46 | # Define observation and action spaces 47 | # - Observations are dictionaries with the agent's and the target's location. 48 | # - Each location is encoded as an element of {0, ..., `size`}^2, i.e. MultiDiscrete([size, size]). 49 | try: 50 | # First check if observation space is defined by the adapter 51 | self.observation_space = self.Adapter.observation_space 52 | except: 53 | # Then check if observation space is defined by the engine 54 | try: 55 | self.observation_space = self.engine.observation_space 56 | except AttributeError: 57 | # Otherwise, use default observation space 58 | print("WARNING: Observation space not defined in either adapter of engine.") 59 | 60 | # - A single dimension of N number of discrete actions 61 | self.action_space = spaces.Discrete(self.engine.ledger['action_space_size']) 62 | # -------------------------- 63 | self.render_mode = self.engine.ledger['render_data']['render_mode'] 64 | 65 | def reset(self, seed=None, options=None): 66 | observation = self.engine.reset() 67 | self.last_obs = observation 68 | self.last_info = {} 69 | obs_enc = self.Adapter.adapter( 70 | observation, 71 | self.engine.legal_move_generator(), 72 | self.engine.action_history, 73 | encode=True, 74 | ) 75 | obs_enc = self._format_observation(obs_enc) 76 | self.reward_signal_tracker = [] # Only give agent reward for first time it sees a sub-goal 77 | self.action_history = [] # Reset action history 78 | self.episode_reward = 0 79 | #self.obs_history = [] 80 | return obs_enc, {} 81 | 82 | def step(self, state=[], action=0): 83 | # Gym step function combines elsciRL Engine step and Adapter 84 | base_state = getattr(self, "last_obs", None) 85 | step_result = self.engine.step(state=base_state, action=action) 86 | if not isinstance(step_result, tuple) or len(step_result) != 4: 87 | print( 88 | "[EngineToGym] Invalid engine step output:", 89 | { 90 | "engine": type(self.engine).__name__, 91 | "adapter": type(self.Adapter).__name__, 92 | "result": step_result, 93 | }, 94 | ) 95 | raise ValueError( 96 | "Engine.step must return a tuple of (observation, reward, terminated, info). 
" 97 | f"Received: {step_result!r}" 98 | ) 99 | observation, reward, terminated, info = step_result 100 | if isinstance(action, np.int64): 101 | self.action_history.append(action.item()) 102 | else: 103 | self.action_history.append(action) 104 | # if observation not in self.obs_history: 105 | # reward += 0.05 # Give small reward to encourage exploration 106 | # self.obs_history.append(observation) 107 | if info: 108 | info['obs'] = observation 109 | else: 110 | info = {'obs': observation} 111 | 112 | # Apply custom reward signal if defined 113 | # - Defined as dict:= {obs:reward, obs:reward, ...} 114 | engine_reward_signal = getattr(self.engine, "reward_signal", None) 115 | if engine_reward_signal: 116 | if observation in engine_reward_signal: 117 | if observation not in self.reward_signal_tracker: 118 | # Only override if new reward is higher 119 | if engine_reward_signal[observation] > reward: 120 | reward = engine_reward_signal[observation] 121 | self.reward_signal_tracker.append(observation) 122 | 123 | 124 | # If a language problem then we also want processed observation 125 | # TODO: Need better method for checking if language problem 126 | if 'lang' in self.engine.ledger['type'].lower(): 127 | obs_adapted = self.Adapter.adapter(observation, self.engine.legal_move_generator(), 128 | self.engine.action_history, encode = False) 129 | info['obs_adapted'] = obs_adapted 130 | obs_enc = self.Adapter.adapter( 131 | observation, 132 | self.engine.legal_move_generator(), 133 | self.engine.action_history, 134 | encode=True, 135 | ) 136 | obs_enc = self._format_observation(obs_enc) 137 | truncated = False 138 | self.episode_reward += reward 139 | self.last_obs = observation 140 | self.last_info = info 141 | return obs_enc, reward, terminated, truncated, info 142 | 143 | def _format_observation(self, obs_enc): 144 | """Ensure adapter outputs match the declared Gym observation space.""" 145 | 146 | # Handle PyTorch tensors - move to CPU before converting to numpy 147 | if hasattr(obs_enc, "detach"): 148 | obs_enc = obs_enc.detach() 149 | if hasattr(obs_enc, "cpu"): 150 | obs_enc = obs_enc.cpu() 151 | obs_array = np.asarray(obs_enc, dtype=np.float32) 152 | 153 | if isinstance(self.observation_space, spaces.Discrete): 154 | # Convert one-hot or vector encodings to scalar indices 155 | if obs_array.ndim == 0: 156 | return np.int64(obs_array.item()) 157 | if obs_array.ndim == 1 and obs_array.size > 1: 158 | return np.int64(np.argmax(obs_array)) 159 | return np.int64(obs_array.flatten()[0]) 160 | 161 | # Default: ensure numpy array on CPU with correct dtype 162 | return obs_array 163 | 164 | def render(self): 165 | render_output = self.engine.render() 166 | if isinstance(render_output, np.ndarray): 167 | return render_output 168 | if hasattr(render_output, "canvas"): 169 | return _figure_to_rgb_array(render_output) 170 | return np.asarray(render_output) 171 | 172 | def close(self): 173 | self.engine.close() 174 | 175 | def __call__(self, *args: Any, **kwds: Any) -> Any: 176 | return self 177 | 178 | 179 | @staticmethod 180 | def GymRegistration(Engine, Adapter, setup_info:dict={}): 181 | """This provides a function for converting elsciRL engines into OpenAI Gym environments. \n 182 | elsciRL engines include a conditional action space which is not inherently supported by OpenAI Gym. \n 183 | Outputs Engine in the OpenAI Gym format with a wrapper for the elsciRL adapter. 
184 | """ 185 | # Translate Engine to OpenAI Gym class structure 186 | environment = EngineToGym() 187 | environment.load(Engine, 'Test-1', Adapter, setup_info) 188 | # Register and make the environment 189 | register(id=environment.name, entry_point=environment) 190 | gym_env = gym.make(environment.name) 191 | 192 | return gym_env 193 | -------------------------------------------------------------------------------- /elsciRL/examples/DemoExperiment.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import os 3 | # ====== elsciRL IMPORTS =============================================== 4 | # ------ EXPERIMENT ---------------------------------------------------- 5 | from elsciRL.experiments.standard import Experiment as STANDARD_RL 6 | # ------ Visual Analysis ----------------------------------------------- 7 | from elsciRL.analysis.combined_variance_visual import combined_variance_analysis_graph as COMBINED_VARIANCE_ANALYSIS_GRAPH 8 | # ---------------------------------------------------------------------- 9 | 10 | class DemoExperiment: 11 | def __init__(self): 12 | # Create output directory if it doesn't exist 13 | self.cwd = os.getcwd()+'/elsciRL-EXAMPLE-output' 14 | if not os.path.exists(self.cwd): 15 | os.mkdir(self.cwd) 16 | 17 | def help(self): 18 | help_output = """ 19 | This is a demo experiment script for elsciRL. 20 | It allows you to run a standard RL experiment on a selected problem from the elsciRL application suite. 21 | The script will guide you through the process of selecting a problem, configuring the experiment, and running it. 22 | You can also evaluate the results of the experiment after it has been run. 23 | Usage: 24 | 1. Run the script. 25 | 2. Follow the prompts to select a problem and configure the experiment. 26 | 3. The experiment will be run and the results will be saved in a directory. 27 | 4. You can evaluate the results by calling the evaluate() method. 28 | Example: 29 | >>> demo = DemoExperiment() 30 | >>> demo.run() 31 | >>> demo.evaluate() 32 | """ 33 | print(help_output) 34 | 35 | def input(self): 36 | # ----- User Input ----- 37 | # 1. Number training episodes 38 | print("Please enter the number of ... (skip to use default) ") 39 | num_train_epi = input('\t - Training episodes: ') 40 | if num_train_epi == '': 41 | num_train_epi = 1000 42 | else: 43 | num_train_epi = int(num_train_epi) 44 | 45 | # Update experiment config 46 | self.num_train_epi = num_train_epi 47 | # ---------------------- 48 | 49 | def results_save_dir(self): 50 | # Specify save dir 51 | # - Needs to be performed here in case user changes parameters and re-runs 52 | time = datetime.now().strftime("%d-%m-%Y_%H-%M") 53 | self.save_dir = self.cwd+'/test_'+time 54 | if not os.path.exists(self.save_dir): 55 | os.mkdir(self.save_dir) 56 | # --- 57 | 58 | def experiment(self, problem:str, exp_save_dir:str, num_train_epi:int=0): 59 | 60 | # --- Select local config and experiment config --- 61 | print("--- LOCAL CONFIGURATION SELECTION ---") 62 | for i, local_config in enumerate(self.pull_app_data[problem]['local_configs'].keys()): 63 | print(f"{i+1}. 
{local_config}") 64 | local_config_id = input("Please select the local config number (default 1): ") 65 | if local_config_id.isdigit() and 0 < int(local_config_id) <= len(self.pull_app_data[problem]['local_configs']): 66 | local_config_id = int(local_config_id) - 1 67 | else: 68 | local_config_id = 0 69 | LocalConfig = self.pull_app_data[problem]['local_configs'][list(self.pull_app_data[problem]['local_configs'].keys())[local_config_id]] 70 | 71 | print("\n --- EXPERIMENT CONFIGURATION SELECTION ---") 72 | for i, experiment_config in enumerate(self.pull_app_data[problem]['experiment_configs'].keys()): 73 | print(f"{i+1}. {experiment_config}") 74 | experiment_config_id = input("Please select the experiment config number (default 1): ") 75 | if experiment_config_id.isdigit() and 0 < int(experiment_config_id) <= len(self.pull_app_data[problem]['experiment_configs']): 76 | experiment_config_id = int(experiment_config_id) - 1 77 | else: 78 | experiment_config_id = 0 79 | ExperimentConfig = self.pull_app_data[problem]['experiment_configs'][list(self.pull_app_data[problem]['experiment_configs'].keys())[experiment_config_id]] 80 | 81 | if num_train_epi != 0: 82 | ExperimentConfig['number_training_episodes'] = num_train_epi 83 | if int(num_train_epi/10) > 10: 84 | ExperimentConfig['number_test_episodes'] = int(num_train_epi/10) 85 | else: 86 | ExperimentConfig['number_test_episodes'] = 10 87 | 88 | # ------------------------------------------------------ 89 | # Adapter Selection 90 | print("\n --- ADAPTER SELECTION ---") 91 | for i, adapter in enumerate(self.pull_app_data[problem]['adapters'].keys()): 92 | if not adapter.startswith('LLM'): 93 | print(f"{i+1}. {adapter}") 94 | adapter_id = input("Please select the adapter number (default 1): ") 95 | if adapter_id.isdigit() and 0 < int(adapter_id) <= len(self.pull_app_data[problem]['adapters']): 96 | adapter_id = int(adapter_id) - 1 97 | else: 98 | adapter_id = 0 99 | 100 | # -------------------------------------------------------------------- 101 | # Set the selected agent 102 | ExperimentConfig['agent_select'] = ['Qlearntab'] 103 | ExperimentConfig['adapter_select'] = [list(self.pull_app_data[problem]['adapters'].keys())[adapter_id]] 104 | ExperimentConfig['adapter_input_dict'] = {'Qlearntab': [list(self.pull_app_data[problem]['adapters'].keys())[adapter_id]]} 105 | if ExperimentConfig['number_training_repeats'] > 1: 106 | ExperimentConfig['number_training_repeats'] = 5 107 | if ExperimentConfig['number_training_seeds'] > 1: 108 | ExperimentConfig['number_training_seeds'] = 5 109 | # Flat Baselines 110 | exp = STANDARD_RL(Config=ExperimentConfig, ProblemConfig=LocalConfig, 111 | Engine=self.pull_app_data[problem]['engine'], Adapters=self.pull_app_data[problem]['adapters'], 112 | save_dir=exp_save_dir, show_figures = 'No', window_size=0.1) 113 | # -------------------------------------------------------------------- 114 | return exp 115 | 116 | def run(self): 117 | # IMPORT HERE SO ITS NOT LOADED ON STARTUP 118 | from elsciRL.application_suite.import_tool import PullApplications 119 | self.application_data = PullApplications() 120 | self.application_list:list=['Classroom', 'Gym-FrozenLake', 'Sailing'] 121 | self.pull_app_data = self.application_data.pull(problem_selection=self.application_list) 122 | print("\n --- PULLING APPLICATION DATA ---") 123 | for app in self.application_list: 124 | print("--------------------------------------------------") 125 | print(f"Application: {app}") 126 | print("Engine:", self.pull_app_data[app]['engine']) 127 | 
print("Adapters:", self.pull_app_data[app]['adapters']) 128 | print("Experiment Configs:", self.pull_app_data[app]['experiment_configs']) 129 | print("Local Configs:", self.pull_app_data[app]['local_configs']) 130 | print("-------------------------------------------------- \n ") 131 | # USER INPUTS FOR BASIC SELECTION OPTIONS 132 | # --- Problem selection --- 133 | print("\n --- PROBLEM SELECTION ---") 134 | for i, prob in enumerate(self.application_list): 135 | print(f"{i+1}. {prob}") 136 | problem_id = input("Please enter the problem number to run (default 1): ") 137 | if problem_id.isdigit() and 0 < int(problem_id) <= len(self.application_list): 138 | problem = self.application_list[int(problem_id) - 1] 139 | else: 140 | problem = self.application_list[0] 141 | 142 | # --- TRAINING EPISODES INPUT --- 143 | num_train_episodes = input("Please enter the number of training episodes (default 1000): ") 144 | if num_train_episodes == '': 145 | num_train_episodes = 1000 146 | try: 147 | num_train_episodes = int(num_train_episodes) 148 | except ValueError: 149 | print("Invalid input for number of training episodes. Using default value of 1000.") 150 | num_train_episodes = 1000 151 | # ------------------------------- 152 | 153 | self.results_save_dir() 154 | problem_save_dir = self.save_dir + '/' + problem 155 | if not os.path.exists(problem_save_dir): 156 | os.mkdir(problem_save_dir) 157 | print("\n --------------------------------------------------") 158 | print('Training and Testing on {p} environment'.format(p=problem)) 159 | print("-------------------------------------------------- \n ") 160 | exp = self.experiment(problem, problem_save_dir, num_train_epi=int(num_train_episodes)) 161 | exp.train() 162 | exp.test() 163 | # exp.render_results() 164 | 165 | def evaluate(self): 166 | COMBINED_VARIANCE_ANALYSIS_GRAPH(self.save_dir, 'TRAINING', show_figures='Yes') 167 | COMBINED_VARIANCE_ANALYSIS_GRAPH(self.save_dir, 'TESTING', show_figures='Yes') -------------------------------------------------------------------------------- /elsciRL/instruction_following/LLM_instr_planner/LLM_instr_validator.py: -------------------------------------------------------------------------------- 1 | import ollama 2 | from typing import Optional, Dict, Any 3 | import json 4 | import logging 5 | 6 | class LLMInstructionValidator: 7 | """ 8 | A class for validating if a given text matches or completes an instruction 9 | using Large Language Model reasoning via Ollama. 10 | """ 11 | 12 | def __init__(self, 13 | model: str = "llama3.2"): 14 | """ 15 | Initialize the LLM Instruction Validator. 16 | 17 | Args: 18 | model: Ollama model to use for validation (e.g., "llama3.2", "mistral", "codellama"). 19 | temperature: Temperature for LLM responses (lower = more deterministic). 20 | host: Ollama host URL. If None, uses default localhost. 21 | """ 22 | self.model = model 23 | self.logger = logging.getLogger(__name__) 24 | 25 | def validate_instruction_completion(self, 26 | instruction_description: str, 27 | best_match: str) -> Dict[str, Any]: 28 | """ 29 | Compare instruction description with best match to determine if the 30 | best match completes or fulfills the instruction. 
31 | 32 | Args: 33 | instruction_description: The original instruction or task description 34 | best_match: The text/response that potentially completes the instruction 35 | 36 | Returns: 37 | Dict containing: 38 | - 'is_complete': Boolean indicating if instruction is completed 39 | - 'confidence': Float between 0-1 indicating confidence level 40 | - 'reasoning': String explaining the LLM's reasoning 41 | - 'partial_completion': Boolean if partially completed 42 | """ 43 | 44 | # Construct the prompt for the LLM 45 | prompt = self._construct_validation_prompt(instruction_description, best_match) 46 | 47 | try: 48 | response = ollama.chat( 49 | model=self.model, 50 | messages=[ 51 | { 52 | "role": "system", 53 | "content": "You are an expert at evaluating whether responses complete given instructions. " 54 | "You must respond with valid JSON format." 55 | }, 56 | { 57 | "role": "user", 58 | "content": prompt 59 | } 60 | ], 61 | options={ 62 | "num_predict": 500 63 | } 64 | ) 65 | 66 | # Parse the LLM response 67 | result = self._parse_llm_response(response['message']['content']) 68 | 69 | self.logger.info(f"Validation completed. Is complete: {result['is_complete']}") 70 | return result 71 | 72 | except Exception as e: 73 | self.logger.error(f"Error during LLM validation: {str(e)}") 74 | return { 75 | 'is_complete': False, 76 | 'confidence': 0.0, 77 | 'reasoning': f"Error occurred during validation: {str(e)}", 78 | 'partial_completion': False 79 | } 80 | 81 | def _construct_validation_prompt(self, instruction: str, match: str) -> str: 82 | """ 83 | Construct the prompt for the LLM to evaluate instruction completion. 84 | 85 | Args: 86 | instruction: The instruction description 87 | match: The best match text 88 | 89 | Returns: 90 | Formatted prompt string 91 | """ 92 | prompt = f""" 93 | Please evaluate whether the "Best Match" text completes or fulfills the given "Instruction". 94 | You only need to confirm that the language match well and do not need to check if the best match would update the environment. 95 | The language structure from the environment is fixed and does not change, so do not expect a 'Best Match' that is better structured than what is given. 96 | You need to determine if the current 'Best Match' is likely to be the best match for the instruction given the language structure of the environment, do not expect more detail than what is given. 97 | 98 | INSTRUCTION: 99 | {instruction} 100 | 101 | BEST MATCH: 102 | {match} 103 | 104 | Analyze if the Best Match adequately completes, addresses, or fulfills the Instruction. Consider: 105 | 1. Does it directly address what was asked? 106 | 2. Is the response complete and comprehensive? 107 | 3. Does it meet the intent of the instruction? 108 | 109 | Respond ONLY with valid JSON in this exact format: 110 | {{ 111 | "is_complete": true/false, 112 | "confidence": 0.0-1.0, 113 | "reasoning": "Brief explanation of your evaluation", 114 | "partial_completion": true/false 115 | }} 116 | 117 | Your confidence should be: 118 | - 0.9-1.0: Very confident the instruction is completed 119 | - 0.7-0.8: Mostly confident but some minor gaps 120 | - 0.5-0.6: Partially completed with significant gaps 121 | - 0.0-0.4: Does not complete the instruction 122 | """ 123 | return prompt 124 | 125 | def _parse_llm_response(self, response_text: str) -> Dict[str, Any]: 126 | """ 127 | Parse the LLM response and extract validation results. 
128 | 129 | Args: 130 | response_text: Raw text response from LLM 131 | 132 | Returns: 133 | Parsed validation results 134 | """ 135 | try: 136 | # Clean and parse JSON response 137 | cleaned_response = response_text.strip() 138 | 139 | # Handle potential markdown code blocks 140 | if "```json" in cleaned_response: 141 | start = cleaned_response.find("```json") + 7 142 | end = cleaned_response.find("```", start) 143 | cleaned_response = cleaned_response[start:end].strip() 144 | elif "```" in cleaned_response: 145 | start = cleaned_response.find("```") + 3 146 | end = cleaned_response.find("```", start) 147 | cleaned_response = cleaned_response[start:end].strip() 148 | 149 | result = json.loads(cleaned_response) 150 | 151 | # Validate required fields 152 | required_fields = ['is_complete', 'confidence', 'reasoning', 'partial_completion'] 153 | for field in required_fields: 154 | if field not in result: 155 | raise ValueError(f"Missing required field: {field}") 156 | 157 | # Ensure confidence is between 0 and 1 158 | result['confidence'] = max(0.0, min(1.0, float(result['confidence']))) 159 | 160 | return result 161 | 162 | except (json.JSONDecodeError, ValueError, KeyError) as e: 163 | self.logger.error(f"Failed to parse LLM response: {str(e)}") 164 | return { 165 | 'is_complete': False, 166 | 'confidence': 0.0, 167 | 'reasoning': f"Failed to parse LLM response: {response_text[:100]}...", 168 | 'partial_completion': False 169 | } 170 | 171 | def batch_validate(self, instruction_match_pairs: list) -> list: 172 | """ 173 | Validate multiple instruction-match pairs in batch. 174 | 175 | Args: 176 | instruction_match_pairs: List of tuples (instruction, best_match) 177 | 178 | Returns: 179 | List of validation results 180 | """ 181 | results = [] 182 | for instruction, match in instruction_match_pairs: 183 | result = self.validate_instruction_completion(instruction, match) 184 | results.append(result) 185 | 186 | return results 187 | 188 | def list_available_models(self) -> list: 189 | """ 190 | List available Ollama models. 191 | 192 | Returns: 193 | List of available model names 194 | """ 195 | try: 196 | models = ollama.list() 197 | return [model['model'].split(':')[0] for model in models['models']] 198 | except Exception as e: 199 | self.logger.error(f"Error listing models: {str(e)}") 200 | return [] 201 | 202 | 203 | # Example usage and convenience function 204 | def validate_instruction_match(instruction_description: str, 205 | best_match: str, 206 | model: str = "llama3.2") -> Dict[str, Any]: 207 | """ 208 | Convenience function to quickly validate if a best match completes an instruction. 
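        Example (illustrative only; assumes Ollama is running locally and the
        requested model has been pulled):

            result = validate_instruction_match(
                "Reach the opposite bank of the river.",
                "The boat is at the opposite bank.",
                model="llama3.2",
            )
            # result is the dict produced by
            # LLMInstructionValidator.validate_instruction_completion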
209 | 210 | Args: 211 | instruction_description: The instruction to validate against 212 | best_match: The text that potentially completes the instruction 213 | model: Ollama model to use (default: "llama2") 214 | host: Ollama host URL (optional) 215 | 216 | Returns: 217 | Validation result dictionary 218 | """ 219 | validator = LLMInstructionValidator(model=model) 220 | return validator.validate_instruction_completion(instruction_description, best_match) 221 | -------------------------------------------------------------------------------- /elsciRL/examples/environments/elsciRL_sailing.py: -------------------------------------------------------------------------------- 1 | # Sailing Simulator 2 | # - https://github.com/topics/sailing-simulator 3 | # - Simple sailing simulator from https://github.com/PPierzc/ai-learns-to-sail 4 | # - https://github.com/PPierzc/ai-learns-to-sail/blob/master/tasks/channel.py 5 | import io 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from elsciRL.examples.environments.sailing_image import SailingImageData 9 | 10 | class Engine: 11 | """Defines the environment function from the generator engine. 12 | Expects the following: 13 | - reset() to reset the env a start position(s) 14 | - step() to make an action and update the game state 15 | - legal_moves_generator() to generate the list of legal moves 16 | """ 17 | def __init__(self, local_setup_info:dict={}) -> None: 18 | """Initialize Engine""" 19 | #self.Environment = "Engine Initialization" 20 | self.x_limit = 10 21 | self.y_limit = local_setup_info["y_limit"] 22 | self.angle_limit = np.pi / 2 23 | self.supervised_rewards = local_setup_info["supervised_rewards"] 24 | # Precision parameter 25 | self.obs_precision = local_setup_info["obs_precision"] 26 | 27 | # Ledger of the environment with meta information for the problem 28 | ledger_required = { 29 | 'id': 'Unique Problem ID', 30 | 'type': 'Language/Numeric', 31 | 'description': 'Problem Description', 32 | 'goal': 'Goal Description' 33 | } 34 | 35 | ledger_optional = { 36 | 'reward': 'Reward Description', 37 | 'punishment': 'Punishment Description (if any)', 38 | 'state': 'State Description', 39 | 'constraints': 'Constraints Description', 40 | 'action': 'Action Description', 41 | 'author': 'Author', 42 | 'year': 'Year', 43 | 'render_data':{'render_mode':'rgb_array', 44 | 'render_fps':4} 45 | } 46 | ledger_gym_compatibility = { 47 | # Limited to discrete actions for now, set to arbitrary large number if uncertain 48 | 'action_space_size':2, 49 | } 50 | self.ledger = ledger_required | ledger_optional | ledger_gym_compatibility 51 | # Initialize history 52 | self.action_history = [] 53 | self.obs_history = [] 54 | 55 | 56 | 57 | # -------------------------- 58 | # Defined functions used by engine source 59 | @staticmethod 60 | def vel(theta, theta_0=0, theta_dead=np.pi / 12): 61 | return 1 - np.exp(-(theta - theta_0) ** 2 / theta_dead) 62 | 63 | @staticmethod 64 | def rew(theta, theta_0=0, theta_dead=np.pi / 12): 65 | return Engine.vel(theta, theta_0, theta_dead) * np.cos(theta) 66 | # -------------------------- 67 | 68 | def reset(self, start_obs:str=None, render_dir:str=None): 69 | """Fully reset the environment.""" 70 | # Allow reset to be at fixed start position or random 71 | if start_obs: 72 | self.x = np.round(float(start_obs.split('_')[0]),self.obs_precision) 73 | self.angle = np.round(float(start_obs.split('_')[1]),1) 74 | else: 75 | self.x = 0 #np.round(np.random.randint(-9.9, 9.9),4) # Changed to rand_int to reduce num of start states 
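        # Note (illustrative): the observation returned below is the string
        # "<x>_<angle>", with x rounded to `obs_precision` decimal places and
        # the angle to 1 decimal place. For example, with obs_precision=2 the
        # default start state (x=0, angle=0) is encoded as "0.00_0.0".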
76 | self.angle = 0 # always start with angle 0 77 | self.y = 0 78 | obs = "{n:.{d}f}".format(n=self.x, d=self.obs_precision)+'_'+"{:0.1f}".format(self.angle) 79 | 80 | if render_dir: 81 | 82 | # SHOW PRETTY IMAGE OF PROBLEM 83 | raw_image = SailingImageData['data'].split(",") 84 | 85 | width = 240 86 | height = 300 87 | 88 | full_array = [] 89 | column_counter = 0 90 | row = [] 91 | pixel_counter = 0 92 | pixel_list = [] 93 | for input_item in raw_image: 94 | pixel_item = int(input_item.replace(" ","")) 95 | if pixel_counter == 3: 96 | # new pixel and reset pixel counter 97 | pixel_counter = 0 98 | pixel_list = [] 99 | # Add 3-d pixel to row 100 | if column_counter == width: 101 | # Add row to full array 102 | full_array.append(row) 103 | # new row and reset column counter 104 | column_counter = 0 105 | row = [] 106 | 107 | row.append(pixel_list) 108 | column_counter+=1 109 | 110 | pixel_list.append(pixel_item) 111 | pixel_counter+=1 112 | 113 | render = np.array(full_array) 114 | plt.imshow(render, interpolation='nearest') 115 | plt.axis('off') 116 | plt.title("Sailing Simulation \n Simple River with Fixed Wind Direction") 117 | plt.show() 118 | plt.pause(5) 119 | plt.savefig(render_dir,bbox_inches='tight') 120 | plt.close() 121 | 122 | return obs 123 | 124 | 125 | def step(self, state:any=None, action:any=None): 126 | """Enact an action.""" 127 | self.action_history.append(action) 128 | a = [-0.1, 0.1][action] 129 | # Observation space 130 | self.x += np.round((Engine.vel(self.angle + a) * np.sin(self.angle + a)),self.obs_precision) # Round x to Ndp 131 | self.y += np.round((Engine.vel(self.angle + a) * np.cos(self.angle + a)),4) # Round y to 4dp 132 | self.angle = np.round(self.angle+a,1) 133 | #obs = str(self.x)+'_'+str(self.angle) 134 | obs = "{n:.{d}f}".format(n=self.x, d=self.obs_precision)+'_'+"{:0.1f}".format(self.angle) # fix - https://docs.python.org/3.4/library/string.html#format-specification-mini-language 135 | self.obs_history.append(obs) 136 | # Reward signal 137 | # - Added flag for whether we give agent immediate positive reward 138 | # - Update: Added scale factor if using supervised rewards to not override goal rewards 139 | if self.supervised_rewards=="True": 140 | reward = Engine.rew(self.angle)/10 141 | else: 142 | reward = 0 143 | 144 | # Termination signal 145 | # - Source: Terminal only on hitting piers/walls, otherwise continues to action limit 146 | # - Update: Add terminal state if y > 25 (or another arbitrary value) 147 | # - Update: Limit angle to [-90,90] degrees (i.e. 
no backwards sailing) 148 | if np.abs(self.x)>self.x_limit: 149 | reward = -1 150 | terminated = True 151 | elif np.abs(self.y)>self.y_limit: 152 | reward = 1 153 | terminated = True 154 | elif np.abs(self.y)<0: 155 | reward = -1 156 | terminated = True 157 | elif np.abs(self.angle)>self.angle_limit: 158 | #print("\n \t - Angle limit reached") 159 | reward = -1 160 | terminated = True 161 | else: 162 | terminated = False 163 | 164 | return obs, reward, terminated, {} 165 | 166 | def legal_move_generator(self, obs:any=None): 167 | """Define legal moves at each position""" 168 | # Action space: [0,1] for turn slightly left or right 169 | # - Kept as binary but might be better as continuous [-0.1, 0.1] 170 | legal_moves = [0, 1] 171 | return legal_moves 172 | 173 | def render(self, state:any=None): 174 | """Render the environment.""" 175 | #render = print("Current State: ", state, " | Action History: ", self.action_history) 176 | # state = x_angle 177 | x = self.x 178 | y = self.y 179 | angle = self.angle 180 | # Angle is bearing into wind -pi/2 < angle < pi/2 181 | if angle < np.pi/2: 182 | U = np.sin(angle) 183 | V = np.cos(angle) 184 | elif angle == np.pi/2: 185 | U = 1 186 | V = 0 187 | elif angle == -np.pi/2: 188 | U = -1 189 | V = 0 190 | else: 191 | U = np.sin(angle) 192 | V = -np.cos(angle) 193 | 194 | DPI = 128 195 | fig, ax = plt.subplots(figsize=(5,5), dpi = DPI) 196 | ax.scatter(x,y,c='b',marker='x',alpha=1) 197 | ax.quiver(x,y,U,V,angles='uv',scale_units='xy') 198 | if y > 1: 199 | ax.text(x+0.5,y-1,'Sailboat',color='b') 200 | 201 | # Draw wind direction 202 | ax.quiver(0,25,0,-1,angles='uv',scale_units='xy',color='r') 203 | ax.text(0,25.25,'Wind',color='r') 204 | 205 | 206 | ax.plot([10,10],[0,25],'r') 207 | ax.plot([-10,-10],[0,25],'r') 208 | ax.set_title("Sailboat Position with Direction against Wind") 209 | ax.set_xlabel("Horizontal Position (x)") 210 | ax.set_ylabel("Vertical Position (y)") 211 | # Save as rgba array 212 | # https://stackoverflow.com/questions/7821518/save-plot-to-numpy-array 213 | 214 | 215 | fig.canvas.draw() 216 | # data = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8) 217 | # render = data.reshape(fig.canvas.get_width_height()[::-1] + (4,)) 218 | 219 | buf = fig.canvas.buffer_rgba() 220 | data = np.asarray(buf) 221 | render = data.reshape(fig.canvas.get_width_height()[::-1] + (4,)) 222 | return render 223 | 224 | def close(self): 225 | """Close/Exit the environment.""" 226 | print("Environment Closed") -------------------------------------------------------------------------------- /elsciRL/interaction_loops/standard_gym.py: -------------------------------------------------------------------------------- 1 | # TODO: Simplify and remove sub-goals/elsciRL tracking/live_env/exp sampling 2 | import time 3 | import numpy as np 4 | from PIL import Image 5 | from tqdm import tqdm 6 | from gymnasium.wrappers import TimeLimit 7 | # ------ Imports ----------------------------------------- 8 | # Agent Setup 9 | from elsciRL.environment_setup.imports import ImportHelper 10 | # Evaluation standards 11 | from elsciRL.environment_setup.results_table import ResultsTable 12 | from elsciRL.environment_setup.elsciRL_info import elsciRLInfo 13 | # Non-gym interaction loop setup 14 | from elsciRL.interaction_loops.standard import StandardInteractionLoop 15 | from elsciRL.experiments.experiment_utils.config_utils import ensure_dir 16 | 17 | 18 | def _apply_action_limit(env, max_steps: int | None): 19 | """Wrap env with a TimeLimit so runaway episodes truncate after 
max_steps.""" 20 | 21 | if not max_steps or max_steps <= 0: 22 | return env 23 | if isinstance(env, TimeLimit): 24 | env._max_episode_steps = min(env._max_episode_steps, max_steps) 25 | return env 26 | try: 27 | return TimeLimit(env, max_episode_steps=max_steps) 28 | except Exception: 29 | # Fall back to manual attribute hints if wrapper fails (non-gym envs) 30 | setattr(env, "_elsci_max_episode_steps", max_steps) 31 | return env 32 | 33 | def _normalize_render_stack(render_stack): 34 | """Convert renderer outputs to PIL Images so GIF saving works consistently.""" 35 | 36 | normalized = [] 37 | for frame in render_stack or []: 38 | if frame is None: 39 | continue 40 | if hasattr(frame, "save"): 41 | normalized.append(frame) 42 | elif isinstance(frame, np.ndarray): 43 | normalized.append(Image.fromarray(frame.astype(np.uint8))) 44 | return normalized 45 | 46 | 47 | class GymInteractionLoop: 48 | """Interaction Loop for standard environments. 49 | REQUIRES: 50 | - Engine: Environment engine defined with elsciRLAI format 51 | - Adapters: Dictionary of local adapters with unique names: {"name_1": Adapter_1, "name_2": Adapter_2,...} 52 | - local_setup_info: Dictionary of local setup info (i.e. local config file) 53 | """ 54 | def __init__(self, Engine, Adapters:dict, local_setup_info: dict): 55 | # Define agent type for interaction process, call alternative if not gym agent 56 | if local_setup_info['agent_type'].split('_')[0] == "SB3": 57 | self.gym_agent = True 58 | Imports = ImportHelper(local_setup_info) 59 | self.agent, self.agent_type, self.agent_name, self.agent_state_adapter = Imports.agent_info(Adapters) 60 | self.num_train_episodes, self.num_test_episodes, self.training_action_cap, self.testing_action_cap, self.reward_signal = Imports.parameter_info() 61 | self.train = Imports.training_flag() 62 | # --- INIT env from engine 63 | self.env = Engine(local_setup_info) 64 | max_steps = self.training_action_cap if self.train else self.testing_action_cap 65 | self.env = _apply_action_limit(self.env, max_steps) 66 | self.start_obs = self.env.reset() 67 | # --- 68 | # --- PRESET elsciRL INFO 69 | # Agent 70 | # Training or testing phase flag 71 | # --- elsciRL 72 | self.live_env, self.observed_states, self.experience_sampling = Imports.live_env_flag() 73 | # Results formatting 74 | self.results = ResultsTable(local_setup_info) 75 | # elsciRL input function 76 | # - We only want to init trackers on first batch otherwise it resets knowledge 77 | self.elsciRL = elsciRLInfo(self.observed_states, self.experience_sampling) 78 | else: 79 | # --- Used for initialisation default interaction loop as alternative 80 | self.gym_agent = False 81 | self.interaction = StandardInteractionLoop(Engine, Adapters, local_setup_info) 82 | self.start_obs = self.interaction.start_obs 83 | self.results = ResultsTable(local_setup_info) 84 | 85 | def episode_loop(self, render:bool=False, render_save_dir:str=None): 86 | if self.gym_agent: 87 | # Mode selection (already initialized) 88 | if self.train: 89 | number_episodes = self.num_train_episodes 90 | else: 91 | number_episodes = self.num_test_episodes 92 | 93 | episode_render = [] 94 | print("\n Episode Interaction Loop: ") 95 | if self.train: 96 | for episode in tqdm(range(0, number_episodes)): 97 | start_time = time.time() 98 | # Can force the agent to train on a single episode 99 | # Very time consuming to do this 100 | self.agent.learn(total_steps=self.training_action_cap) 101 | end_time = time.time() 102 | reward, actions, states, render_stack = 
self.agent.test(self.env, render=render) 103 | episode_render.append(render_stack) 104 | # Need to get values from actions 105 | # TODO: Ensure all agents output int directly to solve this 106 | if isinstance(actions[0], np.int64): 107 | actions = [action.item() for action in actions] 108 | elif isinstance(actions[0], np.ndarray): 109 | actions = [action.item() for action in actions] 110 | 111 | 112 | 113 | self.results.results_per_episode(self.agent_name, None, episode, len(actions), 114 | reward, (end_time-start_time), actions, 0, 0) 115 | else: 116 | for episode in tqdm(range(0, number_episodes)): 117 | start_time = time.time() 118 | # Evaluate fixed policy on single episode 119 | reward, actions, states, render_stack = self.agent.test(self.env, render=render) 120 | # Need to get values from actions 121 | # TODO: Ensure all agents output int directly to solve this 122 | if isinstance(actions[0], np.int64): 123 | actions = [action.item() for action in actions] 124 | elif isinstance(actions[0], np.ndarray): 125 | actions = [action.item() for action in actions] 126 | 127 | episode_render.append(render_stack) 128 | end_time = time.time() 129 | self.results.results_per_episode(self.agent_name, None, episode, len(actions), 130 | reward, (end_time-start_time), actions, 0, 0) 131 | table_results = self.results.results_table_format() 132 | # Output GIF image of all episode frames 133 | if render and render_stack: 134 | frames = _normalize_render_stack(render_stack) 135 | if frames: 136 | frames[0].save( 137 | render_save_dir + '/render.gif', 138 | save_all=True, 139 | append_images=frames[1:], 140 | optimize=False, 141 | duration=200, 142 | loop=1, 143 | ) 144 | else: 145 | table_results = self.interaction.episode_loop() 146 | self.agent = self.interaction.agent 147 | self.results = self.interaction.results 148 | self.elsciRL = self.interaction.elsciRL 149 | 150 | return table_results 151 | 152 | @staticmethod 153 | def policy_rollout( 154 | agent, 155 | env, 156 | agent_name: str, 157 | num_episodes: int, 158 | results_table, 159 | render: bool = False, 160 | render_save_dir: str | None = None, 161 | action_limit: int | None = None, 162 | ): 163 | """Execute a pre-configured policy-gradient agent on a Gym env and log results.""" 164 | env = _apply_action_limit(env, action_limit) 165 | episode_render = [] 166 | for episode in range(num_episodes): 167 | start_time = time.time() 168 | reward, actions, states, render_stack = agent.test(env, render=render) 169 | end_time = time.time() 170 | if actions: 171 | if isinstance(actions[0], np.int64): 172 | actions = [action.item() for action in actions] 173 | elif isinstance(actions[0], np.ndarray): 174 | actions = [action.item() for action in actions] 175 | results_table.results_per_episode( 176 | agent_name, 177 | None, 178 | episode, 179 | len(actions), 180 | reward, 181 | (end_time - start_time), 182 | actions, 183 | 0, 184 | 0, 185 | ) 186 | if render and render_stack: 187 | episode_render.extend(_normalize_render_stack(render_stack)) 188 | table_results = results_table.results_table_format() 189 | if render and episode_render: 190 | ensure_dir(render_save_dir or "renders") 191 | episode_render[0].save( 192 | f"{render_save_dir or 'renders'}/{agent_name}_policy.gif", 193 | save_all=True, 194 | append_images=episode_render[1:], 195 | optimize=False, 196 | duration=200, 197 | loop=1, 198 | ) 199 | return table_results --------------------------------------------------------------------------------
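As a quick orientation for the `policy_rollout` helper above, the sketch below runs a throwaway random-action agent on a Gymnasium task. It is illustrative only: `DummyRolloutAgent`, the empty `local_setup_info` dict, and the `CartPole-v1` choice are placeholders, not part of elsciRL; the real agents live under `elsciRL/agents`, and the keys expected by `ResultsTable` come from a local config such as those in `elsciRL/examples/local_configs`.

```python
import gymnasium as gym
from elsciRL.environment_setup.results_table import ResultsTable
from elsciRL.interaction_loops.standard_gym import GymInteractionLoop


class DummyRolloutAgent:
    """Placeholder agent exposing the test(env, render=...) interface policy_rollout expects."""

    def test(self, env, render: bool = False):
        obs, _ = env.reset()
        total_reward, actions, states, frames = 0.0, [], [], []
        terminated = truncated = False
        while not (terminated or truncated):
            action = env.action_space.sample()        # random policy stand-in
            obs, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            actions.append(action)
            states.append(obs)
        return total_reward, actions, states, frames  # frames left empty since render is unused here


local_setup_info = {}  # placeholder: populate with the keys your local config / ResultsTable requires
results = ResultsTable(local_setup_info)
table = GymInteractionLoop.policy_rollout(
    agent=DummyRolloutAgent(),
    env=gym.make("CartPole-v1"),
    agent_name="dummy_random",
    num_episodes=5,
    results_table=results,
    render=False,
    action_limit=200,  # wrapped via _apply_action_limit -> TimeLimit
)
print(table)
```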