├── src
│   ├── utils
│   │   ├── __init__.py
│   │   ├── generic_typevars.py
│   │   ├── beta_distribution.py
│   │   ├── gen_utils.py
│   │   └── standard_typevars.py
│   ├── algorithms
│   │   ├── __init__.py
│   │   ├── adp
│   │   │   └── __init__.py
│   │   ├── dp
│   │   │   ├── __init__.py
│   │   │   ├── dp_analytic.py
│   │   │   ├── dp_numeric.py
│   │   │   └── dp_base.py
│   │   ├── mab
│   │   │   ├── __init__.py
│   │   │   ├── mab_graphs_gen.py
│   │   │   ├── ts_bernoulli.py
│   │   │   ├── mab_base.py
│   │   │   ├── ucb1.py
│   │   │   ├── gradient_bandits.py
│   │   │   ├── epsilon_greedy.py
│   │   │   ├── ts_gaussian.py
│   │   │   └── plot_mab_graphs.py
│   │   ├── rl_tabular
│   │   │   ├── __init__.py
│   │   │   ├── rl_tabular_base.py
│   │   │   ├── td0.py
│   │   │   ├── monte_carlo.py
│   │   │   └── tdlambda.py
│   │   ├── rl_func_approx
│   │   │   ├── __init__.py
│   │   │   ├── rl_func_approx_base.py
│   │   │   ├── td0.py
│   │   │   └── lspi.py
│   │   ├── td_algo_enum.py
│   │   ├── opt_base.py
│   │   ├── tabular_base.py
│   │   ├── func_approx_spec.py
│   │   ├── backward_adp.py
│   │   ├── backward_dp.py
│   │   ├── helper_funcs.py
│   │   └── ams.py
│   ├── examples
│   │   ├── __init__.py
│   │   ├── port_opt
│   │   │   └── __init__.py
│   │   ├── american_pricing
│   │   │   ├── __init__.py
│   │   │   ├── bs_pricing.py
│   │   │   ├── num_utils.py
│   │   │   └── vanilla_american_test.py
│   │   ├── deriv_pricing_hedging
│   │   │   └── __init__.py
│   │   ├── exam_problems
│   │   │   ├── grid_maze.py
│   │   │   ├── price_control.py
│   │   │   ├── frog_lilypad.py
│   │   │   ├── wage_max.py
│   │   │   ├── W2021
│   │   │   │   └── career_optimization.py
│   │   │   ├── mrp_tdmc_outline.py
│   │   │   └── mrp_tdmc.py
│   │   ├── clearance_pricing.py
│   │   └── inv_control.py
│   ├── func_approx
│   │   ├── __init__.py
│   │   ├── dnn_spec.py
│   │   ├── eligibility_traces.py
│   │   ├── linear_approx.py
│   │   └── func_approx_base.py
│   └── processes
│       ├── __init__.py
│       ├── mdp_rep_for_rl_pg.py
│       ├── det_policy.py
│       ├── mdp_rep_for_adp.py
│       ├── mdp_rep_for_adp_pg.py
│       ├── mrp_refined.py
│       ├── mab_env.py
│       ├── policy.py
│       ├── mdp_rep_for_rl_tabular.py
│       ├── mdp_rep_for_rl_fa.py
│       ├── mp.py
│       ├── mrp.py
│       ├── mdp.py
│       └── mdp_refined.py
├── .gitignore
└── README.md

/src/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/algorithms/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/examples/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/func_approx/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/processes/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/algorithms/adp/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/algorithms/dp/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/algorithms/mab/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/examples/port_opt/__init__.py:
--------------------------------------------------------------------------------
1 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /src/algorithms/rl_tabular/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/algorithms/rl_func_approx/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/examples/american_pricing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/examples/deriv_pricing_hedging/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/utils/generic_typevars.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar 2 | 3 | S = TypeVar('S') 4 | A = TypeVar('A') 5 | -------------------------------------------------------------------------------- /src/algorithms/td_algo_enum.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | 3 | 4 | class TDAlgorithm(Enum): 5 | SARSA = auto() 6 | QLearning = auto() 7 | ExpectedSARSA = auto() 8 | -------------------------------------------------------------------------------- /src/processes/mdp_rep_for_rl_pg.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Generic, Tuple 2 | from utils.generic_typevars import S, A 3 | 4 | 5 | class MDPRepForRLPG(Generic[S, A]): 6 | 7 | def __init__( 8 | self, 9 | gamma: float, 10 | init_state_gen_func: Callable[[], S], 11 | state_reward_gen_func: Callable[[S, A], Tuple[S, float]], 12 | terminal_state_func: Callable[[S], bool], 13 | ) -> None: 14 | self.gamma: float = gamma 15 | self.init_state_gen_func: Callable[[], S] = init_state_gen_func 16 | self.state_reward_gen_func: Callable[[S, A], Tuple[S, float]] = \ 17 | state_reward_gen_func 18 | self.terminal_state_func = terminal_state_func 19 | 20 | 21 | if __name__ == '__main__': 22 | print(0) 23 | -------------------------------------------------------------------------------- /src/processes/det_policy.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping 2 | from processes.policy import Policy 3 | from utils.generic_typevars import S, A 4 | 5 | 6 | class DetPolicy(Policy): 7 | 8 | def __init__(self, det_policy_data: Mapping[S, A]) -> None: 9 | super().__init__({s: {a: 1.0} for s, a in det_policy_data.items()}) 10 | 11 | def get_action_for_state(self, state: S) -> A: 12 | return list(self.get_state_probabilities(state).keys())[0] 13 | 14 | def get_state_to_action_map(self) -> Mapping[S, A]: 15 | return {s: self.get_action_for_state(s) for s in self.policy_data} 16 | 17 | def __repr__(self) -> str: 18 | return self.get_state_to_action_map().__repr__() 19 | 20 | def __str__(self) -> str: 21 | return self.get_state_to_action_map().__str__() 22 | 23 | -------------------------------------------------------------------------------- 
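A quick note on DetPolicy above: it is a thin wrapper over Policy that assigns probability 1.0 to a single action per state. A minimal usage sketch (the states and actions below are made up for illustration, and it assumes verify_policy in processes/mp_funcs.py accepts these point-mass distributions):

from processes.det_policy import DetPolicy

det = DetPolicy({1: 'a', 2: 'b', 3: 'b'})
print(det.get_action_for_state(1))    # 'a'
print(det.get_state_to_action_map())  # {1: 'a', 2: 'b', 3: 'b'}
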
/src/processes/mdp_rep_for_adp.py: -------------------------------------------------------------------------------- 1 | from typing import Set, Callable, Sequence, Mapping, Generic 2 | from utils.generic_typevars import S, A 3 | 4 | 5 | class MDPRepForADP(Generic[S, A]): 6 | 7 | def __init__( 8 | self, 9 | state_action_func: Callable[[S], Set[A]], 10 | gamma: float, 11 | sample_states_gen_func: Callable[[int], Sequence[S]], 12 | reward_func: Callable[[S, A], float], 13 | transitions_func: Callable[[S, A], Mapping[S, float]] 14 | ) -> None: 15 | self.state_action_func: Callable[[S], Set[A]] = state_action_func 16 | self.gamma: float = gamma 17 | self.sample_states_gen_func: Callable[[int], Sequence[S]] = \ 18 | sample_states_gen_func 19 | self.reward_func: Callable[[S, A], float] = reward_func 20 | self.transitions_func: Callable[[S, A], Mapping[S, float]] = \ 21 | transitions_func 22 | 23 | 24 | if __name__ == '__main__': 25 | print(0) 26 | 27 | -------------------------------------------------------------------------------- /src/algorithms/opt_base.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | from abc import ABC, abstractmethod 3 | from utils.generic_typevars import S, A 4 | from utils.standard_typevars import VFType, QFType, PolicyType 5 | 6 | 7 | class OptBase(ABC): 8 | 9 | @abstractmethod 10 | def get_value_func(self, polf: PolicyType) -> VFType: 11 | pass 12 | 13 | @abstractmethod 14 | def get_act_value_func(self, polf: PolicyType) -> QFType: 15 | pass 16 | 17 | @abstractmethod 18 | def get_optimal_det_policy_func(self) -> Callable[[S], A]: 19 | pass 20 | 21 | # noinspection PyShadowingNames 22 | def get_optimal_value_func(self) -> VFType: 23 | pf = self.get_optimal_det_policy_func() 24 | return self.get_value_func( 25 | lambda s, pf=pf: lambda n, s=s, pf=pf: [pf(s)] * n 26 | ) 27 | 28 | # noinspection PyShadowingNames 29 | def get_optimal_act_value_func(self) -> QFType: 30 | pf = self.get_optimal_det_policy_func() 31 | return self.get_act_value_func( 32 | lambda s, pf=pf: lambda n, s=s, pf=pf: [pf(s)] * n 33 | ) 34 | 35 | -------------------------------------------------------------------------------- /src/processes/mdp_rep_for_adp_pg.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Sequence, Generic, Tuple 2 | from utils.generic_typevars import S, A 3 | 4 | 5 | class MDPRepForADPPG(Generic[S, A]): 6 | 7 | def __init__( 8 | self, 9 | gamma: float, 10 | init_states_gen_func: Callable[[int], Sequence[S]], 11 | state_reward_gen_func: Callable[[S, A, int], Sequence[Tuple[S, float]]], 12 | # reward_func: Callable[[S, A], float], 13 | # transitions_func: Callable[[S, A], Mapping[S, float]], 14 | terminal_state_func: Callable[[S], bool], 15 | ) -> None: 16 | self.gamma: float = gamma 17 | self.init_states_gen_func: Callable[[int], Sequence[S]] = \ 18 | init_states_gen_func 19 | self.state_reward_gen_func: Callable[[S, A, int], Sequence[Tuple[S, float]]] =\ 20 | state_reward_gen_func 21 | # self.reward_func: Callable[[S, A], float] = reward_func 22 | # self.transitions_func: Callable[[S, A], Mapping[S, float]] = \ 23 | # transitions_func 24 | self.terminal_state_func = terminal_state_func 25 | 26 | 27 | if __name__ == '__main__': 28 | print(0) 29 | -------------------------------------------------------------------------------- /src/processes/mrp_refined.py: -------------------------------------------------------------------------------- 1 | from 
typing import Mapping, Tuple 2 | from processes.mrp import MRP 3 | from utils.gen_utils import zip_dict_of_tuple 4 | import numpy as np 5 | from utils.generic_typevars import S 6 | from utils.standard_typevars import SSf, SSTff 7 | 8 | 9 | class MRPRefined(MRP): 10 | 11 | def __init__( 12 | self, 13 | info: SSTff, 14 | gamma: float 15 | ) -> None: 16 | d1, d2, d3 = MRPRefined.split_info(info) 17 | super().__init__({k: (v, d3[k]) for k, v in d1.items()}, gamma) 18 | self.rewards_refined: SSf = d2 19 | 20 | @staticmethod 21 | def split_info(info: SSTff) -> Tuple[SSf, SSf, Mapping[S, float]]: 22 | d = {k: zip_dict_of_tuple(v) for k, v in info.items()} 23 | d1, d2 = zip_dict_of_tuple(d) 24 | d3 = {k: sum(np.prod(x) for x in v.values()) for k, v in info.items()} 25 | return d1, d2, d3 26 | 27 | 28 | if __name__ == '__main__': 29 | data = { 30 | 1: {1: (0.3, 9.2), 2: (0.6, 3.4), 3: (0.1, -0.3)}, 31 | 2: {1: (0.4, 0.0), 2: (0.2, 8.9), 3: (0.4, 3.5)}, 32 | 3: {3: (1.0, 0.0)} 33 | } 34 | mrp_refined_obj = MRPRefined(data, 0.95) 35 | print(mrp_refined_obj.trans_matrix) 36 | print(mrp_refined_obj.rewards_vec) 37 | 38 | 39 | -------------------------------------------------------------------------------- /src/examples/exam_problems/grid_maze.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Mapping 2 | 3 | SPACE = 'SPACE' 4 | BLOCK = 'BLOCK' 5 | GOAL = 'GOAL' 6 | 7 | maze_grid = {(0, 0): SPACE, (0, 1): BLOCK, (0, 2): SPACE, (0, 3): SPACE, (0, 4): SPACE, 8 | (0, 5): SPACE, (0, 6): SPACE, (0, 7): SPACE, (1, 0): SPACE, (1, 1): BLOCK, 9 | (1, 2): BLOCK, (1, 3): SPACE, (1, 4): BLOCK, (1, 5): BLOCK, (1, 6): BLOCK, 10 | (1, 7): BLOCK, (2, 0): SPACE, (2, 1): BLOCK, (2, 2): SPACE, (2, 3): SPACE, 11 | (2, 4): SPACE, (2, 5): SPACE, (2, 6): BLOCK, (2, 7): SPACE, (3, 0): SPACE, 12 | (3, 1): SPACE, (3, 2): SPACE, (3, 3): BLOCK, (3, 4): BLOCK, (3, 5): SPACE, 13 | (3, 6): BLOCK, (3, 7): SPACE, (4, 0): SPACE, (4, 1): BLOCK, (4, 2): SPACE, 14 | (4, 3): BLOCK, (4, 4): SPACE, (4, 5): SPACE, (4, 6): SPACE, (4, 7): SPACE, 15 | (5, 0): BLOCK, (5, 1): BLOCK, (5, 2): SPACE, (5, 3): BLOCK, (5, 4): SPACE, 16 | (5, 5): BLOCK, (5, 6): SPACE, (5, 7): BLOCK, (6, 0): SPACE, (6, 1): BLOCK, 17 | (6, 2): BLOCK, (6, 3): BLOCK, (6, 4): SPACE, (6, 5): BLOCK, (6, 6): SPACE, 18 | (6, 7): SPACE, (7, 0): SPACE, (7, 1): SPACE, (7, 2): SPACE, (7, 3): SPACE, 19 | (7, 4): SPACE, (7, 5): BLOCK, (7, 6): BLOCK, (7, 7): GOAL} 20 | -------------------------------------------------------------------------------- /src/processes/mab_env.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Callable, Sequence, NamedTuple 2 | from numpy.random import normal, binomial, uniform 3 | 4 | 5 | class MabEnv(NamedTuple): 6 | 7 | arms_sampling_funcs: Sequence[Callable[[], float]] 8 | 9 | @staticmethod 10 | def get_gaussian_mab_env(means_vars: Sequence[Tuple[float, float]]) -> 'MabEnv': 11 | return MabEnv([lambda m=m, s=s: normal(m, s, 1)[0] for m, s in means_vars]) 12 | 13 | @staticmethod 14 | def get_bernoulli_mab_env(probs: Sequence[float]) -> 'MabEnv': 15 | return MabEnv([lambda p=p: float(binomial(1, p, 1)[0]) for p in probs]) 16 | 17 | @staticmethod 18 | def get_uniform_mab_env(bounds: Sequence[Tuple[float, float]]) -> 'MabEnv': 19 | return MabEnv([lambda c=c, d=d: uniform(c, d, 1)[0] for c, d in bounds]) 20 | 21 | @staticmethod 22 | def get_binomial_mab_env(params: Sequence[Tuple[int, float]]) -> 'MabEnv': 23 | return MabEnv([lambda 
n=n, p=p: float(binomial(n, p, 1)[0]) for n, p in params])
24 | 
25 | 
26 | if __name__ == '__main__':
27 |     import numpy as np  # np was not imported at the top of this file; needed only for the demo prints below
28 |     mean_vars_data = [(5., 2.), (10., 3.), (0., 4.)]
29 |     me = MabEnv.get_gaussian_mab_env(mean_vars_data)
30 |     asf = me.arms_sampling_funcs
31 |     res = [[asf[i]() for _ in range(10000)] for i in range(len(asf))]
32 |     for i in range(len(mean_vars_data)):
33 |         nums = res[i]
34 |         print("Mean = %.3f" % np.mean(nums))
35 |         print("Stdev = %.3f" % np.std(nums))
36 | 
--------------------------------------------------------------------------------
/src/examples/exam_problems/price_control.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from typing import Sequence, Tuple
3 | from scipy.stats import poisson
4 | import matplotlib.pyplot as plt
5 | from mpl_toolkits.mplot3d.axes3d import Axes3D
6 | from matplotlib import cm
7 | 
8 | T: int = 10  # time steps
9 | M: int = 20  # initial inventory
10 | # the following are (price, Poisson demand mean) pairs, i.e., the price-demand (elasticity) curve
11 | el: Sequence[Tuple[float, float]] = [
12 |     (10.0, 0.3), (9.0, 0.8), (8.0, 1.6),
13 |     (7.0, 2.7), (6.0, 4.1), (5.0, 7.2)
14 | ]
15 | 
16 | # v represents the Optimal Value Function (time, Inventory) -> E[Sum of Sales Revenue]
17 | # pi represents the Optimal Policy (time, Inventory) -> Price
18 | v: np.ndarray = np.zeros((T + 1, M + 1))
19 | pi: np.ndarray = np.zeros((T, M + 1))
20 | rvs: Sequence = [poisson(l) for _, l in el]
21 | 
22 | for t in range(T - 1, -1, -1):
23 |     for s in range(M + 1):
24 |         q_vals = [sum(rvs[i].pmf(d) * (d * p + v[t + 1, s - d])
25 |                       for d in range(s)) +
26 |                   (1. - rvs[i].cdf(s - 1)) * s * p
27 |                   for i, (p, _) in enumerate(el)]
28 |         v[t, s] = np.max(q_vals)
29 |         pi[t, s] = el[int(np.argmax(q_vals))][0]
30 | 
31 | print(pi)
32 | print(v)
33 | 
34 | 
35 | x, y = np.meshgrid(range(M + 1), range(T))
36 | fig = plt.figure()
37 | ax = fig.gca(projection='3d')
38 | surf = ax.plot_surface(x, y, pi, cmap=cm.coolwarm,
39 |                        linewidth=0, antialiased=False)
40 | fig.colorbar(surf, shrink=0.5, aspect=5)
41 | plt.show()
42 | 
43 | 
44 | 
--------------------------------------------------------------------------------
/src/processes/policy.py:
--------------------------------------------------------------------------------
1 | from typing import Mapping, Generic, Dict
2 | from processes.mp_funcs import verify_policy
3 | from processes.mp_funcs import get_epsilon_action_probs
4 | from processes.mp_funcs import get_softmax_action_probs
5 | from utils.generic_typevars import S, A
6 | 
7 | 
8 | class Policy(Generic[S, A]):
9 | 
10 |     def __init__(self, data: Dict[S, Mapping[A, float]]) -> None:
11 |         if verify_policy(data):
12 |             self.policy_data = data
13 |         else:
14 |             raise ValueError
15 | 
16 |     def get_state_probabilities(self, state: S) -> Mapping[A, float]:
17 |         return self.policy_data[state]
18 | 
19 |     def get_state_action_probability(self, state: S, action: A) -> float:
20 |         return self.get_state_probabilities(state).get(action, 0.)
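    # The two mutating methods below overwrite the stored distribution for a state.
    # They delegate to get_epsilon_action_probs / get_softmax_action_probs in
    # processes/mp_funcs.py (not shown in this listing); presumably the former
    # returns an epsilon-greedy distribution (1 - epsilon + epsilon/|A| on the
    # greedy action, epsilon/|A| elsewhere) and the latter a softmax (Boltzmann)
    # distribution over the given action values.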
21 | 22 | def edit_state_action_to_epsilon_greedy( 23 | self, 24 | state: S, 25 | action_value_dict: Mapping[A, float], 26 | epsilon: float 27 | ) -> None: 28 | self.policy_data[state] = get_epsilon_action_probs( 29 | action_value_dict, 30 | epsilon 31 | ) 32 | 33 | def edit_state_action_to_softmax( 34 | self, 35 | state: S, 36 | action_value_dict: Mapping[A, float] 37 | ) -> None: 38 | self.policy_data[state] = get_softmax_action_probs( 39 | action_value_dict 40 | ) 41 | 42 | def __repr__(self): 43 | return self.policy_data.__repr__() 44 | 45 | def __str__(self): 46 | return self.policy_data.__str__() 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/processes/mdp_rep_for_rl_tabular.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Callable, Tuple 2 | from processes.mp_funcs import get_rv_gen_func_single 3 | from processes.mdp_rep_for_rl_fa import MDPRepForRLFA 4 | from utils.generic_typevars import S, A 5 | 6 | Type1 = Mapping[S, Mapping[A, Callable[[], Tuple[S, float]]]] 7 | 8 | 9 | class MDPRepForRLTabular(MDPRepForRLFA): 10 | 11 | def __init__( 12 | self, 13 | state_action_dict: Mapping[S, Set[A]], 14 | terminal_states: Set[S], 15 | state_reward_gen_dict: Type1, 16 | gamma: float 17 | ) -> None: 18 | self.state_action_dict: Mapping[S, Set[A]] = state_action_dict 19 | self.terminal_states: Set[S] = terminal_states 20 | self.state_reward_gen_dict: Type1 = state_reward_gen_dict 21 | super().__init__( 22 | state_action_func=lambda x: self.state_action_dict[x], 23 | gamma=gamma, 24 | terminal_state_func=lambda x: x in self.terminal_states, 25 | state_reward_gen_func=lambda x, y: self.state_reward_gen_dict[x][y](), 26 | init_state_gen=get_rv_gen_func_single( 27 | {s: 1. / len(self.state_action_dict) for s 28 | in self.state_action_dict.keys()} 29 | ), 30 | init_state_action_gen=get_rv_gen_func_single( 31 | {(s, a): 1. / sum(len(v) for v 32 | in self.state_action_dict.values()) 33 | for s, v1 in self.state_action_dict.items() for a in v1} 34 | ) 35 | ) 36 | 37 | 38 | if __name__ == '__main__': 39 | print(0) 40 | -------------------------------------------------------------------------------- /src/processes/mdp_rep_for_rl_fa.py: -------------------------------------------------------------------------------- 1 | from typing import Set, Callable, Tuple, Optional, Generic 2 | from processes.mp_funcs import get_rv_gen_func_single 3 | from utils.generic_typevars import S, A 4 | 5 | 6 | class MDPRepForRLFA(Generic[S, A]): 7 | 8 | def __init__( 9 | self, 10 | state_action_func: Callable[[S], Set[A]], 11 | gamma: float, 12 | terminal_state_func: Callable[[S], bool], 13 | state_reward_gen_func: Callable[[S, A], Tuple[S, float]], 14 | init_state_gen: Callable[[], S], 15 | init_state_action_gen: Optional[Callable[[], Tuple[S, A]]] 16 | ) -> None: 17 | # noinspection PyShadowingNames 18 | def init_sa( 19 | init_state_gen=init_state_gen, 20 | state_action_func=state_action_func 21 | ) -> Tuple[S, A]: 22 | s = init_state_gen() 23 | actions = state_action_func(s) 24 | a = get_rv_gen_func_single({a: 1. 
/ len(actions) for a in actions})() 25 | return s, a 26 | 27 | self.state_action_func: Callable[[S], Set[A]] = state_action_func 28 | self.gamma: float = gamma 29 | self.terminal_state_func: Callable[[S], bool] = terminal_state_func 30 | self.state_reward_gen_func: Callable[[S, A], Tuple[S, float]] = \ 31 | state_reward_gen_func 32 | self.init_state_gen: Callable[[], S] = init_state_gen 33 | self.init_state_action_gen: Callable[[], Tuple[S, A]] =\ 34 | (init_state_action_gen if init_state_action_gen is not None 35 | else init_sa) 36 | 37 | 38 | if __name__ == '__main__': 39 | print(0) 40 | -------------------------------------------------------------------------------- /src/algorithms/mab/mab_graphs_gen.py: -------------------------------------------------------------------------------- 1 | from typing import NoReturn 2 | import numpy as np 3 | 4 | 5 | def graph_regret_curve() -> NoReturn: 6 | import matplotlib.pyplot as plt 7 | x_vals = range(1, 71) 8 | plt.plot(x_vals, [3*x for x in x_vals], "r", label="Greedy") 9 | plt.plot(x_vals, [2*x for x in x_vals], "b", label="$\epsilon$-Greedy") 10 | plt.plot(x_vals, [20 * np.log(x) for x in x_vals], "g", label="Decaying $\epsilon$-Greedy") 11 | plt.xlabel("Time Steps", fontsize=25) 12 | plt.ylabel("Total Regret", fontsize=25) 13 | plt.title("Total Regret Curves", fontsize=25) 14 | plt.xlim(xmin=x_vals[0], xmax=x_vals[-1]) 15 | plt.ylim(ymin=0.0) 16 | # plt.xticks(x_vals) 17 | plt.grid(True) 18 | plt.legend(loc='upper left', fontsize=15) 19 | plt.show() 20 | 21 | def get_pdf(x: float, mu: float, sigma: float) -> float: 22 | return np.exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)) / (np.sqrt(2 * np.pi) * sigma) 23 | 24 | def graph_qestimate_pdfs() -> NoReturn: 25 | import matplotlib.pyplot as plt 26 | x_vals = np.arange(-2., 6., 0.01) 27 | mu_b = 1.0 28 | sigma_b = 1.0 29 | mu_r = 2.0 30 | sigma_r = 0.8 31 | mu_g = 2.5 32 | sigma_g = 0.3 33 | plt.plot(x_vals, [get_pdf(x, mu_b, sigma_b) for x in x_vals], "b", label="$Q(a_1)$") 34 | plt.plot(x_vals, [get_pdf(x, mu_r, sigma_r) for x in x_vals], "r", label="$Q(a_2)$") 35 | plt.plot(x_vals, [get_pdf(x, mu_g, sigma_g) for x in x_vals], "g", label="$Q(a_3)$") 36 | plt.xlabel("Q", fontsize=25) 37 | plt.ylabel("Prob(Q)", fontsize=25) 38 | # plt.title("Total Regret Curves", fontsize=25) 39 | plt.xlim(xmin=x_vals[0], xmax=x_vals[-1]) 40 | plt.ylim(ymin=0.0) 41 | # plt.xticks(x_vals) 42 | plt.grid(True) 43 | plt.legend(loc='upper left', fontsize=15) 44 | plt.show() 45 | 46 | 47 | 48 | if __name__ == '__main__': 49 | # graph_regret_curve() 50 | graph_qestimate_pdfs() -------------------------------------------------------------------------------- /src/algorithms/mab/ts_bernoulli.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, List 2 | from processes.mab_env import MabEnv 3 | from operator import itemgetter 4 | from numpy import ndarray, empty 5 | from numpy.random import beta 6 | from algorithms.mab.mab_base import MABBase 7 | 8 | 9 | class ThompsonSamplingBernoulli(MABBase): 10 | 11 | def __init__( 12 | self, 13 | mab: MabEnv, 14 | time_steps: int, 15 | num_episodes: int 16 | ) -> None: 17 | super().__init__( 18 | mab=mab, 19 | time_steps=time_steps, 20 | num_episodes=num_episodes 21 | ) 22 | 23 | def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]: 24 | ep_rewards: ndarray = empty(self.time_steps) 25 | ep_actions: ndarray = empty(self.time_steps, dtype=int) 26 | bayes: List[Tuple[int, int]] = [(1, 1)] * self.num_arms 
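        # Thompson sampling with a conjugate Beta prior per arm: every arm starts
        # at the uniform prior Beta(1, 1); on each step we draw one sample from
        # each arm's Beta(a, b) posterior, play the arm with the largest draw, and
        # apply the conjugate update to that arm: a += reward, b += 1 - reward.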
27 | 28 | for i in range(self.time_steps): 29 | mean_draws: Sequence[float] = [beta(a, b, 1)[0] for a, b in bayes] 30 | action: int = max(enumerate(mean_draws), key=itemgetter(1))[0] 31 | reward: float = self.mab_funcs[action]() 32 | a, b = bayes[action] 33 | bayes[action] = (a + int(reward), b + int(1 - reward)) 34 | ep_rewards[i] = reward 35 | ep_actions[i] = action 36 | return ep_rewards, ep_actions 37 | 38 | 39 | if __name__ == '__main__': 40 | probs_data = [0.2, 0.4, 0.8, 0.5, 0.1, 0.9] 41 | mu_star = max(probs_data) 42 | steps = 200 43 | episodes = 1000 44 | 45 | me = MabEnv.get_bernoulli_mab_env(probs_data) 46 | ucb1 = ThompsonSamplingBernoulli( 47 | mab=me, 48 | time_steps=steps, 49 | num_episodes=episodes 50 | ) 51 | exp_cum_regret = ucb1.get_expected_cum_regret(mu_star) 52 | print(exp_cum_regret) 53 | 54 | exp_act_count = ucb1.get_expected_action_counts() 55 | print(exp_act_count) 56 | 57 | ucb1.plot_exp_cum_regret_curve(mu_star) 58 | -------------------------------------------------------------------------------- /src/utils/beta_distribution.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import Tuple 3 | from scipy.special import digamma 4 | 5 | 6 | class BetaDistribution: 7 | 8 | SMALL_POS = 1e-8 9 | """ 10 | Beta Distribution is normally defined by parameters alpha and beta 11 | with alpha, beta > 0. Here we define the beta distribution in terms 12 | of parameters mu (for mean of beta distribution and nu (= alpha + beta). 13 | 14 | So, mu = alpha / (alpha + beta) = alpha / nu 15 | alpha = mu * nu, beta = (1-mu) * nu 16 | 17 | p(x) = Gamma(alpa + beta) / (Gamma(alpha) * Gamma(beta)) * 18 | x^{alpha-1) * (1-x)^{beta-1) 19 | 20 | Score_mu(x) = d(log(p(x)))/d(mu) = Score_alpha * d(alpha)/d(mu) 21 | + Score_beta * d(beta)/d(mu) 22 | = (digamma(beta) - digamma(alpha) + log(x) - log(1-x)) * nu 23 | 24 | Score_nu(x) = d(log(p(x)))/d(nu) = Score_alpha * d(alpha)/d(nu) 25 | + Score_beta * d(beta)/d(nu) 26 | = (digamma(beta) - digamma(alpha) + log(x) - log(1-x)) * mu + 27 | digamma(nu) - digamma(beta) + log(1-x) 28 | """ 29 | 30 | def __init__(self, mu, nu) -> None: 31 | if 0 < mu < 1 and nu > 0: 32 | self.mu = mu 33 | self.nu = nu 34 | self.alpha = mu * nu 35 | self.beta = (1. - mu) * nu 36 | else: 37 | raise ValueError("mu = %.3f, nu = %.3f" % (mu, nu)) 38 | 39 | def get_samples(self, n: int) -> np.ndarray: 40 | sp = BetaDistribution.SMALL_POS 41 | return np.vectorize(lambda x: min(1. - sp, max(sp, x)))( 42 | np.random.beta(a=self.alpha, b=self.beta, size=n) 43 | ) 44 | 45 | def get_mu_nu_scores(self, x: float) -> Tuple[float, float]: 46 | diga = digamma(self.alpha) 47 | digb = digamma(self.beta) 48 | dign = digamma(self.nu) 49 | lx = np.log(x) 50 | l1x = np.log(1. 
- x) 51 | temp = digb - diga + lx - l1x 52 | r1 = temp * self.nu 53 | r2 = temp * self.mu + dign - digb + l1x 54 | return r1, r2 55 | 56 | 57 | -------------------------------------------------------------------------------- /src/algorithms/rl_tabular/rl_tabular_base.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Optional, Set, Callable 2 | from abc import abstractmethod 3 | from algorithms.tabular_base import TabularBase 4 | from processes.mdp_rep_for_rl_tabular import MDPRepForRLTabular 5 | from processes.policy import Policy 6 | from processes.det_policy import DetPolicy 7 | from algorithms.helper_funcs import get_vf_dict_from_qf_dict_and_policy 8 | from algorithms.helper_funcs import get_uniform_policy 9 | from algorithms.helper_funcs import get_det_policy_from_qf_dict 10 | from algorithms.helper_funcs import get_epsilon_decay_func 11 | from utils.generic_typevars import S, A 12 | from utils.standard_typevars import VFDictType, QFDictType 13 | 14 | 15 | class RLTabularBase(TabularBase): 16 | 17 | def __init__( 18 | self, 19 | mdp_rep_for_rl: MDPRepForRLTabular, 20 | exploring_start: bool, 21 | softmax: bool, 22 | epsilon: float, 23 | epsilon_half_life: float, 24 | num_episodes: int, 25 | max_steps: int 26 | ) -> None: 27 | 28 | self.mdp_rep: MDPRepForRLTabular = mdp_rep_for_rl 29 | self.exploring_start: bool = exploring_start 30 | self.softmax: bool = softmax 31 | self.epsilon_func: Callable[[int], float] = get_epsilon_decay_func( 32 | epsilon, 33 | epsilon_half_life 34 | ) 35 | self.num_episodes: int = num_episodes 36 | self.max_steps: int = max_steps 37 | 38 | def get_state_action_dict(self) -> Mapping[S, Set[A]]: 39 | return self.mdp_rep.state_action_dict 40 | 41 | def get_init_policy(self) -> Policy: 42 | return get_uniform_policy(self.mdp_rep.state_action_dict) 43 | 44 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 45 | return get_vf_dict_from_qf_dict_and_policy( 46 | self.get_qv_func_dict(pol), 47 | pol 48 | ) 49 | 50 | @abstractmethod 51 | def get_qv_func_dict(self, pol: Optional[Policy]) -> QFDictType: 52 | pass 53 | 54 | def get_act_value_func_dict(self, pol: Policy) -> QFDictType: 55 | return self.get_qv_func_dict(pol) 56 | 57 | def get_optimal_det_policy(self) -> DetPolicy: 58 | return get_det_policy_from_qf_dict(self.get_qv_func_dict(None)) 59 | -------------------------------------------------------------------------------- /src/algorithms/dp/dp_analytic.py: -------------------------------------------------------------------------------- 1 | from algorithms.dp.dp_base import DPBase 2 | from processes.policy import Policy 3 | from processes.mdp import MDP 4 | from processes.det_policy import DetPolicy 5 | from utils.standard_typevars import VFDictType 6 | 7 | 8 | class DPAnalytic(DPBase): 9 | 10 | def __init__(self, mdp_obj: MDP, tol: float) -> None: 11 | super().__init__(mdp_obj, tol) 12 | 13 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 14 | mrp_obj = self.mdp_obj.get_mrp(pol) 15 | value_func_vec = mrp_obj.get_value_func_vec() 16 | nt_vf = {mrp_obj.nt_states_list[i]: value_func_vec[i] 17 | for i in range(len(mrp_obj.nt_states_list))} 18 | t_vf = {s: 0. 
for s in self.mdp_obj.terminal_states} 19 | return {**nt_vf, **t_vf} 20 | 21 | def get_optimal_det_policy(self) -> DetPolicy: 22 | return self.get_optimal_policy_pi() 23 | 24 | 25 | if __name__ == '__main__': 26 | from processes.mdp import MDP 27 | policy_data = { 28 | 1: {'a': 0.4, 'b': 0.6}, 29 | 2: {'a': 0.7, 'c': 0.3}, 30 | 3: {'b': 1.0} 31 | } 32 | pol_obj = Policy(policy_data) 33 | mdp_data = { 34 | 1: { 35 | 'a': ({1: 0.2, 2: 0.6, 3: 0.2}, 7.0), 36 | 'b': ({1: 0.6, 2: 0.3, 3: 0.1}, -2.0), 37 | 'c': ({1: 0.1, 2: 0.2, 3: 0.7}, 10.0) 38 | }, 39 | 2: { 40 | 'a': ({1: 0.1, 2: 0.6, 3: 0.3}, 1.0), 41 | 'c': ({1: 0.6, 2: 0.2, 3: 0.2}, -1.2) 42 | }, 43 | 3: { 44 | 'b': ({3: 1.0}, 0.0) 45 | } 46 | } 47 | gamma_val = 0.9 48 | mdp1_obj = MDP(mdp_data, gamma_val) 49 | mrp1_obj = mdp1_obj.get_mrp(pol_obj) 50 | print(mrp1_obj.transitions) 51 | print(mrp1_obj.rewards) 52 | print(mrp1_obj.trans_matrix) 53 | print(mrp1_obj.rewards_vec) 54 | print(mrp1_obj.get_value_func_vec()) 55 | tol_val = 1e-4 56 | opn = DPAnalytic(mdp1_obj, tol_val) 57 | opt_policy_pi = opn.get_optimal_policy_pi() 58 | print(opt_policy_pi) 59 | opt_vf_dict_pi = opn.get_value_func_dict(opt_policy_pi) 60 | print(opt_vf_dict_pi) 61 | opt_policy_vi = opn.get_optimal_policy_vi() 62 | print(opt_policy_vi) 63 | opt_vf_dict_vi = opn.get_value_func_dict(opt_policy_vi) 64 | print(opt_vf_dict_vi) 65 | -------------------------------------------------------------------------------- /src/algorithms/tabular_base.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Callable 2 | from abc import abstractmethod 3 | from algorithms.opt_base import OptBase 4 | from processes.policy import Policy 5 | from processes.det_policy import DetPolicy 6 | from algorithms.helper_funcs import get_pdf_from_samples 7 | from utils.generic_typevars import S, A 8 | from utils.standard_typevars import VFDictType, QFDictType, PolicyType 9 | 10 | 11 | class TabularBase(OptBase): 12 | 13 | NUM_SAMPLES_PER_ACTION = 10 14 | 15 | @abstractmethod 16 | def get_init_policy(self) -> Policy: 17 | pass 18 | 19 | @abstractmethod 20 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 21 | pass 22 | 23 | @abstractmethod 24 | def get_act_value_func_dict(self, pol: Policy) -> QFDictType: 25 | pass 26 | 27 | @abstractmethod 28 | def get_optimal_det_policy(self) -> DetPolicy: 29 | pass 30 | 31 | @abstractmethod 32 | def get_state_action_dict(self) -> Mapping[S, Set[A]]: 33 | pass 34 | 35 | def get_value_func(self, polf: PolicyType) -> Callable[[S], float]: 36 | pol = Policy({s: get_pdf_from_samples( 37 | polf(s)(len(v) * TabularBase.NUM_SAMPLES_PER_ACTION) 38 | ) for s, v in self.get_state_action_dict().items()}) 39 | 40 | # noinspection PyShadowingNames 41 | def vf(state: S, pol=pol) -> float: 42 | return self.get_value_func_dict(pol)[state] 43 | 44 | return vf 45 | 46 | def get_act_value_func(self, polf: PolicyType)\ 47 | -> Callable[[S], Callable[[A], float]]: 48 | pol = Policy({s: get_pdf_from_samples( 49 | polf(s)(len(v) * TabularBase.NUM_SAMPLES_PER_ACTION) 50 | ) for s, v in self.get_state_action_dict().items()}) 51 | 52 | # noinspection PyShadowingNames 53 | def qvf(state: S, pol=pol) -> Callable[[A], float]: 54 | 55 | # noinspection PyShadowingNames 56 | def inner_f(action: A, pol=pol, state=state) -> float: 57 | return self.get_act_value_func_dict(pol)[state][action] 58 | 59 | return inner_f 60 | 61 | return qvf 62 | 63 | def get_optimal_det_policy_func(self) -> Callable[[S], A]: 64 | return lambda 
s: self.get_optimal_det_policy().get_action_for_state(s) 65 | 66 | -------------------------------------------------------------------------------- /src/processes/mp.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Generic, Sequence 2 | from graphviz import Digraph 3 | from processes.mp_funcs import get_all_states, verify_mp, get_lean_transitions 4 | import numpy as np 5 | from scipy.linalg import eig 6 | from utils.generic_typevars import S 7 | from utils.standard_typevars import SSf 8 | 9 | 10 | class MP(Generic[S]): 11 | 12 | def __init__( 13 | self, 14 | tr: SSf 15 | ) -> None: 16 | if verify_mp(tr): 17 | self.all_states_list: Sequence[S] = list(get_all_states(tr)) 18 | self.transitions: SSf = {s: get_lean_transitions(v) 19 | for s, v in tr.items()} 20 | else: 21 | raise ValueError 22 | 23 | def get_sink_states(self) -> Set[S]: 24 | return {k for k, v in self.transitions.items() 25 | if len(v) == 1 and k in v.keys()} 26 | 27 | def generate_image(self): 28 | d = Digraph() 29 | for s in self.all_states_list: 30 | d.node(str(s)) 31 | for s, v in self.transitions.items(): 32 | for s1, p in v.items(): 33 | d.edge(str(s), str(s1), label=str(p)) 34 | d.view() 35 | 36 | def get_stationary_distribution(self) -> Mapping[S, float]: 37 | sz = len(self.all_states_list) 38 | mat = np.zeros((sz, sz)) 39 | for i, s1 in enumerate(self.all_states_list): 40 | for j, s2 in enumerate(self.all_states_list): 41 | mat[i, j] = self.transitions[s1].get(s2, 0.) 42 | 43 | eig_vals, eig_vecs = eig(mat.T) 44 | stat = np.array( 45 | eig_vecs[:, np.where(np.abs(eig_vals - 1.) < 1e-8)[0][0]].flat 46 | ).astype(float) 47 | norm_stat = stat / sum(stat) 48 | return {s: norm_stat[i] for i, s in enumerate(self.all_states_list)} 49 | 50 | 51 | if __name__ == '__main__': 52 | transitions = { 53 | 1: {1: 0.1, 2: 0.6, 3: 0.1, 4: 0.2}, 54 | 2: {1: 0.25, 2: 0.22, 3: 0.24, 4: 0.29}, 55 | 3: {1: 0.7, 2: 0.3}, 56 | 4: {1: 0.3, 2: 0.5, 3: 0.2} 57 | } 58 | mp_obj = MP(transitions) 59 | print(mp_obj.transitions) 60 | print(mp_obj.all_states_list) 61 | print(mp_obj.get_sink_states()) 62 | stationary = mp_obj.get_stationary_distribution() 63 | print(stationary) 64 | mp_obj.generate_image() 65 | -------------------------------------------------------------------------------- /src/func_approx/dnn_spec.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Sequence, NamedTuple 2 | import numpy as np 3 | 4 | 5 | class DNNSpec(NamedTuple): 6 | 7 | neurons: Sequence[int] 8 | hidden_activation: Callable[[np.ndarray], np.ndarray] 9 | hidden_activation_deriv: Callable[[np.ndarray], np.ndarray] 10 | output_activation: Callable[[np.ndarray], np.ndarray] 11 | output_activation_deriv: Callable[[np.ndarray], np.ndarray] 12 | 13 | SMALL_POS = 1e-8 14 | 15 | @staticmethod 16 | def fexp(arg: np.ndarray) -> np.ndarray: 17 | return np.vectorize( 18 | lambda x: max(DNNSpec.SMALL_POS, x) 19 | )(np.exp(arg)) 20 | 21 | @staticmethod 22 | def relu(arg: np.ndarray) -> np.ndarray: 23 | return np.vectorize(lambda x: x if x > 0. else 0.)(arg) 24 | 25 | @staticmethod 26 | def relu_deriv(res: np.ndarray) -> np.ndarray: 27 | return np.vectorize(lambda x: 1. if x > 0. 
else 0.)(res) 28 | 29 | @staticmethod 30 | def identity(arg: np.ndarray) -> np.ndarray: 31 | return arg 32 | 33 | @staticmethod 34 | def identity_deriv(res: np.ndarray) -> np.ndarray: 35 | return np.ones_like(res) 36 | 37 | @staticmethod 38 | def sigmoid(arg: np.ndarray) -> np.ndarray: 39 | return np.vectorize( 40 | lambda x: max( 41 | DNNSpec.SMALL_POS, 42 | 1. / (1. + max(DNNSpec.SMALL_POS, np.exp(-x))) 43 | ) 44 | )(arg) 45 | 46 | @staticmethod 47 | def sigmoid_deriv(res: np.ndarray) -> np.ndarray: 48 | return res * (1. - res) 49 | 50 | @staticmethod 51 | def softplus(arg: np.ndarray) -> np.ndarray: 52 | return np.log(1. + DNNSpec.fexp(arg)) 53 | 54 | @staticmethod 55 | def softplus_deriv(res: np.ndarray) -> np.ndarray: 56 | return 1. - DNNSpec.fexp(-res) 57 | 58 | @staticmethod 59 | def log_squish(arg: np.ndarray) -> np.ndarray: 60 | return np.sign(arg) * np.log(1 + np.abs(arg)) 61 | 62 | @staticmethod 63 | def log_squish_deriv(res: np.ndarray) -> np.ndarray: 64 | return DNNSpec.fexp(-np.abs(res)) 65 | 66 | @staticmethod 67 | def pos_log_squish(arg: np.ndarray) -> np.ndarray: 68 | return np.vectorize( 69 | lambda x: 1. + np.log(1. + x) if x > 0. else DNNSpec.fexp(x) 70 | )(arg) 71 | 72 | @staticmethod 73 | def pos_log_squish_deriv(res: np.ndarray) -> np.ndarray: 74 | return np.vectorize( 75 | lambda x: DNNSpec.fexp(1. - x) if x > 1. else x 76 | )(res) 77 | -------------------------------------------------------------------------------- /src/examples/exam_problems/frog_lilypad.py: -------------------------------------------------------------------------------- 1 | from processes.mdp_refined import MDPRefined 2 | from typing import Sequence, Mapping, Tuple, NoReturn 3 | 4 | 5 | def get_lily_pads_mdp(n: int) -> MDPRefined: 6 | data = { 7 | i: { 8 | 'A': { 9 | i - 1: (i / n, 0.), 10 | i + 1: (1. - i / n, 1. if i == n - 1 else 0.) 11 | }, 12 | 'B': { 13 | j: (1 / n, 1. if j == n else 0.) 14 | for j in range(n + 1) if j != i 15 | } 16 | } for i in range(1, n) 17 | } 18 | data[0] = {'A': {0: (1., 0.)}, 'B': {0: (1., 0.)}} 19 | data[n] = {'A': {n: (1., 0.)}, 'B': {n: (1., 0.)}} 20 | 21 | gamma = 1.0 22 | return MDPRefined(data, gamma) 23 | 24 | 25 | def get_sorted_q_val( 26 | q_val: Mapping[int, Mapping[str, float]] 27 | ) -> Sequence[Tuple[float, float]]: 28 | d = sorted([(s, (t['A'], t['B'])) for s, t in q_val.items()], key=lambda x: x[0]) 29 | return [z for _, z in d[1:-1]] 30 | 31 | 32 | def direct_bellman(n: int) -> Mapping[int, float]: 33 | vf = [0.5] * (n + 1) 34 | vf[0] = 0. 35 | vf[n] = 0. 36 | tol = 1e-8 37 | epsilon = tol * 1e4 38 | while epsilon >= tol: 39 | old_vf = [v for v in vf] 40 | for i in range(1, n): 41 | vf[i] = max( 42 | (1. if i == n - 1 else 0.) + i * vf[i - 1] + (n - i) * vf[i + 1], 43 | 1. 
+ sum(vf[j] for j in range(1, n) if j != i) 44 | ) / n 45 | epsilon = max(abs(old_vf[i] - v) for i, v in enumerate(vf)) 46 | return {v: f for v, f in enumerate(vf)} 47 | 48 | 49 | def graph_q_func(a: Sequence[Tuple[float, float]]) -> NoReturn: 50 | import matplotlib.pyplot as plt 51 | x_vals = range(1, len(a) + 1) 52 | plt.plot(x_vals, [x for x, _ in a], "r", label="Q* for Action A") 53 | plt.plot(x_vals, [y for _, y in a], "b", label="Q* for Action B") 54 | plt.xlabel("Lilypad Number") 55 | plt.ylabel("Value") 56 | plt.title("Optimal Action Value Function") 57 | plt.xlim(xmin=x_vals[0], xmax=x_vals[-1]) 58 | plt.ylim(ymin=0.5, ymax=0.8) 59 | plt.xticks(x_vals) 60 | plt.grid(True) 61 | plt.legend(loc='lower right') 62 | plt.show() 63 | 64 | 65 | if __name__ == '__main__': 66 | pads: int = 10 67 | mdp: MDPRefined = get_lily_pads_mdp(pads) 68 | pol = mdp.get_optimal_policy(1e-8) 69 | print(pol.policy_data) 70 | print(mdp.get_value_func_dict(pol)) 71 | qv = mdp.get_act_value_func_dict(pol) 72 | graph_q_func(get_sorted_q_val(qv)) 73 | -------------------------------------------------------------------------------- /src/processes/mrp.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Sequence 2 | from processes.mp import MP 3 | from utils.gen_utils import zip_dict_of_tuple, is_approx_eq 4 | import numpy as np 5 | from utils.generic_typevars import S 6 | from utils.standard_typevars import STSff 7 | 8 | 9 | class MRP(MP): 10 | 11 | def __init__( 12 | self, 13 | info: STSff, 14 | gamma: float 15 | ): 16 | d1, d2 = zip_dict_of_tuple(info) 17 | super().__init__(d1) 18 | self.gamma: float = gamma 19 | self.rewards: Mapping[S, float] = d2 20 | self.terminal_states = self.get_terminal_states() 21 | self.nt_states_list: Sequence[S] = self.get_nt_states_list() 22 | self.trans_matrix: np.ndarray = self.get_trans_matrix() 23 | self.rewards_vec: np.ndarray = self.get_rewards_vec() 24 | 25 | def get_terminal_states(self) -> Set[S]: 26 | sink = self.get_sink_states() 27 | return {s for s in sink if is_approx_eq(self.rewards[s], 0.0)} 28 | 29 | def get_nt_states_list(self) -> Sequence[S]: 30 | return [s for s in self.all_states_list 31 | if s not in self.terminal_states] 32 | 33 | def get_trans_matrix(self) -> np.ndarray: 34 | """ 35 | This transition matrix is only for the non-terminal states 36 | """ 37 | n = len(self.nt_states_list) 38 | m = np.zeros((n, n)) 39 | for i in range(n): 40 | for s, d in self.transitions[self.nt_states_list[i]].items(): 41 | if s in self.nt_states_list: 42 | m[i, self.nt_states_list.index(s)] = d 43 | return m 44 | 45 | def get_rewards_vec(self) -> np.ndarray: 46 | """ 47 | This rewards vec is only for the non-terminal states 48 | """ 49 | return np.array([self.rewards[s] for s in self.nt_states_list]) 50 | 51 | def get_value_func_vec(self) -> np.ndarray: 52 | """ 53 | This value func vec is only for the non-terminal states 54 | """ 55 | return np.linalg.inv( 56 | np.eye(len(self.nt_states_list)) - self.gamma * self.trans_matrix 57 | ).dot(self.rewards_vec) 58 | 59 | 60 | if __name__ == '__main__': 61 | data = { 62 | 1: ({1: 0.6, 2: 0.3, 3: 0.1}, 7.0), 63 | 2: ({1: 0.1, 2: 0.2, 3: 0.7}, 10.0), 64 | 3: ({3: 1.0}, 0.0) 65 | } 66 | mrp_obj = MRP(data, 1.0) 67 | print(mrp_obj.trans_matrix) 68 | print(mrp_obj.rewards_vec) 69 | terminal = mrp_obj.get_terminal_states() 70 | print(terminal) 71 | value_func_vec = mrp_obj.get_value_func_vec() 72 | print(value_func_vec) 73 | 
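# Note on MRP.get_value_func_vec above: it evaluates the closed-form solution of
# the Bellman evaluation equation V = R + gamma * P * V restricted to the
# non-terminal states, i.e. V = (I - gamma * P)^{-1} * R, via the explicit matrix
# inverse. In the __main__ example, state 3 is terminal (a zero-reward sink), so
# the linear solve runs over states 1 and 2 only.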
-------------------------------------------------------------------------------- /src/algorithms/dp/dp_numeric.py: -------------------------------------------------------------------------------- 1 | from algorithms.dp.dp_base import DPBase 2 | from processes.policy import Policy 3 | from processes.det_policy import DetPolicy 4 | from processes.mp_funcs import mdp_rep_to_mrp_rep1, mdp_rep_to_mrp_rep2 5 | from processes.mdp import MDP 6 | from utils.standard_typevars import VFDictType 7 | 8 | 9 | class DPNumeric(DPBase): 10 | 11 | def __init__(self, mdp_obj: MDP, tol: float) -> None: 12 | super().__init__(mdp_obj, tol) 13 | 14 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 15 | vf = {s: 0. for s in self.mdp_obj.all_states} 16 | epsilon = self.tol * 1e4 17 | mo = self.mdp_obj 18 | pd = pol.policy_data 19 | rew = mdp_rep_to_mrp_rep2(mo.rewards, pd) 20 | prob = mdp_rep_to_mrp_rep1(mo.transitions, pd) 21 | while epsilon >= self.tol: 22 | new_vf = {s: rew[s] + mo.gamma * sum(p * vf[s1] 23 | for s1, p in prob[s].items()) 24 | for s in mo.all_states} 25 | epsilon = max(abs(new_vf[s] - v) for s, v in vf.items()) 26 | vf = new_vf 27 | return vf 28 | 29 | def get_optimal_det_policy(self) -> DetPolicy: 30 | return self.get_optimal_policy_vi() 31 | 32 | 33 | if __name__ == '__main__': 34 | from processes.mdp import MDP 35 | policy_data = { 36 | 1: {'a': 0.4, 'b': 0.6}, 37 | 2: {'a': 0.7, 'c': 0.3}, 38 | 3: {'b': 1.0} 39 | } 40 | pol_obj = Policy(policy_data) 41 | mdp_data = { 42 | 1: { 43 | 'a': ({1: 0.2, 2: 0.6, 3: 0.2}, 7.0), 44 | 'b': ({1: 0.6, 2: 0.3, 3: 0.1}, -2.0), 45 | 'c': ({1: 0.1, 2: 0.2, 3: 0.7}, 10.0) 46 | }, 47 | 2: { 48 | 'a': ({1: 0.1, 2: 0.6, 3: 0.3}, 1.0), 49 | 'c': ({1: 0.6, 2: 0.2, 3: 0.2}, -1.2) 50 | }, 51 | 3: { 52 | 'b': ({3: 1.0}, 0.0) 53 | } 54 | } 55 | gamma_val = 0.9 56 | mdp1_obj = MDP(mdp_data, gamma_val) 57 | mrp1_obj = mdp1_obj.get_mrp(pol_obj) 58 | print(mrp1_obj.transitions) 59 | print(mrp1_obj.rewards) 60 | print(mrp1_obj.trans_matrix) 61 | print(mrp1_obj.rewards_vec) 62 | print(mrp1_obj.get_value_func_vec()) 63 | tol_val = 1e-4 64 | opn = DPNumeric(mdp1_obj, tol_val) 65 | opt_policy_pi = opn.get_optimal_policy_pi() 66 | print(opt_policy_pi) 67 | opt_vf_dict_pi = opn.get_value_func_dict(opt_policy_pi) 68 | print(opt_vf_dict_pi) 69 | opt_policy_vi = opn.get_optimal_policy_vi() 70 | print(opt_policy_vi) 71 | opt_vf_dict_vi = opn.get_value_func_dict(opt_policy_vi) 72 | print(opt_vf_dict_vi) 73 | -------------------------------------------------------------------------------- /src/algorithms/mab/mab_base.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Callable, Tuple, NoReturn 2 | from abc import ABC, abstractmethod 3 | from processes.mab_env import MabEnv 4 | from numpy import ndarray, mean, vstack, cumsum, full, bincount 5 | from utils.gen_utils import memoize 6 | 7 | 8 | class MABBase(ABC): 9 | 10 | def __init__( 11 | self, 12 | mab: MabEnv, 13 | time_steps: int, 14 | num_episodes: int 15 | ) -> None: 16 | self.mab_funcs: Sequence[Callable[[], float]] = mab.arms_sampling_funcs 17 | self.num_arms: int = len(self.mab_funcs) 18 | self.time_steps: int = time_steps 19 | self.num_episodes: int = num_episodes 20 | 21 | @abstractmethod 22 | def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]: 23 | pass 24 | 25 | @memoize 26 | def get_all_rewards_actions(self) -> Sequence[Tuple[ndarray, ndarray]]: 27 | return [self.get_episode_rewards_actions() for _ in range(self.num_episodes)] 28 | 
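    # The aggregation helpers below operate on the (num_episodes x time_steps)
    # reward and action matrices. Expected regret at step t is best_mean minus
    # the average reward at step t across episodes, and expected cumulative
    # regret is its running sum over the time steps.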
29 | def get_rewards_matrix(self) -> ndarray: 30 | return vstack([x for x, _ in self.get_all_rewards_actions()]) 31 | 32 | def get_actions_matrix(self) -> ndarray: 33 | return vstack([y for _, y in self.get_all_rewards_actions()]) 34 | 35 | def get_expected_rewards(self) -> ndarray: 36 | return mean(self.get_rewards_matrix(), axis=0) 37 | 38 | def get_expected_cum_rewards(self) -> ndarray: 39 | return cumsum(self.get_expected_rewards()) 40 | 41 | def get_expected_regret(self, best_mean) -> ndarray: 42 | return full(self.time_steps, best_mean) - self.get_expected_rewards() 43 | 44 | def get_expected_cum_regret(self, best_mean) -> ndarray: 45 | return cumsum(self.get_expected_regret(best_mean)) 46 | 47 | def get_action_counts(self) -> ndarray: 48 | return vstack([bincount(ep, minlength=self.num_arms) 49 | for ep in self.get_actions_matrix()]) 50 | 51 | def get_expected_action_counts(self) -> ndarray: 52 | return mean(self.get_action_counts(), axis=0) 53 | 54 | def plot_exp_cum_regret_curve(self, best_mean) -> NoReturn: 55 | import matplotlib.pyplot as plt 56 | x_vals = range(1, self.time_steps + 1) 57 | plt.plot(self.get_expected_cum_regret(best_mean), "b", label="Exp Cum Regret") 58 | plt.xlabel("Time Steps", fontsize=20) 59 | plt.ylabel("Expected Cumulative Regret", fontsize=20) 60 | plt.title("Cumulative Regret Curve", fontsize=25) 61 | plt.xlim(xmin=x_vals[0], xmax=x_vals[-1]) 62 | plt.ylim(ymin=0.0) 63 | # plt.xticks(x_vals) 64 | plt.grid(True) 65 | # plt.legend(loc='upper left') 66 | plt.show() 67 | 68 | 69 | -------------------------------------------------------------------------------- /src/algorithms/mab/ucb1.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, List 2 | from processes.mab_env import MabEnv 3 | from operator import itemgetter 4 | from numpy import ndarray, empty, sqrt, log 5 | from algorithms.mab.mab_base import MABBase 6 | 7 | 8 | class UCB1(MABBase): 9 | 10 | def __init__( 11 | self, 12 | mab: MabEnv, 13 | time_steps: int, 14 | num_episodes: int, 15 | bounds_range: float, 16 | alpha: float 17 | ) -> None: 18 | if bounds_range < 0 or alpha <= 0: 19 | raise ValueError 20 | super().__init__( 21 | mab=mab, 22 | time_steps=time_steps, 23 | num_episodes=num_episodes 24 | ) 25 | self.bounds_range: float = bounds_range 26 | self.alpha: float = alpha 27 | 28 | def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]: 29 | ep_rewards: ndarray = empty(self.time_steps) 30 | ep_actions: ndarray = empty(self.time_steps, dtype=int) 31 | for i in range(self.num_arms): 32 | ep_rewards[i] = self.mab_funcs[i]() 33 | ep_actions[i] = i 34 | counts: List[int] = [1] * self.num_arms 35 | means: List[float] = [ep_rewards[j] for j in range(self.num_arms)] 36 | for i in range(self.num_arms, self.time_steps): 37 | ucbs: Sequence[float] = [means[j] + self.bounds_range * 38 | sqrt(0.5 * self.alpha * log(i) / counts[j]) 39 | for j in range(self.num_arms)] 40 | action: int = max(enumerate(ucbs), key=itemgetter(1))[0] 41 | reward: float = self.mab_funcs[action]() 42 | counts[action] += 1 43 | means[action] += (reward - means[action]) / counts[action] 44 | ep_rewards[i] = reward 45 | ep_actions[i] = action 46 | return ep_rewards, ep_actions 47 | 48 | 49 | if __name__ == '__main__': 50 | binomial_count = 10 51 | binomial_probs = [0.4, 0.8, 0.1, 0.5, 0.9, 0.2] 52 | binomial_params = [(binomial_count, p) for p in binomial_probs] 53 | mu_star = max(n * p for n, p in binomial_params) 54 | steps = 200 55 | episodes = 1000 56 
| this_range = binomial_count 57 | this_alpha = 4.0 58 | 59 | me = MabEnv.get_binomial_mab_env(binomial_params) 60 | ucb1 = UCB1( 61 | mab=me, 62 | time_steps=steps, 63 | num_episodes=episodes, 64 | bounds_range=this_range, 65 | alpha=this_alpha 66 | ) 67 | exp_cum_regret = ucb1.get_expected_cum_regret(mu_star) 68 | print(exp_cum_regret) 69 | 70 | exp_act_count = ucb1.get_expected_action_counts() 71 | print(exp_act_count) 72 | 73 | ucb1.plot_exp_cum_regret_curve(mu_star) 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /src/algorithms/func_approx_spec.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Sequence, NamedTuple, Optional, Tuple 2 | from func_approx.dnn_spec import DNNSpec 3 | from func_approx.func_approx_base import FuncApproxBase 4 | from func_approx.linear_approx import LinearApprox 5 | from func_approx.dnn import DNN 6 | from utils.generic_typevars import S, A 7 | 8 | 9 | class FuncApproxSpec(NamedTuple): 10 | state_feature_funcs: Sequence[Callable[[S], float]] 11 | sa_feature_funcs: Sequence[Callable[[Tuple[S, A]], float]] 12 | dnn_spec: Optional[DNNSpec] 13 | reglr_coeff: float = 0. 14 | learning_rate: float = 0.1 15 | adam_params: Tuple[bool, float, float] = (True, 0.9, 0.99) 16 | add_unit_feature: bool = True 17 | 18 | def get_vf_func_approx_obj(self) -> FuncApproxBase: 19 | if self.dnn_spec is None: 20 | ret = LinearApprox( 21 | feature_funcs=self.state_feature_funcs, 22 | reglr_coeff=self.reglr_coeff, 23 | learning_rate=self.learning_rate, 24 | adam=self.adam_params[0], 25 | adam_decay1=self.adam_params[1], 26 | adam_decay2=self.adam_params[2], 27 | add_unit_feature=self.add_unit_feature 28 | ) 29 | else: 30 | ret = DNN( 31 | feature_funcs=self.state_feature_funcs, 32 | dnn_obj=self.dnn_spec, 33 | reglr_coeff=self.reglr_coeff, 34 | learning_rate=self.learning_rate, 35 | adam=self.adam_params[0], 36 | adam_decay1=self.adam_params[1], 37 | adam_decay2=self.adam_params[2], 38 | add_unit_feature=self.add_unit_feature 39 | ) 40 | return ret 41 | 42 | def get_qvf_func_approx_obj(self) -> FuncApproxBase: 43 | if self.dnn_spec is None: 44 | ret = LinearApprox( 45 | feature_funcs=self.sa_feature_funcs, 46 | reglr_coeff=self.reglr_coeff, 47 | learning_rate=self.learning_rate, 48 | adam=self.adam_params[0], 49 | adam_decay1=self.adam_params[1], 50 | adam_decay2=self.adam_params[2], 51 | add_unit_feature=self.add_unit_feature 52 | ) 53 | else: 54 | ret = DNN( 55 | feature_funcs=self.sa_feature_funcs, 56 | dnn_obj=self.dnn_spec, 57 | reglr_coeff=self.reglr_coeff, 58 | learning_rate=self.learning_rate, 59 | adam=self.adam_params[0], 60 | adam_decay1=self.adam_params[1], 61 | adam_decay2=self.adam_params[2], 62 | add_unit_feature=self.add_unit_feature 63 | ) 64 | return ret 65 | -------------------------------------------------------------------------------- /src/algorithms/mab/gradient_bandits.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, List 2 | from processes.mab_env import MabEnv 3 | from operator import itemgetter 4 | from numpy import ndarray, empty, exp 5 | from numpy.random import choice 6 | from algorithms.mab.mab_base import MABBase 7 | 8 | 9 | class GradientBandits(MABBase): 10 | 11 | def __init__( 12 | self, 13 | mab: MabEnv, 14 | time_steps: int, 15 | num_episodes: int, 16 | learning_rate: float, 17 | learning_rate_decay: float 18 | ) -> None: 19 | if learning_rate <= 0 or 
learning_rate_decay <= 0: 20 | raise ValueError 21 | super().__init__( 22 | mab=mab, 23 | time_steps=time_steps, 24 | num_episodes=num_episodes 25 | ) 26 | self.learning_rate: float = learning_rate 27 | self.learning_rate_decay: float = learning_rate_decay 28 | 29 | def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]: 30 | ep_rewards: ndarray = empty(self.time_steps) 31 | ep_actions: ndarray = empty(self.time_steps, dtype=int) 32 | scores: List[float] = [0.] * self.num_arms 33 | avg_reward: float = 0. 34 | 35 | for i in range(self.time_steps): 36 | max_score: float = max(scores) 37 | exp_scores: Sequence[float] = [exp(s - max_score) for s in scores] 38 | sum_exp_scores = sum(exp_scores) 39 | probs: Sequence[float] = [s / sum_exp_scores for s in exp_scores] 40 | action: int = choice(self.num_arms, p=probs) 41 | reward: float = self.mab_funcs[action]() 42 | avg_reward += (reward - avg_reward) / (i + 1) 43 | step_size: float = self.learning_rate *\ 44 | (i / self.learning_rate_decay + 1) ** -0.5 45 | for j in range(self.num_arms): 46 | scores[j] += step_size * (reward - avg_reward) *\ 47 | ((1 if j == action else 0) - probs[j]) 48 | 49 | ep_rewards[i] = reward 50 | ep_actions[i] = action 51 | return ep_rewards, ep_actions 52 | 53 | 54 | if __name__ == '__main__': 55 | mean_vars_data = [(9., 5.), (10., 2.), (0., 4.), (6., 10.), (2., 20.), (4., 1.)] 56 | mu_star = max(mean_vars_data, key=itemgetter(0))[0] 57 | steps = 200 58 | episodes = 1000 59 | lr = 0.1 60 | lr_decay = 20.0 61 | 62 | me = MabEnv.get_gaussian_mab_env(mean_vars_data) 63 | ucb1 = GradientBandits( 64 | mab=me, 65 | time_steps=steps, 66 | num_episodes=episodes, 67 | learning_rate=lr, 68 | learning_rate_decay=lr_decay 69 | ) 70 | exp_cum_regret = ucb1.get_expected_cum_regret(mu_star) 71 | print(exp_cum_regret) 72 | 73 | exp_act_count = ucb1.get_expected_action_counts() 74 | print(exp_act_count) 75 | 76 | ucb1.plot_exp_cum_regret_curve(mu_star) 77 | -------------------------------------------------------------------------------- /src/examples/exam_problems/wage_max.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, NamedTuple 2 | import numpy as np 3 | 4 | 5 | class WageMax(NamedTuple): 6 | 7 | probs: Sequence[float] 8 | wages: Sequence[float] 9 | gamma: float 10 | alpha: float 11 | risk_aversion: float 12 | 13 | def validate_inputs(self) -> bool: 14 | b1 = abs(sum(self.probs) - 1) <= 1e-8 15 | b2 = len(self.probs) + 1 == len(self.wages) 16 | b3 = all(self.wages[0] < w for w in self.wages[1:]) 17 | b4 = 0. <= self.gamma < 1. 18 | b5 = 0. <= self.alpha <= 1. 19 | b6 = self.risk_aversion > 0. 20 | return all([b1, b2, b3, b4, b5, b6]) 21 | 22 | # noinspection PyShadowingNames 23 | def get_wages_utility(self) -> Sequence[float]: 24 | a = self.risk_aversion 25 | f = (lambda x, a=a: (pow(x, 1 - a) - 1) / (1 - a)) \ 26 | if a != 1 else (lambda x: np.log(x)) 27 | return [f(w) for w in self.wages] 28 | 29 | def get_opt_vf(self) -> Sequence[float]: 30 | jobs = len(self.probs) 31 | utils = self.get_wages_utility() 32 | vf = [0.] 
* (jobs + 1) 33 | tol = 1e-6 34 | epsilon = tol * 1e6 35 | while epsilon >= tol: 36 | old_vf = [v for v in vf] 37 | vf[0] = sum(self.probs[i] * max( 38 | vf[i + 1], 39 | utils[0] + self.gamma * vf[0] 40 | ) for i in range(jobs)) 41 | for i in range(1, jobs + 1): 42 | vf[i] = utils[i] + self.gamma *\ 43 | (self.alpha * vf[0] + (1 - self.alpha) * vf[i]) 44 | epsilon = max(abs(old_vf[i] - v) for i, v in enumerate(vf)) 45 | return vf 46 | 47 | def get_opt_policy(self) -> Sequence[str]: 48 | jobs = len(self.probs) 49 | utils = self.get_wages_utility() 50 | vf = self.get_opt_vf() 51 | return ["Accept" if vf[i] > utils[0] + self.gamma * vf[0] 52 | else "Decline" for i in range(1, jobs + 1)] 53 | 54 | 55 | if __name__ == '__main__': 56 | this_probs: Sequence[float] = [0.5, 0.3, 0.2] 57 | this_wages: Sequence[float] = [1.0, 1.8, 2.8, 5.2] 58 | this_gamma: float = 0.9 59 | this_alpha: float = 0.2 60 | this_risk_aversion: float = 1.0 61 | # all_jobs = 10 62 | # this_probs: Sequence[float] = [1. / all_jobs] * all_jobs 63 | # this_wages: Sequence[float] = [i + 1 for i in range(all_jobs + 1)] 64 | # this_gamma: float = 0.5 65 | # this_alpha: float = 0.1 66 | # this_risk_aversion: float = 0.5 67 | wm = WageMax( 68 | probs=this_probs, 69 | wages=this_wages, 70 | gamma=this_gamma, 71 | alpha=this_alpha, 72 | risk_aversion=this_risk_aversion 73 | ) 74 | if not wm.validate_inputs(): 75 | raise ValueError 76 | opt_vf = wm.get_opt_vf() 77 | opt_policy = wm.get_opt_policy() 78 | print(opt_vf) 79 | print(opt_policy) -------------------------------------------------------------------------------- /src/algorithms/mab/epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | from typing import List, Callable, Tuple 2 | from processes.mab_env import MabEnv 3 | from algorithms.helper_funcs import get_epsilon_decay_func 4 | from operator import itemgetter 5 | from numpy.random import binomial, randint 6 | from numpy import ndarray, empty 7 | from algorithms.mab.mab_base import MABBase 8 | 9 | 10 | class EpsilonGreedy(MABBase): 11 | 12 | def __init__( 13 | self, 14 | mab: MabEnv, 15 | time_steps: int, 16 | num_episodes: int, 17 | epsilon: float, 18 | epsilon_half_life: float = 1e8, 19 | count_init: int = 0, 20 | mean_init: float = 0., 21 | ) -> None: 22 | if epsilon < 0 or epsilon > 1 or epsilon_half_life <= 1 or count_init < 0: 23 | raise ValueError 24 | 25 | super().__init__( 26 | mab=mab, 27 | time_steps=time_steps, 28 | num_episodes=num_episodes 29 | ) 30 | self.epsilon_func: Callable[[int], float] = get_epsilon_decay_func( 31 | epsilon, 32 | epsilon_half_life 33 | ) 34 | self.count_init: int = count_init 35 | self.mean_init: float = mean_init 36 | 37 | def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]: 38 | counts: List[int] = [self.count_init] * self.num_arms 39 | means: List[float] = [self.mean_init] * self.num_arms 40 | ep_rewards: ndarray = empty(self.time_steps) 41 | ep_actions: ndarray = empty(self.time_steps, dtype=int) 42 | for i in range(self.time_steps): 43 | max_action: int = max(enumerate(means), key=itemgetter(1))[0] 44 | epsl: float = self.epsilon_func(i) 45 | action: int = max_action if binomial(1, epsl, size=1)[0] == 0 else\ 46 | randint(self.num_arms, size=1)[0] 47 | reward: float = self.mab_funcs[action]() 48 | counts[action] += 1 49 | means[action] += (reward - means[action]) / counts[action] 50 | ep_rewards[i] = reward 51 | ep_actions[i] = action 52 | return ep_rewards, ep_actions 53 | 54 | 55 | if __name__ == '__main__': 56 | 
mean_vars_data = [(9., 5.), (10., 2.), (0., 4.), (6., 10.), (2., 20.), (4., 1.)] 57 | mu_star = max(mean_vars_data, key=itemgetter(0))[0] 58 | steps = 200 59 | episodes = 1000 60 | eps = 0.2 61 | eps_hl = 50 62 | ci = 5 63 | mi = mu_star * 3. 64 | 65 | me = MabEnv.get_gaussian_mab_env(mean_vars_data) 66 | eg = EpsilonGreedy( 67 | mab=me, 68 | time_steps=steps, 69 | num_episodes=episodes, 70 | epsilon=eps, 71 | epsilon_half_life=eps_hl, 72 | count_init=ci, 73 | mean_init=mi 74 | ) 75 | exp_cum_regret = eg.get_expected_cum_regret(mu_star) 76 | print(exp_cum_regret) 77 | 78 | exp_act_count = eg.get_expected_action_counts() 79 | print(exp_act_count) 80 | 81 | eg.plot_exp_cum_regret_curve(mu_star) 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /src/algorithms/dp/dp_base.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set 2 | from abc import abstractmethod 3 | from algorithms.tabular_base import TabularBase 4 | from processes.policy import Policy 5 | from processes.det_policy import DetPolicy 6 | from processes.mdp import MDP 7 | from operator import itemgetter 8 | from algorithms.helper_funcs import get_uniform_policy 9 | from algorithms.helper_funcs import get_det_policy_from_qf_dict 10 | from utils.generic_typevars import S, A 11 | from utils.standard_typevars import VFDictType, QFDictType 12 | 13 | 14 | class DPBase(TabularBase): 15 | 16 | def __init__(self, mdp_obj: MDP, tol: float) -> None: 17 | self.mdp_obj: MDP = mdp_obj 18 | self.tol = tol 19 | 20 | def get_state_action_dict(self) -> Mapping[S, Set[A]]: 21 | return self.mdp_obj.state_action_dict 22 | 23 | def get_init_policy(self) -> Policy: 24 | return get_uniform_policy(self.mdp_obj.state_action_dict) 25 | 26 | @abstractmethod 27 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 28 | pass 29 | 30 | def get_improved_det_policy(self, pol: Policy) -> DetPolicy: 31 | return get_det_policy_from_qf_dict(self.get_act_value_func_dict(pol)) 32 | 33 | def get_act_value_func_dict(self, pol: Policy) -> QFDictType: 34 | v_dict = self.get_value_func_dict(pol) 35 | mo = self.mdp_obj 36 | return {s: {a: r + mo.gamma * 37 | sum(p * v_dict[s1] for s1, p in 38 | mo.transitions[s][a].items()) for a, r in v.items()} 39 | for s, v in mo.rewards.items()} 40 | 41 | def get_optimal_policy_pi(self) -> DetPolicy: 42 | pol = self.get_init_policy() 43 | vf = self.get_value_func_dict(pol) 44 | epsilon = self.tol * 1e4 45 | while epsilon >= self.tol: 46 | pol = self.get_improved_det_policy(pol) 47 | new_vf = self.get_value_func_dict(pol) 48 | epsilon = max(abs(new_vf[s] - v) for s, v in vf.items()) 49 | vf = new_vf 50 | return pol 51 | 52 | def get_optimal_policy_vi(self) -> DetPolicy: 53 | vf = {s: 0. 
for s in self.mdp_obj.all_states} 54 | epsilon = self.tol * 1e4 55 | mo = self.mdp_obj 56 | while epsilon >= self.tol: 57 | new_vf = {s: max(r + mo.gamma * sum(p * vf[s1] for s1, p in 58 | mo.transitions[s][a].items()) 59 | for a, r in v.items()) 60 | for s, v in mo.rewards.items()} 61 | epsilon = max(abs(new_vf[s] - v) for s, v in vf.items()) 62 | vf = new_vf 63 | pol = DetPolicy({s: max( 64 | [(a, r + mo.gamma * sum(p * vf[s1] 65 | for s1, p in mo.transitions[s][a].items())) 66 | for a, r in v.items()], 67 | key=itemgetter(1) 68 | )[0] for s, v in mo.rewards.items()}) 69 | return pol 70 | 71 | @abstractmethod 72 | def get_optimal_det_policy(self) -> DetPolicy: 73 | pass 74 | 75 | -------------------------------------------------------------------------------- /src/examples/american_pricing/bs_pricing.py: -------------------------------------------------------------------------------- 1 | from scipy.stats import norm 2 | import numpy as np 3 | from typing import Mapping, Tuple 4 | 5 | 6 | class EuropeanBSPricing: 7 | 8 | def __init__( 9 | self, 10 | is_call: bool, 11 | spot_price: float, 12 | strike: float, 13 | expiry: float, 14 | r: float, 15 | sigma: float 16 | ) -> None: 17 | self.is_call: bool = is_call 18 | self.spot_price: float = spot_price 19 | self.strike: float = strike 20 | self.expiry: float = expiry 21 | self.r: float = r 22 | self.sigma: float = sigma 23 | self.option_price: float = self.get_option_price() 24 | self.greeks: Mapping[str, float] = self.get_greeks() 25 | 26 | def get_d1_d2(self) -> Tuple[float, float]: 27 | sigma_sqrt = self.sigma * np.sqrt(self.expiry) 28 | d1 = (np.log(self.spot_price / self.strike) + 29 | (self.r + self.sigma ** 2 / 2.) * self.expiry) / sigma_sqrt 30 | d2 = d1 - sigma_sqrt 31 | return d1, d2 32 | 33 | def get_option_price(self) -> float: 34 | d1, d2 = self.get_d1_d2() 35 | if self.is_call: 36 | ret = self.spot_price * norm.cdf(d1) -\ 37 | self.strike * np.exp(-self.r * self.expiry) * norm.cdf(d2) 38 | else: 39 | ret = self.strike * np.exp(-self.r * self.expiry) * norm.cdf(-d2)\ 40 | - self.spot_price * norm.cdf(-d1) 41 | return ret 42 | 43 | def get_greeks(self) -> Mapping[str, float]: 44 | d1, d2 = self.get_d1_d2() 45 | sqrtt = np.sqrt(self.expiry) 46 | 47 | gamma = norm.pdf(d1) / (self.spot_price * self.sigma * sqrtt) 48 | vega = self.spot_price * sqrtt * norm.pdf(d1) 49 | rho_temp = -self.strike * self.expiry * np.exp(-self.r * self.expiry) 50 | theta_temp1 = (self.spot_price * self.sigma * norm.pdf(d1)) / (2 * sqrtt) 51 | theta_temp2 = self.r * self.strike * np.exp(-self.r * self.expiry) 52 | 53 | if self.is_call: 54 | delta = norm.cdf(d1) 55 | theta = - theta_temp1 - theta_temp2 * norm.cdf(d2) 56 | rho = rho_temp * norm.cdf(d2) 57 | else: 58 | delta = -norm.cdf(-d1) 59 | theta = - theta_temp1 + theta_temp2 * norm.cdf(-d2) 60 | rho = rho_temp * norm.cdf(-d2) 61 | 62 | return { 63 | "Delta": delta, 64 | "Gamma": gamma, 65 | "Theta": theta, 66 | "Vega": vega, 67 | "Rho": rho 68 | } 69 | 70 | 71 | if __name__ == "__main__": 72 | is_call_val = False 73 | spot_price_val = 80.0 74 | strike_val = 78.0 75 | expiry_val = 2.0 76 | r_val = 0.02 77 | sigma_val = 0.25 78 | opt_obj = EuropeanBSPricing( 79 | is_call=is_call_val, 80 | spot_price=spot_price_val, 81 | strike=strike_val, 82 | expiry=expiry_val, 83 | r=r_val, 84 | sigma=sigma_val 85 | ) 86 | print(opt_obj.option_price) 87 | print(opt_obj.greeks) 88 | -------------------------------------------------------------------------------- /src/algorithms/mab/ts_gaussian.py: 
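A quick sanity check for the EuropeanBSPricing class above is put-call parity: for the same spot S, strike K, expiry T and rate r, the call and put prices must satisfy C - P = S - K * exp(-r * T). The snippet below is only an illustrative sketch (not part of the repository); it reuses the parameter values from the file's __main__ block and assumes the repository root is on the Python path, so that the same import used in num_utils.py works.

import numpy as np
from src.examples.american_pricing.bs_pricing import EuropeanBSPricing

spot, strike, expiry, r, sigma = 80.0, 78.0, 2.0, 0.02, 0.25
call_price = EuropeanBSPricing(True, spot, strike, expiry, r, sigma).option_price
put_price = EuropeanBSPricing(False, spot, strike, expiry, r, sigma).option_price
# Put-call parity: C - P should equal S - K * exp(-r * T)
assert abs((call_price - put_price) - (spot - strike * np.exp(-r * expiry))) < 1e-8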
-------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, List 2 | from processes.mab_env import MabEnv 3 | from operator import itemgetter 4 | from numpy import ndarray, empty, sqrt 5 | from numpy.random import gamma, normal 6 | from algorithms.mab.mab_base import MABBase 7 | 8 | 9 | class ThompsonSamplingGaussian(MABBase): 10 | 11 | def __init__( 12 | self, 13 | mab: MabEnv, 14 | time_steps: int, 15 | num_episodes: int, 16 | init_mean: float, 17 | init_stdev: float 18 | ) -> None: 19 | if init_stdev <= 0: 20 | raise ValueError 21 | super().__init__( 22 | mab=mab, 23 | time_steps=time_steps, 24 | num_episodes=num_episodes 25 | ) 26 | self.mu0: float = init_mean 27 | self.n0: int = 1 28 | self.alpha0: float = 1 29 | self.beta0: float = init_stdev * init_stdev 30 | 31 | def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]: 32 | # Bayesian update based on the treatment in 33 | # https://people.eecs.berkeley.edu/~jordan/courses/260-spring10/lectures/lecture5.pdf 34 | # (Section 3 on page 5, where both the mean and the variance are random) 35 | ep_rewards: ndarray = empty(self.time_steps) 36 | ep_actions: ndarray = empty(self.time_steps, dtype=int) 37 | bayes: List[Tuple[float, int, float, float]] =\ 38 | [(self.mu0, self.n0, self.alpha0, self.beta0)] * self.num_arms 39 | 40 | for i in range(self.time_steps): 41 | mean_draws: Sequence[float] = [normal( 42 | mu, 43 | 1 / sqrt(n * gamma(alpha, 1 / beta, 1)[0]), 44 | 1 45 | )[0] for mu, n, alpha, beta in bayes] 46 | action: int = max(enumerate(mean_draws), key=itemgetter(1))[0] 47 | reward: float = self.mab_funcs[action]() 48 | mu, n, alpha, beta = bayes[action] 49 | bayes[action] = ( 50 | (reward + n * mu) / (n + 1), 51 | n + 1, 52 | alpha + 0.5, 53 | beta + 0.5 * n / (n + 1) * (reward - mu) * (reward - mu) 54 | ) 55 | ep_rewards[i] = reward 56 | ep_actions[i] = action 57 | return ep_rewards, ep_actions 58 | 59 | 60 | if __name__ == '__main__': 61 | mean_vars_data = [(9., 5.), (10., 2.), (0., 4.), (6., 10.), (2., 20.), (4., 1.)] 62 | mu_star = max(mean_vars_data, key=itemgetter(0))[0] 63 | steps = 200 64 | episodes = 1000 65 | guess_mean = 0. 66 | guess_stdev = 10. 
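    # Illustrative sketch with demo-only variables (not part of the original
    # script): the per-arm draw in get_episode_rewards_actions above samples a
    # precision from Gamma(alpha, scale=1/beta) and then a mean from
    # Normal(mu, 1/sqrt(n * precision)); with the prior set up in __init__,
    # (mu0, n0, alpha0, beta0) = (guess_mean, 1, 1, guess_stdev ** 2).
    demo_precision = gamma(1.0, 1.0 / (guess_stdev * guess_stdev), 1)[0]
    demo_mean_draw = normal(guess_mean, 1.0 / sqrt(1 * demo_precision), 1)[0]
    print((demo_mean_draw, demo_precision))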
67 | 68 | me = MabEnv.get_gaussian_mab_env(mean_vars_data) 69 | ucb1 = ThompsonSamplingGaussian( 70 | mab=me, 71 | time_steps=steps, 72 | num_episodes=episodes, 73 | init_mean=guess_mean, 74 | init_stdev=guess_stdev 75 | ) 76 | exp_cum_regret = ucb1.get_expected_cum_regret(mu_star) 77 | print(exp_cum_regret) 78 | 79 | exp_act_count = ucb1.get_expected_action_counts() 80 | print(exp_act_count) 81 | 82 | ucb1.plot_exp_cum_regret_curve(mu_star) 83 | -------------------------------------------------------------------------------- /src/algorithms/backward_adp.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Sequence, Tuple, Generic, Callable 2 | from utils.gen_utils import is_approx_eq 3 | from utils.generic_typevars import S, A 4 | from operator import itemgetter 5 | from algorithms.func_approx_spec import FuncApproxSpec 6 | from func_approx.func_approx_base import FuncApproxBase 7 | 8 | 9 | class BackwardADP(Generic[S, A]): 10 | 11 | def __init__( 12 | self, 13 | state_actions_funcs: Sequence[Callable[[S], Set[A]]], 14 | sample_states_gen_funcs: Sequence[Callable[[int], Sequence[S]]], 15 | transitions_rewards_funcs: Sequence[Callable[[S, A], Mapping[S, Tuple[float, float]]]], 16 | terminal_opt_val_func: Callable[[S], float], 17 | gamma: float, 18 | fa_specs: Sequence[FuncApproxSpec] 19 | ) -> None: 20 | if (len(state_actions_funcs) == len(sample_states_gen_funcs)\ 21 | == len(transitions_rewards_funcs) == len(fa_specs))\ 22 | and 0. <= gamma <= 1.: 23 | self.state_actions_funcs = state_actions_funcs 24 | self.sample_states_gen_funcs = sample_states_gen_funcs 25 | self.transitions_rewards_funcs = transitions_rewards_funcs 26 | self.terminal_opt_val_func = terminal_opt_val_func 27 | self.gamma = gamma 28 | self.fas: Sequence[FuncApproxBase] = [x.get_vf_func_approx_obj() for x in fa_specs] 29 | self.vf_and_policy_func = self.get_vf_and_policy_func() 30 | else: 31 | raise ValueError 32 | 33 | 34 | def get_vf_and_policy_func(self) -> Sequence[Mapping[S, Tuple[float, A]]]: 35 | vf_pol = {s: (v, None) for s, v in self.terminal_opt_val.items()} 36 | ret = [] 37 | for tr in self.transitions_rewards[::-1]: 38 | vf_pol = {s: max( 39 | [( 40 | sum(p * (r + self.gamma * vf_pol[s1][0]) 41 | for s1, (p, r) in d1.items()), 42 | a 43 | ) for a, d1 in d.items()], 44 | key=itemgetter(0) 45 | ) for s, d in tr.items()} 46 | ret.append(vf_pol) 47 | return ret[::-1] 48 | 49 | 50 | if __name__ == '__main__': 51 | from scipy.stats import poisson 52 | T: int = 10 # time steps 53 | M: int = 200 # initial inventory 54 | # the following are (price, poisson mean) pairs, i.e., elasticity 55 | el: Sequence[Tuple[float, float]] = [ 56 | (10.0, 10.0), (9.0, 16.0), (8.0, 20.0), 57 | (7.0, 23.0), (6.0, 25.0), (5.0, 26.0) 58 | ] 59 | rvs = [(p, poisson(l)) for p, l in el] 60 | 61 | tr_rew_dict = { 62 | s: { 63 | p: { 64 | s - d: ( 65 | rv.pmf(d) if d < s else 1. - rv.cdf(s - 1), 66 | d * p 67 | ) for d in range(s + 1) 68 | } for p, rv in rvs 69 | } for s in range(M + 1) 70 | } 71 | 72 | bdp = BackwardADP( 73 | transitions_rewards=[tr_rew_dict] * T, 74 | terminal_opt_val={s: 0. for s in range(M + 1)}, 75 | gamma=1. 
76 | ) 77 | print(bdp.vf_and_policy[0]) 78 | -------------------------------------------------------------------------------- /src/utils/gen_utils.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from typing import Mapping, TypeVar, Tuple, Sequence, List 3 | 4 | FlattenedDict = List[Tuple[Tuple, float]] 5 | 6 | X = TypeVar('X') 7 | Y = TypeVar('Y') 8 | Z = TypeVar('Z') 9 | 10 | epsilon = 1e-8 11 | 12 | 13 | def memoize(func): 14 | cache = func.cache = {} 15 | 16 | @functools.wraps(func) 17 | def memoized_func(*args, **kwargs): 18 | key = str(args) + str(kwargs) 19 | if key not in cache: 20 | cache[key] = func(*args, **kwargs) 21 | return cache[key] 22 | return memoized_func 23 | 24 | 25 | def zip_dict_of_tuple(d: Mapping[X, Tuple[Y, Z]])\ 26 | -> Tuple[Mapping[X, Y], Mapping[X, Z]]: 27 | d1 = {k: v1 for k, (v1, _) in d.items()} 28 | d2 = {k: v2 for k, (_, v2) in d.items()} 29 | return d1, d2 30 | 31 | 32 | def sum_dicts(dicts: Sequence[Mapping[X, float]]) -> Mapping[X, float]: 33 | return {k: sum(d.get(k, 0) for d in dicts) 34 | for k in set.union(*[set(d1) for d1 in dicts])} 35 | 36 | 37 | def is_approx_eq(a: float, b: float) -> bool: 38 | return abs(a - b) <= epsilon 39 | 40 | 41 | def transpose_dict_of_dicts(d: Mapping[X, Mapping[Y, Z]])\ 42 | -> Mapping[Y, Mapping[X, Z]]: 43 | """ 44 | Returns the transposed dictionary of dictionaries. 45 | Works on irregularly shaped (non-rectangular) dicts of dicts 46 | """ 47 | all_y = set(y for _, di in d.items() for y, _ in di.items()) 48 | return {y: {x: val for x, di in d.items() 49 | for y1, val in di.items() if y1 == y} for y in all_y} 50 | 51 | 52 | def transpose_dict_of_lists(d: Mapping[X, Sequence[Y]])\ 53 | -> Sequence[Mapping[X, Y]]: 54 | """ 55 | Returns the transposed list of dictionaries. 56 | Works on irregularly shaped (non-rectangular) dicts of lists 57 | """ 58 | max_len = max(len(l) for _, l in d.items()) 59 | return [{k: l[i] for k, l in d.items() if i < len(l)} 60 | for i in range(max_len)] 61 | 62 | 63 | def transpose_list_of_dicts(l: Sequence[Mapping[X, Y]])\ 64 | -> Mapping[X, Sequence[Y]]: 65 | """ 66 | Returns the transposed dictionary of lists. 67 | Works on irregularly shaped (non-rectangular) lists of dicts 68 | Will 'compress' the result on irregularly shaped input 69 | """ 70 | all_k = set(k for d in l for k, _ in d.items()) 71 | return {k: [val for d in l for k1, val in d.items() 72 | if k1 == k] for k in all_k} 73 | 74 | 75 | def transpose_list_of_lists(l: Sequence[Sequence[X]]) -> Sequence[Sequence[X]]: 76 | """ 77 | Returns the transposed list of lists. 
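    For example, [[1, 2, 3], [4, 5]] transposes to [[1, 4], [2, 5], [3]].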
78 | Works on irregularly shaped (non-rectangular) lists of lists 79 | Will 'compress' the result on irregularly shaped input 80 | """ 81 | max_len = max(len(lin) for lin in l) 82 | return [[lin[i] for lin in l if i < len(lin)] for i in range(max_len)] 83 | 84 | 85 | def merge_dicts(d1: FlattenedDict, d2: FlattenedDict, operation): 86 | merged = d1 + d2 87 | from itertools import groupby 88 | from operator import itemgetter 89 | from functools import reduce 90 | sortd = sorted(merged, key=itemgetter(0)) 91 | grouped = groupby(sortd, key=itemgetter(0)) 92 | return [(key, reduce(operation, [x for _, x in group])) for key, group in grouped] 93 | 94 | -------------------------------------------------------------------------------- /src/utils/standard_typevars.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Sequence, Mapping, Tuple 2 | from utils.generic_typevars import S, A 3 | 4 | """ 5 | VFType (= Callable[[S], float]) is the type that represents a value 6 | function for the most general situation. Instead of thinking of a value 7 | function as a dictionary from states to returns, think of a value function 8 | as a function from states to returns. This representation works 9 | for all forms of MDPs, discrete/finite or continuous state spaces. 10 | 11 | QFType (= Callable[[S], Callable[[A], float]]) is the type that represents 12 | the action value function (or Q function) for the most general situation. 13 | Instead of thinking of a Q Function as a dictionary from state, action pairs 14 | to returns, think of a Q Function as a function from states to {functions 15 | from actions to returns}. This representation works for all forms of MDPs, 16 | discrete/finite or continuous state spaces and action spaces. 17 | 18 | PolicyType (= Callable[[S], Callable[[int], Sequence[A]]]) is the type 19 | that represents a stochastic policy for the most general situation. Instead 20 | of thinking of a policy as a dictionary from states to {dictionary from 21 | actions to probabilities}, think of a policy as a function from states to 22 | probability distributions where a probability distribution has the most 23 | general representation (that would work for discrete/finite or continuous action spaces). 24 | This general representation of a probability distribution is a function 25 | that takes as input the number of action samples and produces as output 26 | a sequence of actions drawn from that probability distribution. In other 27 | words, we can make the probability distribution as fine or coarse as we 28 | want by controlling the input to this function (the requested number of 29 | sample points). 30 | 31 | VFDictType (= Mapping[S, float]) is the type that represents a value function 32 | for a finite set of states and hence, is represented as a data structure rather 33 | than a function. One can always produce a VFType from a VFDictType by wrapping 34 | the dictionary with a function. 35 | 36 | QFDictType (= Mapping[S, Mapping[A, float]]) is the type that represents an 37 | action value function (or Q function) for a finite set of states and actions. 38 | Hence, it is represented as a data structure rather than as a function. 39 | One can always produce a QFType from a QFDictType by wrapping the dictionary 40 | of dictionaries with a function returning a function. 41 | 42 | PolicyActDictType (= Callable[[S], Mapping[A, float]]) is the type that 43 | represents a policy for arbitrary state space and finite action spaces.
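For illustration (the states 's1', 's2' below are hypothetical): a VFDictType
such as vf_dict = {'s1': 1.0, 's2': 2.5} is lifted to a VFType with
vf = lambda s: vf_dict[s], and a QFDictType qf_dict is lifted to a QFType with
qf = lambda s: (lambda a: qf_dict[s][a]).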
44 | 45 | The S*f types are types required for tabular methods which work with 46 | nested dictionaries (rather than functions) 47 | """ 48 | 49 | VFType = Callable[[S], float] 50 | QFType = Callable[[S], Callable[[A], float]] 51 | PolicyType = Callable[[S], Callable[[int], Sequence[A]]] 52 | 53 | VFDictType = Mapping[S, float] 54 | QFDictType = Mapping[S, Mapping[A, float]] 55 | PolicyActDictType = Callable[[S], Mapping[A, float]] 56 | 57 | SSf = Mapping[S, Mapping[S, float]] 58 | SSTff = Mapping[S, Mapping[S, Tuple[float, float]]] 59 | STSff = Mapping[S, Tuple[Mapping[S, float], float]] 60 | SAf = Mapping[S, Mapping[A, float]] 61 | SASf = Mapping[S, Mapping[A, Mapping[S, float]]] 62 | SASTff = Mapping[S, Mapping[A, Mapping[S, Tuple[float, float]]]] 63 | SATSff = Mapping[S, Mapping[A, Tuple[Mapping[S, float], float]]] 64 | 65 | -------------------------------------------------------------------------------- /src/func_approx/eligibility_traces.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Callable 2 | from scipy.linalg import toeplitz 3 | import numpy as np 4 | 5 | 6 | def get_decay_toeplitz_matrix( 7 | size: int, 8 | decay_param: float 9 | ) -> np.ndarray: 10 | return toeplitz( 11 | np.power(decay_param, np.arange(size)), 12 | np.insert(np.zeros(size - 1), 0, 1.) 13 | ) 14 | 15 | 16 | # noinspection PyPep8Naming 17 | def get_generalized_back_prop( 18 | dnn_params: Sequence[np.ndarray], 19 | fwd_prop: Sequence[np.ndarray], 20 | dObj_dOL: np.ndarray, 21 | factors: np.ndarray, 22 | decay_param: float, 23 | hidden_activation_deriv: Callable[[np.ndarray], np.ndarray], 24 | output_activation_deriv: Callable[[np.ndarray], np.ndarray] 25 | ) -> Sequence[np.ndarray]: 26 | """ 27 | :param dnn_params: list (of length L+1) of |O_l| x (|I_l| + 1) 2-D arrays 28 | :param fwd_prop: list (of length L+2), the first (L+1) elements are 29 | n x (|I_l| + 1) 2-D arrays representing the inputs to the (L+1) layers, 30 | and the last element is an n x 1 2-D array 31 | :param dObj_dOL: 1-D array of length n 32 | :param factors: 1-D array of length n 33 | :param decay_param: [0,1] float representing decay in time 34 | :param hidden_activation_deriv: function representing the derivative 35 | of the hidden layer activation function (expressed as a function of the 36 | output of the hidden layer activation function). 37 | :param output_activation_deriv: function representing the derivative 38 | of the output layer activation function (expressed as a function of the 39 | output of the output layer activation function).
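    (In effect, this computes eligibility-trace-weighted gradient sums: each
    time point's per-layer gradient is combined with the decayed gradients of
    the earlier time points via the Toeplitz matrix built by
    get_decay_toeplitz_matrix, and the result is then weighted by factors.)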
40 | L is the number of hidden layers, n is the number of points 41 | :return: list (of length L+1) of |O_l| x (|I_l| + 1) 2-D arrays, 42 | i.e., same as the type of self.params 43 | """ 44 | output = fwd_prop[-1][:, 0] 45 | layer_inputs = fwd_prop[:-1] 46 | # deriv initialized to 1 x n = |O_L| x n 2-D array 47 | deriv = (dObj_dOL * output_activation_deriv(output)).reshape(1, -1) 48 | decay_matrix = get_decay_toeplitz_matrix(len(factors), decay_param) 49 | back_prop = [] 50 | for l in reversed(range(len(dnn_params))): 51 | # layer l gradient is factors tensordot (decay_matrix tensordot 52 | # (deriv_l einsum layer_inputs_l) which is of dimension 53 | # n tensordot ((n x n) tensordot ((|O_l| x n) einsum (n x (|I_l| + 1))) 54 | # = n tensordot ((n x n) tensordot (n x |O_l| x (|I_l| + 1))) 55 | # = n tensordot (n x |O_l| x (|I_l| + 1)) = |O_l| x (|I_l| + 1) 56 | t1 = np.einsum('ij,jk->jik', deriv, layer_inputs[l]) 57 | if decay_param != 0: 58 | t2 = np.tensordot(decay_matrix, t1, axes=1) 59 | else: 60 | t2 = t1 61 | t3 = np.tensordot(factors, t2, axes=1) 62 | back_prop.append(t3) 63 | # deriv_l is dnn_params_{l+1}^T dot deriv_{l+1} haddamard g'(S_l), which is 64 | # ((|I_{l+1}| + 1) x |O_{l+1}|) dot (|O_{l+1}| x n) haddamard 65 | # ((|I_{l+1}| + 1) x n) --- g'(S_L) is expressed as hidden layer 66 | # activation derivative as a function of O_l (=I_{l+1}). 67 | # (notice first row of the result is removed after this calculation). 68 | # So, deriv_l has dimension |I_{l+1}| x n = |O_l| x n 69 | deriv = (np.dot(dnn_params[l].T, deriv) * 70 | hidden_activation_deriv(layer_inputs[l].T))[1:] 71 | return back_prop[::-1] 72 | -------------------------------------------------------------------------------- /src/algorithms/rl_func_approx/rl_func_approx_base.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional 2 | from abc import abstractmethod 3 | from algorithms.opt_base import OptBase 4 | from processes.mdp_rep_for_rl_fa import MDPRepForRLFA 5 | from algorithms.func_approx_spec import FuncApproxSpec 6 | from func_approx.func_approx_base import FuncApproxBase 7 | from algorithms.helper_funcs import get_uniform_policy_func 8 | from algorithms.helper_funcs import get_epsilon_decay_func 9 | from algorithms.helper_funcs import get_pdf_from_samples 10 | from operator import itemgetter 11 | from utils.generic_typevars import S, A 12 | from utils.standard_typevars import VFType, QFType 13 | from utils.standard_typevars import PolicyType, PolicyActDictType 14 | 15 | 16 | class RLFuncApproxBase(OptBase): 17 | 18 | NUM_SAMPLES_PER_ACTION = 10 19 | 20 | def __init__( 21 | self, 22 | mdp_rep_for_rl: MDPRepForRLFA, 23 | exploring_start: bool, 24 | softmax: bool, 25 | epsilon: float, 26 | epsilon_half_life: float, 27 | num_episodes: int, 28 | max_steps: int, 29 | fa_spec: FuncApproxSpec 30 | ) -> None: 31 | 32 | self.mdp_rep: MDPRepForRLFA = mdp_rep_for_rl 33 | self.exploring_start: bool = exploring_start 34 | self.softmax: bool = softmax 35 | self.epsilon_func: Callable[[int], float] = get_epsilon_decay_func( 36 | epsilon, 37 | epsilon_half_life 38 | ) 39 | self.num_episodes: int = num_episodes 40 | self.max_steps: int = max_steps 41 | self.vf_fa: FuncApproxBase = fa_spec.get_vf_func_approx_obj() 42 | self.qvf_fa: FuncApproxBase = fa_spec.get_qvf_func_approx_obj() 43 | self.state_action_func = self.mdp_rep.state_action_func 44 | 45 | def get_init_policy_func(self) -> PolicyActDictType: 46 | return 
get_uniform_policy_func(self.state_action_func) 47 | 48 | def get_value_func_fa(self, polf: PolicyActDictType) -> VFType: 49 | qv_func = self.get_qv_func_fa(polf) 50 | 51 | # noinspection PyShadowingNames 52 | def vf(s: S, polf=polf, qv_func=qv_func) -> float: 53 | return sum(polf(s)[a] * qv_func(s)(a) for a in 54 | self.state_action_func(s)) 55 | 56 | return vf 57 | 58 | # noinspection PyShadowingNames 59 | def get_value_func(self, pol_func: PolicyType) -> VFType: 60 | return self.get_value_func_fa( 61 | lambda s, pol_func=pol_func: get_pdf_from_samples( 62 | pol_func(s)(len(self.state_action_func(s)) * 63 | RLFuncApproxBase.NUM_SAMPLES_PER_ACTION) 64 | ) 65 | ) 66 | 67 | @abstractmethod 68 | def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType: 69 | pass 70 | 71 | # noinspection PyShadowingNames 72 | def get_act_value_func(self, pol_func: PolicyType) -> QFType: 73 | return self.get_qv_func_fa( 74 | lambda s, pol_func=pol_func: get_pdf_from_samples( 75 | pol_func(s)(len(self.state_action_func(s)) * 76 | RLFuncApproxBase.NUM_SAMPLES_PER_ACTION) 77 | ) 78 | ) 79 | 80 | def get_optimal_det_policy_func(self) -> Callable[[S], A]: 81 | qv_func = self.get_qv_func_fa(None) 82 | 83 | # noinspection PyShadowingNames 84 | def detp_func(s: S, qv_func=qv_func) -> A: 85 | return max( 86 | [(a, qv_func(s)(a)) for a in self.state_action_func(s)], 87 | key=itemgetter(1) 88 | )[0] 89 | 90 | return detp_func 91 | -------------------------------------------------------------------------------- /src/examples/exam_problems/W2021/career_optimization.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Mapping, Dict, Sequence, Iterable 2 | from rl.markov_decision_process import FiniteMarkovDecisionProcess 3 | from rl.dynamic_programming import value_iteration_result 4 | from rl.distribution import Categorical 5 | from scipy.stats import poisson 6 | 7 | IntPair = Tuple[int, int] 8 | CareerDecisionsMap = Mapping[int, Mapping[ 9 | IntPair, 10 | Categorical[Tuple[int, float]] 11 | ]] 12 | 13 | 14 | class CareerOptimization(FiniteMarkovDecisionProcess[int, IntPair]): 15 | 16 | def __init__( 17 | self, 18 | hours: int, 19 | wage_cap: int, 20 | alpha: float, 21 | beta: float 22 | ): 23 | self.hours = hours 24 | self.wage_cap = wage_cap 25 | self.alpha = alpha 26 | self.beta = beta 27 | super().__init__(self.get_transitions()) 28 | 29 | def get_transitions(self) -> CareerDecisionsMap: 30 | d: Dict[int, Mapping[IntPair, Categorical[Tuple[int, float]]]] = {} 31 | for w in range(1, self.wage_cap + 1): 32 | d1: Dict[IntPair, Categorical[Tuple[int, float]]] = {} 33 | for s in range(self.hours + 1): 34 | for t in range(self.hours + 1 - s): 35 | pd = poisson(self.alpha * t) 36 | prob: float = self.beta * s / self.hours 37 | r: float = w * (self.hours - s - t) 38 | same_prob: float = (1 - prob) * pd.pmf(0) 39 | sr_probs: Dict[Tuple[int, float], float] = {} 40 | if w == self.wage_cap: 41 | sr_probs[(w, r)] = 1. 
42 | elif w == self.wage_cap - 1: 43 | sr_probs[(w, r)] = same_prob 44 | sr_probs[(w + 1, r)] = 1 - same_prob 45 | else: 46 | sr_probs[(w, r)] = same_prob 47 | sr_probs[(w + 1, r)] = prob * pd.pmf(0) + pd.pmf(1) 48 | for w1 in range(w + 2, self.wage_cap): 49 | sr_probs[(w1, r)] = pd.pmf(w1 - w) 50 | sr_probs[(self.wage_cap, r)] = \ 51 | 1 - pd.cdf(self.wage_cap - w - 1) 52 | d1[(s, t)] = Categorical(sr_probs) 53 | d[w] = d1 54 | return d 55 | 56 | 57 | if __name__ == '__main__': 58 | 59 | import matplotlib.pyplot as plt 60 | from pprint import pprint 61 | hours: int = 10 62 | wage_cap: int = 30 63 | alpha: float = 0.08 64 | beta: float = 0.82 65 | gamma: float = 0.95 66 | 67 | co: CareerOptimization = CareerOptimization( 68 | hours=hours, 69 | wage_cap=wage_cap, 70 | alpha=alpha, 71 | beta=beta 72 | ) 73 | 74 | _, opt_policy = value_iteration_result(co, gamma=gamma) 75 | wages: Iterable[int] = range(1, co.wage_cap + 1) 76 | opt_actions: Mapping[int, Tuple[int, int]] = \ 77 | {w: opt_policy.act(w).value for w in wages} 78 | searching: Sequence[int] = [s for _, (s, _) in opt_actions.items()] 79 | learning: Sequence[int] = [l for _, (_, l) in opt_actions.items()] 80 | working: Sequence[int] = [co.hours - s - l for _, (s, l) in 81 | opt_actions.items()] 82 | pprint(opt_actions) 83 | plt.xticks(wages) 84 | p1 = plt.bar(wages, searching, color='red') 85 | p2 = plt.bar(wages, learning, color='blue') 86 | p3 = plt.bar(wages, working, color='green') 87 | plt.legend((p1[0], p2[0], p3[0]), ('Job-Searching', 'Learning', 'Working')) 88 | plt.grid(axis='y') 89 | plt.xlabel("Hourly Wage Level") 90 | plt.ylabel("Hours Spent") 91 | plt.title("Career Optimization") 92 | plt.show() 93 | -------------------------------------------------------------------------------- /src/examples/exam_problems/mrp_tdmc_outline.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, Mapping 2 | 3 | S = str 4 | DataType = Sequence[Sequence[Tuple[S, float]]] 5 | ProbFunc = Mapping[S, Mapping[S, float]] 6 | RewardFunc = Mapping[S, float] 7 | ValueFunc = Mapping[S, float] 8 | 9 | 10 | def get_state_return_samples( 11 | data: DataType 12 | ) -> Sequence[Tuple[S, float]]: 13 | """ 14 | prepare sequence of (state, return) pairs. 15 | Note: (state, return) pairs is not same as (state, reward) pairs. 16 | """ 17 | return [(s, sum(r for (_, r) in l[i:])) 18 | for l in data for i, (s, _) in enumerate(l)] 19 | 20 | 21 | def get_mc_value_function( 22 | state_return_samples: Sequence[Tuple[S, float]] 23 | ) -> ValueFunc: 24 | """ 25 | Implement tabular MC Value Function compatible with the interface defined above. 26 | """ 27 | 28 | 29 | def get_state_reward_next_state_samples( 30 | data: DataType 31 | ) -> Sequence[Tuple[S, float, S]]: 32 | """ 33 | prepare sequence of (state, reward, next_state) triples. 34 | """ 35 | return [(s, r, l[i+1][0] if i < len(l) - 1 else 'T') 36 | for l in data for i, (s, r) in enumerate(l)] 37 | 38 | 39 | def get_probability_and_reward_functions( 40 | srs_samples: Sequence[Tuple[S, float, S]] 41 | ) -> Tuple[ProbFunc, RewardFunc]: 42 | """ 43 | Implement code that produces the probability transitions and the 44 | reward function compatible with the interface defined above. 
45 | """ 46 | 47 | 48 | def get_mrp_value_function( 49 | prob_func: ProbFunc, 50 | reward_func: RewardFunc 51 | ) -> ValueFunc: 52 | """ 53 | Implement code that calculates the MRP Value Function from the probability 54 | transitions and reward function, compatible with the interface defined above. 55 | Hint: Use the MRP Bellman Equation and simple linear algebra 56 | """ 57 | 58 | 59 | def get_td_value_function( 60 | srs_samples: Sequence[Tuple[S, float, S]], 61 | num_updates: int = 300000, 62 | learning_rate: float = 0.3, 63 | learning_rate_decay: int = 30 64 | ) -> ValueFunc: 65 | """ 66 | Implement tabular TD(0) (with experience replay) Value Function compatible 67 | with the interface defined above. Let the step size (alpha) be: 68 | learning_rate * (updates / learning_rate_decay + 1) ** -0.5 69 | so that Robbins-Monro condition is satisfied for the sequence of step sizes. 70 | """ 71 | 72 | 73 | def get_lstd_value_function( 74 | srs_samples: Sequence[Tuple[S, float, S]] 75 | ) -> ValueFunc: 76 | """ 77 | Implement LSTD Value Function compatible with the interface defined above. 78 | Hint: Tabular is a special case of linear function approx where each feature 79 | is an indicator variables for a corresponding state and each parameter is 80 | the value function for the corresponding state. 81 | """ 82 | 83 | 84 | if __name__ == '__main__': 85 | given_data: DataType = [ 86 | [('A', 2.), ('A', 6.), ('B', 1.), ('B', 2.)], 87 | [('A', 3.), ('B', 2.), ('A', 4.), ('B', 2.), ('B', 0.)], 88 | [('B', 3.), ('B', 6.), ('A', 1.), ('B', 1.)], 89 | [('A', 0.), ('B', 2.), ('A', 4.), ('B', 4.), ('B', 2.), ('B', 3.)], 90 | [('B', 8.), ('B', 2.)] 91 | ] 92 | 93 | sr_samps = get_state_return_samples(given_data) 94 | 95 | print("------------- MONTE CARLO VALUE FUNCTION --------------") 96 | print(get_mc_value_function(sr_samps)) 97 | 98 | srs_samps = get_state_reward_next_state_samples(given_data) 99 | 100 | pfunc, rfunc = get_probability_and_reward_functions(srs_samps) 101 | print("-------------- MRP VALUE FUNCTION ----------") 102 | print(get_mrp_value_function(pfunc, rfunc)) 103 | 104 | print("------------- TD VALUE FUNCTION --------------") 105 | print(get_td_value_function(srs_samps)) 106 | 107 | print("------------- LSTD VALUE FUNCTION --------------") 108 | print(get_lstd_value_function(srs_samps)) 109 | -------------------------------------------------------------------------------- /src/algorithms/backward_dp.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Sequence, Tuple, Generic 2 | from utils.gen_utils import is_approx_eq 3 | from utils.generic_typevars import S, A 4 | from utils.standard_typevars import SASTff 5 | from operator import itemgetter 6 | 7 | 8 | class BackwardDP(Generic[S, A]): 9 | 10 | def __init__( 11 | self, 12 | transitions_rewards: Sequence[SASTff], 13 | terminal_opt_val: Mapping[S, float], 14 | gamma: float 15 | ) -> None: 16 | if BackwardDP.verify_data(transitions_rewards, terminal_opt_val, gamma): 17 | self.transitions_rewards = transitions_rewards 18 | self.terminal_opt_val = terminal_opt_val 19 | self.gamma = gamma 20 | self.vf_and_policy = self.get_vf_and_policy() 21 | else: 22 | raise ValueError 23 | 24 | @staticmethod 25 | def verify_data( 26 | transitions_rewards: Sequence[SASTff], 27 | terminal_opt_val: Mapping[S, float], 28 | gamma: float 29 | ) -> bool: 30 | valid = 0. <= gamma <= 1. 
31 | time_len = len(transitions_rewards) 32 | i = 0 33 | while valid and i < time_len: 34 | this_d = transitions_rewards[i] 35 | check_actions = all(len(v) > 0 for _, v in this_d.items()) 36 | next_dict = [{k: v for k, (v, _) in d1.items()} 37 | for _, d in this_d.items() for _, d1 in d.items()] 38 | check_pos = all(all(x >= 0 for x in d1.values()) for d1 in next_dict) 39 | check_sum = all(is_approx_eq(sum(d1.values()), 1.0) for d1 in next_dict) 40 | states = set((transitions_rewards[i+1] 41 | if i < time_len - 1 else terminal_opt_val).keys()) 42 | subset = all(set(d1.keys()).issubset(states) for d1 in next_dict) 43 | valid = valid and check_actions and check_pos and check_sum and subset 44 | i = i + 1 45 | return valid 46 | 47 | def get_vf_and_policy(self) -> Sequence[Mapping[S, Tuple[float, A]]]: 48 | vf_pol = {s: (v, None) for s, v in self.terminal_opt_val.items()} 49 | ret = [] 50 | for tr in self.transitions_rewards[::-1]: 51 | vf_pol = {s: max( 52 | [( 53 | sum(p * (r + self.gamma * vf_pol[s1][0]) 54 | for s1, (p, r) in d1.items()), 55 | a 56 | ) for a, d1 in d.items()], 57 | key=itemgetter(0) 58 | ) for s, d in tr.items()} 59 | ret.append(vf_pol) 60 | return ret[::-1] 61 | 62 | 63 | if __name__ == '__main__': 64 | from scipy.stats import poisson 65 | T: int = 50 # time steps 66 | M: int = 10 # initial inventory 67 | # the following are (price, poisson mean) pairs, i.e., elasticity 68 | el: Sequence[Tuple[float, float]] = [ 69 | (10.0, 0.1), (9.0, 0.16), (8.0, 0.22), 70 | (7.0, 0.28), (6.0, 0.38), (5.0, 0.5) 71 | ] 72 | rvs = [(p, poisson(l)) for p, l in el] 73 | 74 | tr_rew_dict = { 75 | s: { 76 | p: { 77 | s - d: ( 78 | rv.pmf(d) if d < s else 1. - rv.cdf(s - 1), 79 | d * p 80 | ) for d in range(s + 1) 81 | } for p, rv in rvs 82 | } for s in range(M + 1) 83 | } 84 | bdp = BackwardDP( 85 | transitions_rewards=[tr_rew_dict] * T, 86 | terminal_opt_val={s: 0. for s in range(M + 1)}, 87 | gamma=1. 88 | ) 89 | for i in range(T): 90 | print([(x, y) for x, (y, _) in bdp.vf_and_policy[i].items()]) 91 | for i in range(T): 92 | print([(x, z) for x, (_, z) in bdp.vf_and_policy[i].items()]) 93 | 94 | tr_rew_dicts = [] 95 | states = {float(M)} 96 | for t in range(T): 97 | tr_rew_dicts.append( 98 | { 99 | s: { 100 | p: { 101 | max(s - d, 0.): (1.0, min(s, d) * p) 102 | } for p, d in el 103 | } for s in states 104 | } 105 | ) 106 | states = {max(s - d, 0.) for s in states for _, d in el} 107 | 108 | bdp = BackwardDP( 109 | transitions_rewards=tr_rew_dicts, 110 | terminal_opt_val={s: 0. for s in states}, 111 | gamma=1. 112 | ) 113 | 114 | state = float(M) 115 | for t in range(T): 116 | v, p = bdp.vf_and_policy[t][state] 117 | print((t, state, p, v)) 118 | d = el[[x for x, _ in el].index(p)][1] 119 | state = max(state - d, 0.) 
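    # Minimal hand-checkable sketch (illustrative only; the states 's0'/'s1',
    # actions 'a'/'b' and rewards below are made up): with a single time step,
    # zero terminal values and gamma = 1, the optimal action in 's0' is simply
    # the one with the larger immediate reward.
    tiny_bdp = BackwardDP(
        transitions_rewards=[{
            's0': {'a': {'s1': (1.0, 1.0)}, 'b': {'s1': (1.0, 2.0)}},
            's1': {'a': {'s1': (1.0, 0.0)}}
        }],
        terminal_opt_val={'s1': 0.},
        gamma=1.
    )
    print(tiny_bdp.vf_and_policy[0])  # expect {'s0': (2.0, 'b'), 's1': (0.0, 'a')}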
120 | print(bdp.vf_and_policy[0].items()) 121 | 122 | -------------------------------------------------------------------------------- /src/func_approx/linear_approx.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Callable, Tuple, TypeVar 2 | from func_approx.func_approx_base import FuncApproxBase 3 | from func_approx.eligibility_traces import get_decay_toeplitz_matrix 4 | from scipy.stats import norm 5 | import numpy as np 6 | 7 | X = TypeVar('X') 8 | 9 | 10 | class LinearApprox(FuncApproxBase): 11 | 12 | def __init__( 13 | self, 14 | feature_funcs: Sequence[Callable[[X], float]], 15 | reglr_coeff: float = 0., 16 | learning_rate: float = 0.1, 17 | adam: bool = True, 18 | adam_decay1: float = 0.9, 19 | adam_decay2: float = 0.99, 20 | add_unit_feature: bool = True 21 | ): 22 | super().__init__( 23 | feature_funcs, 24 | reglr_coeff, 25 | learning_rate, 26 | adam, 27 | adam_decay1, 28 | adam_decay2, 29 | add_unit_feature 30 | ) 31 | 32 | def init_params(self) -> Sequence[np.ndarray]: 33 | return [np.zeros(self.num_features)] 34 | 35 | def init_adam_caches(self)\ 36 | -> Tuple[Sequence[np.ndarray], Sequence[np.ndarray]]: 37 | return [np.zeros(self.num_features)],\ 38 | [np.zeros(self.num_features)] 39 | 40 | def get_func_eval(self, x_vals: X): 41 | """ 42 | This must return a float but lint is not happy, so removed the 43 | return type annotation 44 | """ 45 | return np.dot(self.get_feature_vals(x_vals), self.params[0]) 46 | 47 | def get_func_eval_pts(self, x_vals_seq: Sequence[X]) -> np.ndarray: 48 | return np.dot( 49 | self.get_feature_vals_pts(x_vals_seq), 50 | self.params[0] 51 | ) 52 | 53 | def get_sum_loss_gradient( 54 | self, 55 | x_vals_seq: Sequence[X], 56 | supervisory_seq: Sequence[float] 57 | ) -> Sequence[np.ndarray]: 58 | # return [np.dot(self.get_func_eval_pts(x_vals_seq) - supervisory_seq, 59 | # self.get_feature_vals_pts(x_vals_seq))] 60 | return [np.sum((self.get_func_eval(x) - supervisory_seq[i]) * self.get_feature_vals(x) 61 | for i, x in enumerate(x_vals_seq))] 62 | 63 | # noinspection PyPep8Naming 64 | def get_sum_objective_gradient( 65 | self, 66 | x_vals_seq: Sequence[X], 67 | dObj_dOL: np.ndarray 68 | ) -> Sequence[np.ndarray]: 69 | return [dObj_dOL.dot(self.get_feature_vals_pts(x_vals_seq))] 70 | 71 | def get_el_tr_sum_loss_gradient( 72 | self, 73 | x_vals_seq: Sequence[X], 74 | supervisory_seq: Sequence[float], 75 | gamma_lambda: float 76 | ) -> Sequence[np.ndarray]: 77 | toeplitz_mat = get_decay_toeplitz_matrix(len(x_vals_seq), gamma_lambda) 78 | errors = self.get_func_eval_pts(x_vals_seq) - supervisory_seq 79 | func_grad = self.get_feature_vals_pts(x_vals_seq) 80 | return [errors.dot(toeplitz_mat.dot(func_grad))] 81 | 82 | # noinspection PyPep8Naming 83 | def get_el_tr_sum_objective_gradient( 84 | self, 85 | x_vals_seq: Sequence[X], 86 | dObj_dOL: np.ndarray, 87 | factors: np.ndarray, 88 | gamma_lambda: float 89 | ) -> Sequence[np.ndarray]: 90 | toep = get_decay_toeplitz_matrix(len(x_vals_seq), gamma_lambda) 91 | features = self.get_feature_vals_pts(x_vals_seq) 92 | return [factors.dot(toep.dot(np.diag(dObj_dOL).dot(features)))] 93 | 94 | 95 | if __name__ == '__main__': 96 | la = LinearApprox( 97 | feature_funcs=FuncApproxBase.get_identity_feature_funcs(3), 98 | reglr_coeff=0., 99 | learning_rate=0.1, 100 | adam=True, 101 | adam_decay1=0.9, 102 | adam_decay2=0.999, 103 | add_unit_feature=True 104 | ) 105 | alpha = 2.0 106 | beta_1 = 10.0 107 | beta_2 = 4.0 108 | beta_3 = -6.0 109 | beta = 
(beta_1, beta_2, beta_3) 110 | x_pts = np.arange(-10.0, 10.0, 0.5) 111 | y_pts = np.arange(-10.0, 10.0, 0.5) 112 | z_pts = np.arange(-10.0, 10.0, 0.5) 113 | pts = [(x, y, z) for x in x_pts for y in y_pts for z in z_pts] 114 | 115 | # noinspection PyShadowingNames 116 | def superv_func(pt, alpha=alpha, beta=beta): 117 | return alpha + np.dot(beta, pt) 118 | 119 | n = norm(loc=0., scale=1.) 120 | superv_pts = [superv_func(r) + n.rvs(size=1)[0] for r in pts] 121 | # import matplotlib.pyplot as plt 122 | for _ in range(1000): 123 | print(la.params[0]) 124 | la.update_params(pts, superv_pts) 125 | pred_pts = [la.get_func_eval(x) for x in pts] 126 | print(np.linalg.norm(np.array(pred_pts) - np.array(superv_pts)) / 127 | np.sqrt(len(superv_pts))) 128 | 129 | -------------------------------------------------------------------------------- /src/func_approx/func_approx_base.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Callable, Tuple, TypeVar, List, Set 2 | from abc import ABC, abstractmethod 3 | import numpy as np 4 | 5 | X = TypeVar('X') 6 | very_small_pos = 1e-6 7 | 8 | 9 | class FuncApproxBase(ABC): 10 | 11 | def __init__( 12 | self, 13 | feature_funcs: Sequence[Callable[[X], float]], 14 | reglr_coeff: float, 15 | learning_rate: float, 16 | adam: bool, 17 | adam_decay1: float, 18 | adam_decay2: float, 19 | add_unit_feature: bool = True 20 | ): 21 | self.feature_funcs: Sequence[Callable[[X], float]] =\ 22 | ([FuncApproxBase.get_unit_func] if add_unit_feature else []) + feature_funcs 23 | self.num_features = len(self.feature_funcs) 24 | self.reglr_coeff = reglr_coeff 25 | self.learning_rate = learning_rate 26 | self.adam = adam 27 | self.adam_decay1 = adam_decay1 28 | self.adam_decay2 = adam_decay2 29 | self.time = 0 30 | self.params: List[np.ndarray] = self.init_params() 31 | self.adam_caches: Tuple[List[np.ndarray], List[np.ndarray]]\ 32 | = self.init_adam_caches() 33 | 34 | @staticmethod 35 | def get_unit_func(_: X) -> float: 36 | return 1. 37 | 38 | @staticmethod 39 | def get_identity_feature_funcs(n: int) -> List[Callable[[X], float]]: 40 | return [(lambda x, i=i: x[i]) for i in range(n)] 41 | 42 | @staticmethod 43 | def get_indicator_feature_funcs(values: Set[X])\ 44 | -> List[Callable[[X], float]]: 45 | return [(lambda x, v=v: 1. if x == v else 0.) 
for v in values] 46 | 47 | def get_feature_vals(self, x_vals: X) -> np.ndarray: 48 | return np.array([f(x_vals) for f in self.feature_funcs]) 49 | 50 | def get_feature_vals_pts(self, x_vals_seq: Sequence[X]) -> np.ndarray: 51 | return np.vstack(self.get_feature_vals(x) for x in x_vals_seq) 52 | 53 | @abstractmethod 54 | def init_params(self) -> Sequence[np.ndarray]: 55 | pass 56 | 57 | @abstractmethod 58 | def init_adam_caches(self)\ 59 | -> Tuple[Sequence[np.ndarray], Sequence[np.ndarray]]: 60 | pass 61 | 62 | @abstractmethod 63 | def get_func_eval(self, x_vals: X) -> float: 64 | pass 65 | 66 | @abstractmethod 67 | def get_func_eval_pts(self, x_vals_seq: Sequence[X]) -> np.ndarray: 68 | pass 69 | 70 | @abstractmethod 71 | def get_sum_loss_gradient( 72 | self, 73 | x_vals_seq: Sequence[X], 74 | supervisory_seq: Sequence[float] 75 | ) -> Sequence[np.ndarray]: 76 | pass 77 | 78 | # noinspection PyPep8Naming 79 | @abstractmethod 80 | def get_sum_objective_gradient( 81 | self, 82 | x_vals_seq: Sequence[X], 83 | dObj_dOL: np.ndarray 84 | ) -> Sequence[np.ndarray]: 85 | pass 86 | 87 | @abstractmethod 88 | def get_el_tr_sum_loss_gradient( 89 | self, 90 | x_vals_seq: Sequence[X], 91 | supervisory_seq: Sequence[float], 92 | gamma_lambda: float 93 | ) -> Sequence[np.ndarray]: 94 | pass 95 | 96 | # noinspection PyPep8Naming 97 | @abstractmethod 98 | def get_el_tr_sum_objective_gradient( 99 | self, 100 | x_vals_seq: Sequence[X], 101 | dObj_dOL: np.ndarray, 102 | factors: np.ndarray, 103 | gamma_lambda: float 104 | ) -> Sequence[np.ndarray]: 105 | pass 106 | 107 | def update_params( 108 | self, 109 | x_vals_seq: Sequence[X], 110 | supervisory_seq: Sequence[float] 111 | ) -> None: 112 | avg_loss_gradient = [g / len(x_vals_seq) for g in 113 | self.get_sum_loss_gradient(x_vals_seq, supervisory_seq)] 114 | self.update_params_from_gradient(avg_loss_gradient) 115 | 116 | def update_params_from_gradient( 117 | self, 118 | gradient: Sequence[np.ndarray] 119 | ) -> None: 120 | self.time += 1 121 | for l in range(len(self.params)): 122 | g = gradient[l] + self.reglr_coeff * self.params[l] 123 | if self.adam: 124 | self.adam_caches[0][l] = self.adam_decay1 * self.adam_caches[0][l] +\ 125 | (1 - self.adam_decay1) * g 126 | self.adam_caches[1][l] = self.adam_decay2 * self.adam_caches[1][l] +\ 127 | (1 - self.adam_decay2) * g ** 2 128 | self.params[l] -= self.learning_rate * self.adam_caches[0][l] /\ 129 | (np.sqrt(self.adam_caches[1][l]) + very_small_pos) *\ 130 | np.sqrt(1 - self.adam_decay2 ** self.time) /\ 131 | (1 - self.adam_decay1 ** self.time) 132 | else: 133 | self.params[l] -= self.learning_rate * g 134 | -------------------------------------------------------------------------------- /src/examples/exam_problems/mrp_tdmc.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, Mapping 2 | from operator import itemgetter 3 | import numpy as np 4 | from itertools import groupby 5 | from numpy.random import randint 6 | 7 | S = str 8 | DataType = Sequence[Sequence[Tuple[S, float]]] 9 | ProbFunc = Mapping[S, Mapping[S, float]] 10 | RewardFunc = Mapping[S, float] 11 | ValueFunc = Mapping[S, float] 12 | 13 | 14 | def get_state_return_samples( 15 | data: DataType 16 | ) -> Sequence[Tuple[S, float]]: 17 | return [(s, sum(r for (_, r) in l[i:])) 18 | for l in data for i, (s, _) in enumerate(l)] 19 | 20 | 21 | def get_mc_value_function( 22 | state_return_samples: Sequence[Tuple[S, float]] 23 | ) -> ValueFunc: 24 | sorted_samples = 
sorted(state_return_samples, key=itemgetter(0)) 25 | return {s: np.mean([r for _, r in l]) 26 | for s, l in groupby(sorted_samples, itemgetter(0))} 27 | 28 | 29 | def get_state_reward_next_state_samples( 30 | data: DataType 31 | ) -> Sequence[Tuple[S, float, S]]: 32 | return [(s, r, l[i+1][0] if i < len(l) - 1 else 'T') 33 | for l in data for i, (s, r) in enumerate(l)] 34 | 35 | 36 | def get_probability_and_reward_functions( 37 | srs_samples: Sequence[Tuple[S, float, S]] 38 | ) -> Tuple[ProbFunc, RewardFunc]: 39 | d = {s: [(r, s1) for _, r, s1 in l] for s, l in 40 | groupby(sorted(srs_samples, key=itemgetter(0)), itemgetter(0))} 41 | 42 | prob_func = {s: {s1: len(list(l1)) / len(l) for s1, l1 in 43 | groupby(sorted(l, key=itemgetter(1)), itemgetter(1)) 44 | if s1 != 'T'} for s, l in d.items()} 45 | reward_func = {s: np.mean([r for r, _ in l]) for s, l in d.items()} 46 | 47 | return prob_func, reward_func 48 | 49 | 50 | def get_mrp_value_function( 51 | prob_func: ProbFunc, 52 | reward_func: RewardFunc 53 | ) -> ValueFunc: 54 | states_list = list(reward_func.keys()) 55 | reward_vec = np.array([reward_func[s] for s in states_list]) 56 | prob_matrix = np.array([[prob_func[s][s1] if s1 in prob_func[s] else 0. 57 | for s1 in states_list] for s in states_list]) 58 | vec = np.linalg.inv(np.eye(len(states_list)) - prob_matrix).dot(reward_vec) 59 | return {states_list[i]: vec[i] for i in range(len(states_list))} 60 | 61 | 62 | def get_td_value_function( 63 | srs_samples: Sequence[Tuple[S, float, S]], 64 | num_updates: int = 300000, 65 | learning_rate: float = 0.3, 66 | learning_rate_decay: int = 30 67 | ) -> ValueFunc: 68 | ret = {s: [0.] for s in set(x for x, _, _ in srs_samples)} 69 | samples = len(srs_samples) 70 | for updates in range(num_updates): 71 | s, r, s1 = srs_samples[randint(samples, size=1)[0]] 72 | ret[s].append(ret[s][-1] + learning_rate * 73 | (updates / learning_rate_decay + 1) ** -0.5 74 | * (r + (ret[s1][-1] if s1 != 'T' else 0.) 
- ret[s][-1])) 75 | return {s: np.mean(v[-int(len(v) * 0.9):]) for s, v in ret.items()} 76 | 77 | 78 | def get_lstd_value_function( 79 | srs_samples: Sequence[Tuple[S, float, S]] 80 | ) -> ValueFunc: 81 | nt_states = list(set(x for x, _, _ in srs_samples)) 82 | num_nt_states = len(nt_states) 83 | phi = np.eye(num_nt_states) 84 | a_mat = np.zeros((num_nt_states, num_nt_states)) 85 | b_vec = np.zeros(num_nt_states) 86 | for s, r, s1 in srs_samples: 87 | p1 = phi[nt_states.index(s)] 88 | p2 = phi[nt_states.index(s1)] if s1 != 'T' else np.zeros(num_nt_states) 89 | a_mat += np.outer(p1, p1 - p2) 90 | b_vec += p1 * r 91 | return {nt_states[i]: v for i, v in 92 | enumerate(np.linalg.inv(a_mat).dot(b_vec))} 93 | 94 | 95 | if __name__ == '__main__': 96 | given_data: DataType = [ 97 | [('A', 2.), ('A', 6.), ('B', 1.), ('B', 2.)], 98 | [('A', 3.), ('B', 2.), ('A', 4.), ('B', 2.), ('B', 0.)], 99 | [('B', 3.), ('B', 6.), ('A', 1.), ('B', 1.)], 100 | [('A', 0.), ('B', 2.), ('A', 4.), ('B', 4.), ('B', 2.), ('B', 3.)], 101 | [('B', 8.), ('B', 2.)] 102 | ] 103 | 104 | print("------------- STATE-RETURN SAMPLES --------------") 105 | sr_samps = get_state_return_samples(given_data) 106 | print(sr_samps) 107 | print("------------- MONTE CARLO VALUE FUNCTION --------------") 108 | print(get_mc_value_function(sr_samps)) 109 | 110 | print("------------- SRS SAMPLES ----------------") 111 | srs_samps = get_state_reward_next_state_samples(given_data) 112 | print(srs_samps) 113 | 114 | print("------------- MRP --------------") 115 | pfunc, rfunc = get_probability_and_reward_functions(srs_samps) 116 | print(pfunc) 117 | print(rfunc) 118 | print("-------------- MRP VALUE FUNCTION ----------") 119 | print(get_mrp_value_function(pfunc, rfunc)) 120 | 121 | print("------------- TD VALUE FUNCTION --------------") 122 | print(get_td_value_function(srs_samps)) 123 | 124 | print("------------- LSTD VALUE FUNCTION --------------") 125 | print(get_lstd_value_function(srs_samps)) 126 | -------------------------------------------------------------------------------- /src/examples/american_pricing/num_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Callable 2 | import numpy as np 3 | from src.examples.american_pricing.bs_pricing import EuropeanBSPricing 4 | from scipy.optimize import curve_fit 5 | from numpy.polynomial.laguerre import lagval 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def get_future_price_mean_var( 10 | x: float, 11 | t: float, 12 | delta_t: float, 13 | lognormal: bool, # whether dispersion is multiplied by x or not 14 | rate_int_func: Callable[[float], float], # ir integral 15 | sigma2_int_func: Callable[[float], float], # sigma^2 integral 16 | ) -> Tuple[float, float]: 17 | """ 18 | :param x: represents underlying price at time t (= x_t) 19 | :param t: represents current time t 20 | :param delta_t: represents interval of time beyond t at which 21 | we want the future price, i.e., at time t + delta_t 22 | :param lognormal: this indicates whether dispersion func is 23 | multiplied by x or not (i.e., whether lognormal or normal) 24 | :param rate_int_func: this is ir(t) func 25 | :param sigma2_int_func: this is isig(t) func 26 | :return: mean and variance of x_{t+delta_t} if 27 | lognormal == True, else return mean and variance of 28 | log(x_{t+delta_t}) 29 | 30 | rate_int_func is ir(t) = int_0^t r(u) du 31 | 32 | If lognormal == True, we have generalized GBM 33 | dx_t = r(t) x_t dt + sigma(t) x_t dz_t 34 | The solution is (denoting t + 
delta_t as t1): 35 | x_{t1} = x_t . e^{int_t^{t1} (r(u) - 36 | sigma^2(u)/2) du + int_t^{t1} sigma(u) dz_u} 37 | So, log(x_{t1}) is normal with: 38 | Mean[log(x_{t1})] = log(x_t) + int_t^{t1} (r(u) - sigma^2(u)/2) du 39 | Variance[log(x_{t1}] = int_t^{t1} sigma^2(u) du 40 | In the case that lognormal == True, sigma2_int_func 41 | = isig(t) = int_0^t sigma^2(u) du 42 | Therefore, in the case that lognormal == True, 43 | log(x_{t1}) is normal with: 44 | Mean[log(x_{t1})] = log(x_t) + ir(t1) - ir(t) + (isig(t) - isig(t1)) / 2 45 | Variance[log(x_{t1})] = isig(t1) - isig(t) 46 | 47 | If lognormal == False, we have generalize OU with mean-reversion to 0 48 | dx_t = r(t) x_t dt + sigma(t) dz_t 49 | The solution is (denoting t + delta_t as t1) 50 | x_{t1} = x_t e^{int_t^{t1} r(u) du} + 51 | (e^{int_0^{t1} r(u) du}) . (int_t^t1 sigma(u) e^{-int_0^u r(s) ds} d_zu) 52 | So, x_{t1} is normal with: 53 | Mean[x_{t1}] = x_t . e^{int_t^{t1} r(u) du} 54 | Variance[x_{t1}] = (e^{int_0^{t1} 2 r(u) du})) . 55 | (int_t^t1 sigma^2(u) e^{-int_0^u 2 r(s) ds} du) 56 | In the case that lognormal == False, sigma2_int_func 57 | = isig(t) = int_0^t sigma^2(u) . e^{-int_0^u 2 r(s) ds} . du 58 | Therefore, in the case that lognormal == False, 59 | x_{t1} is normal with: 60 | Mean[x_{t1}] = x_t . e^{ir(t1) - ir(t)} 61 | Variance[x_{t1}] = e^{2 ir(t1)} . (isig(t1) - isig(t)) 62 | """ 63 | ir_t = rate_int_func(t) 64 | ir_t1 = rate_int_func(t + delta_t) 65 | isig_t = sigma2_int_func(t) 66 | isig_t1 = sigma2_int_func(t + delta_t) 67 | ir_diff = ir_t1 - ir_t 68 | isig_diff = isig_t1 - isig_t 69 | 70 | if lognormal: 71 | mean = np.log(x) + ir_diff - isig_diff / 2. 72 | var = isig_diff 73 | else: 74 | mean = x * np.exp(ir_diff) 75 | var = np.exp(2. * ir_t1) * isig_diff 76 | return mean, var 77 | 78 | 79 | def plot_fitted_call_prices( 80 | is_call: bool, 81 | strike: float, 82 | expiry: float, 83 | r: float, 84 | sigma: float 85 | ) -> None: 86 | spot_prices = np.linspace(strike * 0.5, strike * 1.5, 1001) 87 | option_prices = [EuropeanBSPricing( 88 | is_call, 89 | s, 90 | strike, 91 | expiry, 92 | r, 93 | sigma 94 | ).get_option_price() for s in spot_prices] 95 | 96 | def fit_func( 97 | x: np.ndarray, 98 | a: float, 99 | b: float, 100 | c: float 101 | ) -> np.ndarray: 102 | return a * np.exp(b * x + c) 103 | 104 | def jac_func( 105 | x: np.ndarray, 106 | a: float, 107 | b: float, 108 | c: float 109 | ) -> np.ndarray: 110 | t = np.exp(b * x + c) 111 | da = t 112 | db = a * t * x 113 | dc = a * t 114 | return np.transpose([da, db, dc]) 115 | 116 | fp = curve_fit( 117 | f=fit_func, 118 | xdata=spot_prices, 119 | ydata=option_prices, 120 | jac=jac_func 121 | )[0] 122 | pred1_option_prices = fit_func(spot_prices, fp[0], fp[1], fp[2]) 123 | 124 | num_laguerre = 10 125 | ident = np.eye(num_laguerre) 126 | spot_features = np.array([[1.] 
+ [np.exp(-s / (strike * 2)) * 127 | lagval(s / strike, ident[i]) for i in 128 | range(num_laguerre)] for s in spot_prices]) 129 | lp = np.linalg.lstsq( 130 | spot_features, 131 | np.array(option_prices), 132 | rcond=None 133 | )[0] 134 | pred2_option_prices = spot_features.dot(lp) 135 | 136 | plt.plot(spot_prices, option_prices, 'r') 137 | plt.plot(spot_prices, pred1_option_prices, 'b') 138 | plt.plot(spot_prices, pred2_option_prices, 'g') 139 | plt.show() 140 | 141 | 142 | if __name__ == '__main__': 143 | is_call_val = False 144 | strike_val = 80.0 145 | expiry_val = 0.4 146 | r_val = 0.02 147 | sigma_val = 0.3 148 | 149 | plot_fitted_call_prices( 150 | is_call=is_call_val, 151 | strike=strike_val, 152 | expiry=expiry_val, 153 | r=r_val, 154 | sigma=sigma_val 155 | ) 156 | -------------------------------------------------------------------------------- /src/algorithms/helper_funcs.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Sequence, Optional, Callable, Tuple 2 | from processes.policy import Policy 3 | from processes.det_policy import DetPolicy 4 | import numpy as np 5 | from scipy.linalg import toeplitz 6 | from operator import itemgetter 7 | from collections import Counter 8 | from processes.mp_funcs import get_epsilon_action_probs 9 | from processes.mp_funcs import get_softmax_action_probs 10 | from utils.generic_typevars import S, A 11 | from utils.standard_typevars import SAf, PolicyType, PolicyActDictType 12 | 13 | 14 | def get_uniform_policy(state_action_dict: Mapping[S, Set[A]]) -> Policy: 15 | return Policy({s: {a: 1. / len(v) for a in v} for s, v in 16 | state_action_dict.items()}) 17 | 18 | 19 | def get_uniform_policy_func(state_action_func: Callable[[S], Set[A]]) \ 20 | -> Callable[[S], Mapping[A, float]]: 21 | 22 | # noinspection PyShadowingNames 23 | def upf(s: S, state_action_func=state_action_func) -> Mapping[A, float]: 24 | actions = state_action_func(s) 25 | return {a: 1. 
/ len(actions) for a in actions} 26 | 27 | return upf 28 | 29 | 30 | def get_returns_from_rewards_terminating( 31 | rewards: Sequence[float], 32 | gamma: float 33 | ) -> np.ndarray: 34 | sz = len(rewards) 35 | return toeplitz( 36 | np.insert(np.zeros(sz - 1), 0, 1.), 37 | np.power(gamma, np.arange(sz)) 38 | ).dot(rewards) 39 | 40 | 41 | def get_returns_from_rewards_non_terminating( 42 | rewards: Sequence[float], 43 | gamma: float, 44 | points: Optional[int] = None 45 | ) -> np.ndarray: 46 | cnt = points if points is not None else len(rewards) 47 | return toeplitz( 48 | np.insert(np.zeros(cnt - 1), 0, 1.), 49 | np.concatenate(( 50 | np.power(gamma, np.arange(len(rewards) - cnt + 1)), 51 | np.zeros(cnt - 1) 52 | )) 53 | ).dot(rewards) 54 | 55 | 56 | def get_det_policy_from_qf_dict(qf_dict: SAf) -> DetPolicy: 57 | return DetPolicy({s: max(v.items(), key=itemgetter(1))[0] 58 | for s, v in qf_dict.items()}) 59 | 60 | 61 | def get_soft_policy_from_qf_dict( 62 | qf_dict: SAf, 63 | softmax: bool, 64 | epsilon: float 65 | ) -> Policy: 66 | if softmax: 67 | ret = Policy({s: get_softmax_action_probs(v) for s, v in 68 | qf_dict.items()}) 69 | else: 70 | ret = Policy({s: get_epsilon_action_probs(v, epsilon) for s, v in 71 | qf_dict.items()}) 72 | return ret 73 | 74 | 75 | def get_soft_policy_func_from_qf( 76 | qf: Callable[[Tuple[S, A]], float], 77 | state_action_func: Callable[[S], Set[A]], 78 | softmax: bool, 79 | epsilon: float 80 | ) -> Callable[[S], Mapping[A, float]]: 81 | 82 | # noinspection PyShadowingNames 83 | def sp_func( 84 | s: S, 85 | qf=qf, 86 | state_action_func=state_action_func, 87 | softmax=softmax, 88 | epsilon=epsilon 89 | ) -> Mapping[A, float]: 90 | av_dict = {a: qf((s, a)) for a in state_action_func(s)} 91 | return get_softmax_action_probs(av_dict) if softmax else\ 92 | get_epsilon_action_probs(av_dict, epsilon) 93 | 94 | return sp_func 95 | 96 | 97 | def get_vf_dict_from_qf_dict_and_policy( 98 | qf_dict: SAf, 99 | pol: Policy 100 | ) -> Mapping[A, float]: 101 | return {s: sum(pol.get_state_action_probability(s, a) * q 102 | for a, q in v.items()) for s, v in qf_dict.items()} 103 | 104 | 105 | def get_policy_func_for_fa( 106 | pol_func: Callable[[S], Callable[[A], float]], 107 | state_action_func: Callable[[S], Set[A]] 108 | ) -> Callable[[S], Mapping[A, float]]: 109 | 110 | # noinspection PyShadowingNames 111 | def pf( 112 | s: S, 113 | pol_func=pol_func, 114 | state_action_func=state_action_func 115 | ) -> Mapping[A, float]: 116 | return {a: pol_func(s)(a) for a in state_action_func(s)} 117 | 118 | return pf 119 | 120 | 121 | def get_nt_return_eval_steps( 122 | max_steps: int, 123 | gamma: float, 124 | eps: float 125 | ) -> int: 126 | low_limit = 0.2 * max_steps 127 | high_limit = float(max_steps - 1) 128 | if gamma == 0.: 129 | val = high_limit 130 | elif gamma == 1.: 131 | val = low_limit 132 | else: 133 | val = min( 134 | high_limit, 135 | max( 136 | low_limit, 137 | max_steps - np.log(eps) / np.log(gamma) 138 | ) 139 | ) 140 | return int(np.floor(val)) 141 | 142 | 143 | def get_epsilon_decay_func( 144 | epsilon, 145 | epsilon_half_life 146 | ) -> Callable[[int], float]: 147 | 148 | # noinspection PyShadowingNames 149 | def epsilon_decay( 150 | t: int, 151 | epsilon=epsilon, 152 | epsilon_half_life=epsilon_half_life 153 | ) -> float: 154 | return epsilon * 2 ** -(t / epsilon_half_life) 155 | 156 | return epsilon_decay 157 | 158 | 159 | def get_pdf_from_samples(samples: Sequence[A]) -> Mapping[A, float]: 160 | num_samples = len(samples) 161 | c = Counter(samples) 162 | 
return {k: v / num_samples for k, v in c.items()} 163 | 164 | 165 | def get_policy_as_action_dict(polf: PolicyType, num_samples: int)\ 166 | -> PolicyActDictType: 167 | 168 | def pf(s: S) -> Mapping[A, float]: 169 | return get_pdf_from_samples(polf(s)(num_samples)) 170 | 171 | return pf 172 | 173 | 174 | if __name__ == '__main__': 175 | rewards_list = [1., 2., 3., 4., 5., 6.] 176 | gamma_val = 0.9 177 | count = 4 178 | nt_returns_list = get_returns_from_rewards_non_terminating( 179 | rewards_list, 180 | gamma_val, 181 | count 182 | ) 183 | print(nt_returns_list) 184 | term_returns_list = get_returns_from_rewards_terminating( 185 | rewards_list, 186 | gamma_val 187 | ) 188 | print(term_returns_list) 189 | 190 | pd = {'a': 0.3, 'b': 0.2, 'c': 0.4, 'd': 0.1} 191 | from processes.mp_funcs import get_sampling_func_from_prob_dict 192 | seqf = get_sampling_func_from_prob_dict(pd) 193 | seq = seqf(1000) 194 | print(seq) 195 | print(Counter(seq)) 196 | -------------------------------------------------------------------------------- /src/algorithms/ams.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Sequence, Tuple, Generic, Callable, Optional 2 | from utils.generic_typevars import S, A 3 | import numpy as np 4 | from random import sample 5 | 6 | 7 | class AdaptiveMultistageSampling(Generic[S, A]): 8 | 9 | def __init__( 10 | self, 11 | start_state: S, 12 | actions_sets: Sequence[Set[A]], 13 | num_samples: Sequence[int], 14 | state_gen_reward_funcs: Sequence[Callable[[S, A], Tuple[Callable[[], S], float]]], 15 | terminal_opt_val_func: Callable[[S], float], 16 | discount: float, 17 | ) -> None: 18 | if len(actions_sets) == len(num_samples) == len(state_gen_reward_funcs) and \ 19 | 0. <= discount <= 1. and \ 20 | all(len(x) <= y for x, y in zip(actions_sets, num_samples)): 21 | self.start_state = start_state 22 | self.actions_sets = actions_sets 23 | self.num_samples = num_samples 24 | self.num_time_steps = len(actions_sets) 25 | self.state_gen_rewards_funcs = state_gen_reward_funcs 26 | self.terminal_opt_val_func = terminal_opt_val_func 27 | self.discount = discount 28 | else: 29 | raise ValueError 30 | 31 | def get_opt_val_and_internals( 32 | self, 33 | state: S, 34 | time_step: int 35 | ) -> Tuple[float, Optional[Mapping[A, Tuple[float, int]]]]: 36 | """ 37 | This function estimates the optimal value function V* 38 | for a given state in a given time step. 
The output is 39 | a tuple (pair) where the first element is the estimate 40 | of the optimal value function V* and the second element 41 | is a dictionary where the keys are the actions for that 42 | time step and the values are a pair where the first 43 | element in the estimated optimal Q-value function Q* 44 | for that action and the second element is the number of 45 | samples drawn for the action (that was used in estimating 46 | the Q-value function Q* for that action) 47 | """ 48 | if time_step == self.num_time_steps: 49 | ret = (self.terminal_opt_val_func(state), None) 50 | else: 51 | actions = self.actions_sets[time_step] 52 | state_gen_rewards = {a: self.state_gen_rewards_funcs[time_step](state, a) 53 | for a in actions} 54 | state_gens = {a: x for a, (x, _) in state_gen_rewards.items()} 55 | rewards = {a: y for a, (_, y) in state_gen_rewards.items()} 56 | # sample each action once, sample each action's next state, and 57 | # recursively call the next state's V* estimate 58 | val_sums = {a: self.get_opt_val_and_internals(state_gens[a](), time_step + 1)[0] 59 | for a in actions} 60 | counts = {a: 1 for a in actions} 61 | # loop num_samples[time_step] number of times (beyond the 62 | # len(actions) samples that have already been done above 63 | for i in range(len(actions), self.num_samples[time_step]): 64 | # determine the actions that dominate on the UCB Q* estimated value 65 | # and pick one of these dominating actions at random, call it a* 66 | ucb_vals = {a: rewards[a] + self.discount * val_sums[a] / counts[a] 67 | + np.sqrt(2 * np.log(i) / counts[a]) for a in actions} 68 | max_actions = {a for a, u in ucb_vals.items() if u == max(ucb_vals.values())} 69 | a_star = sample(max_actions, 1)[0] 70 | # sample a*'s next state at random, and recursively call the next state's 71 | # V* estimate 72 | next_state = state_gens[a_star]() 73 | val_sums[a_star] += self.get_opt_val_and_internals(next_state, time_step + 1)[0] 74 | counts[a_star] += 1 75 | 76 | # return estimated V* as weighted average of the estimated Q* where weights are 77 | # proportioned by the number of times an action was sampled 78 | ret1 = sum(counts[a] / self.num_samples[time_step] * 79 | (rewards[a] + self.discount * val_sums[a] / counts[a]) 80 | for a in actions) 81 | ret2 = {a: (rewards[a] + self.discount * val_sums[a] / counts[a], counts[a]) 82 | for a in actions} 83 | ret = (ret1, ret2) 84 | 85 | return ret 86 | 87 | 88 | if __name__ == '__main__': 89 | from scipy.stats import gamma 90 | from scipy.integrate import quad 91 | from utils.gen_utils import memoize 92 | 93 | init_inv: int = 80.0 # initial inventory 94 | steps: int = 4 # time steps 95 | step_samples: int = 20 96 | # the following are (price, gamma distribution mean) pairs, i.e., elasticity 97 | el: Mapping[float, float] = {10.0: 10.0, 8.0: 20.0, 5.0: 30.0} 98 | rvs = {p: gamma(l) for p, l in el.items()} 99 | terminal_vf: Callable[[S], float] = lambda s: 0. 
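    # A small clearance-pricing demo (descriptive sketch of the code below):
    # the state is the remaining inventory, an action is a sale price drawn from el,
    # demand at that price is gamma-distributed with mean el[price], and the
    # one-step reward is the expected revenue price * min(inventory, demand).
    # AMS then estimates V* at the start state by UCB-guided sampling of the
    # actions at each of the `steps` time steps.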
100 | this_discount: float = 1.0 101 | 102 | # noinspection PyShadowingNames 103 | @memoize 104 | def state_gen_rew_func(state: float, action: float, rvs=rvs) -> Tuple[Callable[[], float], float]: 105 | # noinspection PyShadowingNames 106 | def rew_f(x: float, state=state, action=action, rvs=rvs) -> float: 107 | return rvs[action].pdf(x) * (action * min(state, x)) 108 | 109 | mu = rvs[action].mean() 110 | lower = mu - 4.0 * np.sqrt(mu) 111 | upper = mu + 4.0 * np.sqrt(mu) 112 | return ( 113 | lambda state=state, action=action, el=el: max( 114 | 0., 115 | state - np.random.gamma(el[action], scale=1.0, size=1)[0] 116 | ), 117 | quad(rew_f, lower, upper)[0] 118 | ) 119 | 120 | 121 | obj = AdaptiveMultistageSampling( 122 | start_state=init_inv, 123 | actions_sets=[set(el)] * steps, 124 | num_samples=[step_samples] * steps, 125 | state_gen_reward_funcs=[state_gen_rew_func] * steps, 126 | terminal_opt_val_func=terminal_vf, 127 | discount=this_discount 128 | ) 129 | 130 | res = obj.get_opt_val_and_internals(init_inv, 0) 131 | print(res) 132 | -------------------------------------------------------------------------------- /src/examples/clearance_pricing.py: -------------------------------------------------------------------------------- 1 | from scipy.stats import poisson 2 | from algorithms.backward_dp import BackwardDP 3 | from typing import List, Tuple, Mapping, Any, Sequence 4 | from matplotlib.ticker import PercentFormatter 5 | from pathlib import Path 6 | import numpy as np 7 | from pprint import pprint 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def get_clearance_backward_dp( 12 | time_steps: int, 13 | init_inv: int, 14 | base_demand: float, 15 | el: List[Tuple[float, float]], # (price, poisson mean) pairs 16 | ) -> BackwardDP: 17 | 18 | aug_el = [(0., 0.)] + el 19 | rvs = [poisson(base_demand * (1 + l)) for _, l in aug_el] 20 | num_el = len(aug_el) 21 | 22 | tr_rew_dict = { 23 | (s, p): { 24 | p1: { 25 | (s - d, p1): ( 26 | rvs[p1].pmf(d) if d < s else 1. - rvs[p1].cdf(s - 1), 27 | d * (1 - aug_el[p1][0]) 28 | ) for d in range(s + 1) 29 | } for p1 in range(p, num_el) 30 | } for s in range(init_inv + 1) for p in range(num_el) 31 | } 32 | return BackwardDP( 33 | transitions_rewards=[tr_rew_dict] * time_steps, 34 | terminal_opt_val={(s, p): 0. for s in range(init_inv + 1) 35 | for p in range(num_el)}, 36 | gamma=1. 37 | ) 38 | 39 | 40 | def get_performance( 41 | time_steps: int, 42 | init_inv: int, 43 | base_demand: float, 44 | el: List[Tuple[float, float]], 45 | num_traces: int 46 | ) -> Mapping[str, Any]: 47 | vf_and_pol = get_clearance_backward_dp( 48 | time_steps, 49 | init_inv, 50 | base_demand, 51 | el 52 | ).vf_and_policy 53 | opt_vf = vf_and_pol[0][(init_inv, 0)][0] 54 | 55 | aug_el = [(0., 0.)] + el 56 | rvs = [poisson(base_demand * (1 + l)) for _, l in aug_el] 57 | 58 | all_revs = np.empty(num_traces) 59 | all_rem = np.empty((num_traces, time_steps)) 60 | all_actions = np.empty((num_traces, time_steps)) 61 | for i in range(num_traces): 62 | rev = 0. 63 | state = (init_inv, 0) 64 | for t in range(time_steps): 65 | action = vf_and_pol[t][state][1] 66 | price = 1 - aug_el[action][0] 67 | demand = rvs[action].rvs() 68 | rev += (min(state[0], demand) * price) 69 | state = (max(0, state[0] - demand), action) 70 | all_rem[i, t] = state[0] 71 | all_actions[i, t] = aug_el[action][0] 72 | all_revs[i] = rev 73 | 74 | mean_remaining = np.mean(all_rem, axis=0) /init_inv 75 | mean_salvage = mean_remaining[-1] 76 | mean_revenue = np.mean(all_revs) / init_inv 77 | mean_a_markdown = 1. 
- mean_salvage - mean_revenue 78 | mean_actions = np.mean(all_actions, axis=0) / init_inv 79 | stdev_remaining = np.std(all_rem, axis=0) / init_inv 80 | stdev_salvage = stdev_remaining[-1] 81 | stdev_revenue = np.std(all_revs) / init_inv 82 | stdev_a_markdown = np.sqrt(stdev_salvage ** 2 + stdev_revenue ** 2) 83 | stdev_actions = np.std(all_actions, axis=0) / init_inv 84 | 85 | return { 86 | "Optimal VF": opt_vf, 87 | "Mean Revenue": mean_revenue, 88 | "Mean AMarkdown": mean_a_markdown, 89 | "Mean Salvage": mean_salvage, 90 | "Stdev Revenue": stdev_revenue, 91 | "Stdev AMarkdown": stdev_a_markdown, 92 | "Stdev Salvage": stdev_salvage, 93 | "Mean Remaining": mean_remaining, 94 | "Mean Price Reductions": mean_actions, 95 | "Stdev Remaining": stdev_remaining, 96 | "Stdev Price Reductions": stdev_actions, 97 | } 98 | 99 | 100 | def graph_perf( 101 | time_steps: int, 102 | demand: float, 103 | inv: Sequence[int], 104 | elasticity: Tuple[float, float, float] 105 | ) -> None: 106 | revs = [] 107 | ams = [] 108 | sals = [] 109 | for initial_inv in inv: 110 | perf = get_performance( 111 | time_steps, 112 | initial_inv, 113 | demand, 114 | list(zip((0.3, 0.5, 0.7), elasticity)), 115 | 10000 116 | ) 117 | revs.append(perf["Mean Revenue"] * 100) 118 | ams.append(perf["Mean AMarkdown"] * 100) 119 | sals.append(perf["Mean Salvage"] * 100) 120 | plt.grid() 121 | plt.plot(inv, revs, "k", label="Revenue") 122 | plt.plot(inv, ams, "b", label="A-Markdown") 123 | plt.plot(inv, sals, "r", label="Salvage") 124 | plt.gca().yaxis.set_major_formatter(PercentFormatter()) 125 | plt.xlabel("Initial Inventory", fontsize=10) 126 | plt.ylabel("Percentage of Initial Value", fontsize=10) 127 | tup = ( 128 | time_steps, 129 | demand, 130 | elasticity[0] * 100, 131 | elasticity[1] * 100, 132 | elasticity[2] * 100 133 | ) 134 | plt.title( 135 | "Weeks=%d,WeeklyDemand=%.1f,Elasticity=[%d,%d,%d]" % tup, 136 | fontsize=10 137 | ) 138 | plt.legend(loc="upper right") 139 | file_name = str(Path.home()) + ("/wks=%d&dem=%d&el=%d-%d-%d.png" % tup) 140 | print("Created png file: " + file_name) 141 | plt.savefig(file_name) 142 | plt.close() 143 | 144 | 145 | if __name__ == '__main__': 146 | ts: int = 8 # time steps 147 | ii: int = 12 # initial inventory 148 | bd: float = 1.0 # base demand 149 | this_el: List[Tuple[float, float]] = [ 150 | (0.3, 0.5), (0.5, 1.1), (0.7, 1.4) 151 | ] 152 | # bdp = get_clearance_backward_dp(ts, ii, bd, this_el) 153 | # 154 | # for i in range(ts): 155 | # print([(x, y) for x, (y, _) in bdp.vf_and_policy[i].items()]) 156 | # for i in range(ts): 157 | # print([(x, z) for x, (_, z) in bdp.vf_and_policy[i].items()]) 158 | 159 | traces = 10000 160 | per = get_performance(ts, ii, bd, this_el, traces) 161 | pprint(per) 162 | 163 | # ts: int = 8 # time steps 164 | # bd: float = 1.0 # base demand 165 | # invs: Sequence[int] = list(range(2, 30, 2)) 166 | # 167 | # elasticities = [ 168 | # (0.1, 0.3, 0.5), 169 | # (0.3, 0.7, 1.0), 170 | # (0.5, 0.8, 1.1), 171 | # (0.7, 1.2, 1.5), 172 | # (0.8, 1.3, 1.7), 173 | # (1.0, 1.5, 2.0), 174 | # (1.0, 2.0, 2.5), 175 | # (1.5, 2.5, 3.5), 176 | # (2.0, 4.0, 6.0) 177 | # ] 178 | # for els in elasticities: 179 | # graph_perf(ts, bd, invs, els) 180 | -------------------------------------------------------------------------------- /src/processes/mdp.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Tuple, Generic 2 | from utils.gen_utils import zip_dict_of_tuple, is_approx_eq 3 | from processes.mp_funcs import 
get_all_states, get_actions_for_states 4 | from processes.mp_funcs import verify_mdp, get_lean_transitions 5 | from processes.policy import Policy 6 | from processes.det_policy import DetPolicy 7 | from processes.mp_funcs import mdp_rep_to_mrp_rep1, mdp_rep_to_mrp_rep2 8 | from operator import itemgetter 9 | from processes.mrp import MRP 10 | from processes.mp_funcs import get_rv_gen_func 11 | from processes.mdp_rep_for_adp import MDPRepForADP 12 | from utils.generic_typevars import S, A 13 | 14 | 15 | class MDP(Generic[S, A]): 16 | 17 | def __init__( 18 | self, 19 | info: Mapping[S, Mapping[A, Tuple[Mapping[S, float], float]]], 20 | gamma: float 21 | ) -> None: 22 | if verify_mdp(info): 23 | d = {k: zip_dict_of_tuple(v) for k, v in info.items()} 24 | d1, d2 = zip_dict_of_tuple(d) 25 | self.all_states: Set[S] = get_all_states(info) 26 | self.state_action_dict: Mapping[S, Set[A]] = \ 27 | get_actions_for_states(info) 28 | self.transitions: Mapping[S, Mapping[A, Mapping[S, float]]] = \ 29 | {s: {a: get_lean_transitions(v1) for a, v1 in v.items()} 30 | for s, v in d1.items()} 31 | self.rewards: Mapping[S, Mapping[A, float]] = d2 32 | self.gamma: float = gamma 33 | self.terminal_states: Set[S] = self.get_terminal_states() 34 | else: 35 | raise ValueError 36 | 37 | def get_sink_states(self) -> Set[S]: 38 | return {k for k, v in self.transitions.items() if 39 | all(len(v1) == 1 and k in v1.keys() for _, v1 in v.items()) 40 | } 41 | 42 | def get_terminal_states(self) -> Set[S]: 43 | """ 44 | A terminal state is a sink state (100% probability to going back 45 | to itself, FOR EACH ACTION) and the rewards on those transitions back 46 | to itself are zero. 47 | """ 48 | sink = self.get_sink_states() 49 | return {s for s in sink if 50 | all(is_approx_eq(r, 0.0) for _, r in self.rewards[s].items())} 51 | 52 | def get_mrp(self, pol: Policy) -> MRP: 53 | tr = mdp_rep_to_mrp_rep1(self.transitions, pol.policy_data) 54 | rew = mdp_rep_to_mrp_rep2(self.rewards, pol.policy_data) 55 | return MRP({s: (v, rew[s]) for s, v in tr.items()}, self.gamma) 56 | 57 | def get_value_func_dict(self, pol: Policy)\ 58 | -> Mapping[S, float]: 59 | mrp_obj = self.get_mrp(pol) 60 | value_func_vec = mrp_obj.get_value_func_vec() 61 | nt_vf = {mrp_obj.nt_states_list[i]: value_func_vec[i] 62 | for i in range(len(mrp_obj.nt_states_list))} 63 | t_vf = {s: 0. for s in self.terminal_states} 64 | return {**nt_vf, **t_vf} 65 | 66 | def get_act_value_func_dict(self, pol: Policy)\ 67 | -> Mapping[S, Mapping[A, float]]: 68 | v_dict = self.get_value_func_dict(pol) 69 | return {s: {a: r + self.gamma * sum(p * v_dict[s1] for s1, p in 70 | self.transitions[s][a].items()) 71 | for a, r in v.items()} 72 | for s, v in self.rewards.items()} 73 | 74 | def get_improved_policy(self, pol: Policy) -> DetPolicy: 75 | q_dict = self.get_act_value_func_dict(pol) 76 | return DetPolicy({s: max(v.items(), key=itemgetter(1))[0] 77 | for s, v in q_dict.items()}) 78 | 79 | def get_optimal_policy(self, tol=1e-4) -> DetPolicy: 80 | pol = Policy({s: {a: 1. 
/ len(v) for a in v} for s, v in 81 | self.state_action_dict.items()}) 82 | vf = self.get_value_func_dict(pol) 83 | epsilon = tol * 1e4 84 | while epsilon >= tol: 85 | pol = self.get_improved_policy(pol) 86 | new_vf = self.get_value_func_dict(pol) 87 | epsilon = max(abs(new_vf[s] - v) for s, v in vf.items()) 88 | vf = new_vf 89 | return pol 90 | 91 | def get_mdp_rep_for_adp(self) -> MDPRepForADP: 92 | return MDPRepForADP( 93 | state_action_func=lambda s: self.state_action_dict[s], 94 | gamma=self.gamma, 95 | sample_states_gen_func=get_rv_gen_func( 96 | {s: 1. / len(self.state_action_dict) for s in 97 | self.state_action_dict.keys()} 98 | ), 99 | reward_func=lambda s, a: self.rewards[s][a], 100 | transitions_func=lambda s, a: self.transitions[s][a] 101 | ) 102 | 103 | 104 | if __name__ == '__main__': 105 | data = { 106 | 1: { 107 | 'a': ({1: 0.3, 2: 0.6, 3: 0.1}, 5.0), 108 | 'b': ({2: 0.3, 3: 0.7}, 2.8), 109 | 'c': ({1: 0.2, 2: 0.4, 3: 0.4}, -7.2) 110 | }, 111 | 2: { 112 | 'a': ({1: 0.3, 2: 0.6, 3: 0.1}, 5.0), 113 | 'c': ({1: 0.2, 2: 0.4, 3: 0.4}, -7.2) 114 | }, 115 | 3: { 116 | 'a': ({3: 1.0}, 0.0), 117 | 'b': ({3: 1.0}, 0.0) 118 | } 119 | } 120 | mdp_obj = MDP(data, 0.95) 121 | print(mdp_obj.all_states) 122 | print(mdp_obj.transitions) 123 | print(mdp_obj.rewards) 124 | terminal = mdp_obj.get_terminal_states() 125 | print(terminal) 126 | policy_data = { 127 | 1: {'a': 0.4, 'b': 0.6}, 128 | 2: {'a': 0.7, 'c': 0.3}, 129 | 3: {'b': 1.0} 130 | } 131 | pol_obj = Policy(policy_data) 132 | mdp_data = { 133 | 1: { 134 | 'a': ({1: 0.2, 2: 0.6, 3: 0.2}, 7.0), 135 | 'b': ({1: 0.6, 2: 0.3, 3: 0.1}, -2.0), 136 | 'c': ({1: 0.1, 2: 0.2, 3: 0.7}, 10.0) 137 | }, 138 | 2: { 139 | 'a': ({1: 0.1, 2: 0.6, 3: 0.3}, 1.0), 140 | 'c': ({1: 0.6, 2: 0.2, 3: 0.2}, -1.2) 141 | }, 142 | 3: { 143 | 'b': ({3: 1.0}, 0.0) 144 | } 145 | } 146 | mdp1_obj = MDP(mdp_data, gamma=0.9) 147 | mrp1_obj = mdp1_obj.get_mrp(pol_obj) 148 | print(mrp1_obj.transitions) 149 | print(mrp1_obj.rewards) 150 | print(mrp1_obj.trans_matrix) 151 | print(mrp1_obj.rewards_vec) 152 | print(mrp1_obj.get_value_func_vec()) 153 | opt_policy = mdp1_obj.get_optimal_policy() 154 | print(opt_policy.policy_data) 155 | opt_vf_dict = mdp1_obj.get_value_func_dict(opt_policy) 156 | print(opt_vf_dict) 157 | -------------------------------------------------------------------------------- /src/examples/american_pricing/vanilla_american_test.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Any 2 | import numpy as np 3 | from algorithms.td_algo_enum import TDAlgorithm 4 | from numpy.polynomial.laguerre import lagval 5 | from examples.american_pricing.american_pricing import AmericanPricing 6 | from examples.american_pricing.grid_pricing import GridPricing 7 | from src.examples.american_pricing.num_utils import get_future_price_mean_var 8 | 9 | LARGENUM = 1e8 10 | 11 | 12 | # noinspection PyShadowingNames 13 | def get_vanilla_american_price( 14 | is_call: bool, 15 | spot_price: float, 16 | strike: float, 17 | expiry: float, 18 | lognormal: bool, 19 | r: float, 20 | sigma: float, 21 | num_dt: int, 22 | num_paths: int, 23 | num_laguerre: int, 24 | params_bag: Mapping[str, Any] 25 | ) -> Mapping[str, float]: 26 | opt_payoff = lambda _, x, is_call=is_call, strike=strike:\ 27 | max(x - strike, 0.) if is_call else max(strike - x, 0.) 
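    # The code below prices the same vanilla American option three ways:
    # a grid-based backward-induction price (GridPricing), a Longstaff-Schwartz
    # least-squares price with Laguerre-polynomial features (get_ls_price), and
    # an RL function-approximation price (get_rl_fa_price) configured by params_bag.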
28 | # noinspection PyShadowingNames 29 | ir_func = lambda t, r=r: r * t 30 | isig_func = lambda t, sigma=sigma: sigma * sigma * t 31 | 32 | num_dx = 200 33 | expiry_mean, expiry_var = get_future_price_mean_var( 34 | spot_price, 35 | 0., 36 | expiry, 37 | lognormal, 38 | ir_func, 39 | isig_func 40 | ) 41 | grid_price = GridPricing( 42 | spot_price=spot_price, 43 | payoff=opt_payoff, 44 | expiry=expiry, 45 | lognormal=lognormal, 46 | ir=ir_func, 47 | isig=isig_func 48 | ).get_price( 49 | num_dt=num_dt, 50 | num_dx=num_dx, 51 | center=expiry_mean, 52 | width=np.sqrt(expiry_var) * 4 53 | ) 54 | 55 | gp = AmericanPricing( 56 | spot_price=spot_price, 57 | payoff=(lambda t, x, opt_payoff=opt_payoff: opt_payoff(t, x[-1])), 58 | expiry=expiry, 59 | lognormal=lognormal, 60 | ir=ir_func, 61 | isig=isig_func 62 | ) 63 | ident = np.eye(num_laguerre) 64 | 65 | # noinspection PyShadowingNames 66 | def laguerre_feature_func( 67 | x: float, 68 | i: int, 69 | ident=ident, 70 | strike=strike 71 | ) -> float: 72 | # noinspection PyTypeChecker 73 | xp = x / strike 74 | return np.exp(-xp / 2) * lagval(xp, ident[i]) 75 | 76 | ls_price = gp.get_ls_price( 77 | num_dt=num_dt, 78 | num_paths=num_paths, 79 | feature_funcs=[lambda _, x: 1.] + 80 | [(lambda _, x, i=i: laguerre_feature_func(x[-1], i)) for i in 81 | range(num_laguerre)] 82 | ) 83 | 84 | # noinspection PyShadowingNames 85 | def rl_feature_func( 86 | ind: int, 87 | x: float, 88 | a: bool, 89 | i: int, 90 | num_laguerre: int = num_laguerre, 91 | num_dt: int = num_dt, 92 | expiry: float = expiry 93 | ) -> float: 94 | dt = expiry / num_dt 95 | t = ind * dt 96 | if i < num_laguerre + 4: 97 | if ind < num_dt and not a: 98 | if i == 0: 99 | ret = 1. 100 | elif i < num_laguerre + 1: 101 | ret = laguerre_feature_func(x, i - 1) 102 | elif i == num_laguerre + 1: 103 | ret = np.sin(-t * np.pi / (2. * expiry) + np.pi / 2.) 104 | elif i == num_laguerre + 2: 105 | ret = np.log(expiry - t) 106 | else: 107 | rat = t / expiry 108 | ret = rat * rat 109 | else: 110 | ret = 0. 111 | else: 112 | if ind <= num_dt and a: 113 | ret = np.exp(-r * (ind * dt)) * opt_payoff(ind * dt, x) 114 | else: 115 | ret = 0. 
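        # Features 0 .. num_laguerre + 3 above are continuation features (a
        # constant, the Laguerre basis in x / strike, and time-to-expiry
        # features), zeroed out at exercise; the final feature is the
        # discounted exercise payoff, nonzero only when the exercise flag a is True.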
116 | 117 | return ret 118 | 119 | rl_price = gp.get_rl_fa_price( 120 | num_dt=num_dt, 121 | method=params_bag["method"], 122 | exploring_start=params_bag["exploring_start"], 123 | algorithm=params_bag["algorithm"], 124 | softmax=params_bag["softmax"], 125 | epsilon=params_bag["epsilon"], 126 | epsilon_half_life=params_bag["epsilon_half_life"], 127 | lambd=params_bag["lambda"], 128 | num_paths=num_paths, 129 | batch_size=params_bag["batch_size"], 130 | feature_funcs=[(lambda x, i=i: rl_feature_func( 131 | x[0][0], 132 | x[0][1][-1], 133 | x[1], 134 | i 135 | )) for i in range(num_laguerre + 5)], 136 | neurons=params_bag["neurons"], 137 | learning_rate=params_bag["learning_rate"], 138 | learning_rate_decay=params_bag["learning_rate_decay"], 139 | adam=params_bag["adam"], 140 | offline=params_bag["offline"] 141 | ) 142 | 143 | return { 144 | "Grid": grid_price, 145 | "LS": ls_price, 146 | "RL": rl_price 147 | } 148 | 149 | 150 | if __name__ == '__main__': 151 | is_call_val = False 152 | spot_price_val = 80.0 153 | strike_val = 75.0 154 | expiry_val = 2.0 155 | lognormal_val = True 156 | r_val = 0.02 157 | sigma_val = 0.25 158 | num_dt_val = 10 159 | num_paths_val = 1000000 160 | num_laguerre_val = 3 161 | 162 | params_bag_val = { 163 | "method": "LSPI", 164 | "exploring_start": False, 165 | "algorithm": TDAlgorithm.ExpectedSARSA, 166 | "softmax": False, 167 | "epsilon": 0.2, 168 | "epsilon_half_life": 100000, 169 | "batch_size": 10000, 170 | "neurons": None, 171 | "learning_rate": 0.03, 172 | "learning_rate_decay": 10000, 173 | "adam": (True, 0.9, 0.99), 174 | "lambda": 0.8, 175 | "offline": True, 176 | } 177 | 178 | am_prices = get_vanilla_american_price( 179 | is_call=is_call_val, 180 | spot_price=spot_price_val, 181 | strike=strike_val, 182 | expiry=expiry_val, 183 | lognormal=lognormal_val, 184 | r=r_val, 185 | sigma=sigma_val, 186 | num_dt=num_dt_val, 187 | num_paths=num_paths_val, 188 | num_laguerre=num_laguerre_val, 189 | params_bag=params_bag_val 190 | ) 191 | print(am_prices) 192 | print(params_bag_val) 193 | from examples.american_pricing.bs_pricing import EuropeanBSPricing 194 | 195 | ebsp = EuropeanBSPricing( 196 | is_call=is_call_val, 197 | spot_price=spot_price_val, 198 | strike=strike_val, 199 | expiry=expiry_val, 200 | r=r_val, 201 | sigma=sigma_val 202 | ) 203 | print(ebsp.option_price) 204 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MDP-DP-RL 2 | 3 | The goal of this project was to develop all Dynamic Programming and Reinforcement Learning algorithms 4 | from scratch (i.e., with no use of standard libraries, except for basic numpy and scipy tools). The 5 | "develop from scratch" goal was motivated by educational purposes - students learning this topic 6 | can understand the concepts throroughly only when they develop and work with code developed from 7 | scratch. I teach courses on this topic to a variety of student backgrounds, and each such course 8 | is big on precise programming implementations of the techniques/algorithms. In particular, I 9 | use this codebase when I teach Stanford CME 241: Reinforcement Learning for Stochastic 10 | Control Problems in Finance (http://cme241.stanford.edu). 
11 | 12 | Any feedback on code readability, performance and bugs will be greatly appreciated, as the code 13 | is still fairly raw and untested in various parts (I started working on this code in August 2018, 14 | and have mainly been in code-growth mode so far). 15 | 16 | The project started by implementing the foundational data structures for finite Markov Processes 17 | (a.k.a. Markov Chains), Markov Reward Processes (MRP), and Markov Decision Processes (MDP). This was followed by 18 | Dynamic Programming (DP) algorithms, where the focus was to represent the Bellman equations in clear mathematical 19 | terms within the code. Next was the core educational material of Reinforcement Learning, implementing 20 | the Generalized Policy Iteration algorithms based on simulations (Monte Carlo and Temporal Difference, 21 | including eligibility traces). However, the emphasis was to first implement the tabular methods so that 22 | one can work with actual data structures (finite, hence tabular), rather than functions, to represent 23 | MDP rewards and transition specifications as well as value functions and policies. Once the tabular RL 24 | methods were implemented, it was straightforward to write the same algorithms as function approximation-based 25 | algorithms. However, this required a detour to build some foundation for function approximation. I chose 26 | to implement linear and deep neural network approximations, both of which require a specification of 27 | feature functions. Backpropagation was developed from scratch, again for educational purposes. On a whim, 28 | I also implemented Approximate Dynamic Programming (ADP) algorithms, which are basically the same old 29 | Policy Iteration and Value Iteration algorithms, but now using the output of function approximation for 30 | the right-hand side of the Bellman update, and using the updated values as training data for gradient 31 | descent on the parameters of the function approximation. So far, I am finding ADP 32 | to be the most valuable algorithm for the MDP problems I typically work with. I am a bit surprised that the 33 | "literature" focuses so much on model-free methods, whereas I often know the model for many of the MDPs I work on, 34 | and so, ADP is ideal. 35 | 36 | I have chosen Python 3 as the language, mainly because I can't expect my students to have expertise in 37 | the potentially more-appropriate languages for this project, such as Scala, OCaml and Haskell. These are 38 | functional programming languages, and this topic/project is best done through a tasteful application of 39 | Functional Programming. But Python 3 is not such a bad choice, as functions are first-class entities. My core 40 | technique in this project is indeed Functional Programming, but I had to be very careful in getting around 41 | Python's "naughty" handling of function closures. I have also made heavy use of classes and TypeVars. 42 | Object-oriented polymorphism as well as type-parametrized polymorphism enabled me to cover a wide range of 43 | algorithms with plenty of common code. Python 3 also provided me the benefit of type annotations, which I 44 | have taken heavy advantage of in this project. Type annotation support turned out to be extremely valuable 45 | in the project, as my IDE (PyCharm) caught a lot of errors/warnings statically; in fact, as I was typing code, 46 | it would spot errors.
More importantly, type annotations make the interfaces very clear, and I believe any 47 | sort of mathematical programming needs a strong style of type annotations (if not static typing). 48 | 49 | This is how the modules of the project are organized. 50 | 51 | processes: All about Markov Processes, MRP, MDP and classes that serve as minimal but complete representations 52 | of an MDP for specific classes of algorithms, e.g., a representation for tabular RL, a representation for function 53 | approximation RL, and a representation for ADP. A lot of the heavy lifting is done in the "helper" sub-module 54 | mp_funcs.py. 55 | 56 | func_approx: Linear and Deep Neural Network (DNN) function approximation. Implements function evaluation (forward 57 | propagation for DNN) and gradient calculation/gradient descent (backward propagation for DNN) using ADAM. Took 58 | advantage of numpy vectors, matrices and tensors, and of computing efficiently with them. 59 | 60 | algorithms: Within this, we have the modules dp (for Dynamic Programming), adp (for Approximate Dynamic Programming), 61 | rl_tabular (for Tabular RL - Monte Carlo, SARSA, Q-Learning, Expected SARSA), and rl_func_approx (for Function 62 | Approximation RL - the same algorithms as Tabular RL). Note that I have implemented TD(0) and TD(Lambda) separately 63 | for both Tabular RL and Function Approximation RL, although TD(0) is a special case of TD(Lambda). TD(0) was 64 | implemented separately for the usual reason in this project - I find it easy to introduce a special case (in 65 | this case TD(0)) for pedagogical reasons, and showing students TD(0) as a special case with simpler/lighter 66 | code that focuses on the concept (versus the complication of eligibility traces) is quite beneficial. This is the 67 | same reason I implemented Tabular RL separately (Tabular is a special case of Linear Function Approximation where the features 68 | are indicator functions, one for each of the states/state-action pairs). Note the deep object-oriented inheritance 69 | hierarchy - rooted at the abstract base class OptBase. Note also that a lot of heavy lifting happens in the 70 | module helper_funcs.py. A couple of semi-advanced algorithms such as LSTD/LSPI and Policy Gradient are also implemented here (LSPI provides batch efficiency, and Policy Gradient is valuable when the action space is large/continuous). Some special but highly useful model-based algorithms such as Backward Induction (backward_dp.py) and Adaptive Multistage Sampling (ams.py) have also been implemented. 71 | 72 | examples: Implemented a few common examples of problems that are ideal for RL: Windy Grid, Inventory Control. For http://cme241.stanford.edu, I have also implemented initial versions of two important and interesting finance problems that can be solved by modeling them as MDPs and solving with DP/RL: 1) Optimal Asset-Allocation and Consumption when managing a portfolio of risky assets and 1 riskless asset, 2) Optimal Exercise of American Options when the option payoff is either path-dependent or the state space of the option is high-dimensional. 73 | 74 | utils: Some generic utility functions to transform data structures.
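
As a quick orientation, here is a minimal usage sketch, adapted from the __main__ blocks of processes/mdp_refined.py and algorithms/rl_tabular/td0.py (it assumes the src directory is on the Python path): specify the MDP as a nested dictionary mapping state -> action -> next-state -> (probability, reward), wrap it in the representation the algorithm expects, and run the algorithm.

```python
from processes.mdp_refined import MDPRefined
from algorithms.td_algo_enum import TDAlgorithm
from algorithms.rl_tabular.td0 import TD0

# state -> action -> next-state -> (transition probability, reward)
mdp_data = {
    1: {
        'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)},
        'b': {2: (0.3, -0.5), 3: (0.7, 2.6)},
        'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)}
    },
    2: {
        'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)},
        'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)}
    },
    3: {
        'a': {3: (1.0, 0.0)},  # state 3 is terminal: it only loops back with zero reward
        'b': {3: (1.0, 0.0)}
    }
}
mdp = MDPRefined(mdp_data, gamma=1.0)
mdp_rep = mdp.get_mdp_rep_for_rl_tabular()  # simulation-based representation for tabular RL

td0 = TD0(
    mdp_rep,
    exploring_start=False,
    algorithm=TDAlgorithm.ExpectedSARSA,
    softmax=False,
    epsilon=0.1,
    epsilon_half_life=1000,
    learning_rate=0.1,
    learning_rate_decay=1e6,
    num_episodes=10000,
    max_steps=1000
)
opt_pol = td0.get_optimal_det_policy()  # deterministic policy found by TD(0) control
print(opt_pol)
```

The ADP, function-approximation RL and policy-gradient algorithms follow the same pattern, with the other get_mdp_rep_for_* methods of MDP/MDPRefined providing the corresponding problem representations.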
75 | 76 | -------------------------------------------------------------------------------- /src/processes/mdp_refined.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import numpy as np 4 | 5 | from processes.mdp import MDP 6 | from processes.mdp_rep_for_adp_pg import MDPRepForADPPG 7 | from processes.mdp_rep_for_rl_pg import MDPRepForRLPG 8 | from processes.mdp_rep_for_rl_tabular import MDPRepForRLTabular 9 | from processes.mp_funcs import flatten_sasf_dict 10 | from processes.mp_funcs import flatten_ssf_dict 11 | from processes.mp_funcs import get_rv_gen_func 12 | from processes.mp_funcs import get_rv_gen_func_single 13 | from processes.mp_funcs import get_sampling_func_from_prob_dict 14 | from processes.mp_funcs import get_state_reward_gen_dict 15 | from processes.mp_funcs import get_state_reward_gen_func 16 | from processes.mp_funcs import mdp_rep_to_mrp_rep1 17 | from processes.mp_funcs import unflatten_sasf_dict 18 | from processes.mp_funcs import unflatten_ssf_dict 19 | from processes.mrp_refined import MRPRefined 20 | from processes.policy import Policy 21 | from utils.gen_utils import zip_dict_of_tuple, merge_dicts 22 | from utils.standard_typevars import SASf, SAf, SASTff 23 | 24 | 25 | class MDPRefined(MDP): 26 | 27 | def __init__( 28 | self, 29 | info: SASTff, 30 | gamma: float 31 | ) -> None: 32 | d1, d2, d3 = MDPRefined.split_info(info) 33 | super().__init__( 34 | {s: {a: (v1, d3[s][a]) for a, v1 in v.items()} 35 | for s, v in d1.items()}, 36 | gamma 37 | ) 38 | self.rewards_refined: SASf = d2 39 | 40 | @staticmethod 41 | def split_info(info: SASTff) -> Tuple[SASf, SASf, SAf]: 42 | c = {s: {a: zip_dict_of_tuple(v1) for a, v1 in v.items()} 43 | for s, v in info.items()} 44 | d = {k: zip_dict_of_tuple(v) for k, v in c.items()} 45 | d1, d2 = zip_dict_of_tuple(d) 46 | d3 = {s: {a: sum(np.prod(x) for x in v1.values()) 47 | for a, v1 in v.items()} for s, v in info.items()} 48 | return d1, d2, d3 49 | 50 | def get_mrp_refined(self, pol: Policy) -> MRPRefined: 51 | flat_transitions = flatten_sasf_dict(self.transitions) 52 | flat_rewards_refined = flatten_sasf_dict(self.rewards_refined) 53 | 54 | flat_exp_rewards = merge_dicts(flat_rewards_refined, flat_transitions, lambda x, y: x * y) 55 | exp_rewards = unflatten_sasf_dict(flat_exp_rewards) 56 | 57 | tr = mdp_rep_to_mrp_rep1(self.transitions, pol.policy_data) 58 | rew_ref = mdp_rep_to_mrp_rep1( 59 | exp_rewards, 60 | pol.policy_data 61 | ) 62 | flat_tr = flatten_ssf_dict(tr) 63 | flat_rew_ref = flatten_ssf_dict(rew_ref) 64 | flat_norm_rewards = merge_dicts(flat_rew_ref, flat_tr, lambda x, y: x / y) 65 | norm_rewards = unflatten_ssf_dict(flat_norm_rewards) 66 | 67 | return MRPRefined( 68 | {s: {s1: (v1, norm_rewards[s][s1]) for s1, v1 in v.items()} 69 | for s, v in tr.items()}, 70 | self.gamma 71 | ) 72 | 73 | def get_mdp_rep_for_rl_tabular(self) -> MDPRepForRLTabular: 74 | return MDPRepForRLTabular( 75 | state_action_dict=self.state_action_dict, 76 | terminal_states=self.terminal_states, 77 | state_reward_gen_dict=get_state_reward_gen_dict( 78 | self.transitions, 79 | self.rewards_refined 80 | ), 81 | gamma=self.gamma 82 | ) 83 | 84 | def get_mdp_rep_for_adp_pg(self) -> MDPRepForADPPG: 85 | return MDPRepForADPPG( 86 | gamma=self.gamma, 87 | init_states_gen_func=get_rv_gen_func( 88 | {s: 1. 
/ len(self.state_action_dict) for s in 89 | self.state_action_dict.keys()} 90 | ), 91 | state_reward_gen_func=lambda s, a, n: 92 | [(s1, self.rewards_refined[s][a][s1]) for s1 in 93 | get_sampling_func_from_prob_dict(self.transitions[s][a])(n)], 94 | # reward_func=lambda s, a: self.rewards[s][a], 95 | # transitions_func=lambda s, a: self.transitions[s][a], 96 | terminal_state_func=lambda s: s in self.terminal_states 97 | ) 98 | 99 | def get_mdp_rep_for_rl_pg(self) -> MDPRepForRLPG: 100 | return MDPRepForRLPG( 101 | gamma=self.gamma, 102 | init_state_gen_func=get_rv_gen_func_single( 103 | {s: 1. / len(self.state_action_dict) for s in 104 | self.state_action_dict.keys()} 105 | ), 106 | state_reward_gen_func=lambda s, a: get_state_reward_gen_func( 107 | self.transitions[s][a], 108 | self.rewards_refined[s][a], 109 | )(), 110 | terminal_state_func=lambda s: s in self.terminal_states, 111 | ) 112 | 113 | 114 | if __name__ == '__main__': 115 | # data = { 116 | # 1: { 117 | # 'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)}, 118 | # 'b': {2: (0.3, -0.5), 3: (0.7, 2.6)}, 119 | # 'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)} 120 | # }, 121 | # 2: { 122 | # 'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)}, 123 | # 'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)} 124 | # }, 125 | # 3: { 126 | # 'a': {3: (1.0, 0.0)}, 127 | # 'b': {3: (1.0, 0.0)} 128 | # } 129 | # } 130 | # mdp_refined_obj = MDPRefined(data, 0.95) 131 | # print(mdp_refined_obj.all_states) 132 | # print(mdp_refined_obj.transitions) 133 | # print(mdp_refined_obj.rewards) 134 | # print(mdp_refined_obj.rewards_refined) 135 | # terminal = mdp_refined_obj.get_terminal_states() 136 | # print(terminal) 137 | 138 | print("This is MDPRefined") 139 | mdp_refined_data = { 140 | 1: { 141 | 'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)}, 142 | 'b': {2: (0.3, -0.5), 3: (0.7, 2.6)}, 143 | 'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)} 144 | }, 145 | 2: { 146 | 'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)}, 147 | 'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)} 148 | }, 149 | 3: { 150 | 'a': {3: (1.0, 0.0)}, 151 | 'b': {3: (1.0, 0.0)} 152 | } 153 | } 154 | mdp_refined_obj = MDPRefined(mdp_refined_data, 0.97) 155 | print("Transitions") 156 | print(mdp_refined_obj.transitions) 157 | print("Rewards Refined") 158 | print(mdp_refined_obj.rewards_refined) 159 | 160 | print("----------------") 161 | print("This is the Policy") 162 | policy_data = { 163 | 1: {'a': 0.4, 'b': 0.6}, 164 | 2: {'a': 0.7, 'c': 0.3}, 165 | 3: {'b': 1.0} 166 | } 167 | pol_obj = Policy(policy_data) 168 | print(pol_obj.policy_data) 169 | 170 | print("----------------") 171 | print("This is MRPRefined") 172 | mrp_refined_obj = mdp_refined_obj.get_mrp_refined(pol_obj) 173 | print("Transitions") 174 | print(mrp_refined_obj.transitions) 175 | print("Rewards Refined") 176 | print(mrp_refined_obj.rewards_refined) 177 | 178 | print("-----------------") 179 | print("This is MDP") 180 | print("Rewards") 181 | print(mdp_refined_obj.rewards) 182 | 183 | print("-----------------") 184 | print("This is MRP from MDP") 185 | mrp_obj1 = mdp_refined_obj.get_mrp(pol_obj) 186 | print("Rewards") 187 | print(mrp_obj1.rewards) 188 | 189 | print("---------------") 190 | print("This is MRP from MRPRefined") 191 | print("Rewards") 192 | print(mrp_refined_obj.rewards) 193 | 194 | 195 | -------------------------------------------------------------------------------- /src/algorithms/rl_tabular/td0.py: -------------------------------------------------------------------------------- 1 | 
from typing import Optional 2 | from algorithms.td_algo_enum import TDAlgorithm 3 | from algorithms.rl_tabular.rl_tabular_base import RLTabularBase 4 | from processes.policy import Policy 5 | from processes.mp_funcs import get_rv_gen_func_single 6 | from processes.mdp_rep_for_rl_tabular import MDPRepForRLTabular 7 | from processes.mp_funcs import get_expected_action_value 8 | from utils.standard_typevars import VFDictType, QFDictType 9 | 10 | 11 | class TD0(RLTabularBase): 12 | 13 | def __init__( 14 | self, 15 | mdp_rep_for_rl: MDPRepForRLTabular, 16 | exploring_start: bool, 17 | algorithm: TDAlgorithm, 18 | softmax: bool, 19 | epsilon: float, 20 | epsilon_half_life: float, 21 | learning_rate: float, 22 | learning_rate_decay: float, 23 | num_episodes: int, 24 | max_steps: int 25 | ) -> None: 26 | 27 | super().__init__( 28 | mdp_rep_for_rl=mdp_rep_for_rl, 29 | exploring_start=exploring_start, 30 | softmax=softmax, 31 | epsilon=epsilon, 32 | epsilon_half_life=epsilon_half_life, 33 | num_episodes=num_episodes, 34 | max_steps=max_steps 35 | ) 36 | self.algorithm: TDAlgorithm = algorithm 37 | self.learning_rate: float = learning_rate 38 | self.learning_rate_decay: Optional[float] = learning_rate_decay 39 | 40 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 41 | sa_dict = self.mdp_rep.state_action_dict 42 | vf_dict = {s: 0.0 for s in sa_dict.keys()} 43 | act_gen_dict = {s: get_rv_gen_func_single(pol.get_state_probabilities(s)) 44 | for s in sa_dict.keys()} 45 | episodes = 0 46 | updates = 0 47 | 48 | while episodes < self.num_episodes: 49 | state = self.mdp_rep.init_state_gen() 50 | steps = 0 51 | terminate = False 52 | 53 | while not terminate: 54 | action = act_gen_dict[state]() 55 | next_state, reward = \ 56 | self.mdp_rep.state_reward_gen_dict[state][action]() 57 | vf_dict[state] += self.learning_rate *\ 58 | (updates / self.learning_rate_decay + 1) ** -0.5 *\ 59 | (reward + self.mdp_rep.gamma * vf_dict[next_state] - 60 | vf_dict[state]) 61 | updates += 1 62 | steps += 1 63 | terminate = steps >= self.max_steps or \ 64 | state in self.mdp_rep.terminal_states 65 | state = next_state 66 | 67 | episodes += 1 68 | 69 | return vf_dict 70 | 71 | def get_qv_func_dict(self, pol: Optional[Policy]) -> QFDictType: 72 | control = pol is None 73 | this_pol = pol if pol is not None else self.get_init_policy() 74 | sa_dict = self.mdp_rep.state_action_dict 75 | qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()} 76 | episodes = 0 77 | updates = 0 78 | 79 | while episodes < self.num_episodes: 80 | if self.exploring_start: 81 | state, action = self.mdp_rep.init_state_action_gen() 82 | else: 83 | state = self.mdp_rep.init_state_gen() 84 | action = get_rv_gen_func_single( 85 | this_pol.get_state_probabilities(state) 86 | )() 87 | steps = 0 88 | terminate = False 89 | 90 | while not terminate: 91 | next_state, reward = \ 92 | self.mdp_rep.state_reward_gen_dict[state][action]() 93 | next_action = get_rv_gen_func_single( 94 | this_pol.get_state_probabilities(next_state) 95 | )() 96 | if self.algorithm == TDAlgorithm.QLearning and control: 97 | next_qv = max(qf_dict[next_state][a] for a in 98 | qf_dict[next_state]) 99 | elif self.algorithm == TDAlgorithm.ExpectedSARSA and control: 100 | # next_qv = sum(this_pol.get_state_action_probability( 101 | # next_state, 102 | # a 103 | # ) * qf_dict[next_state][a] for a in qf_dict[next_state]) 104 | next_qv = get_expected_action_value( 105 | qf_dict[next_state], 106 | self.softmax, 107 | self.epsilon_func(episodes) 108 | ) 109 | else: 110 | next_qv = 
qf_dict[next_state][next_action] 111 | 112 | qf_dict[state][action] += self.learning_rate *\ 113 | (updates / self.learning_rate_decay + 1) ** -0.5 *\ 114 | (reward + self.mdp_rep.gamma * next_qv - 115 | qf_dict[state][action]) 116 | updates += 1 117 | if control: 118 | if self.softmax: 119 | this_pol.edit_state_action_to_softmax( 120 | state, 121 | qf_dict[state] 122 | ) 123 | else: 124 | this_pol.edit_state_action_to_epsilon_greedy( 125 | state, 126 | qf_dict[state], 127 | self.epsilon_func(episodes) 128 | ) 129 | steps += 1 130 | terminate = steps >= self.max_steps or \ 131 | state in self.mdp_rep.terminal_states 132 | state = next_state 133 | action = next_action 134 | 135 | episodes += 1 136 | 137 | return qf_dict 138 | 139 | 140 | if __name__ == '__main__': 141 | from processes.mdp_refined import MDPRefined 142 | mdp_refined_data = { 143 | 1: { 144 | 'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)}, 145 | 'b': {2: (0.3, -0.5), 3: (0.7, 2.6)}, 146 | 'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)} 147 | }, 148 | 2: { 149 | 'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)}, 150 | 'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)} 151 | }, 152 | 3: { 153 | 'a': {3: (1.0, 0.0)}, 154 | 'b': {3: (1.0, 0.0)} 155 | } 156 | } 157 | gamma_val = 1.0 158 | mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val) 159 | mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular() 160 | 161 | exploring_start_val = False 162 | algorithm_type = TDAlgorithm.ExpectedSARSA 163 | softmax_flag = False 164 | epsilon_val = 0.1 165 | epsilon_half_life_val = 1000 166 | learning_rate_val = 0.1 167 | learning_rate_decay_val = 1e6 168 | episodes_limit = 10000 169 | max_steps_val = 1000 170 | sarsa_obj = TD0( 171 | mdp_rep_obj, 172 | exploring_start_val, 173 | algorithm_type, 174 | softmax_flag, 175 | epsilon_val, 176 | epsilon_half_life_val, 177 | learning_rate_val, 178 | learning_rate_decay_val, 179 | episodes_limit, 180 | max_steps_val 181 | ) 182 | 183 | policy_data = { 184 | 1: {'a': 0.4, 'b': 0.6}, 185 | 2: {'a': 0.7, 'c': 0.3}, 186 | 3: {'b': 1.0} 187 | } 188 | pol_obj = Policy(policy_data) 189 | 190 | this_qf_dict = sarsa_obj.get_act_value_func_dict(pol_obj) 191 | print(this_qf_dict) 192 | this_vf_dict = sarsa_obj.get_value_func_dict(pol_obj) 193 | print(this_vf_dict) 194 | 195 | opt_pol = sarsa_obj.get_optimal_det_policy() 196 | print(opt_pol) 197 | opt_vf_dict = sarsa_obj.get_value_func_dict(opt_pol) 198 | print(opt_vf_dict) 199 | -------------------------------------------------------------------------------- /src/algorithms/rl_tabular/monte_carlo.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple, Sequence 2 | from algorithms.rl_tabular.rl_tabular_base import RLTabularBase 3 | from processes.policy import Policy 4 | from processes.mp_funcs import get_rv_gen_func_single 5 | from processes.mdp_rep_for_rl_tabular import MDPRepForRLTabular 6 | from algorithms.helper_funcs import get_returns_from_rewards_terminating 7 | from algorithms.helper_funcs import get_returns_from_rewards_non_terminating 8 | from algorithms.helper_funcs import get_soft_policy_from_qf_dict 9 | from algorithms.helper_funcs import get_nt_return_eval_steps 10 | import numpy as np 11 | from utils.generic_typevars import S, A 12 | from utils.standard_typevars import VFDictType, QFDictType 13 | 14 | 15 | class MonteCarlo(RLTabularBase): 16 | 17 | def __init__( 18 | self, 19 | mdp_rep_for_rl: MDPRepForRLTabular, 20 | exploring_start: bool, 21 | first_visit: bool, 22 | 
softmax: bool, 23 | epsilon: float, 24 | epsilon_half_life: float, 25 | num_episodes: int, 26 | max_steps: int 27 | ) -> None: 28 | 29 | super().__init__( 30 | mdp_rep_for_rl=mdp_rep_for_rl, 31 | exploring_start=exploring_start, 32 | softmax=softmax, 33 | epsilon=epsilon, 34 | epsilon_half_life=epsilon_half_life, 35 | num_episodes=num_episodes, 36 | max_steps=max_steps 37 | ) 38 | self.first_visit: bool = first_visit 39 | self.nt_return_eval_steps = get_nt_return_eval_steps( 40 | max_steps, 41 | mdp_rep_for_rl.gamma, 42 | 1e-4 43 | ) 44 | 45 | def get_mc_path( 46 | self, 47 | pol: Policy, 48 | start_state: S, 49 | start_action: Optional[A] = None, 50 | ) -> Sequence[Tuple[S, A, float, bool]]: 51 | 52 | res = [] 53 | state = start_state 54 | steps = 0 55 | terminate = False 56 | occ_states = set() 57 | act_gen_dict = {s: get_rv_gen_func_single(pol.get_state_probabilities(s)) 58 | for s in self.mdp_rep.state_action_dict.keys()} 59 | 60 | while not terminate: 61 | first = state not in occ_states 62 | occ_states.add(state) 63 | action = act_gen_dict[state]()\ 64 | if (steps > 0 or start_action is None) else start_action 65 | next_state, reward =\ 66 | self.mdp_rep.state_reward_gen_dict[state][action]() 67 | res.append((state, action, reward, first)) 68 | steps += 1 69 | terminate = steps >= self.max_steps or\ 70 | state in self.mdp_rep.terminal_states 71 | state = next_state 72 | return res 73 | 74 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 75 | sa_dict = self.mdp_rep.state_action_dict 76 | counts_dict = {s: 0 for s in sa_dict.keys()} 77 | vf_dict = {s: 0.0 for s in sa_dict.keys()} 78 | episodes = 0 79 | 80 | while episodes < self.num_episodes: 81 | start_state = self.mdp_rep.init_state_gen() 82 | mc_path = self.get_mc_path( 83 | pol, 84 | start_state, 85 | start_action=None 86 | ) 87 | 88 | rew_arr = np.array([x for _, _, x, _ in mc_path]) 89 | if mc_path[-1][0] in self.mdp_rep.terminal_states: 90 | returns = get_returns_from_rewards_terminating( 91 | rew_arr, 92 | self.mdp_rep.gamma 93 | ) 94 | else: 95 | returns = get_returns_from_rewards_non_terminating( 96 | rew_arr, 97 | self.mdp_rep.gamma, 98 | self.nt_return_eval_steps 99 | ) 100 | for i, r in enumerate(returns): 101 | s, _, _, f = mc_path[i] 102 | if not self.first_visit or f: 103 | counts_dict[s] += 1 104 | c = counts_dict[s] 105 | vf_dict[s] = (vf_dict[s] * (c - 1) + r) / c 106 | episodes += 1 107 | 108 | return vf_dict 109 | 110 | def get_qv_func_dict(self, pol: Optional[Policy]) -> QFDictType: 111 | control = pol is None 112 | this_pol = pol if pol is not None else self.get_init_policy() 113 | sa_dict = self.mdp_rep.state_action_dict 114 | counts_dict = {s: {a: 0 for a in v} for s, v in sa_dict.items()} 115 | qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()} 116 | episodes = 0 117 | 118 | while episodes < self.num_episodes: 119 | if self.exploring_start: 120 | start_state, start_action = self.mdp_rep.init_state_action_gen() 121 | else: 122 | start_state = self.mdp_rep.init_state_gen() 123 | start_action = None 124 | mc_path = self.get_mc_path( 125 | this_pol, 126 | start_state, 127 | start_action 128 | ) 129 | rew_arr = np.array([x for _, _, x, _ in mc_path]) 130 | if mc_path[-1][0] in self.mdp_rep.terminal_states: 131 | returns = get_returns_from_rewards_terminating( 132 | rew_arr, 133 | self.mdp_rep.gamma 134 | ) 135 | else: 136 | returns = get_returns_from_rewards_non_terminating( 137 | rew_arr, 138 | self.mdp_rep.gamma, 139 | self.nt_return_eval_steps 140 | ) 141 | for i, r in enumerate(returns): 
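                # Incremental running-average update of Q(s, a) with return r
                # (restricted to first visits when first_visit is set)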
142 | s, a, _, f = mc_path[i] 143 | if not self.first_visit or f: 144 | counts_dict[s][a] += 1 145 | c = counts_dict[s][a] 146 | qf_dict[s][a] = (qf_dict[s][a] * (c - 1) + r) / c 147 | if control: 148 | this_pol = get_soft_policy_from_qf_dict( 149 | qf_dict, 150 | self.softmax, 151 | self.epsilon_func(episodes) 152 | ) 153 | episodes += 1 154 | 155 | return qf_dict 156 | 157 | 158 | if __name__ == '__main__': 159 | from processes.mdp_refined import MDPRefined 160 | mdp_refined_data = { 161 | 1: { 162 | 'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)}, 163 | 'b': {2: (0.3, -0.5), 3: (0.7, 2.6)}, 164 | 'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)} 165 | }, 166 | 2: { 167 | 'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)}, 168 | 'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)} 169 | }, 170 | 3: { 171 | 'a': {3: (1.0, 0.0)}, 172 | 'b': {3: (1.0, 0.0)} 173 | } 174 | } 175 | gamma_val = 1.0 176 | mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val) 177 | mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular() 178 | 179 | exploring_start_val = False 180 | first_visit_flag = True 181 | softmax_flag = False 182 | episodes_limit = 1000 183 | epsilon_val = 0.1 184 | epsilon_half_life_val = 100 185 | max_steps_val = 1000 186 | mc_obj = MonteCarlo( 187 | mdp_rep_obj, 188 | exploring_start_val, 189 | first_visit_flag, 190 | softmax_flag, 191 | epsilon_val, 192 | epsilon_half_life_val, 193 | episodes_limit, 194 | max_steps_val 195 | ) 196 | 197 | policy_data = { 198 | 1: {'a': 0.4, 'b': 0.6}, 199 | 2: {'a': 0.7, 'c': 0.3}, 200 | 3: {'b': 1.0} 201 | } 202 | pol_obj = Policy(policy_data) 203 | 204 | this_mc_path = mc_obj.get_mc_path(pol_obj, 1) 205 | print(this_mc_path) 206 | 207 | this_qf_dict = mc_obj.get_act_value_func_dict(pol_obj) 208 | print(this_qf_dict) 209 | this_vf_dict = mc_obj.get_value_func_dict(pol_obj) 210 | print(this_vf_dict) 211 | 212 | opt_pol = mc_obj.get_optimal_det_policy() 213 | print(opt_pol) 214 | opt_vf_dict = mc_obj.get_value_func_dict(opt_pol) 215 | print(opt_vf_dict) 216 | -------------------------------------------------------------------------------- /src/algorithms/rl_func_approx/td0.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Optional 2 | from algorithms.td_algo_enum import TDAlgorithm 3 | from algorithms.rl_func_approx.rl_func_approx_base import RLFuncApproxBase 4 | from algorithms.func_approx_spec import FuncApproxSpec 5 | from processes.mdp_rep_for_rl_fa import MDPRepForRLFA 6 | from processes.mp_funcs import get_rv_gen_func_single 7 | from algorithms.helper_funcs import get_soft_policy_func_from_qf 8 | from processes.mp_funcs import get_expected_action_value 9 | from utils.generic_typevars import S, A 10 | from utils.standard_typevars import VFType, QFType, PolicyActDictType 11 | 12 | 13 | class TD0(RLFuncApproxBase): 14 | 15 | def __init__( 16 | self, 17 | mdp_rep_for_rl: MDPRepForRLFA, 18 | exploring_start: bool, 19 | algorithm: TDAlgorithm, 20 | softmax: bool, 21 | epsilon: float, 22 | epsilon_half_life: float, 23 | num_episodes: int, 24 | max_steps: int, 25 | fa_spec: FuncApproxSpec 26 | ) -> None: 27 | 28 | super().__init__( 29 | mdp_rep_for_rl=mdp_rep_for_rl, 30 | exploring_start=exploring_start, 31 | softmax=softmax, 32 | epsilon=epsilon, 33 | epsilon_half_life=epsilon_half_life, 34 | num_episodes=num_episodes, 35 | max_steps=max_steps, 36 | fa_spec=fa_spec 37 | ) 38 | self.algorithm: TDAlgorithm = algorithm 39 | 40 | def get_value_func_fa(self, polf: PolicyActDictType) -> 
VFType: 41 | episodes = 0 42 | 43 | while episodes < self.num_episodes: 44 | state = self.mdp_rep.init_state_gen() 45 | steps = 0 46 | terminate = False 47 | 48 | while not terminate: 49 | action = get_rv_gen_func_single(polf(state))() 50 | next_state, reward = \ 51 | self.mdp_rep.state_reward_gen_func(state, action) 52 | target = reward + self.mdp_rep.gamma *\ 53 | self.vf_fa.get_func_eval(next_state) 54 | self.vf_fa.update_params([state], [target]) 55 | steps += 1 56 | terminate = steps >= self.max_steps or \ 57 | self.mdp_rep.terminal_state_func(state) 58 | state = next_state 59 | 60 | episodes += 1 61 | 62 | return self.vf_fa.get_func_eval 63 | 64 | # noinspection PyShadowingNames 65 | def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType: 66 | control = polf is None 67 | this_polf = polf if polf is not None else self.get_init_policy_func() 68 | episodes = 0 69 | 70 | while episodes < self.num_episodes: 71 | if self.exploring_start: 72 | state, action = self.mdp_rep.init_state_action_gen() 73 | else: 74 | state = self.mdp_rep.init_state_gen() 75 | action = get_rv_gen_func_single(this_polf(state))() 76 | 77 | # print((episodes, max(self.qvf_fa.get_func_eval((state, a)) for a in 78 | # self.mdp_rep.state_action_func(state)))) 79 | # print(self.qvf_fa.params) 80 | 81 | steps = 0 82 | terminate = False 83 | 84 | while not terminate: 85 | next_state, reward = \ 86 | self.mdp_rep.state_reward_gen_func(state, action) 87 | next_action = get_rv_gen_func_single(this_polf(next_state))() 88 | if self.algorithm == TDAlgorithm.QLearning and control: 89 | next_qv = max(self.qvf_fa.get_func_eval((next_state, a)) for a in 90 | self.state_action_func(next_state)) 91 | elif self.algorithm == TDAlgorithm.ExpectedSARSA and control: 92 | # next_qv = sum(this_polf(next_state).get(a, 0.) 
* 93 | # self.qvf_fa.get_func_eval((next_state, a)) 94 | # for a in self.state_action_func(next_state)) 95 | next_qv = get_expected_action_value( 96 | {a: self.qvf_fa.get_func_eval((next_state, a)) for a in 97 | self.state_action_func(next_state)}, 98 | self.softmax, 99 | self.epsilon_func(episodes) 100 | ) 101 | else: 102 | next_qv = self.qvf_fa.get_func_eval((next_state, next_action)) 103 | 104 | target = reward + self.mdp_rep.gamma * next_qv 105 | # TD is online update and so, policy improves at every time step 106 | self.qvf_fa.update_params([(state, action)], [target]) 107 | if control: 108 | this_polf = get_soft_policy_func_from_qf( 109 | self.qvf_fa.get_func_eval, 110 | self.state_action_func, 111 | self.softmax, 112 | self.epsilon_func(episodes) 113 | ) 114 | steps += 1 115 | terminate = steps >= self.max_steps or \ 116 | self.mdp_rep.terminal_state_func(state) 117 | state = next_state 118 | action = next_action 119 | 120 | episodes += 1 121 | 122 | return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval((st, act)) 123 | 124 | 125 | if __name__ == '__main__': 126 | from processes.mdp_refined import MDPRefined 127 | mdp_refined_data = { 128 | 1: { 129 | 'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)}, 130 | 'b': {2: (0.3, -0.5), 3: (0.7, 2.6)}, 131 | 'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)} 132 | }, 133 | 2: { 134 | 'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)}, 135 | 'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)} 136 | }, 137 | 3: { 138 | 'a': {3: (1.0, 0.0)}, 139 | 'b': {3: (1.0, 0.0)} 140 | } 141 | } 142 | gamma_val = 1.0 143 | mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val) 144 | mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular() 145 | 146 | exploring_start_val = False 147 | algorithm_type = TDAlgorithm.ExpectedSARSA 148 | softmax_flag = False 149 | epsilon_val = 0.1 150 | epsilon_half_life_val = 1000 151 | learning_rate_val = 0.1 152 | episodes_limit = 10000 153 | max_steps_val = 1000 154 | state_ff = [lambda s: float(s)] 155 | sa_ff = [ 156 | lambda x: float(x[0]), 157 | lambda x: 1. if x[1] == 'a' else 0., 158 | lambda x: 1. if x[1] == 'b' else 0., 159 | lambda x: 1. 
if x[1] == 'c' else 0., 160 | ] 161 | fa_spec_val = FuncApproxSpec( 162 | state_feature_funcs=state_ff, 163 | sa_feature_funcs=sa_ff, 164 | dnn_spec=None, 165 | learning_rate=learning_rate_val 166 | ) 167 | sarsa_obj = TD0( 168 | mdp_rep_obj, 169 | exploring_start_val, 170 | algorithm_type, 171 | softmax_flag, 172 | epsilon_val, 173 | epsilon_half_life_val, 174 | episodes_limit, 175 | max_steps_val, 176 | fa_spec_val 177 | ) 178 | 179 | def policy_func(i: int) -> Mapping[str, float]: 180 | if i == 1: 181 | ret = {'a': 0.4, 'b': 0.6} 182 | elif i == 2: 183 | ret = {'a': 0.7, 'c': 0.3} 184 | elif i == 3: 185 | ret = {'b': 1.0} 186 | else: 187 | raise ValueError 188 | return ret 189 | 190 | this_qf = sarsa_obj.get_qv_func_fa(policy_func) 191 | this_vf = sarsa_obj.get_value_func_fa(policy_func) 192 | print(this_vf(1)) 193 | print(this_vf(2)) 194 | print(this_vf(3)) 195 | 196 | opt_det_polf = sarsa_obj.get_optimal_det_policy_func() 197 | 198 | # noinspection PyShadowingNames 199 | def opt_polf(s: S, opt_det_polf=opt_det_polf) -> Mapping[A, float]: 200 | return {opt_det_polf(s): 1.0} 201 | 202 | opt_vf = sarsa_obj.get_value_func_fa(opt_polf) 203 | print(opt_polf(1)) 204 | print(opt_polf(2)) 205 | print(opt_polf(3)) 206 | print(opt_vf(1)) 207 | print(opt_vf(2)) 208 | print(opt_vf(3)) 209 | -------------------------------------------------------------------------------- /src/algorithms/rl_tabular/tdlambda.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from algorithms.td_algo_enum import TDAlgorithm 3 | from algorithms.rl_tabular.rl_tabular_base import RLTabularBase 4 | from processes.policy import Policy 5 | from processes.mp_funcs import get_rv_gen_func_single 6 | from processes.mdp_rep_for_rl_tabular import MDPRepForRLTabular 7 | from processes.mp_funcs import get_expected_action_value 8 | from utils.standard_typevars import VFDictType, QFDictType 9 | 10 | 11 | class TDLambda(RLTabularBase): 12 | 13 | def __init__( 14 | self, 15 | mdp_rep_for_rl: MDPRepForRLTabular, 16 | exploring_start: bool, 17 | algorithm: TDAlgorithm, 18 | softmax: bool, 19 | epsilon: float, 20 | epsilon_half_life: float, 21 | learning_rate: float, 22 | learning_rate_decay: float, 23 | lambd: float, 24 | num_episodes: int, 25 | max_steps: int 26 | ) -> None: 27 | 28 | super().__init__( 29 | mdp_rep_for_rl=mdp_rep_for_rl, 30 | exploring_start=exploring_start, 31 | softmax=softmax, 32 | epsilon=epsilon, 33 | epsilon_half_life=epsilon_half_life, 34 | num_episodes=num_episodes, 35 | max_steps=max_steps 36 | ) 37 | self.algorithm: TDAlgorithm = algorithm 38 | self.learning_rate: float = learning_rate 39 | self.learning_rate_decay: float = learning_rate_decay 40 | self.gamma_lambda = self.mdp_rep.gamma * lambd 41 | 42 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 43 | sa_dict = self.mdp_rep.state_action_dict 44 | vf_dict = {s: 0. for s in sa_dict.keys()} 45 | act_gen_dict = {s: get_rv_gen_func_single(pol.get_state_probabilities(s)) 46 | for s in sa_dict.keys()} 47 | episodes = 0 48 | updates = 0 49 | 50 | while episodes < self.num_episodes: 51 | et_dict = {s: 0. 
for s in sa_dict.keys()} 52 | state = self.mdp_rep.init_state_gen() 53 | steps = 0 54 | terminate = False 55 | 56 | while not terminate: 57 | action = act_gen_dict[state]() 58 | next_state, reward =\ 59 | self.mdp_rep.state_reward_gen_dict[state][action]() 60 | delta = reward + self.mdp_rep.gamma * vf_dict[next_state] -\ 61 | vf_dict[state] 62 | et_dict[state] += 1 63 | alpha = self.learning_rate * (updates / self.learning_rate_decay 64 | + 1) ** -0.5 65 | for s in sa_dict.keys(): 66 | vf_dict[s] += alpha * delta * et_dict[s] 67 | et_dict[s] *= self.gamma_lambda 68 | updates += 1 69 | steps += 1 70 | terminate = steps >= self.max_steps or\ 71 | state in self.mdp_rep.terminal_states 72 | state = next_state 73 | 74 | episodes += 1 75 | 76 | return vf_dict 77 | 78 | def get_qv_func_dict(self, pol: Optional[Policy]) -> QFDictType: 79 | control = pol is None 80 | this_pol = pol if pol is not None else self.get_init_policy() 81 | sa_dict = self.mdp_rep.state_action_dict 82 | qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()} 83 | episodes = 0 84 | updates = 0 85 | 86 | while episodes < self.num_episodes: 87 | et_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()} 88 | if self.exploring_start: 89 | state, action = self.mdp_rep.init_state_action_gen() 90 | else: 91 | state = self.mdp_rep.init_state_gen() 92 | action = get_rv_gen_func_single( 93 | this_pol.get_state_probabilities(state) 94 | )() 95 | steps = 0 96 | terminate = False 97 | 98 | while not terminate: 99 | next_state, reward = \ 100 | self.mdp_rep.state_reward_gen_dict[state][action]() 101 | next_action = get_rv_gen_func_single( 102 | this_pol.get_state_probabilities(next_state) 103 | )() 104 | if self.algorithm == TDAlgorithm.QLearning and control: 105 | next_qv = max(qf_dict[next_state][a] for a in 106 | qf_dict[next_state]) 107 | elif self.algorithm == TDAlgorithm.ExpectedSARSA and control: 108 | # next_qv = sum(this_pol.get_state_action_probability( 109 | # next_state, 110 | # a 111 | # ) * qf_dict[next_state][a] for a in qf_dict[next_state]) 112 | next_qv = get_expected_action_value( 113 | qf_dict[next_state], 114 | self.softmax, 115 | self.epsilon_func(episodes) 116 | ) 117 | else: 118 | next_qv = qf_dict[next_state][next_action] 119 | 120 | delta = reward + self.mdp_rep.gamma * next_qv -\ 121 | qf_dict[state][action] 122 | et_dict[state][action] += 1 123 | alpha = self.learning_rate * (updates / self.learning_rate_decay 124 | + 1) ** -0.5 125 | for s, a_set in sa_dict.items(): 126 | for a in a_set: 127 | qf_dict[s][a] += alpha * delta * et_dict[s][a] 128 | et_dict[s][a] *= self.gamma_lambda 129 | updates += 1 130 | if control: 131 | if self.softmax: 132 | this_pol.edit_state_action_to_softmax( 133 | state, 134 | qf_dict[state] 135 | ) 136 | else: 137 | this_pol.edit_state_action_to_epsilon_greedy( 138 | state, 139 | qf_dict[state], 140 | self.epsilon_func(episodes) 141 | ) 142 | steps += 1 143 | terminate = steps >= self.max_steps or \ 144 | state in self.mdp_rep.terminal_states 145 | state = next_state 146 | action = next_action 147 | 148 | episodes += 1 149 | 150 | return qf_dict 151 | 152 | 153 | if __name__ == '__main__': 154 | from processes.mdp_refined import MDPRefined 155 | mdp_refined_data = { 156 | 1: { 157 | 'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)}, 158 | 'b': {2: (0.3, -0.5), 3: (0.7, 2.6)}, 159 | 'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)} 160 | }, 161 | 2: { 162 | 'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)}, 163 | 'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)} 164 | 
}, 165 | 3: { 166 | 'a': {3: (1.0, 0.0)}, 167 | 'b': {3: (1.0, 0.0)} 168 | } 169 | } 170 | gamma_val = 0.9 171 | mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val) 172 | mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular() 173 | 174 | exploring_start_val = False 175 | algorithm_type = TDAlgorithm.ExpectedSARSA 176 | softmax_flag = True 177 | epsilon_val = 0.1 178 | epsilon_half_life_val = 100 179 | learning_rate_val = 0.1 180 | learning_rate_decay_val = 1e6 181 | lambda_val = 0.2 182 | episodes_limit = 1000 183 | max_steps_val = 1000 184 | esl_obj = TDLambda( 185 | mdp_rep_obj, 186 | exploring_start_val, 187 | algorithm_type, 188 | softmax_flag, 189 | epsilon_val, 190 | epsilon_half_life_val, 191 | learning_rate_val, 192 | learning_rate_decay_val, 193 | lambda_val, 194 | episodes_limit, 195 | max_steps_val 196 | ) 197 | 198 | policy_data = { 199 | 1: {'a': 0.4, 'b': 0.6}, 200 | 2: {'a': 0.7, 'c': 0.3}, 201 | 3: {'b': 1.0} 202 | } 203 | pol_obj = Policy(policy_data) 204 | 205 | this_qf_dict = esl_obj.get_act_value_func_dict(pol_obj) 206 | print(this_qf_dict) 207 | this_vf_dict = esl_obj.get_value_func_dict(pol_obj) 208 | print(this_vf_dict) 209 | 210 | opt_pol = esl_obj.get_optimal_det_policy() 211 | print(opt_pol) 212 | opt_vf_dict = esl_obj.get_value_func_dict(opt_pol) 213 | print(opt_vf_dict) 214 | -------------------------------------------------------------------------------- /src/examples/inv_control.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, NamedTuple, Set, Mapping, Sequence 2 | from itertools import chain, product, groupby 3 | import numpy as np 4 | from numpy.core.multiarray import ndarray 5 | from scipy.stats import poisson 6 | from processes.mdp_refined import MDPRefined 7 | from func_approx.dnn_spec import DNNSpec 8 | from func_approx.func_approx_base import FuncApproxBase 9 | from algorithms.func_approx_spec import FuncApproxSpec 10 | from copy import deepcopy 11 | from operator import itemgetter 12 | from processes.det_policy import DetPolicy 13 | from examples.run_all_algorithms import RunAllAlgorithms 14 | 15 | StateType = Tuple[int, ...] 16 | 17 | 18 | class InvControl(NamedTuple): 19 | demand_lambda: float 20 | lead_time: int 21 | stockout_cost: float 22 | fixed_order_cost: float 23 | epoch_disc_factor: float 24 | order_limit: int 25 | space_limit: int 26 | throwout_cost: float 27 | stockout_limit: int 28 | stockout_limit_excess_cost: float 29 | 30 | def validate_spec(self) -> bool: 31 | b1 = self.demand_lambda > 0. 32 | b2 = self.lead_time >= 0 33 | b3 = self.stockout_cost > 1. 34 | b4 = self.fixed_order_cost >= 0. 35 | b5 = 0. <= self.epoch_disc_factor <= 1. 36 | b6 = self.order_limit > 0 37 | b7 = self.space_limit > 0 38 | b8 = self.throwout_cost > 1. 39 | b9 = self.stockout_limit > 0. 40 | b10 = self.stockout_limit_excess_cost > 0. 
41 | return all([b1, b2, b3, b4, b5, b6, b7, b8, b9, b10]) 42 | 43 | def get_all_states(self) -> Set[StateType]: 44 | on_hand_range = range(-self.stockout_limit, self.space_limit + 1) 45 | on_order_range = range(self.order_limit + 1) 46 | return set(product( 47 | *chain([on_hand_range], [on_order_range] * self.lead_time) 48 | )) 49 | 50 | # Order of operations in an epoch are: 51 | # 1) Order Placement (Action) 52 | # 2) Receipt 53 | # 3) Throwout Space-Limited-Excess Inventory 54 | # 4) Demand 55 | # 5) Adjust (Negative) Inventory to not fall below stockout limit 56 | 57 | # In the following func, the input "state" is represented by 58 | # the on-hand and on-order right before an order is placed (the very 59 | # first event in the epoch) and the "state"s in the output are represented 60 | # by the on-hand and on-order just before the next order is placed (in the 61 | # next epoch). Both the input and output "state"s are arrays of length (L+1). 62 | 63 | def get_next_states_probs_rewards( 64 | self, 65 | state: StateType, 66 | action: int, 67 | demand_probs: Sequence[float] 68 | ) -> Mapping[StateType, Tuple[float, float]]: 69 | next_state_arr: ndarray = np.array(state) 70 | # The next line represents state change due to Action and Receipt 71 | next_state_arr = np.insert( 72 | np.zeros(len(next_state_arr) - 1), 73 | 0, 74 | next_state_arr[0] 75 | ) + np.append(next_state_arr[1:], action) 76 | excess = max(0, next_state_arr[0] - self.space_limit) 77 | cost = (self.fixed_order_cost if action > 0 else 0.) + \ 78 | excess * self.throwout_cost 79 | # The next line represents throwing out excess inventory 80 | next_state_arr[0] -= excess 81 | # The next line represents state change due to demand 82 | temp_list = [] 83 | for demand, prob in enumerate(demand_probs): 84 | ns = deepcopy(next_state_arr) 85 | ns[0] -= demand 86 | excess_stockout = max(0, -self.stockout_limit - ns[0]) 87 | this_cost = cost + excess_stockout * \ 88 | (self.stockout_cost + self.stockout_limit_excess_cost) 89 | # the next line represents adjustment of negative inventory 90 | # to not fall below stockout limit 91 | ns[0] += excess_stockout 92 | inv = ns[0] 93 | onhand = max(0., inv) 94 | stockout = max(0., -inv) 95 | this_cost += (onhand + self.stockout_cost * stockout) 96 | ns_tup = tuple(int(x) for x in ns) 97 | temp_list.append((ns_tup, prob, -this_cost)) 98 | 99 | ret = {} 100 | crit = itemgetter(0) 101 | for s, v in groupby(sorted(temp_list, key=crit), key=crit): 102 | tl = [(p, r) for _, p, r in v] 103 | sum_p = sum(p for p, _ in tl) 104 | avg_r = sum(p * r for p, r in tl) / sum_p if sum_p != 0. else 0. 
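# Merged entry for this next state: total transition probability, and the
# probability-weighted average reward across the grouped duplicate transitions.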
105 | ret[s] = (sum_p, avg_r) 106 | return ret 107 | 108 | def get_mdp_refined_dict(self) \ 109 | -> Mapping[StateType, 110 | Mapping[int, 111 | Mapping[StateType, 112 | Tuple[float, float]]]]: 113 | rv = poisson(mu=self.demand_lambda) 114 | raw_probs = [rv.pmf(i) for i in range(int(rv.ppf(0.9999)))] 115 | pp = [p / sum(raw_probs) for p in raw_probs] 116 | return {s: {a: self.get_next_states_probs_rewards(s, a, pp) 117 | for a in range(self.order_limit + 1)} 118 | for s in self.get_all_states()} 119 | 120 | def get_mdp_refined(self) -> MDPRefined: 121 | return MDPRefined(self.get_mdp_refined_dict(), self.epoch_disc_factor) 122 | 123 | def get_optimal_policy(self) -> DetPolicy: 124 | return self.get_mdp_refined().get_optimal_policy() 125 | 126 | def get_ips_orders_dict(self) -> Mapping[int, Sequence[int]]: 127 | sa_pairs = self.get_optimal_policy().get_state_to_action_map().items() 128 | 129 | def crit(x: Tuple[Tuple[int, ...], int]) -> int: 130 | return sum(x[0]) 131 | 132 | return {ip: [y for _, y in v] for ip, v in 133 | groupby(sorted(sa_pairs, key=crit), key=crit)} 134 | 135 | 136 | if __name__ == '__main__': 137 | 138 | ic = InvControl( 139 | demand_lambda=0.5, 140 | lead_time=1, 141 | stockout_cost=49., 142 | fixed_order_cost=0.0, 143 | epoch_disc_factor=0.98, 144 | order_limit=7, 145 | space_limit=8, 146 | throwout_cost=30., 147 | stockout_limit=5, 148 | stockout_limit_excess_cost=30. 149 | ) 150 | if not ic.validate_spec(): 151 | raise ValueError 152 | mdp_ref_obj = ic.get_mdp_refined() 153 | this_tolerance = 1e-3 154 | exploring_start = False 155 | this_first_visit_mc = True 156 | num_samples = 30 157 | this_softmax = True 158 | this_epsilon = 0.05 159 | this_epsilon_half_life = 30 160 | this_learning_rate = 0.1 161 | this_learning_rate_decay = 1e6 162 | this_lambd = 0.8 163 | this_num_episodes = 3000 164 | this_max_steps = 1000 165 | this_tdl_fa_offline = True 166 | state_ffs = FuncApproxBase.get_identity_feature_funcs(ic.lead_time + 1) 167 | sa_ffs = [(lambda x, f=f: f(x[0])) for f in state_ffs] + [lambda x: x[1]] 168 | this_fa_spec = FuncApproxSpec( 169 | state_feature_funcs=state_ffs, 170 | sa_feature_funcs=sa_ffs, 171 | dnn_spec=DNNSpec( 172 | neurons=[2, 4], 173 | hidden_activation=DNNSpec.relu, 174 | hidden_activation_deriv=DNNSpec.relu_deriv, 175 | output_activation=DNNSpec.identity, 176 | output_activation_deriv=DNNSpec.identity_deriv 177 | ) 178 | ) 179 | 180 | raa = RunAllAlgorithms( 181 | mdp_refined=mdp_ref_obj, 182 | tolerance=this_tolerance, 183 | exploring_start=exploring_start, 184 | first_visit_mc=this_first_visit_mc, 185 | num_samples=num_samples, 186 | softmax=this_softmax, 187 | epsilon=this_epsilon, 188 | epsilon_half_life=this_epsilon_half_life, 189 | learning_rate=this_learning_rate, 190 | learning_rate_decay=this_learning_rate_decay, 191 | lambd=this_lambd, 192 | num_episodes=this_num_episodes, 193 | max_steps=this_max_steps, 194 | tdl_fa_offline=this_tdl_fa_offline, 195 | fa_spec=this_fa_spec 196 | ) 197 | 198 | def criter(x: Tuple[Tuple[int, ...], int]) -> int: 199 | return sum(x[0]) 200 | 201 | for st, mo in raa.get_all_algorithms().items(): 202 | print("Starting %s" % st) 203 | opt_pol_func = mo.get_optimal_det_policy_func() 204 | opt_pol = {s: opt_pol_func(s) for s in mdp_ref_obj.all_states} 205 | print(sorted( 206 | [(ip, np.mean([float(y) for _, y in v])) for ip, v in 207 | groupby(sorted(opt_pol.items(), key=criter), key=criter)], 208 | key=itemgetter(0) 209 | )) 210 | -------------------------------------------------------------------------------- 
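The get_mdp_refined_dict method above truncates the Poisson demand distribution at its 99.99th percentile and renormalizes the probabilities before enumerating transitions. Below is a minimal standalone sketch of just that step, not part of the repository: demand_lambda is copied from the __main__ block above, and demand_probs is an illustrative name only.

from scipy.stats import poisson

# Truncate the Poisson pmf at the 99.99th percentile and renormalize,
# mirroring the first lines of InvControl.get_mdp_refined_dict.
demand_lambda = 0.5
rv = poisson(mu=demand_lambda)
raw_probs = [rv.pmf(i) for i in range(int(rv.ppf(0.9999)))]
demand_probs = [p / sum(raw_probs) for p in raw_probs]

# Renormalization makes the truncated pmf a proper distribution.
assert abs(sum(demand_probs) - 1.0) < 1e-9
print(demand_probs)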
/src/algorithms/mab/plot_mab_graphs.py: -------------------------------------------------------------------------------- 1 | from typing import NoReturn 2 | from operator import itemgetter 3 | from processes.mab_env import MabEnv 4 | from algorithms.mab.epsilon_greedy import EpsilonGreedy 5 | from algorithms.mab.ucb1 import UCB1 6 | from algorithms.mab.ts_gaussian import ThompsonSamplingGaussian 7 | from algorithms.mab.ts_bernoulli import ThompsonSamplingBernoulli 8 | from algorithms.mab.gradient_bandits import GradientBandits 9 | from numpy import arange 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | def plot_gaussian_algorithms() -> NoReturn: 14 | mean_vars_data = [ 15 | (0., 10.), 16 | (2., 20.), 17 | (4., 1.), 18 | (6., 8.), 19 | (8., 4.), 20 | (9., 6.), 21 | (10., 4.)] 22 | mu_star = max(mean_vars_data, key=itemgetter(0))[0] 23 | 24 | steps = 500 25 | episodes = 500 26 | 27 | eps = 0.3 28 | eps_hl = 400 29 | 30 | ci = 5 31 | mi = mu_star * 3. 32 | 33 | ts_mi = 0. 34 | ts_si = 10. 35 | 36 | lr = 0.1 37 | lr_decay = 20. 38 | 39 | me = MabEnv.get_gaussian_mab_env(mean_vars_data) 40 | 41 | greedy_opt_init = EpsilonGreedy( 42 | mab=me, 43 | time_steps=steps, 44 | num_episodes=episodes, 45 | epsilon=0., 46 | epsilon_half_life=1e8, 47 | count_init=ci, 48 | mean_init=mi 49 | ) 50 | eps_greedy = EpsilonGreedy( 51 | mab=me, 52 | time_steps=steps, 53 | num_episodes=episodes, 54 | epsilon=eps, 55 | epsilon_half_life=1e8, 56 | count_init=0, 57 | mean_init=0. 58 | ) 59 | decay_eps_greedy = EpsilonGreedy( 60 | mab=me, 61 | time_steps=steps, 62 | num_episodes=episodes, 63 | epsilon=eps, 64 | epsilon_half_life=eps_hl, 65 | count_init=0, 66 | mean_init=0. 67 | ) 68 | ts = ThompsonSamplingGaussian( 69 | mab=me, 70 | time_steps=steps, 71 | num_episodes=episodes, 72 | init_mean=ts_mi, 73 | init_stdev=ts_si 74 | ) 75 | grad_bandits = GradientBandits( 76 | mab=me, 77 | time_steps=steps, 78 | num_episodes=episodes, 79 | learning_rate=lr, 80 | learning_rate_decay=lr_decay 81 | ) 82 | 83 | plot_colors = ['r', 'b', 'g', 'k', 'y'] 84 | labels = [ 85 | 'Greedy, Optimistic Initialization', 86 | '$\epsilon$-Greedy', 87 | 'Decaying $\epsilon$-Greedy', 88 | 'Thompson Sampling', 89 | 'Gradient Bandits' 90 | ] 91 | 92 | exp_cum_regrets = [ 93 | greedy_opt_init.get_expected_cum_regret(mu_star), 94 | eps_greedy.get_expected_cum_regret(mu_star), 95 | decay_eps_greedy.get_expected_cum_regret(mu_star), 96 | ts.get_expected_cum_regret(mu_star), 97 | grad_bandits.get_expected_cum_regret(mu_star) 98 | ] 99 | 100 | x_vals = range(1, steps + 1) 101 | for i in range(len(exp_cum_regrets)): 102 | plt.plot(exp_cum_regrets[i], color=plot_colors[i], label=labels[i]) 103 | plt.xlabel("Time Steps", fontsize=20) 104 | plt.ylabel("Expected Cumulative Regret", fontsize=20) 105 | plt.title("Cumulative Regret Curves", fontsize=25) 106 | plt.xlim(xmin=x_vals[0], xmax=x_vals[-1]) 107 | plt.ylim(ymin=0.0) 108 | plt.grid(True) 109 | plt.legend(loc='upper left', fontsize=15) 110 | plt.show() 111 | 112 | exp_act_counts = [ 113 | greedy_opt_init.get_expected_action_counts(), 114 | eps_greedy.get_expected_action_counts(), 115 | decay_eps_greedy.get_expected_action_counts(), 116 | ts.get_expected_action_counts(), 117 | grad_bandits.get_expected_action_counts() 118 | ] 119 | index = arange(len(me.arms_sampling_funcs)) 120 | spacing = 0.4 121 | width = (1 - spacing) / len(exp_act_counts) 122 | 123 | for i in range(len(exp_act_counts)): 124 | plt.bar( 125 | index - (1 - spacing) / 2 + (i - 1.5) * width, 126 | exp_act_counts[i], 127 | width, 128 | 
color=plot_colors[i], 129 | label=labels[i] 130 | ) 131 | plt.xlabel("Arms", fontsize=20) 132 | plt.ylabel("Expected Counts of Arms", fontsize=20) 133 | plt.title("Arms Counts Plot", fontsize=25) 134 | plt.xticks( 135 | index - 0.3, 136 | ["$\mu$=%.1f,$\sigma$=%.1f" % (m, s) for m, s in mean_vars_data] 137 | ) 138 | plt.legend(loc='upper left', fontsize=15) 139 | plt.tight_layout() 140 | plt.show() 141 | 142 | 143 | def plot_bernoulli_algorithms() -> NoReturn: 144 | probs_data = [0.1, 0.2, 0.4, 0.5, 0.6, 0.75, 0.8, 0.85, 0.9] 145 | mu_star = max(probs_data) 146 | 147 | steps = 500 148 | episodes = 500 149 | 150 | eps = 0.3 151 | eps_hl = 400 152 | 153 | ci = 5 154 | mi = mu_star * 3. 155 | 156 | ucb_alpha = 4.0 157 | 158 | lr = 0.5 159 | lr_decay = 20. 160 | 161 | me = MabEnv.get_bernoulli_mab_env(probs_data) 162 | 163 | greedy_opt_init = EpsilonGreedy( 164 | mab=me, 165 | time_steps=steps, 166 | num_episodes=episodes, 167 | epsilon=0., 168 | epsilon_half_life=1e8, 169 | count_init=ci, 170 | mean_init=mi 171 | ) 172 | eps_greedy = EpsilonGreedy( 173 | mab=me, 174 | time_steps=steps, 175 | num_episodes=episodes, 176 | epsilon=eps, 177 | epsilon_half_life=1e8, 178 | count_init=0, 179 | mean_init=0. 180 | ) 181 | decay_eps_greedy = EpsilonGreedy( 182 | mab=me, 183 | time_steps=steps, 184 | num_episodes=episodes, 185 | epsilon=eps, 186 | epsilon_half_life=eps_hl, 187 | count_init=0, 188 | mean_init=0. 189 | ) 190 | ucb1 = UCB1( 191 | mab=me, 192 | time_steps=steps, 193 | num_episodes=episodes, 194 | bounds_range=1.0, 195 | alpha=ucb_alpha 196 | ) 197 | ts = ThompsonSamplingBernoulli( 198 | mab=me, 199 | time_steps=steps, 200 | num_episodes=episodes 201 | ) 202 | grad_bandits = GradientBandits( 203 | mab=me, 204 | time_steps=steps, 205 | num_episodes=episodes, 206 | learning_rate=lr, 207 | learning_rate_decay=lr_decay 208 | ) 209 | 210 | plot_colors = ['r', 'b', 'g', 'y', 'k', 'c'] 211 | labels = [ 212 | 'Greedy, Optimistic Initialization', 213 | '$\epsilon$-Greedy', 214 | 'Decaying $\epsilon$-Greedy', 215 | 'UCB1', 216 | 'Thompson Sampling', 217 | 'Gradient Bandits' 218 | ] 219 | 220 | exp_cum_regrets = [ 221 | greedy_opt_init.get_expected_cum_regret(mu_star), 222 | eps_greedy.get_expected_cum_regret(mu_star), 223 | decay_eps_greedy.get_expected_cum_regret(mu_star), 224 | ucb1.get_expected_cum_regret(mu_star), 225 | ts.get_expected_cum_regret(mu_star), 226 | grad_bandits.get_expected_cum_regret(mu_star) 227 | ] 228 | 229 | x_vals = range(1, steps + 1) 230 | for i in range(len(exp_cum_regrets)): 231 | plt.plot(exp_cum_regrets[i], color=plot_colors[i], label=labels[i]) 232 | plt.xlabel("Time Steps", fontsize=20) 233 | plt.ylabel("Expected Cumulative Regret", fontsize=20) 234 | plt.title("Cumulative Regret Curves", fontsize=25) 235 | plt.xlim(xmin=x_vals[0], xmax=x_vals[-1]) 236 | plt.ylim(ymin=0.0) 237 | plt.grid(True) 238 | plt.legend(loc='upper left', fontsize=15) 239 | plt.show() 240 | 241 | exp_act_counts = [ 242 | greedy_opt_init.get_expected_action_counts(), 243 | eps_greedy.get_expected_action_counts(), 244 | decay_eps_greedy.get_expected_action_counts(), 245 | ucb1.get_expected_action_counts(), 246 | ts.get_expected_action_counts(), 247 | grad_bandits.get_expected_action_counts() 248 | ] 249 | index = arange(len(me.arms_sampling_funcs)) 250 | spacing = 0.4 251 | width = (1 - spacing) / len(exp_act_counts) 252 | 253 | for i in range(len(exp_act_counts)): 254 | plt.bar( 255 | index - (1 - spacing) / 2 + (i - 1.5) * width, 256 | exp_act_counts[i], 257 | width, 258 | color=plot_colors[i], 
259 | label=labels[i] 260 | ) 261 | plt.xlabel("Arms", fontsize=20) 262 | plt.ylabel("Expected Counts of Arms", fontsize=20) 263 | plt.title("Arms Counts Plot", fontsize=25) 264 | plt.xticks( 265 | index - 0.2, 266 | ["$p$=%.2f" % p for p in probs_data] 267 | ) 268 | plt.legend(loc='upper left', fontsize=15) 269 | plt.tight_layout() 270 | plt.show() 271 | 272 | 273 | if __name__ == '__main__': 274 | # plot_gaussian_algorithms() 275 | plot_bernoulli_algorithms() 276 | -------------------------------------------------------------------------------- /src/algorithms/rl_func_approx/lspi.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Optional, Sequence, Callable, Tuple 2 | from algorithms.rl_func_approx.rl_func_approx_base import RLFuncApproxBase 3 | from algorithms.func_approx_spec import FuncApproxSpec 4 | import numpy as np 5 | from processes.mdp_rep_for_rl_fa import MDPRepForRLFA 6 | from processes.mp_funcs import get_rv_gen_func_single 7 | from algorithms.helper_funcs import get_soft_policy_func_from_qf 8 | from operator import itemgetter 9 | from utils.generic_typevars import S, A 10 | from utils.standard_typevars import VFType, QFType, PolicyActDictType 11 | 12 | 13 | class LSPI(RLFuncApproxBase): 14 | 15 | def __init__( 16 | self, 17 | mdp_rep_for_rl: MDPRepForRLFA, 18 | exploring_start: bool, 19 | softmax: bool, 20 | epsilon: float, 21 | epsilon_half_life: float, 22 | num_episodes: int, 23 | batch_size: int, 24 | max_steps: int, 25 | state_feature_funcs: Sequence[Callable[[S], float]], 26 | sa_feature_funcs: Sequence[Callable[[Tuple[S, A]], float]] 27 | ) -> None: 28 | 29 | super().__init__( 30 | mdp_rep_for_rl=mdp_rep_for_rl, 31 | exploring_start=exploring_start, 32 | softmax=softmax, 33 | epsilon=epsilon, 34 | epsilon_half_life=epsilon_half_life, 35 | num_episodes=num_episodes, 36 | max_steps=max_steps, 37 | fa_spec=FuncApproxSpec( 38 | state_feature_funcs=state_feature_funcs, 39 | sa_feature_funcs=sa_feature_funcs, 40 | dnn_spec=None, 41 | reglr_coeff=0., 42 | learning_rate=0., 43 | adam_params=(False, 0., 0.), 44 | add_unit_feature=True 45 | ) 46 | ) 47 | self.batch_size: int = batch_size 48 | 49 | def get_value_func_fa(self, polf: PolicyActDictType) -> VFType: 50 | ffs = self.vf_fa.feature_funcs 51 | features = len(ffs) 52 | a_mat = np.zeros((features, features)) 53 | b_vec = np.zeros(features) 54 | 55 | for _ in range(self.num_episodes): 56 | state = self.mdp_rep.init_state_gen() 57 | steps = 0 58 | terminate = False 59 | 60 | while not terminate: 61 | action = get_rv_gen_func_single(polf(state))() 62 | next_state, reward = \ 63 | self.mdp_rep.state_reward_gen_func(state, action) 64 | phi_s = np.array([f(state) for f in ffs]) 65 | phi_sp = np.array([f(next_state) for f in ffs]) 66 | a_mat += np.outer( 67 | phi_s, 68 | phi_s - self.mdp_rep.gamma * phi_sp 69 | ) 70 | b_vec += reward * phi_s 71 | steps += 1 72 | terminate = steps >= self.max_steps or \ 73 | self.mdp_rep.terminal_state_func(state) 74 | state = next_state 75 | 76 | self.vf_fa.params = [np.linalg.inv(a_mat).dot(b_vec)] 77 | 78 | return self.vf_fa.get_func_eval 79 | 80 | # noinspection PyShadowingNames 81 | def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType: 82 | ffs = self.qvf_fa.feature_funcs 83 | features = len(ffs) 84 | a_mat = np.zeros((features, features)) 85 | b_vec = np.zeros(features) 86 | control = polf is None 87 | this_polf = polf if polf is not None else self.get_init_policy_func() 88 | 89 | for episode in 
range(self.num_episodes): 90 | if self.exploring_start: 91 | state, action = self.mdp_rep.init_state_action_gen() 92 | else: 93 | state = self.mdp_rep.init_state_gen() 94 | action = get_rv_gen_func_single(this_polf(state))() 95 | 96 | # print((episodes, max(self.qvf_fa.get_func_eval((state, a)) for a in 97 | # self.mdp_rep.state_action_func(state)))) 98 | # print(self.qvf_fa.params) 99 | 100 | steps = 0 101 | terminate = False 102 | 103 | while not terminate: 104 | next_state, reward = \ 105 | self.mdp_rep.state_reward_gen_func(state, action) 106 | phi_s = np.array([f((state, action)) for f in ffs]) 107 | next_action = get_rv_gen_func_single(this_polf(next_state))() 108 | if control: 109 | next_act = max( 110 | [(a, self.qvf_fa.get_func_eval((next_state, a))) for a in 111 | self.state_action_func(next_state)], 112 | key=itemgetter(1) 113 | )[0] 114 | else: 115 | next_act = next_action 116 | phi_sp = np.array([f((next_state, next_act)) for f in ffs]) 117 | a_mat += np.outer( 118 | phi_s, 119 | phi_s - self.mdp_rep.gamma * phi_sp 120 | ) 121 | b_vec += reward * phi_s 122 | 123 | steps += 1 124 | terminate = steps >= self.max_steps or \ 125 | self.mdp_rep.terminal_state_func(state) 126 | state = next_state 127 | action = next_action 128 | 129 | if control and (episode + 1) % self.batch_size == 0: 130 | self.qvf_fa.params = [np.linalg.inv(a_mat).dot(b_vec)] 131 | # print(self.qvf_fa.params) 132 | this_polf = get_soft_policy_func_from_qf( 133 | self.qvf_fa.get_func_eval, 134 | self.state_action_func, 135 | self.softmax, 136 | self.epsilon_func(episode) 137 | ) 138 | a_mat = np.zeros((features, features)) 139 | b_vec = np.zeros(features) 140 | 141 | if not control: 142 | self.qvf_fa.params = [np.linalg.inv(a_mat).dot(b_vec)] 143 | 144 | return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval((st, act)) 145 | 146 | 147 | if __name__ == '__main__': 148 | from processes.mdp_refined import MDPRefined 149 | mdp_refined_data = { 150 | 1: { 151 | 'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)}, 152 | 'b': {2: (0.3, -0.5), 3: (0.7, 2.6)}, 153 | 'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)} 154 | }, 155 | 2: { 156 | 'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)}, 157 | 'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)} 158 | }, 159 | 3: { 160 | 'a': {3: (1.0, 0.0)}, 161 | 'b': {3: (1.0, 0.0)} 162 | } 163 | } 164 | gamma_val = 0.9 165 | mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val) 166 | mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular() 167 | 168 | exploring_start_val = False 169 | softmax_flag = False 170 | epsilon_val = 0.1 171 | epsilon_half_life_val = 10000 172 | num_episodes_val = 100000 173 | batch_size_val = 1000 174 | max_steps_val = 1000 175 | state_ff = [lambda s: float(s)] 176 | sa_ff = [ 177 | lambda x: float(x[0]), 178 | lambda x: 1. if x[1] == 'a' else 0., 179 | lambda x: 1. if x[1] == 'b' else 0., 180 | lambda x: 1. 
if x[1] == 'c' else 0., 181 | ] 182 | lspi_obj = LSPI( 183 | mdp_rep_obj, 184 | exploring_start_val, 185 | softmax_flag, 186 | epsilon_val, 187 | epsilon_half_life_val, 188 | num_episodes_val, 189 | batch_size_val, 190 | max_steps_val, 191 | state_ff, 192 | sa_ff 193 | ) 194 | 195 | def policy_func(i: int) -> Mapping[str, float]: 196 | if i == 1: 197 | ret = {'a': 0.4, 'b': 0.6} 198 | elif i == 2: 199 | ret = {'a': 0.7, 'c': 0.3} 200 | elif i == 3: 201 | ret = {'b': 1.0} 202 | else: 203 | raise ValueError 204 | return ret 205 | 206 | # this_qf = lspi_obj.get_qv_func_fa(policy_func) 207 | this_vf = lspi_obj.get_value_func_fa(policy_func) 208 | print(this_vf(1)) 209 | print(this_vf(2)) 210 | print(this_vf(3)) 211 | 212 | opt_det_polf = lspi_obj.get_optimal_det_policy_func() 213 | 214 | # noinspection PyShadowingNames 215 | def opt_polf(s: S, opt_det_polf=opt_det_polf) -> Mapping[A, float]: 216 | return {opt_det_polf(s): 1.0} 217 | 218 | opt_vf = lspi_obj.get_value_func_fa(opt_polf) 219 | print(opt_polf(1)) 220 | print(opt_polf(2)) 221 | print(opt_polf(3)) 222 | print(opt_vf(1)) 223 | print(opt_vf(2)) 224 | print(opt_vf(3)) 225 | --------------------------------------------------------------------------------
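LSPI above performs no gradient updates: each batch of sampled transitions is collapsed into the matrix a_mat (accumulating outer products of phi(s, a) with phi(s, a) - gamma * phi(s', a')) and the vector b_vec (accumulating reward * phi(s, a)), and the weights come from a single linear solve. The sketch below is illustrative only: it uses synthetic random feature vectors and rewards in place of real transitions, purely to exercise that accumulate-and-solve pattern. np.linalg.solve returns the same weights as the np.linalg.inv(a_mat).dot(b_vec) call in the class, with better numerical behavior.

import numpy as np

gamma = 0.9
num_features = 4
rng = np.random.default_rng(0)

a_mat = np.zeros((num_features, num_features))
b_vec = np.zeros(num_features)
for _ in range(1000):
    phi_s = rng.normal(size=num_features)   # stand-in for the (state, action) feature vector
    phi_sp = rng.normal(size=num_features)  # stand-in for the (next state, next action) feature vector
    reward = rng.normal()                   # stand-in for the sampled reward
    a_mat += np.outer(phi_s, phi_s - gamma * phi_sp)
    b_vec += reward * phi_s

weights = np.linalg.solve(a_mat, b_vec)  # same solution as inv(a_mat).dot(b_vec)
print(weights)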