├── src
│   ├── utils
│   │   ├── __init__.py
│   │   ├── generic_typevars.py
│   │   ├── beta_distribution.py
│   │   ├── gen_utils.py
│   │   └── standard_typevars.py
│   ├── algorithms
│   │   ├── __init__.py
│   │   ├── adp
│   │   │   └── __init__.py
│   │   ├── dp
│   │   │   ├── __init__.py
│   │   │   ├── dp_analytic.py
│   │   │   ├── dp_numeric.py
│   │   │   └── dp_base.py
│   │   ├── mab
│   │   │   ├── __init__.py
│   │   │   ├── mab_graphs_gen.py
│   │   │   ├── ts_bernoulli.py
│   │   │   ├── mab_base.py
│   │   │   ├── ucb1.py
│   │   │   ├── gradient_bandits.py
│   │   │   ├── epsilon_greedy.py
│   │   │   ├── ts_gaussian.py
│   │   │   └── plot_mab_graphs.py
│   │   ├── rl_tabular
│   │   │   ├── __init__.py
│   │   │   ├── rl_tabular_base.py
│   │   │   ├── td0.py
│   │   │   ├── monte_carlo.py
│   │   │   └── tdlambda.py
│   │   ├── rl_func_approx
│   │   │   ├── __init__.py
│   │   │   ├── rl_func_approx_base.py
│   │   │   ├── td0.py
│   │   │   └── lspi.py
│   │   ├── td_algo_enum.py
│   │   ├── opt_base.py
│   │   ├── tabular_base.py
│   │   ├── func_approx_spec.py
│   │   ├── backward_adp.py
│   │   ├── backward_dp.py
│   │   ├── helper_funcs.py
│   │   └── ams.py
│   ├── examples
│   │   ├── __init__.py
│   │   ├── port_opt
│   │   │   └── __init__.py
│   │   ├── american_pricing
│   │   │   ├── __init__.py
│   │   │   ├── bs_pricing.py
│   │   │   ├── num_utils.py
│   │   │   └── vanilla_american_test.py
│   │   ├── deriv_pricing_hedging
│   │   │   └── __init__.py
│   │   ├── exam_problems
│   │   │   ├── grid_maze.py
│   │   │   ├── price_control.py
│   │   │   ├── frog_lilypad.py
│   │   │   ├── wage_max.py
│   │   │   ├── W2021
│   │   │   │   └── career_optimization.py
│   │   │   ├── mrp_tdmc_outline.py
│   │   │   └── mrp_tdmc.py
│   │   ├── clearance_pricing.py
│   │   └── inv_control.py
│   ├── func_approx
│   │   ├── __init__.py
│   │   ├── dnn_spec.py
│   │   ├── eligibility_traces.py
│   │   ├── linear_approx.py
│   │   └── func_approx_base.py
│   └── processes
│       ├── __init__.py
│       ├── mdp_rep_for_rl_pg.py
│       ├── det_policy.py
│       ├── mdp_rep_for_adp.py
│       ├── mdp_rep_for_adp_pg.py
│       ├── mrp_refined.py
│       ├── mab_env.py
│       ├── policy.py
│       ├── mdp_rep_for_rl_tabular.py
│       ├── mdp_rep_for_rl_fa.py
│       ├── mp.py
│       ├── mrp.py
│       ├── mdp.py
│       └── mdp_refined.py
├── .gitignore
└── README.md

/src/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/algorithms/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/examples/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/func_approx/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/processes/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/algorithms/adp/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/algorithms/dp/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/algorithms/mab/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/examples/port_opt/__init__.py:
--------------------------------------------------------------------------------
1 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /src/algorithms/rl_tabular/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/algorithms/rl_func_approx/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/examples/american_pricing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/examples/deriv_pricing_hedging/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/utils/generic_typevars.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar 2 | 3 | S = TypeVar('S') 4 | A = TypeVar('A') 5 | -------------------------------------------------------------------------------- /src/algorithms/td_algo_enum.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | 3 | 4 | class TDAlgorithm(Enum): 5 | SARSA = auto() 6 | QLearning = auto() 7 | ExpectedSARSA = auto() 8 | -------------------------------------------------------------------------------- /src/processes/mdp_rep_for_rl_pg.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Generic, Tuple 2 | from utils.generic_typevars import S, A 3 | 4 | 5 | class MDPRepForRLPG(Generic[S, A]): 6 | 7 | def __init__( 8 | self, 9 | gamma: float, 10 | init_state_gen_func: Callable[[], S], 11 | state_reward_gen_func: Callable[[S, A], Tuple[S, float]], 12 | terminal_state_func: Callable[[S], bool], 13 | ) -> None: 14 | self.gamma: float = gamma 15 | self.init_state_gen_func: Callable[[], S] = init_state_gen_func 16 | self.state_reward_gen_func: Callable[[S, A], Tuple[S, float]] = \ 17 | state_reward_gen_func 18 | self.terminal_state_func = terminal_state_func 19 | 20 | 21 | if __name__ == '__main__': 22 | print(0) 23 | -------------------------------------------------------------------------------- /src/processes/det_policy.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping 2 | from processes.policy import Policy 3 | from utils.generic_typevars import S, A 4 | 5 | 6 | class DetPolicy(Policy): 7 | 8 | def __init__(self, det_policy_data: Mapping[S, A]) -> None: 9 | super().__init__({s: {a: 1.0} for s, a in det_policy_data.items()}) 10 | 11 | def get_action_for_state(self, state: S) -> A: 12 | return list(self.get_state_probabilities(state).keys())[0] 13 | 14 | def get_state_to_action_map(self) -> Mapping[S, A]: 15 | return {s: self.get_action_for_state(s) for s in self.policy_data} 16 | 17 | def __repr__(self) -> str: 18 | return self.get_state_to_action_map().__repr__() 19 | 20 | def __str__(self) -> str: 21 | return self.get_state_to_action_map().__str__() 22 | 23 | -------------------------------------------------------------------------------- 
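A quick note on DetPolicy above: it is a thin wrapper over Policy that assigns probability 1.0 to a single action per state. A minimal usage sketch (the states and actions below are made up for illustration, and it assumes verify_policy in processes/mp_funcs.py accepts these point-mass distributions):

from processes.det_policy import DetPolicy

det = DetPolicy({1: 'a', 2: 'b', 3: 'b'})
print(det.get_action_for_state(1))    # 'a'
print(det.get_state_to_action_map())  # {1: 'a', 2: 'b', 3: 'b'}
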
/src/processes/mdp_rep_for_adp.py: -------------------------------------------------------------------------------- 1 | from typing import Set, Callable, Sequence, Mapping, Generic 2 | from utils.generic_typevars import S, A 3 | 4 | 5 | class MDPRepForADP(Generic[S, A]): 6 | 7 | def __init__( 8 | self, 9 | state_action_func: Callable[[S], Set[A]], 10 | gamma: float, 11 | sample_states_gen_func: Callable[[int], Sequence[S]], 12 | reward_func: Callable[[S, A], float], 13 | transitions_func: Callable[[S, A], Mapping[S, float]] 14 | ) -> None: 15 | self.state_action_func: Callable[[S], Set[A]] = state_action_func 16 | self.gamma: float = gamma 17 | self.sample_states_gen_func: Callable[[int], Sequence[S]] = \ 18 | sample_states_gen_func 19 | self.reward_func: Callable[[S, A], float] = reward_func 20 | self.transitions_func: Callable[[S, A], Mapping[S, float]] = \ 21 | transitions_func 22 | 23 | 24 | if __name__ == '__main__': 25 | print(0) 26 | 27 | -------------------------------------------------------------------------------- /src/algorithms/opt_base.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | from abc import ABC, abstractmethod 3 | from utils.generic_typevars import S, A 4 | from utils.standard_typevars import VFType, QFType, PolicyType 5 | 6 | 7 | class OptBase(ABC): 8 | 9 | @abstractmethod 10 | def get_value_func(self, polf: PolicyType) -> VFType: 11 | pass 12 | 13 | @abstractmethod 14 | def get_act_value_func(self, polf: PolicyType) -> QFType: 15 | pass 16 | 17 | @abstractmethod 18 | def get_optimal_det_policy_func(self) -> Callable[[S], A]: 19 | pass 20 | 21 | # noinspection PyShadowingNames 22 | def get_optimal_value_func(self) -> VFType: 23 | pf = self.get_optimal_det_policy_func() 24 | return self.get_value_func( 25 | lambda s, pf=pf: lambda n, s=s, pf=pf: [pf(s)] * n 26 | ) 27 | 28 | # noinspection PyShadowingNames 29 | def get_optimal_act_value_func(self) -> QFType: 30 | pf = self.get_optimal_det_policy_func() 31 | return self.get_act_value_func( 32 | lambda s, pf=pf: lambda n, s=s, pf=pf: [pf(s)] * n 33 | ) 34 | 35 | -------------------------------------------------------------------------------- /src/processes/mdp_rep_for_adp_pg.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Sequence, Generic, Tuple 2 | from utils.generic_typevars import S, A 3 | 4 | 5 | class MDPRepForADPPG(Generic[S, A]): 6 | 7 | def __init__( 8 | self, 9 | gamma: float, 10 | init_states_gen_func: Callable[[int], Sequence[S]], 11 | state_reward_gen_func: Callable[[S, A, int], Sequence[Tuple[S, float]]], 12 | # reward_func: Callable[[S, A], float], 13 | # transitions_func: Callable[[S, A], Mapping[S, float]], 14 | terminal_state_func: Callable[[S], bool], 15 | ) -> None: 16 | self.gamma: float = gamma 17 | self.init_states_gen_func: Callable[[int], Sequence[S]] = \ 18 | init_states_gen_func 19 | self.state_reward_gen_func: Callable[[S, A, int], Sequence[Tuple[S, float]]] =\ 20 | state_reward_gen_func 21 | # self.reward_func: Callable[[S, A], float] = reward_func 22 | # self.transitions_func: Callable[[S, A], Mapping[S, float]] = \ 23 | # transitions_func 24 | self.terminal_state_func = terminal_state_func 25 | 26 | 27 | if __name__ == '__main__': 28 | print(0) 29 | -------------------------------------------------------------------------------- /src/processes/mrp_refined.py: -------------------------------------------------------------------------------- 1 | from 
typing import Mapping, Tuple 2 | from processes.mrp import MRP 3 | from utils.gen_utils import zip_dict_of_tuple 4 | import numpy as np 5 | from utils.generic_typevars import S 6 | from utils.standard_typevars import SSf, SSTff 7 | 8 | 9 | class MRPRefined(MRP): 10 | 11 | def __init__( 12 | self, 13 | info: SSTff, 14 | gamma: float 15 | ) -> None: 16 | d1, d2, d3 = MRPRefined.split_info(info) 17 | super().__init__({k: (v, d3[k]) for k, v in d1.items()}, gamma) 18 | self.rewards_refined: SSf = d2 19 | 20 | @staticmethod 21 | def split_info(info: SSTff) -> Tuple[SSf, SSf, Mapping[S, float]]: 22 | d = {k: zip_dict_of_tuple(v) for k, v in info.items()} 23 | d1, d2 = zip_dict_of_tuple(d) 24 | d3 = {k: sum(np.prod(x) for x in v.values()) for k, v in info.items()} 25 | return d1, d2, d3 26 | 27 | 28 | if __name__ == '__main__': 29 | data = { 30 | 1: {1: (0.3, 9.2), 2: (0.6, 3.4), 3: (0.1, -0.3)}, 31 | 2: {1: (0.4, 0.0), 2: (0.2, 8.9), 3: (0.4, 3.5)}, 32 | 3: {3: (1.0, 0.0)} 33 | } 34 | mrp_refined_obj = MRPRefined(data, 0.95) 35 | print(mrp_refined_obj.trans_matrix) 36 | print(mrp_refined_obj.rewards_vec) 37 | 38 | 39 | -------------------------------------------------------------------------------- /src/examples/exam_problems/grid_maze.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Mapping 2 | 3 | SPACE = 'SPACE' 4 | BLOCK = 'BLOCK' 5 | GOAL = 'GOAL' 6 | 7 | maze_grid = {(0, 0): SPACE, (0, 1): BLOCK, (0, 2): SPACE, (0, 3): SPACE, (0, 4): SPACE, 8 | (0, 5): SPACE, (0, 6): SPACE, (0, 7): SPACE, (1, 0): SPACE, (1, 1): BLOCK, 9 | (1, 2): BLOCK, (1, 3): SPACE, (1, 4): BLOCK, (1, 5): BLOCK, (1, 6): BLOCK, 10 | (1, 7): BLOCK, (2, 0): SPACE, (2, 1): BLOCK, (2, 2): SPACE, (2, 3): SPACE, 11 | (2, 4): SPACE, (2, 5): SPACE, (2, 6): BLOCK, (2, 7): SPACE, (3, 0): SPACE, 12 | (3, 1): SPACE, (3, 2): SPACE, (3, 3): BLOCK, (3, 4): BLOCK, (3, 5): SPACE, 13 | (3, 6): BLOCK, (3, 7): SPACE, (4, 0): SPACE, (4, 1): BLOCK, (4, 2): SPACE, 14 | (4, 3): BLOCK, (4, 4): SPACE, (4, 5): SPACE, (4, 6): SPACE, (4, 7): SPACE, 15 | (5, 0): BLOCK, (5, 1): BLOCK, (5, 2): SPACE, (5, 3): BLOCK, (5, 4): SPACE, 16 | (5, 5): BLOCK, (5, 6): SPACE, (5, 7): BLOCK, (6, 0): SPACE, (6, 1): BLOCK, 17 | (6, 2): BLOCK, (6, 3): BLOCK, (6, 4): SPACE, (6, 5): BLOCK, (6, 6): SPACE, 18 | (6, 7): SPACE, (7, 0): SPACE, (7, 1): SPACE, (7, 2): SPACE, (7, 3): SPACE, 19 | (7, 4): SPACE, (7, 5): BLOCK, (7, 6): BLOCK, (7, 7): GOAL} 20 | -------------------------------------------------------------------------------- /src/processes/mab_env.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Callable, Sequence, NamedTuple 2 | from numpy.random import normal, binomial, uniform 3 | 4 | 5 | class MabEnv(NamedTuple): 6 | 7 | arms_sampling_funcs: Sequence[Callable[[], float]] 8 | 9 | @staticmethod 10 | def get_gaussian_mab_env(means_vars: Sequence[Tuple[float, float]]) -> 'MabEnv': 11 | return MabEnv([lambda m=m, s=s: normal(m, s, 1)[0] for m, s in means_vars]) 12 | 13 | @staticmethod 14 | def get_bernoulli_mab_env(probs: Sequence[float]) -> 'MabEnv': 15 | return MabEnv([lambda p=p: float(binomial(1, p, 1)[0]) for p in probs]) 16 | 17 | @staticmethod 18 | def get_uniform_mab_env(bounds: Sequence[Tuple[float, float]]) -> 'MabEnv': 19 | return MabEnv([lambda c=c, d=d: uniform(c, d, 1)[0] for c, d in bounds]) 20 | 21 | @staticmethod 22 | def get_binomial_mab_env(params: Sequence[Tuple[int, float]]) -> 'MabEnv': 23 | return MabEnv([lambda 
n=n, p=p: float(binomial(n, p, 1)[0]) for n, p in params])
24 | 
25 | 
26 | if __name__ == '__main__':
27 |     import numpy as np  # np was not imported at the top of this file; needed only for the demo prints below
28 |     mean_vars_data = [(5., 2.), (10., 3.), (0., 4.)]
29 |     me = MabEnv.get_gaussian_mab_env(mean_vars_data)
30 |     asf = me.arms_sampling_funcs
31 |     res = [[asf[i]() for _ in range(10000)] for i in range(len(asf))]
32 |     for i in range(len(mean_vars_data)):
33 |         nums = res[i]
34 |         print("Mean = %.3f" % np.mean(nums))
35 |         print("Stdev = %.3f" % np.std(nums))
36 | 
--------------------------------------------------------------------------------
/src/examples/exam_problems/price_control.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from typing import Sequence, Tuple
3 | from scipy.stats import poisson
4 | import matplotlib.pyplot as plt
5 | from mpl_toolkits.mplot3d.axes3d import Axes3D
6 | from matplotlib import cm
7 | 
8 | T: int = 10  # time steps
9 | M: int = 20  # initial inventory
10 | # the following are (price, Poisson demand mean) pairs, i.e., the price-demand (elasticity) curve
11 | el: Sequence[Tuple[float, float]] = [
12 |     (10.0, 0.3), (9.0, 0.8), (8.0, 1.6),
13 |     (7.0, 2.7), (6.0, 4.1), (5.0, 7.2)
14 | ]
15 | 
16 | # v represents the Optimal Value Function (time, Inventory) -> E[Sum of Sales Revenue]
17 | # pi represents the Optimal Policy (time, Inventory) -> Price
18 | v: np.ndarray = np.zeros((T + 1, M + 1))
19 | pi: np.ndarray = np.zeros((T, M + 1))
20 | rvs: Sequence = [poisson(l) for _, l in el]
21 | 
22 | for t in range(T - 1, -1, -1):
23 |     for s in range(M + 1):
24 |         q_vals = [sum(rvs[i].pmf(d) * (d * p + v[t + 1, s - d])
25 |                       for d in range(s)) +
26 |                   (1. - rvs[i].cdf(s - 1)) * s * p
27 |                   for i, (p, _) in enumerate(el)]
28 |         v[t, s] = np.max(q_vals)
29 |         pi[t, s] = el[int(np.argmax(q_vals))][0]
30 | 
31 | print(pi)
32 | print(v)
33 | 
34 | 
35 | x, y = np.meshgrid(range(M + 1), range(T))
36 | fig = plt.figure()
37 | ax = fig.gca(projection='3d')
38 | surf = ax.plot_surface(x, y, pi, cmap=cm.coolwarm,
39 |                        linewidth=0, antialiased=False)
40 | fig.colorbar(surf, shrink=0.5, aspect=5)
41 | plt.show()
42 | 
43 | 
44 | 
--------------------------------------------------------------------------------
/src/processes/policy.py:
--------------------------------------------------------------------------------
1 | from typing import Mapping, Generic, Dict
2 | from processes.mp_funcs import verify_policy
3 | from processes.mp_funcs import get_epsilon_action_probs
4 | from processes.mp_funcs import get_softmax_action_probs
5 | from utils.generic_typevars import S, A
6 | 
7 | 
8 | class Policy(Generic[S, A]):
9 | 
10 |     def __init__(self, data: Dict[S, Mapping[A, float]]) -> None:
11 |         if verify_policy(data):
12 |             self.policy_data = data
13 |         else:
14 |             raise ValueError
15 | 
16 |     def get_state_probabilities(self, state: S) -> Mapping[A, float]:
17 |         return self.policy_data[state]
18 | 
19 |     def get_state_action_probability(self, state: S, action: A) -> float:
20 |         return self.get_state_probabilities(state).get(action, 0.)
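    # The two mutating methods below overwrite the stored distribution for a state.
    # They delegate to get_epsilon_action_probs / get_softmax_action_probs in
    # processes/mp_funcs.py (not shown in this listing); presumably the former
    # returns an epsilon-greedy distribution (1 - epsilon + epsilon/|A| on the
    # greedy action, epsilon/|A| elsewhere) and the latter a softmax (Boltzmann)
    # distribution over the given action values.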
21 | 22 | def edit_state_action_to_epsilon_greedy( 23 | self, 24 | state: S, 25 | action_value_dict: Mapping[A, float], 26 | epsilon: float 27 | ) -> None: 28 | self.policy_data[state] = get_epsilon_action_probs( 29 | action_value_dict, 30 | epsilon 31 | ) 32 | 33 | def edit_state_action_to_softmax( 34 | self, 35 | state: S, 36 | action_value_dict: Mapping[A, float] 37 | ) -> None: 38 | self.policy_data[state] = get_softmax_action_probs( 39 | action_value_dict 40 | ) 41 | 42 | def __repr__(self): 43 | return self.policy_data.__repr__() 44 | 45 | def __str__(self): 46 | return self.policy_data.__str__() 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/processes/mdp_rep_for_rl_tabular.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Callable, Tuple 2 | from processes.mp_funcs import get_rv_gen_func_single 3 | from processes.mdp_rep_for_rl_fa import MDPRepForRLFA 4 | from utils.generic_typevars import S, A 5 | 6 | Type1 = Mapping[S, Mapping[A, Callable[[], Tuple[S, float]]]] 7 | 8 | 9 | class MDPRepForRLTabular(MDPRepForRLFA): 10 | 11 | def __init__( 12 | self, 13 | state_action_dict: Mapping[S, Set[A]], 14 | terminal_states: Set[S], 15 | state_reward_gen_dict: Type1, 16 | gamma: float 17 | ) -> None: 18 | self.state_action_dict: Mapping[S, Set[A]] = state_action_dict 19 | self.terminal_states: Set[S] = terminal_states 20 | self.state_reward_gen_dict: Type1 = state_reward_gen_dict 21 | super().__init__( 22 | state_action_func=lambda x: self.state_action_dict[x], 23 | gamma=gamma, 24 | terminal_state_func=lambda x: x in self.terminal_states, 25 | state_reward_gen_func=lambda x, y: self.state_reward_gen_dict[x][y](), 26 | init_state_gen=get_rv_gen_func_single( 27 | {s: 1. / len(self.state_action_dict) for s 28 | in self.state_action_dict.keys()} 29 | ), 30 | init_state_action_gen=get_rv_gen_func_single( 31 | {(s, a): 1. / sum(len(v) for v 32 | in self.state_action_dict.values()) 33 | for s, v1 in self.state_action_dict.items() for a in v1} 34 | ) 35 | ) 36 | 37 | 38 | if __name__ == '__main__': 39 | print(0) 40 | -------------------------------------------------------------------------------- /src/processes/mdp_rep_for_rl_fa.py: -------------------------------------------------------------------------------- 1 | from typing import Set, Callable, Tuple, Optional, Generic 2 | from processes.mp_funcs import get_rv_gen_func_single 3 | from utils.generic_typevars import S, A 4 | 5 | 6 | class MDPRepForRLFA(Generic[S, A]): 7 | 8 | def __init__( 9 | self, 10 | state_action_func: Callable[[S], Set[A]], 11 | gamma: float, 12 | terminal_state_func: Callable[[S], bool], 13 | state_reward_gen_func: Callable[[S, A], Tuple[S, float]], 14 | init_state_gen: Callable[[], S], 15 | init_state_action_gen: Optional[Callable[[], Tuple[S, A]]] 16 | ) -> None: 17 | # noinspection PyShadowingNames 18 | def init_sa( 19 | init_state_gen=init_state_gen, 20 | state_action_func=state_action_func 21 | ) -> Tuple[S, A]: 22 | s = init_state_gen() 23 | actions = state_action_func(s) 24 | a = get_rv_gen_func_single({a: 1. 
/ len(actions) for a in actions})() 25 | return s, a 26 | 27 | self.state_action_func: Callable[[S], Set[A]] = state_action_func 28 | self.gamma: float = gamma 29 | self.terminal_state_func: Callable[[S], bool] = terminal_state_func 30 | self.state_reward_gen_func: Callable[[S, A], Tuple[S, float]] = \ 31 | state_reward_gen_func 32 | self.init_state_gen: Callable[[], S] = init_state_gen 33 | self.init_state_action_gen: Callable[[], Tuple[S, A]] =\ 34 | (init_state_action_gen if init_state_action_gen is not None 35 | else init_sa) 36 | 37 | 38 | if __name__ == '__main__': 39 | print(0) 40 | -------------------------------------------------------------------------------- /src/algorithms/mab/mab_graphs_gen.py: -------------------------------------------------------------------------------- 1 | from typing import NoReturn 2 | import numpy as np 3 | 4 | 5 | def graph_regret_curve() -> NoReturn: 6 | import matplotlib.pyplot as plt 7 | x_vals = range(1, 71) 8 | plt.plot(x_vals, [3*x for x in x_vals], "r", label="Greedy") 9 | plt.plot(x_vals, [2*x for x in x_vals], "b", label="$\epsilon$-Greedy") 10 | plt.plot(x_vals, [20 * np.log(x) for x in x_vals], "g", label="Decaying $\epsilon$-Greedy") 11 | plt.xlabel("Time Steps", fontsize=25) 12 | plt.ylabel("Total Regret", fontsize=25) 13 | plt.title("Total Regret Curves", fontsize=25) 14 | plt.xlim(xmin=x_vals[0], xmax=x_vals[-1]) 15 | plt.ylim(ymin=0.0) 16 | # plt.xticks(x_vals) 17 | plt.grid(True) 18 | plt.legend(loc='upper left', fontsize=15) 19 | plt.show() 20 | 21 | def get_pdf(x: float, mu: float, sigma: float) -> float: 22 | return np.exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)) / (np.sqrt(2 * np.pi) * sigma) 23 | 24 | def graph_qestimate_pdfs() -> NoReturn: 25 | import matplotlib.pyplot as plt 26 | x_vals = np.arange(-2., 6., 0.01) 27 | mu_b = 1.0 28 | sigma_b = 1.0 29 | mu_r = 2.0 30 | sigma_r = 0.8 31 | mu_g = 2.5 32 | sigma_g = 0.3 33 | plt.plot(x_vals, [get_pdf(x, mu_b, sigma_b) for x in x_vals], "b", label="$Q(a_1)$") 34 | plt.plot(x_vals, [get_pdf(x, mu_r, sigma_r) for x in x_vals], "r", label="$Q(a_2)$") 35 | plt.plot(x_vals, [get_pdf(x, mu_g, sigma_g) for x in x_vals], "g", label="$Q(a_3)$") 36 | plt.xlabel("Q", fontsize=25) 37 | plt.ylabel("Prob(Q)", fontsize=25) 38 | # plt.title("Total Regret Curves", fontsize=25) 39 | plt.xlim(xmin=x_vals[0], xmax=x_vals[-1]) 40 | plt.ylim(ymin=0.0) 41 | # plt.xticks(x_vals) 42 | plt.grid(True) 43 | plt.legend(loc='upper left', fontsize=15) 44 | plt.show() 45 | 46 | 47 | 48 | if __name__ == '__main__': 49 | # graph_regret_curve() 50 | graph_qestimate_pdfs() -------------------------------------------------------------------------------- /src/algorithms/mab/ts_bernoulli.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, List 2 | from processes.mab_env import MabEnv 3 | from operator import itemgetter 4 | from numpy import ndarray, empty 5 | from numpy.random import beta 6 | from algorithms.mab.mab_base import MABBase 7 | 8 | 9 | class ThompsonSamplingBernoulli(MABBase): 10 | 11 | def __init__( 12 | self, 13 | mab: MabEnv, 14 | time_steps: int, 15 | num_episodes: int 16 | ) -> None: 17 | super().__init__( 18 | mab=mab, 19 | time_steps=time_steps, 20 | num_episodes=num_episodes 21 | ) 22 | 23 | def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]: 24 | ep_rewards: ndarray = empty(self.time_steps) 25 | ep_actions: ndarray = empty(self.time_steps, dtype=int) 26 | bayes: List[Tuple[int, int]] = [(1, 1)] * self.num_arms 
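        # Thompson sampling with a conjugate Beta prior per arm: every arm starts
        # at the uniform prior Beta(1, 1); on each step we draw one sample from
        # each arm's Beta(a, b) posterior, play the arm with the largest draw, and
        # apply the conjugate update to that arm: a += reward, b += 1 - reward.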
27 | 28 | for i in range(self.time_steps): 29 | mean_draws: Sequence[float] = [beta(a, b, 1)[0] for a, b in bayes] 30 | action: int = max(enumerate(mean_draws), key=itemgetter(1))[0] 31 | reward: float = self.mab_funcs[action]() 32 | a, b = bayes[action] 33 | bayes[action] = (a + int(reward), b + int(1 - reward)) 34 | ep_rewards[i] = reward 35 | ep_actions[i] = action 36 | return ep_rewards, ep_actions 37 | 38 | 39 | if __name__ == '__main__': 40 | probs_data = [0.2, 0.4, 0.8, 0.5, 0.1, 0.9] 41 | mu_star = max(probs_data) 42 | steps = 200 43 | episodes = 1000 44 | 45 | me = MabEnv.get_bernoulli_mab_env(probs_data) 46 | ucb1 = ThompsonSamplingBernoulli( 47 | mab=me, 48 | time_steps=steps, 49 | num_episodes=episodes 50 | ) 51 | exp_cum_regret = ucb1.get_expected_cum_regret(mu_star) 52 | print(exp_cum_regret) 53 | 54 | exp_act_count = ucb1.get_expected_action_counts() 55 | print(exp_act_count) 56 | 57 | ucb1.plot_exp_cum_regret_curve(mu_star) 58 | -------------------------------------------------------------------------------- /src/utils/beta_distribution.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import Tuple 3 | from scipy.special import digamma 4 | 5 | 6 | class BetaDistribution: 7 | 8 | SMALL_POS = 1e-8 9 | """ 10 | Beta Distribution is normally defined by parameters alpha and beta 11 | with alpha, beta > 0. Here we define the beta distribution in terms 12 | of parameters mu (for mean of beta distribution and nu (= alpha + beta). 13 | 14 | So, mu = alpha / (alpha + beta) = alpha / nu 15 | alpha = mu * nu, beta = (1-mu) * nu 16 | 17 | p(x) = Gamma(alpa + beta) / (Gamma(alpha) * Gamma(beta)) * 18 | x^{alpha-1) * (1-x)^{beta-1) 19 | 20 | Score_mu(x) = d(log(p(x)))/d(mu) = Score_alpha * d(alpha)/d(mu) 21 | + Score_beta * d(beta)/d(mu) 22 | = (digamma(beta) - digamma(alpha) + log(x) - log(1-x)) * nu 23 | 24 | Score_nu(x) = d(log(p(x)))/d(nu) = Score_alpha * d(alpha)/d(nu) 25 | + Score_beta * d(beta)/d(nu) 26 | = (digamma(beta) - digamma(alpha) + log(x) - log(1-x)) * mu + 27 | digamma(nu) - digamma(beta) + log(1-x) 28 | """ 29 | 30 | def __init__(self, mu, nu) -> None: 31 | if 0 < mu < 1 and nu > 0: 32 | self.mu = mu 33 | self.nu = nu 34 | self.alpha = mu * nu 35 | self.beta = (1. - mu) * nu 36 | else: 37 | raise ValueError("mu = %.3f, nu = %.3f" % (mu, nu)) 38 | 39 | def get_samples(self, n: int) -> np.ndarray: 40 | sp = BetaDistribution.SMALL_POS 41 | return np.vectorize(lambda x: min(1. - sp, max(sp, x)))( 42 | np.random.beta(a=self.alpha, b=self.beta, size=n) 43 | ) 44 | 45 | def get_mu_nu_scores(self, x: float) -> Tuple[float, float]: 46 | diga = digamma(self.alpha) 47 | digb = digamma(self.beta) 48 | dign = digamma(self.nu) 49 | lx = np.log(x) 50 | l1x = np.log(1. 
- x) 51 | temp = digb - diga + lx - l1x 52 | r1 = temp * self.nu 53 | r2 = temp * self.mu + dign - digb + l1x 54 | return r1, r2 55 | 56 | 57 | -------------------------------------------------------------------------------- /src/algorithms/rl_tabular/rl_tabular_base.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Optional, Set, Callable 2 | from abc import abstractmethod 3 | from algorithms.tabular_base import TabularBase 4 | from processes.mdp_rep_for_rl_tabular import MDPRepForRLTabular 5 | from processes.policy import Policy 6 | from processes.det_policy import DetPolicy 7 | from algorithms.helper_funcs import get_vf_dict_from_qf_dict_and_policy 8 | from algorithms.helper_funcs import get_uniform_policy 9 | from algorithms.helper_funcs import get_det_policy_from_qf_dict 10 | from algorithms.helper_funcs import get_epsilon_decay_func 11 | from utils.generic_typevars import S, A 12 | from utils.standard_typevars import VFDictType, QFDictType 13 | 14 | 15 | class RLTabularBase(TabularBase): 16 | 17 | def __init__( 18 | self, 19 | mdp_rep_for_rl: MDPRepForRLTabular, 20 | exploring_start: bool, 21 | softmax: bool, 22 | epsilon: float, 23 | epsilon_half_life: float, 24 | num_episodes: int, 25 | max_steps: int 26 | ) -> None: 27 | 28 | self.mdp_rep: MDPRepForRLTabular = mdp_rep_for_rl 29 | self.exploring_start: bool = exploring_start 30 | self.softmax: bool = softmax 31 | self.epsilon_func: Callable[[int], float] = get_epsilon_decay_func( 32 | epsilon, 33 | epsilon_half_life 34 | ) 35 | self.num_episodes: int = num_episodes 36 | self.max_steps: int = max_steps 37 | 38 | def get_state_action_dict(self) -> Mapping[S, Set[A]]: 39 | return self.mdp_rep.state_action_dict 40 | 41 | def get_init_policy(self) -> Policy: 42 | return get_uniform_policy(self.mdp_rep.state_action_dict) 43 | 44 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 45 | return get_vf_dict_from_qf_dict_and_policy( 46 | self.get_qv_func_dict(pol), 47 | pol 48 | ) 49 | 50 | @abstractmethod 51 | def get_qv_func_dict(self, pol: Optional[Policy]) -> QFDictType: 52 | pass 53 | 54 | def get_act_value_func_dict(self, pol: Policy) -> QFDictType: 55 | return self.get_qv_func_dict(pol) 56 | 57 | def get_optimal_det_policy(self) -> DetPolicy: 58 | return get_det_policy_from_qf_dict(self.get_qv_func_dict(None)) 59 | -------------------------------------------------------------------------------- /src/algorithms/dp/dp_analytic.py: -------------------------------------------------------------------------------- 1 | from algorithms.dp.dp_base import DPBase 2 | from processes.policy import Policy 3 | from processes.mdp import MDP 4 | from processes.det_policy import DetPolicy 5 | from utils.standard_typevars import VFDictType 6 | 7 | 8 | class DPAnalytic(DPBase): 9 | 10 | def __init__(self, mdp_obj: MDP, tol: float) -> None: 11 | super().__init__(mdp_obj, tol) 12 | 13 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 14 | mrp_obj = self.mdp_obj.get_mrp(pol) 15 | value_func_vec = mrp_obj.get_value_func_vec() 16 | nt_vf = {mrp_obj.nt_states_list[i]: value_func_vec[i] 17 | for i in range(len(mrp_obj.nt_states_list))} 18 | t_vf = {s: 0. 
for s in self.mdp_obj.terminal_states} 19 | return {**nt_vf, **t_vf} 20 | 21 | def get_optimal_det_policy(self) -> DetPolicy: 22 | return self.get_optimal_policy_pi() 23 | 24 | 25 | if __name__ == '__main__': 26 | from processes.mdp import MDP 27 | policy_data = { 28 | 1: {'a': 0.4, 'b': 0.6}, 29 | 2: {'a': 0.7, 'c': 0.3}, 30 | 3: {'b': 1.0} 31 | } 32 | pol_obj = Policy(policy_data) 33 | mdp_data = { 34 | 1: { 35 | 'a': ({1: 0.2, 2: 0.6, 3: 0.2}, 7.0), 36 | 'b': ({1: 0.6, 2: 0.3, 3: 0.1}, -2.0), 37 | 'c': ({1: 0.1, 2: 0.2, 3: 0.7}, 10.0) 38 | }, 39 | 2: { 40 | 'a': ({1: 0.1, 2: 0.6, 3: 0.3}, 1.0), 41 | 'c': ({1: 0.6, 2: 0.2, 3: 0.2}, -1.2) 42 | }, 43 | 3: { 44 | 'b': ({3: 1.0}, 0.0) 45 | } 46 | } 47 | gamma_val = 0.9 48 | mdp1_obj = MDP(mdp_data, gamma_val) 49 | mrp1_obj = mdp1_obj.get_mrp(pol_obj) 50 | print(mrp1_obj.transitions) 51 | print(mrp1_obj.rewards) 52 | print(mrp1_obj.trans_matrix) 53 | print(mrp1_obj.rewards_vec) 54 | print(mrp1_obj.get_value_func_vec()) 55 | tol_val = 1e-4 56 | opn = DPAnalytic(mdp1_obj, tol_val) 57 | opt_policy_pi = opn.get_optimal_policy_pi() 58 | print(opt_policy_pi) 59 | opt_vf_dict_pi = opn.get_value_func_dict(opt_policy_pi) 60 | print(opt_vf_dict_pi) 61 | opt_policy_vi = opn.get_optimal_policy_vi() 62 | print(opt_policy_vi) 63 | opt_vf_dict_vi = opn.get_value_func_dict(opt_policy_vi) 64 | print(opt_vf_dict_vi) 65 | -------------------------------------------------------------------------------- /src/algorithms/tabular_base.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Callable 2 | from abc import abstractmethod 3 | from algorithms.opt_base import OptBase 4 | from processes.policy import Policy 5 | from processes.det_policy import DetPolicy 6 | from algorithms.helper_funcs import get_pdf_from_samples 7 | from utils.generic_typevars import S, A 8 | from utils.standard_typevars import VFDictType, QFDictType, PolicyType 9 | 10 | 11 | class TabularBase(OptBase): 12 | 13 | NUM_SAMPLES_PER_ACTION = 10 14 | 15 | @abstractmethod 16 | def get_init_policy(self) -> Policy: 17 | pass 18 | 19 | @abstractmethod 20 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 21 | pass 22 | 23 | @abstractmethod 24 | def get_act_value_func_dict(self, pol: Policy) -> QFDictType: 25 | pass 26 | 27 | @abstractmethod 28 | def get_optimal_det_policy(self) -> DetPolicy: 29 | pass 30 | 31 | @abstractmethod 32 | def get_state_action_dict(self) -> Mapping[S, Set[A]]: 33 | pass 34 | 35 | def get_value_func(self, polf: PolicyType) -> Callable[[S], float]: 36 | pol = Policy({s: get_pdf_from_samples( 37 | polf(s)(len(v) * TabularBase.NUM_SAMPLES_PER_ACTION) 38 | ) for s, v in self.get_state_action_dict().items()}) 39 | 40 | # noinspection PyShadowingNames 41 | def vf(state: S, pol=pol) -> float: 42 | return self.get_value_func_dict(pol)[state] 43 | 44 | return vf 45 | 46 | def get_act_value_func(self, polf: PolicyType)\ 47 | -> Callable[[S], Callable[[A], float]]: 48 | pol = Policy({s: get_pdf_from_samples( 49 | polf(s)(len(v) * TabularBase.NUM_SAMPLES_PER_ACTION) 50 | ) for s, v in self.get_state_action_dict().items()}) 51 | 52 | # noinspection PyShadowingNames 53 | def qvf(state: S, pol=pol) -> Callable[[A], float]: 54 | 55 | # noinspection PyShadowingNames 56 | def inner_f(action: A, pol=pol, state=state) -> float: 57 | return self.get_act_value_func_dict(pol)[state][action] 58 | 59 | return inner_f 60 | 61 | return qvf 62 | 63 | def get_optimal_det_policy_func(self) -> Callable[[S], A]: 64 | return lambda 
s: self.get_optimal_det_policy().get_action_for_state(s) 65 | 66 | -------------------------------------------------------------------------------- /src/processes/mp.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Generic, Sequence 2 | from graphviz import Digraph 3 | from processes.mp_funcs import get_all_states, verify_mp, get_lean_transitions 4 | import numpy as np 5 | from scipy.linalg import eig 6 | from utils.generic_typevars import S 7 | from utils.standard_typevars import SSf 8 | 9 | 10 | class MP(Generic[S]): 11 | 12 | def __init__( 13 | self, 14 | tr: SSf 15 | ) -> None: 16 | if verify_mp(tr): 17 | self.all_states_list: Sequence[S] = list(get_all_states(tr)) 18 | self.transitions: SSf = {s: get_lean_transitions(v) 19 | for s, v in tr.items()} 20 | else: 21 | raise ValueError 22 | 23 | def get_sink_states(self) -> Set[S]: 24 | return {k for k, v in self.transitions.items() 25 | if len(v) == 1 and k in v.keys()} 26 | 27 | def generate_image(self): 28 | d = Digraph() 29 | for s in self.all_states_list: 30 | d.node(str(s)) 31 | for s, v in self.transitions.items(): 32 | for s1, p in v.items(): 33 | d.edge(str(s), str(s1), label=str(p)) 34 | d.view() 35 | 36 | def get_stationary_distribution(self) -> Mapping[S, float]: 37 | sz = len(self.all_states_list) 38 | mat = np.zeros((sz, sz)) 39 | for i, s1 in enumerate(self.all_states_list): 40 | for j, s2 in enumerate(self.all_states_list): 41 | mat[i, j] = self.transitions[s1].get(s2, 0.) 42 | 43 | eig_vals, eig_vecs = eig(mat.T) 44 | stat = np.array( 45 | eig_vecs[:, np.where(np.abs(eig_vals - 1.) < 1e-8)[0][0]].flat 46 | ).astype(float) 47 | norm_stat = stat / sum(stat) 48 | return {s: norm_stat[i] for i, s in enumerate(self.all_states_list)} 49 | 50 | 51 | if __name__ == '__main__': 52 | transitions = { 53 | 1: {1: 0.1, 2: 0.6, 3: 0.1, 4: 0.2}, 54 | 2: {1: 0.25, 2: 0.22, 3: 0.24, 4: 0.29}, 55 | 3: {1: 0.7, 2: 0.3}, 56 | 4: {1: 0.3, 2: 0.5, 3: 0.2} 57 | } 58 | mp_obj = MP(transitions) 59 | print(mp_obj.transitions) 60 | print(mp_obj.all_states_list) 61 | print(mp_obj.get_sink_states()) 62 | stationary = mp_obj.get_stationary_distribution() 63 | print(stationary) 64 | mp_obj.generate_image() 65 | -------------------------------------------------------------------------------- /src/func_approx/dnn_spec.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Sequence, NamedTuple 2 | import numpy as np 3 | 4 | 5 | class DNNSpec(NamedTuple): 6 | 7 | neurons: Sequence[int] 8 | hidden_activation: Callable[[np.ndarray], np.ndarray] 9 | hidden_activation_deriv: Callable[[np.ndarray], np.ndarray] 10 | output_activation: Callable[[np.ndarray], np.ndarray] 11 | output_activation_deriv: Callable[[np.ndarray], np.ndarray] 12 | 13 | SMALL_POS = 1e-8 14 | 15 | @staticmethod 16 | def fexp(arg: np.ndarray) -> np.ndarray: 17 | return np.vectorize( 18 | lambda x: max(DNNSpec.SMALL_POS, x) 19 | )(np.exp(arg)) 20 | 21 | @staticmethod 22 | def relu(arg: np.ndarray) -> np.ndarray: 23 | return np.vectorize(lambda x: x if x > 0. else 0.)(arg) 24 | 25 | @staticmethod 26 | def relu_deriv(res: np.ndarray) -> np.ndarray: 27 | return np.vectorize(lambda x: 1. if x > 0. 
else 0.)(res) 28 | 29 | @staticmethod 30 | def identity(arg: np.ndarray) -> np.ndarray: 31 | return arg 32 | 33 | @staticmethod 34 | def identity_deriv(res: np.ndarray) -> np.ndarray: 35 | return np.ones_like(res) 36 | 37 | @staticmethod 38 | def sigmoid(arg: np.ndarray) -> np.ndarray: 39 | return np.vectorize( 40 | lambda x: max( 41 | DNNSpec.SMALL_POS, 42 | 1. / (1. + max(DNNSpec.SMALL_POS, np.exp(-x))) 43 | ) 44 | )(arg) 45 | 46 | @staticmethod 47 | def sigmoid_deriv(res: np.ndarray) -> np.ndarray: 48 | return res * (1. - res) 49 | 50 | @staticmethod 51 | def softplus(arg: np.ndarray) -> np.ndarray: 52 | return np.log(1. + DNNSpec.fexp(arg)) 53 | 54 | @staticmethod 55 | def softplus_deriv(res: np.ndarray) -> np.ndarray: 56 | return 1. - DNNSpec.fexp(-res) 57 | 58 | @staticmethod 59 | def log_squish(arg: np.ndarray) -> np.ndarray: 60 | return np.sign(arg) * np.log(1 + np.abs(arg)) 61 | 62 | @staticmethod 63 | def log_squish_deriv(res: np.ndarray) -> np.ndarray: 64 | return DNNSpec.fexp(-np.abs(res)) 65 | 66 | @staticmethod 67 | def pos_log_squish(arg: np.ndarray) -> np.ndarray: 68 | return np.vectorize( 69 | lambda x: 1. + np.log(1. + x) if x > 0. else DNNSpec.fexp(x) 70 | )(arg) 71 | 72 | @staticmethod 73 | def pos_log_squish_deriv(res: np.ndarray) -> np.ndarray: 74 | return np.vectorize( 75 | lambda x: DNNSpec.fexp(1. - x) if x > 1. else x 76 | )(res) 77 | -------------------------------------------------------------------------------- /src/examples/exam_problems/frog_lilypad.py: -------------------------------------------------------------------------------- 1 | from processes.mdp_refined import MDPRefined 2 | from typing import Sequence, Mapping, Tuple, NoReturn 3 | 4 | 5 | def get_lily_pads_mdp(n: int) -> MDPRefined: 6 | data = { 7 | i: { 8 | 'A': { 9 | i - 1: (i / n, 0.), 10 | i + 1: (1. - i / n, 1. if i == n - 1 else 0.) 11 | }, 12 | 'B': { 13 | j: (1 / n, 1. if j == n else 0.) 14 | for j in range(n + 1) if j != i 15 | } 16 | } for i in range(1, n) 17 | } 18 | data[0] = {'A': {0: (1., 0.)}, 'B': {0: (1., 0.)}} 19 | data[n] = {'A': {n: (1., 0.)}, 'B': {n: (1., 0.)}} 20 | 21 | gamma = 1.0 22 | return MDPRefined(data, gamma) 23 | 24 | 25 | def get_sorted_q_val( 26 | q_val: Mapping[int, Mapping[str, float]] 27 | ) -> Sequence[Tuple[float, float]]: 28 | d = sorted([(s, (t['A'], t['B'])) for s, t in q_val.items()], key=lambda x: x[0]) 29 | return [z for _, z in d[1:-1]] 30 | 31 | 32 | def direct_bellman(n: int) -> Mapping[int, float]: 33 | vf = [0.5] * (n + 1) 34 | vf[0] = 0. 35 | vf[n] = 0. 36 | tol = 1e-8 37 | epsilon = tol * 1e4 38 | while epsilon >= tol: 39 | old_vf = [v for v in vf] 40 | for i in range(1, n): 41 | vf[i] = max( 42 | (1. if i == n - 1 else 0.) + i * vf[i - 1] + (n - i) * vf[i + 1], 43 | 1. 
+ sum(vf[j] for j in range(1, n) if j != i) 44 | ) / n 45 | epsilon = max(abs(old_vf[i] - v) for i, v in enumerate(vf)) 46 | return {v: f for v, f in enumerate(vf)} 47 | 48 | 49 | def graph_q_func(a: Sequence[Tuple[float, float]]) -> NoReturn: 50 | import matplotlib.pyplot as plt 51 | x_vals = range(1, len(a) + 1) 52 | plt.plot(x_vals, [x for x, _ in a], "r", label="Q* for Action A") 53 | plt.plot(x_vals, [y for _, y in a], "b", label="Q* for Action B") 54 | plt.xlabel("Lilypad Number") 55 | plt.ylabel("Value") 56 | plt.title("Optimal Action Value Function") 57 | plt.xlim(xmin=x_vals[0], xmax=x_vals[-1]) 58 | plt.ylim(ymin=0.5, ymax=0.8) 59 | plt.xticks(x_vals) 60 | plt.grid(True) 61 | plt.legend(loc='lower right') 62 | plt.show() 63 | 64 | 65 | if __name__ == '__main__': 66 | pads: int = 10 67 | mdp: MDPRefined = get_lily_pads_mdp(pads) 68 | pol = mdp.get_optimal_policy(1e-8) 69 | print(pol.policy_data) 70 | print(mdp.get_value_func_dict(pol)) 71 | qv = mdp.get_act_value_func_dict(pol) 72 | graph_q_func(get_sorted_q_val(qv)) 73 | -------------------------------------------------------------------------------- /src/processes/mrp.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Sequence 2 | from processes.mp import MP 3 | from utils.gen_utils import zip_dict_of_tuple, is_approx_eq 4 | import numpy as np 5 | from utils.generic_typevars import S 6 | from utils.standard_typevars import STSff 7 | 8 | 9 | class MRP(MP): 10 | 11 | def __init__( 12 | self, 13 | info: STSff, 14 | gamma: float 15 | ): 16 | d1, d2 = zip_dict_of_tuple(info) 17 | super().__init__(d1) 18 | self.gamma: float = gamma 19 | self.rewards: Mapping[S, float] = d2 20 | self.terminal_states = self.get_terminal_states() 21 | self.nt_states_list: Sequence[S] = self.get_nt_states_list() 22 | self.trans_matrix: np.ndarray = self.get_trans_matrix() 23 | self.rewards_vec: np.ndarray = self.get_rewards_vec() 24 | 25 | def get_terminal_states(self) -> Set[S]: 26 | sink = self.get_sink_states() 27 | return {s for s in sink if is_approx_eq(self.rewards[s], 0.0)} 28 | 29 | def get_nt_states_list(self) -> Sequence[S]: 30 | return [s for s in self.all_states_list 31 | if s not in self.terminal_states] 32 | 33 | def get_trans_matrix(self) -> np.ndarray: 34 | """ 35 | This transition matrix is only for the non-terminal states 36 | """ 37 | n = len(self.nt_states_list) 38 | m = np.zeros((n, n)) 39 | for i in range(n): 40 | for s, d in self.transitions[self.nt_states_list[i]].items(): 41 | if s in self.nt_states_list: 42 | m[i, self.nt_states_list.index(s)] = d 43 | return m 44 | 45 | def get_rewards_vec(self) -> np.ndarray: 46 | """ 47 | This rewards vec is only for the non-terminal states 48 | """ 49 | return np.array([self.rewards[s] for s in self.nt_states_list]) 50 | 51 | def get_value_func_vec(self) -> np.ndarray: 52 | """ 53 | This value func vec is only for the non-terminal states 54 | """ 55 | return np.linalg.inv( 56 | np.eye(len(self.nt_states_list)) - self.gamma * self.trans_matrix 57 | ).dot(self.rewards_vec) 58 | 59 | 60 | if __name__ == '__main__': 61 | data = { 62 | 1: ({1: 0.6, 2: 0.3, 3: 0.1}, 7.0), 63 | 2: ({1: 0.1, 2: 0.2, 3: 0.7}, 10.0), 64 | 3: ({3: 1.0}, 0.0) 65 | } 66 | mrp_obj = MRP(data, 1.0) 67 | print(mrp_obj.trans_matrix) 68 | print(mrp_obj.rewards_vec) 69 | terminal = mrp_obj.get_terminal_states() 70 | print(terminal) 71 | value_func_vec = mrp_obj.get_value_func_vec() 72 | print(value_func_vec) 73 | 
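# Note on MRP.get_value_func_vec above: it evaluates the closed-form solution of
# the Bellman evaluation equation V = R + gamma * P * V restricted to the
# non-terminal states, i.e. V = (I - gamma * P)^{-1} * R, via the explicit matrix
# inverse. In the __main__ example, state 3 is terminal (a zero-reward sink), so
# the linear solve runs over states 1 and 2 only.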
-------------------------------------------------------------------------------- /src/algorithms/dp/dp_numeric.py: -------------------------------------------------------------------------------- 1 | from algorithms.dp.dp_base import DPBase 2 | from processes.policy import Policy 3 | from processes.det_policy import DetPolicy 4 | from processes.mp_funcs import mdp_rep_to_mrp_rep1, mdp_rep_to_mrp_rep2 5 | from processes.mdp import MDP 6 | from utils.standard_typevars import VFDictType 7 | 8 | 9 | class DPNumeric(DPBase): 10 | 11 | def __init__(self, mdp_obj: MDP, tol: float) -> None: 12 | super().__init__(mdp_obj, tol) 13 | 14 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 15 | vf = {s: 0. for s in self.mdp_obj.all_states} 16 | epsilon = self.tol * 1e4 17 | mo = self.mdp_obj 18 | pd = pol.policy_data 19 | rew = mdp_rep_to_mrp_rep2(mo.rewards, pd) 20 | prob = mdp_rep_to_mrp_rep1(mo.transitions, pd) 21 | while epsilon >= self.tol: 22 | new_vf = {s: rew[s] + mo.gamma * sum(p * vf[s1] 23 | for s1, p in prob[s].items()) 24 | for s in mo.all_states} 25 | epsilon = max(abs(new_vf[s] - v) for s, v in vf.items()) 26 | vf = new_vf 27 | return vf 28 | 29 | def get_optimal_det_policy(self) -> DetPolicy: 30 | return self.get_optimal_policy_vi() 31 | 32 | 33 | if __name__ == '__main__': 34 | from processes.mdp import MDP 35 | policy_data = { 36 | 1: {'a': 0.4, 'b': 0.6}, 37 | 2: {'a': 0.7, 'c': 0.3}, 38 | 3: {'b': 1.0} 39 | } 40 | pol_obj = Policy(policy_data) 41 | mdp_data = { 42 | 1: { 43 | 'a': ({1: 0.2, 2: 0.6, 3: 0.2}, 7.0), 44 | 'b': ({1: 0.6, 2: 0.3, 3: 0.1}, -2.0), 45 | 'c': ({1: 0.1, 2: 0.2, 3: 0.7}, 10.0) 46 | }, 47 | 2: { 48 | 'a': ({1: 0.1, 2: 0.6, 3: 0.3}, 1.0), 49 | 'c': ({1: 0.6, 2: 0.2, 3: 0.2}, -1.2) 50 | }, 51 | 3: { 52 | 'b': ({3: 1.0}, 0.0) 53 | } 54 | } 55 | gamma_val = 0.9 56 | mdp1_obj = MDP(mdp_data, gamma_val) 57 | mrp1_obj = mdp1_obj.get_mrp(pol_obj) 58 | print(mrp1_obj.transitions) 59 | print(mrp1_obj.rewards) 60 | print(mrp1_obj.trans_matrix) 61 | print(mrp1_obj.rewards_vec) 62 | print(mrp1_obj.get_value_func_vec()) 63 | tol_val = 1e-4 64 | opn = DPNumeric(mdp1_obj, tol_val) 65 | opt_policy_pi = opn.get_optimal_policy_pi() 66 | print(opt_policy_pi) 67 | opt_vf_dict_pi = opn.get_value_func_dict(opt_policy_pi) 68 | print(opt_vf_dict_pi) 69 | opt_policy_vi = opn.get_optimal_policy_vi() 70 | print(opt_policy_vi) 71 | opt_vf_dict_vi = opn.get_value_func_dict(opt_policy_vi) 72 | print(opt_vf_dict_vi) 73 | -------------------------------------------------------------------------------- /src/algorithms/mab/mab_base.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Callable, Tuple, NoReturn 2 | from abc import ABC, abstractmethod 3 | from processes.mab_env import MabEnv 4 | from numpy import ndarray, mean, vstack, cumsum, full, bincount 5 | from utils.gen_utils import memoize 6 | 7 | 8 | class MABBase(ABC): 9 | 10 | def __init__( 11 | self, 12 | mab: MabEnv, 13 | time_steps: int, 14 | num_episodes: int 15 | ) -> None: 16 | self.mab_funcs: Sequence[Callable[[], float]] = mab.arms_sampling_funcs 17 | self.num_arms: int = len(self.mab_funcs) 18 | self.time_steps: int = time_steps 19 | self.num_episodes: int = num_episodes 20 | 21 | @abstractmethod 22 | def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]: 23 | pass 24 | 25 | @memoize 26 | def get_all_rewards_actions(self) -> Sequence[Tuple[ndarray, ndarray]]: 27 | return [self.get_episode_rewards_actions() for _ in range(self.num_episodes)] 28 | 
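    # The aggregation helpers below operate on the (num_episodes x time_steps)
    # reward and action matrices. Expected regret at step t is best_mean minus
    # the average reward at step t across episodes, and expected cumulative
    # regret is its running sum over the time steps.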
29 | def get_rewards_matrix(self) -> ndarray: 30 | return vstack([x for x, _ in self.get_all_rewards_actions()]) 31 | 32 | def get_actions_matrix(self) -> ndarray: 33 | return vstack([y for _, y in self.get_all_rewards_actions()]) 34 | 35 | def get_expected_rewards(self) -> ndarray: 36 | return mean(self.get_rewards_matrix(), axis=0) 37 | 38 | def get_expected_cum_rewards(self) -> ndarray: 39 | return cumsum(self.get_expected_rewards()) 40 | 41 | def get_expected_regret(self, best_mean) -> ndarray: 42 | return full(self.time_steps, best_mean) - self.get_expected_rewards() 43 | 44 | def get_expected_cum_regret(self, best_mean) -> ndarray: 45 | return cumsum(self.get_expected_regret(best_mean)) 46 | 47 | def get_action_counts(self) -> ndarray: 48 | return vstack([bincount(ep, minlength=self.num_arms) 49 | for ep in self.get_actions_matrix()]) 50 | 51 | def get_expected_action_counts(self) -> ndarray: 52 | return mean(self.get_action_counts(), axis=0) 53 | 54 | def plot_exp_cum_regret_curve(self, best_mean) -> NoReturn: 55 | import matplotlib.pyplot as plt 56 | x_vals = range(1, self.time_steps + 1) 57 | plt.plot(self.get_expected_cum_regret(best_mean), "b", label="Exp Cum Regret") 58 | plt.xlabel("Time Steps", fontsize=20) 59 | plt.ylabel("Expected Cumulative Regret", fontsize=20) 60 | plt.title("Cumulative Regret Curve", fontsize=25) 61 | plt.xlim(xmin=x_vals[0], xmax=x_vals[-1]) 62 | plt.ylim(ymin=0.0) 63 | # plt.xticks(x_vals) 64 | plt.grid(True) 65 | # plt.legend(loc='upper left') 66 | plt.show() 67 | 68 | 69 | -------------------------------------------------------------------------------- /src/algorithms/mab/ucb1.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, List 2 | from processes.mab_env import MabEnv 3 | from operator import itemgetter 4 | from numpy import ndarray, empty, sqrt, log 5 | from algorithms.mab.mab_base import MABBase 6 | 7 | 8 | class UCB1(MABBase): 9 | 10 | def __init__( 11 | self, 12 | mab: MabEnv, 13 | time_steps: int, 14 | num_episodes: int, 15 | bounds_range: float, 16 | alpha: float 17 | ) -> None: 18 | if bounds_range < 0 or alpha <= 0: 19 | raise ValueError 20 | super().__init__( 21 | mab=mab, 22 | time_steps=time_steps, 23 | num_episodes=num_episodes 24 | ) 25 | self.bounds_range: float = bounds_range 26 | self.alpha: float = alpha 27 | 28 | def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]: 29 | ep_rewards: ndarray = empty(self.time_steps) 30 | ep_actions: ndarray = empty(self.time_steps, dtype=int) 31 | for i in range(self.num_arms): 32 | ep_rewards[i] = self.mab_funcs[i]() 33 | ep_actions[i] = i 34 | counts: List[int] = [1] * self.num_arms 35 | means: List[float] = [ep_rewards[j] for j in range(self.num_arms)] 36 | for i in range(self.num_arms, self.time_steps): 37 | ucbs: Sequence[float] = [means[j] + self.bounds_range * 38 | sqrt(0.5 * self.alpha * log(i) / counts[j]) 39 | for j in range(self.num_arms)] 40 | action: int = max(enumerate(ucbs), key=itemgetter(1))[0] 41 | reward: float = self.mab_funcs[action]() 42 | counts[action] += 1 43 | means[action] += (reward - means[action]) / counts[action] 44 | ep_rewards[i] = reward 45 | ep_actions[i] = action 46 | return ep_rewards, ep_actions 47 | 48 | 49 | if __name__ == '__main__': 50 | binomial_count = 10 51 | binomial_probs = [0.4, 0.8, 0.1, 0.5, 0.9, 0.2] 52 | binomial_params = [(binomial_count, p) for p in binomial_probs] 53 | mu_star = max(n * p for n, p in binomial_params) 54 | steps = 200 55 | episodes = 1000 56 
| this_range = binomial_count 57 | this_alpha = 4.0 58 | 59 | me = MabEnv.get_binomial_mab_env(binomial_params) 60 | ucb1 = UCB1( 61 | mab=me, 62 | time_steps=steps, 63 | num_episodes=episodes, 64 | bounds_range=this_range, 65 | alpha=this_alpha 66 | ) 67 | exp_cum_regret = ucb1.get_expected_cum_regret(mu_star) 68 | print(exp_cum_regret) 69 | 70 | exp_act_count = ucb1.get_expected_action_counts() 71 | print(exp_act_count) 72 | 73 | ucb1.plot_exp_cum_regret_curve(mu_star) 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /src/algorithms/func_approx_spec.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Sequence, NamedTuple, Optional, Tuple 2 | from func_approx.dnn_spec import DNNSpec 3 | from func_approx.func_approx_base import FuncApproxBase 4 | from func_approx.linear_approx import LinearApprox 5 | from func_approx.dnn import DNN 6 | from utils.generic_typevars import S, A 7 | 8 | 9 | class FuncApproxSpec(NamedTuple): 10 | state_feature_funcs: Sequence[Callable[[S], float]] 11 | sa_feature_funcs: Sequence[Callable[[Tuple[S, A]], float]] 12 | dnn_spec: Optional[DNNSpec] 13 | reglr_coeff: float = 0. 14 | learning_rate: float = 0.1 15 | adam_params: Tuple[bool, float, float] = (True, 0.9, 0.99) 16 | add_unit_feature: bool = True 17 | 18 | def get_vf_func_approx_obj(self) -> FuncApproxBase: 19 | if self.dnn_spec is None: 20 | ret = LinearApprox( 21 | feature_funcs=self.state_feature_funcs, 22 | reglr_coeff=self.reglr_coeff, 23 | learning_rate=self.learning_rate, 24 | adam=self.adam_params[0], 25 | adam_decay1=self.adam_params[1], 26 | adam_decay2=self.adam_params[2], 27 | add_unit_feature=self.add_unit_feature 28 | ) 29 | else: 30 | ret = DNN( 31 | feature_funcs=self.state_feature_funcs, 32 | dnn_obj=self.dnn_spec, 33 | reglr_coeff=self.reglr_coeff, 34 | learning_rate=self.learning_rate, 35 | adam=self.adam_params[0], 36 | adam_decay1=self.adam_params[1], 37 | adam_decay2=self.adam_params[2], 38 | add_unit_feature=self.add_unit_feature 39 | ) 40 | return ret 41 | 42 | def get_qvf_func_approx_obj(self) -> FuncApproxBase: 43 | if self.dnn_spec is None: 44 | ret = LinearApprox( 45 | feature_funcs=self.sa_feature_funcs, 46 | reglr_coeff=self.reglr_coeff, 47 | learning_rate=self.learning_rate, 48 | adam=self.adam_params[0], 49 | adam_decay1=self.adam_params[1], 50 | adam_decay2=self.adam_params[2], 51 | add_unit_feature=self.add_unit_feature 52 | ) 53 | else: 54 | ret = DNN( 55 | feature_funcs=self.sa_feature_funcs, 56 | dnn_obj=self.dnn_spec, 57 | reglr_coeff=self.reglr_coeff, 58 | learning_rate=self.learning_rate, 59 | adam=self.adam_params[0], 60 | adam_decay1=self.adam_params[1], 61 | adam_decay2=self.adam_params[2], 62 | add_unit_feature=self.add_unit_feature 63 | ) 64 | return ret 65 | -------------------------------------------------------------------------------- /src/algorithms/mab/gradient_bandits.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, List 2 | from processes.mab_env import MabEnv 3 | from operator import itemgetter 4 | from numpy import ndarray, empty, exp 5 | from numpy.random import choice 6 | from algorithms.mab.mab_base import MABBase 7 | 8 | 9 | class GradientBandits(MABBase): 10 | 11 | def __init__( 12 | self, 13 | mab: MabEnv, 14 | time_steps: int, 15 | num_episodes: int, 16 | learning_rate: float, 17 | learning_rate_decay: float 18 | ) -> None: 19 | if learning_rate <= 0 or 
learning_rate_decay <= 0: 20 | raise ValueError 21 | super().__init__( 22 | mab=mab, 23 | time_steps=time_steps, 24 | num_episodes=num_episodes 25 | ) 26 | self.learning_rate: float = learning_rate 27 | self.learning_rate_decay: float = learning_rate_decay 28 | 29 | def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]: 30 | ep_rewards: ndarray = empty(self.time_steps) 31 | ep_actions: ndarray = empty(self.time_steps, dtype=int) 32 | scores: List[float] = [0.] * self.num_arms 33 | avg_reward: float = 0. 34 | 35 | for i in range(self.time_steps): 36 | max_score: float = max(scores) 37 | exp_scores: Sequence[float] = [exp(s - max_score) for s in scores] 38 | sum_exp_scores = sum(exp_scores) 39 | probs: Sequence[float] = [s / sum_exp_scores for s in exp_scores] 40 | action: int = choice(self.num_arms, p=probs) 41 | reward: float = self.mab_funcs[action]() 42 | avg_reward += (reward - avg_reward) / (i + 1) 43 | step_size: float = self.learning_rate *\ 44 | (i / self.learning_rate_decay + 1) ** -0.5 45 | for j in range(self.num_arms): 46 | scores[j] += step_size * (reward - avg_reward) *\ 47 | ((1 if j == action else 0) - probs[j]) 48 | 49 | ep_rewards[i] = reward 50 | ep_actions[i] = action 51 | return ep_rewards, ep_actions 52 | 53 | 54 | if __name__ == '__main__': 55 | mean_vars_data = [(9., 5.), (10., 2.), (0., 4.), (6., 10.), (2., 20.), (4., 1.)] 56 | mu_star = max(mean_vars_data, key=itemgetter(0))[0] 57 | steps = 200 58 | episodes = 1000 59 | lr = 0.1 60 | lr_decay = 20.0 61 | 62 | me = MabEnv.get_gaussian_mab_env(mean_vars_data) 63 | ucb1 = GradientBandits( 64 | mab=me, 65 | time_steps=steps, 66 | num_episodes=episodes, 67 | learning_rate=lr, 68 | learning_rate_decay=lr_decay 69 | ) 70 | exp_cum_regret = ucb1.get_expected_cum_regret(mu_star) 71 | print(exp_cum_regret) 72 | 73 | exp_act_count = ucb1.get_expected_action_counts() 74 | print(exp_act_count) 75 | 76 | ucb1.plot_exp_cum_regret_curve(mu_star) 77 | -------------------------------------------------------------------------------- /src/examples/exam_problems/wage_max.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, NamedTuple 2 | import numpy as np 3 | 4 | 5 | class WageMax(NamedTuple): 6 | 7 | probs: Sequence[float] 8 | wages: Sequence[float] 9 | gamma: float 10 | alpha: float 11 | risk_aversion: float 12 | 13 | def validate_inputs(self) -> bool: 14 | b1 = abs(sum(self.probs) - 1) <= 1e-8 15 | b2 = len(self.probs) + 1 == len(self.wages) 16 | b3 = all(self.wages[0] < w for w in self.wages[1:]) 17 | b4 = 0. <= self.gamma < 1. 18 | b5 = 0. <= self.alpha <= 1. 19 | b6 = self.risk_aversion > 0. 20 | return all([b1, b2, b3, b4, b5, b6]) 21 | 22 | # noinspection PyShadowingNames 23 | def get_wages_utility(self) -> Sequence[float]: 24 | a = self.risk_aversion 25 | f = (lambda x, a=a: (pow(x, 1 - a) - 1) / (1 - a)) \ 26 | if a != 1 else (lambda x: np.log(x)) 27 | return [f(w) for w in self.wages] 28 | 29 | def get_opt_vf(self) -> Sequence[float]: 30 | jobs = len(self.probs) 31 | utils = self.get_wages_utility() 32 | vf = [0.] 
* (jobs + 1) 33 | tol = 1e-6 34 | epsilon = tol * 1e6 35 | while epsilon >= tol: 36 | old_vf = [v for v in vf] 37 | vf[0] = sum(self.probs[i] * max( 38 | vf[i + 1], 39 | utils[0] + self.gamma * vf[0] 40 | ) for i in range(jobs)) 41 | for i in range(1, jobs + 1): 42 | vf[i] = utils[i] + self.gamma *\ 43 | (self.alpha * vf[0] + (1 - self.alpha) * vf[i]) 44 | epsilon = max(abs(old_vf[i] - v) for i, v in enumerate(vf)) 45 | return vf 46 | 47 | def get_opt_policy(self) -> Sequence[str]: 48 | jobs = len(self.probs) 49 | utils = self.get_wages_utility() 50 | vf = self.get_opt_vf() 51 | return ["Accept" if vf[i] > utils[0] + self.gamma * vf[0] 52 | else "Decline" for i in range(1, jobs + 1)] 53 | 54 | 55 | if __name__ == '__main__': 56 | this_probs: Sequence[float] = [0.5, 0.3, 0.2] 57 | this_wages: Sequence[float] = [1.0, 1.8, 2.8, 5.2] 58 | this_gamma: float = 0.9 59 | this_alpha: float = 0.2 60 | this_risk_aversion: float = 1.0 61 | # all_jobs = 10 62 | # this_probs: Sequence[float] = [1. / all_jobs] * all_jobs 63 | # this_wages: Sequence[float] = [i + 1 for i in range(all_jobs + 1)] 64 | # this_gamma: float = 0.5 65 | # this_alpha: float = 0.1 66 | # this_risk_aversion: float = 0.5 67 | wm = WageMax( 68 | probs=this_probs, 69 | wages=this_wages, 70 | gamma=this_gamma, 71 | alpha=this_alpha, 72 | risk_aversion=this_risk_aversion 73 | ) 74 | if not wm.validate_inputs(): 75 | raise ValueError 76 | opt_vf = wm.get_opt_vf() 77 | opt_policy = wm.get_opt_policy() 78 | print(opt_vf) 79 | print(opt_policy) -------------------------------------------------------------------------------- /src/algorithms/mab/epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | from typing import List, Callable, Tuple 2 | from processes.mab_env import MabEnv 3 | from algorithms.helper_funcs import get_epsilon_decay_func 4 | from operator import itemgetter 5 | from numpy.random import binomial, randint 6 | from numpy import ndarray, empty 7 | from algorithms.mab.mab_base import MABBase 8 | 9 | 10 | class EpsilonGreedy(MABBase): 11 | 12 | def __init__( 13 | self, 14 | mab: MabEnv, 15 | time_steps: int, 16 | num_episodes: int, 17 | epsilon: float, 18 | epsilon_half_life: float = 1e8, 19 | count_init: int = 0, 20 | mean_init: float = 0., 21 | ) -> None: 22 | if epsilon < 0 or epsilon > 1 or epsilon_half_life <= 1 or count_init < 0: 23 | raise ValueError 24 | 25 | super().__init__( 26 | mab=mab, 27 | time_steps=time_steps, 28 | num_episodes=num_episodes 29 | ) 30 | self.epsilon_func: Callable[[int], float] = get_epsilon_decay_func( 31 | epsilon, 32 | epsilon_half_life 33 | ) 34 | self.count_init: int = count_init 35 | self.mean_init: float = mean_init 36 | 37 | def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]: 38 | counts: List[int] = [self.count_init] * self.num_arms 39 | means: List[float] = [self.mean_init] * self.num_arms 40 | ep_rewards: ndarray = empty(self.time_steps) 41 | ep_actions: ndarray = empty(self.time_steps, dtype=int) 42 | for i in range(self.time_steps): 43 | max_action: int = max(enumerate(means), key=itemgetter(1))[0] 44 | epsl: float = self.epsilon_func(i) 45 | action: int = max_action if binomial(1, epsl, size=1)[0] == 0 else\ 46 | randint(self.num_arms, size=1)[0] 47 | reward: float = self.mab_funcs[action]() 48 | counts[action] += 1 49 | means[action] += (reward - means[action]) / counts[action] 50 | ep_rewards[i] = reward 51 | ep_actions[i] = action 52 | return ep_rewards, ep_actions 53 | 54 | 55 | if __name__ == '__main__': 56 | 
mean_vars_data = [(9., 5.), (10., 2.), (0., 4.), (6., 10.), (2., 20.), (4., 1.)] 57 | mu_star = max(mean_vars_data, key=itemgetter(0))[0] 58 | steps = 200 59 | episodes = 1000 60 | eps = 0.2 61 | eps_hl = 50 62 | ci = 5 63 | mi = mu_star * 3. 64 | 65 | me = MabEnv.get_gaussian_mab_env(mean_vars_data) 66 | eg = EpsilonGreedy( 67 | mab=me, 68 | time_steps=steps, 69 | num_episodes=episodes, 70 | epsilon=eps, 71 | epsilon_half_life=eps_hl, 72 | count_init=ci, 73 | mean_init=mi 74 | ) 75 | exp_cum_regret = eg.get_expected_cum_regret(mu_star) 76 | print(exp_cum_regret) 77 | 78 | exp_act_count = eg.get_expected_action_counts() 79 | print(exp_act_count) 80 | 81 | eg.plot_exp_cum_regret_curve(mu_star) 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /src/algorithms/dp/dp_base.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set 2 | from abc import abstractmethod 3 | from algorithms.tabular_base import TabularBase 4 | from processes.policy import Policy 5 | from processes.det_policy import DetPolicy 6 | from processes.mdp import MDP 7 | from operator import itemgetter 8 | from algorithms.helper_funcs import get_uniform_policy 9 | from algorithms.helper_funcs import get_det_policy_from_qf_dict 10 | from utils.generic_typevars import S, A 11 | from utils.standard_typevars import VFDictType, QFDictType 12 | 13 | 14 | class DPBase(TabularBase): 15 | 16 | def __init__(self, mdp_obj: MDP, tol: float) -> None: 17 | self.mdp_obj: MDP = mdp_obj 18 | self.tol = tol 19 | 20 | def get_state_action_dict(self) -> Mapping[S, Set[A]]: 21 | return self.mdp_obj.state_action_dict 22 | 23 | def get_init_policy(self) -> Policy: 24 | return get_uniform_policy(self.mdp_obj.state_action_dict) 25 | 26 | @abstractmethod 27 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 28 | pass 29 | 30 | def get_improved_det_policy(self, pol: Policy) -> DetPolicy: 31 | return get_det_policy_from_qf_dict(self.get_act_value_func_dict(pol)) 32 | 33 | def get_act_value_func_dict(self, pol: Policy) -> QFDictType: 34 | v_dict = self.get_value_func_dict(pol) 35 | mo = self.mdp_obj 36 | return {s: {a: r + mo.gamma * 37 | sum(p * v_dict[s1] for s1, p in 38 | mo.transitions[s][a].items()) for a, r in v.items()} 39 | for s, v in mo.rewards.items()} 40 | 41 | def get_optimal_policy_pi(self) -> DetPolicy: 42 | pol = self.get_init_policy() 43 | vf = self.get_value_func_dict(pol) 44 | epsilon = self.tol * 1e4 45 | while epsilon >= self.tol: 46 | pol = self.get_improved_det_policy(pol) 47 | new_vf = self.get_value_func_dict(pol) 48 | epsilon = max(abs(new_vf[s] - v) for s, v in vf.items()) 49 | vf = new_vf 50 | return pol 51 | 52 | def get_optimal_policy_vi(self) -> DetPolicy: 53 | vf = {s: 0. 
for s in self.mdp_obj.all_states} 54 | epsilon = self.tol * 1e4 55 | mo = self.mdp_obj 56 | while epsilon >= self.tol: 57 | new_vf = {s: max(r + mo.gamma * sum(p * vf[s1] for s1, p in 58 | mo.transitions[s][a].items()) 59 | for a, r in v.items()) 60 | for s, v in mo.rewards.items()} 61 | epsilon = max(abs(new_vf[s] - v) for s, v in vf.items()) 62 | vf = new_vf 63 | pol = DetPolicy({s: max( 64 | [(a, r + mo.gamma * sum(p * vf[s1] 65 | for s1, p in mo.transitions[s][a].items())) 66 | for a, r in v.items()], 67 | key=itemgetter(1) 68 | )[0] for s, v in mo.rewards.items()}) 69 | return pol 70 | 71 | @abstractmethod 72 | def get_optimal_det_policy(self) -> DetPolicy: 73 | pass 74 | 75 | -------------------------------------------------------------------------------- /src/examples/american_pricing/bs_pricing.py: -------------------------------------------------------------------------------- 1 | from scipy.stats import norm 2 | import numpy as np 3 | from typing import Mapping, Tuple 4 | 5 | 6 | class EuropeanBSPricing: 7 | 8 | def __init__( 9 | self, 10 | is_call: bool, 11 | spot_price: float, 12 | strike: float, 13 | expiry: float, 14 | r: float, 15 | sigma: float 16 | ) -> None: 17 | self.is_call: bool = is_call 18 | self.spot_price: float = spot_price 19 | self.strike: float = strike 20 | self.expiry: float = expiry 21 | self.r: float = r 22 | self.sigma: float = sigma 23 | self.option_price: float = self.get_option_price() 24 | self.greeks: Mapping[str, float] = self.get_greeks() 25 | 26 | def get_d1_d2(self) -> Tuple[float, float]: 27 | sigma_sqrt = self.sigma * np.sqrt(self.expiry) 28 | d1 = (np.log(self.spot_price / self.strike) + 29 | (self.r + self.sigma ** 2 / 2.) * self.expiry) / sigma_sqrt 30 | d2 = d1 - sigma_sqrt 31 | return d1, d2 32 | 33 | def get_option_price(self) -> float: 34 | d1, d2 = self.get_d1_d2() 35 | if self.is_call: 36 | ret = self.spot_price * norm.cdf(d1) -\ 37 | self.strike * np.exp(-self.r * self.expiry) * norm.cdf(d2) 38 | else: 39 | ret = self.strike * np.exp(-self.r * self.expiry) * norm.cdf(-d2)\ 40 | - self.spot_price * norm.cdf(-d1) 41 | return ret 42 | 43 | def get_greeks(self) -> Mapping[str, float]: 44 | d1, d2 = self.get_d1_d2() 45 | sqrtt = np.sqrt(self.expiry) 46 | 47 | gamma = norm.pdf(d1) / (self.spot_price * self.sigma * sqrtt) 48 | vega = self.spot_price * sqrtt * norm.pdf(d1) 49 | rho_temp = -self.strike * self.expiry * np.exp(-self.r * self.expiry) 50 | theta_temp1 = (self.spot_price * self.sigma * norm.pdf(d1)) / (2 * sqrtt) 51 | theta_temp2 = self.r * self.strike * np.exp(-self.r * self.expiry) 52 | 53 | if self.is_call: 54 | delta = norm.cdf(d1) 55 | theta = - theta_temp1 - theta_temp2 * norm.cdf(d2) 56 | rho = rho_temp * norm.cdf(d2) 57 | else: 58 | delta = -norm.cdf(-d1) 59 | theta = - theta_temp1 + theta_temp2 * norm.cdf(-d2) 60 | rho = rho_temp * norm.cdf(-d2) 61 | 62 | return { 63 | "Delta": delta, 64 | "Gamma": gamma, 65 | "Theta": theta, 66 | "Vega": vega, 67 | "Rho": rho 68 | } 69 | 70 | 71 | if __name__ == "__main__": 72 | is_call_val = False 73 | spot_price_val = 80.0 74 | strike_val = 78.0 75 | expiry_val = 2.0 76 | r_val = 0.02 77 | sigma_val = 0.25 78 | opt_obj = EuropeanBSPricing( 79 | is_call=is_call_val, 80 | spot_price=spot_price_val, 81 | strike=strike_val, 82 | expiry=expiry_val, 83 | r=r_val, 84 | sigma=sigma_val 85 | ) 86 | print(opt_obj.option_price) 87 | print(opt_obj.greeks) 88 | -------------------------------------------------------------------------------- /src/algorithms/mab/ts_gaussian.py: 
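A quick sanity check for the EuropeanBSPricing class above is put-call parity: for the same spot S, strike K, expiry T and rate r, the call and put prices must satisfy C - P = S - K * exp(-r * T). The snippet below is only an illustrative sketch (not part of the repository); it reuses the parameter values from the file's __main__ block and assumes the repository root is on the Python path, so that the same import used in num_utils.py works.

import numpy as np
from src.examples.american_pricing.bs_pricing import EuropeanBSPricing

spot, strike, expiry, r, sigma = 80.0, 78.0, 2.0, 0.02, 0.25
call_price = EuropeanBSPricing(True, spot, strike, expiry, r, sigma).option_price
put_price = EuropeanBSPricing(False, spot, strike, expiry, r, sigma).option_price
# Put-call parity: C - P should equal S - K * exp(-r * T)
assert abs((call_price - put_price) - (spot - strike * np.exp(-r * expiry))) < 1e-8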
-------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, List 2 | from processes.mab_env import MabEnv 3 | from operator import itemgetter 4 | from numpy import ndarray, empty, sqrt 5 | from numpy.random import gamma, normal 6 | from algorithms.mab.mab_base import MABBase 7 | 8 | 9 | class ThompsonSamplingGaussian(MABBase): 10 | 11 | def __init__( 12 | self, 13 | mab: MabEnv, 14 | time_steps: int, 15 | num_episodes: int, 16 | init_mean: float, 17 | init_stdev: float 18 | ) -> None: 19 | if init_stdev <= 0: 20 | raise ValueError 21 | super().__init__( 22 | mab=mab, 23 | time_steps=time_steps, 24 | num_episodes=num_episodes 25 | ) 26 | self.mu0: float = init_mean 27 | self.n0: int = 1 28 | self.alpha0: float = 1 29 | self.beta0: float = init_stdev * init_stdev 30 | 31 | def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]: 32 | # Bayesian update based on the treatment in 33 | # https://people.eecs.berkeley.edu/~jordan/courses/260-spring10/lectures/lecture5.pdf 34 | # (Section 3 on page 5, where both the mean and the variance are random) 35 | ep_rewards: ndarray = empty(self.time_steps) 36 | ep_actions: ndarray = empty(self.time_steps, dtype=int) 37 | bayes: List[Tuple[float, int, float, float]] =\ 38 | [(self.mu0, self.n0, self.alpha0, self.beta0)] * self.num_arms 39 | 40 | for i in range(self.time_steps): 41 | mean_draws: Sequence[float] = [normal( 42 | mu, 43 | 1 / sqrt(n * gamma(alpha, 1 / beta, 1)[0]), 44 | 1 45 | )[0] for mu, n, alpha, beta in bayes] 46 | action: int = max(enumerate(mean_draws), key=itemgetter(1))[0] 47 | reward: float = self.mab_funcs[action]() 48 | mu, n, alpha, beta = bayes[action] 49 | bayes[action] = ( 50 | (reward + n * mu) / (n + 1), 51 | n + 1, 52 | alpha + 0.5, 53 | beta + 0.5 * n / (n + 1) * (reward - mu) * (reward - mu) 54 | ) 55 | ep_rewards[i] = reward 56 | ep_actions[i] = action 57 | return ep_rewards, ep_actions 58 | 59 | 60 | if __name__ == '__main__': 61 | mean_vars_data = [(9., 5.), (10., 2.), (0., 4.), (6., 10.), (2., 20.), (4., 1.)] 62 | mu_star = max(mean_vars_data, key=itemgetter(0))[0] 63 | steps = 200 64 | episodes = 1000 65 | guess_mean = 0. 66 | guess_stdev = 10. 
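    # Illustrative sketch with demo-only variables (not part of the original
    # script): the per-arm draw in get_episode_rewards_actions above samples a
    # precision from Gamma(alpha, scale=1/beta) and then a mean from
    # Normal(mu, 1/sqrt(n * precision)); with the prior set up in __init__,
    # (mu0, n0, alpha0, beta0) = (guess_mean, 1, 1, guess_stdev ** 2).
    demo_precision = gamma(1.0, 1.0 / (guess_stdev * guess_stdev), 1)[0]
    demo_mean_draw = normal(guess_mean, 1.0 / sqrt(1 * demo_precision), 1)[0]
    print((demo_mean_draw, demo_precision))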
67 | 68 | me = MabEnv.get_gaussian_mab_env(mean_vars_data) 69 | ucb1 = ThompsonSamplingGaussian( 70 | mab=me, 71 | time_steps=steps, 72 | num_episodes=episodes, 73 | init_mean=guess_mean, 74 | init_stdev=guess_stdev 75 | ) 76 | exp_cum_regret = ucb1.get_expected_cum_regret(mu_star) 77 | print(exp_cum_regret) 78 | 79 | exp_act_count = ucb1.get_expected_action_counts() 80 | print(exp_act_count) 81 | 82 | ucb1.plot_exp_cum_regret_curve(mu_star) 83 | -------------------------------------------------------------------------------- /src/algorithms/backward_adp.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Sequence, Tuple, Generic, Callable 2 | from utils.gen_utils import is_approx_eq 3 | from utils.generic_typevars import S, A 4 | from operator import itemgetter 5 | from algorithms.func_approx_spec import FuncApproxSpec 6 | from func_approx.func_approx_base import FuncApproxBase 7 | 8 | 9 | class BackwardADP(Generic[S, A]): 10 | 11 | def __init__( 12 | self, 13 | state_actions_funcs: Sequence[Callable[[S], Set[A]]], 14 | sample_states_gen_funcs: Sequence[Callable[[int], Sequence[S]]], 15 | transitions_rewards_funcs: Sequence[Callable[[S, A], Mapping[S, Tuple[float, float]]]], 16 | terminal_opt_val_func: Callable[[S], float], 17 | gamma: float, 18 | fa_specs: Sequence[FuncApproxSpec] 19 | ) -> None: 20 | if (len(state_actions_funcs) == len(sample_states_gen_funcs)\ 21 | == len(transitions_rewards_funcs) == len(fa_specs))\ 22 | and 0. <= gamma <= 1.: 23 | self.state_actions_funcs = state_actions_funcs 24 | self.sample_states_gen_funcs = sample_states_gen_funcs 25 | self.transitions_rewards_funcs = transitions_rewards_funcs 26 | self.terminal_opt_val_func = terminal_opt_val_func 27 | self.gamma = gamma 28 | self.fas: Sequence[FuncApproxBase] = [x.get_vf_func_approx_obj() for x in fa_specs] 29 | self.vf_and_policy_func = self.get_vf_and_policy_func() 30 | else: 31 | raise ValueError 32 | 33 | 34 | def get_vf_and_policy_func(self) -> Sequence[Mapping[S, Tuple[float, A]]]: 35 | vf_pol = {s: (v, None) for s, v in self.terminal_opt_val.items()} 36 | ret = [] 37 | for tr in self.transitions_rewards[::-1]: 38 | vf_pol = {s: max( 39 | [( 40 | sum(p * (r + self.gamma * vf_pol[s1][0]) 41 | for s1, (p, r) in d1.items()), 42 | a 43 | ) for a, d1 in d.items()], 44 | key=itemgetter(0) 45 | ) for s, d in tr.items()} 46 | ret.append(vf_pol) 47 | return ret[::-1] 48 | 49 | 50 | if __name__ == '__main__': 51 | from scipy.stats import poisson 52 | T: int = 10 # time steps 53 | M: int = 200 # initial inventory 54 | # the following are (price, poisson mean) pairs, i.e., elasticity 55 | el: Sequence[Tuple[float, float]] = [ 56 | (10.0, 10.0), (9.0, 16.0), (8.0, 20.0), 57 | (7.0, 23.0), (6.0, 25.0), (5.0, 26.0) 58 | ] 59 | rvs = [(p, poisson(l)) for p, l in el] 60 | 61 | tr_rew_dict = { 62 | s: { 63 | p: { 64 | s - d: ( 65 | rv.pmf(d) if d < s else 1. - rv.cdf(s - 1), 66 | d * p 67 | ) for d in range(s + 1) 68 | } for p, rv in rvs 69 | } for s in range(M + 1) 70 | } 71 | 72 | bdp = BackwardADP( 73 | transitions_rewards=[tr_rew_dict] * T, 74 | terminal_opt_val={s: 0. for s in range(M + 1)}, 75 | gamma=1. 
76 | ) 77 | print(bdp.vf_and_policy[0]) 78 | -------------------------------------------------------------------------------- /src/utils/gen_utils.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from typing import Mapping, TypeVar, Tuple, Sequence, List 3 | 4 | FlattenedDict = List[Tuple[Tuple, float]] 5 | 6 | X = TypeVar('X') 7 | Y = TypeVar('Y') 8 | Z = TypeVar('Z') 9 | 10 | epsilon = 1e-8 11 | 12 | 13 | def memoize(func): 14 | cache = func.cache = {} 15 | 16 | @functools.wraps(func) 17 | def memoized_func(*args, **kwargs): 18 | key = str(args) + str(kwargs) 19 | if key not in cache: 20 | cache[key] = func(*args, **kwargs) 21 | return cache[key] 22 | return memoized_func 23 | 24 | 25 | def zip_dict_of_tuple(d: Mapping[X, Tuple[Y, Z]])\ 26 | -> Tuple[Mapping[X, Y], Mapping[X, Z]]: 27 | d1 = {k: v1 for k, (v1, _) in d.items()} 28 | d2 = {k: v2 for k, (_, v2) in d.items()} 29 | return d1, d2 30 | 31 | 32 | def sum_dicts(dicts: Sequence[Mapping[X, float]]) -> Mapping[X, float]: 33 | return {k: sum(d.get(k, 0) for d in dicts) 34 | for k in set.union(*[set(d1) for d1 in dicts])} 35 | 36 | 37 | def is_approx_eq(a: float, b: float) -> bool: 38 | return abs(a - b) <= epsilon 39 | 40 | 41 | def transpose_dict_of_dicts(d: Mapping[X, Mapping[Y, Z]])\ 42 | -> Mapping[Y, Mapping[X, Z]]: 43 | """ 44 | Returns the transposed dictionary of dictionaries. 45 | Works on irregularly shaped (non-rectangular) dicts of dicts 46 | """ 47 | all_y = set(y for _, di in d.items() for y, _ in di.items()) 48 | return {y: {x: val for x, di in d.items() 49 | for y1, val in di.items() if y1 == y} for y in all_y} 50 | 51 | 52 | def transpose_dict_of_lists(d: Mapping[X, Sequence[Y]])\ 53 | -> Sequence[Mapping[X, Y]]: 54 | """ 55 | Returns the transposed list of dictionaries. 56 | Works on irregularly shaped (non-rectangular) dicts of lists 57 | """ 58 | max_len = max(len(l) for _, l in d.items()) 59 | return [{k: l[i] for k, l in d.items() if i < len(l)} 60 | for i in range(max_len)] 61 | 62 | 63 | def transpose_list_of_dicts(l: Sequence[Mapping[X, Y]])\ 64 | -> Mapping[X, Sequence[Y]]: 65 | """ 66 | Returns the transposed dictionary of lists. 67 | Works on irregularly shaped (non-rectangular) lists of dicts 68 | Will 'compress' the result on irregularly shaped input 69 | """ 70 | all_k = set(k for d in l for k, _ in d.items()) 71 | return {k: [val for d in l for k1, val in d.items() 72 | if k1 == k] for k in all_k} 73 | 74 | 75 | def transpose_list_of_lists(l: Sequence[Sequence[X]]) -> Sequence[Sequence[X]]: 76 | """ 77 | Returns the transposed list of lists. 
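    For example, [[1, 2, 3], [4, 5]] transposes to [[1, 4], [2, 5], [3]].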
78 | Works on irregularly shaped (non-rectangular) lists of lists 79 | Will 'compress' the result on irregularly shaped input 80 | """ 81 | max_len = max(len(lin) for lin in l) 82 | return [[lin[i] for lin in l if i < len(lin)] for i in range(max_len)] 83 | 84 | 85 | def merge_dicts(d1: FlattenedDict, d2: FlattenedDict, operation): 86 | merged = d1 + d2 87 | from itertools import groupby 88 | from operator import itemgetter 89 | from functools import reduce 90 | sortd = sorted(merged, key=itemgetter(0)) 91 | grouped = groupby(sortd, key=itemgetter(0)) 92 | return [(key, reduce(operation, [x for _, x in group])) for key, group in grouped] 93 | 94 | -------------------------------------------------------------------------------- /src/utils/standard_typevars.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Sequence, Mapping, Tuple 2 | from utils.generic_typevars import S, A 3 | 4 | """ 5 | VFType (= Callable[[S], float]) is the type that represents a value 6 | function for the most general situation. Instead of thinking of a value 7 | function as a dictionary from states to returns, think of a value function 8 | as a function from states to returns. This representation works 9 | for all forms of MDPs, discrete/finite or continuous state spaces. 10 | 11 | QFType (= Callable[[S], Callable[[A], float]]) is the type that represents 12 | the action value function (or Q function) for the most general situation. 13 | Instead of thinking of a Q Function as a dictionary from state, action pairs 14 | to returns, think of a Q Function as a function from states to {functions 15 | from actions to returns}. This representation works for all forms of MDPs, 16 | discrete/finite or continuous state spaces and action spaces. 17 | 18 | PolicyType (= Callable[[S], Callable[[int], Sequence[A]]]) is the type 19 | that represents a stochastic policy for the most general situation. Instead 20 | of thinking of a policy as a dictionary from states to {dictionary from 21 | actions to probabilities}, think of a policy as a function from states to 22 | probability distributions where a probability distribution has the most 23 | general representation (that would work for discrete/finite or continuous action spaces). 24 | This general representation of a probability distribution is a function 25 | that takes as input the number of action samples and produces as output 26 | a sequence of actions drawn from that probability distribution. In other 27 | words, we can make the probability distribution as fine or coarse as we 28 | want by controlling the input to this function (the requested number of 29 | sample points). 30 | 31 | VFDictType (= Mapping[S, float]) is the type that represents a value function 32 | for a finite set of states and hence, is represented as a data structure rather 33 | than a function. One can always produce a VFType from a VFDictType by wrapping 34 | the dictionary with a function. 35 | 36 | QFDictType (= Mapping[S, Mapping[A, float]]) is the type that represents an 37 | action value function (or Q function) for a finite set of states and actions. 38 | Hence, it is represented as a data structure rather than as a function. 39 | One can always produce a QFType from a QFDictType by wrapping the dictionary 40 | of dictionaries with a function returning a function. 41 | 42 | PolicyActDictType (= Callable[[S], Mapping[A, float]]) is the type that 43 | represents a policy for arbitrary state space and finite action spaces.
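For illustration (the states 's1', 's2' below are hypothetical): a VFDictType
such as vf_dict = {'s1': 1.0, 's2': 2.5} is lifted to a VFType with
vf = lambda s: vf_dict[s], and a QFDictType qf_dict is lifted to a QFType with
qf = lambda s: (lambda a: qf_dict[s][a]).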
44 | 45 | The S*f types are types required for tabular methods which work with 46 | nested dictionaries (rather than functions) 47 | """ 48 | 49 | VFType = Callable[[S], float] 50 | QFType = Callable[[S], Callable[[A], float]] 51 | PolicyType = Callable[[S], Callable[[int], Sequence[A]]] 52 | 53 | VFDictType = Mapping[S, float] 54 | QFDictType = Mapping[S, Mapping[A, float]] 55 | PolicyActDictType = Callable[[S], Mapping[A, float]] 56 | 57 | SSf = Mapping[S, Mapping[S, float]] 58 | SSTff = Mapping[S, Mapping[S, Tuple[float, float]]] 59 | STSff = Mapping[S, Tuple[Mapping[S, float], float]] 60 | SAf = Mapping[S, Mapping[A, float]] 61 | SASf = Mapping[S, Mapping[A, Mapping[S, float]]] 62 | SASTff = Mapping[S, Mapping[A, Mapping[S, Tuple[float, float]]]] 63 | SATSff = Mapping[S, Mapping[A, Tuple[Mapping[S, float], float]]] 64 | 65 | -------------------------------------------------------------------------------- /src/func_approx/eligibility_traces.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Callable 2 | from scipy.linalg import toeplitz 3 | import numpy as np 4 | 5 | 6 | def get_decay_toeplitz_matrix( 7 | size: int, 8 | decay_param: float 9 | ) -> np.ndarray: 10 | return toeplitz( 11 | np.power(decay_param, np.arange(size)), 12 | np.insert(np.zeros(size - 1), 0, 1.) 13 | ) 14 | 15 | 16 | # noinspection PyPep8Naming 17 | def get_generalized_back_prop( 18 | dnn_params: Sequence[np.ndarray], 19 | fwd_prop: Sequence[np.ndarray], 20 | dObj_dOL: np.ndarray, 21 | factors: np.ndarray, 22 | decay_param: float, 23 | hidden_activation_deriv: Callable[[np.ndarray], np.ndarray], 24 | output_activation_deriv: Callable[[np.ndarray], np.ndarray] 25 | ) -> Sequence[np.ndarray]: 26 | """ 27 | :param dnn_params: list (of length L+1) of |O_l| x (|I_l| + 1) 2-D arrays 28 | :param fwd_prop: list (of length L+2), the first (L+1) elements are 29 | n x (|I_l| + 1) 2-D arrays representing the inputs to the (L+1) layers, 30 | and the last element is an n x 1 2-D array 31 | :param dObj_dOL: 1-D array of length n 32 | :param factors: 1-D array of length n 33 | :param decay_param: [0,1] float representing decay in time 34 | :param hidden_activation_deriv: function representing the derivative 35 | of the hidden layer activation function (expressed as a function of the 36 | output of the hidden layer activation function). 37 | :param output_activation_deriv: function representing the derivative 38 | of the output layer activation function (expressed as a function of the 39 | output of the output layer activation function).
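    (In effect, this computes eligibility-trace-weighted gradient sums: each
    time point's per-layer gradient is combined with the decayed gradients of
    the earlier time points via the Toeplitz matrix built by
    get_decay_toeplitz_matrix, and the result is then weighted by factors.)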
40 | L is the number of hidden layers, n is the number of points 41 | :return: list (of length L+1) of |O_l| x (|I_l| + 1) 2-D arrays, 42 | i.e., same as the type of self.params 43 | """ 44 | output = fwd_prop[-1][:, 0] 45 | layer_inputs = fwd_prop[:-1] 46 | # deriv initialized to 1 x n = |O_L| x n 2-D array 47 | deriv = (dObj_dOL * output_activation_deriv(output)).reshape(1, -1) 48 | decay_matrix = get_decay_toeplitz_matrix(len(factors), decay_param) 49 | back_prop = [] 50 | for l in reversed(range(len(dnn_params))): 51 | # layer l gradient is factors tensordot (decay_matrix tensordot 52 | # (deriv_l einsum layer_inputs_l) which is of dimension 53 | # n tensordot ((n x n) tensordot ((|O_l| x n) einsum (n x (|I_l| + 1))) 54 | # = n tensordot ((n x n) tensordot (n x |O_l| x (|I_l| + 1))) 55 | # = n tensordot (n x |O_l| x (|I_l| + 1)) = |O_l| x (|I_l| + 1) 56 | t1 = np.einsum('ij,jk->jik', deriv, layer_inputs[l]) 57 | if decay_param != 0: 58 | t2 = np.tensordot(decay_matrix, t1, axes=1) 59 | else: 60 | t2 = t1 61 | t3 = np.tensordot(factors, t2, axes=1) 62 | back_prop.append(t3) 63 | # deriv_l is dnn_params_{l+1}^T dot deriv_{l+1} haddamard g'(S_l), which is 64 | # ((|I_{l+1}| + 1) x |O_{l+1}|) dot (|O_{l+1}| x n) haddamard 65 | # ((|I_{l+1}| + 1) x n) --- g'(S_L) is expressed as hidden layer 66 | # activation derivative as a function of O_l (=I_{l+1}). 67 | # (notice first row of the result is removed after this calculation). 68 | # So, deriv_l has dimension |I_{l+1}| x n = |O_l| x n 69 | deriv = (np.dot(dnn_params[l].T, deriv) * 70 | hidden_activation_deriv(layer_inputs[l].T))[1:] 71 | return back_prop[::-1] 72 | -------------------------------------------------------------------------------- /src/algorithms/rl_func_approx/rl_func_approx_base.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional 2 | from abc import abstractmethod 3 | from algorithms.opt_base import OptBase 4 | from processes.mdp_rep_for_rl_fa import MDPRepForRLFA 5 | from algorithms.func_approx_spec import FuncApproxSpec 6 | from func_approx.func_approx_base import FuncApproxBase 7 | from algorithms.helper_funcs import get_uniform_policy_func 8 | from algorithms.helper_funcs import get_epsilon_decay_func 9 | from algorithms.helper_funcs import get_pdf_from_samples 10 | from operator import itemgetter 11 | from utils.generic_typevars import S, A 12 | from utils.standard_typevars import VFType, QFType 13 | from utils.standard_typevars import PolicyType, PolicyActDictType 14 | 15 | 16 | class RLFuncApproxBase(OptBase): 17 | 18 | NUM_SAMPLES_PER_ACTION = 10 19 | 20 | def __init__( 21 | self, 22 | mdp_rep_for_rl: MDPRepForRLFA, 23 | exploring_start: bool, 24 | softmax: bool, 25 | epsilon: float, 26 | epsilon_half_life: float, 27 | num_episodes: int, 28 | max_steps: int, 29 | fa_spec: FuncApproxSpec 30 | ) -> None: 31 | 32 | self.mdp_rep: MDPRepForRLFA = mdp_rep_for_rl 33 | self.exploring_start: bool = exploring_start 34 | self.softmax: bool = softmax 35 | self.epsilon_func: Callable[[int], float] = get_epsilon_decay_func( 36 | epsilon, 37 | epsilon_half_life 38 | ) 39 | self.num_episodes: int = num_episodes 40 | self.max_steps: int = max_steps 41 | self.vf_fa: FuncApproxBase = fa_spec.get_vf_func_approx_obj() 42 | self.qvf_fa: FuncApproxBase = fa_spec.get_qvf_func_approx_obj() 43 | self.state_action_func = self.mdp_rep.state_action_func 44 | 45 | def get_init_policy_func(self) -> PolicyActDictType: 46 | return 
get_uniform_policy_func(self.state_action_func) 47 | 48 | def get_value_func_fa(self, polf: PolicyActDictType) -> VFType: 49 | qv_func = self.get_qv_func_fa(polf) 50 | 51 | # noinspection PyShadowingNames 52 | def vf(s: S, polf=polf, qv_func=qv_func) -> float: 53 | return sum(polf(s)[a] * qv_func(s)(a) for a in 54 | self.state_action_func(s)) 55 | 56 | return vf 57 | 58 | # noinspection PyShadowingNames 59 | def get_value_func(self, pol_func: PolicyType) -> VFType: 60 | return self.get_value_func_fa( 61 | lambda s, pol_func=pol_func: get_pdf_from_samples( 62 | pol_func(s)(len(self.state_action_func(s)) * 63 | RLFuncApproxBase.NUM_SAMPLES_PER_ACTION) 64 | ) 65 | ) 66 | 67 | @abstractmethod 68 | def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType: 69 | pass 70 | 71 | # noinspection PyShadowingNames 72 | def get_act_value_func(self, pol_func: PolicyType) -> QFType: 73 | return self.get_qv_func_fa( 74 | lambda s, pol_func=pol_func: get_pdf_from_samples( 75 | pol_func(s)(len(self.state_action_func(s)) * 76 | RLFuncApproxBase.NUM_SAMPLES_PER_ACTION) 77 | ) 78 | ) 79 | 80 | def get_optimal_det_policy_func(self) -> Callable[[S], A]: 81 | qv_func = self.get_qv_func_fa(None) 82 | 83 | # noinspection PyShadowingNames 84 | def detp_func(s: S, qv_func=qv_func) -> A: 85 | return max( 86 | [(a, qv_func(s)(a)) for a in self.state_action_func(s)], 87 | key=itemgetter(1) 88 | )[0] 89 | 90 | return detp_func 91 | -------------------------------------------------------------------------------- /src/examples/exam_problems/W2021/career_optimization.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Mapping, Dict, Sequence, Iterable 2 | from rl.markov_decision_process import FiniteMarkovDecisionProcess 3 | from rl.dynamic_programming import value_iteration_result 4 | from rl.distribution import Categorical 5 | from scipy.stats import poisson 6 | 7 | IntPair = Tuple[int, int] 8 | CareerDecisionsMap = Mapping[int, Mapping[ 9 | IntPair, 10 | Categorical[Tuple[int, float]] 11 | ]] 12 | 13 | 14 | class CareerOptimization(FiniteMarkovDecisionProcess[int, IntPair]): 15 | 16 | def __init__( 17 | self, 18 | hours: int, 19 | wage_cap: int, 20 | alpha: float, 21 | beta: float 22 | ): 23 | self.hours = hours 24 | self.wage_cap = wage_cap 25 | self.alpha = alpha 26 | self.beta = beta 27 | super().__init__(self.get_transitions()) 28 | 29 | def get_transitions(self) -> CareerDecisionsMap: 30 | d: Dict[int, Mapping[IntPair, Categorical[Tuple[int, float]]]] = {} 31 | for w in range(1, self.wage_cap + 1): 32 | d1: Dict[IntPair, Categorical[Tuple[int, float]]] = {} 33 | for s in range(self.hours + 1): 34 | for t in range(self.hours + 1 - s): 35 | pd = poisson(self.alpha * t) 36 | prob: float = self.beta * s / self.hours 37 | r: float = w * (self.hours - s - t) 38 | same_prob: float = (1 - prob) * pd.pmf(0) 39 | sr_probs: Dict[Tuple[int, float], float] = {} 40 | if w == self.wage_cap: 41 | sr_probs[(w, r)] = 1. 
42 | elif w == self.wage_cap - 1: 43 | sr_probs[(w, r)] = same_prob 44 | sr_probs[(w + 1, r)] = 1 - same_prob 45 | else: 46 | sr_probs[(w, r)] = same_prob 47 | sr_probs[(w + 1, r)] = prob * pd.pmf(0) + pd.pmf(1) 48 | for w1 in range(w + 2, self.wage_cap): 49 | sr_probs[(w1, r)] = pd.pmf(w1 - w) 50 | sr_probs[(self.wage_cap, r)] = \ 51 | 1 - pd.cdf(self.wage_cap - w - 1) 52 | d1[(s, t)] = Categorical(sr_probs) 53 | d[w] = d1 54 | return d 55 | 56 | 57 | if __name__ == '__main__': 58 | 59 | import matplotlib.pyplot as plt 60 | from pprint import pprint 61 | hours: int = 10 62 | wage_cap: int = 30 63 | alpha: float = 0.08 64 | beta: float = 0.82 65 | gamma: float = 0.95 66 | 67 | co: CareerOptimization = CareerOptimization( 68 | hours=hours, 69 | wage_cap=wage_cap, 70 | alpha=alpha, 71 | beta=beta 72 | ) 73 | 74 | _, opt_policy = value_iteration_result(co, gamma=gamma) 75 | wages: Iterable[int] = range(1, co.wage_cap + 1) 76 | opt_actions: Mapping[int, Tuple[int, int]] = \ 77 | {w: opt_policy.act(w).value for w in wages} 78 | searching: Sequence[int] = [s for _, (s, _) in opt_actions.items()] 79 | learning: Sequence[int] = [l for _, (_, l) in opt_actions.items()] 80 | working: Sequence[int] = [co.hours - s - l for _, (s, l) in 81 | opt_actions.items()] 82 | pprint(opt_actions) 83 | plt.xticks(wages) 84 | p1 = plt.bar(wages, searching, color='red') 85 | p2 = plt.bar(wages, learning, color='blue') 86 | p3 = plt.bar(wages, working, color='green') 87 | plt.legend((p1[0], p2[0], p3[0]), ('Job-Searching', 'Learning', 'Working')) 88 | plt.grid(axis='y') 89 | plt.xlabel("Hourly Wage Level") 90 | plt.ylabel("Hours Spent") 91 | plt.title("Career Optimization") 92 | plt.show() 93 | -------------------------------------------------------------------------------- /src/examples/exam_problems/mrp_tdmc_outline.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, Mapping 2 | 3 | S = str 4 | DataType = Sequence[Sequence[Tuple[S, float]]] 5 | ProbFunc = Mapping[S, Mapping[S, float]] 6 | RewardFunc = Mapping[S, float] 7 | ValueFunc = Mapping[S, float] 8 | 9 | 10 | def get_state_return_samples( 11 | data: DataType 12 | ) -> Sequence[Tuple[S, float]]: 13 | """ 14 | prepare sequence of (state, return) pairs. 15 | Note: (state, return) pairs is not same as (state, reward) pairs. 16 | """ 17 | return [(s, sum(r for (_, r) in l[i:])) 18 | for l in data for i, (s, _) in enumerate(l)] 19 | 20 | 21 | def get_mc_value_function( 22 | state_return_samples: Sequence[Tuple[S, float]] 23 | ) -> ValueFunc: 24 | """ 25 | Implement tabular MC Value Function compatible with the interface defined above. 26 | """ 27 | 28 | 29 | def get_state_reward_next_state_samples( 30 | data: DataType 31 | ) -> Sequence[Tuple[S, float, S]]: 32 | """ 33 | prepare sequence of (state, reward, next_state) triples. 34 | """ 35 | return [(s, r, l[i+1][0] if i < len(l) - 1 else 'T') 36 | for l in data for i, (s, r) in enumerate(l)] 37 | 38 | 39 | def get_probability_and_reward_functions( 40 | srs_samples: Sequence[Tuple[S, float, S]] 41 | ) -> Tuple[ProbFunc, RewardFunc]: 42 | """ 43 | Implement code that produces the probability transitions and the 44 | reward function compatible with the interface defined above. 
45 | """ 46 | 47 | 48 | def get_mrp_value_function( 49 | prob_func: ProbFunc, 50 | reward_func: RewardFunc 51 | ) -> ValueFunc: 52 | """ 53 | Implement code that calculates the MRP Value Function from the probability 54 | transitions and reward function, compatible with the interface defined above. 55 | Hint: Use the MRP Bellman Equation and simple linear algebra 56 | """ 57 | 58 | 59 | def get_td_value_function( 60 | srs_samples: Sequence[Tuple[S, float, S]], 61 | num_updates: int = 300000, 62 | learning_rate: float = 0.3, 63 | learning_rate_decay: int = 30 64 | ) -> ValueFunc: 65 | """ 66 | Implement tabular TD(0) (with experience replay) Value Function compatible 67 | with the interface defined above. Let the step size (alpha) be: 68 | learning_rate * (updates / learning_rate_decay + 1) ** -0.5 69 | so that Robbins-Monro condition is satisfied for the sequence of step sizes. 70 | """ 71 | 72 | 73 | def get_lstd_value_function( 74 | srs_samples: Sequence[Tuple[S, float, S]] 75 | ) -> ValueFunc: 76 | """ 77 | Implement LSTD Value Function compatible with the interface defined above. 78 | Hint: Tabular is a special case of linear function approx where each feature 79 | is an indicator variables for a corresponding state and each parameter is 80 | the value function for the corresponding state. 81 | """ 82 | 83 | 84 | if __name__ == '__main__': 85 | given_data: DataType = [ 86 | [('A', 2.), ('A', 6.), ('B', 1.), ('B', 2.)], 87 | [('A', 3.), ('B', 2.), ('A', 4.), ('B', 2.), ('B', 0.)], 88 | [('B', 3.), ('B', 6.), ('A', 1.), ('B', 1.)], 89 | [('A', 0.), ('B', 2.), ('A', 4.), ('B', 4.), ('B', 2.), ('B', 3.)], 90 | [('B', 8.), ('B', 2.)] 91 | ] 92 | 93 | sr_samps = get_state_return_samples(given_data) 94 | 95 | print("------------- MONTE CARLO VALUE FUNCTION --------------") 96 | print(get_mc_value_function(sr_samps)) 97 | 98 | srs_samps = get_state_reward_next_state_samples(given_data) 99 | 100 | pfunc, rfunc = get_probability_and_reward_functions(srs_samps) 101 | print("-------------- MRP VALUE FUNCTION ----------") 102 | print(get_mrp_value_function(pfunc, rfunc)) 103 | 104 | print("------------- TD VALUE FUNCTION --------------") 105 | print(get_td_value_function(srs_samps)) 106 | 107 | print("------------- LSTD VALUE FUNCTION --------------") 108 | print(get_lstd_value_function(srs_samps)) 109 | -------------------------------------------------------------------------------- /src/algorithms/backward_dp.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Sequence, Tuple, Generic 2 | from utils.gen_utils import is_approx_eq 3 | from utils.generic_typevars import S, A 4 | from utils.standard_typevars import SASTff 5 | from operator import itemgetter 6 | 7 | 8 | class BackwardDP(Generic[S, A]): 9 | 10 | def __init__( 11 | self, 12 | transitions_rewards: Sequence[SASTff], 13 | terminal_opt_val: Mapping[S, float], 14 | gamma: float 15 | ) -> None: 16 | if BackwardDP.verify_data(transitions_rewards, terminal_opt_val, gamma): 17 | self.transitions_rewards = transitions_rewards 18 | self.terminal_opt_val = terminal_opt_val 19 | self.gamma = gamma 20 | self.vf_and_policy = self.get_vf_and_policy() 21 | else: 22 | raise ValueError 23 | 24 | @staticmethod 25 | def verify_data( 26 | transitions_rewards: Sequence[SASTff], 27 | terminal_opt_val: Mapping[S, float], 28 | gamma: float 29 | ) -> bool: 30 | valid = 0. <= gamma <= 1. 
31 | time_len = len(transitions_rewards) 32 | i = 0 33 | while valid and i < time_len: 34 | this_d = transitions_rewards[i] 35 | check_actions = all(len(v) > 0 for _, v in this_d.items()) 36 | next_dict = [{k: v for k, (v, _) in d1.items()} 37 | for _, d in this_d.items() for _, d1 in d.items()] 38 | check_pos = all(all(x >= 0 for x in d1.values()) for d1 in next_dict) 39 | check_sum = all(is_approx_eq(sum(d1.values()), 1.0) for d1 in next_dict) 40 | states = set((transitions_rewards[i+1] 41 | if i < time_len - 1 else terminal_opt_val).keys()) 42 | subset = all(set(d1.keys()).issubset(states) for d1 in next_dict) 43 | valid = valid and check_actions and check_pos and check_sum and subset 44 | i = i + 1 45 | return valid 46 | 47 | def get_vf_and_policy(self) -> Sequence[Mapping[S, Tuple[float, A]]]: 48 | vf_pol = {s: (v, None) for s, v in self.terminal_opt_val.items()} 49 | ret = [] 50 | for tr in self.transitions_rewards[::-1]: 51 | vf_pol = {s: max( 52 | [( 53 | sum(p * (r + self.gamma * vf_pol[s1][0]) 54 | for s1, (p, r) in d1.items()), 55 | a 56 | ) for a, d1 in d.items()], 57 | key=itemgetter(0) 58 | ) for s, d in tr.items()} 59 | ret.append(vf_pol) 60 | return ret[::-1] 61 | 62 | 63 | if __name__ == '__main__': 64 | from scipy.stats import poisson 65 | T: int = 50 # time steps 66 | M: int = 10 # initial inventory 67 | # the following are (price, poisson mean) pairs, i.e., elasticity 68 | el: Sequence[Tuple[float, float]] = [ 69 | (10.0, 0.1), (9.0, 0.16), (8.0, 0.22), 70 | (7.0, 0.28), (6.0, 0.38), (5.0, 0.5) 71 | ] 72 | rvs = [(p, poisson(l)) for p, l in el] 73 | 74 | tr_rew_dict = { 75 | s: { 76 | p: { 77 | s - d: ( 78 | rv.pmf(d) if d < s else 1. - rv.cdf(s - 1), 79 | d * p 80 | ) for d in range(s + 1) 81 | } for p, rv in rvs 82 | } for s in range(M + 1) 83 | } 84 | bdp = BackwardDP( 85 | transitions_rewards=[tr_rew_dict] * T, 86 | terminal_opt_val={s: 0. for s in range(M + 1)}, 87 | gamma=1. 88 | ) 89 | for i in range(T): 90 | print([(x, y) for x, (y, _) in bdp.vf_and_policy[i].items()]) 91 | for i in range(T): 92 | print([(x, z) for x, (_, z) in bdp.vf_and_policy[i].items()]) 93 | 94 | tr_rew_dicts = [] 95 | states = {float(M)} 96 | for t in range(T): 97 | tr_rew_dicts.append( 98 | { 99 | s: { 100 | p: { 101 | max(s - d, 0.): (1.0, min(s, d) * p) 102 | } for p, d in el 103 | } for s in states 104 | } 105 | ) 106 | states = {max(s - d, 0.) for s in states for _, d in el} 107 | 108 | bdp = BackwardDP( 109 | transitions_rewards=tr_rew_dicts, 110 | terminal_opt_val={s: 0. for s in states}, 111 | gamma=1. 112 | ) 113 | 114 | state = float(M) 115 | for t in range(T): 116 | v, p = bdp.vf_and_policy[t][state] 117 | print((t, state, p, v)) 118 | d = el[[x for x, _ in el].index(p)][1] 119 | state = max(state - d, 0.) 
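    # Minimal hand-checkable sketch (illustrative only; the states 's0'/'s1',
    # actions 'a'/'b' and rewards below are made up): with a single time step,
    # zero terminal values and gamma = 1, the optimal action in 's0' is simply
    # the one with the larger immediate reward.
    tiny_bdp = BackwardDP(
        transitions_rewards=[{
            's0': {'a': {'s1': (1.0, 1.0)}, 'b': {'s1': (1.0, 2.0)}},
            's1': {'a': {'s1': (1.0, 0.0)}}
        }],
        terminal_opt_val={'s1': 0.},
        gamma=1.
    )
    print(tiny_bdp.vf_and_policy[0])  # expect {'s0': (2.0, 'b'), 's1': (0.0, 'a')}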
120 | print(bdp.vf_and_policy[0].items()) 121 | 122 | -------------------------------------------------------------------------------- /src/func_approx/linear_approx.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Callable, Tuple, TypeVar 2 | from func_approx.func_approx_base import FuncApproxBase 3 | from func_approx.eligibility_traces import get_decay_toeplitz_matrix 4 | from scipy.stats import norm 5 | import numpy as np 6 | 7 | X = TypeVar('X') 8 | 9 | 10 | class LinearApprox(FuncApproxBase): 11 | 12 | def __init__( 13 | self, 14 | feature_funcs: Sequence[Callable[[X], float]], 15 | reglr_coeff: float = 0., 16 | learning_rate: float = 0.1, 17 | adam: bool = True, 18 | adam_decay1: float = 0.9, 19 | adam_decay2: float = 0.99, 20 | add_unit_feature: bool = True 21 | ): 22 | super().__init__( 23 | feature_funcs, 24 | reglr_coeff, 25 | learning_rate, 26 | adam, 27 | adam_decay1, 28 | adam_decay2, 29 | add_unit_feature 30 | ) 31 | 32 | def init_params(self) -> Sequence[np.ndarray]: 33 | return [np.zeros(self.num_features)] 34 | 35 | def init_adam_caches(self)\ 36 | -> Tuple[Sequence[np.ndarray], Sequence[np.ndarray]]: 37 | return [np.zeros(self.num_features)],\ 38 | [np.zeros(self.num_features)] 39 | 40 | def get_func_eval(self, x_vals: X): 41 | """ 42 | This must return a float but lint is not happy, so removed the 43 | return type annotation 44 | """ 45 | return np.dot(self.get_feature_vals(x_vals), self.params[0]) 46 | 47 | def get_func_eval_pts(self, x_vals_seq: Sequence[X]) -> np.ndarray: 48 | return np.dot( 49 | self.get_feature_vals_pts(x_vals_seq), 50 | self.params[0] 51 | ) 52 | 53 | def get_sum_loss_gradient( 54 | self, 55 | x_vals_seq: Sequence[X], 56 | supervisory_seq: Sequence[float] 57 | ) -> Sequence[np.ndarray]: 58 | # return [np.dot(self.get_func_eval_pts(x_vals_seq) - supervisory_seq, 59 | # self.get_feature_vals_pts(x_vals_seq))] 60 | return [np.sum((self.get_func_eval(x) - supervisory_seq[i]) * self.get_feature_vals(x) 61 | for i, x in enumerate(x_vals_seq))] 62 | 63 | # noinspection PyPep8Naming 64 | def get_sum_objective_gradient( 65 | self, 66 | x_vals_seq: Sequence[X], 67 | dObj_dOL: np.ndarray 68 | ) -> Sequence[np.ndarray]: 69 | return [dObj_dOL.dot(self.get_feature_vals_pts(x_vals_seq))] 70 | 71 | def get_el_tr_sum_loss_gradient( 72 | self, 73 | x_vals_seq: Sequence[X], 74 | supervisory_seq: Sequence[float], 75 | gamma_lambda: float 76 | ) -> Sequence[np.ndarray]: 77 | toeplitz_mat = get_decay_toeplitz_matrix(len(x_vals_seq), gamma_lambda) 78 | errors = self.get_func_eval_pts(x_vals_seq) - supervisory_seq 79 | func_grad = self.get_feature_vals_pts(x_vals_seq) 80 | return [errors.dot(toeplitz_mat.dot(func_grad))] 81 | 82 | # noinspection PyPep8Naming 83 | def get_el_tr_sum_objective_gradient( 84 | self, 85 | x_vals_seq: Sequence[X], 86 | dObj_dOL: np.ndarray, 87 | factors: np.ndarray, 88 | gamma_lambda: float 89 | ) -> Sequence[np.ndarray]: 90 | toep = get_decay_toeplitz_matrix(len(x_vals_seq), gamma_lambda) 91 | features = self.get_feature_vals_pts(x_vals_seq) 92 | return [factors.dot(toep.dot(np.diag(dObj_dOL).dot(features)))] 93 | 94 | 95 | if __name__ == '__main__': 96 | la = LinearApprox( 97 | feature_funcs=FuncApproxBase.get_identity_feature_funcs(3), 98 | reglr_coeff=0., 99 | learning_rate=0.1, 100 | adam=True, 101 | adam_decay1=0.9, 102 | adam_decay2=0.999, 103 | add_unit_feature=True 104 | ) 105 | alpha = 2.0 106 | beta_1 = 10.0 107 | beta_2 = 4.0 108 | beta_3 = -6.0 109 | beta = 
(beta_1, beta_2, beta_3) 110 | x_pts = np.arange(-10.0, 10.0, 0.5) 111 | y_pts = np.arange(-10.0, 10.0, 0.5) 112 | z_pts = np.arange(-10.0, 10.0, 0.5) 113 | pts = [(x, y, z) for x in x_pts for y in y_pts for z in z_pts] 114 | 115 | # noinspection PyShadowingNames 116 | def superv_func(pt, alpha=alpha, beta=beta): 117 | return alpha + np.dot(beta, pt) 118 | 119 | n = norm(loc=0., scale=1.) 120 | superv_pts = [superv_func(r) + n.rvs(size=1)[0] for r in pts] 121 | # import matplotlib.pyplot as plt 122 | for _ in range(1000): 123 | print(la.params[0]) 124 | la.update_params(pts, superv_pts) 125 | pred_pts = [la.get_func_eval(x) for x in pts] 126 | print(np.linalg.norm(np.array(pred_pts) - np.array(superv_pts)) / 127 | np.sqrt(len(superv_pts))) 128 | 129 | -------------------------------------------------------------------------------- /src/func_approx/func_approx_base.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Callable, Tuple, TypeVar, List, Set 2 | from abc import ABC, abstractmethod 3 | import numpy as np 4 | 5 | X = TypeVar('X') 6 | very_small_pos = 1e-6 7 | 8 | 9 | class FuncApproxBase(ABC): 10 | 11 | def __init__( 12 | self, 13 | feature_funcs: Sequence[Callable[[X], float]], 14 | reglr_coeff: float, 15 | learning_rate: float, 16 | adam: bool, 17 | adam_decay1: float, 18 | adam_decay2: float, 19 | add_unit_feature: bool = True 20 | ): 21 | self.feature_funcs: Sequence[Callable[[X], float]] =\ 22 | ([FuncApproxBase.get_unit_func] if add_unit_feature else []) + feature_funcs 23 | self.num_features = len(self.feature_funcs) 24 | self.reglr_coeff = reglr_coeff 25 | self.learning_rate = learning_rate 26 | self.adam = adam 27 | self.adam_decay1 = adam_decay1 28 | self.adam_decay2 = adam_decay2 29 | self.time = 0 30 | self.params: List[np.ndarray] = self.init_params() 31 | self.adam_caches: Tuple[List[np.ndarray], List[np.ndarray]]\ 32 | = self.init_adam_caches() 33 | 34 | @staticmethod 35 | def get_unit_func(_: X) -> float: 36 | return 1. 37 | 38 | @staticmethod 39 | def get_identity_feature_funcs(n: int) -> List[Callable[[X], float]]: 40 | return [(lambda x, i=i: x[i]) for i in range(n)] 41 | 42 | @staticmethod 43 | def get_indicator_feature_funcs(values: Set[X])\ 44 | -> List[Callable[[X], float]]: 45 | return [(lambda x, v=v: 1. if x == v else 0.) 
for v in values] 46 | 47 | def get_feature_vals(self, x_vals: X) -> np.ndarray: 48 | return np.array([f(x_vals) for f in self.feature_funcs]) 49 | 50 | def get_feature_vals_pts(self, x_vals_seq: Sequence[X]) -> np.ndarray: 51 | return np.vstack(self.get_feature_vals(x) for x in x_vals_seq) 52 | 53 | @abstractmethod 54 | def init_params(self) -> Sequence[np.ndarray]: 55 | pass 56 | 57 | @abstractmethod 58 | def init_adam_caches(self)\ 59 | -> Tuple[Sequence[np.ndarray], Sequence[np.ndarray]]: 60 | pass 61 | 62 | @abstractmethod 63 | def get_func_eval(self, x_vals: X) -> float: 64 | pass 65 | 66 | @abstractmethod 67 | def get_func_eval_pts(self, x_vals_seq: Sequence[X]) -> np.ndarray: 68 | pass 69 | 70 | @abstractmethod 71 | def get_sum_loss_gradient( 72 | self, 73 | x_vals_seq: Sequence[X], 74 | supervisory_seq: Sequence[float] 75 | ) -> Sequence[np.ndarray]: 76 | pass 77 | 78 | # noinspection PyPep8Naming 79 | @abstractmethod 80 | def get_sum_objective_gradient( 81 | self, 82 | x_vals_seq: Sequence[X], 83 | dObj_dOL: np.ndarray 84 | ) -> Sequence[np.ndarray]: 85 | pass 86 | 87 | @abstractmethod 88 | def get_el_tr_sum_loss_gradient( 89 | self, 90 | x_vals_seq: Sequence[X], 91 | supervisory_seq: Sequence[float], 92 | gamma_lambda: float 93 | ) -> Sequence[np.ndarray]: 94 | pass 95 | 96 | # noinspection PyPep8Naming 97 | @abstractmethod 98 | def get_el_tr_sum_objective_gradient( 99 | self, 100 | x_vals_seq: Sequence[X], 101 | dObj_dOL: np.ndarray, 102 | factors: np.ndarray, 103 | gamma_lambda: float 104 | ) -> Sequence[np.ndarray]: 105 | pass 106 | 107 | def update_params( 108 | self, 109 | x_vals_seq: Sequence[X], 110 | supervisory_seq: Sequence[float] 111 | ) -> None: 112 | avg_loss_gradient = [g / len(x_vals_seq) for g in 113 | self.get_sum_loss_gradient(x_vals_seq, supervisory_seq)] 114 | self.update_params_from_gradient(avg_loss_gradient) 115 | 116 | def update_params_from_gradient( 117 | self, 118 | gradient: Sequence[np.ndarray] 119 | ) -> None: 120 | self.time += 1 121 | for l in range(len(self.params)): 122 | g = gradient[l] + self.reglr_coeff * self.params[l] 123 | if self.adam: 124 | self.adam_caches[0][l] = self.adam_decay1 * self.adam_caches[0][l] +\ 125 | (1 - self.adam_decay1) * g 126 | self.adam_caches[1][l] = self.adam_decay2 * self.adam_caches[1][l] +\ 127 | (1 - self.adam_decay2) * g ** 2 128 | self.params[l] -= self.learning_rate * self.adam_caches[0][l] /\ 129 | (np.sqrt(self.adam_caches[1][l]) + very_small_pos) *\ 130 | np.sqrt(1 - self.adam_decay2 ** self.time) /\ 131 | (1 - self.adam_decay1 ** self.time) 132 | else: 133 | self.params[l] -= self.learning_rate * g 134 | -------------------------------------------------------------------------------- /src/examples/exam_problems/mrp_tdmc.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, Mapping 2 | from operator import itemgetter 3 | import numpy as np 4 | from itertools import groupby 5 | from numpy.random import randint 6 | 7 | S = str 8 | DataType = Sequence[Sequence[Tuple[S, float]]] 9 | ProbFunc = Mapping[S, Mapping[S, float]] 10 | RewardFunc = Mapping[S, float] 11 | ValueFunc = Mapping[S, float] 12 | 13 | 14 | def get_state_return_samples( 15 | data: DataType 16 | ) -> Sequence[Tuple[S, float]]: 17 | return [(s, sum(r for (_, r) in l[i:])) 18 | for l in data for i, (s, _) in enumerate(l)] 19 | 20 | 21 | def get_mc_value_function( 22 | state_return_samples: Sequence[Tuple[S, float]] 23 | ) -> ValueFunc: 24 | sorted_samples = 
sorted(state_return_samples, key=itemgetter(0)) 25 | return {s: np.mean([r for _, r in l]) 26 | for s, l in groupby(sorted_samples, itemgetter(0))} 27 | 28 | 29 | def get_state_reward_next_state_samples( 30 | data: DataType 31 | ) -> Sequence[Tuple[S, float, S]]: 32 | return [(s, r, l[i+1][0] if i < len(l) - 1 else 'T') 33 | for l in data for i, (s, r) in enumerate(l)] 34 | 35 | 36 | def get_probability_and_reward_functions( 37 | srs_samples: Sequence[Tuple[S, float, S]] 38 | ) -> Tuple[ProbFunc, RewardFunc]: 39 | d = {s: [(r, s1) for _, r, s1 in l] for s, l in 40 | groupby(sorted(srs_samples, key=itemgetter(0)), itemgetter(0))} 41 | 42 | prob_func = {s: {s1: len(list(l1)) / len(l) for s1, l1 in 43 | groupby(sorted(l, key=itemgetter(1)), itemgetter(1)) 44 | if s1 != 'T'} for s, l in d.items()} 45 | reward_func = {s: np.mean([r for r, _ in l]) for s, l in d.items()} 46 | 47 | return prob_func, reward_func 48 | 49 | 50 | def get_mrp_value_function( 51 | prob_func: ProbFunc, 52 | reward_func: RewardFunc 53 | ) -> ValueFunc: 54 | states_list = list(reward_func.keys()) 55 | reward_vec = np.array([reward_func[s] for s in states_list]) 56 | prob_matrix = np.array([[prob_func[s][s1] if s1 in prob_func[s] else 0. 57 | for s1 in states_list] for s in states_list]) 58 | vec = np.linalg.inv(np.eye(len(states_list)) - prob_matrix).dot(reward_vec) 59 | return {states_list[i]: vec[i] for i in range(len(states_list))} 60 | 61 | 62 | def get_td_value_function( 63 | srs_samples: Sequence[Tuple[S, float, S]], 64 | num_updates: int = 300000, 65 | learning_rate: float = 0.3, 66 | learning_rate_decay: int = 30 67 | ) -> ValueFunc: 68 | ret = {s: [0.] for s in set(x for x, _, _ in srs_samples)} 69 | samples = len(srs_samples) 70 | for updates in range(num_updates): 71 | s, r, s1 = srs_samples[randint(samples, size=1)[0]] 72 | ret[s].append(ret[s][-1] + learning_rate * 73 | (updates / learning_rate_decay + 1) ** -0.5 74 | * (r + (ret[s1][-1] if s1 != 'T' else 0.) 
- ret[s][-1])) 75 | return {s: np.mean(v[-int(len(v) * 0.9):]) for s, v in ret.items()} 76 | 77 | 78 | def get_lstd_value_function( 79 | srs_samples: Sequence[Tuple[S, float, S]] 80 | ) -> ValueFunc: 81 | nt_states = list(set(x for x, _, _ in srs_samples)) 82 | num_nt_states = len(nt_states) 83 | phi = np.eye(num_nt_states) 84 | a_mat = np.zeros((num_nt_states, num_nt_states)) 85 | b_vec = np.zeros(num_nt_states) 86 | for s, r, s1 in srs_samples: 87 | p1 = phi[nt_states.index(s)] 88 | p2 = phi[nt_states.index(s1)] if s1 != 'T' else np.zeros(num_nt_states) 89 | a_mat += np.outer(p1, p1 - p2) 90 | b_vec += p1 * r 91 | return {nt_states[i]: v for i, v in 92 | enumerate(np.linalg.inv(a_mat).dot(b_vec))} 93 | 94 | 95 | if __name__ == '__main__': 96 | given_data: DataType = [ 97 | [('A', 2.), ('A', 6.), ('B', 1.), ('B', 2.)], 98 | [('A', 3.), ('B', 2.), ('A', 4.), ('B', 2.), ('B', 0.)], 99 | [('B', 3.), ('B', 6.), ('A', 1.), ('B', 1.)], 100 | [('A', 0.), ('B', 2.), ('A', 4.), ('B', 4.), ('B', 2.), ('B', 3.)], 101 | [('B', 8.), ('B', 2.)] 102 | ] 103 | 104 | print("------------- STATE-RETURN SAMPLES --------------") 105 | sr_samps = get_state_return_samples(given_data) 106 | print(sr_samps) 107 | print("------------- MONTE CARLO VALUE FUNCTION --------------") 108 | print(get_mc_value_function(sr_samps)) 109 | 110 | print("------------- SRS SAMPLES ----------------") 111 | srs_samps = get_state_reward_next_state_samples(given_data) 112 | print(srs_samps) 113 | 114 | print("------------- MRP --------------") 115 | pfunc, rfunc = get_probability_and_reward_functions(srs_samps) 116 | print(pfunc) 117 | print(rfunc) 118 | print("-------------- MRP VALUE FUNCTION ----------") 119 | print(get_mrp_value_function(pfunc, rfunc)) 120 | 121 | print("------------- TD VALUE FUNCTION --------------") 122 | print(get_td_value_function(srs_samps)) 123 | 124 | print("------------- LSTD VALUE FUNCTION --------------") 125 | print(get_lstd_value_function(srs_samps)) 126 | -------------------------------------------------------------------------------- /src/examples/american_pricing/num_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Callable 2 | import numpy as np 3 | from src.examples.american_pricing.bs_pricing import EuropeanBSPricing 4 | from scipy.optimize import curve_fit 5 | from numpy.polynomial.laguerre import lagval 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def get_future_price_mean_var( 10 | x: float, 11 | t: float, 12 | delta_t: float, 13 | lognormal: bool, # whether dispersion is multiplied by x or not 14 | rate_int_func: Callable[[float], float], # ir integral 15 | sigma2_int_func: Callable[[float], float], # sigma^2 integral 16 | ) -> Tuple[float, float]: 17 | """ 18 | :param x: represents underlying price at time t (= x_t) 19 | :param t: represents current time t 20 | :param delta_t: represents interval of time beyond t at which 21 | we want the future price, i.e., at time t + delta_t 22 | :param lognormal: this indicates whether dispersion func is 23 | multiplied by x or not (i.e., whether lognormal or normal) 24 | :param rate_int_func: this is ir(t) func 25 | :param sigma2_int_func: this is isig(t) func 26 | :return: mean and variance of x_{t+delta_t} if 27 | lognormal == True, else return mean and variance of 28 | log(x_{t+delta_t}) 29 | 30 | rate_int_func is ir(t) = int_0^t r(u) du 31 | 32 | If lognormal == True, we have generalized GBM 33 | dx_t = r(t) x_t dt + sigma(t) x_t dz_t 34 | The solution is (denoting t + 
delta_t as t1): 35 | x_{t1} = x_t . e^{int_t^{t1} (r(u) - 36 | sigma^2(u)/2) du + int_t^{t1} sigma(u) dz_u} 37 | So, log(x_{t1}) is normal with: 38 | Mean[log(x_{t1})] = log(x_t) + int_t^{t1} (r(u) - sigma^2(u)/2) du 39 | Variance[log(x_{t1}] = int_t^{t1} sigma^2(u) du 40 | In the case that lognormal == True, sigma2_int_func 41 | = isig(t) = int_0^t sigma^2(u) du 42 | Therefore, in the case that lognormal == True, 43 | log(x_{t1}) is normal with: 44 | Mean[log(x_{t1})] = log(x_t) + ir(t1) - ir(t) + (isig(t) - isig(t1)) / 2 45 | Variance[log(x_{t1})] = isig(t1) - isig(t) 46 | 47 | If lognormal == False, we have generalize OU with mean-reversion to 0 48 | dx_t = r(t) x_t dt + sigma(t) dz_t 49 | The solution is (denoting t + delta_t as t1) 50 | x_{t1} = x_t e^{int_t^{t1} r(u) du} + 51 | (e^{int_0^{t1} r(u) du}) . (int_t^t1 sigma(u) e^{-int_0^u r(s) ds} d_zu) 52 | So, x_{t1} is normal with: 53 | Mean[x_{t1}] = x_t . e^{int_t^{t1} r(u) du} 54 | Variance[x_{t1}] = (e^{int_0^{t1} 2 r(u) du})) . 55 | (int_t^t1 sigma^2(u) e^{-int_0^u 2 r(s) ds} du) 56 | In the case that lognormal == False, sigma2_int_func 57 | = isig(t) = int_0^t sigma^2(u) . e^{-int_0^u 2 r(s) ds} . du 58 | Therefore, in the case that lognormal == False, 59 | x_{t1} is normal with: 60 | Mean[x_{t1}] = x_t . e^{ir(t1) - ir(t)} 61 | Variance[x_{t1}] = e^{2 ir(t1)} . (isig(t1) - isig(t)) 62 | """ 63 | ir_t = rate_int_func(t) 64 | ir_t1 = rate_int_func(t + delta_t) 65 | isig_t = sigma2_int_func(t) 66 | isig_t1 = sigma2_int_func(t + delta_t) 67 | ir_diff = ir_t1 - ir_t 68 | isig_diff = isig_t1 - isig_t 69 | 70 | if lognormal: 71 | mean = np.log(x) + ir_diff - isig_diff / 2. 72 | var = isig_diff 73 | else: 74 | mean = x * np.exp(ir_diff) 75 | var = np.exp(2. * ir_t1) * isig_diff 76 | return mean, var 77 | 78 | 79 | def plot_fitted_call_prices( 80 | is_call: bool, 81 | strike: float, 82 | expiry: float, 83 | r: float, 84 | sigma: float 85 | ) -> None: 86 | spot_prices = np.linspace(strike * 0.5, strike * 1.5, 1001) 87 | option_prices = [EuropeanBSPricing( 88 | is_call, 89 | s, 90 | strike, 91 | expiry, 92 | r, 93 | sigma 94 | ).get_option_price() for s in spot_prices] 95 | 96 | def fit_func( 97 | x: np.ndarray, 98 | a: float, 99 | b: float, 100 | c: float 101 | ) -> np.ndarray: 102 | return a * np.exp(b * x + c) 103 | 104 | def jac_func( 105 | x: np.ndarray, 106 | a: float, 107 | b: float, 108 | c: float 109 | ) -> np.ndarray: 110 | t = np.exp(b * x + c) 111 | da = t 112 | db = a * t * x 113 | dc = a * t 114 | return np.transpose([da, db, dc]) 115 | 116 | fp = curve_fit( 117 | f=fit_func, 118 | xdata=spot_prices, 119 | ydata=option_prices, 120 | jac=jac_func 121 | )[0] 122 | pred1_option_prices = fit_func(spot_prices, fp[0], fp[1], fp[2]) 123 | 124 | num_laguerre = 10 125 | ident = np.eye(num_laguerre) 126 | spot_features = np.array([[1.] 
+ [np.exp(-s / (strike * 2)) * 127 | lagval(s / strike, ident[i]) for i in 128 | range(num_laguerre)] for s in spot_prices]) 129 | lp = np.linalg.lstsq( 130 | spot_features, 131 | np.array(option_prices), 132 | rcond=None 133 | )[0] 134 | pred2_option_prices = spot_features.dot(lp) 135 | 136 | plt.plot(spot_prices, option_prices, 'r') 137 | plt.plot(spot_prices, pred1_option_prices, 'b') 138 | plt.plot(spot_prices, pred2_option_prices, 'g') 139 | plt.show() 140 | 141 | 142 | if __name__ == '__main__': 143 | is_call_val = False 144 | strike_val = 80.0 145 | expiry_val = 0.4 146 | r_val = 0.02 147 | sigma_val = 0.3 148 | 149 | plot_fitted_call_prices( 150 | is_call=is_call_val, 151 | strike=strike_val, 152 | expiry=expiry_val, 153 | r=r_val, 154 | sigma=sigma_val 155 | ) 156 | -------------------------------------------------------------------------------- /src/algorithms/helper_funcs.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Sequence, Optional, Callable, Tuple 2 | from processes.policy import Policy 3 | from processes.det_policy import DetPolicy 4 | import numpy as np 5 | from scipy.linalg import toeplitz 6 | from operator import itemgetter 7 | from collections import Counter 8 | from processes.mp_funcs import get_epsilon_action_probs 9 | from processes.mp_funcs import get_softmax_action_probs 10 | from utils.generic_typevars import S, A 11 | from utils.standard_typevars import SAf, PolicyType, PolicyActDictType 12 | 13 | 14 | def get_uniform_policy(state_action_dict: Mapping[S, Set[A]]) -> Policy: 15 | return Policy({s: {a: 1. / len(v) for a in v} for s, v in 16 | state_action_dict.items()}) 17 | 18 | 19 | def get_uniform_policy_func(state_action_func: Callable[[S], Set[A]]) \ 20 | -> Callable[[S], Mapping[A, float]]: 21 | 22 | # noinspection PyShadowingNames 23 | def upf(s: S, state_action_func=state_action_func) -> Mapping[A, float]: 24 | actions = state_action_func(s) 25 | return {a: 1. 
/ len(actions) for a in actions} 26 | 27 | return upf 28 | 29 | 30 | def get_returns_from_rewards_terminating( 31 | rewards: Sequence[float], 32 | gamma: float 33 | ) -> np.ndarray: 34 | sz = len(rewards) 35 | return toeplitz( 36 | np.insert(np.zeros(sz - 1), 0, 1.), 37 | np.power(gamma, np.arange(sz)) 38 | ).dot(rewards) 39 | 40 | 41 | def get_returns_from_rewards_non_terminating( 42 | rewards: Sequence[float], 43 | gamma: float, 44 | points: Optional[int] = None 45 | ) -> np.ndarray: 46 | cnt = points if points is not None else len(rewards) 47 | return toeplitz( 48 | np.insert(np.zeros(cnt - 1), 0, 1.), 49 | np.concatenate(( 50 | np.power(gamma, np.arange(len(rewards) - cnt + 1)), 51 | np.zeros(cnt - 1) 52 | )) 53 | ).dot(rewards) 54 | 55 | 56 | def get_det_policy_from_qf_dict(qf_dict: SAf) -> DetPolicy: 57 | return DetPolicy({s: max(v.items(), key=itemgetter(1))[0] 58 | for s, v in qf_dict.items()}) 59 | 60 | 61 | def get_soft_policy_from_qf_dict( 62 | qf_dict: SAf, 63 | softmax: bool, 64 | epsilon: float 65 | ) -> Policy: 66 | if softmax: 67 | ret = Policy({s: get_softmax_action_probs(v) for s, v in 68 | qf_dict.items()}) 69 | else: 70 | ret = Policy({s: get_epsilon_action_probs(v, epsilon) for s, v in 71 | qf_dict.items()}) 72 | return ret 73 | 74 | 75 | def get_soft_policy_func_from_qf( 76 | qf: Callable[[Tuple[S, A]], float], 77 | state_action_func: Callable[[S], Set[A]], 78 | softmax: bool, 79 | epsilon: float 80 | ) -> Callable[[S], Mapping[A, float]]: 81 | 82 | # noinspection PyShadowingNames 83 | def sp_func( 84 | s: S, 85 | qf=qf, 86 | state_action_func=state_action_func, 87 | softmax=softmax, 88 | epsilon=epsilon 89 | ) -> Mapping[A, float]: 90 | av_dict = {a: qf((s, a)) for a in state_action_func(s)} 91 | return get_softmax_action_probs(av_dict) if softmax else\ 92 | get_epsilon_action_probs(av_dict, epsilon) 93 | 94 | return sp_func 95 | 96 | 97 | def get_vf_dict_from_qf_dict_and_policy( 98 | qf_dict: SAf, 99 | pol: Policy 100 | ) -> Mapping[A, float]: 101 | return {s: sum(pol.get_state_action_probability(s, a) * q 102 | for a, q in v.items()) for s, v in qf_dict.items()} 103 | 104 | 105 | def get_policy_func_for_fa( 106 | pol_func: Callable[[S], Callable[[A], float]], 107 | state_action_func: Callable[[S], Set[A]] 108 | ) -> Callable[[S], Mapping[A, float]]: 109 | 110 | # noinspection PyShadowingNames 111 | def pf( 112 | s: S, 113 | pol_func=pol_func, 114 | state_action_func=state_action_func 115 | ) -> Mapping[A, float]: 116 | return {a: pol_func(s)(a) for a in state_action_func(s)} 117 | 118 | return pf 119 | 120 | 121 | def get_nt_return_eval_steps( 122 | max_steps: int, 123 | gamma: float, 124 | eps: float 125 | ) -> int: 126 | low_limit = 0.2 * max_steps 127 | high_limit = float(max_steps - 1) 128 | if gamma == 0.: 129 | val = high_limit 130 | elif gamma == 1.: 131 | val = low_limit 132 | else: 133 | val = min( 134 | high_limit, 135 | max( 136 | low_limit, 137 | max_steps - np.log(eps) / np.log(gamma) 138 | ) 139 | ) 140 | return int(np.floor(val)) 141 | 142 | 143 | def get_epsilon_decay_func( 144 | epsilon, 145 | epsilon_half_life 146 | ) -> Callable[[int], float]: 147 | 148 | # noinspection PyShadowingNames 149 | def epsilon_decay( 150 | t: int, 151 | epsilon=epsilon, 152 | epsilon_half_life=epsilon_half_life 153 | ) -> float: 154 | return epsilon * 2 ** -(t / epsilon_half_life) 155 | 156 | return epsilon_decay 157 | 158 | 159 | def get_pdf_from_samples(samples: Sequence[A]) -> Mapping[A, float]: 160 | num_samples = len(samples) 161 | c = Counter(samples) 162 | 
return {k: v / num_samples for k, v in c.items()} 163 | 164 | 165 | def get_policy_as_action_dict(polf: PolicyType, num_samples: int)\ 166 | -> PolicyActDictType: 167 | 168 | def pf(s: S) -> Mapping[A, float]: 169 | return get_pdf_from_samples(polf(s)(num_samples)) 170 | 171 | return pf 172 | 173 | 174 | if __name__ == '__main__': 175 | rewards_list = [1., 2., 3., 4., 5., 6.] 176 | gamma_val = 0.9 177 | count = 4 178 | nt_returns_list = get_returns_from_rewards_non_terminating( 179 | rewards_list, 180 | gamma_val, 181 | count 182 | ) 183 | print(nt_returns_list) 184 | term_returns_list = get_returns_from_rewards_terminating( 185 | rewards_list, 186 | gamma_val 187 | ) 188 | print(term_returns_list) 189 | 190 | pd = {'a': 0.3, 'b': 0.2, 'c': 0.4, 'd': 0.1} 191 | from processes.mp_funcs import get_sampling_func_from_prob_dict 192 | seqf = get_sampling_func_from_prob_dict(pd) 193 | seq = seqf(1000) 194 | print(seq) 195 | print(Counter(seq)) 196 | -------------------------------------------------------------------------------- /src/algorithms/ams.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Sequence, Tuple, Generic, Callable, Optional 2 | from utils.generic_typevars import S, A 3 | import numpy as np 4 | from random import sample 5 | 6 | 7 | class AdaptiveMultistageSampling(Generic[S, A]): 8 | 9 | def __init__( 10 | self, 11 | start_state: S, 12 | actions_sets: Sequence[Set[A]], 13 | num_samples: Sequence[int], 14 | state_gen_reward_funcs: Sequence[Callable[[S, A], Tuple[Callable[[], S], float]]], 15 | terminal_opt_val_func: Callable[[S], float], 16 | discount: float, 17 | ) -> None: 18 | if len(actions_sets) == len(num_samples) == len(state_gen_reward_funcs) and \ 19 | 0. <= discount <= 1. and \ 20 | all(len(x) <= y for x, y in zip(actions_sets, num_samples)): 21 | self.start_state = start_state 22 | self.actions_sets = actions_sets 23 | self.num_samples = num_samples 24 | self.num_time_steps = len(actions_sets) 25 | self.state_gen_rewards_funcs = state_gen_reward_funcs 26 | self.terminal_opt_val_func = terminal_opt_val_func 27 | self.discount = discount 28 | else: 29 | raise ValueError 30 | 31 | def get_opt_val_and_internals( 32 | self, 33 | state: S, 34 | time_step: int 35 | ) -> Tuple[float, Optional[Mapping[A, Tuple[float, int]]]]: 36 | """ 37 | This function estimates the optimal value function V* 38 | for a given state in a given time step. 
The output is 39 | a tuple (pair) where the first element is the estimate 40 | of the optimal value function V* and the second element 41 | is a dictionary where the keys are the actions for that 42 | time step and the values are a pair where the first 43 | element in the estimated optimal Q-value function Q* 44 | for that action and the second element is the number of 45 | samples drawn for the action (that was used in estimating 46 | the Q-value function Q* for that action) 47 | """ 48 | if time_step == self.num_time_steps: 49 | ret = (self.terminal_opt_val_func(state), None) 50 | else: 51 | actions = self.actions_sets[time_step] 52 | state_gen_rewards = {a: self.state_gen_rewards_funcs[time_step](state, a) 53 | for a in actions} 54 | state_gens = {a: x for a, (x, _) in state_gen_rewards.items()} 55 | rewards = {a: y for a, (_, y) in state_gen_rewards.items()} 56 | # sample each action once, sample each action's next state, and 57 | # recursively call the next state's V* estimate 58 | val_sums = {a: self.get_opt_val_and_internals(state_gens[a](), time_step + 1)[0] 59 | for a in actions} 60 | counts = {a: 1 for a in actions} 61 | # loop num_samples[time_step] number of times (beyond the 62 | # len(actions) samples that have already been done above 63 | for i in range(len(actions), self.num_samples[time_step]): 64 | # determine the actions that dominate on the UCB Q* estimated value 65 | # and pick one of these dominating actions at random, call it a* 66 | ucb_vals = {a: rewards[a] + self.discount * val_sums[a] / counts[a] 67 | + np.sqrt(2 * np.log(i) / counts[a]) for a in actions} 68 | max_actions = {a for a, u in ucb_vals.items() if u == max(ucb_vals.values())} 69 | a_star = sample(max_actions, 1)[0] 70 | # sample a*'s next state at random, and recursively call the next state's 71 | # V* estimate 72 | next_state = state_gens[a_star]() 73 | val_sums[a_star] += self.get_opt_val_and_internals(next_state, time_step + 1)[0] 74 | counts[a_star] += 1 75 | 76 | # return estimated V* as weighted average of the estimated Q* where weights are 77 | # proportioned by the number of times an action was sampled 78 | ret1 = sum(counts[a] / self.num_samples[time_step] * 79 | (rewards[a] + self.discount * val_sums[a] / counts[a]) 80 | for a in actions) 81 | ret2 = {a: (rewards[a] + self.discount * val_sums[a] / counts[a], counts[a]) 82 | for a in actions} 83 | ret = (ret1, ret2) 84 | 85 | return ret 86 | 87 | 88 | if __name__ == '__main__': 89 | from scipy.stats import gamma 90 | from scipy.integrate import quad 91 | from utils.gen_utils import memoize 92 | 93 | init_inv: int = 80.0 # initial inventory 94 | steps: int = 4 # time steps 95 | step_samples: int = 20 96 | # the following are (price, gamma distribution mean) pairs, i.e., elasticity 97 | el: Mapping[float, float] = {10.0: 10.0, 8.0: 20.0, 5.0: 30.0} 98 | rvs = {p: gamma(l) for p, l in el.items()} 99 | terminal_vf: Callable[[S], float] = lambda s: 0. 
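    # A small clearance-pricing demo (descriptive sketch of the code below):
    # the state is the remaining inventory, an action is a sale price drawn from el,
    # demand at that price is gamma-distributed with mean el[price], and the
    # one-step reward is the expected revenue price * min(inventory, demand).
    # AMS then estimates V* at the start state by UCB-guided sampling of the
    # actions at each of the `steps` time steps.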
100 | this_discount: float = 1.0 101 | 102 | # noinspection PyShadowingNames 103 | @memoize 104 | def state_gen_rew_func(state: float, action: float, rvs=rvs) -> Tuple[Callable[[], float], float]: 105 | # noinspection PyShadowingNames 106 | def rew_f(x: float, state=state, action=action, rvs=rvs) -> float: 107 | return rvs[action].pdf(x) * (action * min(state, x)) 108 | 109 | mu = rvs[action].mean() 110 | lower = mu - 4.0 * np.sqrt(mu) 111 | upper = mu + 4.0 * np.sqrt(mu) 112 | return ( 113 | lambda state=state, action=action, el=el: max( 114 | 0., 115 | state - np.random.gamma(el[action], scale=1.0, size=1)[0] 116 | ), 117 | quad(rew_f, lower, upper)[0] 118 | ) 119 | 120 | 121 | obj = AdaptiveMultistageSampling( 122 | start_state=init_inv, 123 | actions_sets=[set(el)] * steps, 124 | num_samples=[step_samples] * steps, 125 | state_gen_reward_funcs=[state_gen_rew_func] * steps, 126 | terminal_opt_val_func=terminal_vf, 127 | discount=this_discount 128 | ) 129 | 130 | res = obj.get_opt_val_and_internals(init_inv, 0) 131 | print(res) 132 | -------------------------------------------------------------------------------- /src/examples/clearance_pricing.py: -------------------------------------------------------------------------------- 1 | from scipy.stats import poisson 2 | from algorithms.backward_dp import BackwardDP 3 | from typing import List, Tuple, Mapping, Any, Sequence 4 | from matplotlib.ticker import PercentFormatter 5 | from pathlib import Path 6 | import numpy as np 7 | from pprint import pprint 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def get_clearance_backward_dp( 12 | time_steps: int, 13 | init_inv: int, 14 | base_demand: float, 15 | el: List[Tuple[float, float]], # (price, poisson mean) pairs 16 | ) -> BackwardDP: 17 | 18 | aug_el = [(0., 0.)] + el 19 | rvs = [poisson(base_demand * (1 + l)) for _, l in aug_el] 20 | num_el = len(aug_el) 21 | 22 | tr_rew_dict = { 23 | (s, p): { 24 | p1: { 25 | (s - d, p1): ( 26 | rvs[p1].pmf(d) if d < s else 1. - rvs[p1].cdf(s - 1), 27 | d * (1 - aug_el[p1][0]) 28 | ) for d in range(s + 1) 29 | } for p1 in range(p, num_el) 30 | } for s in range(init_inv + 1) for p in range(num_el) 31 | } 32 | return BackwardDP( 33 | transitions_rewards=[tr_rew_dict] * time_steps, 34 | terminal_opt_val={(s, p): 0. for s in range(init_inv + 1) 35 | for p in range(num_el)}, 36 | gamma=1. 37 | ) 38 | 39 | 40 | def get_performance( 41 | time_steps: int, 42 | init_inv: int, 43 | base_demand: float, 44 | el: List[Tuple[float, float]], 45 | num_traces: int 46 | ) -> Mapping[str, Any]: 47 | vf_and_pol = get_clearance_backward_dp( 48 | time_steps, 49 | init_inv, 50 | base_demand, 51 | el 52 | ).vf_and_policy 53 | opt_vf = vf_and_pol[0][(init_inv, 0)][0] 54 | 55 | aug_el = [(0., 0.)] + el 56 | rvs = [poisson(base_demand * (1 + l)) for _, l in aug_el] 57 | 58 | all_revs = np.empty(num_traces) 59 | all_rem = np.empty((num_traces, time_steps)) 60 | all_actions = np.empty((num_traces, time_steps)) 61 | for i in range(num_traces): 62 | rev = 0. 63 | state = (init_inv, 0) 64 | for t in range(time_steps): 65 | action = vf_and_pol[t][state][1] 66 | price = 1 - aug_el[action][0] 67 | demand = rvs[action].rvs() 68 | rev += (min(state[0], demand) * price) 69 | state = (max(0, state[0] - demand), action) 70 | all_rem[i, t] = state[0] 71 | all_actions[i, t] = aug_el[action][0] 72 | all_revs[i] = rev 73 | 74 | mean_remaining = np.mean(all_rem, axis=0) /init_inv 75 | mean_salvage = mean_remaining[-1] 76 | mean_revenue = np.mean(all_revs) / init_inv 77 | mean_a_markdown = 1. 
- mean_salvage - mean_revenue 78 | mean_actions = np.mean(all_actions, axis=0) / init_inv 79 | stdev_remaining = np.std(all_rem, axis=0) / init_inv 80 | stdev_salvage = stdev_remaining[-1] 81 | stdev_revenue = np.std(all_revs) / init_inv 82 | stdev_a_markdown = np.sqrt(stdev_salvage ** 2 + stdev_revenue ** 2) 83 | stdev_actions = np.std(all_actions, axis=0) / init_inv 84 | 85 | return { 86 | "Optimal VF": opt_vf, 87 | "Mean Revenue": mean_revenue, 88 | "Mean AMarkdown": mean_a_markdown, 89 | "Mean Salvage": mean_salvage, 90 | "Stdev Revenue": stdev_revenue, 91 | "Stdev AMarkdown": stdev_a_markdown, 92 | "Stdev Salvage": stdev_salvage, 93 | "Mean Remaining": mean_remaining, 94 | "Mean Price Reductions": mean_actions, 95 | "Stdev Remaining": stdev_remaining, 96 | "Stdev Price Reductions": stdev_actions, 97 | } 98 | 99 | 100 | def graph_perf( 101 | time_steps: int, 102 | demand: float, 103 | inv: Sequence[int], 104 | elasticity: Tuple[float, float, float] 105 | ) -> None: 106 | revs = [] 107 | ams = [] 108 | sals = [] 109 | for initial_inv in inv: 110 | perf = get_performance( 111 | time_steps, 112 | initial_inv, 113 | demand, 114 | list(zip((0.3, 0.5, 0.7), elasticity)), 115 | 10000 116 | ) 117 | revs.append(perf["Mean Revenue"] * 100) 118 | ams.append(perf["Mean AMarkdown"] * 100) 119 | sals.append(perf["Mean Salvage"] * 100) 120 | plt.grid() 121 | plt.plot(inv, revs, "k", label="Revenue") 122 | plt.plot(inv, ams, "b", label="A-Markdown") 123 | plt.plot(inv, sals, "r", label="Salvage") 124 | plt.gca().yaxis.set_major_formatter(PercentFormatter()) 125 | plt.xlabel("Initial Inventory", fontsize=10) 126 | plt.ylabel("Percentage of Initial Value", fontsize=10) 127 | tup = ( 128 | time_steps, 129 | demand, 130 | elasticity[0] * 100, 131 | elasticity[1] * 100, 132 | elasticity[2] * 100 133 | ) 134 | plt.title( 135 | "Weeks=%d,WeeklyDemand=%.1f,Elasticity=[%d,%d,%d]" % tup, 136 | fontsize=10 137 | ) 138 | plt.legend(loc="upper right") 139 | file_name = str(Path.home()) + ("/wks=%d&dem=%d&el=%d-%d-%d.png" % tup) 140 | print("Created png file: " + file_name) 141 | plt.savefig(file_name) 142 | plt.close() 143 | 144 | 145 | if __name__ == '__main__': 146 | ts: int = 8 # time steps 147 | ii: int = 12 # initial inventory 148 | bd: float = 1.0 # base demand 149 | this_el: List[Tuple[float, float]] = [ 150 | (0.3, 0.5), (0.5, 1.1), (0.7, 1.4) 151 | ] 152 | # bdp = get_clearance_backward_dp(ts, ii, bd, this_el) 153 | # 154 | # for i in range(ts): 155 | # print([(x, y) for x, (y, _) in bdp.vf_and_policy[i].items()]) 156 | # for i in range(ts): 157 | # print([(x, z) for x, (_, z) in bdp.vf_and_policy[i].items()]) 158 | 159 | traces = 10000 160 | per = get_performance(ts, ii, bd, this_el, traces) 161 | pprint(per) 162 | 163 | # ts: int = 8 # time steps 164 | # bd: float = 1.0 # base demand 165 | # invs: Sequence[int] = list(range(2, 30, 2)) 166 | # 167 | # elasticities = [ 168 | # (0.1, 0.3, 0.5), 169 | # (0.3, 0.7, 1.0), 170 | # (0.5, 0.8, 1.1), 171 | # (0.7, 1.2, 1.5), 172 | # (0.8, 1.3, 1.7), 173 | # (1.0, 1.5, 2.0), 174 | # (1.0, 2.0, 2.5), 175 | # (1.5, 2.5, 3.5), 176 | # (2.0, 4.0, 6.0) 177 | # ] 178 | # for els in elasticities: 179 | # graph_perf(ts, bd, invs, els) 180 | -------------------------------------------------------------------------------- /src/processes/mdp.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Set, Tuple, Generic 2 | from utils.gen_utils import zip_dict_of_tuple, is_approx_eq 3 | from processes.mp_funcs import 
get_all_states, get_actions_for_states 4 | from processes.mp_funcs import verify_mdp, get_lean_transitions 5 | from processes.policy import Policy 6 | from processes.det_policy import DetPolicy 7 | from processes.mp_funcs import mdp_rep_to_mrp_rep1, mdp_rep_to_mrp_rep2 8 | from operator import itemgetter 9 | from processes.mrp import MRP 10 | from processes.mp_funcs import get_rv_gen_func 11 | from processes.mdp_rep_for_adp import MDPRepForADP 12 | from utils.generic_typevars import S, A 13 | 14 | 15 | class MDP(Generic[S, A]): 16 | 17 | def __init__( 18 | self, 19 | info: Mapping[S, Mapping[A, Tuple[Mapping[S, float], float]]], 20 | gamma: float 21 | ) -> None: 22 | if verify_mdp(info): 23 | d = {k: zip_dict_of_tuple(v) for k, v in info.items()} 24 | d1, d2 = zip_dict_of_tuple(d) 25 | self.all_states: Set[S] = get_all_states(info) 26 | self.state_action_dict: Mapping[S, Set[A]] = \ 27 | get_actions_for_states(info) 28 | self.transitions: Mapping[S, Mapping[A, Mapping[S, float]]] = \ 29 | {s: {a: get_lean_transitions(v1) for a, v1 in v.items()} 30 | for s, v in d1.items()} 31 | self.rewards: Mapping[S, Mapping[A, float]] = d2 32 | self.gamma: float = gamma 33 | self.terminal_states: Set[S] = self.get_terminal_states() 34 | else: 35 | raise ValueError 36 | 37 | def get_sink_states(self) -> Set[S]: 38 | return {k for k, v in self.transitions.items() if 39 | all(len(v1) == 1 and k in v1.keys() for _, v1 in v.items()) 40 | } 41 | 42 | def get_terminal_states(self) -> Set[S]: 43 | """ 44 | A terminal state is a sink state (100% probability to going back 45 | to itself, FOR EACH ACTION) and the rewards on those transitions back 46 | to itself are zero. 47 | """ 48 | sink = self.get_sink_states() 49 | return {s for s in sink if 50 | all(is_approx_eq(r, 0.0) for _, r in self.rewards[s].items())} 51 | 52 | def get_mrp(self, pol: Policy) -> MRP: 53 | tr = mdp_rep_to_mrp_rep1(self.transitions, pol.policy_data) 54 | rew = mdp_rep_to_mrp_rep2(self.rewards, pol.policy_data) 55 | return MRP({s: (v, rew[s]) for s, v in tr.items()}, self.gamma) 56 | 57 | def get_value_func_dict(self, pol: Policy)\ 58 | -> Mapping[S, float]: 59 | mrp_obj = self.get_mrp(pol) 60 | value_func_vec = mrp_obj.get_value_func_vec() 61 | nt_vf = {mrp_obj.nt_states_list[i]: value_func_vec[i] 62 | for i in range(len(mrp_obj.nt_states_list))} 63 | t_vf = {s: 0. for s in self.terminal_states} 64 | return {**nt_vf, **t_vf} 65 | 66 | def get_act_value_func_dict(self, pol: Policy)\ 67 | -> Mapping[S, Mapping[A, float]]: 68 | v_dict = self.get_value_func_dict(pol) 69 | return {s: {a: r + self.gamma * sum(p * v_dict[s1] for s1, p in 70 | self.transitions[s][a].items()) 71 | for a, r in v.items()} 72 | for s, v in self.rewards.items()} 73 | 74 | def get_improved_policy(self, pol: Policy) -> DetPolicy: 75 | q_dict = self.get_act_value_func_dict(pol) 76 | return DetPolicy({s: max(v.items(), key=itemgetter(1))[0] 77 | for s, v in q_dict.items()}) 78 | 79 | def get_optimal_policy(self, tol=1e-4) -> DetPolicy: 80 | pol = Policy({s: {a: 1. 
/ len(v) for a in v} for s, v in 81 | self.state_action_dict.items()}) 82 | vf = self.get_value_func_dict(pol) 83 | epsilon = tol * 1e4 84 | while epsilon >= tol: 85 | pol = self.get_improved_policy(pol) 86 | new_vf = self.get_value_func_dict(pol) 87 | epsilon = max(abs(new_vf[s] - v) for s, v in vf.items()) 88 | vf = new_vf 89 | return pol 90 | 91 | def get_mdp_rep_for_adp(self) -> MDPRepForADP: 92 | return MDPRepForADP( 93 | state_action_func=lambda s: self.state_action_dict[s], 94 | gamma=self.gamma, 95 | sample_states_gen_func=get_rv_gen_func( 96 | {s: 1. / len(self.state_action_dict) for s in 97 | self.state_action_dict.keys()} 98 | ), 99 | reward_func=lambda s, a: self.rewards[s][a], 100 | transitions_func=lambda s, a: self.transitions[s][a] 101 | ) 102 | 103 | 104 | if __name__ == '__main__': 105 | data = { 106 | 1: { 107 | 'a': ({1: 0.3, 2: 0.6, 3: 0.1}, 5.0), 108 | 'b': ({2: 0.3, 3: 0.7}, 2.8), 109 | 'c': ({1: 0.2, 2: 0.4, 3: 0.4}, -7.2) 110 | }, 111 | 2: { 112 | 'a': ({1: 0.3, 2: 0.6, 3: 0.1}, 5.0), 113 | 'c': ({1: 0.2, 2: 0.4, 3: 0.4}, -7.2) 114 | }, 115 | 3: { 116 | 'a': ({3: 1.0}, 0.0), 117 | 'b': ({3: 1.0}, 0.0) 118 | } 119 | } 120 | mdp_obj = MDP(data, 0.95) 121 | print(mdp_obj.all_states) 122 | print(mdp_obj.transitions) 123 | print(mdp_obj.rewards) 124 | terminal = mdp_obj.get_terminal_states() 125 | print(terminal) 126 | policy_data = { 127 | 1: {'a': 0.4, 'b': 0.6}, 128 | 2: {'a': 0.7, 'c': 0.3}, 129 | 3: {'b': 1.0} 130 | } 131 | pol_obj = Policy(policy_data) 132 | mdp_data = { 133 | 1: { 134 | 'a': ({1: 0.2, 2: 0.6, 3: 0.2}, 7.0), 135 | 'b': ({1: 0.6, 2: 0.3, 3: 0.1}, -2.0), 136 | 'c': ({1: 0.1, 2: 0.2, 3: 0.7}, 10.0) 137 | }, 138 | 2: { 139 | 'a': ({1: 0.1, 2: 0.6, 3: 0.3}, 1.0), 140 | 'c': ({1: 0.6, 2: 0.2, 3: 0.2}, -1.2) 141 | }, 142 | 3: { 143 | 'b': ({3: 1.0}, 0.0) 144 | } 145 | } 146 | mdp1_obj = MDP(mdp_data, gamma=0.9) 147 | mrp1_obj = mdp1_obj.get_mrp(pol_obj) 148 | print(mrp1_obj.transitions) 149 | print(mrp1_obj.rewards) 150 | print(mrp1_obj.trans_matrix) 151 | print(mrp1_obj.rewards_vec) 152 | print(mrp1_obj.get_value_func_vec()) 153 | opt_policy = mdp1_obj.get_optimal_policy() 154 | print(opt_policy.policy_data) 155 | opt_vf_dict = mdp1_obj.get_value_func_dict(opt_policy) 156 | print(opt_vf_dict) 157 | -------------------------------------------------------------------------------- /src/examples/american_pricing/vanilla_american_test.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Any 2 | import numpy as np 3 | from algorithms.td_algo_enum import TDAlgorithm 4 | from numpy.polynomial.laguerre import lagval 5 | from examples.american_pricing.american_pricing import AmericanPricing 6 | from examples.american_pricing.grid_pricing import GridPricing 7 | from src.examples.american_pricing.num_utils import get_future_price_mean_var 8 | 9 | LARGENUM = 1e8 10 | 11 | 12 | # noinspection PyShadowingNames 13 | def get_vanilla_american_price( 14 | is_call: bool, 15 | spot_price: float, 16 | strike: float, 17 | expiry: float, 18 | lognormal: bool, 19 | r: float, 20 | sigma: float, 21 | num_dt: int, 22 | num_paths: int, 23 | num_laguerre: int, 24 | params_bag: Mapping[str, Any] 25 | ) -> Mapping[str, float]: 26 | opt_payoff = lambda _, x, is_call=is_call, strike=strike:\ 27 | max(x - strike, 0.) if is_call else max(strike - x, 0.) 
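    # The code below prices the same vanilla American option three ways:
    # a grid-based backward-induction price (GridPricing), a Longstaff-Schwartz
    # least-squares price with Laguerre-polynomial features (get_ls_price), and
    # an RL function-approximation price (get_rl_fa_price) configured by params_bag.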
28 | # noinspection PyShadowingNames 29 | ir_func = lambda t, r=r: r * t 30 | isig_func = lambda t, sigma=sigma: sigma * sigma * t 31 | 32 | num_dx = 200 33 | expiry_mean, expiry_var = get_future_price_mean_var( 34 | spot_price, 35 | 0., 36 | expiry, 37 | lognormal, 38 | ir_func, 39 | isig_func 40 | ) 41 | grid_price = GridPricing( 42 | spot_price=spot_price, 43 | payoff=opt_payoff, 44 | expiry=expiry, 45 | lognormal=lognormal, 46 | ir=ir_func, 47 | isig=isig_func 48 | ).get_price( 49 | num_dt=num_dt, 50 | num_dx=num_dx, 51 | center=expiry_mean, 52 | width=np.sqrt(expiry_var) * 4 53 | ) 54 | 55 | gp = AmericanPricing( 56 | spot_price=spot_price, 57 | payoff=(lambda t, x, opt_payoff=opt_payoff: opt_payoff(t, x[-1])), 58 | expiry=expiry, 59 | lognormal=lognormal, 60 | ir=ir_func, 61 | isig=isig_func 62 | ) 63 | ident = np.eye(num_laguerre) 64 | 65 | # noinspection PyShadowingNames 66 | def laguerre_feature_func( 67 | x: float, 68 | i: int, 69 | ident=ident, 70 | strike=strike 71 | ) -> float: 72 | # noinspection PyTypeChecker 73 | xp = x / strike 74 | return np.exp(-xp / 2) * lagval(xp, ident[i]) 75 | 76 | ls_price = gp.get_ls_price( 77 | num_dt=num_dt, 78 | num_paths=num_paths, 79 | feature_funcs=[lambda _, x: 1.] + 80 | [(lambda _, x, i=i: laguerre_feature_func(x[-1], i)) for i in 81 | range(num_laguerre)] 82 | ) 83 | 84 | # noinspection PyShadowingNames 85 | def rl_feature_func( 86 | ind: int, 87 | x: float, 88 | a: bool, 89 | i: int, 90 | num_laguerre: int = num_laguerre, 91 | num_dt: int = num_dt, 92 | expiry: float = expiry 93 | ) -> float: 94 | dt = expiry / num_dt 95 | t = ind * dt 96 | if i < num_laguerre + 4: 97 | if ind < num_dt and not a: 98 | if i == 0: 99 | ret = 1. 100 | elif i < num_laguerre + 1: 101 | ret = laguerre_feature_func(x, i - 1) 102 | elif i == num_laguerre + 1: 103 | ret = np.sin(-t * np.pi / (2. * expiry) + np.pi / 2.) 104 | elif i == num_laguerre + 2: 105 | ret = np.log(expiry - t) 106 | else: 107 | rat = t / expiry 108 | ret = rat * rat 109 | else: 110 | ret = 0. 111 | else: 112 | if ind <= num_dt and a: 113 | ret = np.exp(-r * (ind * dt)) * opt_payoff(ind * dt, x) 114 | else: 115 | ret = 0. 
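        # Features 0 .. num_laguerre + 3 above are continuation features (a
        # constant, the Laguerre basis in x / strike, and time-to-expiry
        # features), zeroed out at exercise; the final feature is the
        # discounted exercise payoff, nonzero only when the exercise flag a is True.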
116 | 117 | return ret 118 | 119 | rl_price = gp.get_rl_fa_price( 120 | num_dt=num_dt, 121 | method=params_bag["method"], 122 | exploring_start=params_bag["exploring_start"], 123 | algorithm=params_bag["algorithm"], 124 | softmax=params_bag["softmax"], 125 | epsilon=params_bag["epsilon"], 126 | epsilon_half_life=params_bag["epsilon_half_life"], 127 | lambd=params_bag["lambda"], 128 | num_paths=num_paths, 129 | batch_size=params_bag["batch_size"], 130 | feature_funcs=[(lambda x, i=i: rl_feature_func( 131 | x[0][0], 132 | x[0][1][-1], 133 | x[1], 134 | i 135 | )) for i in range(num_laguerre + 5)], 136 | neurons=params_bag["neurons"], 137 | learning_rate=params_bag["learning_rate"], 138 | learning_rate_decay=params_bag["learning_rate_decay"], 139 | adam=params_bag["adam"], 140 | offline=params_bag["offline"] 141 | ) 142 | 143 | return { 144 | "Grid": grid_price, 145 | "LS": ls_price, 146 | "RL": rl_price 147 | } 148 | 149 | 150 | if __name__ == '__main__': 151 | is_call_val = False 152 | spot_price_val = 80.0 153 | strike_val = 75.0 154 | expiry_val = 2.0 155 | lognormal_val = True 156 | r_val = 0.02 157 | sigma_val = 0.25 158 | num_dt_val = 10 159 | num_paths_val = 1000000 160 | num_laguerre_val = 3 161 | 162 | params_bag_val = { 163 | "method": "LSPI", 164 | "exploring_start": False, 165 | "algorithm": TDAlgorithm.ExpectedSARSA, 166 | "softmax": False, 167 | "epsilon": 0.2, 168 | "epsilon_half_life": 100000, 169 | "batch_size": 10000, 170 | "neurons": None, 171 | "learning_rate": 0.03, 172 | "learning_rate_decay": 10000, 173 | "adam": (True, 0.9, 0.99), 174 | "lambda": 0.8, 175 | "offline": True, 176 | } 177 | 178 | am_prices = get_vanilla_american_price( 179 | is_call=is_call_val, 180 | spot_price=spot_price_val, 181 | strike=strike_val, 182 | expiry=expiry_val, 183 | lognormal=lognormal_val, 184 | r=r_val, 185 | sigma=sigma_val, 186 | num_dt=num_dt_val, 187 | num_paths=num_paths_val, 188 | num_laguerre=num_laguerre_val, 189 | params_bag=params_bag_val 190 | ) 191 | print(am_prices) 192 | print(params_bag_val) 193 | from examples.american_pricing.bs_pricing import EuropeanBSPricing 194 | 195 | ebsp = EuropeanBSPricing( 196 | is_call=is_call_val, 197 | spot_price=spot_price_val, 198 | strike=strike_val, 199 | expiry=expiry_val, 200 | r=r_val, 201 | sigma=sigma_val 202 | ) 203 | print(ebsp.option_price) 204 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MDP-DP-RL 2 | 3 | The goal of this project was to develop all Dynamic Programming and Reinforcement Learning algorithms 4 | from scratch (i.e., with no use of standard libraries, except for basic numpy and scipy tools). The 5 | "develop from scratch" goal was motivated by educational purposes - students learning this topic 6 | can understand the concepts throroughly only when they develop and work with code developed from 7 | scratch. I teach courses on this topic to a variety of student backgrounds, and each such course 8 | is big on precise programming implementations of the techniques/algorithms. In particular, I 9 | use this codebase when I teach Stanford CME 241: Reinforcement Learning for Stochastic 10 | Control Problems in Finance (http://cme241.stanford.edu). 
11 | 12 | Any feedback on code readability, performance and bugs will be greatly appreciated, as the code 13 | is still fairly raw and untested in various parts (I started working on this code in August 2018, 14 | and have mainly been in code-growth mode so far). 15 | 16 | The project started by implementing the foundational data structures for finite Markov Processes 17 | (a.k.a. Markov Chains), Markov Reward Processes (MRP), and Markov Decision Processes (MDP). This was followed by 18 | Dynamic Programming (DP) algorithms, where the focus was to represent the Bellman equations in clear mathematical 19 | terms within the code. Next was the core educational material of Reinforcement Learning, implementing 20 | the Generalized Policy Iteration algorithms based on simulations (Monte Carlo and Temporal Difference, 21 | including eligibility traces). However, the emphasis was to first implement the tabular methods so that 22 | one can work with actual data structures (finite, hence tabular), rather than functions, to represent 23 | MDP rewards and transition specifications as well as value functions and policies. Once the tabular RL 24 | methods were implemented, it was straightforward to write the same algorithms as function approximation-based 25 | algorithms. However, this required a detour to build some foundation for function approximation. I chose 26 | to implement linear and deep neural network approximations, both of which require a specification of 27 | feature functions. Backpropagation was developed from scratch, again for educational purposes. On a whim, 28 | I also implemented Approximate Dynamic Programming (ADP) algorithms, which are basically the same old 29 | Policy Iteration and Value Iteration algorithms, but now using the output of function approximation for 30 | the right-hand side of the Bellman update, and using the updated values as training data for gradient 31 | descent on the parameters of the function approximation. So far, I am finding ADP 32 | to be the most valuable algorithm for the MDP problems I typically work with. I am a bit surprised that the 33 | "literature" focuses so much on model-free methods, whereas I often know the model for many of the MDPs I work on, 34 | and so, ADP is ideal. 35 | 36 | I have chosen Python 3 as the language, mainly because I can't expect my students to have expertise in 37 | the potentially more-appropriate languages for this project, such as Scala, OCaml and Haskell. These are 38 | functional programming languages, and this topic/project is best done through a tasteful application of 39 | Functional Programming. But Python 3 is not such a bad choice, as functions are first-class entities. My core 40 | technique in this project is indeed Functional Programming, but I had to be very careful in getting around 41 | Python's "naughty" handling of function closures. I have also made heavy use of classes and TypeVars. 42 | Object-oriented polymorphism as well as type-parametrized polymorphism enabled me to cover a wide range of 43 | algorithms with plenty of common code. Python 3 also provided me the benefit of type annotations, which I 44 | have taken heavy advantage of in this project. Type annotation support turned out to be extremely valuable 45 | in the project, as my IDE (PyCharm) caught a lot of errors/warnings statically; in fact, as I was typing code, 46 | it would spot errors.
More importantly, type annotations make the interfaces very clear, and I believe any 47 | sort of mathematical programming needs a strong style of type annotations (if not static typing). 48 | 49 | This is how the modules of the project are organized. 50 | 51 | processes: All about Markov Processes, MRP, MDP and classes that serve as minimal but complete representations 52 | of an MDP for specific classes of algorithms, e.g., a representation for tabular RL, a representation for function 53 | approximation RL, and a representation for ADP. A lot of the heavy lifting is done in the "helper" sub-module 54 | mp_funcs.py. 55 | 56 | func_approx: Linear and Deep Neural Network (DNN) function approximation. Implements function evaluation (forward 57 | propagation for DNN) and gradient calculation/gradient descent (backward propagation for DNN) using ADAM. Took 58 | advantage of numpy vectors, matrices and tensors, and of computing efficiently with them. 59 | 60 | algorithms: Within this, we have the modules dp (for Dynamic Programming), adp (for Approximate Dynamic Programming), 61 | rl_tabular (for Tabular RL - Monte Carlo, SARSA, Q-Learning, Expected SARSA), and rl_func_approx (for Function 62 | Approximation RL - the same algorithms as Tabular RL). Note that I have implemented TD(0) and TD(Lambda) separately 63 | for both Tabular RL and Function Approximation RL, although TD(0) is a special case of TD(Lambda). TD(0) was 64 | implemented separately for the usual reason in this project - I find it easy to introduce a special case (in 65 | this case TD(0)) for pedagogical reasons, and showing students TD(0) as a special case with simpler/lighter 66 | code that focuses on the concept (versus the complication of eligibility traces) is quite beneficial. This is the 67 | same reason I implemented Tabular RL separately (Tabular is a special case of Linear Function Approximation where the features 68 | are indicator functions, one for each of the states/state-action pairs). Note the deep object-oriented inheritance 69 | hierarchy - rooted at the abstract base class OptBase. Note also that a lot of heavy lifting happens in the 70 | module helper_funcs.py. A couple of semi-advanced algorithms such as LSTD/LSPI and Policy Gradient are also implemented here (LSPI provides batch efficiency, and Policy Gradient is valuable when the action space is large/continuous). Some special but highly useful model-based algorithms such as Backward Induction (backward_dp.py) and Adaptive Multistage Sampling (ams.py) have also been implemented. 71 | 72 | examples: Implemented a few common examples of problems that are ideal for RL: Windy Grid, Inventory Control. For http://cme241.stanford.edu, I have also implemented initial versions of two important and interesting finance problems that can be solved by modeling them as MDPs and solving with DP/RL: 1) Optimal Asset-Allocation and Consumption when managing a portfolio of risky assets and 1 riskless asset, 2) Optimal Exercise of American Options when the option payoff is either path-dependent or the state space of the option is high-dimensional. 73 | 74 | utils: Some generic utility functions to transform data structures.
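
As a quick orientation, here is a minimal usage sketch, adapted from the __main__ blocks of processes/mdp_refined.py and algorithms/rl_tabular/td0.py (it assumes the src directory is on the Python path): specify the MDP as a nested dictionary mapping state -> action -> next-state -> (probability, reward), wrap it in the representation the algorithm expects, and run the algorithm.

```python
from processes.mdp_refined import MDPRefined
from algorithms.td_algo_enum import TDAlgorithm
from algorithms.rl_tabular.td0 import TD0

# state -> action -> next-state -> (transition probability, reward)
mdp_data = {
    1: {
        'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)},
        'b': {2: (0.3, -0.5), 3: (0.7, 2.6)},
        'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)}
    },
    2: {
        'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)},
        'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)}
    },
    3: {
        'a': {3: (1.0, 0.0)},  # state 3 is terminal: it only loops back with zero reward
        'b': {3: (1.0, 0.0)}
    }
}
mdp = MDPRefined(mdp_data, gamma=1.0)
mdp_rep = mdp.get_mdp_rep_for_rl_tabular()  # simulation-based representation for tabular RL

td0 = TD0(
    mdp_rep,
    exploring_start=False,
    algorithm=TDAlgorithm.ExpectedSARSA,
    softmax=False,
    epsilon=0.1,
    epsilon_half_life=1000,
    learning_rate=0.1,
    learning_rate_decay=1e6,
    num_episodes=10000,
    max_steps=1000
)
opt_pol = td0.get_optimal_det_policy()  # deterministic policy found by TD(0) control
print(opt_pol)
```

The ADP, function-approximation RL and policy-gradient algorithms follow the same pattern, with the other get_mdp_rep_for_* methods of MDP/MDPRefined providing the corresponding problem representations.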
75 | 76 | -------------------------------------------------------------------------------- /src/processes/mdp_refined.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import numpy as np 4 | 5 | from processes.mdp import MDP 6 | from processes.mdp_rep_for_adp_pg import MDPRepForADPPG 7 | from processes.mdp_rep_for_rl_pg import MDPRepForRLPG 8 | from processes.mdp_rep_for_rl_tabular import MDPRepForRLTabular 9 | from processes.mp_funcs import flatten_sasf_dict 10 | from processes.mp_funcs import flatten_ssf_dict 11 | from processes.mp_funcs import get_rv_gen_func 12 | from processes.mp_funcs import get_rv_gen_func_single 13 | from processes.mp_funcs import get_sampling_func_from_prob_dict 14 | from processes.mp_funcs import get_state_reward_gen_dict 15 | from processes.mp_funcs import get_state_reward_gen_func 16 | from processes.mp_funcs import mdp_rep_to_mrp_rep1 17 | from processes.mp_funcs import unflatten_sasf_dict 18 | from processes.mp_funcs import unflatten_ssf_dict 19 | from processes.mrp_refined import MRPRefined 20 | from processes.policy import Policy 21 | from utils.gen_utils import zip_dict_of_tuple, merge_dicts 22 | from utils.standard_typevars import SASf, SAf, SASTff 23 | 24 | 25 | class MDPRefined(MDP): 26 | 27 | def __init__( 28 | self, 29 | info: SASTff, 30 | gamma: float 31 | ) -> None: 32 | d1, d2, d3 = MDPRefined.split_info(info) 33 | super().__init__( 34 | {s: {a: (v1, d3[s][a]) for a, v1 in v.items()} 35 | for s, v in d1.items()}, 36 | gamma 37 | ) 38 | self.rewards_refined: SASf = d2 39 | 40 | @staticmethod 41 | def split_info(info: SASTff) -> Tuple[SASf, SASf, SAf]: 42 | c = {s: {a: zip_dict_of_tuple(v1) for a, v1 in v.items()} 43 | for s, v in info.items()} 44 | d = {k: zip_dict_of_tuple(v) for k, v in c.items()} 45 | d1, d2 = zip_dict_of_tuple(d) 46 | d3 = {s: {a: sum(np.prod(x) for x in v1.values()) 47 | for a, v1 in v.items()} for s, v in info.items()} 48 | return d1, d2, d3 49 | 50 | def get_mrp_refined(self, pol: Policy) -> MRPRefined: 51 | flat_transitions = flatten_sasf_dict(self.transitions) 52 | flat_rewards_refined = flatten_sasf_dict(self.rewards_refined) 53 | 54 | flat_exp_rewards = merge_dicts(flat_rewards_refined, flat_transitions, lambda x, y: x * y) 55 | exp_rewards = unflatten_sasf_dict(flat_exp_rewards) 56 | 57 | tr = mdp_rep_to_mrp_rep1(self.transitions, pol.policy_data) 58 | rew_ref = mdp_rep_to_mrp_rep1( 59 | exp_rewards, 60 | pol.policy_data 61 | ) 62 | flat_tr = flatten_ssf_dict(tr) 63 | flat_rew_ref = flatten_ssf_dict(rew_ref) 64 | flat_norm_rewards = merge_dicts(flat_rew_ref, flat_tr, lambda x, y: x / y) 65 | norm_rewards = unflatten_ssf_dict(flat_norm_rewards) 66 | 67 | return MRPRefined( 68 | {s: {s1: (v1, norm_rewards[s][s1]) for s1, v1 in v.items()} 69 | for s, v in tr.items()}, 70 | self.gamma 71 | ) 72 | 73 | def get_mdp_rep_for_rl_tabular(self) -> MDPRepForRLTabular: 74 | return MDPRepForRLTabular( 75 | state_action_dict=self.state_action_dict, 76 | terminal_states=self.terminal_states, 77 | state_reward_gen_dict=get_state_reward_gen_dict( 78 | self.transitions, 79 | self.rewards_refined 80 | ), 81 | gamma=self.gamma 82 | ) 83 | 84 | def get_mdp_rep_for_adp_pg(self) -> MDPRepForADPPG: 85 | return MDPRepForADPPG( 86 | gamma=self.gamma, 87 | init_states_gen_func=get_rv_gen_func( 88 | {s: 1. 
/ len(self.state_action_dict) for s in 89 | self.state_action_dict.keys()} 90 | ), 91 | state_reward_gen_func=lambda s, a, n: 92 | [(s1, self.rewards_refined[s][a][s1]) for s1 in 93 | get_sampling_func_from_prob_dict(self.transitions[s][a])(n)], 94 | # reward_func=lambda s, a: self.rewards[s][a], 95 | # transitions_func=lambda s, a: self.transitions[s][a], 96 | terminal_state_func=lambda s: s in self.terminal_states 97 | ) 98 | 99 | def get_mdp_rep_for_rl_pg(self) -> MDPRepForRLPG: 100 | return MDPRepForRLPG( 101 | gamma=self.gamma, 102 | init_state_gen_func=get_rv_gen_func_single( 103 | {s: 1. / len(self.state_action_dict) for s in 104 | self.state_action_dict.keys()} 105 | ), 106 | state_reward_gen_func=lambda s, a: get_state_reward_gen_func( 107 | self.transitions[s][a], 108 | self.rewards_refined[s][a], 109 | )(), 110 | terminal_state_func=lambda s: s in self.terminal_states, 111 | ) 112 | 113 | 114 | if __name__ == '__main__': 115 | # data = { 116 | # 1: { 117 | # 'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)}, 118 | # 'b': {2: (0.3, -0.5), 3: (0.7, 2.6)}, 119 | # 'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)} 120 | # }, 121 | # 2: { 122 | # 'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)}, 123 | # 'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)} 124 | # }, 125 | # 3: { 126 | # 'a': {3: (1.0, 0.0)}, 127 | # 'b': {3: (1.0, 0.0)} 128 | # } 129 | # } 130 | # mdp_refined_obj = MDPRefined(data, 0.95) 131 | # print(mdp_refined_obj.all_states) 132 | # print(mdp_refined_obj.transitions) 133 | # print(mdp_refined_obj.rewards) 134 | # print(mdp_refined_obj.rewards_refined) 135 | # terminal = mdp_refined_obj.get_terminal_states() 136 | # print(terminal) 137 | 138 | print("This is MDPRefined") 139 | mdp_refined_data = { 140 | 1: { 141 | 'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)}, 142 | 'b': {2: (0.3, -0.5), 3: (0.7, 2.6)}, 143 | 'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)} 144 | }, 145 | 2: { 146 | 'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)}, 147 | 'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)} 148 | }, 149 | 3: { 150 | 'a': {3: (1.0, 0.0)}, 151 | 'b': {3: (1.0, 0.0)} 152 | } 153 | } 154 | mdp_refined_obj = MDPRefined(mdp_refined_data, 0.97) 155 | print("Transitions") 156 | print(mdp_refined_obj.transitions) 157 | print("Rewards Refined") 158 | print(mdp_refined_obj.rewards_refined) 159 | 160 | print("----------------") 161 | print("This is the Policy") 162 | policy_data = { 163 | 1: {'a': 0.4, 'b': 0.6}, 164 | 2: {'a': 0.7, 'c': 0.3}, 165 | 3: {'b': 1.0} 166 | } 167 | pol_obj = Policy(policy_data) 168 | print(pol_obj.policy_data) 169 | 170 | print("----------------") 171 | print("This is MRPRefined") 172 | mrp_refined_obj = mdp_refined_obj.get_mrp_refined(pol_obj) 173 | print("Transitions") 174 | print(mrp_refined_obj.transitions) 175 | print("Rewards Refined") 176 | print(mrp_refined_obj.rewards_refined) 177 | 178 | print("-----------------") 179 | print("This is MDP") 180 | print("Rewards") 181 | print(mdp_refined_obj.rewards) 182 | 183 | print("-----------------") 184 | print("This is MRP from MDP") 185 | mrp_obj1 = mdp_refined_obj.get_mrp(pol_obj) 186 | print("Rewards") 187 | print(mrp_obj1.rewards) 188 | 189 | print("---------------") 190 | print("This is MRP from MRPRefined") 191 | print("Rewards") 192 | print(mrp_refined_obj.rewards) 193 | 194 | 195 | -------------------------------------------------------------------------------- /src/algorithms/rl_tabular/td0.py: -------------------------------------------------------------------------------- 1 | 
from typing import Optional 2 | from algorithms.td_algo_enum import TDAlgorithm 3 | from algorithms.rl_tabular.rl_tabular_base import RLTabularBase 4 | from processes.policy import Policy 5 | from processes.mp_funcs import get_rv_gen_func_single 6 | from processes.mdp_rep_for_rl_tabular import MDPRepForRLTabular 7 | from processes.mp_funcs import get_expected_action_value 8 | from utils.standard_typevars import VFDictType, QFDictType 9 | 10 | 11 | class TD0(RLTabularBase): 12 | 13 | def __init__( 14 | self, 15 | mdp_rep_for_rl: MDPRepForRLTabular, 16 | exploring_start: bool, 17 | algorithm: TDAlgorithm, 18 | softmax: bool, 19 | epsilon: float, 20 | epsilon_half_life: float, 21 | learning_rate: float, 22 | learning_rate_decay: float, 23 | num_episodes: int, 24 | max_steps: int 25 | ) -> None: 26 | 27 | super().__init__( 28 | mdp_rep_for_rl=mdp_rep_for_rl, 29 | exploring_start=exploring_start, 30 | softmax=softmax, 31 | epsilon=epsilon, 32 | epsilon_half_life=epsilon_half_life, 33 | num_episodes=num_episodes, 34 | max_steps=max_steps 35 | ) 36 | self.algorithm: TDAlgorithm = algorithm 37 | self.learning_rate: float = learning_rate 38 | self.learning_rate_decay: Optional[float] = learning_rate_decay 39 | 40 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 41 | sa_dict = self.mdp_rep.state_action_dict 42 | vf_dict = {s: 0.0 for s in sa_dict.keys()} 43 | act_gen_dict = {s: get_rv_gen_func_single(pol.get_state_probabilities(s)) 44 | for s in sa_dict.keys()} 45 | episodes = 0 46 | updates = 0 47 | 48 | while episodes < self.num_episodes: 49 | state = self.mdp_rep.init_state_gen() 50 | steps = 0 51 | terminate = False 52 | 53 | while not terminate: 54 | action = act_gen_dict[state]() 55 | next_state, reward = \ 56 | self.mdp_rep.state_reward_gen_dict[state][action]() 57 | vf_dict[state] += self.learning_rate *\ 58 | (updates / self.learning_rate_decay + 1) ** -0.5 *\ 59 | (reward + self.mdp_rep.gamma * vf_dict[next_state] - 60 | vf_dict[state]) 61 | updates += 1 62 | steps += 1 63 | terminate = steps >= self.max_steps or \ 64 | state in self.mdp_rep.terminal_states 65 | state = next_state 66 | 67 | episodes += 1 68 | 69 | return vf_dict 70 | 71 | def get_qv_func_dict(self, pol: Optional[Policy]) -> QFDictType: 72 | control = pol is None 73 | this_pol = pol if pol is not None else self.get_init_policy() 74 | sa_dict = self.mdp_rep.state_action_dict 75 | qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()} 76 | episodes = 0 77 | updates = 0 78 | 79 | while episodes < self.num_episodes: 80 | if self.exploring_start: 81 | state, action = self.mdp_rep.init_state_action_gen() 82 | else: 83 | state = self.mdp_rep.init_state_gen() 84 | action = get_rv_gen_func_single( 85 | this_pol.get_state_probabilities(state) 86 | )() 87 | steps = 0 88 | terminate = False 89 | 90 | while not terminate: 91 | next_state, reward = \ 92 | self.mdp_rep.state_reward_gen_dict[state][action]() 93 | next_action = get_rv_gen_func_single( 94 | this_pol.get_state_probabilities(next_state) 95 | )() 96 | if self.algorithm == TDAlgorithm.QLearning and control: 97 | next_qv = max(qf_dict[next_state][a] for a in 98 | qf_dict[next_state]) 99 | elif self.algorithm == TDAlgorithm.ExpectedSARSA and control: 100 | # next_qv = sum(this_pol.get_state_action_probability( 101 | # next_state, 102 | # a 103 | # ) * qf_dict[next_state][a] for a in qf_dict[next_state]) 104 | next_qv = get_expected_action_value( 105 | qf_dict[next_state], 106 | self.softmax, 107 | self.epsilon_func(episodes) 108 | ) 109 | else: 110 | next_qv = 
qf_dict[next_state][next_action] 111 | 112 | qf_dict[state][action] += self.learning_rate *\ 113 | (updates / self.learning_rate_decay + 1) ** -0.5 *\ 114 | (reward + self.mdp_rep.gamma * next_qv - 115 | qf_dict[state][action]) 116 | updates += 1 117 | if control: 118 | if self.softmax: 119 | this_pol.edit_state_action_to_softmax( 120 | state, 121 | qf_dict[state] 122 | ) 123 | else: 124 | this_pol.edit_state_action_to_epsilon_greedy( 125 | state, 126 | qf_dict[state], 127 | self.epsilon_func(episodes) 128 | ) 129 | steps += 1 130 | terminate = steps >= self.max_steps or \ 131 | state in self.mdp_rep.terminal_states 132 | state = next_state 133 | action = next_action 134 | 135 | episodes += 1 136 | 137 | return qf_dict 138 | 139 | 140 | if __name__ == '__main__': 141 | from processes.mdp_refined import MDPRefined 142 | mdp_refined_data = { 143 | 1: { 144 | 'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)}, 145 | 'b': {2: (0.3, -0.5), 3: (0.7, 2.6)}, 146 | 'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)} 147 | }, 148 | 2: { 149 | 'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)}, 150 | 'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)} 151 | }, 152 | 3: { 153 | 'a': {3: (1.0, 0.0)}, 154 | 'b': {3: (1.0, 0.0)} 155 | } 156 | } 157 | gamma_val = 1.0 158 | mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val) 159 | mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular() 160 | 161 | exploring_start_val = False 162 | algorithm_type = TDAlgorithm.ExpectedSARSA 163 | softmax_flag = False 164 | epsilon_val = 0.1 165 | epsilon_half_life_val = 1000 166 | learning_rate_val = 0.1 167 | learning_rate_decay_val = 1e6 168 | episodes_limit = 10000 169 | max_steps_val = 1000 170 | sarsa_obj = TD0( 171 | mdp_rep_obj, 172 | exploring_start_val, 173 | algorithm_type, 174 | softmax_flag, 175 | epsilon_val, 176 | epsilon_half_life_val, 177 | learning_rate_val, 178 | learning_rate_decay_val, 179 | episodes_limit, 180 | max_steps_val 181 | ) 182 | 183 | policy_data = { 184 | 1: {'a': 0.4, 'b': 0.6}, 185 | 2: {'a': 0.7, 'c': 0.3}, 186 | 3: {'b': 1.0} 187 | } 188 | pol_obj = Policy(policy_data) 189 | 190 | this_qf_dict = sarsa_obj.get_act_value_func_dict(pol_obj) 191 | print(this_qf_dict) 192 | this_vf_dict = sarsa_obj.get_value_func_dict(pol_obj) 193 | print(this_vf_dict) 194 | 195 | opt_pol = sarsa_obj.get_optimal_det_policy() 196 | print(opt_pol) 197 | opt_vf_dict = sarsa_obj.get_value_func_dict(opt_pol) 198 | print(opt_vf_dict) 199 | -------------------------------------------------------------------------------- /src/algorithms/rl_tabular/monte_carlo.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple, Sequence 2 | from algorithms.rl_tabular.rl_tabular_base import RLTabularBase 3 | from processes.policy import Policy 4 | from processes.mp_funcs import get_rv_gen_func_single 5 | from processes.mdp_rep_for_rl_tabular import MDPRepForRLTabular 6 | from algorithms.helper_funcs import get_returns_from_rewards_terminating 7 | from algorithms.helper_funcs import get_returns_from_rewards_non_terminating 8 | from algorithms.helper_funcs import get_soft_policy_from_qf_dict 9 | from algorithms.helper_funcs import get_nt_return_eval_steps 10 | import numpy as np 11 | from utils.generic_typevars import S, A 12 | from utils.standard_typevars import VFDictType, QFDictType 13 | 14 | 15 | class MonteCarlo(RLTabularBase): 16 | 17 | def __init__( 18 | self, 19 | mdp_rep_for_rl: MDPRepForRLTabular, 20 | exploring_start: bool, 21 | first_visit: bool, 22 | 
softmax: bool, 23 | epsilon: float, 24 | epsilon_half_life: float, 25 | num_episodes: int, 26 | max_steps: int 27 | ) -> None: 28 | 29 | super().__init__( 30 | mdp_rep_for_rl=mdp_rep_for_rl, 31 | exploring_start=exploring_start, 32 | softmax=softmax, 33 | epsilon=epsilon, 34 | epsilon_half_life=epsilon_half_life, 35 | num_episodes=num_episodes, 36 | max_steps=max_steps 37 | ) 38 | self.first_visit: bool = first_visit 39 | self.nt_return_eval_steps = get_nt_return_eval_steps( 40 | max_steps, 41 | mdp_rep_for_rl.gamma, 42 | 1e-4 43 | ) 44 | 45 | def get_mc_path( 46 | self, 47 | pol: Policy, 48 | start_state: S, 49 | start_action: Optional[A] = None, 50 | ) -> Sequence[Tuple[S, A, float, bool]]: 51 | 52 | res = [] 53 | state = start_state 54 | steps = 0 55 | terminate = False 56 | occ_states = set() 57 | act_gen_dict = {s: get_rv_gen_func_single(pol.get_state_probabilities(s)) 58 | for s in self.mdp_rep.state_action_dict.keys()} 59 | 60 | while not terminate: 61 | first = state not in occ_states 62 | occ_states.add(state) 63 | action = act_gen_dict[state]()\ 64 | if (steps > 0 or start_action is None) else start_action 65 | next_state, reward =\ 66 | self.mdp_rep.state_reward_gen_dict[state][action]() 67 | res.append((state, action, reward, first)) 68 | steps += 1 69 | terminate = steps >= self.max_steps or\ 70 | state in self.mdp_rep.terminal_states 71 | state = next_state 72 | return res 73 | 74 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 75 | sa_dict = self.mdp_rep.state_action_dict 76 | counts_dict = {s: 0 for s in sa_dict.keys()} 77 | vf_dict = {s: 0.0 for s in sa_dict.keys()} 78 | episodes = 0 79 | 80 | while episodes < self.num_episodes: 81 | start_state = self.mdp_rep.init_state_gen() 82 | mc_path = self.get_mc_path( 83 | pol, 84 | start_state, 85 | start_action=None 86 | ) 87 | 88 | rew_arr = np.array([x for _, _, x, _ in mc_path]) 89 | if mc_path[-1][0] in self.mdp_rep.terminal_states: 90 | returns = get_returns_from_rewards_terminating( 91 | rew_arr, 92 | self.mdp_rep.gamma 93 | ) 94 | else: 95 | returns = get_returns_from_rewards_non_terminating( 96 | rew_arr, 97 | self.mdp_rep.gamma, 98 | self.nt_return_eval_steps 99 | ) 100 | for i, r in enumerate(returns): 101 | s, _, _, f = mc_path[i] 102 | if not self.first_visit or f: 103 | counts_dict[s] += 1 104 | c = counts_dict[s] 105 | vf_dict[s] = (vf_dict[s] * (c - 1) + r) / c 106 | episodes += 1 107 | 108 | return vf_dict 109 | 110 | def get_qv_func_dict(self, pol: Optional[Policy]) -> QFDictType: 111 | control = pol is None 112 | this_pol = pol if pol is not None else self.get_init_policy() 113 | sa_dict = self.mdp_rep.state_action_dict 114 | counts_dict = {s: {a: 0 for a in v} for s, v in sa_dict.items()} 115 | qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()} 116 | episodes = 0 117 | 118 | while episodes < self.num_episodes: 119 | if self.exploring_start: 120 | start_state, start_action = self.mdp_rep.init_state_action_gen() 121 | else: 122 | start_state = self.mdp_rep.init_state_gen() 123 | start_action = None 124 | mc_path = self.get_mc_path( 125 | this_pol, 126 | start_state, 127 | start_action 128 | ) 129 | rew_arr = np.array([x for _, _, x, _ in mc_path]) 130 | if mc_path[-1][0] in self.mdp_rep.terminal_states: 131 | returns = get_returns_from_rewards_terminating( 132 | rew_arr, 133 | self.mdp_rep.gamma 134 | ) 135 | else: 136 | returns = get_returns_from_rewards_non_terminating( 137 | rew_arr, 138 | self.mdp_rep.gamma, 139 | self.nt_return_eval_steps 140 | ) 141 | for i, r in enumerate(returns): 
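                # Incremental running-average update of Q(s, a) with return r
                # (restricted to first visits when first_visit is set)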
142 | s, a, _, f = mc_path[i] 143 | if not self.first_visit or f: 144 | counts_dict[s][a] += 1 145 | c = counts_dict[s][a] 146 | qf_dict[s][a] = (qf_dict[s][a] * (c - 1) + r) / c 147 | if control: 148 | this_pol = get_soft_policy_from_qf_dict( 149 | qf_dict, 150 | self.softmax, 151 | self.epsilon_func(episodes) 152 | ) 153 | episodes += 1 154 | 155 | return qf_dict 156 | 157 | 158 | if __name__ == '__main__': 159 | from processes.mdp_refined import MDPRefined 160 | mdp_refined_data = { 161 | 1: { 162 | 'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)}, 163 | 'b': {2: (0.3, -0.5), 3: (0.7, 2.6)}, 164 | 'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)} 165 | }, 166 | 2: { 167 | 'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)}, 168 | 'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)} 169 | }, 170 | 3: { 171 | 'a': {3: (1.0, 0.0)}, 172 | 'b': {3: (1.0, 0.0)} 173 | } 174 | } 175 | gamma_val = 1.0 176 | mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val) 177 | mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular() 178 | 179 | exploring_start_val = False 180 | first_visit_flag = True 181 | softmax_flag = False 182 | episodes_limit = 1000 183 | epsilon_val = 0.1 184 | epsilon_half_life_val = 100 185 | max_steps_val = 1000 186 | mc_obj = MonteCarlo( 187 | mdp_rep_obj, 188 | exploring_start_val, 189 | first_visit_flag, 190 | softmax_flag, 191 | epsilon_val, 192 | epsilon_half_life_val, 193 | episodes_limit, 194 | max_steps_val 195 | ) 196 | 197 | policy_data = { 198 | 1: {'a': 0.4, 'b': 0.6}, 199 | 2: {'a': 0.7, 'c': 0.3}, 200 | 3: {'b': 1.0} 201 | } 202 | pol_obj = Policy(policy_data) 203 | 204 | this_mc_path = mc_obj.get_mc_path(pol_obj, 1) 205 | print(this_mc_path) 206 | 207 | this_qf_dict = mc_obj.get_act_value_func_dict(pol_obj) 208 | print(this_qf_dict) 209 | this_vf_dict = mc_obj.get_value_func_dict(pol_obj) 210 | print(this_vf_dict) 211 | 212 | opt_pol = mc_obj.get_optimal_det_policy() 213 | print(opt_pol) 214 | opt_vf_dict = mc_obj.get_value_func_dict(opt_pol) 215 | print(opt_vf_dict) 216 | -------------------------------------------------------------------------------- /src/algorithms/rl_func_approx/td0.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Optional 2 | from algorithms.td_algo_enum import TDAlgorithm 3 | from algorithms.rl_func_approx.rl_func_approx_base import RLFuncApproxBase 4 | from algorithms.func_approx_spec import FuncApproxSpec 5 | from processes.mdp_rep_for_rl_fa import MDPRepForRLFA 6 | from processes.mp_funcs import get_rv_gen_func_single 7 | from algorithms.helper_funcs import get_soft_policy_func_from_qf 8 | from processes.mp_funcs import get_expected_action_value 9 | from utils.generic_typevars import S, A 10 | from utils.standard_typevars import VFType, QFType, PolicyActDictType 11 | 12 | 13 | class TD0(RLFuncApproxBase): 14 | 15 | def __init__( 16 | self, 17 | mdp_rep_for_rl: MDPRepForRLFA, 18 | exploring_start: bool, 19 | algorithm: TDAlgorithm, 20 | softmax: bool, 21 | epsilon: float, 22 | epsilon_half_life: float, 23 | num_episodes: int, 24 | max_steps: int, 25 | fa_spec: FuncApproxSpec 26 | ) -> None: 27 | 28 | super().__init__( 29 | mdp_rep_for_rl=mdp_rep_for_rl, 30 | exploring_start=exploring_start, 31 | softmax=softmax, 32 | epsilon=epsilon, 33 | epsilon_half_life=epsilon_half_life, 34 | num_episodes=num_episodes, 35 | max_steps=max_steps, 36 | fa_spec=fa_spec 37 | ) 38 | self.algorithm: TDAlgorithm = algorithm 39 | 40 | def get_value_func_fa(self, polf: PolicyActDictType) -> 
VFType: 41 | episodes = 0 42 | 43 | while episodes < self.num_episodes: 44 | state = self.mdp_rep.init_state_gen() 45 | steps = 0 46 | terminate = False 47 | 48 | while not terminate: 49 | action = get_rv_gen_func_single(polf(state))() 50 | next_state, reward = \ 51 | self.mdp_rep.state_reward_gen_func(state, action) 52 | target = reward + self.mdp_rep.gamma *\ 53 | self.vf_fa.get_func_eval(next_state) 54 | self.vf_fa.update_params([state], [target]) 55 | steps += 1 56 | terminate = steps >= self.max_steps or \ 57 | self.mdp_rep.terminal_state_func(state) 58 | state = next_state 59 | 60 | episodes += 1 61 | 62 | return self.vf_fa.get_func_eval 63 | 64 | # noinspection PyShadowingNames 65 | def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType: 66 | control = polf is None 67 | this_polf = polf if polf is not None else self.get_init_policy_func() 68 | episodes = 0 69 | 70 | while episodes < self.num_episodes: 71 | if self.exploring_start: 72 | state, action = self.mdp_rep.init_state_action_gen() 73 | else: 74 | state = self.mdp_rep.init_state_gen() 75 | action = get_rv_gen_func_single(this_polf(state))() 76 | 77 | # print((episodes, max(self.qvf_fa.get_func_eval((state, a)) for a in 78 | # self.mdp_rep.state_action_func(state)))) 79 | # print(self.qvf_fa.params) 80 | 81 | steps = 0 82 | terminate = False 83 | 84 | while not terminate: 85 | next_state, reward = \ 86 | self.mdp_rep.state_reward_gen_func(state, action) 87 | next_action = get_rv_gen_func_single(this_polf(next_state))() 88 | if self.algorithm == TDAlgorithm.QLearning and control: 89 | next_qv = max(self.qvf_fa.get_func_eval((next_state, a)) for a in 90 | self.state_action_func(next_state)) 91 | elif self.algorithm == TDAlgorithm.ExpectedSARSA and control: 92 | # next_qv = sum(this_polf(next_state).get(a, 0.) 
* 93 | # self.qvf_fa.get_func_eval((next_state, a)) 94 | # for a in self.state_action_func(next_state)) 95 | next_qv = get_expected_action_value( 96 | {a: self.qvf_fa.get_func_eval((next_state, a)) for a in 97 | self.state_action_func(next_state)}, 98 | self.softmax, 99 | self.epsilon_func(episodes) 100 | ) 101 | else: 102 | next_qv = self.qvf_fa.get_func_eval((next_state, next_action)) 103 | 104 | target = reward + self.mdp_rep.gamma * next_qv 105 | # TD is online update and so, policy improves at every time step 106 | self.qvf_fa.update_params([(state, action)], [target]) 107 | if control: 108 | this_polf = get_soft_policy_func_from_qf( 109 | self.qvf_fa.get_func_eval, 110 | self.state_action_func, 111 | self.softmax, 112 | self.epsilon_func(episodes) 113 | ) 114 | steps += 1 115 | terminate = steps >= self.max_steps or \ 116 | self.mdp_rep.terminal_state_func(state) 117 | state = next_state 118 | action = next_action 119 | 120 | episodes += 1 121 | 122 | return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval((st, act)) 123 | 124 | 125 | if __name__ == '__main__': 126 | from processes.mdp_refined import MDPRefined 127 | mdp_refined_data = { 128 | 1: { 129 | 'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)}, 130 | 'b': {2: (0.3, -0.5), 3: (0.7, 2.6)}, 131 | 'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)} 132 | }, 133 | 2: { 134 | 'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)}, 135 | 'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)} 136 | }, 137 | 3: { 138 | 'a': {3: (1.0, 0.0)}, 139 | 'b': {3: (1.0, 0.0)} 140 | } 141 | } 142 | gamma_val = 1.0 143 | mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val) 144 | mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular() 145 | 146 | exploring_start_val = False 147 | algorithm_type = TDAlgorithm.ExpectedSARSA 148 | softmax_flag = False 149 | epsilon_val = 0.1 150 | epsilon_half_life_val = 1000 151 | learning_rate_val = 0.1 152 | episodes_limit = 10000 153 | max_steps_val = 1000 154 | state_ff = [lambda s: float(s)] 155 | sa_ff = [ 156 | lambda x: float(x[0]), 157 | lambda x: 1. if x[1] == 'a' else 0., 158 | lambda x: 1. if x[1] == 'b' else 0., 159 | lambda x: 1. 
if x[1] == 'c' else 0., 160 | ] 161 | fa_spec_val = FuncApproxSpec( 162 | state_feature_funcs=state_ff, 163 | sa_feature_funcs=sa_ff, 164 | dnn_spec=None, 165 | learning_rate=learning_rate_val 166 | ) 167 | sarsa_obj = TD0( 168 | mdp_rep_obj, 169 | exploring_start_val, 170 | algorithm_type, 171 | softmax_flag, 172 | epsilon_val, 173 | epsilon_half_life_val, 174 | episodes_limit, 175 | max_steps_val, 176 | fa_spec_val 177 | ) 178 | 179 | def policy_func(i: int) -> Mapping[str, float]: 180 | if i == 1: 181 | ret = {'a': 0.4, 'b': 0.6} 182 | elif i == 2: 183 | ret = {'a': 0.7, 'c': 0.3} 184 | elif i == 3: 185 | ret = {'b': 1.0} 186 | else: 187 | raise ValueError 188 | return ret 189 | 190 | this_qf = sarsa_obj.get_qv_func_fa(policy_func) 191 | this_vf = sarsa_obj.get_value_func_fa(policy_func) 192 | print(this_vf(1)) 193 | print(this_vf(2)) 194 | print(this_vf(3)) 195 | 196 | opt_det_polf = sarsa_obj.get_optimal_det_policy_func() 197 | 198 | # noinspection PyShadowingNames 199 | def opt_polf(s: S, opt_det_polf=opt_det_polf) -> Mapping[A, float]: 200 | return {opt_det_polf(s): 1.0} 201 | 202 | opt_vf = sarsa_obj.get_value_func_fa(opt_polf) 203 | print(opt_polf(1)) 204 | print(opt_polf(2)) 205 | print(opt_polf(3)) 206 | print(opt_vf(1)) 207 | print(opt_vf(2)) 208 | print(opt_vf(3)) 209 | -------------------------------------------------------------------------------- /src/algorithms/rl_tabular/tdlambda.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from algorithms.td_algo_enum import TDAlgorithm 3 | from algorithms.rl_tabular.rl_tabular_base import RLTabularBase 4 | from processes.policy import Policy 5 | from processes.mp_funcs import get_rv_gen_func_single 6 | from processes.mdp_rep_for_rl_tabular import MDPRepForRLTabular 7 | from processes.mp_funcs import get_expected_action_value 8 | from utils.standard_typevars import VFDictType, QFDictType 9 | 10 | 11 | class TDLambda(RLTabularBase): 12 | 13 | def __init__( 14 | self, 15 | mdp_rep_for_rl: MDPRepForRLTabular, 16 | exploring_start: bool, 17 | algorithm: TDAlgorithm, 18 | softmax: bool, 19 | epsilon: float, 20 | epsilon_half_life: float, 21 | learning_rate: float, 22 | learning_rate_decay: float, 23 | lambd: float, 24 | num_episodes: int, 25 | max_steps: int 26 | ) -> None: 27 | 28 | super().__init__( 29 | mdp_rep_for_rl=mdp_rep_for_rl, 30 | exploring_start=exploring_start, 31 | softmax=softmax, 32 | epsilon=epsilon, 33 | epsilon_half_life=epsilon_half_life, 34 | num_episodes=num_episodes, 35 | max_steps=max_steps 36 | ) 37 | self.algorithm: TDAlgorithm = algorithm 38 | self.learning_rate: float = learning_rate 39 | self.learning_rate_decay: float = learning_rate_decay 40 | self.gamma_lambda = self.mdp_rep.gamma * lambd 41 | 42 | def get_value_func_dict(self, pol: Policy) -> VFDictType: 43 | sa_dict = self.mdp_rep.state_action_dict 44 | vf_dict = {s: 0. for s in sa_dict.keys()} 45 | act_gen_dict = {s: get_rv_gen_func_single(pol.get_state_probabilities(s)) 46 | for s in sa_dict.keys()} 47 | episodes = 0 48 | updates = 0 49 | 50 | while episodes < self.num_episodes: 51 | et_dict = {s: 0. 
for s in sa_dict.keys()} 52 | state = self.mdp_rep.init_state_gen() 53 | steps = 0 54 | terminate = False 55 | 56 | while not terminate: 57 | action = act_gen_dict[state]() 58 | next_state, reward =\ 59 | self.mdp_rep.state_reward_gen_dict[state][action]() 60 | delta = reward + self.mdp_rep.gamma * vf_dict[next_state] -\ 61 | vf_dict[state] 62 | et_dict[state] += 1 63 | alpha = self.learning_rate * (updates / self.learning_rate_decay 64 | + 1) ** -0.5 65 | for s in sa_dict.keys(): 66 | vf_dict[s] += alpha * delta * et_dict[s] 67 | et_dict[s] *= self.gamma_lambda 68 | updates += 1 69 | steps += 1 70 | terminate = steps >= self.max_steps or\ 71 | state in self.mdp_rep.terminal_states 72 | state = next_state 73 | 74 | episodes += 1 75 | 76 | return vf_dict 77 | 78 | def get_qv_func_dict(self, pol: Optional[Policy]) -> QFDictType: 79 | control = pol is None 80 | this_pol = pol if pol is not None else self.get_init_policy() 81 | sa_dict = self.mdp_rep.state_action_dict 82 | qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()} 83 | episodes = 0 84 | updates = 0 85 | 86 | while episodes < self.num_episodes: 87 | et_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()} 88 | if self.exploring_start: 89 | state, action = self.mdp_rep.init_state_action_gen() 90 | else: 91 | state = self.mdp_rep.init_state_gen() 92 | action = get_rv_gen_func_single( 93 | this_pol.get_state_probabilities(state) 94 | )() 95 | steps = 0 96 | terminate = False 97 | 98 | while not terminate: 99 | next_state, reward = \ 100 | self.mdp_rep.state_reward_gen_dict[state][action]() 101 | next_action = get_rv_gen_func_single( 102 | this_pol.get_state_probabilities(next_state) 103 | )() 104 | if self.algorithm == TDAlgorithm.QLearning and control: 105 | next_qv = max(qf_dict[next_state][a] for a in 106 | qf_dict[next_state]) 107 | elif self.algorithm == TDAlgorithm.ExpectedSARSA and control: 108 | # next_qv = sum(this_pol.get_state_action_probability( 109 | # next_state, 110 | # a 111 | # ) * qf_dict[next_state][a] for a in qf_dict[next_state]) 112 | next_qv = get_expected_action_value( 113 | qf_dict[next_state], 114 | self.softmax, 115 | self.epsilon_func(episodes) 116 | ) 117 | else: 118 | next_qv = qf_dict[next_state][next_action] 119 | 120 | delta = reward + self.mdp_rep.gamma * next_qv -\ 121 | qf_dict[state][action] 122 | et_dict[state][action] += 1 123 | alpha = self.learning_rate * (updates / self.learning_rate_decay 124 | + 1) ** -0.5 125 | for s, a_set in sa_dict.items(): 126 | for a in a_set: 127 | qf_dict[s][a] += alpha * delta * et_dict[s][a] 128 | et_dict[s][a] *= self.gamma_lambda 129 | updates += 1 130 | if control: 131 | if self.softmax: 132 | this_pol.edit_state_action_to_softmax( 133 | state, 134 | qf_dict[state] 135 | ) 136 | else: 137 | this_pol.edit_state_action_to_epsilon_greedy( 138 | state, 139 | qf_dict[state], 140 | self.epsilon_func(episodes) 141 | ) 142 | steps += 1 143 | terminate = steps >= self.max_steps or \ 144 | state in self.mdp_rep.terminal_states 145 | state = next_state 146 | action = next_action 147 | 148 | episodes += 1 149 | 150 | return qf_dict 151 | 152 | 153 | if __name__ == '__main__': 154 | from processes.mdp_refined import MDPRefined 155 | mdp_refined_data = { 156 | 1: { 157 | 'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)}, 158 | 'b': {2: (0.3, -0.5), 3: (0.7, 2.6)}, 159 | 'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)} 160 | }, 161 | 2: { 162 | 'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)}, 163 | 'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)} 164 | 
}, 165 | 3: { 166 | 'a': {3: (1.0, 0.0)}, 167 | 'b': {3: (1.0, 0.0)} 168 | } 169 | } 170 | gamma_val = 0.9 171 | mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val) 172 | mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular() 173 | 174 | exploring_start_val = False 175 | algorithm_type = TDAlgorithm.ExpectedSARSA 176 | softmax_flag = True 177 | epsilon_val = 0.1 178 | epsilon_half_life_val = 100 179 | learning_rate_val = 0.1 180 | learning_rate_decay_val = 1e6 181 | lambda_val = 0.2 182 | episodes_limit = 1000 183 | max_steps_val = 1000 184 | esl_obj = TDLambda( 185 | mdp_rep_obj, 186 | exploring_start_val, 187 | algorithm_type, 188 | softmax_flag, 189 | epsilon_val, 190 | epsilon_half_life_val, 191 | learning_rate_val, 192 | learning_rate_decay_val, 193 | lambda_val, 194 | episodes_limit, 195 | max_steps_val 196 | ) 197 | 198 | policy_data = { 199 | 1: {'a': 0.4, 'b': 0.6}, 200 | 2: {'a': 0.7, 'c': 0.3}, 201 | 3: {'b': 1.0} 202 | } 203 | pol_obj = Policy(policy_data) 204 | 205 | this_qf_dict = esl_obj.get_act_value_func_dict(pol_obj) 206 | print(this_qf_dict) 207 | this_vf_dict = esl_obj.get_value_func_dict(pol_obj) 208 | print(this_vf_dict) 209 | 210 | opt_pol = esl_obj.get_optimal_det_policy() 211 | print(opt_pol) 212 | opt_vf_dict = esl_obj.get_value_func_dict(opt_pol) 213 | print(opt_vf_dict) 214 | -------------------------------------------------------------------------------- /src/examples/inv_control.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, NamedTuple, Set, Mapping, Sequence 2 | from itertools import chain, product, groupby 3 | import numpy as np 4 | from numpy.core.multiarray import ndarray 5 | from scipy.stats import poisson 6 | from processes.mdp_refined import MDPRefined 7 | from func_approx.dnn_spec import DNNSpec 8 | from func_approx.func_approx_base import FuncApproxBase 9 | from algorithms.func_approx_spec import FuncApproxSpec 10 | from copy import deepcopy 11 | from operator import itemgetter 12 | from processes.det_policy import DetPolicy 13 | from examples.run_all_algorithms import RunAllAlgorithms 14 | 15 | StateType = Tuple[int, ...] 16 | 17 | 18 | class InvControl(NamedTuple): 19 | demand_lambda: float 20 | lead_time: int 21 | stockout_cost: float 22 | fixed_order_cost: float 23 | epoch_disc_factor: float 24 | order_limit: int 25 | space_limit: int 26 | throwout_cost: float 27 | stockout_limit: int 28 | stockout_limit_excess_cost: float 29 | 30 | def validate_spec(self) -> bool: 31 | b1 = self.demand_lambda > 0. 32 | b2 = self.lead_time >= 0 33 | b3 = self.stockout_cost > 1. 34 | b4 = self.fixed_order_cost >= 0. 35 | b5 = 0. <= self.epoch_disc_factor <= 1. 36 | b6 = self.order_limit > 0 37 | b7 = self.space_limit > 0 38 | b8 = self.throwout_cost > 1. 39 | b9 = self.stockout_limit > 0. 40 | b10 = self.stockout_limit_excess_cost > 0. 
41 | return all([b1, b2, b3, b4, b5, b6, b7, b8, b9, b10]) 42 | 43 | def get_all_states(self) -> Set[StateType]: 44 | on_hand_range = range(-self.stockout_limit, self.space_limit + 1) 45 | on_order_range = range(self.order_limit + 1) 46 | return set(product( 47 | *chain([on_hand_range], [on_order_range] * self.lead_time) 48 | )) 49 | 50 | # Order of operations in an epoch are: 51 | # 1) Order Placement (Action) 52 | # 2) Receipt 53 | # 3) Throwout Space-Limited-Excess Inventory 54 | # 4) Demand 55 | # 5) Adjust (Negative) Inventory to not fall below stockout limit 56 | 57 | # In the following func, the input "state" is represented by 58 | # the on-hand and on-order right before an order is placed (the very 59 | # first event in the epoch) and the "state"s in the output are represented 60 | # by the on-hand and on-order just before the next order is placed (in the 61 | # next epoch). Both the input and output "state"s are arrays of length (L+1). 62 | 63 | def get_next_states_probs_rewards( 64 | self, 65 | state: StateType, 66 | action: int, 67 | demand_probs: Sequence[float] 68 | ) -> Mapping[StateType, Tuple[float, float]]: 69 | next_state_arr: ndarray = np.array(state) 70 | # The next line represents state change due to Action and Receipt 71 | next_state_arr = np.insert( 72 | np.zeros(len(next_state_arr) - 1), 73 | 0, 74 | next_state_arr[0] 75 | ) + np.append(next_state_arr[1:], action) 76 | excess = max(0, next_state_arr[0] - self.space_limit) 77 | cost = (self.fixed_order_cost if action > 0 else 0.) + \ 78 | excess * self.throwout_cost 79 | # The next line represents throwing out excess inventory 80 | next_state_arr[0] -= excess 81 | # The next line represents state change due to demand 82 | temp_list = [] 83 | for demand, prob in enumerate(demand_probs): 84 | ns = deepcopy(next_state_arr) 85 | ns[0] -= demand 86 | excess_stockout = max(0, -self.stockout_limit - ns[0]) 87 | this_cost = cost + excess_stockout * \ 88 | (self.stockout_cost + self.stockout_limit_excess_cost) 89 | # the next line represents adjustment of negative inventory 90 | # to not fall below stockout limit 91 | ns[0] += excess_stockout 92 | inv = ns[0] 93 | onhand = max(0., inv) 94 | stockout = max(0., -inv) 95 | this_cost += (onhand + self.stockout_cost * stockout) 96 | ns_tup = tuple(int(x) for x in ns) 97 | temp_list.append((ns_tup, prob, -this_cost)) 98 | 99 | ret = {} 100 | crit = itemgetter(0) 101 | for s, v in groupby(sorted(temp_list, key=crit), key=crit): 102 | tl = [(p, r) for _, p, r in v] 103 | sum_p = sum(p for p, _ in tl) 104 | avg_r = sum(p * r for p, r in tl) / sum_p if sum_p != 0. else 0. 
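# Merged entry for this next state: total transition probability, and the
# probability-weighted average reward across the grouped duplicate transitions.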
105 | ret[s] = (sum_p, avg_r) 106 | return ret 107 | 108 | def get_mdp_refined_dict(self) \ 109 | -> Mapping[StateType, 110 | Mapping[int, 111 | Mapping[StateType, 112 | Tuple[float, float]]]]: 113 | rv = poisson(mu=self.demand_lambda) 114 | raw_probs = [rv.pmf(i) for i in range(int(rv.ppf(0.9999)))] 115 | pp = [p / sum(raw_probs) for p in raw_probs] 116 | return {s: {a: self.get_next_states_probs_rewards(s, a, pp) 117 | for a in range(self.order_limit + 1)} 118 | for s in self.get_all_states()} 119 | 120 | def get_mdp_refined(self) -> MDPRefined: 121 | return MDPRefined(self.get_mdp_refined_dict(), self.epoch_disc_factor) 122 | 123 | def get_optimal_policy(self) -> DetPolicy: 124 | return self.get_mdp_refined().get_optimal_policy() 125 | 126 | def get_ips_orders_dict(self) -> Mapping[int, Sequence[int]]: 127 | sa_pairs = self.get_optimal_policy().get_state_to_action_map().items() 128 | 129 | def crit(x: Tuple[Tuple[int, ...], int]) -> int: 130 | return sum(x[0]) 131 | 132 | return {ip: [y for _, y in v] for ip, v in 133 | groupby(sorted(sa_pairs, key=crit), key=crit)} 134 | 135 | 136 | if __name__ == '__main__': 137 | 138 | ic = InvControl( 139 | demand_lambda=0.5, 140 | lead_time=1, 141 | stockout_cost=49., 142 | fixed_order_cost=0.0, 143 | epoch_disc_factor=0.98, 144 | order_limit=7, 145 | space_limit=8, 146 | throwout_cost=30., 147 | stockout_limit=5, 148 | stockout_limit_excess_cost=30. 149 | ) 150 | if not ic.validate_spec(): 151 | raise ValueError 152 | mdp_ref_obj = ic.get_mdp_refined() 153 | this_tolerance = 1e-3 154 | exploring_start = False 155 | this_first_visit_mc = True 156 | num_samples = 30 157 | this_softmax = True 158 | this_epsilon = 0.05 159 | this_epsilon_half_life = 30 160 | this_learning_rate = 0.1 161 | this_learning_rate_decay = 1e6 162 | this_lambd = 0.8 163 | this_num_episodes = 3000 164 | this_max_steps = 1000 165 | this_tdl_fa_offline = True 166 | state_ffs = FuncApproxBase.get_identity_feature_funcs(ic.lead_time + 1) 167 | sa_ffs = [(lambda x, f=f: f(x[0])) for f in state_ffs] + [lambda x: x[1]] 168 | this_fa_spec = FuncApproxSpec( 169 | state_feature_funcs=state_ffs, 170 | sa_feature_funcs=sa_ffs, 171 | dnn_spec=DNNSpec( 172 | neurons=[2, 4], 173 | hidden_activation=DNNSpec.relu, 174 | hidden_activation_deriv=DNNSpec.relu_deriv, 175 | output_activation=DNNSpec.identity, 176 | output_activation_deriv=DNNSpec.identity_deriv 177 | ) 178 | ) 179 | 180 | raa = RunAllAlgorithms( 181 | mdp_refined=mdp_ref_obj, 182 | tolerance=this_tolerance, 183 | exploring_start=exploring_start, 184 | first_visit_mc=this_first_visit_mc, 185 | num_samples=num_samples, 186 | softmax=this_softmax, 187 | epsilon=this_epsilon, 188 | epsilon_half_life=this_epsilon_half_life, 189 | learning_rate=this_learning_rate, 190 | learning_rate_decay=this_learning_rate_decay, 191 | lambd=this_lambd, 192 | num_episodes=this_num_episodes, 193 | max_steps=this_max_steps, 194 | tdl_fa_offline=this_tdl_fa_offline, 195 | fa_spec=this_fa_spec 196 | ) 197 | 198 | def criter(x: Tuple[Tuple[int, ...], int]) -> int: 199 | return sum(x[0]) 200 | 201 | for st, mo in raa.get_all_algorithms().items(): 202 | print("Starting %s" % st) 203 | opt_pol_func = mo.get_optimal_det_policy_func() 204 | opt_pol = {s: opt_pol_func(s) for s in mdp_ref_obj.all_states} 205 | print(sorted( 206 | [(ip, np.mean([float(y) for _, y in v])) for ip, v in 207 | groupby(sorted(opt_pol.items(), key=criter), key=criter)], 208 | key=itemgetter(0) 209 | )) 210 | -------------------------------------------------------------------------------- 
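The get_mdp_refined_dict method above truncates the Poisson demand distribution at its 99.99th percentile and renormalizes the probabilities before enumerating transitions. Below is a minimal standalone sketch of just that step, not part of the repository: demand_lambda is copied from the __main__ block above, and demand_probs is an illustrative name only.

from scipy.stats import poisson

# Truncate the Poisson pmf at the 99.99th percentile and renormalize,
# mirroring the first lines of InvControl.get_mdp_refined_dict.
demand_lambda = 0.5
rv = poisson(mu=demand_lambda)
raw_probs = [rv.pmf(i) for i in range(int(rv.ppf(0.9999)))]
demand_probs = [p / sum(raw_probs) for p in raw_probs]

# Renormalization makes the truncated pmf a proper distribution.
assert abs(sum(demand_probs) - 1.0) < 1e-9
print(demand_probs)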
/src/algorithms/mab/plot_mab_graphs.py: -------------------------------------------------------------------------------- 1 | from typing import NoReturn 2 | from operator import itemgetter 3 | from processes.mab_env import MabEnv 4 | from algorithms.mab.epsilon_greedy import EpsilonGreedy 5 | from algorithms.mab.ucb1 import UCB1 6 | from algorithms.mab.ts_gaussian import ThompsonSamplingGaussian 7 | from algorithms.mab.ts_bernoulli import ThompsonSamplingBernoulli 8 | from algorithms.mab.gradient_bandits import GradientBandits 9 | from numpy import arange 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | def plot_gaussian_algorithms() -> NoReturn: 14 | mean_vars_data = [ 15 | (0., 10.), 16 | (2., 20.), 17 | (4., 1.), 18 | (6., 8.), 19 | (8., 4.), 20 | (9., 6.), 21 | (10., 4.)] 22 | mu_star = max(mean_vars_data, key=itemgetter(0))[0] 23 | 24 | steps = 500 25 | episodes = 500 26 | 27 | eps = 0.3 28 | eps_hl = 400 29 | 30 | ci = 5 31 | mi = mu_star * 3. 32 | 33 | ts_mi = 0. 34 | ts_si = 10. 35 | 36 | lr = 0.1 37 | lr_decay = 20. 38 | 39 | me = MabEnv.get_gaussian_mab_env(mean_vars_data) 40 | 41 | greedy_opt_init = EpsilonGreedy( 42 | mab=me, 43 | time_steps=steps, 44 | num_episodes=episodes, 45 | epsilon=0., 46 | epsilon_half_life=1e8, 47 | count_init=ci, 48 | mean_init=mi 49 | ) 50 | eps_greedy = EpsilonGreedy( 51 | mab=me, 52 | time_steps=steps, 53 | num_episodes=episodes, 54 | epsilon=eps, 55 | epsilon_half_life=1e8, 56 | count_init=0, 57 | mean_init=0. 58 | ) 59 | decay_eps_greedy = EpsilonGreedy( 60 | mab=me, 61 | time_steps=steps, 62 | num_episodes=episodes, 63 | epsilon=eps, 64 | epsilon_half_life=eps_hl, 65 | count_init=0, 66 | mean_init=0. 67 | ) 68 | ts = ThompsonSamplingGaussian( 69 | mab=me, 70 | time_steps=steps, 71 | num_episodes=episodes, 72 | init_mean=ts_mi, 73 | init_stdev=ts_si 74 | ) 75 | grad_bandits = GradientBandits( 76 | mab=me, 77 | time_steps=steps, 78 | num_episodes=episodes, 79 | learning_rate=lr, 80 | learning_rate_decay=lr_decay 81 | ) 82 | 83 | plot_colors = ['r', 'b', 'g', 'k', 'y'] 84 | labels = [ 85 | 'Greedy, Optimistic Initialization', 86 | '$\epsilon$-Greedy', 87 | 'Decaying $\epsilon$-Greedy', 88 | 'Thompson Sampling', 89 | 'Gradient Bandits' 90 | ] 91 | 92 | exp_cum_regrets = [ 93 | greedy_opt_init.get_expected_cum_regret(mu_star), 94 | eps_greedy.get_expected_cum_regret(mu_star), 95 | decay_eps_greedy.get_expected_cum_regret(mu_star), 96 | ts.get_expected_cum_regret(mu_star), 97 | grad_bandits.get_expected_cum_regret(mu_star) 98 | ] 99 | 100 | x_vals = range(1, steps + 1) 101 | for i in range(len(exp_cum_regrets)): 102 | plt.plot(exp_cum_regrets[i], color=plot_colors[i], label=labels[i]) 103 | plt.xlabel("Time Steps", fontsize=20) 104 | plt.ylabel("Expected Cumulative Regret", fontsize=20) 105 | plt.title("Cumulative Regret Curves", fontsize=25) 106 | plt.xlim(xmin=x_vals[0], xmax=x_vals[-1]) 107 | plt.ylim(ymin=0.0) 108 | plt.grid(True) 109 | plt.legend(loc='upper left', fontsize=15) 110 | plt.show() 111 | 112 | exp_act_counts = [ 113 | greedy_opt_init.get_expected_action_counts(), 114 | eps_greedy.get_expected_action_counts(), 115 | decay_eps_greedy.get_expected_action_counts(), 116 | ts.get_expected_action_counts(), 117 | grad_bandits.get_expected_action_counts() 118 | ] 119 | index = arange(len(me.arms_sampling_funcs)) 120 | spacing = 0.4 121 | width = (1 - spacing) / len(exp_act_counts) 122 | 123 | for i in range(len(exp_act_counts)): 124 | plt.bar( 125 | index - (1 - spacing) / 2 + (i - 1.5) * width, 126 | exp_act_counts[i], 127 | width, 128 | 
color=plot_colors[i], 129 | label=labels[i] 130 | ) 131 | plt.xlabel("Arms", fontsize=20) 132 | plt.ylabel("Expected Counts of Arms", fontsize=20) 133 | plt.title("Arms Counts Plot", fontsize=25) 134 | plt.xticks( 135 | index - 0.3, 136 | ["$\mu$=%.1f,$\sigma$=%.1f" % (m, s) for m, s in mean_vars_data] 137 | ) 138 | plt.legend(loc='upper left', fontsize=15) 139 | plt.tight_layout() 140 | plt.show() 141 | 142 | 143 | def plot_bernoulli_algorithms() -> NoReturn: 144 | probs_data = [0.1, 0.2, 0.4, 0.5, 0.6, 0.75, 0.8, 0.85, 0.9] 145 | mu_star = max(probs_data) 146 | 147 | steps = 500 148 | episodes = 500 149 | 150 | eps = 0.3 151 | eps_hl = 400 152 | 153 | ci = 5 154 | mi = mu_star * 3. 155 | 156 | ucb_alpha = 4.0 157 | 158 | lr = 0.5 159 | lr_decay = 20. 160 | 161 | me = MabEnv.get_bernoulli_mab_env(probs_data) 162 | 163 | greedy_opt_init = EpsilonGreedy( 164 | mab=me, 165 | time_steps=steps, 166 | num_episodes=episodes, 167 | epsilon=0., 168 | epsilon_half_life=1e8, 169 | count_init=ci, 170 | mean_init=mi 171 | ) 172 | eps_greedy = EpsilonGreedy( 173 | mab=me, 174 | time_steps=steps, 175 | num_episodes=episodes, 176 | epsilon=eps, 177 | epsilon_half_life=1e8, 178 | count_init=0, 179 | mean_init=0. 180 | ) 181 | decay_eps_greedy = EpsilonGreedy( 182 | mab=me, 183 | time_steps=steps, 184 | num_episodes=episodes, 185 | epsilon=eps, 186 | epsilon_half_life=eps_hl, 187 | count_init=0, 188 | mean_init=0. 189 | ) 190 | ucb1 = UCB1( 191 | mab=me, 192 | time_steps=steps, 193 | num_episodes=episodes, 194 | bounds_range=1.0, 195 | alpha=ucb_alpha 196 | ) 197 | ts = ThompsonSamplingBernoulli( 198 | mab=me, 199 | time_steps=steps, 200 | num_episodes=episodes 201 | ) 202 | grad_bandits = GradientBandits( 203 | mab=me, 204 | time_steps=steps, 205 | num_episodes=episodes, 206 | learning_rate=lr, 207 | learning_rate_decay=lr_decay 208 | ) 209 | 210 | plot_colors = ['r', 'b', 'g', 'y', 'k', 'c'] 211 | labels = [ 212 | 'Greedy, Optimistic Initialization', 213 | '$\epsilon$-Greedy', 214 | 'Decaying $\epsilon$-Greedy', 215 | 'UCB1', 216 | 'Thompson Sampling', 217 | 'Gradient Bandits' 218 | ] 219 | 220 | exp_cum_regrets = [ 221 | greedy_opt_init.get_expected_cum_regret(mu_star), 222 | eps_greedy.get_expected_cum_regret(mu_star), 223 | decay_eps_greedy.get_expected_cum_regret(mu_star), 224 | ucb1.get_expected_cum_regret(mu_star), 225 | ts.get_expected_cum_regret(mu_star), 226 | grad_bandits.get_expected_cum_regret(mu_star) 227 | ] 228 | 229 | x_vals = range(1, steps + 1) 230 | for i in range(len(exp_cum_regrets)): 231 | plt.plot(exp_cum_regrets[i], color=plot_colors[i], label=labels[i]) 232 | plt.xlabel("Time Steps", fontsize=20) 233 | plt.ylabel("Expected Cumulative Regret", fontsize=20) 234 | plt.title("Cumulative Regret Curves", fontsize=25) 235 | plt.xlim(xmin=x_vals[0], xmax=x_vals[-1]) 236 | plt.ylim(ymin=0.0) 237 | plt.grid(True) 238 | plt.legend(loc='upper left', fontsize=15) 239 | plt.show() 240 | 241 | exp_act_counts = [ 242 | greedy_opt_init.get_expected_action_counts(), 243 | eps_greedy.get_expected_action_counts(), 244 | decay_eps_greedy.get_expected_action_counts(), 245 | ucb1.get_expected_action_counts(), 246 | ts.get_expected_action_counts(), 247 | grad_bandits.get_expected_action_counts() 248 | ] 249 | index = arange(len(me.arms_sampling_funcs)) 250 | spacing = 0.4 251 | width = (1 - spacing) / len(exp_act_counts) 252 | 253 | for i in range(len(exp_act_counts)): 254 | plt.bar( 255 | index - (1 - spacing) / 2 + (i - 1.5) * width, 256 | exp_act_counts[i], 257 | width, 258 | color=plot_colors[i], 
259 | label=labels[i] 260 | ) 261 | plt.xlabel("Arms", fontsize=20) 262 | plt.ylabel("Expected Counts of Arms", fontsize=20) 263 | plt.title("Arms Counts Plot", fontsize=25) 264 | plt.xticks( 265 | index - 0.2, 266 | ["$p$=%.2f" % p for p in probs_data] 267 | ) 268 | plt.legend(loc='upper left', fontsize=15) 269 | plt.tight_layout() 270 | plt.show() 271 | 272 | 273 | if __name__ == '__main__': 274 | # plot_gaussian_algorithms() 275 | plot_bernoulli_algorithms() 276 | -------------------------------------------------------------------------------- /src/algorithms/rl_func_approx/lspi.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Optional, Sequence, Callable, Tuple 2 | from algorithms.rl_func_approx.rl_func_approx_base import RLFuncApproxBase 3 | from algorithms.func_approx_spec import FuncApproxSpec 4 | import numpy as np 5 | from processes.mdp_rep_for_rl_fa import MDPRepForRLFA 6 | from processes.mp_funcs import get_rv_gen_func_single 7 | from algorithms.helper_funcs import get_soft_policy_func_from_qf 8 | from operator import itemgetter 9 | from utils.generic_typevars import S, A 10 | from utils.standard_typevars import VFType, QFType, PolicyActDictType 11 | 12 | 13 | class LSPI(RLFuncApproxBase): 14 | 15 | def __init__( 16 | self, 17 | mdp_rep_for_rl: MDPRepForRLFA, 18 | exploring_start: bool, 19 | softmax: bool, 20 | epsilon: float, 21 | epsilon_half_life: float, 22 | num_episodes: int, 23 | batch_size: int, 24 | max_steps: int, 25 | state_feature_funcs: Sequence[Callable[[S], float]], 26 | sa_feature_funcs: Sequence[Callable[[Tuple[S, A]], float]] 27 | ) -> None: 28 | 29 | super().__init__( 30 | mdp_rep_for_rl=mdp_rep_for_rl, 31 | exploring_start=exploring_start, 32 | softmax=softmax, 33 | epsilon=epsilon, 34 | epsilon_half_life=epsilon_half_life, 35 | num_episodes=num_episodes, 36 | max_steps=max_steps, 37 | fa_spec=FuncApproxSpec( 38 | state_feature_funcs=state_feature_funcs, 39 | sa_feature_funcs=sa_feature_funcs, 40 | dnn_spec=None, 41 | reglr_coeff=0., 42 | learning_rate=0., 43 | adam_params=(False, 0., 0.), 44 | add_unit_feature=True 45 | ) 46 | ) 47 | self.batch_size: int = batch_size 48 | 49 | def get_value_func_fa(self, polf: PolicyActDictType) -> VFType: 50 | ffs = self.vf_fa.feature_funcs 51 | features = len(ffs) 52 | a_mat = np.zeros((features, features)) 53 | b_vec = np.zeros(features) 54 | 55 | for _ in range(self.num_episodes): 56 | state = self.mdp_rep.init_state_gen() 57 | steps = 0 58 | terminate = False 59 | 60 | while not terminate: 61 | action = get_rv_gen_func_single(polf(state))() 62 | next_state, reward = \ 63 | self.mdp_rep.state_reward_gen_func(state, action) 64 | phi_s = np.array([f(state) for f in ffs]) 65 | phi_sp = np.array([f(next_state) for f in ffs]) 66 | a_mat += np.outer( 67 | phi_s, 68 | phi_s - self.mdp_rep.gamma * phi_sp 69 | ) 70 | b_vec += reward * phi_s 71 | steps += 1 72 | terminate = steps >= self.max_steps or \ 73 | self.mdp_rep.terminal_state_func(state) 74 | state = next_state 75 | 76 | self.vf_fa.params = [np.linalg.inv(a_mat).dot(b_vec)] 77 | 78 | return self.vf_fa.get_func_eval 79 | 80 | # noinspection PyShadowingNames 81 | def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType: 82 | ffs = self.qvf_fa.feature_funcs 83 | features = len(ffs) 84 | a_mat = np.zeros((features, features)) 85 | b_vec = np.zeros(features) 86 | control = polf is None 87 | this_polf = polf if polf is not None else self.get_init_policy_func() 88 | 89 | for episode in 
range(self.num_episodes): 90 | if self.exploring_start: 91 | state, action = self.mdp_rep.init_state_action_gen() 92 | else: 93 | state = self.mdp_rep.init_state_gen() 94 | action = get_rv_gen_func_single(this_polf(state))() 95 | 96 | # print((episodes, max(self.qvf_fa.get_func_eval((state, a)) for a in 97 | # self.mdp_rep.state_action_func(state)))) 98 | # print(self.qvf_fa.params) 99 | 100 | steps = 0 101 | terminate = False 102 | 103 | while not terminate: 104 | next_state, reward = \ 105 | self.mdp_rep.state_reward_gen_func(state, action) 106 | phi_s = np.array([f((state, action)) for f in ffs]) 107 | next_action = get_rv_gen_func_single(this_polf(next_state))() 108 | if control: 109 | next_act = max( 110 | [(a, self.qvf_fa.get_func_eval((next_state, a))) for a in 111 | self.state_action_func(next_state)], 112 | key=itemgetter(1) 113 | )[0] 114 | else: 115 | next_act = next_action 116 | phi_sp = np.array([f((next_state, next_act)) for f in ffs]) 117 | a_mat += np.outer( 118 | phi_s, 119 | phi_s - self.mdp_rep.gamma * phi_sp 120 | ) 121 | b_vec += reward * phi_s 122 | 123 | steps += 1 124 | terminate = steps >= self.max_steps or \ 125 | self.mdp_rep.terminal_state_func(state) 126 | state = next_state 127 | action = next_action 128 | 129 | if control and (episode + 1) % self.batch_size == 0: 130 | self.qvf_fa.params = [np.linalg.inv(a_mat).dot(b_vec)] 131 | # print(self.qvf_fa.params) 132 | this_polf = get_soft_policy_func_from_qf( 133 | self.qvf_fa.get_func_eval, 134 | self.state_action_func, 135 | self.softmax, 136 | self.epsilon_func(episode) 137 | ) 138 | a_mat = np.zeros((features, features)) 139 | b_vec = np.zeros(features) 140 | 141 | if not control: 142 | self.qvf_fa.params = [np.linalg.inv(a_mat).dot(b_vec)] 143 | 144 | return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval((st, act)) 145 | 146 | 147 | if __name__ == '__main__': 148 | from processes.mdp_refined import MDPRefined 149 | mdp_refined_data = { 150 | 1: { 151 | 'a': {1: (0.3, 9.2), 2: (0.6, 4.5), 3: (0.1, 5.0)}, 152 | 'b': {2: (0.3, -0.5), 3: (0.7, 2.6)}, 153 | 'c': {1: (0.2, 4.8), 2: (0.4, -4.9), 3: (0.4, 0.0)} 154 | }, 155 | 2: { 156 | 'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)}, 157 | 'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)} 158 | }, 159 | 3: { 160 | 'a': {3: (1.0, 0.0)}, 161 | 'b': {3: (1.0, 0.0)} 162 | } 163 | } 164 | gamma_val = 0.9 165 | mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val) 166 | mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular() 167 | 168 | exploring_start_val = False 169 | softmax_flag = False 170 | epsilon_val = 0.1 171 | epsilon_half_life_val = 10000 172 | num_episodes_val = 100000 173 | batch_size_val = 1000 174 | max_steps_val = 1000 175 | state_ff = [lambda s: float(s)] 176 | sa_ff = [ 177 | lambda x: float(x[0]), 178 | lambda x: 1. if x[1] == 'a' else 0., 179 | lambda x: 1. if x[1] == 'b' else 0., 180 | lambda x: 1. 
if x[1] == 'c' else 0., 181 | ] 182 | lspi_obj = LSPI( 183 | mdp_rep_obj, 184 | exploring_start_val, 185 | softmax_flag, 186 | epsilon_val, 187 | epsilon_half_life_val, 188 | num_episodes_val, 189 | batch_size_val, 190 | max_steps_val, 191 | state_ff, 192 | sa_ff 193 | ) 194 | 195 | def policy_func(i: int) -> Mapping[str, float]: 196 | if i == 1: 197 | ret = {'a': 0.4, 'b': 0.6} 198 | elif i == 2: 199 | ret = {'a': 0.7, 'c': 0.3} 200 | elif i == 3: 201 | ret = {'b': 1.0} 202 | else: 203 | raise ValueError 204 | return ret 205 | 206 | # this_qf = lspi_obj.get_qv_func_fa(policy_func) 207 | this_vf = lspi_obj.get_value_func_fa(policy_func) 208 | print(this_vf(1)) 209 | print(this_vf(2)) 210 | print(this_vf(3)) 211 | 212 | opt_det_polf = lspi_obj.get_optimal_det_policy_func() 213 | 214 | # noinspection PyShadowingNames 215 | def opt_polf(s: S, opt_det_polf=opt_det_polf) -> Mapping[A, float]: 216 | return {opt_det_polf(s): 1.0} 217 | 218 | opt_vf = lspi_obj.get_value_func_fa(opt_polf) 219 | print(opt_polf(1)) 220 | print(opt_polf(2)) 221 | print(opt_polf(3)) 222 | print(opt_vf(1)) 223 | print(opt_vf(2)) 224 | print(opt_vf(3)) 225 | --------------------------------------------------------------------------------
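LSPI above performs no gradient updates: each batch of sampled transitions is collapsed into the matrix a_mat (accumulating outer products of phi(s, a) with phi(s, a) - gamma * phi(s', a')) and the vector b_vec (accumulating reward * phi(s, a)), and the weights come from a single linear solve. The sketch below is illustrative only: it uses synthetic random feature vectors and rewards in place of real transitions, purely to exercise that accumulate-and-solve pattern. np.linalg.solve returns the same weights as the np.linalg.inv(a_mat).dot(b_vec) call in the class, with better numerical behavior.

import numpy as np

gamma = 0.9
num_features = 4
rng = np.random.default_rng(0)

a_mat = np.zeros((num_features, num_features))
b_vec = np.zeros(num_features)
for _ in range(1000):
    phi_s = rng.normal(size=num_features)   # stand-in for the (state, action) feature vector
    phi_sp = rng.normal(size=num_features)  # stand-in for the (next state, next action) feature vector
    reward = rng.normal()                   # stand-in for the sampled reward
    a_mat += np.outer(phi_s, phi_s - gamma * phi_sp)
    b_vec += reward * phi_s

weights = np.linalg.solve(a_mat, b_vec)  # same solution as inv(a_mat).dot(b_vec)
print(weights)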