├── README.md
└── MuesliExample.ipynb


/README.md:
--------------------------------------------------------------------------------
 1 | # MuesliJupyterExample
 2 | 
 3 | This is a simple reimplementation of Muesli algorithm for two-player board games.
 4 | 
 5 | https://arxiv.org/abs/2104.06159
 6 | 
 7 | 
 8 | MuZero version is here: https://github.com/YuriCat/MuZeroJupyterExample
 9 | 
10 | AlphaZero version is here: https://github.com/YuriCat/AlphaZeroJupyterExample
11 | 


--------------------------------------------------------------------------------
/MuesliExample.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "# environment:\n",
 10 |     "# pip3 install torch"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "# Implementation of simple game: Tic-Tac-Toe\n",
 20 |     "# You can change this to another two-player game.\n",
 21 |     "\n",
 22 |     "import numpy as np\n",
 23 |     "\n",
 24 |     "BLACK, WHITE = 1, -1  # first turn or second turn player\n",
 25 |     "\n",
 26 |     "class State:\n",
 27 |     "    '''Board implementation of Tic-Tac-Toe'''\n",
 28 |     "    X, Y = 'ABC',  '123'\n",
 29 |     "    C = {0: '_', BLACK: 'O', WHITE: 'X'}\n",
 30 |     "\n",
 31 |     "    def __init__(self):\n",
 32 |     "        self.board = np.zeros((3, 3)) # (x, y)\n",
 33 |     "        self.color = 1\n",
 34 |     "        self.win_color = 0\n",
 35 |     "        self.record = []\n",
 36 |     "\n",
 37 |     "    def action2str(self, a):\n",
 38 |     "        return self.X[a // 3] + self.Y[a % 3]\n",
 39 |     "\n",
 40 |     "    def str2action(self, s):\n",
 41 |     "        return self.X.find(s[0]) * 3 + self.Y.find(s[1])\n",
 42 |     "\n",
 43 |     "    def record_string(self):\n",
 44 |     "        return ' '.join([self.action2str(a) for a in self.record])\n",
 45 |     "\n",
 46 |     "    def __str__(self):\n",
 47 |     "        # output board.\n",
 48 |     "        s = '   ' + ' '.join(self.Y) + '\\n'\n",
 49 |     "        for i in range(3):\n",
 50 |     "            s += self.X[i] + ' ' + ' '.join([self.C[self.board[i, j]] for j in range(3)]) + '\\n'\n",
 51 |     "        s += 'record = ' + self.record_string()\n",
 52 |     "        return s\n",
 53 |     "\n",
 54 |     "    def play(self, action):\n",
 55 |     "        # state transition function\n",
 56 |     "        # action is position inerger (0~8) or string representation of action sequence\n",
 57 |     "        if isinstance(action, str):\n",
 58 |     "            for astr in action.split():\n",
 59 |     "                self.play(self.str2action(astr))\n",
 60 |     "            return self\n",
 61 |     "\n",
 62 |     "        x, y = action // 3, action % 3\n",
 63 |     "        self.board[x, y] = self.color\n",
 64 |     "\n",
 65 |     "        # check whether 3 stones are on the line\n",
 66 |     "        if self.board[x, :].sum() == 3 * self.color \\\n",
 67 |     "          or self.board[:, y].sum() == 3 * self.color \\\n",
 68 |     "          or (x == y and np.diag(self.board, k=0).sum() == 3 * self.color) \\\n",
 69 |     "          or (x == 2 - y and np.diag(self.board[::-1,:], k=0).sum() == 3 * self.color):\n",
 70 |     "            self.win_color = self.color\n",
 71 |     "\n",
 72 |     "        self.color = -self.color\n",
 73 |     "        self.record.append(action)\n",
 74 |     "        return self\n",
 75 |     "\n",
 76 |     "    def terminal(self):\n",
 77 |     "        # terminal state check\n",
 78 |     "        return self.win_color != 0 or len(self.record) == 3 * 3\n",
 79 |     "\n",
 80 |     "    def terminal_reward(self):\n",
 81 |     "        # terminal reward \n",
 82 |     "        return self.win_color\n",
 83 |     "\n",
 84 |     "    def action_length(self):\n",
 85 |     "        return 3 * 3\n",
 86 |     "    \n",
 87 |     "    def legal_actions(self):\n",
 88 |     "        # list of legal actions on each state\n",
 89 |     "        return [a for a in range(3 * 3) if self.board[a // 3, a % 3] == 0]\n",
 90 |     "\n",
 91 |     "    def feature(self):\n",
 92 |     "        # input tensor for neural net (state)\n",
 93 |     "        return np.stack([self.board == self.color, self.board == -self.color]).astype(np.float32)\n",
 94 |     "\n",
 95 |     "    def action_feature(self, action):\n",
 96 |     "        # input tensor for neural net (action)\n",
 97 |     "        a = np.zeros((1, 3, 3), dtype=np.float32)\n",
 98 |     "        a[0, action // 3, action % 3] = 1\n",
 99 |     "        return a\n",
100 |     "\n",
101 |     "state = State().play('B1')\n",
102 |     "print(state)\n",
103 |     "print('input feature')\n",
104 |     "print(state.feature())\n",
105 |     "state = State().play('B2 A1 C2')\n",
106 |     "print('input feature')\n",
107 |     "print(state.feature())"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": null,
113 |    "metadata": {},
114 |    "outputs": [],
115 |    "source": [
116 |     "# Small neural nets with PyTorch\n",
117 |     "\n",
118 |     "import torch\n",
119 |     "import torch.nn as nn\n",
120 |     "import torch.nn.functional as F\n",
121 |     "\n",
122 |     "class Conv(nn.Module):\n",
123 |     "    def __init__(self, filters0, filters1, kernel_size, bn=False):\n",
124 |     "        super().__init__()\n",
125 |     "        self.conv = nn.Conv2d(filters0, filters1, kernel_size, stride=1, padding=kernel_size//2, bias=False)\n",
126 |     "        self.bn = None\n",
127 |     "        if bn:\n",
128 |     "            self.bn = nn.BatchNorm2d(filters1)\n",
129 |     "\n",
130 |     "    def forward(self, x):\n",
131 |     "        h = self.conv(x)\n",
132 |     "        if self.bn is not None:\n",
133 |     "            h = self.bn(h)\n",
134 |     "        return h\n",
135 |     "\n",
136 |     "class ResidualBlock(nn.Module):\n",
137 |     "    def __init__(self, filters):\n",
138 |     "        super().__init__()\n",
139 |     "        self.conv = Conv(filters, filters, 3, True)\n",
140 |     "\n",
141 |     "    def forward(self, x):\n",
142 |     "        return F.relu(x + (self.conv(x)))"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": null,
148 |    "metadata": {},
149 |    "outputs": [],
150 |    "source": [
151 |     "num_filters = 16\n",
152 |     "num_blocks = 4\n",
153 |     "\n",
154 |     "class Representation(nn.Module):\n",
155 |     "    ''' Conversion from observation to inner abstract state '''\n",
156 |     "    def __init__(self, input_shape):\n",
157 |     "        super().__init__()\n",
158 |     "        self.input_shape = input_shape\n",
159 |     "        self.board_size = self.input_shape[1] * self.input_shape[2]\n",
160 |     "\n",
161 |     "        self.layer0 = Conv(self.input_shape[0], num_filters, 3, bn=True)\n",
162 |     "        self.blocks = nn.ModuleList([ResidualBlock(num_filters) for _ in range(num_blocks)])\n",
163 |     "\n",
164 |     "    def forward(self, x):\n",
165 |     "        h = F.relu(self.layer0(x))\n",
166 |     "        for block in self.blocks:\n",
167 |     "            h = block(h)\n",
168 |     "        return h\n",
169 |     "\n",
170 |     "    def inference(self, x):\n",
171 |     "        self.eval()\n",
172 |     "        with torch.no_grad():\n",
173 |     "            rp = self(torch.from_numpy(x).unsqueeze(0))\n",
174 |     "        return rp.cpu().numpy()[0]\n",
175 |     "\n",
176 |     "class Prediction(nn.Module):\n",
177 |     "    ''' Policy and value prediction from inner abstract state '''\n",
178 |     "    def __init__(self, action_shape):\n",
179 |     "        super().__init__()\n",
180 |     "        self.board_size = np.prod(action_shape[1:])\n",
181 |     "        self.action_size = action_shape[0] * self.board_size\n",
182 |     "\n",
183 |     "        self.conv_p1 = Conv(num_filters, 4, 1, bn=True)\n",
184 |     "        self.conv_p2 = Conv(4, 1, 1)\n",
185 |     "\n",
186 |     "        self.conv_v = Conv(num_filters, 4, 1, bn=True)\n",
187 |     "        self.fc_v = nn.Linear(self.board_size * 4, 1, bias=False)\n",
188 |     "\n",
189 |     "    def forward(self, rp):\n",
190 |     "        h_p = F.relu(self.conv_p1(rp))\n",
191 |     "        h_p = self.conv_p2(h_p).view(-1, self.action_size)\n",
192 |     "\n",
193 |     "        h_v = F.relu(self.conv_v(rp))\n",
194 |     "        h_v = self.fc_v(h_v.view(-1, self.board_size * 4))\n",
195 |     "\n",
196 |     "        # range of value is -1 ~ 1\n",
197 |     "        return F.softmax(h_p, dim=-1), torch.tanh(h_v)\n",
198 |     "\n",
199 |     "    def inference(self, rp):\n",
200 |     "        self.eval()\n",
201 |     "        with torch.no_grad():\n",
202 |     "            p, v = self(torch.from_numpy(rp).unsqueeze(0))\n",
203 |     "        return p.cpu().numpy()[0], v.cpu().numpy()[0][0]\n",
204 |     "\n",
205 |     "class Dynamics(nn.Module):\n",
206 |     "    '''Abstract state transition'''\n",
207 |     "    def __init__(self, rp_shape, act_shape):\n",
208 |     "        super().__init__()\n",
209 |     "        self.rp_shape = rp_shape\n",
210 |     "        self.layer0 = Conv(rp_shape[0] + act_shape[0], num_filters, 3, bn=True)\n",
211 |     "        self.blocks = nn.ModuleList([ResidualBlock(num_filters) for _ in range(num_blocks)])\n",
212 |     "\n",
213 |     "    def forward(self, rp, a):\n",
214 |     "        h = torch.cat([rp, a], dim=1)\n",
215 |     "        h = self.layer0(h)\n",
216 |     "        for block in self.blocks:\n",
217 |     "            h = block(h)\n",
218 |     "        return h\n",
219 |     "\n",
220 |     "    def inference(self, rp, a):\n",
221 |     "        self.eval()\n",
222 |     "        with torch.no_grad():\n",
223 |     "            rp = self(torch.from_numpy(rp).unsqueeze(0), torch.from_numpy(a).unsqueeze(0))\n",
224 |     "        return rp.cpu().numpy()[0]\n",
225 |     "\n",
226 |     "class Net(nn.Module):\n",
227 |     "    '''Whole net'''\n",
228 |     "    def __init__(self):\n",
229 |     "        super().__init__()\n",
230 |     "        state = State()\n",
231 |     "        input_shape = state.feature().shape\n",
232 |     "        action_shape = state.action_feature(0).shape\n",
233 |     "        rp_shape = (num_filters, *input_shape[1:])\n",
234 |     "\n",
235 |     "        self.representation = Representation(input_shape)\n",
236 |     "        self.prediction = Prediction(action_shape)\n",
237 |     "        self.dynamics = Dynamics(rp_shape, action_shape)\n",
238 |     "\n",
239 |     "    def predict(self, state0, path):\n",
240 |     "        '''Predict p and v from original state and path'''\n",
241 |     "        outputs = []\n",
242 |     "        x = state0.feature()\n",
243 |     "        rp = self.representation.inference(x)\n",
244 |     "        outputs.append(self.prediction.inference(rp))\n",
245 |     "        for action in path:\n",
246 |     "            a = state0.action_feature(action)\n",
247 |     "            rp = self.dynamics.inference(rp, a)\n",
248 |     "            outputs.append(self.prediction.inference(rp))\n",
249 |     "        return outputs"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": null,
255 |    "metadata": {},
256 |    "outputs": [],
257 |    "source": [
258 |     "def show_net(net, state):\n",
259 |     "    '''Display policy (p) and value (v)'''\n",
260 |     "    print(state)\n",
261 |     "    p, v = net.predict(state, [])[-1]\n",
262 |     "    print('p = ')\n",
263 |     "    print((p * 1000).astype(int).reshape((-1, *net.representation.input_shape[1:3])))\n",
264 |     "    print('v = ', v)\n",
265 |     "    print()\n",
266 |     "\n",
267 |     "#  Outputs before training\n",
268 |     "show_net(Net(), State())"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "code",
273 |    "execution_count": null,
274 |    "metadata": {},
275 |    "outputs": [],
276 |    "source": [
277 |     "# Training of neural net\n",
278 |     "\n",
279 |     "import torch.optim as optim\n",
280 |     "\n",
281 |     "batch_size = 32\n",
282 |     "num_steps = 100\n",
283 |     "K = 1\n",
284 |     "\n",
285 |     "def gen_target(state, ep):\n",
286 |     "    '''Generate inputs and targets for training'''\n",
287 |     "    # path, reward, observation, action, policy\n",
288 |     "    ep_length = len(ep['feature'])\n",
289 |     "    turn_idx = np.random.randint(ep_length)\n",
290 |     "    \n",
291 |     "    x = ep['feature'][turn_idx]\n",
292 |     "    ps, rs, acts, axs = [], [], [], []\n",
293 |     "    sas, seas, szs = [], [], []\n",
294 |     "    for t in range(turn_idx, turn_idx + K + 1):\n",
295 |     "        if t < ep_length:\n",
296 |     "            p = ep['policy'][t]\n",
297 |     "            a = ep['action'][t]\n",
298 |     "            ax = ep['action_feature'][t]\n",
299 |     "            sa = ep['sampled_info'][t]['a']\n",
300 |     "            sea = ep['sampled_info'][t]['exadv']\n",
301 |     "            sz = ep['sampled_info'][t]['z']\n",
302 |     "        else: # state after finishing game\n",
303 |     "            p = np.zeros_like(ep['policy'][-1])\n",
304 |     "            # random action selection\n",
305 |     "            a = np.random.randint(state.action_length())\n",
306 |     "            ax = state.action_feature(a)\n",
307 |     "            sa = np.random.randint(state.action_length(), size=len(sa))\n",
308 |     "            sea = np.ones_like(sea)\n",
309 |     "            sz = np.ones_like(sz)\n",
310 |     "        \n",
311 |     "        rs.append([ep['reward'] if t % 2 == 0 else -ep['reward']])\n",
312 |     "        acts.append([a])\n",
313 |     "        axs.append(ax)\n",
314 |     "        ps.append(p)\n",
315 |     "        sas.append(sa)\n",
316 |     "        seas.append(sea)\n",
317 |     "        szs.append(sz)\n",
318 |     "        \n",
319 |     "    return x, rs, acts, axs, ps, sas, seas, szs\n",
320 |     "\n",
321 |     "def train(episodes, net, opt):\n",
322 |     "    '''Train neural net'''\n",
323 |     "    pg_loss_sum, cmpo_loss_sum, v_loss_sum = 0, 0, 0\n",
324 |     "    net.train()\n",
325 |     "    state = State()\n",
326 |     "\n",
327 |     "    for _ in range(num_steps):\n",
328 |     "        targets = [gen_target(state, episodes[np.random.randint(len(episodes))]) for j in range(batch_size)]\n",
329 |     "        x, r, a, ax, p_prior, sa, sea, sz = zip(*targets)\n",
330 |     "        x = torch.from_numpy(np.array(x))\n",
331 |     "        r = torch.from_numpy(np.array(r))\n",
332 |     "        a = torch.from_numpy(np.array(a))\n",
333 |     "        ax = torch.from_numpy(np.array(ax))\n",
334 |     "        p_prior = torch.from_numpy(np.array(p_prior))\n",
335 |     "        sa = torch.from_numpy(np.array(sa))\n",
336 |     "        sea = torch.from_numpy(np.array(sea))\n",
337 |     "        sz = torch.from_numpy(np.array(sz))\n",
338 |     "\n",
339 |     "        # Compute losses for k (+ current) steps\n",
340 |     "        ps, vs = [], []\n",
341 |     "        rp = net.representation(x)\n",
342 |     "        for t in range(K + 1):\n",
343 |     "            p, v = net.prediction(rp)\n",
344 |     "            ps.append(p)\n",
345 |     "            vs.append(v)\n",
346 |     "            rp = net.dynamics(rp, ax[:, t])\n",
347 |     "\n",
348 |     "        cmpo_loss, v_loss = 0, 0\n",
349 |     "        for t in range(K, -1, -1):\n",
350 |     "            cmpo_loss += -torch.mean(sea[:, t] / sz[:, t] * torch.log(ps[t].gather(1, sa[:, t])), dim=1).sum()\n",
351 |     "            v_loss += torch.sum(((vs[t] - r[:, t]) ** 2) / 2)\n",
352 |     "\n",
353 |     "        p_selected = ps[0].gather(1, a[:, 0])\n",
354 |     "        p_selected_prior = p_prior[:, 0].gather(1, a[:, 0])\n",
355 |     "        clipped_rho = torch.clamp(p_selected.detach() / p_selected_prior, 0, 1)\n",
356 |     "        pg_loss = torch.sum(-clipped_rho * torch.log(p_selected) * (r[:, 0] - vs[0]))\n",
357 |     "\n",
358 |     "        pg_loss_sum  += pg_loss.item()\n",
359 |     "        cmpo_loss_sum += cmpo_loss.item() / (K + 1)\n",
360 |     "        v_loss_sum += v_loss.item() / (K + 1)\n",
361 |     "\n",
362 |     "        optimizer.zero_grad()\n",
363 |     "        (pg_loss + cmpo_loss + v_loss).backward()\n",
364 |     "        optimizer.step()\n",
365 |     "\n",
366 |     "    data_count = num_steps * batch_size\n",
367 |     "    print('pg_loss %f cmpo_loss %f v_loss %f' % (pg_loss_sum / data_count, cmpo_loss_sum / data_count, v_loss_sum / data_count))\n",
368 |     "    return net"
369 |    ]
370 |   },
371 |   {
372 |    "cell_type": "code",
373 |    "execution_count": null,
374 |    "metadata": {},
375 |    "outputs": [],
376 |    "source": [
377 |     "#  Battle against random agents\n",
378 |     "\n",
379 |     "def vs_random(net, n=100):\n",
380 |     "    results = {}\n",
381 |     "    for i in range(n):\n",
382 |     "        first_turn = i % 2 == 0\n",
383 |     "        turn = first_turn\n",
384 |     "        state = State()\n",
385 |     "        while not state.terminal():\n",
386 |     "            if turn:\n",
387 |     "                p, _ = net.predict(state, [])[-1]\n",
388 |     "                action = sorted([(a, p[a]) for a in state.legal_actions()], key=lambda x:-x[1])[0][0]\n",
389 |     "            else:\n",
390 |     "                action = np.random.choice(state.legal_actions())\n",
391 |     "            state.play(action)\n",
392 |     "            turn = not turn\n",
393 |     "        r = state.terminal_reward() if first_turn else -state.terminal_reward()\n",
394 |     "        results[r] = results.get(r, 0) + 1\n",
395 |     "    return results"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "code",
400 |    "execution_count": null,
401 |    "metadata": {
402 |     "scrolled": false
403 |    },
404 |    "outputs": [],
405 |    "source": [
406 |     "# Main algorithm of Muesli\n",
407 |     "\n",
408 |     "num_games = 5000\n",
409 |     "num_games_one_epoch = 40\n",
410 |     "num_sampled_actions = 10\n",
411 |     "simulation_depth = 1\n",
412 |     "\n",
413 |     "C = 1\n",
414 |     "\n",
415 |     "net = Net()\n",
416 |     "optimizer = optim.SGD(net.parameters(), lr=3e-4, weight_decay=3e-5, momentum=0.8)\n",
417 |     "\n",
418 |     "# Display battle results\n",
419 |     "vs_random_sum = vs_random(net)\n",
420 |     "print('vs_random   win: %d  draw: %d  lose: %d' %\n",
421 |     "         (vs_random_sum.get(1, 0), vs_random_sum.get(0, 0), vs_random_sum.get(-1, 0)))\n",
422 |     "\n",
423 |     "episodes = []\n",
424 |     "result_distribution = {1: 0, 0: 0, -1: 0}\n",
425 |     "\n",
426 |     "for g in range(num_games):\n",
427 |     "    # Generate one episode\n",
428 |     "    state = State()\n",
429 |     "\n",
430 |     "    features, policies,  selected_actions,  selected_action_features = [], [], [], []\n",
431 |     "    sampled_infos = []\n",
432 |     "    while not state.terminal():\n",
433 |     "        feature = state.feature()\n",
434 |     "        rp_root = net.representation.inference(feature)\n",
435 |     "        p_root, v_root = net.prediction.inference(rp_root)\n",
436 |     "        p_mask = np.zeros_like(p_root)\n",
437 |     "        p_mask[state.legal_actions()] = 1\n",
438 |     "        p_root *= p_mask\n",
439 |     "        p_root /= p_root.sum()\n",
440 |     "        \n",
441 |     "        features.append(feature)\n",
442 |     "        policies.append(p_root)\n",
443 |     "\n",
444 |     "        actions, exadvs = [], []\n",
445 |     "        for i in range(num_sampled_actions):\n",
446 |     "            action = np.random.choice(np.arange(len(p_root)), p=p_root)\n",
447 |     "            actions.append(action)\n",
448 |     "\n",
449 |     "            rp = rp_root\n",
450 |     "            qs = []\n",
451 |     "            for t in range(simulation_depth):\n",
452 |     "                action_feature = state.action_feature(action)\n",
453 |     "                rp = net.dynamics.inference(rp, action_feature)\n",
454 |     "                p, v = net.prediction.inference(rp)\n",
455 |     "                qs.append(-v if t % 2 == 0 else v)\n",
456 |     "                action = np.random.choice(np.arange(len(p)), p=p)\n",
457 |     "\n",
458 |     "            q = np.mean(qs)\n",
459 |     "            exadvs.append(np.exp(np.clip(q - v_root, -C, C)))\n",
460 |     "    \n",
461 |     "        exadv_sum = np.sum(exadvs)\n",
462 |     "        zs = []\n",
463 |     "        for exadv in exadvs:\n",
464 |     "            z = (1 + exadv_sum - exadv) / num_sampled_actions\n",
465 |     "            zs.append(z)\n",
466 |     "        sampled_infos.append({'a': actions, 'q': qs, 'exadv': exadvs, 'z': zs})\n",
467 |     "\n",
468 |     "        # Select action with generated distribution, and then make a transition by that action\n",
469 |     "        selected_action = np.random.choice(np.arange(len(p_root)), p=p_root)\n",
470 |     "        selected_actions.append(selected_action)\n",
471 |     "        selected_action_features.append(state.action_feature(selected_action))\n",
472 |     "        state.play(selected_action)\n",
473 |     "\n",
474 |     "    # reward seen from the first turn player\n",
475 |     "    reward = state.terminal_reward()\n",
476 |     "    result_distribution[reward] += 1\n",
477 |     "    episodes.append({\n",
478 |     "        'feature': features, 'action': selected_actions, \n",
479 |     "        'action_feature': selected_action_features, 'policy': policies,\n",
480 |     "        'reward': reward,\n",
481 |     "        'sampled_info': sampled_infos})\n",
482 |     "\n",
483 |     "    if g % num_games_one_epoch == 0:\n",
484 |     "        print('game ', end='')\n",
485 |     "    print(g, ' ', end='')\n",
486 |     "\n",
487 |     "    # Training of neural net\n",
488 |     "    if (g + 1) % num_games_one_epoch == 0:\n",
489 |     "        # Show the result distributiuon of generated episodes\n",
490 |     "        print('generated = ', sorted(result_distribution.items()))\n",
491 |     "        net = train(episodes, net, optimizer)\n",
492 |     "        vs_random_once = vs_random(net)\n",
493 |     "        print('vs_random   win: %d  draw: %d  lose: %d' %\n",
494 |     "                  (vs_random_once.get(1, 0), vs_random_once.get(0, 0), vs_random_once.get(-1, 0)))\n",
495 |     "        for r, n in vs_random_once.items():\n",
496 |     "            vs_random_sum[r] += n\n",
497 |     "        print('(total)           win: %d  draw: %d  lose: %d ' %\n",
498 |     "                  (vs_random_sum.get(1, 0), vs_random_sum.get(0, 0), vs_random_sum.get(-1, 0)))\n",
499 |     "        #show_net(net, State())\n",
500 |     "        #show_net(net, State().play('A1 C1 A2 C2'))\n",
501 |     "        #show_net(net, State().play('A1 B2 C3 B3 C1'))\n",
502 |     "        #show_net(net, State().play('B2 A2 A3 C1 B3'))\n",
503 |     "        #show_net(net, State().play('B2 A2 A3 C1'))\n",
504 |     "print('finished')"
505 |    ]
506 |   },
507 |   {
508 |    "cell_type": "code",
509 |    "execution_count": null,
510 |    "metadata": {},
511 |    "outputs": [],
512 |    "source": [
513 |     "# Show outputs from trained net\n",
514 |     "\n",
515 |     "print('initial state')\n",
516 |     "show_net(net, State())\n",
517 |     "\n",
518 |     "print('WIN by put')\n",
519 |     "show_net(net, State().play('A1 C1 A2 C2'))\n",
520 |     "\n",
521 |     "print('LOSE by opponent\\'s double')\n",
522 |     "show_net(net, State().play('B2 A2 A3 C1 B3'))\n",
523 |     "\n",
524 |     "print('WIN through double')\n",
525 |     "show_net(net, State().play('B2 A2 A3 C1'))\n",
526 |     "\n",
527 |     "# hard case: putting on A1 will cause double\n",
528 |     "print('strategic WIN by following double')\n",
529 |     "show_net(net, State().play('B1 A3'))"
530 |    ]
531 |   }
532 |  ],
533 |  "metadata": {
534 |   "kernelspec": {
535 |    "display_name": "Python 3 (ipykernel)",
536 |    "language": "python",
537 |    "name": "python3"
538 |   },
539 |   "language_info": {
540 |    "codemirror_mode": {
541 |     "name": "ipython",
542 |     "version": 3
543 |    },
544 |    "file_extension": ".py",
545 |    "mimetype": "text/x-python",
546 |    "name": "python",
547 |    "nbconvert_exporter": "python",
548 |    "pygments_lexer": "ipython3",
549 |    "version": "3.9.7"
550 |   }
551 |  },
552 |  "nbformat": 4,
553 |  "nbformat_minor": 2
554 | }
555 | 


--------------------------------------------------------------------------------