Policy and value iteration

Învățare prin Recompensă - rezolvarea proceselor de decizie Markov prin tehnici de programare dinamică (Value Iteration, Policy Iteration)

Autori:

Tudor Berariu - 2018
Alexandru Sorici - 2020

1. Scopul laboratorului

Scopul laboratorului îl reprezintă înțelegerea conceptelor de proces Markov de decizie (MDP), politică și valoare de stare, precum și implementarea unor metode de programare dinamică pentru rezolvarea problemei de control a unui MDP.

În cadrul laboratorului veți:

implementa algoritmul de iterare a politicilor
implementa algoritmul de iterare a valorilor de stare

2. Workspace setup

Câteva biblioteci de care vom avea nevoie:

Python
# Standard-library modules used by the lab code below.
import sys
import os.path
from argparse import ArgumentParser
from copy import copy
from random import choice

from typing import Dict, List, Tuple

Definirea unui labirint

Python
class Maze:
    """Grid-world MDP read from a text map stored under ``maps/``.

    Map lines containing ``:`` are parsed as ``name:value`` reward entries;
    every other line is one grid row, one character per cell.  ``WALL``
    cells block movement, ``EMPTY`` cells are ordinary states rewarded with
    the ``"default"`` entry, and any other character marks a final state
    whose reward is looked up under that character.
    """

    NORTH, EAST, SOUTH, WEST = 0, 1, 2, 3  # actions

    DYNAMICS = {  # the stochastic effects of actions
        NORTH: {(0, -1): 0.1, (-1, 0): .8, (0, 1): .1},
        EAST: {(-1, 0): 0.1, (0, 1): .8, (1, 0): .1},
        SOUTH: {(0, 1): 0.1, (1, 0): .8, (0, -1): .1},
        WEST: {(1, 0): 0.1, (0, -1): .8, (-1, 0): .1},
    }

    WALL, EMPTY = "x", " "

    # Wall glyphs are keyed by (west, north, east, south) wall-neighbour
    # flags; policy arrows are keyed by the action constant.
    VISUALS = {
        (0, 0, 1, 1): "\N{BOX DRAWINGS HEAVY DOWN AND RIGHT}",
        (1, 0, 0, 1): "\N{BOX DRAWINGS HEAVY DOWN AND LEFT}",
        (1, 0, 1, 0): "\N{BOX DRAWINGS HEAVY HORIZONTAL}",
        (0, 1, 1, 0): "\N{BOX DRAWINGS HEAVY UP AND RIGHT}",
        (1, 1, 0, 0): "\N{BOX DRAWINGS HEAVY UP AND LEFT}",
        (0, 1, 0, 1): "\N{BOX DRAWINGS HEAVY VERTICAL}",
        (1, 1, 1, 1): "\N{BOX DRAWINGS HEAVY VERTICAL AND HORIZONTAL}",
        (1, 1, 1, 0): "\N{BOX DRAWINGS HEAVY UP AND HORIZONTAL}",
        (1, 1, 0, 1): "\N{BOX DRAWINGS HEAVY VERTICAL AND LEFT}",
        (1, 0, 1, 1): "\N{BOX DRAWINGS HEAVY DOWN AND HORIZONTAL}",
        (0, 1, 1, 1): "\N{BOX DRAWINGS HEAVY VERTICAL AND RIGHT}",
        (1, 0, 0, 0): "\N{BOX DRAWINGS HEAVY LEFT}",
        (0, 1, 0, 0): "\N{BOX DRAWINGS HEAVY UP}",
        (0, 0, 1, 0): "\N{BOX DRAWINGS HEAVY RIGHT}",
        (0, 0, 0, 1): "\N{BOX DRAWINGS HEAVY DOWN}",
        (0, 0, 0, 0): "\N{BOX DRAWINGS HEAVY VERTICAL AND HORIZONTAL}",
        WEST: "\N{LEFTWARDS ARROW}",
        NORTH: "\N{UPWARDS ARROW}",
        EAST: "\N{RIGHTWARDS ARROW}",
        SOUTH: "\N{DOWNWARDS ARROW}",
    }

    def __init__(self, map_name):
        """Load the map file ``maps/<map_name>``.

        Raises whatever ``open`` raises when the file is missing, and
        ``ValueError`` when a reward line is not ``name:number``.
        """
        self._rewards, self._cells = {}, []
        with open(os.path.join("maps", map_name), "r") as map_file:
            for line in map_file:
                if ":" in line:
                    name, value = line.strip().split(":")
                    self._rewards[name] = float(value)
                else:
                    self._cells.append(list(line.strip()))
        # Every non-wall cell is a state, final cells included.
        self._states = [(i, j) for i, row in enumerate(self._cells)
                        for j, cell in enumerate(row) if cell != Maze.WALL]

    @property
    def actions(self):
        """Return a fresh list with the four action constants."""
        return [Maze.NORTH, Maze.EAST, Maze.SOUTH, Maze.WEST]

    @property
    def states(self):
        """Return a shallow copy of the list of ``(row, col)`` states."""
        return copy(self._states)

    def _cell_at(self, row, col):
        """Return the character at ``(row, col)``, or ``None`` off the grid.

        Rows may have different lengths, so the column bound is checked
        against the specific row rather than a global width.
        """
        if 0 <= row < len(self._cells) and 0 <= col < len(self._cells[row]):
            return self._cells[row][col]
        return None

    def _wall_shape(self, row, col):
        """Return the (west, north, east, south) wall-neighbour flags as ints."""
        return (
            int(self._cell_at(row, col - 1) == Maze.WALL),
            int(self._cell_at(row - 1, col) == Maze.WALL),
            int(self._cell_at(row, col + 1) == Maze.WALL),
            int(self._cell_at(row + 1, col) == Maze.WALL),
        )

    def is_final(self, state):
        """Return True when the cell at ``state`` is anything but ``EMPTY``."""
        row, col = state
        return self._cells[row][col] != Maze.EMPTY

    def effects(self, state, action):
        """Return the possible outcomes of taking ``action`` in ``state``.

        The result is a list of ``(next_state, probability, reward)``
        tuples; it is empty for final states.  A move that would enter a
        wall or leave the grid keeps the agent in place, and outcomes that
        end in the same cell are merged by adding their probabilities.
        """
        if self.is_final(state):
            return []
        row, col = state
        next_states = {}
        for (d_row, d_col), prob in Maze.DYNAMICS[action].items():
            next_row, next_col = row + d_row, col + d_col
            cell = self._cell_at(next_row, next_col)
            # Off-grid targets are treated like walls; without this check a
            # negative index would silently wrap to the opposite edge.
            if cell is None or cell == Maze.WALL:
                next_row, next_col = row, col
                cell = self._cells[row][col]
            if (next_row, next_col) in next_states:
                prev_prob, _ = next_states[(next_row, next_col)]
                prob += prev_prob
            reward = self._rewards["default" if cell == Maze.EMPTY else cell]
            next_states[(next_row, next_col)] = (prob, reward)
        return [(s, p, r) for s, (p, r) in next_states.items()]

    def print_policy(self, policy):
        """Write the maze to stdout with walls as box glyphs and arrows for ``policy``.

        ``policy`` maps every non-final ``(row, col)`` state to an action;
        final cells are printed as their own character.
        """
        for row, row_cells in enumerate(self._cells):
            glyphs = []
            for col, cell in enumerate(row_cells):
                if cell == Maze.WALL:
                    glyphs.append(Maze.VISUALS[self._wall_shape(row, col)])
                elif self.is_final((row, col)):
                    glyphs.append(cell)
                else:
                    glyphs.append(Maze.VISUALS[policy[(row, col)]])
            sys.stdout.write("".join(glyphs) + "\n")
        sys.stdout.flush()

    def print_values(self, v):
        """Print one ``|``-separated line per row with the value of each EMPTY cell.

        Walls and final cells are shown as five blanks.
        """
        for r, row_cells in enumerate(self._cells):
            print(" | ".join(["%5.2f" % v[(r, c)] if cell == Maze.EMPTY else "     "
                              for c, cell in enumerate(row_cells)]))

Cerințe

Python
# Notebook form parameters (the trailing "#@param" comments are form-field
# annotations and are ignored by the interpreter).
MAP_NAME = 'complex'  #@param ['simple', 'complex', 'be_careful', 'suffer']
gamma = 0.9  #@param {type: "slider", min: 0.0, max: 1.0, step: 0.1}
max_delta = 1e-8  #@param {type:"float"}

Cerința 1

Implementați funcția policy_iteration pentru calculul politicii optime și a tabelului de utilitate așteptată pentru fiecare stare (celulă din grid) a labirintului.

Python
def policy_iteration(game: "Maze") -> Tuple[Dict[Tuple[int, int], int], Dict[Tuple[int, int], float]]:
    """Exercise skeleton for policy iteration on ``game``.

    As provided, this only builds the starting point of the algorithm: a
    value table with 0 for every state and a uniformly random action for
    every non-final state.  The evaluation/improvement loop is the part
    left to be implemented for Requirement 1.

    Returns:
        ``(policy, v)`` where ``policy`` maps each non-final state to an
        action and ``v`` maps each state to its value.
    """
    v = {s: 0 for s in game.states}
    policy = {s: choice(game.actions)
              for s in game.states if not game.is_final(s)}
    # TODO: alternate policy evaluation and policy improvement until the
    # policy stops changing.
    return policy, v

Cerința 2

Implementați funcția value_iteration pentru calculul politicii optime și a tabelului de utilitate așteptată pentru fiecare stare (celulă din grid) a labirintului.

Python
def value_iteration(game: "Maze") -> Tuple[Dict[Tuple[int, int], int], Dict[Tuple[int, int], float]]:
    """Exercise skeleton for value iteration on ``game``.

    As provided, this only builds the starting point of the algorithm: a
    value table with 0 for every state and a uniformly random action for
    every non-final state.  The value-update loop and the extraction of the
    greedy policy are the part left to be implemented for Requirement 2.

    Returns:
        ``(policy, v)`` where ``policy`` maps each non-final state to an
        action and ``v`` maps each state to its value.
    """
    v = {s: 0 for s in game.states}
    policy = {s: choice(game.actions)
              for s in game.states if not game.is_final(s)}
    # TODO: repeat the value update until the largest change is small
    # enough, then derive the greedy policy from the final values.
    return policy, v

Evaluare

Python
#@title

# Load the selected map, then run both solvers and print the value table
# followed by the policy drawn over the maze.
game = Maze(MAP_NAME)

print("Policy iteration:")
policy, v = policy_iteration(game)
game.print_values(v)
game.print_policy(policy)

print("Value iteration:")
policy, v = value_iteration(game)
game.print_values(v)
game.print_policy(policy)

Output:

Policy iteration:
      |       |       |       |       |       |       |       |       |       |       |       |      
      |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |       |      
      |       |       |       |       |       |       |       |       |       |       |  0.00 |  0.00 |       |       |       |       |       |      
      |       |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |       |       |       |       |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |      
      |       |       |       |       |       |  0.00 |       |       |       |       |       |       |       |       |       |       |  0.00 |      
      |       |       |       |       |       |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |      
      |       |       |       |       |       |       |       |       |       |       |       |       |       |       |       |       |       |      
┏━━━━━━━━━━━┓
┃→←←↓↓→↑←↑←↑┗┓
┃A╺━━━━┳┳┳┓←↑┗━━━━┓
┃B→←→←→┣╋╋┫↑→↑↑←↓↓┃
┣┳┳┳┳┓↑┗┻┻┻━━━━━╸→┃
┣╋╋╋╋┫↓←↓↑→←←↑↓↓←→┃
┗┻┻┻┻┻━━━━━━━━━━━━┛
Value iteration:
      |       |       |       |       |       |       |       |       |       |       |       |      
      |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |       |      
      |       |       |       |       |       |       |       |       |       |       |  0.00 |  0.00 |       |       |       |       |       |      
      |       |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |       |       |       |       |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |      
      |       |       |       |       |       |  0.00 |       |       |       |       |       |       |       |       |       |       |  0.00 |      
      |       |       |       |       |       |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |  0.00 |      
      |       |       |       |       |       |       |       |       |       |       |       |       |       |       |       |       |       |      
┏━━━━━━━━━━━┓
┃↓↓↓↓↑↓→→↓←→┗┓
┃A╺━━━━┳┳┳┓↑←┗━━━━┓
┃B→↑↓←→┣╋╋┫→→↑→↓↑↑┃
┣┳┳┳┳┓↓┗┻┻┻━━━━━╸↓┃
┣╋╋╋╋┫←→↓←↓↑←↑↓↑←↑┃
┗┻┻┻┻┻━━━━━━━━━━━━┛

Policy Iteration Visual Demo

Python
import os
import sys
import pygame
from random import choice
from functools import reduce
from copy import copy

# Constants
CELL_SIZE = 160
FONT_SIZE = 18
ARROW_WIDTH = 6      # Thicker arrow line
ARROW_HEAD = 10      # Arrowhead size

VALUE_COLOR = (0, 0, 180)
REWARD_COLOR = (0, 0, 0)
POS_FINAL_COLOR = (100, 255, 100)
NEG_FINAL_COLOR = (255, 100, 100)

ARROW_COLOR = (0, 0, 0)
BG_COLOR = (0, 0, 0)
WALL_COLOR = (50, 50, 50)
FPS = 30
gamma = 0.99      # discount factor used in both Bellman backups
max_delta = 1e-3  # full evaluation stops once the largest change is below this

EQUATION_FONT_SIZE = 22
EQUATION_COLOR_BELLMAN = (255, 255, 255)  # White
EQUATION_COLOR_POLICY = (255, 255, 0)     # Yellow


class Maze:
    """Grid-world MDP read from a text map stored under ``maps/``.

    Map lines containing ``:`` are parsed as ``name:value`` reward entries;
    every other line is one grid row, one character per cell.  ``WALL``
    cells block movement, ``EMPTY`` cells are ordinary states rewarded with
    the ``"default"`` entry, and any other character marks a final state
    whose reward is looked up under that character.
    """

    NORTH, EAST, SOUTH, WEST = 0, 1, 2, 3  # actions

    DYNAMICS = {  # the stochastic effects of actions
        NORTH: {(0, -1): 0.1, (-1, 0): .8, (0, 1): .1},
        EAST: {(-1, 0): 0.1, (0, 1): .8, (1, 0): .1},
        SOUTH: {(0, 1): 0.1, (1, 0): .8, (0, -1): .1},
        WEST: {(1, 0): 0.1, (0, -1): .8, (-1, 0): .1},
    }

    WALL, EMPTY = "x", " "

    # Wall glyphs are keyed by (west, north, east, south) wall-neighbour
    # flags; policy arrows are keyed by the action constant.
    VISUALS = {
        (0, 0, 1, 1): "\N{BOX DRAWINGS HEAVY DOWN AND RIGHT}",
        (1, 0, 0, 1): "\N{BOX DRAWINGS HEAVY DOWN AND LEFT}",
        (1, 0, 1, 0): "\N{BOX DRAWINGS HEAVY HORIZONTAL}",
        (0, 1, 1, 0): "\N{BOX DRAWINGS HEAVY UP AND RIGHT}",
        (1, 1, 0, 0): "\N{BOX DRAWINGS HEAVY UP AND LEFT}",
        (0, 1, 0, 1): "\N{BOX DRAWINGS HEAVY VERTICAL}",
        (1, 1, 1, 1): "\N{BOX DRAWINGS HEAVY VERTICAL AND HORIZONTAL}",
        (1, 1, 1, 0): "\N{BOX DRAWINGS HEAVY UP AND HORIZONTAL}",
        (1, 1, 0, 1): "\N{BOX DRAWINGS HEAVY VERTICAL AND LEFT}",
        (1, 0, 1, 1): "\N{BOX DRAWINGS HEAVY DOWN AND HORIZONTAL}",
        (0, 1, 1, 1): "\N{BOX DRAWINGS HEAVY VERTICAL AND RIGHT}",
        (1, 0, 0, 0): "\N{BOX DRAWINGS HEAVY LEFT}",
        (0, 1, 0, 0): "\N{BOX DRAWINGS HEAVY UP}",
        (0, 0, 1, 0): "\N{BOX DRAWINGS HEAVY RIGHT}",
        (0, 0, 0, 1): "\N{BOX DRAWINGS HEAVY DOWN}",
        (0, 0, 0, 0): "\N{BOX DRAWINGS HEAVY VERTICAL AND HORIZONTAL}",
        WEST: "\N{LEFTWARDS ARROW}",
        NORTH: "\N{UPWARDS ARROW}",
        EAST: "\N{RIGHTWARDS ARROW}",
        SOUTH: "\N{DOWNWARDS ARROW}",
    }

    def __init__(self, map_name):
        """Load the map file ``maps/<map_name>``.

        Raises whatever ``open`` raises when the file is missing, and
        ``ValueError`` when a reward line is not ``name:number``.
        """
        self._rewards, self._cells = {}, []
        with open(os.path.join("maps", map_name), "r") as map_file:
            for line in map_file:
                if ":" in line:
                    name, value = line.strip().split(":")
                    self._rewards[name] = float(value)
                else:
                    self._cells.append(list(line.strip()))
        # Every non-wall cell is a state, final cells included.
        self._states = [(i, j) for i, row in enumerate(self._cells)
                        for j, cell in enumerate(row) if cell != Maze.WALL]

    @property
    def actions(self):
        """Return a fresh list with the four action constants."""
        return [Maze.NORTH, Maze.EAST, Maze.SOUTH, Maze.WEST]

    @property
    def states(self):
        """Return a shallow copy of the list of ``(row, col)`` states."""
        return copy(self._states)

    def _cell_at(self, row, col):
        """Return the character at ``(row, col)``, or ``None`` off the grid.

        Rows may have different lengths, so the column bound is checked
        against the specific row rather than a global width.
        """
        if 0 <= row < len(self._cells) and 0 <= col < len(self._cells[row]):
            return self._cells[row][col]
        return None

    def _wall_shape(self, row, col):
        """Return the (west, north, east, south) wall-neighbour flags as ints."""
        return (
            int(self._cell_at(row, col - 1) == Maze.WALL),
            int(self._cell_at(row - 1, col) == Maze.WALL),
            int(self._cell_at(row, col + 1) == Maze.WALL),
            int(self._cell_at(row + 1, col) == Maze.WALL),
        )

    def is_final(self, state):
        """Return True when the cell at ``state`` is anything but ``EMPTY``."""
        row, col = state
        return self._cells[row][col] != Maze.EMPTY

    def effects(self, state, action):
        """Return the possible outcomes of taking ``action`` in ``state``.

        The result is a list of ``(next_state, probability, reward)``
        tuples; it is empty for final states.  A move that would enter a
        wall or leave the grid keeps the agent in place, and outcomes that
        end in the same cell are merged by adding their probabilities.
        """
        if self.is_final(state):
            return []
        row, col = state
        next_states = {}
        for (d_row, d_col), prob in Maze.DYNAMICS[action].items():
            next_row, next_col = row + d_row, col + d_col
            cell = self._cell_at(next_row, next_col)
            # Off-grid targets are treated like walls; without this check a
            # negative index would silently wrap to the opposite edge.
            if cell is None or cell == Maze.WALL:
                next_row, next_col = row, col
                cell = self._cells[row][col]
            if (next_row, next_col) in next_states:
                prev_prob, _ = next_states[(next_row, next_col)]
                prob += prev_prob
            reward = self._rewards["default" if cell == Maze.EMPTY else cell]
            next_states[(next_row, next_col)] = (prob, reward)
        return [(s, p, r) for s, (p, r) in next_states.items()]

    def print_policy(self, policy):
        """Write the maze to stdout with walls as box glyphs and arrows for ``policy``.

        ``policy`` maps every non-final ``(row, col)`` state to an action;
        final cells are printed as their own character.
        """
        for row, row_cells in enumerate(self._cells):
            glyphs = []
            for col, cell in enumerate(row_cells):
                if cell == Maze.WALL:
                    glyphs.append(Maze.VISUALS[self._wall_shape(row, col)])
                elif self.is_final((row, col)):
                    glyphs.append(cell)
                else:
                    glyphs.append(Maze.VISUALS[policy[(row, col)]])
            sys.stdout.write("".join(glyphs) + "\n")
        sys.stdout.flush()

    def print_values(self, v):
        """Print one ``|``-separated line per row with the value of each EMPTY cell.

        Walls and final cells are shown as five blanks.
        """
        for r, row_cells in enumerate(self._cells):
            print(" | ".join(["%5.2f" % v[(r, c)] if cell == Maze.EMPTY else "     "
                              for c, cell in enumerate(row_cells)]))


class VisualPolicyIteration:
    """Interactive pygame view of policy iteration on a ``Maze``.

    Keys handled by ``run``: ``S`` performs one evaluation sweep, ``E``
    evaluates the current policy until convergence, ``U`` performs one
    greedy policy improvement, and ``F`` runs policy iteration until the
    policy is stable.
    """

    # Unit (dx, dy) screen direction of the arrow drawn for each action.
    _ARROW_DIRECTIONS = {
        Maze.NORTH: (0, -1),
        Maze.SOUTH: (0, 1),
        Maze.EAST: (1, 0),
        Maze.WEST: (-1, 0),
    }

    def __init__(self, game: Maze):
        """Initialise pygame, the window, and a random starting policy."""
        pygame.init()
        self.game = game
        self.v = {s: 0 for s in game.states}
        self.policy = {s: choice(game.actions) for s in game.states if not game.is_final(s)}
        # Rows may differ in length, so size the window for the widest one.
        self.width = max((len(row) for row in game._cells), default=0)
        self.height = len(game._cells)

        total_height = self.height * CELL_SIZE + 80  # extra for equations
        total_width = self.width * CELL_SIZE
        self.screen = pygame.display.set_mode((total_width, total_height))

        pygame.display.set_caption("Policy Iteration Teaching Tool")
        self.font = pygame.font.SysFont("monospace", FONT_SIZE)
        self.clock = pygame.time.Clock()
        self.done = False
        self.iteration = 0

        self.equation_font = pygame.font.SysFont("Arial", EQUATION_FONT_SIZE)

    def draw_grid(self):
        """Redraw every cell and the two equations, then flip the display."""
        self.screen.fill(BG_COLOR)
        # Iterate the rows as stored so ragged maps neither raise IndexError
        # nor lose cells beyond the length of the first row.
        for row, row_cells in enumerate(self.game._cells):
            for col, cell in enumerate(row_cells):
                self._draw_cell(row, col, cell)
        self._draw_equations()

        # Draw the screen
        pygame.display.flip()

    def _draw_cell(self, row, col, cell):
        """Draw one cell: background, border, policy arrow, value and reward."""
        x, y = col * CELL_SIZE, row * CELL_SIZE
        rect = pygame.Rect(x, y, CELL_SIZE, CELL_SIZE)

        if cell == Maze.WALL:
            pygame.draw.rect(self.screen, WALL_COLOR, rect)
            return

        state = (row, col)
        is_final = self.game.is_final(state)

        # Determine background color
        if is_final:
            reward = self.game._rewards.get(cell, 0)
            color = POS_FINAL_COLOR if reward > 0 else NEG_FINAL_COLOR
            pygame.draw.rect(self.screen, color, rect)
        else:
            pygame.draw.rect(self.screen, (255, 255, 255), rect)

        # Border
        pygame.draw.rect(self.screen, (0, 0, 0), rect, 1)

        # Draw policy arrow
        if not is_final:
            action = self.policy[state]
            self._draw_arrow(action, x + CELL_SIZE // 2, y + CELL_SIZE // 2)

        # Draw value
        if state in self.v:
            val_text = f"{self.v[state]:.2f}"
            val_surface = self.font.render(val_text, True, VALUE_COLOR)
            val_rect = val_surface.get_rect(center=(x + CELL_SIZE // 2, y + CELL_SIZE - 22))
            self.screen.blit(val_surface, val_rect)

        # Draw reward in final state
        if is_final and cell in self.game._rewards:
            r_text = f"R={self.game._rewards[cell]:.0f}"
            r_surface = self.font.render(r_text, True, REWARD_COLOR)
            r_rect = r_surface.get_rect(center=(x + CELL_SIZE // 2, y + 20))
            self.screen.blit(r_surface, r_rect)

    def _draw_equations(self):
        """Render the evaluation and improvement equations below the grid."""
        # Bellman update equation rendering
        bellman_eq = "V(s) <-- Σ_s' T(s, π(s), s') [ R(s, π(s), s') + γ·V(s') ]"
        bellman_surf = self.equation_font.render(bellman_eq, True, EQUATION_COLOR_BELLMAN)
        bellman_rect = bellman_surf.get_rect(midtop=(self.screen.get_width() // 2, self.height * CELL_SIZE + 10))
        self.screen.blit(bellman_surf, bellman_rect)

        # Policy update equation rendering
        policy_eq = "π(s) <-- argmax_a Σ_s' T(s,a,s') [ R(s,a,s') + γ·V(s') ]"
        policy_surf = self.equation_font.render(policy_eq, True, EQUATION_COLOR_POLICY)
        policy_rect = policy_surf.get_rect(midtop=(self.screen.get_width() // 2, bellman_rect.bottom + 10))
        self.screen.blit(policy_surf, policy_rect)

    def _draw_arrow(self, action, cx, cy):
        """Draw an arrow for ``action`` starting at the cell centre ``(cx, cy)``.

        Raises ``KeyError`` for an action that is not one of the four
        ``Maze`` action constants.
        """
        dx, dy = self._ARROW_DIRECTIONS[action]
        length = CELL_SIZE // 3
        end = (cx + dx * length, cy + dy * length)
        # The arrowhead base sits ARROW_HEAD pixels back from the tip and
        # spreads ARROW_HEAD pixels to each side, perpendicular to travel.
        base_x, base_y = end[0] - dx * ARROW_HEAD, end[1] - dy * ARROW_HEAD
        side_x, side_y = abs(dy) * ARROW_HEAD, abs(dx) * ARROW_HEAD
        head1 = (base_x - side_x, base_y - side_y)
        head2 = (base_x + side_x, base_y + side_y)

        pygame.draw.line(self.screen, ARROW_COLOR, (cx, cy), end, ARROW_WIDTH)
        pygame.draw.polygon(self.screen, ARROW_COLOR, [end, head1, head2])

    def _expected_return(self, state, action):
        """Return the one-step lookahead value of ``action`` in ``state``.

        Sums ``p * (r + gamma * V(s'))`` over the outcomes reported by
        ``game.effects`` using the current value table.
        """
        return sum(prob * (reward + gamma * self.v[next_state])
                   for next_state, prob, reward in self.game.effects(state, action))

    def step_evaluation(self):
        """Run one synchronous evaluation sweep of the current policy.

        All new values are computed from the old table before any is
        stored.  Returns the largest absolute change of any state value.
        """
        delta = 0
        new_v = {}
        for s in self.game.states:
            if self.game.is_final(s):
                continue
            new_v[s] = self._expected_return(s, self.policy[s])
            delta = max(delta, abs(new_v[s] - self.v[s]))
        self.v.update(new_v)
        return delta

    def run_full_evaluation(self):
        """Repeat evaluation sweeps until the change drops below ``max_delta``."""
        while self.step_evaluation() >= max_delta:
            pass

    def improve_policy(self):
        """Make the policy greedy with respect to the current values.

        Ties keep the first maximising action in ``game.actions`` order.
        Returns True when no state's action changed.
        """
        stable = True
        for s in self.game.states:
            if self.game.is_final(s):
                continue
            a_old = self.policy[s]
            self.policy[s] = max(self.game.actions,
                                 key=lambda a: self._expected_return(s, a))
            if a_old != self.policy[s]:
                stable = False
        return stable

    def run(self):
        """Run the event loop until the window is closed, then quit pygame."""
        running = True
        try:
            while running:
                self.clock.tick(FPS)
                self.draw_grid()

                for event in pygame.event.get():
                    if event.type == pygame.QUIT:
                        running = False

                    elif event.type == pygame.KEYDOWN:
                        if event.key == pygame.K_s:  # Step evaluation
                            self.step_evaluation()

                        elif event.key == pygame.K_e:  # Full evaluation
                            self.run_full_evaluation()

                        elif event.key == pygame.K_u:  # Policy improvement
                            self.improve_policy()

                        elif event.key == pygame.K_f:  # Full policy iteration
                            self.run_full_evaluation()
                            while not self.improve_policy():
                                self.run_full_evaluation()
        finally:
            # Release the display even when a handler raises.
            pygame.quit()


if __name__ == "__main__":
    game = Maze("volcano-crossing")  # replace with your actual maze filename
    vis = VisualPolicyIteration(game)
    vis.run()