Q-Learning - MLCompete

Download maps

Învățare prin Recompensă - Algoritmul Q-learning

Autori:

Tudor Berariu - 2016
George Muraru - 2020

1. Scopul laboratorului

Scopul laboratorului îl reprezintă înțelegerea și implementarea algoritmului Q-Learning.

2. Algoritmul Q-Learning

Q-learning

3. Workspace Setup

Câteva biblioteci de care vom avea nevoie

Python
from copy import deepcopy
from random import choice, random
from time import sleep
import urllib.request

from IPython.display import clear_output

Parametrii necesari rulării

Python
# Run configuration for the Q-learning lab.
# The trailing "#@param" annotations are kept verbatim from the original
# listing; they are tool-facing form metadata, not ordinary commentary.

# File to read map from
MAP_NAME = "mini_map" #@param ["mini_map", "big_map", "huge_map"]

# Meta-parameters

LEARNING_RATE = 0.1 #@param {type: "slider", min: 0.001, max: 1.0, step: 0.01}
DISCOUNT_FACTOR = 0.99 #@param {type: "slider", min: 0.01, max: 1.0, step: 0.01}

# Probability to choose a random action
EPSILON = 0.05 #@param {type: "slider", min: 0.0, max:1.0, step: 0.05, default: 0.05}


# Training and evaluation episodes
TRAIN_EPISODES = 1000 #@param {type: "slider", min: 1, max: 20000, default: 1000}

# Evaluate after specified number of episodes
EVAL_EVERY = 10 #@param {type: "slider", min: 0, max: 1000}

# Evaluate using the specified number of episodes
EVAL_EPISODES = 10 #@param {type: "slider", min: 1, max: 1000}

# Display
VERBOSE = False #@param {type: "boolean"}
PLOT_SCORE = True #@param {type: "boolean"}
# Passed to sleep() between displayed frames
SLEEP_TIME = 1 #@param {type: "slider", min:1, max:10}

# Show the end result
FINAL_SHOW = True #@param {type: "boolean"}

Clasă care abstractizează jocul

Python
# Base URL the map files are downloaded from.
URL_PREFIX = "https://raw.githubusercontent.com/cs-pub-ro/ML/master/lab/lab7/maps/"

ACTIONS = ["UP", "RIGHT", "DOWN", "LEFT", "STAY"]

# (row delta, column delta) applied to Greuceanu's position for each action.
ACTION_EFFECTS = {
    "UP": (-1, 0),
    "RIGHT": (0, 1),
    "DOWN": (1, 0),
    "LEFT": (0, -1),
    "STAY": (0, 0),
}

MOVE_REWARD = -0.1
WIN_REWARD = 10.0
LOSE_REWARD = -10.0


## Functions to serialize / deserialize game states
def __serialize_state(state):
    """Join a grid (list of lists of one-character strings) into one string."""
    return "\n".join("".join(row) for row in state)


def __deserialize_state(str_state):
    """Split a newline-separated map string into a mutable grid of characters."""
    return [list(row) for row in str_state.split("\n")]


def get_map_url(map_file_name):
    """Return the download URL for the map called ``map_file_name``."""
    return f"{URL_PREFIX}{map_file_name}"


## Return the initial state of the game
def get_initial_state(map_file_name):
    """Download the map ``map_file_name`` and return it as a string state.

    The requested map is the one named by the argument (the previous version
    ignored it and always read the global ``MAP_NAME``). The HTTP response is
    closed as soon as the body has been read.
    """
    with urllib.request.urlopen(get_map_url(map_file_name)) as response:
        raw_map = response.read().strip()

    return raw_map.decode("utf-8")


## Get the coordinates of an actor
def __get_position(state, marker):
    """Return ``(row, col)`` of the first ``marker`` in the grid, or ``(-1, -1)``."""
    for row_idx, row in enumerate(state):
        if marker in row:
            return row_idx, row.index(marker)
    return -1, -1


## Check if is a final state
def is_final_state(str_state, score):
    """Tell whether the episode is over.

    The game ends when the score has dropped below -20, when Greuceanu ("G")
    is no longer on the map, or when the apple ("o") is no longer on the map.
    """
    return score < -20.0 or "G" not in str_state or "o" not in str_state


## Check if the given coordinates are valid (on map and not a wall)
def __is_valid_cell(state, row, col):
    """Return True when ``(row, col)`` is inside the grid and is not a wall ("*")."""
    return (
        0 <= row < len(state)
        and 0 <= col < len(state[row])
        and state[row][col] != "*"
    )


## Move to next state
def apply_action(str_state, action):
    """Apply Greuceanu's ``action`` and then let the balaur move.

    Args:
        str_state: the current map as a newline-separated string.
        action: one of ``ACTIONS``.

    Returns:
        A ``(next_state, reward, message)`` tuple, where ``next_state`` is the
        serialized map after the move(s), ``reward`` is one of ``MOVE_REWARD``,
        ``WIN_REWARD`` or ``LOSE_REWARD`` and ``message`` describes what
        happened.

    Raises:
        AssertionError: if ``action`` is unknown, or if Greuceanu ("G") or,
            when the balaur has to move, the balaur ("B") is not on the map.
    """
    assert action in ACTIONS
    message = f"Greuceanu moved {action}."

    state = __deserialize_state(str_state)
    g_row, g_col = __get_position(state, "G")
    assert g_row >= 0 and g_col >= 0

    d_row, d_col = ACTION_EFFECTS[action]
    next_g_row, next_g_col = g_row + d_row, g_col + d_col

    # A move into a wall or off the map leaves Greuceanu where he is.
    if not __is_valid_cell(state, next_g_row, next_g_col):
        next_g_row, next_g_col = g_row, g_col
        message = f"{message} Not a valid cell there."

    state[g_row][g_col] = " "
    target = state[next_g_row][next_g_col]
    if target == "B":
        # "G" is not written back, so the returned state is final.
        message = f"{message} Greuceanu stepped on the balaur."
        return __serialize_state(state), LOSE_REWARD, message
    if target == "o":
        # "G" overwrites the apple, so the returned state is final.
        state[next_g_row][next_g_col] = "G"
        message = f"{message} Greuceanu found 'marul fermecat'."
        return __serialize_state(state), WIN_REWARD, message
    state[next_g_row][next_g_col] = "G"

    ## Balaur moves now
    b_row, b_col = __get_position(state, "B")
    assert b_row >= 0 and b_col >= 0

    dy, dx = next_g_row - b_row, next_g_col - b_col
    # Unit step towards Greuceanu along each axis (0 when already aligned).
    step_row = (dy > 0) - (dy < 0)
    step_col = (dx > 0) - (dx < 0)

    def is_good(dr, dc):
        return __is_valid_cell(state, b_row + dr, b_col + dc)

    next_b_row, next_b_col = b_row, b_col
    if abs(dy) > abs(dx) and is_good(step_row, 0):
        # Greuceanu is farther away vertically: close the vertical gap.
        next_b_row = b_row + step_row
    elif abs(dx) > abs(dy) and is_good(0, step_col):
        # Greuceanu is farther away horizontally: close the horizontal gap.
        next_b_col = b_col + step_col
    else:
        # Tie, or the preferred step is blocked: pick at random among the
        # steps that approach Greuceanu; along an axis where the balaur is
        # already aligned, both side steps are candidates.
        options = []
        if dx != 0:
            if is_good(0, step_col):
                options.append((b_row, b_col + step_col))
        else:
            for dc in (-1, 1):
                if is_good(0, dc):
                    options.append((b_row, b_col + dc))
        if dy != 0:
            if is_good(step_row, 0):
                options.append((b_row + step_row, b_col))
        else:
            for dr in (-1, 1):
                if is_good(dr, 0):
                    options.append((b_row + dr, b_col))

        if options:
            next_b_row, next_b_col = choice(options)

    landed_on = state[next_b_row][next_b_col]
    if landed_on == "G":
        message = f"{message} The balaur ate Greuceanu."
        reward = LOSE_REWARD
    elif landed_on == "o":
        message = f"{message} The balaur found marul fermecat. Greuceanu lost!"
        reward = LOSE_REWARD
    else:
        message = f"{message} The balaur follows Greuceanu."
        reward = MOVE_REWARD

    # Clear the old cell first so a balaur that did not move is redrawn.
    state[b_row][b_col] = " "
    state[next_b_row][next_b_col] = "B"

    return __serialize_state(state), reward, message


def display_state(state):
    """Print the serialized map."""
    print(state)

3. Problemă de rezolvat

Greuceanu și Balaurul

Pe o hartă bidimensională se înfruntă Greuceanu și-un balaur.

Greuceanu trebuie să găsească mărul fermecat înainte de a fi prins de balaur și înainte ca balaurul să calce pe măr. Balaurul simte direcția în care se află Greuceanu și se îndreaptă către el.

Concret, Greuceanu câștigă jocul și 10 puncte dacă ajunge primul la mărul fermecat.

Greuceanu pierde jocul dacă este prins de balaur sau dacă balaurul calcă pe măr. De asemenea, la fiecare moment de timp Greuceanu pierde câte 0.1 puncte.

Dacă ajunge la -20 de puncte, Greuceanu pierde jocul.

4. Cerințe

[6 pct] Implementați algoritmul Q-learning (completați funcția q_learning).
[2 pct] Implementați strategia $\epsilon$ -greedy de selecție a unei acțiuni. Funcția primește toate acțiunile valide dintr-o stare dată. Atât timp cât există acțiuni ce nu au fost explorate, se va alege aleator una dintre acestea.
Altfel, cu o probabilitate $\epsilon$ se va alege o acțiune aleatoare, iar cu o probabilitate $1 - \epsilon$ se va alege cea mai bună acțiune din starea respectivă.
[2 pct] Implementați rutina de evaluare a politicii lacome (care alege întotdeauna cea mai bună acțiune).
[2 pct] Găsiți metaparametrii potriviți pentru o învățare cât mai rapidă pe toate cele trei hărți (rata de învățare, valoarea lui $\epsilon$ ). Încercați să modificați $\epsilon$ pe parcursul învățării.

Python
def get_legal_actions(str_state):
    """Return the actions Greuceanu may attempt in ``str_state``.

    Every action is always offered; a fresh list is returned so callers may
    modify it freely.
    """
    return deepcopy(ACTIONS)


def epsilon_greedy(Q, state, legal_actions, epsilon):
    """Pick an action for training with the epsilon-greedy strategy.

    Args:
        Q: dict mapping ``(state, action)`` to the learned action value.
        state: the current (hashable) state.
        legal_actions: non-empty list of actions available in ``state``.
        epsilon: probability of picking a random action once every legal
            action has been tried at least once in ``state``.

    Returns:
        A random not-yet-tried action while any exists; otherwise a random
        action with probability ``epsilon`` and the best known action with
        probability ``1 - epsilon``.
    """
    unexplored = [a for a in legal_actions if (state, a) not in Q]
    if unexplored:
        return choice(unexplored)
    if random() < epsilon:
        return choice(legal_actions)
    return best_action(Q, state, legal_actions)


def best_action(Q, state, legal_actions):
    """Return the legal action with the highest Q-value in ``state``.

    Pairs missing from ``Q`` count as 0.0. Ties are broken at random, so with
    an empty table this degenerates to a uniformly random choice.
    """
    values = {a: Q.get((state, a), 0.0) for a in legal_actions}
    top_value = max(values.values())
    return choice([a for a in legal_actions if values[a] == top_value])


def _run_greedy_episode(Q, initial_state):
    """Play one episode with the greedy policy and return its total score."""
    score = 0
    state = initial_state
    while not is_final_state(state, score):
        action = best_action(Q, state, get_legal_actions(state))
        state, reward, _ = apply_action(state, action)
        score += reward
    return score


def q_learning():
    """Train a tabular Q-learning agent on ``MAP_NAME`` and report results.

    Runs ``TRAIN_EPISODES`` epsilon-greedy training episodes. After every
    ``EVAL_EVERY`` episodes (never, when ``EVAL_EVERY`` is 0) the greedy
    policy is evaluated over ``EVAL_EPISODES`` episodes. Depending on
    ``FINAL_SHOW`` and ``PLOT_SCORE``, one greedy episode is displayed and the
    training / evaluation scores are plotted.
    """
    Q = {}
    train_scores = []
    eval_scores = []
    initial_state = get_initial_state(MAP_NAME)

    for train_ep in range(1, TRAIN_EPISODES + 1):
        clear_output(wait=True)
        score = 0
        state = initial_state

        if VERBOSE:
            display_state(state)
            sleep(SLEEP_TIME)
            clear_output(wait=True)

        while not is_final_state(state, score):
            actions = get_legal_actions(state)
            action = epsilon_greedy(Q, state, actions, EPSILON)

            prev_state = state
            state, reward, msg = apply_action(state, action)
            score += reward

            # Q-learning update:
            #   Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
            # A final state has no successor, so its future value is 0.
            if is_final_state(state, score):
                future_value = 0.0
            else:
                future_value = max(
                    Q.get((state, a), 0.0) for a in get_legal_actions(state)
                )
            old_value = Q.get((prev_state, action), 0.0)
            td_target = reward + DISCOUNT_FACTOR * future_value
            Q[(prev_state, action)] = old_value + LEARNING_RATE * (
                td_target - old_value
            )

            if VERBOSE:
                print(msg)
                display_state(state)
                sleep(SLEEP_TIME)
                clear_output(wait=True)

        print(f"Episode {train_ep} / {TRAIN_EPISODES}")
        train_scores.append(score)

        # evaluate the greedy policy (EVAL_EVERY == 0 disables evaluation
        # instead of raising ZeroDivisionError)
        if EVAL_EVERY > 0 and train_ep % EVAL_EVERY == 0:
            total = sum(
                _run_greedy_episode(Q, initial_state)
                for _ in range(EVAL_EPISODES)
            )
            eval_scores.append(total / EVAL_EPISODES)

    # --------------------------------------------------------------------------
    if FINAL_SHOW:
        # Start from a fresh score: reusing the last training score could skip
        # the show entirely, and never updating it could loop forever.
        score = 0
        state = initial_state
        while not is_final_state(state, score):
            action = best_action(Q, state, get_legal_actions(state))
            state, reward, msg = apply_action(state, action)
            score += reward
            print(msg)
            display_state(state)
            sleep(SLEEP_TIME)
            clear_output(wait=True)

    if PLOT_SCORE:
        from matplotlib import pyplot as plt
        import numpy as np

        plt.xlabel("Episode")
        plt.ylabel("Average score")

        # Moving average over (at most) 5 episodes; the window shrinks for
        # very short runs so the smoothed curve keeps one point per episode.
        window = min(5, len(train_scores))
        plt.plot(
            np.linspace(1, TRAIN_EPISODES, TRAIN_EPISODES),
            np.convolve(train_scores, np.full(window, 1.0 / window), "same"),
            linewidth=1.0, color="blue"
        )
        if eval_scores:
            # The i-th evaluation ran after episode i * EVAL_EVERY.
            eval_episodes = [
                EVAL_EVERY * (idx + 1) for idx in range(len(eval_scores))
            ]
            plt.plot(eval_episodes, eval_scores, linewidth=2.0, color="red")
        plt.show()


# Run the experiment when executed as a notebook cell or a script, but not
# when the definitions above are merely imported.
if __name__ == "__main__":
    q_learning()