Chapter 4 Dynamic Programming Example: Grid World


2020/05/01 - [Topics of Interest/RL] - Reinforcement Learning - Studying Dynamic Programming

2020/05/05 - [Topics of Interest/RL] - Chapter 4 Dynamic Programming Example: Grid World

2020/05/05 - [Topics of Interest/RL] - Chapter 4 Dynamic Programming Example: Car Rental (in-place)

2020/05/05 - [Topics of Interest/RL] - Chapter 4 Dynamic Programming Example: Gambler's Problem

In [ ]:
import matplotlib
matplotlib.use('Agg')  # select a non-interactive backend before pyplot is imported
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table

Problem definition

  • 4 x 4 Grid World
  • Every move receives a reward of -1 until a terminal state is reached
  • Actions: left, up, right, down (each chosen with probability 25% under the random policy)
  • Deterministic transition probabilities (the update formula applied by the code is shown below)
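Under the equiprobable random policy, the code below runs iterative policy evaluation: each sweep backs up every non-terminal state s with

$$v_{k+1}(s) = \sum_{a}\pi(a \mid s)\,\bigl[r + \gamma\, v_k(s')\bigr] = \sum_{a} 0.25\,\bigl[-1 + v_k(s')\bigr],$$

where s' is the deterministic next state for action a and γ = 1 in this example.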
In [4]:
WORLD_SIZE = 4
# left, up, right, down
ACTIONS = [np.array([0, -1]),
           np.array([-1, 0]),
           np.array([0, 1]),
           np.array([1, 0])]
ACTION_PROB = 0.25  # probability of each action under the equiprobable random policy
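As a quick sanity check of the indexing convention, each action is a [row, column] offset — a minimal sketch:

In [ ]:
# Sanity check (sketch): each action is a [row, column] offset
state = np.array([1, 1])
print(state + ACTIONS[0])  # left  -> [1 0] (column decreases)
print(state + ACTIONS[1])  # up    -> [0 1] (row decreases)
print(state + ACTIONS[2])  # right -> [1 2]
print(state + ACTIONS[3])  # down  -> [2 1]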
In [40]:
# Toy demonstration: the in-place and out-of-place (synchronous) update modes differ
# only in whether state_values aliases new_state_value or holds a copy of it.
for inplace in [True, False]:
    print(f"Inplace : {str(inplace)}")
    new_state_value = np.zeros((4,))
    if inplace:
        state_values = new_state_value         # same array object (alias)
    else:
        state_values = new_state_value.copy()  # independent snapshot
    old_state_values = state_values.copy()
    new_state_value[0] = 10
    print(f"old state value : {old_state_values}")
    print(f"    state value : {state_values}")
    print(f"new state value : {new_state_value}")
    new_state_value[1] = 5
    print(f"old state value : {old_state_values}")
    print(f"    state value : {state_values}")
    print(f"new state value : {new_state_value}")
    if inplace:
        state_values = new_state_value
    else:
        state_values = new_state_value.copy()
    old_state_values = state_values.copy()
    new_state_value[0] = 3
    print(f"old state value : {old_state_values}")
    print(f"    state value : {state_values}")
    print(f"new state value : {new_state_value}")
    new_state_value[1] = 2
    print(f"old state value : {old_state_values}")
    print(f"    state value : {state_values}")
    print(f"new state value : {new_state_value}")
    print("="*20)
 
Inplace : True
old state value : [0. 0. 0. 0.]
    state value : [10.  0.  0.  0.]
new state value : [10.  0.  0.  0.]
old state value : [0. 0. 0. 0.]
    state value : [10.  5.  0.  0.]
new state value : [10.  5.  0.  0.]
old state value : [10.  5.  0.  0.]
    state value : [3. 5. 0. 0.]
new state value : [3. 5. 0. 0.]
old state value : [10.  5.  0.  0.]
    state value : [3. 2. 0. 0.]
new state value : [3. 2. 0. 0.]
====================
Inplace : False
old state value : [0. 0. 0. 0.]
    state value : [0. 0. 0. 0.]
new state value : [10.  0.  0.  0.]
old state value : [0. 0. 0. 0.]
    state value : [0. 0. 0. 0.]
new state value : [10.  5.  0.  0.]
old state value : [10.  5.  0.  0.]
    state value : [10.  5.  0.  0.]
new state value : [3. 5. 0. 0.]
old state value : [10.  5.  0.  0.]
    state value : [10.  5.  0.  0.]
new state value : [3. 2. 0. 0.]
====================
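The traces differ only in whether state_values is the same array as new_state_value (in-place) or a frozen copy taken when the sweep starts (synchronous). One way to make the aliasing explicit — a minimal sketch using np.shares_memory:

In [ ]:
# Aliasing check (sketch): in-place mode shares the buffer, synchronous mode does not
import numpy as np

new_state_value = np.zeros(4)
alias = new_state_value            # in-place: writes to new_state_value show up here immediately
snapshot = new_state_value.copy()  # synchronous: stays at the values from copy time

print(np.shares_memory(new_state_value, alias))     # True
print(np.shares_memory(new_state_value, snapshot))  # False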
In [1]:
def is_terminal(state):
    x, y = state
    return (x == 0 and y == 0) or (x == WORLD_SIZE - 1 and y == WORLD_SIZE - 1)


def step(state, action):
    if is_terminal(state):
        return state, 0

    next_state = (np.array(state) + action).tolist()
    x, y = next_state
    # If the move would leave the grid, the agent stays in its current state,
    # but the transition still yields a reward of -1.
    if x < 0 or x >= WORLD_SIZE or y < 0 or y >= WORLD_SIZE:
        next_state = state

    reward = -1
    return next_state, reward


def draw_image(image):
    fig, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = image.shape
    width, height = 1.0 / ncols, 1.0 / nrows

    # Add cells
    for (i, j), val in np.ndenumerate(image):
        tb.add_cell(i, j, width, height, text=val,
                    loc='center', facecolor='white')

    # Row and column labels
    for i in range(len(image)):
        tb.add_cell(i, -1, width, height, text=i+1, loc='right',
                    edgecolor='none', facecolor='none')
        tb.add_cell(-1, i, width, height/2, text=i+1, loc='center',
                    edgecolor='none', facecolor='none')
    ax.add_table(tb)


def compute_state_value(in_place=True, discount=1.0):
    new_state_values = np.zeros((WORLD_SIZE, WORLD_SIZE))
    iteration = 0
    while True:
        if in_place:
            # in-place sweep: state_values aliases new_state_values, so values
            # updated earlier in this sweep are used immediately
            state_values = new_state_values
        else:
            # synchronous sweep: state_values is a frozen copy from the previous sweep
            state_values = new_state_values.copy()
        old_state_values = state_values.copy()

        for i in range(WORLD_SIZE):
            for j in range(WORLD_SIZE):
                value = 0
                for action in ACTIONS:
                    (next_i, next_j), reward = step([i, j], action)
                    value += ACTION_PROB * (reward + discount * state_values[next_i, next_j])
                new_state_values[i, j] = value

        max_delta_value = abs(old_state_values - new_state_values).max()
        if max_delta_value < 1e-4:
            break

        iteration += 1

    return new_state_values, iteration


def figure_4_1():
    # Although the book describes in-place iterative policy evaluation,
    # Figure 4.1 is reproduced with the out-of-place (synchronous) version.
    _, async_iteration = compute_state_value(in_place=True)
    values, sync_iteration = compute_state_value(in_place=False)
    draw_image(np.round(values, decimals=2))
    print('In-place: {} iterations'.format(async_iteration))
    print('Synchronous: {} iterations'.format(sync_iteration))

    plt.savefig('./images/figure_4_1.png')
    plt.close()
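A quick usage sketch (assuming the ./images directory exists for figure_4_1): running the synchronous evaluation directly should reproduce the random-policy values of Figure 4.1 — 0 at the terminal corners and roughly -22 at the corners farthest from them.

In [ ]:
# Usage sketch: evaluate the random policy and print the converged grid
values, sweeps = compute_state_value(in_place=False)
print(np.round(values, decimals=2))
print(f"converged after {sweeps} synchronous sweeps")

figure_4_1()  # saves ./images/figure_4_1.png and prints both iteration counts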