chapter 4 Dynamic Programming Example Grid World
2020. 5. 5. 16:38ㆍ관심있는 주제/RL
2020/05/01 - [관심있는 주제/RL] - 강화학습 - Dynamic Programming 공부
2020/05/05 - [관심있는 주제/RL] - chapter 4 Dynamic Programming Example Grid World
2020/05/05 - [관심있는 주제/RL] - chapter 4 Dynamic Programming Example Car Rental (in-place)
2020/05/05 - [관심있는 주제/RL] - chapter 4 Dynamic Programming Example 도박사 문제
In [ ]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table
matplotlib.use('Agg')
문제 정의
- 종단 상태 도달하기 전까지 이동에 대한 보상은 -1
4 x 4 의 Grid World
행동 : left, up, right, down (각 확률 25%)
deterministic transition probability
In [4]:
# Grid is WORLD_SIZE x WORLD_SIZE; states are (row, col) pairs.
WORLD_SIZE = 4
# Actions as (row, col) offsets, in the order: left, up, right, down.
ACTIONS = [np.array([0, -1]),
np.array([-1, 0]),
np.array([0, 1]),
np.array([1, 0])]
# Equiprobable random policy: each of the 4 actions has probability 1/4.
ACTION_PROB = 0.25
In [40]:
# Demo: in-place vs. out-of-place update semantics for NumPy arrays.
# When `inplace` is True, `state_values` is bound to the SAME array object as
# `new_state_value`, so every write to `new_state_value` is immediately visible
# through `state_values`. When False, `state_values` is an independent snapshot.
# `old_state_values` is always a copy taken at the start of a round.
for inplace in [True, False]:
    print(f"Inplace : {str(inplace)}")
    new_state_value = np.zeros((4,))
    # Two rounds of updates; each round re-binds state_values first,
    # mirroring one sweep of iterative policy evaluation.
    for round_values in [(10, 5), (3, 2)]:
        if inplace:
            state_values = new_state_value          # alias: shares memory
        else:
            state_values = new_state_value.copy()   # snapshot: independent
        old_state_values = state_values.copy()
        for idx, val in enumerate(round_values):
            new_state_value[idx] = val
            print(f"old state value : {old_state_values}")
            print(f" state value : {state_values}")
            print(f"new state value : {new_state_value}")
    print("="*20)
In [1]:
def is_terminal(state):
    """Return True if `state` is one of the two terminal corner states.

    Terminals are (0, 0) (top-left) and (WORLD_SIZE-1, WORLD_SIZE-1)
    (bottom-right).
    """
    row, col = state
    last = WORLD_SIZE - 1
    return (row, col) in ((0, 0), (last, last))
def step(state, action):
    """Apply `action` in `state`; return (next_state, reward).

    Terminal states absorb: the state is returned unchanged with reward 0.
    Every non-terminal transition costs -1; a move that would leave the
    grid keeps the agent in place (still at a cost of -1).
    """
    if is_terminal(state):
        return state, 0
    next_state = (np.array(state) + action).tolist()
    row, col = next_state
    # Off-grid moves bounce back to the current state.
    if not (0 <= row < WORLD_SIZE and 0 <= col < WORLD_SIZE):
        next_state = state
    return next_state, -1
def draw_image(image):
    """Render a 2-D value array as a matplotlib table, one cell per state."""
    fig, ax = plt.subplots()
    ax.set_axis_off()
    table = Table(ax, bbox=[0, 0, 1, 1])
    nrows, ncols = image.shape
    cell_w = 1.0 / ncols
    cell_h = 1.0 / nrows
    # One cell per grid state, showing that state's value.
    for (row, col), value in np.ndenumerate(image):
        table.add_cell(row, col, cell_w, cell_h, text=value,
                       loc='center', facecolor='white')
    # 1-based row labels on the left, column labels along the top.
    for idx in range(len(image)):
        table.add_cell(idx, -1, cell_w, cell_h, text=idx + 1, loc='right',
                       edgecolor='none', facecolor='none')
        table.add_cell(-1, idx, cell_w, cell_h / 2, text=idx + 1, loc='center',
                       edgecolor='none', facecolor='none')
    ax.add_table(table)
def compute_state_value(in_place=True, discount=1.0):
    """Iterative policy evaluation for the equiprobable policy on the grid.

    With in_place=True each sweep reads values already updated during the
    same sweep (asynchronous); otherwise every backup reads a frozen copy
    of the previous sweep (synchronous). Iteration stops once the largest
    per-state change drops below 1e-4.

    Returns:
        (value table of shape (WORLD_SIZE, WORLD_SIZE), sweep count)
    """
    new_state_values = np.zeros((WORLD_SIZE, WORLD_SIZE))
    iteration = 0
    while True:
        # Alias for in-place sweeps, snapshot for synchronous ones.
        state_values = new_state_values if in_place else new_state_values.copy()
        old_state_values = state_values.copy()
        for i in range(WORLD_SIZE):
            for j in range(WORLD_SIZE):
                # Bellman expectation backup under the uniform random policy.
                new_state_values[i, j] = sum(
                    ACTION_PROB * (reward + discount * state_values[ni, nj])
                    for (ni, nj), reward in (step([i, j], a) for a in ACTIONS)
                )
        if abs(old_state_values - new_state_values).max() < 1e-4:
            break
        iteration += 1
    return new_state_values, iteration
def figure_4_1():
    """Reproduce Figure 4.1: state values of the random policy on the 4x4 grid.

    Although the text recommends in-place iterative policy evaluation,
    Figure 4.1 itself corresponds to the out-of-place (synchronous) run,
    so that run's values are the ones drawn and saved.
    """
    _, in_place_iterations = compute_state_value(in_place=True)
    values, sync_iterations = compute_state_value(in_place=False)
    draw_image(np.round(values, decimals=2))
    print('In-place: {} iterations'.format(in_place_iterations))
    print('Synchronous: {} iterations'.format(sync_iterations))
    plt.savefig('./images/figure_4_1.png')
    plt.close()


if __name__ == '__main__':
    figure_4_1()
728x90
'관심있는 주제 > RL' 카테고리의 다른 글
chapter 4 Dynamic Programming Example 도박사 문제 (0) | 2020.05.05 |
---|---|
chapter 4 Dynamic Programming Example Car Rental (in-place) (0) | 2020.05.05 |
강화학습 - Dynamic Programming 공부 (0) | 2020.05.01 |
state value / state action value 관련 자료 (0) | 2020.04.27 |
Contextual Bandits and Reinforcement Learning - 리뷰 (0) | 2020.02.18 |