Code Experiment
The test environment is called cliff run, as shown in the figure below:

The start is the bottom-left cell (3,0) and the goal is the bottom-right cell (3,11). The yellow square marks the agent's current position, and the dark purple cells are the cliff; if the agent falls off the cliff, the episode ends and it has to start over.
The full code is on my GitHub; download it and run the Jupyter notebook directly:
https://github.com/Qxxxx/ReinforcementLearning.git
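For reference, the dynamics described above fit in a few lines. This is a minimal reconstruction, not the notebook's actual environment; the -1 per-step reward and -100 cliff penalty are the standard cliff-walking values and are assumed here:

# Minimal sketch of the cliff-run dynamics on the 4x12 grid.
# The -1 step reward and -100 cliff penalty are assumptions taken
# from the standard cliff-walking setup, not from the notebook.
def step(state, action):
    moves = {0: (-1, 0), 1: (0, 1), 2: (1, 0), 3: (0, -1)}  # up, right, down, left
    r, c = state
    dr, dc = moves[action]
    r = min(max(r + dr, 0), 3)    # clamp to the grid
    c = min(max(c + dc, 0), 11)
    if r == 3 and 1 <= c <= 10:   # stepped into the cliff
        return (3, 0), -100, True  # episode ends, restart from (3,0)
    if (r, c) == (3, 11):          # reached the goal
        return (r, c), -1, True
    return (r, c), -1, False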
Q-Learning
import numpy as np

# The Agent base class (which provides self.num_actions) is defined
# earlier in the notebook.
class QLearningAgent(Agent):
    def __init__(self, actions, epsilon=0.01, alpha=0.5, gamma=1):
        super(QLearningAgent, self).__init__(actions)
        # Q-table as a dictionary keyed by "state action" strings,
        # plus exploration rate, learning rate and discount factor
        self.Q = {}
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma

    def stateToString(self, state):
        # Encode a scalar or tuple state as a string key
        mystring = ""
        if np.isscalar(state):
            mystring = str(state)
        else:
            for digit in state:
                mystring += str(digit)
        return mystring

    def act(self, state):
        stateStr = self.stateToString(state)
        # Look up Q(s, a) for every action, initializing unseen entries to 0
        Q = self.num_actions * [0]
        for a in range(self.num_actions):
            if not stateStr + ' %i' % a in self.Q:
                self.Q[stateStr + ' %i' % a] = 0
            Q[a] = self.Q[stateStr + ' %i' % a]
        # Epsilon-greedy: explore with probability epsilon, otherwise exploit
        if self.epsilon == 0:
            choice = 0
        elif self.epsilon == 1:
            choice = 1
        else:
            choice = np.random.binomial(1, self.epsilon)
        if choice == 1:
            return np.random.randint(0, self.num_actions)
        # Break ties between equally good actions uniformly at random
        m = max(Q)
        best_Q = [i for i, j in enumerate(Q) if j == m]
        return np.random.choice(best_Q)

    def learn(self, state1, action1, reward, state2, done):
        state1Str = self.stateToString(state1)
        state2Str = self.stateToString(state2)
        # Look up Q(s', a) for every action, initializing unseen entries to 0
        Q = self.num_actions * [0]
        for a in range(self.num_actions):
            if not state2Str + ' %i' % a in self.Q:
                self.Q[state2Str + ' %i' % a] = 0
            Q[a] = self.Q[state2Str + ' %i' % a]
        self.Q[state1Str + ' %i' % action1] = self.Q[state1Str + ' %i' % action1] + \
            self.alpha * (reward + self.gamma * max(Q) -
                          self.Q[state1Str + ' %i' % action1])
        """
        Q-learning update:
        Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))
        """
SARSA
class SarsaAgent(Agent):
    def __init__(self, actions, epsilon=0.01, alpha=0.5, gamma=1):
        super(SarsaAgent, self).__init__(actions)
        # Q-table as a dictionary keyed by "state action" strings,
        # plus exploration rate, learning rate and discount factor
        self.Q = {}
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma

    def stateToString(self, state):
        # Encode a scalar or tuple state as a string key
        mystring = ""
        if np.isscalar(state):
            mystring = str(state)
        else:
            for digit in state:
                mystring += str(digit)
        return mystring

    def act(self, state):
        stateStr = self.stateToString(state)
        # Look up Q(s, a) for every action, initializing unseen entries to 0
        Q = self.num_actions * [0]
        for a in range(self.num_actions):
            if not stateStr + ' %i' % a in self.Q:
                self.Q[stateStr + ' %i' % a] = 0
            Q[a] = self.Q[stateStr + ' %i' % a]
        # Epsilon-greedy: explore with probability epsilon, otherwise exploit
        if self.epsilon == 0:
            choice = 0
        elif self.epsilon == 1:
            choice = 1
        else:
            choice = np.random.binomial(1, self.epsilon)
        if choice == 1:
            return np.random.randint(0, self.num_actions)
        # Break ties between equally good actions uniformly at random
        m = max(Q)
        best_Q = [i for i, j in enumerate(Q) if j == m]
        return np.random.choice(best_Q)

    def learn(self, state1, action1, reward, state2, action2):
        state1Str = self.stateToString(state1)
        state2Str = self.stateToString(state2)
        # Guard against an unseen (s', a') pair; in the usual loop,
        # act(state2) has already been called and initialized it
        if not state2Str + ' %i' % action2 in self.Q:
            self.Q[state2Str + ' %i' % action2] = 0
        self.Q[state1Str + ' %i' % action1] = self.Q[state1Str + ' %i' % action1] + \
            self.alpha * (reward + self.gamma * self.Q[state2Str + ' %i' % action2] -
                          self.Q[state1Str + ' %i' % action1])
        """
        SARSA update:
        Q(s,a) <- Q(s,a) + alpha * (reward + gamma * Q(s',a') - Q(s,a))
        """
Everything else is essentially identical; the only difference between the two agents is the learn function.
Below are the results of the two algorithms:
SARSA

Q-Learning

A few observations:
- Q-learning converges slightly faster than SARSA, at least in this test environment.
- Q-learning in fact converges to the 13-step optimal path to the goal, while SARSA converges to a 17-step path.
- After convergence, Q-learning falls off the cliff noticeably more often than SARSA. I haven't fully worked out why and welcome discussion, though one standard account (from Sutton & Barto) is that Q-learning learns the optimal path hugging the cliff edge while still acting epsilon-greedily, so exploratory moves push it over, whereas SARSA's on-policy values price in that exploration and favor the safer route. Decaying epsilon also mitigates it; see the sketch below.
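A minimal epsilon-decay sketch for that last point; the 0.995 rate and 0.001 floor are arbitrary illustrative values, not from the notebook:

# Illustrative epsilon decay: shrink after each episode, with a floor,
# so late-stage exploration (and cliff falls) taper off
epsilon, decay, eps_min = 0.1, 0.995, 0.001
for episode in range(2000):
    agent.epsilon = epsilon
    # ... run one episode as in the loops above ...
    epsilon = max(epsilon * decay, eps_min)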