-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrl.py
149 lines (125 loc) · 4.65 KB
/
rl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
Created on Tue April 24 21:29 2018
@author: hanxy
"""
import numpy as np
import pandas as pd
# Base Reinforcement Learning
class RL(object):
    """Base class for tabular reinforcement-learning agents.

    Q-values are kept in ``q_table``, a DataFrame with one row per state seen
    so far and one column per action.  Rows are added lazily the first time a
    state is encountered.  Subclasses provide the update rule in ``learn``.
    """

    def __init__(self, actions,
                 learning_rate=0.01,
                 reward_decay=0.9,
                 e_greedy=0.9):
        """Store the hyper-parameters and create an empty Q-table.

        Args:
            actions: sequence of action labels; used as the Q-table columns.
            learning_rate: step size of the update, stored as ``self.LR``.
            reward_decay: discount factor, stored as ``self.Gamma``.
            e_greedy: probability of picking the greedy action in
                ``choose_action``, stored as ``self.Epsilon``.
        """
        self.actions = actions
        self.n_actions = len(actions)
        self.LR = learning_rate
        self.Gamma = reward_decay
        self.Epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions,
                                    dtype=np.float64)

    def choose_action(self, observation):
        """Pick an action for ``observation`` with an epsilon-greedy policy.

        The state is added to the Q-table first if it has not been seen.

        Args:
            observation: hashable state label (a Q-table row label).

        Returns:
            One of ``self.actions``.
        """
        self.check_state_exist(observation)
        if np.random.uniform() < self.Epsilon:
            # Greedy branch.  The row is shuffled before idxmax so that ties
            # between equally valued actions are broken at random instead of
            # always favouring the first column.
            state_action = self.q_table.loc[observation, :]
            state_action = state_action.reindex(
                np.random.permutation(state_action.index))
            action = state_action.idxmax()
        else:
            # Exploration branch: uniformly random action.
            action = np.random.choice(self.actions)
        return action

    def learn(self, *args):
        """Update the Q-table from one transition; a no-op in the base class."""
        pass

    def check_state_exist(self, state):
        """Add an all-zero row for ``state`` if the Q-table lacks one.

        Args:
            state: hashable state label.
        """
        if state not in self.q_table.index:
            # DataFrame.append was removed in pandas 2.0; pd.concat is
            # available in every pandas version and gives the same result.
            new_row = pd.DataFrame(
                np.zeros((1, self.n_actions)),
                index=[state],
                columns=self.q_table.columns,
            )
            self.q_table = pd.concat([self.q_table, new_row])
# Q-learning
class QLearning(RL):
    """Off-policy Q-learning agent built on the tabular ``RL`` base class."""

    def __init__(self, actions,
                 learning_rate=0.01,
                 reward_decay=0.9,
                 e_greedy=0.9):
        """Forward all hyper-parameters unchanged to the base class."""
        super(QLearning, self).__init__(
            actions,
            learning_rate=learning_rate,
            reward_decay=reward_decay,
            e_greedy=e_greedy,
        )

    def learn(self, s, a, r, s_):
        """Apply one Q-learning update for the transition (s, a, r, s_).

        Args:
            s: state the action was taken in (must already be in the table).
            a: action taken.
            r: reward received.
            s_: resulting state; the labels 'terminal' and 'hell' are treated
                as episode ends and contribute no bootstrapped value.
        """
        self.check_state_exist(s_)
        current = self.q_table.loc[s, a]
        if s_ == 'terminal' or s_ == 'hell':
            # Episode ended: the target is the immediate reward only.
            target = r
        else:
            # Bootstrap from the best action value in the next state.
            best_next = self.q_table.loc[s_, :].max()
            target = r + self.Gamma * best_next
        td_error = target - current
        self.q_table.loc[s, a] += self.LR * td_error
# Sarsa on-policy
class Sarsa(RL):
    """On-policy SARSA agent built on the tabular ``RL`` base class."""

    def __init__(self, actions,
                 learning_rate=0.01,
                 reward_decay=0.9,
                 e_greedy=0.9):
        """Forward all hyper-parameters unchanged to the base class."""
        super(Sarsa, self).__init__(
            actions,
            learning_rate=learning_rate,
            reward_decay=reward_decay,
            e_greedy=e_greedy,
        )

    def learn(self, s, a, r, s_, a_):
        """Apply one SARSA update for the transition (s, a, r, s_, a_).

        Args:
            s: state the action was taken in (must already be in the table).
            a: action taken in ``s``.
            r: reward received.
            s_: resulting state; the labels 'terminal' and 'hell' are treated
                as episode ends and contribute no bootstrapped value.
            a_: action that will be taken in ``s_``.
        """
        self.check_state_exist(s_)
        current = self.q_table.loc[s, a]
        if s_ == 'terminal' or s_ == 'hell':
            # Episode ended: the target is the immediate reward only.
            target = r
        else:
            # Bootstrap from the value of the action actually chosen next.
            next_value = self.q_table.loc[s_, a_]
            target = r + self.Gamma * next_value
        td_error = target - current
        self.q_table.loc[s, a] += self.LR * td_error
# backward eligibility traces
class SarsaLambda(RL):
    """SARSA agent with backward-view eligibility traces.

    Alongside the Q-table the agent keeps ``eligibility_trace``, a DataFrame
    with the same rows and columns, holding the current trace of every
    state-action pair.
    """

    def __init__(self, actions,
                 learning_rate=0.01,
                 reward_decay=0.9,
                 e_greedy=0.9,
                 trace_decay=0.9):
        """Create the agent.

        Args:
            actions: sequence of action labels; used as table columns.
            learning_rate: step size of the update.
            reward_decay: discount factor.
            e_greedy: probability of picking the greedy action.
            trace_decay: trace decay factor, stored as ``self.Lambda``.
        """
        super(SarsaLambda, self).__init__(actions,
                                          learning_rate,
                                          reward_decay,
                                          e_greedy)
        self.Lambda = trace_decay
        # Take an independent copy: assigning the Q-table itself would make
        # both attributes refer to the same DataFrame object, so any in-place
        # change to one would silently change the other.
        self.eligibility_trace = self.q_table.copy()

    def check_state_exist(self, state):
        """Add an all-zero row for ``state`` to both tables if it is missing.

        Args:
            state: hashable state label.
        """
        if state not in self.q_table.index:
            # DataFrame.append was removed in pandas 2.0; pd.concat is
            # available in every pandas version and gives the same result.
            new_row = pd.DataFrame(
                np.zeros((1, self.n_actions)),
                index=[state],
                columns=self.q_table.columns,
            )
            self.q_table = pd.concat([self.q_table, new_row])
            # Keep the trace table aligned with the Q-table.
            self.eligibility_trace = pd.concat(
                [self.eligibility_trace, new_row])

    def learn(self, s, a, r, s_, a_):
        """Apply one SARSA(lambda) update for the transition (s, a, r, s_, a_).

        Args:
            s: state the action was taken in (must already be in the table).
            a: action taken in ``s``.
            r: reward received.
            s_: resulting state; the labels 'terminal' and 'hell' are treated
                as episode ends and contribute no bootstrapped value.
            a_: action that will be taken in ``s_``.
        """
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal' and s_ != 'hell':
            q_target = r + self.Gamma * self.q_table.loc[s_, a_]
        else:
            q_target = r
        error = q_target - q_predict

        # Mark the visited pair.  This is the "replacing" variant: the whole
        # row for s is cleared and the taken action is set to 1.  The
        # "accumulating" alternative would instead be
        #     self.eligibility_trace.loc[s, a] += 1
        self.eligibility_trace.loc[s, :] *= 0
        self.eligibility_trace.loc[s, a] = 1

        # Every state-action pair is updated in proportion to its trace.
        self.q_table += self.LR * error * self.eligibility_trace

        # Decay all traces after the update.
        self.eligibility_trace *= self.Gamma * self.Lambda