03_q_learning_gym_taxi_v3
import numpy as np
import gym
import matplotlib.pyplot as plt
# Initialize the Taxi-v3 environment
env = gym.make('Taxi-v3')
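# Note: this script assumes the classic Gym API (gym < 0.26), where env.reset()
# returns only the initial state and env.step() returns a 4-tuple
# (next_state, reward, done, info). With gymnasium / newer gym, reset() also
# returns an info dict and step() returns 5 values.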
# Hyperparameters
alpha = 0.1 # Learning rate
gamma = 0.9 # Discount factor
max_epsilon = 1.0 # Initial exploration rate
min_epsilon = 0.1 # Minimum exploration rate
decay_rate = 0.01 # Exponential decay rate for exploration probability
num_episodes = 10000 # Number of episodes
# Initialize the Q-table to zeros
Q = np.zeros((env.observation_space.n, env.action_space.n))
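# For Taxi-v3 this is a 500 x 6 table: 500 discrete states and 6 discrete actions.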
# This list will store the total reward for each evaluation episode
rewards_per_episode = []
# Q-learning process
for i in range(num_episodes):
    state = env.reset()  # Reset the environment for a new episode
    done = False
    # Decay the exploration rate using the exponential decay formula
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * i)
    while not done:
        # Choose an action using the epsilon-greedy policy
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Exploration: choose a random action
        else:
            action = np.argmax(Q[state, :])  # Exploitation: choose the action with the highest Q-value
        # Execute the chosen action and observe the next state and reward
        next_state, reward, done, _ = env.step(action)
        # Update the Q-value using the Q-learning update rule
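        # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))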
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state, :]) - Q[state, action])
        # Move on to the next state
        state = next_state
# Evaluate the learned policy
test_episodes = 100
total_rewards = 0
for _ in range(test_episodes):
    state = env.reset()
    done = False
    episode_reward = 0
    while not done:
        action = np.argmax(Q[state, :])  # Choose the best action based on the learned Q-values
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        state = next_state
    rewards_per_episode.append(episode_reward)  # Append the reward for this episode to the list
    total_rewards += episode_reward
average_reward = total_rewards / test_episodes
print(f"Average reward over {test_episodes} episodes: {average_reward}")
# Plot the total reward obtained in each evaluation episode
plt.plot(rewards_per_episode)
plt.ylabel('Total Reward')
plt.xlabel('Evaluation Episode')
plt.title('Reward per Evaluation Episode')
plt.show()
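# Optional: render one greedy rollout with the learned Q-table (a minimal sketch,
# assuming the same classic Gym API as above, where env.render() prints the Taxi
# grid to the console).
state = env.reset()
done = False
while not done:
    action = np.argmax(Q[state, :])  # Always act greedily with respect to Q
    state, reward, done, _ = env.step(action)
    env.render()  # Print the current Taxi grid
env.close()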