03_q_learning_gym_taxi_v3
import numpy as np
import gym
import matplotlib.pyplot as plt
# Initialize the Taxi-v3 environment
env = gym.make('Taxi-v3')
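# Note: this script assumes the classic Gym API (gym < 0.26), where env.reset()
# returns only the initial state and env.step() returns a 4-tuple
# (next_state, reward, done, info). With gymnasium / newer gym, reset() also
# returns an info dict and step() returns 5 values.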
# Hyperparameters
alpha = 0.1 # Learning rate
gamma = 0.9 # Discount factor
max_epsilon = 1.0 # Initial exploration rate
min_epsilon = 0.1 # Minimum exploration rate
decay_rate = 0.01 # Exponential decay rate for exploration probability
num_episodes = 10000 # Number of episodes
# Initialize the Q-table to zeros
Q = np.zeros((env.observation_space.n, env.action_space.n))
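# For Taxi-v3 this is a 500 x 6 table: 500 discrete states and 6 discrete actions.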
# This list will store the total reward for each evaluation episode
rewards_per_episode = []
# Q-learning process
for i in range(num_episodes):
    state = env.reset()  # Reset the environment for a new episode
    done = False
    # Decay the exploration rate using the exponential decay formula
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * i)
    while not done:
        # Choose an action using the epsilon-greedy policy
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Exploration: choose a random action
        else:
            action = np.argmax(Q[state, :])  # Exploitation: choose the action with the highest Q-value
        # Execute the chosen action and observe the next state and reward
        next_state, reward, done, _ = env.step(action)
        # Update the Q-value using the Q-learning update rule
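        # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))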
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state, :]) - Q[state, action])
        # Move on to the next state
        state = next_state
# Evaluate the learned policy
test_episodes = 100
total_rewards = 0
for _ in range(test_episodes):
    state = env.reset()
    done = False
    episode_reward = 0
    while not done:
        action = np.argmax(Q[state, :])  # Choose the best action based on the learned Q-values
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        state = next_state
    rewards_per_episode.append(episode_reward)  # Append the reward for this episode to the list
    total_rewards += episode_reward
average_reward = total_rewards / test_episodes
print(f"Average reward over {test_episodes} episodes: {average_reward}")
# Plot the total reward obtained in each evaluation episode
plt.plot(rewards_per_episode)
plt.ylabel('Total Reward')
plt.xlabel('Evaluation Episode')
plt.title('Reward per Evaluation Episode')
plt.show()
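# Optional: render one greedy rollout with the learned Q-table (a minimal sketch,
# assuming the same classic Gym API as above, where env.render() prints the Taxi
# grid to the console).
state = env.reset()
done = False
while not done:
    action = np.argmax(Q[state, :])  # Always act greedily with respect to Q
    state, reward, done, _ = env.step(action)
    env.render()  # Print the current Taxi grid
env.close()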