#!/usr/bin/env python
# coding: utf-8
# # Q* Learning - FrozenLake
#
# In this Notebook, we'll implement an agent that plays FrozenLake.
#
#
# The goal of this game is to go from the starting state (S) to the goal state (G) by walking only on frozen tiles (F) and avoid holes (H).However, the ice is slippery, so you won't always move in the direction you intend (stochastic environment)
#
#
# Reference: code adapted from
# https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Q%20learning/FrozenLake/Q%20Learning%20with%20FrozenLake.ipynb
# In[18]:
#!pip install gym
#!pip install pygame
#!pip install gym[toy_text]
# In[41]:
import numpy as np
import gymnasium as gym #to load the FrozenLake Environment
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
# In[42]:
#env = gym.make("FrozenLake-v0")
env = gym.make("FrozenLake-v1",is_slippery=True, render_mode="rgb_array")#creates the FrozenLake environment using the gym library of environments
# In[43]:
print(env.reward_range,
env.metadata,
env.spec,
env.action_space.contains(2))
#env.desc
# ## Step 2: Create the Q-table and initialize it
# In[56]:
action_size = env.action_space.n#how many actions there are
state_size = env.observation_space.n#how many states there are
qtable = np.zeros((state_size, action_size))
print(qtable)
# ## Set the hyperparameters
# In[82]:
total_episodes = 100000 # Total episodes
learning_rate = 0.1 # Learning rate
max_steps = 99 # Max steps per episode
gamma_rate = 0.95 # Discounting rate
# Exploration parameters
epsilon = 0.1 # Exploration rate
max_epsilon = 0.5 # Exploration probability at start
min_epsilon = 0.01 # Minimum exploration probability
decay_rate = 0.00 # Exponential decay rate for exploration prob
# ## Step 4: The Q learning algorithm
# - Now we implement the Q learning algorithm:
#
# In[83]:
# List of rewards
rewards = []
for episode in tqdm(range(total_episodes)):
# Reset the environment
state,_= env.reset()
step = 0
done = False
total_rewards = 0
for step in range(max_steps):
# epsilon-greedy strategy
exploration_or_exploitation = random.uniform(0, 1)
## If this number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
if exploration_or_exploitation > epsilon:
action = np.argmax(qtable[state,:])
# Else doing a random choice --> exploration
else:
action = env.action_space.sample()
# Take the action (a) and observe the outcome state(s') and reward (r)
new_state, reward, terminated, truncated, info = env.step(action)
# Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
# qtable[new_state,:] : all the actions we can take from new state
qtable[state, action] += learning_rate * (reward + gamma_rate * np.max(qtable[new_state, :]) - qtable[state, action])
total_rewards += reward
# Our new state is state
state = new_state
# If done (if we're dead) : finish episode
if terminated|truncated:
break
# Schedule to reduce epsilon
epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
rewards.append(total_rewards)
print ("Score over time: " + str(sum(rewards)/total_episodes))
print(qtable)
# In[88]:
# visualisation de la table Q pour FrozenLake
def plotQ(q_table,env):
MAP=env.desc
map_size=MAP.shape[0]
best_value = np.max(q_table, axis = 1).reshape((map_size,map_size))
best_policy = np.argmax(q_table, axis = 1).reshape((map_size,map_size))
fig, ax = plt.subplots()
im = ax.imshow(best_value,cmap=plt.cm.Wistia)#Pastel1, spring, autumn
arrow_list=['<','v','>','^']
for i in range(best_value.shape[0]):
for j in range(best_value.shape[1]):
if MAP[i][j].decode('utf-8') in 'GH':#terminal states
arrow = MAP[i][j].decode('utf-8')
else :
arrow=arrow_list[best_policy[i, j]]
if MAP[i][j].decode('utf-8') in 'S':
arrow = 'S ' + arrow
text = ax.text(j, i, arrow, ha = "center", va = "center",
color = "black")
text2 = ax.text(j, i+0.2, str(np.round(best_value[i,j],2)),
ha = "center", va = "center",color = "blue")
cbar = ax.figure.colorbar(im, ax = ax)
plt.axis('off')
fig.tight_layout()
plt.show()
plotQ(qtable,env)
# In[85]:
#print(qtable)
#np.max(qtable, axis = 1).reshape((4,4))
#look at https://gsverhoeven.github.io/post/frozenlake-qlearning-convergence/ for the optimal value function
# ## Step 5: Use the Q-table to play FrozenLake
# - After 10 000 episodes, our Q-table can be used as a "cheatsheet" to play FrozenLake"
# - By running this cell you can see our agent playing FrozenLake.
# In[90]:
rewards = []
total_test_episodes=10
for episode in range(total_test_episodes):
state,_ = env.reset()
step = 0
done = False
total_rewards = 0
print("****************************************************")
print("EPISODE ", episode)
for step in range(max_steps):
# Take the action (index) that have the maximum expected future reward given that state
action = np.argmax(qtable[state,:])
#env.render()
new_state, reward, term,trunc, info = env.step(action)
total_rewards +=reward
if term|trunc:
# Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
plt.imshow(env.render())
# We print the number of step it took.
print("Number of steps", step)
break
state = new_state
rewards.append(total_rewards)
print ("Score over time: " + str(sum(rewards)/total_test_episodes))
env.close()
# ##Questions
#
# - is the obtained value function optimal ?
# - is the obtained policy optimal ?
# - when $\gamma$ goes to $1.0$ do the policy changes ?
# - change the total number of training episodes
# - implement Double Q-learning ...
#
# In[ ]: