# (c) Gabriel Turinici feb 2026
#
# Reference: code adapted from the 2024 version of
# https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Q%20learning/FrozenLake/Q%20Learning%20with%20FrozenLake.ipynb

# old commands, not used in this version
#!pip install gym
#!pip install pygame
#!pip install gym[toy_text]

# In[41]:

import numpy as np
import gymnasium as gym  # to load the FrozenLake environment
import random
import matplotlib.pyplot as plt
from tqdm import tqdm

# ## Step 1: Create the environment

# In[42]:

#env = gym.make("FrozenLake-v0")
# create the FrozenLake environment using the gymnasium library of environments
env = gym.make("FrozenLake-v1", is_slippery=True, render_mode="rgb_array")
# this may be needed (exposes e.g. the map env.desc)
env = env.unwrapped

# In[43]:

print(env.reward_range, env.metadata, env.spec, env.action_space.contains(2))
#env.desc

# ## Step 2: Create the Q-table and initialize it

# In[56]:

action_size = env.action_space.n      # how many actions there are
state_size = env.observation_space.n  # how many states there are
qtable = np.zeros((state_size, action_size))
print(qtable)

# ## Step 3: Set the hyperparameters

# In[82]:

total_episodes = 100000  # Total episodes
learning_rate = 0.1      # Learning rate
max_steps = 99           # Max steps per episode
gamma_rate = 0.95        # Discounting rate

# Exploration parameters
epsilon = 0.1            # Exploration rate
max_epsilon = 0.5        # Exploration probability at start
min_epsilon = 0.01       # Minimum exploration probability
decay_rate = 0.00        # Exponential decay rate for the exploration prob
                         # (with 0.00 the schedule below keeps epsilon
                         # constant at max_epsilon)

# ## Step 4: The Q-learning algorithm

# List of rewards
rewards = []

for episode in tqdm(range(total_episodes)):
    # Reset the environment
    state, _ = env.reset()
    total_rewards = 0

    for step in range(max_steps):
        # epsilon-greedy strategy
        exploration_or_exploitation = random.uniform(0, 1)

        # If this number > epsilon --> exploitation
        # (take the action with the biggest Q value for this state)
        if exploration_or_exploitation > epsilon:
            action = np.argmax(qtable[state, :])
        # Else make a random choice --> exploration
        else:
            action = env.action_space.sample()

        # Take the action (a) and observe the outcome state (s') and reward (r)
        new_state, reward, terminated, truncated, info = env.step(action)

        # Update Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        # qtable[new_state, :] : all the actions we can take from the new state
        qtable[state, action] += learning_rate * (
            reward + gamma_rate * np.max(qtable[new_state, :]) - qtable[state, action]
        )

        total_rewards += reward

        # Our new state is state
        state = new_state

        # If the episode is over (hole, goal, or truncation): finish the episode
        if terminated or truncated:
            break

    # Schedule to reduce epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)

print("Score over time: " + str(sum(rewards) / total_episodes))
print(qtable)

# In[88]:

# visualization of the Q-table for FrozenLake
def plotQ(q_table, env):
    MAP = env.desc
    map_size = MAP.shape[0]
    best_value = np.max(q_table, axis=1).reshape((map_size, map_size))
    best_policy = np.argmax(q_table, axis=1).reshape((map_size, map_size))

    fig, ax = plt.subplots()
    im = ax.imshow(best_value, cmap=plt.cm.Wistia)  # alternatives: Pastel1, spring, autumn
    arrow_list = ['<', 'v', '>', '^']  # action order: left, down, right, up
    for i in range(best_value.shape[0]):
        for j in range(best_value.shape[1]):
            if MAP[i][j].decode('utf-8') in 'GH':  # terminal states
                arrow = MAP[i][j].decode('utf-8')
            else:
                arrow = arrow_list[best_policy[i, j]]
            if MAP[i][j].decode('utf-8') in 'S':
                arrow = 'S ' + arrow
            text = ax.text(j, i, arrow, ha="center", va="center", color="black")
            text2 = ax.text(j, i + 0.2, str(np.round(best_value[i, j], 2)),
                            ha="center", va="center", color="blue")
    cbar = ax.figure.colorbar(im, ax=ax)
    plt.axis('off')
    fig.tight_layout()
    plt.show()

plotQ(qtable, env)

# In[85]:

#print(qtable)
#np.max(qtable, axis=1).reshape((4, 4))
# see https://gsverhoeven.github.io/post/frozenlake-qlearning-convergence/
# for the optimal value function

# ## Step 5: Use the Q-table to play FrozenLake
# - After training (100 000 episodes here), our Q-table can be used as a "cheatsheet" to play FrozenLake.
# - By running this cell you can see our agent playing FrozenLake.

# In[90]:

rewards = []
total_test_episodes = 10

for episode in range(total_test_episodes):
    state, _ = env.reset()
    total_rewards = 0

    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        # Take the action (index) that has the maximum expected future reward
        # given that state
        action = np.argmax(qtable[state, :])

        #env.render()
        new_state, reward, term, trunc, info = env.step(action)
        total_rewards += reward

        if term or trunc:
            # Here we decide to only print the last state
            # (to see whether our agent reached the goal or fell into a hole)
            plt.imshow(env.render())
            # We print the number of steps it took.
            print("Number of steps", step)
            break
        state = new_state
    rewards.append(total_rewards)

print("Score over time: " + str(sum(rewards) / total_test_episodes))
env.close()

# ## Questions
#
# - Is the obtained value function optimal? (compare with the value-iteration sketch below)
# - Is the obtained policy optimal?
# - When $\gamma$ goes to $1.0$, does the policy change?
# - Change the total number of training episodes.
# - Implement Double Q-learning ... (a possible update step is sketched below)
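# In[ ]:

# A minimal value-iteration sketch for the first question above (an addition,
# not part of the original lab). It computes the optimal value function V*
# from the transition model env.P that the unwrapped toy_text environment
# exposes: env.P[s][a] is a list of (probability, next_state, reward,
# terminated) tuples, and terminal states only self-loop with reward 0, so
# their value stays 0. Compare V* with the values learned by Q-learning.

def value_iteration(env, gamma, tol=1e-10):
    nS, nA = env.observation_space.n, env.action_space.n
    V = np.zeros(nS)
    while True:
        # Bellman optimality backup: V(s) = max_a sum_{s'} p(s'|s,a) [r + gamma V(s')]
        V_new = np.array([max(sum(p * (r + gamma * V[s2])
                                  for p, s2, r, _ in env.P[s][a])
                              for a in range(nA))
                          for s in range(nS)])
        if np.max(np.abs(V_new - V)) < tol:
            return V_new
        V = V_new

V_star = value_iteration(env, gamma_rate)
n = env.desc.shape[0]
print("V* from value iteration:\n", np.round(V_star.reshape(n, n), 3))
print("V from the learned Q-table:\n", np.round(np.max(qtable, axis=1).reshape(n, n), 3))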
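# In[ ]:

# A minimal sketch of the Double Q-learning update asked for in the last
# question (one possible implementation, not a reference solution). Two
# tables are kept; at each step a coin flip decides which one is updated,
# with the other table evaluating the greedy action. Decoupling action
# selection from evaluation reduces the maximization bias of plain Q-learning.

qA = np.zeros((state_size, action_size))
qB = np.zeros((state_size, action_size))

def double_q_step(state, action, reward, new_state, lr=learning_rate, gamma=gamma_rate):
    if random.uniform(0, 1) < 0.5:
        best = np.argmax(qA[new_state, :])             # qA selects the action,
        target = reward + gamma * qB[new_state, best]  # qB evaluates it
        qA[state, action] += lr * (target - qA[state, action])
    else:
        best = np.argmax(qB[new_state, :])             # and symmetrically
        target = reward + gamma * qA[new_state, best]
        qB[state, action] += lr * (target - qB[state, action])

# To use it, replace the single-table update in the training loop above by
# double_q_step(state, action, reward, new_state) and act epsilon-greedily
# on qA + qB instead of qtable.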
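# In[ ]:

# Optional check of the exploration schedule used in the training loop:
# epsilon(episode) = min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode).
# With the decay_rate = 0.00 chosen above the curve is flat at max_epsilon;
# the other decay rates plotted here are hypothetical values for comparison.

episodes = np.arange(total_episodes)
for dr in [0.00, 1e-4, 1e-3]:
    plt.plot(episodes,
             min_epsilon + (max_epsilon - min_epsilon) * np.exp(-dr * episodes),
             label="decay_rate = " + str(dr))
plt.xlabel("episode")
plt.ylabel("epsilon")
plt.legend()
plt.show()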