# -*- coding: utf-8 -*-
"""Q-Learning-FrozenLake_v3.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1tuAm79lVLp6s6_eCNW3DyH4ohTpWI3sQ

# Q* Learning - FrozenLake
<br> 
In this Notebook, we'll implement an agent <b>that plays FrozenLake.</b>
<img src="https://www.gymlibrary.ml/_images/frozen_lake.gif" alt="Frozen Lake"/>

The goal of this game is <b>to go from the starting state (S) to the goal state (G)</b> by walking only on frozen tiles (F) and avoid holes (H).However, the ice is slippery, <b>so you won't always move in the direction you intend (stochastic environment)</b>


Reference: code adapted from 
https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Q%20learning/FrozenLake/Q%20Learning%20with%20FrozenLake.ipynb
"""

import numpy as np
import gym #to load the FrozenLake Environment
import random
import matplotlib.pyplot as plt

env = gym.make("FrozenLake-v0")#this command creates the FrozenLake environment using the gym library of environments
#is_slippery=False

env.reward_range
env.metadata
env.spec
env.action_space.contains(2)
#env.desc

env.reset()

"""## Step 2: Create the Q-table and initialize it """

action_size = env.action_space.n#how many actions there are
state_size = env.observation_space.n#how many states there are
qtable = np.zeros((state_size, action_size))
print(qtable)

"""## Set the hyperparameters"""

total_episodes = 50        # Total episodes
learning_rate = 0.8           # Learning rate
max_steps = 99                # Max steps per episode
gamma_rate = 0.95                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.005             # Exponential decay rate for exploration prob

"""## Step 4: The Q learning algorithm
- Now we implement the Q learning algorithm:
<img src="https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Q%20learning/FrozenLake/qtable_algo.png?raw=1" alt="Q algo"/>
"""

# List of rewards
rewards = []

for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        # epsilon-greedy strategy
        exploration_or_exploitation = random.uniform(0, 1)
        
        ## If this number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
        if exploration_or_exploitation > epsilon:
            action = np.argmax(qtable[state,:])

        # Else doing a random choice --> exploration
        else:
            action = env.action_space.sample()

        # Take the action (a) and observe the outcome state(s') and reward (r)
        new_state, reward, done, info = env.step(action)

        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        # qtable[new_state,:] : all the actions we can take from new state
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma_rate * np.max(qtable[new_state, :]) - qtable[state, action])
        
        total_rewards += reward
        
        # Our new state is state
        state = new_state
        
        # If done (if we're dead) : finish episode
        if done == True: 
            break
        
    # Schedule to reduce epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 
    rewards.append(total_rewards)

print ("Score over time: " +  str(sum(rewards)/total_episodes))
print(qtable)

# visualisation de la table Q pour FrozenLake 
def plotQ(q_table,env):
  MAP=env.desc
  map_size=MAP.shape[0]
  best_value = np.max(q_table, axis = 1).reshape((map_size,map_size))
  best_policy = np.argmax(q_table, axis = 1).reshape((map_size,map_size))
    
  fig, ax = plt.subplots()
  im = ax.imshow(best_value,cmap=plt.cm.Wistia)#Pastel1, spring, autumn
  arrow_list=['<','v','>','^']
  for i in range(best_value.shape[0]):
      for j in range(best_value.shape[1]):
          if MAP[i][j].decode('utf-8') in 'GH':#terminal states
            arrow = MAP[i][j].decode('utf-8')
          else :
            arrow=arrow_list[best_policy[i, j]]
          if MAP[i][j].decode('utf-8') in 'S':
              arrow = 'S ' + arrow
          text = ax.text(j, i, arrow, ha = "center", va = "center",
                         color = "black")
          text2 = ax.text(j, i+0.2, str(np.round(best_value[i,j],2)), 
                          ha = "center", va = "center",color = "blue")
          
            
  cbar = ax.figure.colorbar(im, ax = ax)
  plt.axis('off')  
  fig.tight_layout()
  plt.show() 
plotQ(qtable,env)

#print(qtable)
#np.max(qtable, axis = 1).reshape((4,4))
#look at https://gsverhoeven.github.io/post/frozenlake-qlearning-convergence/ for the optimal value function

"""## Step 5: Use the Q-table to play FrozenLake
- After 10 000 episodes, our Q-table can be used as a "cheatsheet" to play FrozenLake"
- By running this cell you can see our agent playing FrozenLake.
"""

env.reset()
rewards = []
total_test_episodes=10

for episode in range(total_test_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0

    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])
        #env.render()
        new_state, reward, done, info = env.step(action)
        total_rewards +=reward        
        if done:
            # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            env.render()
            
            # We print the number of step it took.
            print("Number of steps", step)
            break
        state = new_state
    rewards.append(total_rewards)

print ("Score over time: " +  str(sum(rewards)/total_test_episodes))

env.close()

"""

```
# Ce texte est au format code
```
##Questions

- is the obtained value function optimal ?
- is the obtained policy optimal ?
- when $\gamma$ goes to $1.0$ do the policy changes ? 
- change the total number of training episodes 

"""