# -*- coding: utf-8 -*-
"""
Gradient-bandit (REINFORCE-style) learning on a stationary multi-armed bandit.

Created on Fri Feb 9 11:09:14 2024
@author: gturinici
"""
import numpy as np

try:
    from tqdm import tqdm  # optional progress bar
except ImportError:
    # Fallback: behave like a plain iterable when tqdm is not installed.
    def tqdm(iterable, **kwargs):
        return iterable


class MultiArmedBandit:
    """Stationary Gaussian bandit: arm ``a`` pays N(q*_a, 1), q*_a ~ N(0, 1)."""

    def __init__(self, num_actions):
        self.num_actions = num_actions
        # Hidden true mean reward of each arm, drawn once at construction.
        self.action_values = np.random.normal(loc=0, scale=1, size=num_actions)

    def choose_action(self, policy):
        """Sample an arm index according to the probability vector ``policy``."""
        return np.random.choice(np.arange(self.num_actions), p=policy)


def softmax(action_values):
    """Return the softmax probability distribution over ``action_values``.

    The max is subtracted before exponentiating for numerical stability.
    BUG FIX: the original subtracted in place (``-=``) and therefore mutated
    the caller's array on every call; this version leaves the input untouched.
    """
    shifted = action_values - np.max(action_values)
    exp_values = np.exp(shifted)
    return exp_values / np.sum(exp_values)


def policy_gradient(num_actions, num_episodes, learning_rate, Ntmax):
    """Run the gradient-bandit algorithm on a fresh random bandit.

    Parameters
    ----------
    num_actions : int
        Number of arms.
    num_episodes : int
        Number of independent episodes to run.
    learning_rate : float
        Step size alpha of the preference update.
    Ntmax : int
        Number of pulls per episode.

    Returns
    -------
    (rewards, max_rewards) : tuple of np.ndarray, each of shape (num_episodes,)
        Total reward collected in each episode, and the best achievable mean
        reward of the bandit (constant across episodes, repeated for plotting).
    """
    Id = np.eye(num_actions)
    bandit = MultiArmedBandit(num_actions)
    Ht = np.random.rand(num_actions)  # preference vector (softmax logits)
    rewards = []
    max_rewards = []
    for _ in tqdm(range(num_episodes)):
        episode_reward = 0.0
        # BUG FIX: the original iterated over the global ``Nt`` (defined only
        # under __main__), making the ``Ntmax`` parameter dead and the function
        # raise NameError when imported; use the parameter.
        for t in range(Ntmax):
            action = bandit.choose_action(softmax(Ht))
            reward = np.random.normal(loc=bandit.action_values[action], scale=1)
            episode_reward += reward
            # Gradient-bandit update (Sutton & Barto, eq. 2.12):
            #   H += alpha * (R - baseline) * (1{a=A} - pi)
            # BUG FIX: the original used the scalar ``softmax(Ht)[action]``,
            # which broadcast the chosen arm's probability to every component;
            # the update needs the full probability vector.
            baseline = episode_reward / (t + 1)  # running mean reward so far
            Ht += learning_rate * (reward - baseline) * (Id[action] - softmax(Ht))
        rewards.append(episode_reward)
        max_rewards.append(np.max(bandit.action_values))
    return np.array(rewards), np.array(max_rewards)


if __name__ == "__main__":
    # Plotting is only needed when run as a script, so import it here; this
    # keeps the module importable without matplotlib installed.
    import matplotlib.pyplot as plt

    num_actions = 5
    num_episodes = 100
    Nt = 500
    learning_rate = 0.01
    rewards, max_rewards = policy_gradient(num_actions, num_episodes,
                                           learning_rate, Nt)
    print("Average reward:", np.mean(rewards / max_rewards) / Nt)
    plt.figure('evolution relative reward')
    plt.plot(rewards / max_rewards / Nt, label='rel mean reward')
    plt.legend()
    plt.xlabel('episode no.')
    plt.show()  # BUG FIX: figure was never displayed in non-interactive runs