# -*- coding: utf-8 -*-
"""
Gradient-bandit (REINFORCE-style) learning on a stationary multi-armed bandit.

Created on Fri Feb 9 11:09:14 2024
@author: gturinici
"""
import numpy as np

try:
    from tqdm import tqdm  # optional progress bar
except ImportError:
    # Fallback: behave like a plain iterable when tqdm is not installed.
    def tqdm(iterable, **kwargs):
        return iterable


class MultiArmedBandit:
    """Stationary Gaussian bandit: arm ``a`` pays N(q*_a, 1), q*_a ~ N(0, 1)."""

    def __init__(self, num_actions):
        self.num_actions = num_actions
        # Hidden true mean reward of each arm, drawn once at construction.
        self.action_values = np.random.normal(loc=0, scale=1, size=num_actions)

    def choose_action(self, policy):
        """Sample an arm index according to the probability vector ``policy``."""
        return np.random.choice(np.arange(self.num_actions), p=policy)


def softmax(action_values):
    """Return the softmax probability distribution over ``action_values``.

    The max is subtracted before exponentiating for numerical stability.
    BUG FIX: the original subtracted in place (``-=``) and therefore mutated
    the caller's array on every call; this version leaves the input untouched.
    """
    shifted = action_values - np.max(action_values)
    exp_values = np.exp(shifted)
    return exp_values / np.sum(exp_values)


def policy_gradient(num_actions, num_episodes, learning_rate, Ntmax):
    """Run the gradient-bandit algorithm on a fresh random bandit.

    Parameters
    ----------
    num_actions : int
        Number of arms.
    num_episodes : int
        Number of independent episodes to run.
    learning_rate : float
        Step size alpha of the preference update.
    Ntmax : int
        Number of pulls per episode.

    Returns
    -------
    (rewards, max_rewards) : tuple of np.ndarray, each of shape (num_episodes,)
        Total reward collected in each episode, and the best achievable mean
        reward of the bandit (constant across episodes, repeated for plotting).
    """
    Id = np.eye(num_actions)
    bandit = MultiArmedBandit(num_actions)
    Ht = np.random.rand(num_actions)  # preference vector (softmax logits)
    rewards = []
    max_rewards = []
    for _ in tqdm(range(num_episodes)):
        episode_reward = 0.0
        # BUG FIX: the original iterated over the global ``Nt`` (defined only
        # under __main__), making the ``Ntmax`` parameter dead and the function
        # raise NameError when imported; use the parameter.
        for t in range(Ntmax):
            action = bandit.choose_action(softmax(Ht))
            reward = np.random.normal(loc=bandit.action_values[action], scale=1)
            episode_reward += reward
            # Gradient-bandit update (Sutton & Barto, eq. 2.12):
            #   H += alpha * (R - baseline) * (1{a=A} - pi)
            # BUG FIX: the original used the scalar ``softmax(Ht)[action]``,
            # which broadcast the chosen arm's probability to every component;
            # the update needs the full probability vector.
            baseline = episode_reward / (t + 1)  # running mean reward so far
            Ht += learning_rate * (reward - baseline) * (Id[action] - softmax(Ht))
        rewards.append(episode_reward)
        max_rewards.append(np.max(bandit.action_values))
    return np.array(rewards), np.array(max_rewards)


if __name__ == "__main__":
    # Plotting is only needed when run as a script, so import it here; this
    # keeps the module importable without matplotlib installed.
    import matplotlib.pyplot as plt

    num_actions = 5
    num_episodes = 100
    Nt = 500
    learning_rate = 0.01
    rewards, max_rewards = policy_gradient(num_actions, num_episodes,
                                           learning_rate, Nt)
    print("Average reward:", np.mean(rewards / max_rewards) / Nt)
    plt.figure('evolution relative reward')
    plt.plot(rewards / max_rewards / Nt, label='rel mean reward')
    plt.legend()
    plt.xlabel('episode no.')
    plt.show()  # BUG FIX: figure was never displayed in non-interactive runs