import numpy as np


class MultiArmedBandit:
    def __init__(self, num_actions):
        self.num_actions = num_actions
        # True mean reward of each arm, drawn from a standard normal.
        self.action_values = np.random.normal(loc=0.0, scale=1.0, size=num_actions)

    def choose_action(self, policy):
        # Sample an arm according to the probabilities produced by the policy.
        action_probs = policy(self.action_values)
        return np.random.choice(np.arange(self.num_actions), p=action_probs)


def softmax_policy(action_preferences):
    # Numerically stable softmax: subtracting the max avoids overflow in exp.
    exp_values = np.exp(action_preferences - np.max(action_preferences))
    return exp_values / np.sum(exp_values)


def policy_gradient(num_actions, num_episodes, learning_rate, gamma):
    bandit = MultiArmedBandit(num_actions)
    policy_params = np.random.rand(num_actions)
    rewards = []

    for _ in range(num_episodes):
        episode_reward = 0.0
        for step in range(1000):  # Episode length is capped at 1000 steps
            # The bandit's true action values are ignored here; actions are
            # drawn from the softmax over the learned policy parameters.
            action = bandit.choose_action(lambda _values: softmax_policy(policy_params))
            reward = np.random.normal(loc=bandit.action_values[action], scale=1.0)
            episode_reward += reward

            # REINFORCE gradient for a softmax policy:
            # d log pi(a) / d theta_i = 1{i == a} - pi(i)
            action_probs = softmax_policy(policy_params)
            gradient = -action_probs * reward
            gradient[action] += reward

            # Update policy parameters after each action, discounted by step.
            policy_params += learning_rate * gamma ** step * gradient
        rewards.append(episode_reward)

    return rewards


if __name__ == "__main__":
    num_actions = 5
    num_episodes = 1000
    learning_rate = 0.01
    gamma = 0.99
    rewards = policy_gradient(num_actions, num_episodes, learning_rate, gamma)
    print("Average reward:", np.mean(rewards))