# -*- coding: utf-8 -*-
"""
Created on Thu Mar 13 21:29:53 2025

@author: G. Turinici

Multi-Armed Bandit Simulation Game

This script simulates a simple multi-armed bandit (MAB) problem where the user
interacts with a bandit that has multiple arms, each with an unknown true mean
reward. The goal is for the user to explore the different arms, collect
rewards, and potentially discover the best-performing arm.

The game consists of two main functions:

1. initialize_bandit(k):
   Initializes the bandit environment with k arms. Each arm has a randomly
   assigned true mean reward, and the game tracks how many times each arm has
   been chosen, as well as the accumulated rewards.

2. play_bandit(k):
   Main function that simulates playing the bandit game. The user chooses an
   arm to pull or opts to quit. For each choice, a reward is drawn from a
   normal distribution with the arm's true mean and a fixed standard
   deviation. The game continues until the user chooses to quit, at which
   point the true mean values for each arm and the best-performing arm are
   revealed.

The bandit arms' true mean rewards are randomly initialized from a standard
normal distribution, and rewards for the chosen arms are sampled from a normal
distribution with a fixed standard deviation (3 by default).

Usage:
- The user will be prompted to input the number of arms (k).
- The game will display the current state of each arm (number of times
  chosen, average reward).
- The user will choose an arm or quit the game ('q', case-insensitive;
  end-of-input is treated as quitting as well).
- When quitting, the total number of choices made and the best-performing arm
  (based on the highest true mean reward) are displayed.

Example:
    Enter the number of arms (k): 5
    Choose an arm (1-5) or press 'q' to quit:

This script provides a simple interactive experience to understand the
exploration-exploitation trade-off in multi-armed bandit problems.
"""

import numpy as np


def initialize_bandit(k):
    """
    Initialize the multi-armed bandit environment.

    This function sets up the initial state of the bandit game by:
    - Creating an array to track the number of times each arm has been
      selected.
    - Creating an array to track the total reward received from each arm.
    - Randomly drawing the true mean reward of each arm from a standard
      normal distribution.

    Parameters:
        k (int): The number of arms in the bandit.

    Returns:
        tuple: A tuple containing three numpy arrays of length k:
            - num_chosen (numpy.ndarray): Pull count per arm, all zeros.
            - total_reward (numpy.ndarray): Accumulated reward per arm,
              all zeros.
            - arm_means (numpy.ndarray): The true mean reward of each arm.
    """
    # Per-arm statistics start at zero (float arrays, as produced by np.zeros).
    num_chosen = np.zeros(k)
    total_reward = np.zeros(k)

    # True (hidden) mean reward of each arm, drawn from N(0, 1).
    arm_means = np.random.normal(loc=0, scale=1, size=k)

    return num_chosen, total_reward, arm_means


def _average_reward(total, count):
    """Return total / count, or 0.0 for an arm that was never pulled."""
    # An explicit guard gives the exact empirical mean; adding a small epsilon
    # to the denominator would slightly bias every displayed average.
    return total / count if count > 0 else 0.0


def _print_state(num_chosen, total_reward):
    """Print the table of pull counts and average rewards, one row per arm."""
    print("\nBandit state:")
    print("Arm\t|\tTimes\t|\tAverage")
    print("\t|\tChosen\t|\tReward")
    print("---------------------------")
    for arm, (count, total) in enumerate(zip(num_chosen, total_reward), start=1):
        average = _average_reward(total, count)
        print(f"{arm}\t|\t{int(count)}\t\t|\t{average:.2f}")
    print("---------------------------")


def _print_summary(total_choices, arm_means):
    """Print the end-of-game summary: pull count, true means and best arm."""
    print(f"Total number of choices: {total_choices}")
    print("True mean values for each arm:")
    for arm, mean in enumerate(arm_means, start=1):
        print(f"Arm {arm}: {mean:.2f}")

    # Arms are shown to the user 1-based, hence the +1 on the argmax index.
    best_arm = int(np.argmax(arm_means)) + 1
    print(f"The best arm is: {best_arm}")


def play_bandit(k, reward_std=3.0):
    """
    Simulate playing the multi-armed bandit game.

    This function manages the interaction between the user and the bandit
    game, where:
    - The user can choose an arm or quit the game by entering 'q'
      (case-insensitive, surrounding whitespace ignored). End-of-input on
      stdin is treated as a request to quit.
    - The chosen arm generates a reward sampled from a normal distribution
      with the arm's true mean reward and standard deviation `reward_std`.
    - The game keeps track of the number of times each arm is chosen and the
      accumulated rewards.
    - When the user quits, the game displays the total number of choices made,
      the true mean reward of each arm and the arm with the highest true mean.

    Parameters:
        k (int): The number of arms in the bandit. Must be at least 1.
        reward_std (float): Standard deviation of the reward noise.
            Defaults to 3.0.

    Raises:
        ValueError: If k is smaller than 1.
    """
    # Fail fast: with no arms there is nothing to choose and the final
    # "best arm" lookup would fail on an empty array.
    if k < 1:
        raise ValueError("The number of arms k must be at least 1.")

    num_chosen, total_reward, arm_means = initialize_bandit(k)
    total_choices = 0

    while True:
        _print_state(num_chosen, total_reward)

        # Prompt the user to choose an arm or quit.
        try:
            answer = input(f"\nChoose an arm (1-{k}) or press 'q' to quit: ")
        except EOFError:
            # Closed or exhausted stdin: end the game instead of crashing.
            answer = 'q'
        answer = answer.strip()

        if answer.lower() == 'q':
            _print_summary(total_choices, arm_means)
            break

        # Convert input to an integer (check for valid input).
        try:
            choice = int(answer)
        except ValueError:
            print("Invalid choice. Please enter an integer or 'q' to quit.")
            continue

        if choice < 1 or choice > k:
            print(f"Invalid choice. Please choose an arm between 1 and {k}.")
            continue

        # Record the pull (user-facing arm numbers are 1-based).
        arm_index = choice - 1
        num_chosen[arm_index] += 1
        total_choices += 1

        # Draw the reward from N(arm_means[arm_index], reward_std**2).
        reward = np.random.normal(loc=arm_means[arm_index], scale=reward_std)
        print(f"Chosen arm={choice}, reward={reward:.3f}.")

        total_reward[arm_index] += reward


def main():
    """
    Entry point for the script.

    Prompts the user for the number of arms until a positive integer is
    entered, then starts the game by calling play_bandit. Returns without
    playing if stdin is closed before a valid number is given.
    """
    while True:
        try:
            k = int(input("Enter the number of arms (k): "))
        except EOFError:
            return
        except ValueError:
            print("Invalid number of arms. Please enter a positive integer.")
            continue
        if k < 1:
            print("Invalid number of arms. Please enter a positive integer.")
            continue
        break

    play_bandit(k)


if __name__ == "__main__":
    main()