# -*- coding: utf-8 -*-
"""DQN_Breakout_2023_ok_v2_4.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1sJQf2UcgBWp4Ev_WhsMvHFuY2LUtNl40

# DQN agent training on the Breakout Atari game, May 2023, by Gabriel Turinici

Many parts of the code use the implementation from
https://github.com/algila/Deep-Q-Learning-DQN. The code has been simplified
(for instance the logging is not present). A part has been added to play the
game using the trained network.

## Basic package setup
"""

!pip install gymnasium
!pip install gymnasium[atari]
!pip install gymnasium[accept-rom-license]

# v2 keeps working in TF 2.3.0
# v3 added DQN and used a FIFO for the REPLAY_MEMORY (now it is working very slowly)
# v4 used pre-processing 84x84x4 as for DeepMind and adapted the DQN to that input
# v5 improved ReplayMemory to use less RAM (derived from code by fg91)

import gymnasium as gym
import numpy as np
from numpy import asarray
import matplotlib.pyplot as plt
import datetime
from collections import deque
import time
import random
from tqdm import tqdm
from PIL import Image
import cv2
import os
import sys

import tensorflow as tf
from tensorflow import keras
import keras.backend as backend
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Activation, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.python.framework.ops import disable_eager_execution
from tensorflow.python.framework.ops import enable_eager_execution

from functools import partial
print = partial(print, flush=True)  # always flush the output

if tf.executing_eagerly():
    print('Eager execution is enabled')
else:
    print('Eager execution is disabled')

print(f'tensorflow version {tf.__version__}')
print(f'keras version {tf.keras.__version__}')

#env = gym.make("MountainCar-v0")
#env = gym.make("Breakout-v4")
env = gym.make("ALE/Breakout-v5", render_mode="rgb_array")
#env = gym.make("PongDeterministic-v4")
#env = gym.make("CartPole-v1")
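"""A quick sanity check (not part of the original notebook): gymnasium's `env.reset()`
returns an `(observation, info)` tuple and `env.step()` returns a 5-tuple
`(observation, reward, terminated, truncated, info)`. The training loop below relies
on exactly this interface, so printing the raw frame shape once helps catch API
mismatches early."""

_obs, _info = env.reset()
print(f'raw observation shape: {_obs.shape}, dtype: {_obs.dtype}')  # expected (210, 160, 3) uint8
_obs, _reward, _terminated, _truncated, _info = env.step(env.action_space.sample())
print(f'one step: reward={_reward}, terminated={_terminated}, truncated={_truncated}')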
"""## Set parameters"""

print('Set parameters')

DEBUG = False
SHOW_EVERY = 200

DISCOUNT = 0.99                 # Discount factor gamma used in the Q-learning update
LEARNING_RATE = 0.00025         # 0.00025 as in Mnih et al. 2015; Hessel et al. 2017 used 0.0000625;
                                # in Pong use 0.00025
REPLAY_MEMORY_SIZE = 100000     # How many last steps to keep in memory for model training
                                # (1000000 in the DeepMind setting)
MIN_REPLAY_MEMORY_SIZE = 5000   # Minimum number of random steps before training from the memory starts
                                # (50000 in the DeepMind setting). This is also the number of completely
                                # random actions before the agent starts learning.
MAX_FRAMES = 25000000           # Total number of frames the agent sees during training
                                # (50 million in the original DeepMind setting)
MINIBATCH_SIZE = 32             # How many steps (samples) to use for training
UPDATE_TARGET_MODEL = 100       # Number of chosen actions between updates of the target network
                                # (10000 in the DeepMind setting). According to Mnih et al. 2015 this is
                                # measured in the number of parameter updates (every four actions);
                                # however, in the DeepMind code it is clearly measured in the number of
                                # actions the agent chooses.
UPDATE_MODEL = 4
MODEL_NAME = 'DQN_DeepMind'
MIN_REWARD = -200               # For model save

#EPISODES = 20000               # number of matches played in total during training
EPISODES = 2000                 # number of matches played in total during training
SAVE_EPISODE_EVERY = int(EPISODES/10)  # save the model 10 times over the whole run

# Exploration annealing settings
epsilon = 1                     # not a constant, going to be decayed
MIN_EPSILON = 0.01
MAX_EPSILON = 1.00
MID_EPSILON = 0.1
EPSILON_DECAY_1 = (MAX_EPSILON-MID_EPSILON)/(REPLAY_MEMORY_SIZE-MIN_REPLAY_MEMORY_SIZE)
EPSILON_DECAY_2 = (MID_EPSILON-MIN_EPSILON)/(MAX_FRAMES-REPLAY_MEMORY_SIZE)

# Stats settings
AGGREGATE_STATS_EVERY = 100     # episodes/matches, used to print on the terminal
ep_rewards = []
aggr_ep_rewards = {'ep': [], 'avg': [], 'max': [], 'min': []}

"""## Define classes"""

class DQNAgent():
    def __init__(self):
        # Main model
        self.model = self.create_model()
        print(self.model.summary())

        # Target network
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        # Used to count when to update the target network with the main network's weights
        self.target_update_counter = 0

    def create_model(self):
        model = Sequential()
        model.add(Conv2D(32, (8, 8), input_shape=[84, 84, 4], strides=4,
                         kernel_initializer=keras.initializers.VarianceScaling(scale=2.0)))  # 4-frame greyscale 84x84 input
        model.add(Activation('relu'))
        #model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.4))

        model.add(Conv2D(64, (4, 4), strides=2,
                         kernel_initializer=keras.initializers.VarianceScaling(scale=2.0)))
        model.add(Activation('relu'))
        #model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.2))

        model.add(Conv2D(64, (3, 3), strides=1,
                         kernel_initializer=keras.initializers.VarianceScaling(scale=2.0)))
        model.add(Activation('relu'))
        #model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.2))

        model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
        model.add(Dense(512, kernel_initializer=keras.initializers.VarianceScaling(scale=2.0),
                        activation='relu'))
        model.add(Dense(env.action_space.n, activation='linear',
                        kernel_initializer=keras.initializers.VarianceScaling(scale=2.0)))  # one linear output per action

        #model.compile(loss="mse", optimizer=Adam(lr=0.00025), metrics=['accuracy'])
        model.compile(loss=tf.keras.losses.Huber(), optimizer=Adam(learning_rate=LEARNING_RATE),
                      metrics=['accuracy'])
        return model

    # Trains the main network on a minibatch sampled from the replay memory
    def train(self, minibatch):
        # Get current states from the minibatch, then query the main network for Q values
        current_states = np.array(minibatch[0])/255
        current_qs_list = self.model.predict(current_states, verbose=0)

        # Get future states from the minibatch, then query the target network for Q values
        new_current_states = np.array(minibatch[3])/255
        future_qs_list = self.target_model.predict(new_current_states, verbose=0)

        X = []
        Y = []

        # Now we need to enumerate our batch
        for ii in range(MINIBATCH_SIZE):
            current_state = minibatch[0][ii]
            action = minibatch[1][ii]
            reward = minibatch[2][ii]
            new_current_state = minibatch[3][ii]
            done = minibatch[4][ii]

            # Bellman equation: Q = r + gamma * max Q'.
            # If not a terminal state, bootstrap from the target network's value of the
            # next state; otherwise the target is just the reward.
            if not done:
                new_q = reward + DISCOUNT * np.max(future_qs_list[ii])
            else:
                new_q = reward

            # Update the Q value for the taken action
            current_qs = current_qs_list[ii]
            current_qs[action] = new_q

            # And append to our training data
            X.append(current_state)
            Y.append(current_qs)

        # Fit on all samples as one batch, no log file saved (quicker simulation).
        # Note: no need to shuffle, this is already done in get_minibatch.
        # The inputs are divided by 255 so that training uses the same scaling as prediction.
        self.model.fit(np.array(X)/255, np.array(Y), batch_size=MINIBATCH_SIZE,
                       verbose=0, epochs=1, shuffle=False)

    # Copy the main network's weights into the target network
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())
        self.target_update_counter = 0

    # Queries the main network for Q values given the current observation (environment state)
    def get_qs(self, state):
        return self.model.predict(np.array(state).reshape(-1, *state.shape)/255, verbose=0)[0]
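"""A minimal vectorized sketch (not part of the original notebook) of the Q-learning
target used in `DQNAgent.train` above: for each transition the target is
r + gamma * max_a' Q_target(s', a'), except on terminal transitions where it is just r.
The helper and array names below are illustrative only."""

def bellman_targets(rewards, future_qs, dones, discount=DISCOUNT):
    """rewards: (B,), future_qs: (B, n_actions) from the target network, dones: (B,) bool."""
    return rewards + discount * np.max(future_qs, axis=1) * (1.0 - dones.astype(np.float32))

# Example with dummy numbers: the terminal transition keeps only its reward.
_r = np.array([1.0, 0.0], dtype=np.float32)
_q = np.array([[0.2, 0.5], [0.1, 0.4]], dtype=np.float32)
_d = np.array([False, True])
print(bellman_targets(_r, _q, _d))  # -> [1 + DISCOUNT*0.5, 0]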
"""## Replay Memory class"""

class ReplayMemory(object):
    # derived from https://github.com/fg91/Deep-Q-Learning/blob/master/DQN.ipynb
    """Replay Memory that stores the last "size" transitions"""

    def __init__(self, size=1000, frame_height=84, frame_width=84,
                 agent_history_length=4, batch_size=32):
        """
        Args:
            size: Integer, number of stored transitions
            frame_height: Integer, height of a frame of an Atari game
            frame_width: Integer, width of a frame of an Atari game
            agent_history_length: Integer, number of frames stacked together to create a state
            batch_size: Integer, number of transitions returned in a minibatch
        """
        self.size = size
        self.frame_height = frame_height
        self.frame_width = frame_width
        self.agent_history_length = agent_history_length
        self.batch_size = batch_size
        self.count = 0
        self.current = 0

        # Pre-allocate memory
        self.actions = np.empty(self.size, dtype=np.int32)
        self.rewards = np.empty(self.size, dtype=np.float32)
        self.frames = np.empty((self.size, self.frame_height, self.frame_width), dtype=np.uint8)
        self.terminal_flags = np.empty(self.size, dtype=bool)

        # Pre-allocate memory for the states and new_states in a minibatch
        self.states = np.empty((self.batch_size, self.agent_history_length,
                                self.frame_height, self.frame_width), dtype=np.uint8)
        self.new_states = np.empty((self.batch_size, self.agent_history_length,
                                    self.frame_height, self.frame_width), dtype=np.uint8)
        self.indices = np.empty(self.batch_size, dtype=np.int32)

    def add_experience(self, action, frame, reward, terminal):
        """
        Args:
            action: An integer between 0 and env.action_space.n - 1 determining the
                action the agent performed [a_t]
            frame: An (84, 84) grayscale frame of an Atari game reached due to the action [s_t+1]
            reward: A float determining the reward the agent received for performing the action [r_t]
            terminal: A bool stating whether the episode terminated
        """
        if frame.shape != (self.frame_height, self.frame_width):
            raise ValueError('Dimension of frame is wrong!')
        self.actions[self.current] = action
        self.frames[self.current, ...] = frame
        self.rewards[self.current] = reward
        self.terminal_flags[self.current] = terminal
        self.count = max(self.count, self.current + 1)
        self.current = (self.current + 1) % self.size  # overwrite the replay buffer if necessary

    def _get_state(self, index):
        if self.count == 0:
            raise ValueError("The replay memory is empty!")
        if index < self.agent_history_length - 1:
            raise ValueError("Index must be min 3")
        return self.frames[index - self.agent_history_length + 1:index + 1, ...]

    def _get_valid_indices(self):
        """
        We store all frames the agent sees in self.frames. When a game terminates
        (terminal=True) at index i, the frame at index i belongs to a different episode
        than the frame at i+1. We want to avoid creating a state with frames from two
        different episodes. Finally we need to make sure that an index is not smaller
        than the number of frames stacked together to create a state
        (self.agent_history_length=4), so that a state and new_state can be sliced out
        of the array.
        """
        for i in range(self.batch_size):
            while True:
                index = random.randint(self.agent_history_length, self.count - 1)
                if index < self.agent_history_length:
                    continue
                if index >= self.current and index - self.agent_history_length <= self.current:
                    continue
                if self.terminal_flags[index - self.agent_history_length:index].any():
                    continue
                break
            self.indices[i] = index

    def get_minibatch(self):
        """
        Returns a minibatch of self.batch_size = 32 transitions
        """
        if self.count < self.agent_history_length:
            raise ValueError('Not enough memories to get a minibatch')

        self._get_valid_indices()

        for i, idx in enumerate(self.indices):
            self.states[i] = self._get_state(idx - 1)
            self.new_states[i] = self._get_state(idx)

        minibatch = (np.transpose(self.states, axes=(0, 2, 3, 1)),
                     self.actions[self.indices],
                     self.rewards[self.indices],
                     np.transpose(self.new_states, axes=(0, 2, 3, 1)),
                     self.terminal_flags[self.indices])
        return minibatch
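"""A small usage sketch of `ReplayMemory` (not part of the original notebook): fill a
tiny buffer with random dummy frames, draw a minibatch and inspect the shapes that the
training code expects. All values below are illustrative only."""

_demo_memory = ReplayMemory(size=200, batch_size=8)
for _t in range(100):
    _dummy_frame = np.random.randint(0, 256, size=(84, 84), dtype=np.uint8)
    _demo_memory.add_experience(action=np.random.randint(0, 4),
                                frame=_dummy_frame,
                                reward=0.0,
                                terminal=(_t % 50 == 49))  # fake episode end every 50 steps
_states, _actions, _rewards, _new_states, _dones = _demo_memory.get_minibatch()
print(_states.shape, _new_states.shape)               # (8, 84, 84, 4) each
print(_actions.shape, _rewards.shape, _dones.shape)   # (8,) each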
"""## Pre-processing utility"""

def pre_processing(frame):
    # Single-frame processor: from 210x160x3 RGB to 84x84 grayscale
    frame_gray = np.dot(frame, [0.299, 0.587, 0.114])   # 210x160, converted to gray scale
    #plt.imshow(np.array(np.squeeze(frame_gray)), cmap='gray')
    #plt.show()
    frame_gray = frame_gray[31:195, 0:160]  # crop off the upper score area (31 lines) and the black area below (15 lines)
    #resized_img0 = Image.fromarray(frame_gray).resize(size=(84, 84), resample=Image.BILINEAR)  # 84x84
    resized_img0 = Image.fromarray(frame_gray).resize(size=(84, 84), resample=Image.NEAREST)   # 84x84
    #plt.imshow(np.array(np.squeeze(resized_img0)), cmap='gray')
    #plt.show()
    return asarray(resized_img0, dtype=np.uint8)
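"""A quick shape check for `pre_processing` (not part of the original notebook): run it
on a synthetic 210x160x3 frame and confirm it yields the 84x84 uint8 image that the
replay memory expects."""

_fake_raw_frame = np.random.randint(0, 256, size=(210, 160, 3), dtype=np.uint8)
_processed = pre_processing(_fake_raw_frame)
print(_processed.shape, _processed.dtype)  # expected: (84, 84) uint8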
"""## Main training loop"""

print('main training loop')

# Create models folder
if not os.path.isdir('models'):
    os.makedirs('models')

agent = DQNAgent()
print("The environment has the following {} actions: {}".format(
    env.action_space.n, env.unwrapped.get_action_meanings()))

my_replay_memory = ReplayMemory(size=REPLAY_MEMORY_SIZE, batch_size=MINIBATCH_SIZE)  # (★)

frame_number = 0  # total number of steps conducted from the first match to the last

# Iterate over episodes (1 episode = 1 match)
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):

    # Restarting episode - reset episode reward and step number
    episode_reward = 0
    step = 1

    # Reset environment and get the initial state.
    # NEW VERSION: take care, env.reset() now returns an (observation, info) tuple
    current_frame, _ = env.reset()
    current_frame = pre_processing(current_frame)  # single frame 84x84
    current_state = np.dstack((current_frame, current_frame, current_frame, current_frame))  # 84x84x4 state made of 4 stacked frames

    # Reset flag and start iterating until the episode ends
    done = False
    while not done:
        # In the environment MountainCar-v0, done=True after 200 steps.
        # In Breakout, done=True after all lives are lost (number of lives = 5).
        # Loop outline: choose an action (explore/exploit), step the environment,
        # store the transition, train the main network, update the target network.

        # Exploration-exploitation trade-off, after a number of steps with completely random actions
        if np.random.random() > epsilon and frame_number > MIN_REPLAY_MEMORY_SIZE:
            # Get action from the DQN model: exploit
            action = np.argmax(agent.get_qs(current_state))
        else:
            # Get random action: explore
            action = np.random.randint(0, env.action_space.n)

        # Take the new action.
        # NEW VERSION: env.step() returns (obs, reward, terminated, truncated, info)
        new_frame, reward, done1, done2, _ = env.step(action)
        done = done1 or done2
        reward = np.clip(reward, -1, 1)  # clip reward to be between -1 and 1

        # Set and preprocess the new state
        new_frame = pre_processing(new_frame)  # single frame 84x84, preprocessed
        new_state = np.dstack((new_frame, current_state[:, :, 0], current_state[:, :, 1],
                               current_state[:, :, 2]))  # 84x84x4 state made of 4 stacked frames

        episode_reward += reward

        if episode % SHOW_EVERY == 0:  # render one match every SHOW_EVERY episodes
            #time.sleep(0.01)
            #print(f'Step: {step}')
            env.render()

        # Every step/frame we update the replay memory with the action, new frame and reward due to the action
        my_replay_memory.add_experience(action, new_frame, reward, done)

        # Every step we decide whether to train the main network and/or to update the weights of the target network
        if frame_number % UPDATE_MODEL == 0 and frame_number > MIN_REPLAY_MEMORY_SIZE:
            # model update every UPDATE_MODEL frames/actions
            # Get a minibatch of random samples from the replay memory
            minibatch = my_replay_memory.get_minibatch()
            agent.train(minibatch)

            if DEBUG:  # plot the minibatch used to train (only 1 image out of the 4 in each state)
                fig = plt.figure(figsize=(8*4, 4*4))
                for i in range(my_replay_memory.batch_size):
                    plt.subplot(4, 8, i + 1)
                    plt.imshow(minibatch[0][i, :, :, 0], cmap='jet')
                plt.savefig('minibatch'+str(frame_number)+'.jpg')
                plt.show()
            del minibatch

        if frame_number % UPDATE_TARGET_MODEL == 0 and frame_number > MIN_REPLAY_MEMORY_SIZE:
            # target model update every UPDATE_TARGET_MODEL frames/actions
            agent.update_target_model()  # (9★)

        # Set the new state (9*)
        current_state = new_state
        step += 1
        frame_number += 1

    # Append episode reward to a list and log stats (every given number of episodes)
    ep_rewards.append(episode_reward)
    if episode % 10 == 0 and episode > AGGREGATE_STATS_EVERY:
        average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:]) / len(ep_rewards[-AGGREGATE_STATS_EVERY:])
        min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
        max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
        aggr_ep_rewards['ep'].append(episode)
        aggr_ep_rewards['avg'].append(average_reward)
        aggr_ep_rewards['max'].append(max_reward)
        aggr_ep_rewards['min'].append(min_reward)
        print(f'Episode:{episode:>5d}, frame_number:{frame_number:>7d}, ' +
              f'avg rew:{average_reward:>4.1f}, ' +
              f'max rew:{max_reward:>4.1f}, ' +
              f'min rew:{min_reward:>4.1f}, ' +
              f'current epsilon:{epsilon:>1.5f}')

    # Decay epsilon. Only start after the replay memory is over the minimum size.
    if frame_number > MIN_REPLAY_MEMORY_SIZE:
        if frame_number < REPLAY_MEMORY_SIZE:
            epsilon = MID_EPSILON + EPSILON_DECAY_1*(REPLAY_MEMORY_SIZE-frame_number)
        else:
            epsilon = MIN_EPSILON + EPSILON_DECAY_2*(MAX_FRAMES-frame_number)
        epsilon = np.clip(epsilon, MIN_EPSILON, MAX_EPSILON)

    if episode % SAVE_EPISODE_EVERY == 0 and episode > AGGREGATE_STATS_EVERY:
        agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}' +
                         f'max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}' +
                         f'min__{int(time.time())}.model')
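"""A worked illustration (not part of the original notebook) of the piecewise-linear
epsilon schedule applied in the loop above: epsilon stays at MAX_EPSILON during the
purely random warm-up, decays linearly to MID_EPSILON while the replay memory fills up,
then decays slowly to MIN_EPSILON over the remaining frames. The helper below is
illustrative only."""

def epsilon_at(frame):
    if frame <= MIN_REPLAY_MEMORY_SIZE:
        return MAX_EPSILON
    if frame < REPLAY_MEMORY_SIZE:
        return MID_EPSILON + EPSILON_DECAY_1*(REPLAY_MEMORY_SIZE-frame)
    return float(np.clip(MIN_EPSILON + EPSILON_DECAY_2*(MAX_FRAMES-frame), MIN_EPSILON, MAX_EPSILON))

for _f in [0, MIN_REPLAY_MEMORY_SIZE, REPLAY_MEMORY_SIZE, MAX_FRAMES//2, MAX_FRAMES]:
    print(f'frame {_f:>9d} -> epsilon {epsilon_at(_f):.3f}')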
env.close()

# Save the final model
agent.model.save(f'models/{MODEL_NAME}__' +
                 f'{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_' +
                 f'{min_reward:_>7.2f}min__{int(time.time())}.model')

plt.figure('stats')
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['avg'], label="average rewards")
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['max'], label="max rewards")
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['min'], label="min rewards")
plt.legend(loc=2)
plt.grid(True)
plt.savefig('grid.jpg')
plt.show()

"""# Debug and test the results

### Plot some mini-batch at random
"""

print('debug and test results')

if(False):
    print('plot some mini-batch at random')
    minibatch = my_replay_memory.get_minibatch()
    no_frames = my_replay_memory.agent_history_length
    fig = plt.figure(figsize=(MINIBATCH_SIZE*4, no_frames*4))
    for ii in range(no_frames):
        for iii in range(MINIBATCH_SIZE):
            plt.subplot(my_replay_memory.agent_history_length, MINIBATCH_SIZE, iii+ii*MINIBATCH_SIZE+1)
            # images are in minibatch[0], stored as an ndarray of shape (32, 84, 84, 4)
            plt.imshow(minibatch[0][iii, :, :, ii], cmap='jet')
            plt.axis('off')
    plt.show()

env_test = gym.make("ALE/Breakout-v5", render_mode="rgb_array")
action_meanings = env_test.unwrapped.get_action_meanings()

if(False):
    print('load model from directory model_bak')
    loaded_model = keras.models.load_model('model_bak')
    agent.model.set_weights(loaded_model.get_weights())

print('play game and draw results')
import matplotlib.animation as animation

images = []  # List to store the generated images

current_frame, _ = env_test.reset()
current_frame_pre = pre_processing(current_frame)  # single frame 84x84

# initialize the 84x84x4 stacked state with the first frames of the match
current_states = np.zeros(current_frame_pre.shape + (4,))
current_states[:, :, 0] = current_frame_pre.copy()
for jj in range(3):
    # take a few random actions to fill the frame stack
    new_frame, reward, done1, done2, _ = env_test.step(env_test.action_space.sample())
    current_states[:, :, jj+1] = pre_processing(new_frame)

plt.figure('play')
done = False
count = 0
max_frames_play = 20
while (not done) and (count < max_frames_play):
    # greedy action from the trained network on the current stacked state
    action = np.argmax(agent.get_qs(current_states))
    new_frame, reward, done1, done2, _ = env_test.step(action)
    done = done1 or done2
    # shift the frame stack: newest frame first, as in the training loop
    current_states = np.dstack((pre_processing(new_frame), current_states[:, :, 0],
                                current_states[:, :, 1], current_states[:, :, 2]))
    images.append(env_test.render())  # rgb_array frame, stored for later display
    count += 1
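"""One possible way (not part of the original notebook) to turn the collected
rgb_array frames into an animation with `matplotlib.animation`, imported above.
The output filename is arbitrary and GIF writing requires Pillow (already installed
with the PIL import above)."""

fig_anim = plt.figure('play_animation')
plt.axis('off')
_artists = [[plt.imshow(img, animated=True)] for img in images]
anim = animation.ArtistAnimation(fig_anim, _artists, interval=100, blit=True)
anim.save('breakout_play.gif', writer='pillow')
plt.show()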