# -*- coding: utf-8 -*-
"""
Created on Thu Feb 29 22:53:13 2024

@author: turinici

Value iteration on a 3x4 grid world with skidding moves, two terminal
states (goal and bad state) and one obstacle cell.
"""

import numpy as np

# Define the grid size
n_rows = 3
n_cols = 4

# Define the reward values for each state
rewards = np.zeros((n_rows, n_cols))
rewards[0, 3] = 1  # reward for reaching the goal state
rewards[1, 3] = -1  # penalty for reaching the bad state

# Obstacle at index (1, 1), i.e. cell (2, 2) when counting rows/columns from 1
obstacle_row, obstacle_col = 1, 1
rewards[obstacle_row, obstacle_col] = 0  # no reward for obstacle

up, down, left, right = 0, 1, 2, 3

# Define the transition probabilities:
# transition_probs[i, j, a, k, l] = P(next state = (k, l) | state = (i, j), action = a)
skid_probability = 0.2  # Probability of skidding
transition_probs = np.zeros((n_rows, n_cols, 4, n_rows, n_cols))

# Terminal states and the obstacle have no outgoing transitions, so their
# rows stay at zero and their value remains 0.
absorbing_states = {(0, 3), (1, 3), (obstacle_row, obstacle_col)}

for i in range(n_rows):
    for j in range(n_cols):
        if (i, j) in absorbing_states:
            continue

        # Neighbouring cells, clipped at the border (bumping into a wall
        # leaves the agent where it is).
        north = (max(0, i - 1), j)
        south = (min(n_rows - 1, i + 1), j)
        west = (i, max(0, j - 1))
        east = (i, min(n_cols - 1, j + 1))

        # Each action reaches its intended cell with probability
        # 1 - skid_probability and skids to either perpendicular side
        # with probability skid_probability / 2 each.
        for action, target, skid_cells in (
            (up, north, (west, east)),
            (down, south, (west, east)),
            (left, west, (north, south)),
            (right, east, (north, south)),
        ):
            transition_probs[(i, j, action) + target] += 1 - skid_probability
            for skid_cell in skid_cells:
                transition_probs[(i, j, action) + skid_cell] += skid_probability / 2

# Adjust transition probabilities for transitions to the obstacle state:
# if we arrived at the obstacle, reassign this probability to staying at the
# initial state.
for a in range(4):
    for i in range(n_rows):
        for j in range(n_cols):
            transition_probs[i, j, a, i, j] += transition_probs[i, j, a, obstacle_row, obstacle_col]
            transition_probs[i, j, a, obstacle_row, obstacle_col] = 0

# Discount factor
gamma = 0.9


# Value iteration
def value_iteration(rewards, transition_probs, gamma, epsilon=1e-6):
    """Compute the optimal state values by value iteration.

    Each sweep applies the Bellman optimality backup

        V(s) = max_a sum_{s'} P(s' | s, a) * (R(s') + gamma * V(s'))

    to all states at once. States without outgoing transitions (all-zero
    probability rows) therefore keep the value 0.

    Parameters
    ----------
    rewards : array of shape (n_rows, n_cols)
        Reward collected when entering each state.
    transition_probs : array of shape (n_rows, n_cols, n_actions, n_rows, n_cols)
        transition_probs[i, j, a, k, l] is the probability of moving from
        state (i, j) to state (k, l) under action a.
    gamma : float
        Discount factor.
    epsilon : float, optional
        The iteration stops once the largest absolute change of any state
        value in one sweep drops below this threshold.

    Returns
    -------
    numpy.ndarray of shape (n_rows, n_cols)
        The converged state values.

    Notes
    -----
    The number of sweeps performed is stored in ``value_iteration.count``.
    """
    rewards = np.asarray(rewards, dtype=float)
    transition_probs = np.asarray(transition_probs, dtype=float)

    V = np.zeros(rewards.shape)  # Initialize values to 0
    count = 0
    while True:
        count += 1
        # q_values[i, j, a] = expected return of taking action a in (i, j)
        q_values = np.einsum("ijakl,kl->ija", transition_probs, rewards + gamma * V)
        new_V = q_values.max(axis=2)
        delta = np.max(np.abs(new_V - V))
        V = new_V
        if delta < epsilon:
            break
    value_iteration.count = count
    return V


optimal_values = value_iteration(rewards, transition_probs, gamma)
print("After", value_iteration.count,"steps optimal values:\n", np.round(optimal_values,2))

# Recorded results from the original author's runs. The step count depends on
# the sweep scheme and stopping rule, so the count printed by this
# implementation may differ from the one recorded here.
#
# Result for gamma=0.9 skid_probability=0.2, convention v(final)= reward
# Optimal values:
#  [[ 0.64496923  0.74438015  0.84776628  1.        ]
#  [ 0.56631443  0.          0.57185903 -1.        ]
#  [ 0.49068376  0.43084384  0.47547091  0.27729537]]
#
# Result for gamma=0.9 skid_probability=0.2, convention v(final)= 0
# After 25 steps optimal values:
#  [[0.72 0.83 0.94 0.  ]
#  [0.63 0.   0.64 0.  ]
#  [0.55 0.48 0.53 0.31]]