Nov 2022
import gym
import pickle
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers, optimizers
from IPython.display import HTML, display
from PIL import Image
from pathlib import Path
from collections import deque
from statistics import mean
seed = 111
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)
# For Kaggle
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"
# Lunar Lander environment initialization
env = gym.make("LunarLander-v2", render_mode='rgb_array')
obs, info = env.reset(seed=seed)
image = env.render()
plt.imshow(image)
I chose to play with the LunarLander environment from OpenAI Gym.
In this environment, the landing pad is always at coordinates (0, 0), and those coordinates are the first two numbers in the state vector. The reward for moving from the top of the screen to the landing pad with zero speed is about 100-140 points; if the lander moves away from the landing pad, it loses that reward again. An episode finishes when the lander crashes or comes to rest, receiving an additional -100 or +100 points respectively. Each leg in contact with the ground is worth +10 points, and firing the main engine costs 0.3 points per frame. The environment counts as solved at 200 points. Landing outside the landing pad is possible, and fuel is infinite, so an agent can learn to fly and then land on its first attempt. Four discrete actions are available: do nothing, fire the left orientation engine, fire the main engine, and fire the right orientation engine.
I evaluate different reinforcement learning methods over the course of this journey.
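Before training anything, a quick sanity check makes that description concrete. The snippet below is a small sketch I added (it assumes the Gym 0.26+ API, where reset returns an (observation, info) pair and step returns five values): it prints the observation and action spaces and a few random transitions.
print(env.observation_space)  # 8 continuous values: x, y, vx, vy, angle, angular velocity, left/right leg contact
print(env.action_space)       # Discrete(4): do nothing, left engine, main engine, right engine
state, _ = env.reset(seed=seed)
for _ in range(3):
    action = env.action_space.sample()
    state, reward, terminated, truncated, _ = env.step(action)
    print(action, np.round(state, 2), round(reward, 2))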
N_STATES = env.observation_space.shape[0]
N_ACTIONS = env.action_space.n
def create_model():
    """Construct the critic network with Q-values per action as output."""
    model = keras.Sequential([
        layers.Input(shape=(N_STATES,)),
        layers.Dense(64, activation="relu"),
        layers.Dense(64, activation="relu", name='prunable'),
        layers.Dense(N_ACTIONS, activation='linear')
    ])
    model.compile(optimizer=optimizers.Adam(learning_rate=0.001), loss='mse')
    return model
model = create_model()
model.summary()
Model: "sequential_7" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_19 (Dense) (None, 64) 576 prunable (Dense) (None, 64) 4160 dense_20 (Dense) (None, 4) 260 ================================================================= Total params: 4,996 Trainable params: 4,996 Non-trainable params: 0 _________________________________________________________________
import gym
import numpy as np
import pandas as pd
from collections import deque
import random
from keras import Sequential
from keras.layers import Dense
from keras.activations import relu, linear
from keras.optimizers import Adam
from keras.losses import mean_squared_error
from keras.models import load_model
import pickle
from matplotlib import pyplot as plt
class DQN:
def __init__(self, env, lr, gamma, epsilon, epsilon_decay):
self.env = env
self.action_space = env.action_space
self.observation_space = env.observation_space
self.counter = 0
self.lr = lr
self.gamma = gamma
self.epsilon = epsilon
self.epsilon_decay = epsilon_decay
self.rewards_list = []
self.replay_memory_buffer = deque(maxlen=500000)
self.batch_size = 64
self.epsilon_min = 0.01
self.num_action_space = self.action_space.n
self.num_observation_space = env.observation_space.shape[0]
self.model = self.initialize_model()
    def initialize_model(self):
        model = Sequential()
        model.add(Dense(512, input_dim=self.num_observation_space, activation=relu))
        model.add(Dense(256, activation=relu))
        model.add(Dense(self.num_action_space, activation=linear))
        # Compile the model
        model.compile(loss=mean_squared_error, optimizer=Adam(learning_rate=self.lr))
        model.summary()
        return model
    def get_action(self, state):
        # Epsilon-greedy action selection
        if np.random.rand() < self.epsilon:
            return random.randrange(self.num_action_space)
        predicted_actions = self.model.predict(state, verbose=0)
        return np.argmax(predicted_actions[0])
def add_to_replay_memory(self, state, action, reward, next_state, done):
self.replay_memory_buffer.append((state, action, reward, next_state, done))
    def learn_and_update_weights_by_replay(self):
        # Train only every `step_size` steps and once the buffer holds a full batch
        if len(self.replay_memory_buffer) < self.batch_size or self.counter != 0:
            return
        # Early stopping: skip updates once recent episodes average above 180
        if len(self.rewards_list) >= 10 and np.mean(self.rewards_list[-10:]) > 180:
            return
        random_sample = self.get_random_sample_from_replay_mem()
        states, actions, rewards, next_states, done_list = self.get_attributes_from_sample(random_sample)
        # Bellman target: r + gamma * max_a' Q(s', a'), with no bootstrap on terminal states
        targets = rewards + self.gamma * np.amax(self.model.predict_on_batch(next_states), axis=1) * (1 - done_list)
        target_vec = self.model.predict_on_batch(states)
        indexes = np.arange(self.batch_size)
        target_vec[indexes, actions] = targets
        self.model.fit(states, target_vec, epochs=1, verbose=0)
    def get_attributes_from_sample(self, random_sample):
        states = np.squeeze(np.array([i[0] for i in random_sample]))
        actions = np.array([i[1] for i in random_sample])
        rewards = np.array([i[2] for i in random_sample])
        next_states = np.squeeze(np.array([i[3] for i in random_sample]))
        done_list = np.array([i[4] for i in random_sample])
        return states, actions, rewards, next_states, done_list
def get_random_sample_from_replay_mem(self):
random_sample = random.sample(self.replay_memory_buffer, self.batch_size)
return random_sample
    def train(self, num_episodes=2000, can_stop=True):
        for episode in range(num_episodes):
            state, _ = self.env.reset()
            reward_for_episode = 0
            num_steps = 1000
            state = np.reshape(state, [1, self.num_observation_space])
            for step in range(num_steps):
                # self.env.render()  # rendering every step slows training; enable only for debugging
                received_action = self.get_action(state)
                next_state, reward, terminated, truncated, info = self.env.step(received_action)
                done = terminated or truncated
                next_state = np.reshape(next_state, [1, self.num_observation_space])
                # Store the experience in replay memory
                self.add_to_replay_memory(state, received_action, reward, next_state, done)
                # add up rewards
                reward_for_episode += reward
                state = next_state
                self.update_counter()
                self.learn_and_update_weights_by_replay()
                if done:
                    break
            self.rewards_list.append(reward_for_episode)
            # Decay epsilon after each episode
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay
            # Stopping condition: average reward over the last 100 episodes above 200
            last_rewards_mean = np.mean(self.rewards_list[-100:])
            if last_rewards_mean > 200 and can_stop:
                print("DQN Training Complete...")
                break
            print(episode, "\t: Episode || Reward: ", reward_for_episode, "\t|| Average Reward: ", last_rewards_mean, "\t epsilon: ", self.epsilon)
def update_counter(self):
self.counter += 1
step_size = 5
self.counter = self.counter % step_size
def save(self, name):
self.model.save(name)
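# Rough sanity check on the exploration schedule used below: with epsilon starting at 1.0,
# a per-episode decay of 0.995 and a floor of 0.01 (epsilon_min above), epsilon only reaches
# its floor after about log(0.01)/log(0.995) episodes, so the agent keeps exploring for most
# of a 1000-2000 episode run.
import math
episodes_until_epsilon_floor = math.ceil(math.log(0.01) / math.log(0.995))
print("epsilon reaches its floor after ~", episodes_until_epsilon_floor, "episodes")  # ~919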
def test_already_trained_model(trained_model):
    rewards_list = []
    num_test_episode = 100
    env = gym.make("LunarLander-v2", render_mode="rgb_array")
    print("Starting Testing of the trained model...")
    step_count = 1000
    num_observation_space = env.observation_space.shape[0]
    for test_episode in range(num_test_episode):
        current_state, _ = env.reset()
        current_state = np.reshape(current_state, [1, num_observation_space])
        reward_for_episode = 0
        for step in range(step_count):
            # Greedy policy: no exploration during evaluation
            selected_action = int(np.argmax(trained_model.predict(current_state, verbose=0)[0]))
            new_state, reward, terminated, truncated, info = env.step(selected_action)
            done = terminated or truncated
            current_state = np.reshape(new_state, [1, num_observation_space])
            reward_for_episode += reward
            if done:
                break
        rewards_list.append(reward_for_episode)
        print(test_episode, "\t: Episode || Reward: ", reward_for_episode)
    return rewards_list
def plot_df(df, chart_name, title, x_axis_label, y_axis_label):
    plt.rcParams.update({'font.size': 17})
    df['rolling_mean'] = df[df.columns[0]].rolling(100).mean()
    plot = df.plot(linewidth=1.5, figsize=(15, 8))
    plot.set_xlabel(x_axis_label)
    plot.set_ylabel(y_axis_label)
    plt.legend().set_visible(False)
    fig = plot.get_figure()
    fig.savefig(chart_name)
def plot_df2(df, chart_name, title, x_axis_label, y_axis_label):
    df['mean'] = df[df.columns[0]].mean()
    plt.rcParams.update({'font.size': 17})
    plot = df.plot(linewidth=1.5, figsize=(15, 8))
    plot.set_xlabel(x_axis_label)
    plot.set_ylabel(y_axis_label)
    plt.ylim((0, 300))
    plt.xlim((0, 100))
    plt.legend().set_visible(False)
    fig = plot.get_figure()
    fig.savefig(chart_name)
def plot_experiments(df, chart_name, title, x_axis_label, y_axis_label, y_limit):
    plt.rcParams.update({'font.size': 17})
    plot = df.plot(linewidth=1, figsize=(15, 8), title=title)
    plot.set_xlabel(x_axis_label)
    plot.set_ylabel(y_axis_label)
    plt.ylim(y_limit)
    fig = plot.get_figure()
    fig.savefig(chart_name)
def run_experiment_for_gamma():
print('Running Experiment for gamma...')
env = gym.make('LunarLander-v2')
# set seeds
    env.reset(seed=21)
np.random.seed(21)
# setting up params
lr = 0.001
epsilon = 1.0
epsilon_decay = 0.995
gamma_list = [0.99, 0.9, 0.8, 0.7]
training_episodes = 1000
rewards_list_for_gammas = []
for gamma_value in gamma_list:
# save_dir = "hp_gamma_"+ str(gamma_value) + "_"
model = DQN(env, lr, gamma_value, epsilon, epsilon_decay)
print("Training model for Gamma: {}".format(gamma_value))
model.train(training_episodes, False)
rewards_list_for_gammas.append(model.rewards_list)
pickle.dump(rewards_list_for_gammas, open("rewards_list_for_gammas.p", "wb"))
rewards_list_for_gammas = pickle.load(open("rewards_list_for_gammas.p", "rb"))
gamma_rewards_pd = pd.DataFrame(index=pd.Series(range(1, training_episodes + 1)))
for i in range(len(gamma_list)):
col_name = "gamma=" + str(gamma_list[i])
gamma_rewards_pd[col_name] = rewards_list_for_gammas[i]
plot_experiments(gamma_rewards_pd, "Figure 4: Rewards per episode for different gamma values",
"Figure 4: Rewards per episode for different gamma values", "Episodes", "Reward", (-600, 300))
def run_experiment_for_lr():
print('Running Experiment for learning rate...')
env = gym.make('LunarLander-v2')
# set seeds
    env.reset(seed=21)
np.random.seed(21)
# setting up params
lr_values = [0.0001, 0.001, 0.01, 0.1]
epsilon = 1.0
epsilon_decay = 0.995
gamma = 0.99
training_episodes = 1000
rewards_list_for_lrs = []
for lr_value in lr_values:
model = DQN(env, lr_value, gamma, epsilon, epsilon_decay)
print("Training model for LR: {}".format(lr_value))
model.train(training_episodes, False)
rewards_list_for_lrs.append(model.rewards_list)
pickle.dump(rewards_list_for_lrs, open("rewards_list_for_lrs.p", "wb"))
rewards_list_for_lrs = pickle.load(open("rewards_list_for_lrs.p", "rb"))
lr_rewards_pd = pd.DataFrame(index=pd.Series(range(1, training_episodes + 1)))
for i in range(len(lr_values)):
col_name = "lr="+ str(lr_values[i])
lr_rewards_pd[col_name] = rewards_list_for_lrs[i]
plot_experiments(lr_rewards_pd, "Figure 3: Rewards per episode for different learning rates", "Figure 3: Rewards per episode for different learning rates", "Episodes", "Reward", (-2000, 300))
def run_experiment_for_ed():
print('Running Experiment for epsilon decay...')
env = gym.make('LunarLander-v2')
# set seeds
    env.reset(seed=21)
np.random.seed(21)
# setting up params
lr = 0.001
epsilon = 1.0
ed_values = [0.999, 0.995, 0.990, 0.9]
gamma = 0.99
training_episodes = 1000
rewards_list_for_ed = []
for ed in ed_values:
save_dir = "hp_ed_"+ str(ed) + "_"
model = DQN(env, lr, gamma, epsilon, ed)
print("Training model for ED: {}".format(ed))
model.train(training_episodes, False)
rewards_list_for_ed.append(model.rewards_list)
pickle.dump(rewards_list_for_ed, open("rewards_list_for_ed.p", "wb"))
rewards_list_for_ed = pickle.load(open("rewards_list_for_ed.p", "rb"))
ed_rewards_pd = pd.DataFrame(index=pd.Series(range(1, training_episodes+1)))
for i in range(len(ed_values)):
col_name = "epsilon_decay = "+ str(ed_values[i])
ed_rewards_pd[col_name] = rewards_list_for_ed[i]
plot_experiments(ed_rewards_pd, "Figure 5: Rewards per episode for different epsilon(ε) decay", "Figure 5: Rewards per episode for different epsilon(ε) decay values", "Episodes", "Reward", (-600, 300))
if __name__ == '__main__':
env = gym.make('LunarLander-v2')
# set seeds
    env.reset(seed=21)
np.random.seed(21)
# setting up params
lr = 0.001
epsilon = 1.0
epsilon_decay = 0.995
gamma = 0.99
training_episodes = 2000
print('St')
model = DQN(env, lr, gamma, epsilon, epsilon_decay)
model.train(training_episodes, True)
# Save Everything
save_dir = "saved_models"
# Save trained model
model.save(save_dir + "trained_model.h5")
# Save Rewards list
pickle.dump(model.rewards_list, open(save_dir + "train_rewards_list.p", "wb"))
rewards_list = pickle.load(open(save_dir + "train_rewards_list.p", "rb"))
# plot reward in graph
reward_df = pd.DataFrame(rewards_list)
plot_df(reward_df, "Figure 1: Reward for each training episode", "Reward for each training episode", "Episode","Reward")
# Test the model
trained_model = load_model(save_dir + "trained_model.h5")
test_rewards = test_already_trained_model(trained_model)
pickle.dump(test_rewards, open(save_dir + "test_rewards.p", "wb"))
test_rewards = pickle.load(open(save_dir + "test_rewards.p", "rb"))
plot_df2(pd.DataFrame(test_rewards), "Figure 2: Reward for each testing episode","Reward for each testing episode", "Episode", "Reward")
print("Training and Testing Completed...!")
# Run experiments for hyper-parameter
run_experiment_for_lr()
run_experiment_for_ed()
run_experiment_for_gamma()
St Model: "sequential_8" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_21 (Dense) (None, 512) 4608 dense_22 (Dense) (None, 256) 131328 dense_23 (Dense) (None, 4) 1028 ================================================================= Total params: 136,964 Trainable params: 136,964 Non-trainable params: 0 _________________________________________________________________ None
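Finally, with a trained model on disk it is worth actually watching a landing. The sketch below is one minimal way to do that, not part of the training script above: it assumes the model was saved as saved_models/trained_model.h5 as in the main block, rolls out the greedy policy with the rgb_array render mode, and writes the frames out as an animated GIF with PIL (already imported at the top). The file names are just placeholders.
def record_episode(model_path="saved_models/trained_model.h5", gif_path="landing.gif"):
    # Roll out one episode with the greedy policy and collect rendered frames
    env = gym.make("LunarLander-v2", render_mode="rgb_array")
    agent = load_model(model_path)
    state, _ = env.reset(seed=0)
    frames, done = [], False
    while not done:
        frames.append(Image.fromarray(env.render()))
        q_values = agent.predict(state.reshape(1, -1), verbose=0)
        state, reward, terminated, truncated, _ = env.step(int(np.argmax(q_values[0])))
        done = terminated or truncated
    env.close()
    # duration is the per-frame display time in milliseconds
    frames[0].save(gif_path, save_all=True, append_images=frames[1:], duration=30, loop=0)

record_episode()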