Difference between my Q-Learning implementations in Tensorflow and Keras
I wrote a Q-learning implementation to solve the OpenAI FrozenLake-v0 problem using a simple neural network.
My neural network looks like this:
Input layer: 16
Output layer: 4
The plain TensorFlow implementation did very well: about 70% of episodes finished successfully after training for 10k episodes.
After that I wanted to write the same algorithm using Keras, but this time the algorithm did very poorly: after 10k episodes only about 5% of episodes finished successfully.
I'm guessing I made a mistake in my Keras implementation, but I cannot figure it out.
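In both versions the discrete state (an integer 0-15) is fed to the network as a one-hot row vector; a minimal sketch of that encoding (the one_hot_state helper is just for illustration, the actual code below inlines the expression):

import numpy as np

def one_hot_state(observation, n_states=16):
    # Slicing keeps the batch dimension: state 3 -> an array of
    # shape (1, 16) with a single 1.0 at index 3.
    return np.identity(n_states)[observation:observation + 1]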
TensorFlow implementation:
import gym
import numpy as np
import tensorflow as tf

env = gym.make('FrozenLake-v0')

discount_rate = 0.99
random_action_chance = 0.1
num_episodes = 10000
max_episode_step = 100
log_interval = 100

tf.reset_default_graph()

# Single linear layer: one-hot state in, one Q-value per action out
inputs = tf.placeholder(shape=[1, 16], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16, 4], 0, 0.01))
Q = tf.matmul(inputs, W)
predict = tf.argmax(Q, 1)

Qnext = tf.placeholder(shape=[1, 4], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(Qnext - Q))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = optimizer.minimize(loss)

init = tf.initialize_all_variables()

rewards_from_episodes = []

with tf.Session() as sess:
    sess.run(init)
    for episode in range(num_episodes):
        observation = env.reset()
        episode_reward = 0
        if episode % log_interval == 0 and episode > 0:
            print('Episode: {}, Reward: {}'.format(
                episode,
                sum(rewards_from_episodes[episode - log_interval: episode]) / log_interval))
        W1 = None
        for step in range(max_episode_step):
            # Select action (epsilon-greedy)
            action, targetQ = sess.run(
                [predict, Q],
                feed_dict={inputs: np.identity(16)[observation:observation + 1]})
            if np.random.rand(1) < random_action_chance:
                action[0] = env.action_space.sample()

            new_observation, reward, done, _ = env.step(action[0])

            # Bellman target: reward + discounted max Q of the next state
            Qnew = sess.run(Q, feed_dict={inputs: np.identity(16)[new_observation:new_observation + 1]})
            maxQvalue = np.max(Qnew)
            targetQ[0, action[0]] = reward + discount_rate * maxQvalue

            # Train network using target and predicted Q values
            _, W1 = sess.run([updateModel, W],
                             feed_dict={inputs: np.identity(16)[observation:observation + 1],
                                        Qnext: targetQ})
            episode_reward += reward
            observation = new_observation
            if done:
                random_action_chance = 1. / ((episode / 50) + 10)
                break
        rewards_from_episodes.append(episode_reward)

print("Mean of all episodes: {}%".format(sum(rewards_from_episodes) / num_episodes))
Keras implementation:
import gym
import numpy as np
import tensorflow as tf
import random
from tensorflow.python.keras.layers import *

env = gym.make('FrozenLake-v0')

learning_rate = 0.1
discount_rate = 0.99
random_action_chance = 0.1
num_episodes = 10000
max_episode_step = 100
log_interval = 100

# Single Dense layer: 16 one-hot inputs -> 4 Q-values
model = tf.keras.Sequential()
model.add(Dense(4, kernel_initializer='uniform'))
model.compile(optimizer=tf.train.GradientSescentOptimizer(learning_rate=learning_rate),
              loss='mean_squared_error')

rewards_from_episodes = []

for episode in range(num_episodes):
    observation = env.reset()
    episode_reward = 0
    if episode % log_interval == 0 and episode > 0:
        print('Episode: {}, Reward: {}'.format(
            episode,
            sum(rewards_from_episodes[episode - log_interval: episode]) / log_interval))
    for step in range(max_episode_step):
        # Select action (epsilon-greedy)
        targetQ = model.predict(np.identity(16)[observation:observation + 1], batch_size=1)
        action = np.argmax(targetQ)
        if random.random() < random_action_chance:
            action = env.action_space.sample()

        new_observation, reward, done, _ = env.step(action)

        # Bellman target: reward + discounted max Q of the next state
        Qnew = model.predict(np.identity(16)[new_observation:new_observation + 1], batch_size=1)
        maxQvalue = np.max(Qnew)
        targetQ[0, action] = reward + discount_rate * maxQvalue

        # Train network using target and predicted Q values
        model.fit(np.identity(16)[observation:observation + 1], targetQ, epochs=1, batch_size=1, verbose=0)

        episode_reward += reward
        observation = new_observation
        if done:
            random_action_chance = 1. / ((episode / 50) + 10)
            break
    rewards_from_episodes.append(episode_reward)

print("Mean of all episodes: {}%".format(sum(rewards_from_episodes) / num_episodes))
tensorflow keras neural-network q-learning
edited Nov 11 at 6:44
asked Nov 10 at 21:52
Valverde