Difference between my Q-learning implementations in TensorFlow and Keras











I wrote a Q-learning implementation to solve the OpenAI FrozenLake-v0 problem using a simple neural network.



My neural network looks like this:



Input layer: 16



Output layer: 4
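
The 16 inputs are just a one-hot encoding of the agent's grid cell. Here is a minimal sketch of the encoding both listings below use (the variable names are only illustrative, not from my code):

import numpy as np

# FrozenLake-v0 has 16 discrete states; state s is taken as row s of a 16x16 identity
# matrix, i.e. a (1, 16) one-hot vector that is fed straight into the network.
state = 3  # example grid cell
one_hot_state = np.identity(16)[state:state + 1]
print(one_hot_state.shape)  # (1, 16), with a single 1.0 at index 3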



The plain TensorFlow implementation did very well: about 70% of episodes reached the goal after training for 10k episodes.



After that I wanted to write the same algorithm using Keras, but this time it did very poorly: after 10k episodes only about 5% of episodes reached the goal.



I'm guessing I made a mistake in my Keras implementation, but I cannot figure out what it is.



TensorFlow implementation:



import gym
import numpy as np
import tensorflow as tf

env = gym.make('FrozenLake-v0')

discount_rate = 0.99
random_action_chance = 0.1
num_episodes = 10000
max_episode_step = 100
log_interval = 100

tf.reset_default_graph()
inputs = tf.placeholder(shape=[1, 16], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16, 4], 0, 0.01))
Q = tf.matmul(inputs, W)
predict = tf.argmax(Q, 1)

Qnext = tf.placeholder(shape=[1, 4], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(Qnext - Q))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = optimizer.minimize(loss)

init = tf.initialize_all_variables()
rewards_from_episodes = []
with tf.Session() as sess:
    sess.run(init)
    for episode in range(num_episodes):
        observation = env.reset()
        episode_reward = 0
        if episode % log_interval == 0 and episode > 0:
            print('Episode: {}, Reward: {}'.format(
                episode,
                sum(rewards_from_episodes[episode - log_interval:episode]) / log_interval))

        W1 = None
        for step in range(max_episode_step):
            # Select action greedily from the current Q estimate, with epsilon-greedy exploration
            action, targetQ = sess.run(
                [predict, Q],
                feed_dict={inputs: np.identity(16)[observation:observation + 1]})
            if np.random.rand(1) < random_action_chance:
                action[0] = env.action_space.sample()

            new_observation, reward, done, _ = env.step(action[0])
            Qnew = sess.run(Q, feed_dict={inputs: np.identity(16)[new_observation:new_observation + 1]})
            maxQvalue = np.max(Qnew)
            targetQ[0, action[0]] = reward + discount_rate * maxQvalue
            # Train network using target and predicted Q values
            _, W1 = sess.run(
                [updateModel, W],
                feed_dict={inputs: np.identity(16)[observation:observation + 1],
                           Qnext: targetQ})
            episode_reward += reward
            observation = new_observation
            if done:
                # Decay the exploration rate once an episode terminates
                random_action_chance = 1. / ((episode / 50) + 10)
                break
        rewards_from_episodes.append(episode_reward)

print("Mean of all episodes: {}%".format(sum(rewards_from_episodes) / num_episodes))


Keras implementation:



import gym
import numpy as np
import tensorflow as tf
import random
from tensorflow.python.keras.layers import *

env = gym.make('FrozenLake-v0')

learning_rate = 0.1
discount_rate = 0.99
random_action_chance = 0.1
num_episodes = 10000
max_episode_step = 100
log_interval = 100


model = tf.keras.Sequential()
model.add(Dense(4, kernel_initializer='uniform'))
model.compile(optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate),
              loss='mean_squared_error')

rewards_from_episodes = []

for episode in range(num_episodes):
    observation = env.reset()
    episode_reward = 0
    if episode % log_interval == 0 and episode > 0:
        print('Episode: {}, Reward: {}'.format(
            episode,
            sum(rewards_from_episodes[episode - log_interval:episode]) / log_interval))

    for step in range(max_episode_step):
        # Select action greedily from the current Q estimate, with epsilon-greedy exploration
        targetQ = model.predict(np.identity(16)[observation:observation + 1], batch_size=1)
        action = np.argmax(targetQ)

        if random.random() < random_action_chance:
            action = env.action_space.sample()

        new_observation, reward, done, _ = env.step(action)
        Qnew = model.predict(np.identity(16)[new_observation:new_observation + 1], batch_size=1)
        maxQvalue = np.max(Qnew)
        targetQ[0, action] = reward + discount_rate * maxQvalue

        # Train network using target and predicted Q values
        model.fit(np.identity(16)[observation:observation + 1], targetQ, epochs=1, batch_size=1, verbose=0)

        episode_reward += reward
        observation = new_observation
        if done:
            # Decay the exploration rate once an episode terminates
            random_action_chance = 1. / ((episode / 50) + 10)
            break
    rewards_from_episodes.append(episode_reward)

print("Mean of all episodes: {}%".format(sum(rewards_from_episodes) / num_episodes))









tensorflow keras neural-network q-learning

edited Nov 11 at 6:44
asked Nov 10 at 21:52 by Valverde