Difference between my Q-learning implementations in TensorFlow and Keras











I wrote a Q-learning implementation to solve the OpenAI FrozenLake-v0 problem using a simple neural network.



My neural network looks like this:



Input layer: 16



Output layer: 4
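
The 16 inputs are just a one-hot encoding of the agent's grid cell. Here is a minimal sketch of the encoding both listings below use (the variable names are only illustrative, not from my code):

import numpy as np

# FrozenLake-v0 has 16 discrete states; state s is taken as row s of a 16x16 identity
# matrix, i.e. a (1, 16) one-hot vector that is fed straight into the network.
state = 3  # example grid cell
one_hot_state = np.identity(16)[state:state + 1]
print(one_hot_state.shape)  # (1, 16), with a single 1.0 at index 3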



The plain TensorFlow implementation did very well: about 70% of episodes reached the goal after training for 10k episodes.



After that I wanted to write the same algorithm using Keras, but this time it did very poorly: after 10k episodes only about 5% of episodes reached the goal.



I'm guessing I made a mistake in my Keras implementation, but I cannot figure out what it is.



TensorFlow implementation:



import gym
import numpy as np
import tensorflow as tf

env = gym.make('FrozenLake-v0')

discount_rate = 0.99
random_action_chance = 0.1
num_episodes = 10000
max_episode_step = 100
log_interval = 100

tf.reset_default_graph()
inputs = tf.placeholder(shape=[1, 16], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16, 4], 0, 0.01))
Q = tf.matmul(inputs, W)
predict = tf.argmax(Q, 1)

Qnext = tf.placeholder(shape=[1, 4], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(Qnext - Q))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = optimizer.minimize(loss)

init = tf.initialize_all_variables()
rewards_from_episodes = []
with tf.Session() as sess:
    sess.run(init)
    for episode in range(num_episodes):
        observation = env.reset()
        episode_reward = 0
        if episode % log_interval == 0 and episode > 0:
            print('Episode: {}, Reward: {}'.format(
                episode,
                sum(rewards_from_episodes[episode - log_interval:episode]) / log_interval))

        W1 = None
        for step in range(max_episode_step):
            # Select action greedily from the current Q estimate, with epsilon-greedy exploration
            action, targetQ = sess.run(
                [predict, Q],
                feed_dict={inputs: np.identity(16)[observation:observation + 1]})
            if np.random.rand(1) < random_action_chance:
                action[0] = env.action_space.sample()

            new_observation, reward, done, _ = env.step(action[0])
            Qnew = sess.run(Q, feed_dict={inputs: np.identity(16)[new_observation:new_observation + 1]})
            maxQvalue = np.max(Qnew)
            targetQ[0, action[0]] = reward + discount_rate * maxQvalue
            # Train network using target and predicted Q values
            _, W1 = sess.run(
                [updateModel, W],
                feed_dict={inputs: np.identity(16)[observation:observation + 1],
                           Qnext: targetQ})
            episode_reward += reward
            observation = new_observation
            if done:
                # Decay the exploration rate once an episode terminates
                random_action_chance = 1. / ((episode / 50) + 10)
                break
        rewards_from_episodes.append(episode_reward)

print("Mean of all episodes: {}%".format(sum(rewards_from_episodes) / num_episodes))


Keras implementation:



import gym
import numpy as np
import tensorflow as tf
import random
from tensorflow.python.keras.layers import *

env = gym.make('FrozenLake-v0')

learning_rate = 0.1
discount_rate = 0.99
random_action_chance = 0.1
num_episodes = 10000
max_episode_step = 100
log_interval = 100


model = tf.keras.Sequential()
model.add(Dense(4, kernel_initializer='uniform'))
model.compile(optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate),
              loss='mean_squared_error')

rewards_from_episodes = []

for episode in range(num_episodes):
    observation = env.reset()
    episode_reward = 0
    if episode % log_interval == 0 and episode > 0:
        print('Episode: {}, Reward: {}'.format(
            episode,
            sum(rewards_from_episodes[episode - log_interval:episode]) / log_interval))

    for step in range(max_episode_step):
        # Select action greedily from the current Q estimate, with epsilon-greedy exploration
        targetQ = model.predict(np.identity(16)[observation:observation + 1], batch_size=1)
        action = np.argmax(targetQ)

        if random.random() < random_action_chance:
            action = env.action_space.sample()

        new_observation, reward, done, _ = env.step(action)
        Qnew = model.predict(np.identity(16)[new_observation:new_observation + 1], batch_size=1)
        maxQvalue = np.max(Qnew)
        targetQ[0, action] = reward + discount_rate * maxQvalue

        # Train network using target and predicted Q values
        model.fit(np.identity(16)[observation:observation + 1], targetQ, epochs=1, batch_size=1, verbose=0)

        episode_reward += reward
        observation = new_observation
        if done:
            # Decay the exploration rate once an episode terminates
            random_action_chance = 1. / ((episode / 50) + 10)
            break
    rewards_from_episodes.append(episode_reward)

print("Mean of all episodes: {}%".format(sum(rewards_from_episodes) / num_episodes))









tensorflow keras neural-network q-learning

edited Nov 11 at 6:44
asked Nov 10 at 21:52 by Valverde