Mirror of https://github.com/bulletphysics/bullet3, synced 2024-12-14 05:40:05 +00:00

make sure that the pre-trained galloping and trotting policies work for the minitaur_reactive_env and minitaur_trotting_env environments.

parent 982453abc6
commit a375a349ce
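For context, a minimal sketch (not part of the commit; names and keyword arguments are taken from the configs below and the standard gym interface) of how the environments exercised by these pre-trained policies can be instantiated and stepped. Restoring the trained policy itself would go through the agents' visualize script and is not shown here.

# Hypothetical usage sketch only; constructor arguments mirror the
# MinitaurReactiveEnv config checked in below.
from pybullet_envs.minitaur.envs import minitaur_reactive_env

env = minitaur_reactive_env.MinitaurReactiveEnv(
    accurate_motor_model_enabled=True,
    control_latency=0.02,
    energy_weight=0.005,
    motor_kd=0.015,
    pd_latency=0.003,
    remove_default_joint_damping=True,
    render=False,
    urdf_version='rainbow_dash_v0')

observation = env.reset()
for _ in range(1000):
  # Random actions stand in for the trained policy.
  observation, reward, done, _ = env.step(env.action_space.sample())
  if done:
    observation = env.reset()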
@@ -0,0 +1,54 @@
!!python/object/new:pybullet_envs.minitaur.agents.tools.attr_dict.AttrDict
dictitems:
  algorithm: !!python/name:pybullet_envs.minitaur.agents.ppo.algorithm.PPOAlgorithm ''
  discount: 0.9868209124499899
  env: !!python/object/apply:functools.partial
    args:
    - &id001 !!python/name:pybullet_envs.minitaur.envs.minitaur_reactive_env.MinitaurReactiveEnv ''
    state: !!python/tuple
    - *id001
    - !!python/tuple []
    - accurate_motor_model_enabled: true
      control_latency: 0.02
      energy_weight: 0.005
      env_randomizer: null
      motor_kd: 0.015
      num_steps_to_log: 1000
      pd_latency: 0.003
      remove_default_joint_damping: true
      render: false
      urdf_version: rainbow_dash_v0
    - null
  eval_episodes: 25
  init_logstd: -1.1579536194508315
  init_mean_factor: 0.3084392491563408
  kl_cutoff_coef: 1000
  kl_cutoff_factor: 2
  kl_init_penalty: 1
  kl_target: 0.01
  logdir: /cns/ij-d/home/jietan/experiment/minitaur_vizier_study_ppo/minreact_nonexp_nr_02_186515603_186518344/333
  max_length: 1000
  network: !!python/name:pybullet_envs.minitaur.agents.scripts.networks.ForwardGaussianPolicy ''
  network_config: {}
  num_agents: 25
  policy_layers: !!python/tuple
  - 114
  - 45
  policy_lr: 0.00023516695218031146
  policy_optimizer: AdamOptimizer
  steps: 7000000.0
  update_epochs_policy: 25
  update_epochs_value: 25
  update_every: 25
  use_gpu: false
  value_layers: !!python/tuple
  - 170
  - 78
  value_lr: 0.00031014032715987193
  value_optimizer: AdamOptimizer
  weight_summaries:
    all: .*
    policy: .*/policy/.*
    value: .*/value/.*
state:
  _mutable: false
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,51 @@
!!python/object/new:pybullet_envs.minitaur.agents.tools.attr_dict.AttrDict
dictitems:
  algorithm: !!python/name:pybullet_envs.minitaur.agents.ppo.algorithm.PPOAlgorithm ''
  discount: 0.9899764168788918
  env: !!python/object/apply:functools.partial
    args:
    - &id001 !!python/name:pybullet_envs.minitaur.envs.minitaur_trotting_env.MinitaurTrottingEnv ''
    state: !!python/tuple
    - *id001
    - !!python/tuple []
    - env_randomizer: null
      motor_kd: 0.015
      num_steps_to_log: 1000
      pd_latency: 0.003
      remove_default_joint_damping: true
      render: false
      urdf_version: rainbow_dash_v0
    - null
  eval_episodes: 25
  init_logstd: -0.6325707791047228
  init_mean_factor: 0.6508531688665261
  kl_cutoff_coef: 1000
  kl_cutoff_factor: 2
  kl_init_penalty: 1
  kl_target: 0.01
  logdir: /cns/ij-d/home/jietan/experiment/minitaur_vizier_study_ppo/mintrot_nonexp_nr_01_186515603_186518344/373
  max_length: 1000
  network: !!python/name:pybullet_envs.minitaur.agents.scripts.networks.ForwardGaussianPolicy ''
  network_config: {}
  num_agents: 25
  policy_layers: !!python/tuple
  - 133
  - 100
  policy_lr: 0.00048104185841752015
  policy_optimizer: AdamOptimizer
  steps: 7000000.0
  update_epochs_policy: 25
  update_epochs_value: 25
  update_every: 25
  use_gpu: false
  value_layers: !!python/tuple
  - 64
  - 57
  value_lr: 0.0012786382882055453
  value_optimizer: AdamOptimizer
  weight_summaries:
    all: .*
    policy: .*/policy/.*
    value: .*/value/.*
state:
  _mutable: false
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1 +1,3 @@
import gym
@@ -0,0 +1 @@
@@ -0,0 +1,21 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Proximal Policy Optimization algorithm."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from .algorithm import PPOAlgorithm
@@ -0,0 +1,558 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Proximal Policy Optimization algorithm.
|
||||
|
||||
Based on John Schulman's implementation in Python and Theano:
|
||||
https://github.com/joschu/modular_rl/blob/master/modular_rl/ppo.py
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from . import memory
|
||||
from . import normalize
|
||||
from . import utility
|
||||
|
||||
|
||||
_NetworkOutput = collections.namedtuple(
|
||||
'NetworkOutput', 'policy, mean, logstd, value, state')
|
||||
|
||||
|
||||
class PPOAlgorithm(object):
|
||||
"""A vectorized implementation of the PPO algorithm by John Schulman."""
|
||||
|
||||
def __init__(self, batch_env, step, is_training, should_log, config):
|
||||
"""Create an instance of the PPO algorithm.
|
||||
|
||||
Args:
|
||||
batch_env: In-graph batch environment.
|
||||
step: Integer tensor holding the current training step.
|
||||
is_training: Boolean tensor for whether the algorithm should train.
|
||||
should_log: Boolean tensor for whether summaries should be returned.
|
||||
config: Object containing the agent configuration as attributes.
|
||||
"""
|
||||
self._batch_env = batch_env
|
||||
self._step = step
|
||||
self._is_training = is_training
|
||||
self._should_log = should_log
|
||||
self._config = config
|
||||
self._observ_filter = normalize.StreamingNormalize(
|
||||
self._batch_env.observ[0], center=True, scale=True, clip=5,
|
||||
name='normalize_observ')
|
||||
self._reward_filter = normalize.StreamingNormalize(
|
||||
self._batch_env.reward[0], center=False, scale=True, clip=10,
|
||||
name='normalize_reward')
|
||||
# Memory stores tuple of observ, action, mean, logstd, reward.
|
||||
template = (
|
||||
self._batch_env.observ[0], self._batch_env.action[0],
|
||||
self._batch_env.action[0], self._batch_env.action[0],
|
||||
self._batch_env.reward[0])
|
||||
self._memory = memory.EpisodeMemory(
|
||||
template, config.update_every, config.max_length, 'memory')
|
||||
self._memory_index = tf.Variable(0, False)
|
||||
use_gpu = self._config.use_gpu and utility.available_gpus()
|
||||
with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
|
||||
# Create network variables for later calls to reuse.
|
||||
self._network(
|
||||
tf.zeros_like(self._batch_env.observ)[:, None],
|
||||
tf.ones(len(self._batch_env)), reuse=None)
|
||||
cell = self._config.network(self._batch_env.action.shape[1].value)
|
||||
with tf.variable_scope('ppo_temporary'):
|
||||
self._episodes = memory.EpisodeMemory(
|
||||
template, len(batch_env), config.max_length, 'episodes')
|
||||
self._last_state = utility.create_nested_vars(
|
||||
cell.zero_state(len(batch_env), tf.float32))
|
||||
self._last_action = tf.Variable(
|
||||
tf.zeros_like(self._batch_env.action), False, name='last_action')
|
||||
self._last_mean = tf.Variable(
|
||||
tf.zeros_like(self._batch_env.action), False, name='last_mean')
|
||||
self._last_logstd = tf.Variable(
|
||||
tf.zeros_like(self._batch_env.action), False, name='last_logstd')
|
||||
self._penalty = tf.Variable(
|
||||
self._config.kl_init_penalty, False, dtype=tf.float32)
|
||||
self._policy_optimizer = self._config.policy_optimizer(
|
||||
self._config.policy_lr, name='policy_optimizer')
|
||||
self._value_optimizer = self._config.value_optimizer(
|
||||
self._config.value_lr, name='value_optimizer')
|
||||
|
||||
def begin_episode(self, agent_indices):
|
||||
"""Reset the recurrent states and stored episode.
|
||||
|
||||
Args:
|
||||
agent_indices: 1D tensor of batch indices for agents starting an episode.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
with tf.name_scope('begin_episode/'):
|
||||
reset_state = utility.reinit_nested_vars(self._last_state, agent_indices)
|
||||
reset_buffer = self._episodes.clear(agent_indices)
|
||||
with tf.control_dependencies([reset_state, reset_buffer]):
|
||||
return tf.constant('')
|
||||
|
||||
def perform(self, observ):
|
||||
"""Compute batch of actions and a summary for a batch of observation.
|
||||
|
||||
Args:
|
||||
observ: Tensor of a batch of observations for all agents.
|
||||
|
||||
Returns:
|
||||
Tuple of action batch tensor and summary tensor.
|
||||
"""
|
||||
with tf.name_scope('perform/'):
|
||||
observ = self._observ_filter.transform(observ)
|
||||
network = self._network(
|
||||
observ[:, None], tf.ones(observ.shape[0]), self._last_state)
|
||||
action = tf.cond(
|
||||
self._is_training, network.policy.sample, lambda: network.mean)
|
||||
logprob = network.policy.log_prob(action)[:, 0]
|
||||
# pylint: disable=g-long-lambda
|
||||
summary = tf.cond(self._should_log, lambda: tf.summary.merge([
|
||||
tf.summary.histogram('mean', network.mean[:, 0]),
|
||||
tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
|
||||
tf.summary.histogram('action', action[:, 0]),
|
||||
tf.summary.histogram('logprob', logprob)]), str)
|
||||
# Remember current policy to append to memory in the experience callback.
|
||||
with tf.control_dependencies([
|
||||
utility.assign_nested_vars(self._last_state, network.state),
|
||||
self._last_action.assign(action[:, 0]),
|
||||
self._last_mean.assign(network.mean[:, 0]),
|
||||
self._last_logstd.assign(network.logstd[:, 0])]):
|
||||
return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
|
||||
|
||||
def experience(self, observ, action, reward, unused_done, unused_nextob):
|
||||
"""Process the transition tuple of the current step.
|
||||
|
||||
When training, add the current transition tuple to the memory and update
|
||||
the streaming statistics for observations and rewards. A summary string is
|
||||
returned if requested at this step.
|
||||
|
||||
Args:
|
||||
observ: Batch tensor of observations.
|
||||
action: Batch tensor of actions.
|
||||
reward: Batch tensor of rewards.
|
||||
unused_done: Batch tensor of done flags.
|
||||
unused_nextob: Batch tensor of successor observations.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
with tf.name_scope('experience/'):
|
||||
return tf.cond(
|
||||
self._is_training,
|
||||
lambda: self._define_experience(observ, action, reward), str)
|
||||
|
||||
def _define_experience(self, observ, action, reward):
|
||||
"""Implement the branch of experience() entered during training."""
|
||||
update_filters = tf.summary.merge([
|
||||
self._observ_filter.update(observ),
|
||||
self._reward_filter.update(reward)])
|
||||
with tf.control_dependencies([update_filters]):
|
||||
if self._config.train_on_agent_action:
|
||||
# NOTE: Doesn't seem to change much.
|
||||
action = self._last_action
|
||||
batch = observ, action, self._last_mean, self._last_logstd, reward
|
||||
append = self._episodes.append(batch, tf.range(len(self._batch_env)))
|
||||
with tf.control_dependencies([append]):
|
||||
norm_observ = self._observ_filter.transform(observ)
|
||||
norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))
|
||||
# pylint: disable=g-long-lambda
|
||||
summary = tf.cond(self._should_log, lambda: tf.summary.merge([
|
||||
update_filters,
|
||||
self._observ_filter.summary(),
|
||||
self._reward_filter.summary(),
|
||||
tf.summary.scalar('memory_size', self._memory_index),
|
||||
tf.summary.histogram('normalized_observ', norm_observ),
|
||||
tf.summary.histogram('action', self._last_action),
|
||||
tf.summary.scalar('normalized_reward', norm_reward)]), str)
|
||||
return summary
|
||||
|
||||
def end_episode(self, agent_indices):
|
||||
"""Add episodes to the memory and perform update steps if memory is full.
|
||||
|
||||
During training, add the collected episodes of the batch indices that
|
||||
finished their episode to the memory. If the memory is full, train on it,
|
||||
and then clear the memory. A summary string is returned if requested at
|
||||
this step.
|
||||
|
||||
Args:
|
||||
agent_indices: 1D tensor of batch indices for agents starting an episode.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
with tf.name_scope('end_episode/'):
|
||||
return tf.cond(
|
||||
self._is_training,
|
||||
lambda: self._define_end_episode(agent_indices), str)
|
||||
|
||||
def _define_end_episode(self, agent_indices):
|
||||
"""Implement the branch of end_episode() entered during training."""
|
||||
episodes, length = self._episodes.data(agent_indices)
|
||||
space_left = self._config.update_every - self._memory_index
|
||||
use_episodes = tf.range(tf.minimum(
|
||||
tf.shape(agent_indices)[0], space_left))
|
||||
episodes = [tf.gather(elem, use_episodes) for elem in episodes]
|
||||
append = self._memory.replace(
|
||||
episodes, tf.gather(length, use_episodes),
|
||||
use_episodes + self._memory_index)
|
||||
with tf.control_dependencies([append]):
|
||||
inc_index = self._memory_index.assign_add(tf.shape(use_episodes)[0])
|
||||
with tf.control_dependencies([inc_index]):
|
||||
memory_full = self._memory_index >= self._config.update_every
|
||||
return tf.cond(memory_full, self._training, str)
|
||||
|
||||
def _training(self):
|
||||
"""Perform multiple training iterations of both policy and value baseline.
|
||||
|
||||
Training on the episodes collected in the memory. Reset the memory
|
||||
afterwards. Always returns a summary string.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
with tf.name_scope('training'):
|
||||
assert_full = tf.assert_equal(
|
||||
self._memory_index, self._config.update_every)
|
||||
with tf.control_dependencies([assert_full]):
|
||||
data = self._memory.data()
|
||||
(observ, action, old_mean, old_logstd, reward), length = data
|
||||
with tf.control_dependencies([tf.assert_greater(length, 0)]):
|
||||
length = tf.identity(length)
|
||||
observ = self._observ_filter.transform(observ)
|
||||
reward = self._reward_filter.transform(reward)
|
||||
policy_summary = self._update_policy(
|
||||
observ, action, old_mean, old_logstd, reward, length)
|
||||
with tf.control_dependencies([policy_summary]):
|
||||
value_summary = self._update_value(observ, reward, length)
|
||||
with tf.control_dependencies([value_summary]):
|
||||
penalty_summary = self._adjust_penalty(
|
||||
observ, old_mean, old_logstd, length)
|
||||
with tf.control_dependencies([penalty_summary]):
|
||||
clear_memory = tf.group(
|
||||
self._memory.clear(), self._memory_index.assign(0))
|
||||
with tf.control_dependencies([clear_memory]):
|
||||
weight_summary = utility.variable_summaries(
|
||||
tf.trainable_variables(), self._config.weight_summaries)
|
||||
return tf.summary.merge([
|
||||
policy_summary, value_summary, penalty_summary, weight_summary])
|
||||
|
||||
def _update_value(self, observ, reward, length):
|
||||
"""Perform multiple update steps of the value baseline.
|
||||
|
||||
Only one iteration's summary can be returned, so we use the summary
produced after half of the iterations.
|
||||
|
||||
Args:
|
||||
observ: Sequences of observations.
|
||||
reward: Sequences of reward.
|
||||
length: Batch of sequence lengths.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
with tf.name_scope('update_value'):
|
||||
loss, summary = tf.scan(
|
||||
lambda _1, _2: self._update_value_step(observ, reward, length),
|
||||
tf.range(self._config.update_epochs_value),
|
||||
[0., ''], parallel_iterations=1)
|
||||
print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'value loss: ')
|
||||
with tf.control_dependencies([loss, print_loss]):
|
||||
return summary[self._config.update_epochs_value // 2]
|
||||
|
||||
def _update_value_step(self, observ, reward, length):
|
||||
"""Compute the current value loss and perform a gradient update step.
|
||||
|
||||
Args:
|
||||
observ: Sequences of observations.
|
||||
reward: Sequences of reward.
|
||||
length: Batch of sequence lengths.
|
||||
|
||||
Returns:
|
||||
Tuple of loss tensor and summary tensor.
|
||||
"""
|
||||
loss, summary = self._value_loss(observ, reward, length)
|
||||
gradients, variables = (
|
||||
zip(*self._value_optimizer.compute_gradients(loss)))
|
||||
optimize = self._value_optimizer.apply_gradients(
|
||||
zip(gradients, variables))
|
||||
summary = tf.summary.merge([
|
||||
summary,
|
||||
tf.summary.scalar('gradient_norm', tf.global_norm(gradients)),
|
||||
utility.gradient_summaries(
|
||||
zip(gradients, variables), dict(value=r'.*'))])
|
||||
with tf.control_dependencies([optimize]):
|
||||
return [tf.identity(loss), tf.identity(summary)]
|
||||
|
||||
def _value_loss(self, observ, reward, length):
|
||||
"""Compute the loss function for the value baseline.
|
||||
|
||||
The value loss is the squared difference between empirical and approximated
returns over the collected episodes. Returns the loss tensor and a summary
string.
|
||||
|
||||
Args:
|
||||
observ: Sequences of observations.
|
||||
reward: Sequences of reward.
|
||||
length: Batch of sequence lengths.
|
||||
|
||||
Returns:
|
||||
Tuple of loss tensor and summary tensor.
|
||||
"""
|
||||
with tf.name_scope('value_loss'):
|
||||
value = self._network(observ, length).value
|
||||
return_ = utility.discounted_return(
|
||||
reward, length, self._config.discount)
|
||||
advantage = return_ - value
|
||||
value_loss = 0.5 * self._mask(advantage ** 2, length)
|
||||
summary = tf.summary.merge([
|
||||
tf.summary.histogram('value_loss', value_loss),
|
||||
tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))])
|
||||
value_loss = tf.reduce_mean(value_loss)
|
||||
return tf.check_numerics(value_loss, 'value_loss'), summary
|
||||
|
||||
def _update_policy(
|
||||
self, observ, action, old_mean, old_logstd, reward, length):
|
||||
"""Perform multiple update steps of the policy.
|
||||
|
||||
The advantage is computed once at the beginning and shared across
iterations. Only one iteration's summary can be returned, so we use the
summary produced after half of the iterations.
|
||||
|
||||
Args:
|
||||
observ: Sequences of observations.
|
||||
action: Sequences of actions.
|
||||
old_mean: Sequences of action means of the behavioral policy.
|
||||
old_logstd: Sequences of action log stddevs of the behavioral policy.
|
||||
reward: Sequences of rewards.
|
||||
length: Batch of sequence lengths.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
with tf.name_scope('update_policy'):
|
||||
return_ = utility.discounted_return(
|
||||
reward, length, self._config.discount)
|
||||
value = self._network(observ, length).value
|
||||
if self._config.gae_lambda:
|
||||
advantage = utility.lambda_return(
|
||||
reward, value, length, self._config.discount,
|
||||
self._config.gae_lambda)
|
||||
else:
|
||||
advantage = return_ - value
|
||||
mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
|
||||
advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
|
||||
advantage = tf.Print(
|
||||
advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)],
|
||||
'return and value: ')
|
||||
advantage = tf.Print(
|
||||
advantage, [tf.reduce_mean(advantage)],
|
||||
'normalized advantage: ')
|
||||
# pylint: disable=g-long-lambda
|
||||
loss, summary = tf.scan(
|
||||
lambda _1, _2: self._update_policy_step(
|
||||
observ, action, old_mean, old_logstd, advantage, length),
|
||||
tf.range(self._config.update_epochs_policy),
|
||||
[0., ''], parallel_iterations=1)
|
||||
print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'policy loss: ')
|
||||
with tf.control_dependencies([loss, print_loss]):
|
||||
return summary[self._config.update_epochs_policy // 2]
|
||||
|
||||
def _update_policy_step(
|
||||
self, observ, action, old_mean, old_logstd, advantage, length):
|
||||
"""Compute the current policy loss and perform a gradient update step.
|
||||
|
||||
Args:
|
||||
observ: Sequences of observations.
|
||||
action: Sequences of actions.
|
||||
old_mean: Sequences of action means of the behavioral policy.
|
||||
old_logstd: Sequences of action log stddevs of the behavioral policy.
|
||||
advantage: Sequences of advantages.
|
||||
length: Batch of sequence lengths.
|
||||
|
||||
Returns:
|
||||
Tuple of loss tensor and summary tensor.
|
||||
"""
|
||||
network = self._network(observ, length)
|
||||
loss, summary = self._policy_loss(
|
||||
network.mean, network.logstd, old_mean, old_logstd, action,
|
||||
advantage, length)
|
||||
gradients, variables = (
|
||||
zip(*self._policy_optimizer.compute_gradients(loss)))
|
||||
optimize = self._policy_optimizer.apply_gradients(
|
||||
zip(gradients, variables))
|
||||
summary = tf.summary.merge([
|
||||
summary,
|
||||
tf.summary.scalar('gradient_norm', tf.global_norm(gradients)),
|
||||
utility.gradient_summaries(
|
||||
zip(gradients, variables), dict(policy=r'.*'))])
|
||||
with tf.control_dependencies([optimize]):
|
||||
return [tf.identity(loss), tf.identity(summary)]
|
||||
|
||||
def _policy_loss(
|
||||
self, mean, logstd, old_mean, old_logstd, action, advantage, length):
|
||||
"""Compute the policy loss composed of multiple components.
|
||||
|
||||
1. The policy gradient loss is importance sampled from the data-collecting
|
||||
policy at the beginning of training.
|
||||
2. The second term is a KL penalty between the policy at the beginning of
|
||||
training and the current policy.
|
||||
3. Additionally, if this KL already changed more than twice the target
|
||||
amount, we activate a strong penalty discouraging further divergence.
|
||||
|
||||
Args:
|
||||
mean: Sequences of action means of the current policy.
|
||||
logstd: Sequences of action log stddevs of the current policy.
|
||||
old_mean: Sequences of action means of the behavioral policy.
|
||||
old_logstd: Sequences of action log stddevs of the behavioral policy.
|
||||
action: Sequences of actions.
|
||||
advantage: Sequences of advantages.
|
||||
length: Batch of sequence lengths.
|
||||
|
||||
Returns:
|
||||
Tuple of loss tensor and summary tensor.
|
||||
"""
|
||||
with tf.name_scope('policy_loss'):
|
||||
entropy = utility.diag_normal_entropy(mean, logstd)
|
||||
kl = tf.reduce_mean(self._mask(utility.diag_normal_kl(
|
||||
old_mean, old_logstd, mean, logstd), length), 1)
|
||||
policy_gradient = tf.exp(
|
||||
utility.diag_normal_logpdf(mean, logstd, action) -
|
||||
utility.diag_normal_logpdf(old_mean, old_logstd, action))
|
||||
surrogate_loss = -tf.reduce_mean(self._mask(
|
||||
policy_gradient * tf.stop_gradient(advantage), length), 1)
|
||||
kl_penalty = self._penalty * kl
|
||||
cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
|
||||
cutoff_count = tf.reduce_sum(
|
||||
tf.cast(kl > cutoff_threshold, tf.int32))
|
||||
with tf.control_dependencies([tf.cond(
|
||||
cutoff_count > 0,
|
||||
lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int)]):
|
||||
kl_cutoff = (
|
||||
self._config.kl_cutoff_coef *
|
||||
tf.cast(kl > cutoff_threshold, tf.float32) *
|
||||
(kl - cutoff_threshold) ** 2)
|
||||
policy_loss = surrogate_loss + kl_penalty + kl_cutoff
|
||||
summary = tf.summary.merge([
|
||||
tf.summary.histogram('entropy', entropy),
|
||||
tf.summary.histogram('kl', kl),
|
||||
tf.summary.histogram('surrogate_loss', surrogate_loss),
|
||||
tf.summary.histogram('kl_penalty', kl_penalty),
|
||||
tf.summary.histogram('kl_cutoff', kl_cutoff),
|
||||
tf.summary.histogram('kl_penalty_combined', kl_penalty + kl_cutoff),
|
||||
tf.summary.histogram('policy_loss', policy_loss),
|
||||
tf.summary.scalar('avg_surr_loss', tf.reduce_mean(surrogate_loss)),
|
||||
tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
|
||||
tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))])
|
||||
policy_loss = tf.reduce_mean(policy_loss, 0)
|
||||
return tf.check_numerics(policy_loss, 'policy_loss'), summary
|
||||
|
||||
def _adjust_penalty(self, observ, old_mean, old_logstd, length):
|
||||
"""Adjust the KL policy between the behavioral and current policy.
|
||||
|
||||
Compute how much the policy actually changed during the multiple
|
||||
update steps. Adjust the penalty strength for the next training phase if we
|
||||
overshot or undershot the target divergence too much.
|
||||
|
||||
Args:
|
||||
observ: Sequences of observations.
|
||||
old_mean: Sequences of action means of the behavioral policy.
|
||||
old_logstd: Sequences of action log stddevs of the behavioral policy.
|
||||
length: Batch of sequence lengths.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
with tf.name_scope('adjust_penalty'):
|
||||
network = self._network(observ, length)
|
||||
assert_change = tf.assert_equal(
|
||||
tf.reduce_all(tf.equal(network.mean, old_mean)), False,
|
||||
message='policy should change')
|
||||
print_penalty = tf.Print(0, [self._penalty], 'current penalty: ')
|
||||
with tf.control_dependencies([assert_change, print_penalty]):
|
||||
kl_change = tf.reduce_mean(self._mask(utility.diag_normal_kl(
|
||||
old_mean, old_logstd, network.mean, network.logstd), length))
|
||||
kl_change = tf.Print(kl_change, [kl_change], 'kl change: ')
|
||||
maybe_increase = tf.cond(
|
||||
kl_change > 1.3 * self._config.kl_target,
|
||||
# pylint: disable=g-long-lambda
|
||||
lambda: tf.Print(self._penalty.assign(
|
||||
self._penalty * 1.5), [0], 'increase penalty '),
|
||||
float)
|
||||
maybe_decrease = tf.cond(
|
||||
kl_change < 0.7 * self._config.kl_target,
|
||||
# pylint: disable=g-long-lambda
|
||||
lambda: tf.Print(self._penalty.assign(
|
||||
self._penalty / 1.5), [0], 'decrease penalty '),
|
||||
float)
|
||||
with tf.control_dependencies([maybe_increase, maybe_decrease]):
|
||||
return tf.summary.merge([
|
||||
tf.summary.scalar('kl_change', kl_change),
|
||||
tf.summary.scalar('penalty', self._penalty)])
|
||||
|
||||
def _mask(self, tensor, length):
|
||||
"""Set padding elements of a batch of sequences to zero.
|
||||
|
||||
Useful to then safely sum along the time dimension.
|
||||
|
||||
Args:
|
||||
tensor: Tensor of sequences.
|
||||
length: Batch of sequence lengths.
|
||||
|
||||
Returns:
|
||||
Masked sequences.
|
||||
"""
|
||||
with tf.name_scope('mask'):
|
||||
range_ = tf.range(tensor.shape[1].value)
|
||||
mask = tf.cast(range_[None, :] < length[:, None], tf.float32)
|
||||
masked = tensor * mask
|
||||
return tf.check_numerics(masked, 'masked')
|
||||
|
||||
def _network(self, observ, length=None, state=None, reuse=True):
|
||||
"""Compute the network output for a batched sequence of observations.
|
||||
|
||||
Optionally, the initial state can be specified. The weights should be
|
||||
reused for all calls, except for the first one. Output is a named tuple
|
||||
containing the policy as a TensorFlow distribution, the policy mean and log
|
||||
standard deviation, the approximated state value, and the new recurrent
|
||||
state.
|
||||
|
||||
Args:
|
||||
observ: Sequences of observations.
|
||||
length: Batch of sequence lengths.
|
||||
state: Batch of initial recurrent states.
|
||||
reuse: Python boolean whether to reuse previous variables.
|
||||
|
||||
Returns:
|
||||
NetworkOutput tuple.
|
||||
"""
|
||||
with tf.variable_scope('network', reuse=reuse):
|
||||
observ = tf.convert_to_tensor(observ)
|
||||
use_gpu = self._config.use_gpu and utility.available_gpus()
|
||||
with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
|
||||
observ = tf.check_numerics(observ, 'observ')
|
||||
cell = self._config.network(self._batch_env.action.shape[1].value)
|
||||
(mean, logstd, value), state = tf.nn.dynamic_rnn(
|
||||
cell, observ, length, state, tf.float32, swap_memory=True)
|
||||
mean = tf.check_numerics(mean, 'mean')
|
||||
logstd = tf.check_numerics(logstd, 'logstd')
|
||||
value = tf.check_numerics(value, 'value')
|
||||
policy = tf.contrib.distributions.MultivariateNormalDiag(
|
||||
mean, tf.exp(logstd))
|
||||
return _NetworkOutput(policy, mean, logstd, value, state)
|
@@ -0,0 +1,152 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Memory that stores episodes."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class EpisodeMemory(object):
|
||||
"""Memory that stores episodes."""
|
||||
|
||||
def __init__(self, template, capacity, max_length, scope):
|
||||
"""Create a memory that stores episodes.
|
||||
|
||||
Each transition tuple consists of quantities specified by the template.
|
||||
These quantities would typically be observations, actions, rewards, and
|
||||
done indicators.
|
||||
|
||||
Args:
|
||||
template: List of tensors to derive shapes and dtypes of each transition.
|
||||
capacity: Number of episodes, or rows, hold by the memory.
|
||||
max_length: Allocated sequence length for the episodes.
|
||||
scope: Variable scope to use for internal variables.
|
||||
"""
|
||||
self._capacity = capacity
|
||||
self._max_length = max_length
|
||||
with tf.variable_scope(scope) as scope:
|
||||
self._scope = scope
|
||||
self._length = tf.Variable(tf.zeros(capacity, tf.int32), False)
|
||||
self._buffers = [
|
||||
tf.Variable(tf.zeros(
|
||||
[capacity, max_length] + elem.shape.as_list(),
|
||||
elem.dtype), False)
|
||||
for elem in template]
|
||||
|
||||
def length(self, rows=None):
|
||||
"""Tensor holding the current length of episodes.
|
||||
|
||||
Args:
|
||||
rows: Episodes to select length from, defaults to all.
|
||||
|
||||
Returns:
|
||||
Batch tensor of sequence lengths.
|
||||
"""
|
||||
rows = tf.range(self._capacity) if rows is None else rows
|
||||
return tf.gather(self._length, rows)
|
||||
|
||||
def append(self, transitions, rows=None):
|
||||
"""Append a batch of transitions to rows of the memory.
|
||||
|
||||
Args:
|
||||
transitions: Tuple of transition quantities with batch dimension.
|
||||
rows: Episodes to append to, defaults to all.
|
||||
|
||||
Returns:
|
||||
Operation.
|
||||
"""
|
||||
rows = tf.range(self._capacity) if rows is None else rows
|
||||
assert rows.shape.ndims == 1
|
||||
assert_capacity = tf.assert_less(
|
||||
rows, self._capacity,
|
||||
message='capacity exceeded')
|
||||
with tf.control_dependencies([assert_capacity]):
|
||||
assert_max_length = tf.assert_less(
|
||||
tf.gather(self._length, rows), self._max_length,
|
||||
message='max length exceeded')
|
||||
append_ops = []
|
||||
with tf.control_dependencies([assert_max_length]):
|
||||
for buffer_, elements in zip(self._buffers, transitions):
|
||||
timestep = tf.gather(self._length, rows)
|
||||
indices = tf.stack([rows, timestep], 1)
|
||||
append_ops.append(tf.scatter_nd_update(buffer_, indices, elements))
|
||||
with tf.control_dependencies(append_ops):
|
||||
episode_mask = tf.reduce_sum(tf.one_hot(
|
||||
rows, self._capacity, dtype=tf.int32), 0)
|
||||
return self._length.assign_add(episode_mask)
|
||||
|
||||
def replace(self, episodes, length, rows=None):
|
||||
"""Replace full episodes.
|
||||
|
||||
Args:
|
||||
episodes: Tuple of transition quantities with batch and time dimensions.
|
||||
length: Batch of sequence lengths.
|
||||
rows: Episodes to replace, defaults to all.
|
||||
|
||||
Returns:
|
||||
Operation.
|
||||
"""
|
||||
rows = tf.range(self._capacity) if rows is None else rows
|
||||
assert rows.shape.ndims == 1
|
||||
assert_capacity = tf.assert_less(
|
||||
rows, self._capacity, message='capacity exceeded')
|
||||
with tf.control_dependencies([assert_capacity]):
|
||||
assert_max_length = tf.assert_less_equal(
|
||||
length, self._max_length, message='max length exceeded')
|
||||
replace_ops = []
|
||||
with tf.control_dependencies([assert_max_length]):
|
||||
for buffer_, elements in zip(self._buffers, episodes):
|
||||
replace_op = tf.scatter_update(buffer_, rows, elements)
|
||||
replace_ops.append(replace_op)
|
||||
with tf.control_dependencies(replace_ops):
|
||||
return tf.scatter_update(self._length, rows, length)
|
||||
|
||||
def data(self, rows=None):
|
||||
"""Access a batch of episodes from the memory.
|
||||
|
||||
Padding elements after the length of each episode are unspecified and might
|
||||
contain old data.
|
||||
|
||||
Args:
|
||||
rows: Episodes to select, defaults to all.
|
||||
|
||||
Returns:
|
||||
Tuple containing a tuple of transition quantities with batch and time
|
||||
dimensions, and a batch of sequence lengths.
|
||||
"""
|
||||
rows = tf.range(self._capacity) if rows is None else rows
|
||||
assert rows.shape.ndims == 1
|
||||
episode = [tf.gather(buffer_, rows) for buffer_ in self._buffers]
|
||||
length = tf.gather(self._length, rows)
|
||||
return episode, length
|
||||
|
||||
def clear(self, rows=None):
|
||||
"""Reset episodes in the memory.
|
||||
|
||||
Internally, this only sets their lengths to zero. The memory entries will
|
||||
be overridden by future calls to append() or replace().
|
||||
|
||||
Args:
|
||||
rows: Episodes to clear, defaults to all.
|
||||
|
||||
Returns:
|
||||
Operation.
|
||||
"""
|
||||
rows = tf.range(self._capacity) if rows is None else rows
|
||||
assert rows.shape.ndims == 1
|
||||
return tf.scatter_update(self._length, rows, tf.zeros_like(rows))
|
@@ -0,0 +1,168 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Normalize tensors based on streaming estimates of mean and variance."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class StreamingNormalize(object):
|
||||
"""Normalize tensors based on streaming estimates of mean and variance."""
|
||||
|
||||
def __init__(
|
||||
self, template, center=True, scale=True, clip=10, name='normalize'):
|
||||
"""Normalize tensors based on streaming estimates of mean and variance.
|
||||
|
||||
Centering the value, scaling it by the standard deviation, and clipping
|
||||
outlier values are optional.
|
||||
|
||||
Args:
|
||||
template: Example tensor providing shape and dtype of the value to track.
|
||||
center: Python boolean indicating whether to subtract mean from values.
|
||||
scale: Python boolean indicating whether to scale values by stddev.
|
||||
clip: If and when to clip normalized values.
|
||||
name: Parent scope of operations provided by this class.
|
||||
"""
|
||||
self._center = center
|
||||
self._scale = scale
|
||||
self._clip = clip
|
||||
self._name = name
|
||||
with tf.name_scope(name):
|
||||
self._count = tf.Variable(0, False)
|
||||
self._mean = tf.Variable(tf.zeros_like(template), False)
|
||||
self._var_sum = tf.Variable(tf.zeros_like(template), False)
|
||||
|
||||
def transform(self, value):
|
||||
"""Normalize a single or batch tensor.
|
||||
|
||||
Applies the activated transformations in the constructor using current
|
||||
estimates of mean and variance.
|
||||
|
||||
Args:
|
||||
value: Batch or single value tensor.
|
||||
|
||||
Returns:
|
||||
Normalized batch or single value tensor.
|
||||
"""
|
||||
with tf.name_scope(self._name + '/transform'):
|
||||
no_batch_dim = value.shape.ndims == self._mean.shape.ndims
|
||||
if no_batch_dim:
|
||||
# Add a batch dimension if necessary.
|
||||
value = value[None, ...]
|
||||
if self._center:
|
||||
value -= self._mean[None, ...]
|
||||
if self._scale:
|
||||
# We cannot scale before seeing at least two samples.
|
||||
value /= tf.cond(
|
||||
self._count > 1, lambda: self._std() + 1e-8,
|
||||
lambda: tf.ones_like(self._var_sum))[None]
|
||||
if self._clip:
|
||||
value = tf.clip_by_value(value, -self._clip, self._clip)
|
||||
# Remove batch dimension if necessary.
|
||||
if no_batch_dim:
|
||||
value = value[0]
|
||||
return tf.check_numerics(value, 'value')
|
||||
|
||||
def update(self, value):
|
||||
"""Update the mean and variance estimates.
|
||||
|
||||
Args:
|
||||
value: Batch or single value tensor.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
with tf.name_scope(self._name + '/update'):
|
||||
if value.shape.ndims == self._mean.shape.ndims:
|
||||
# Add a batch dimension if necessary.
|
||||
value = value[None, ...]
|
||||
count = tf.shape(value)[0]
|
||||
with tf.control_dependencies([self._count.assign_add(count)]):
|
||||
step = tf.cast(self._count, tf.float32)
|
||||
mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0)
|
||||
new_mean = self._mean + mean_delta / step
|
||||
new_mean = tf.cond(self._count > 1, lambda: new_mean, lambda: value[0])
|
||||
var_delta = (
|
||||
value - self._mean[None, ...]) * (value - new_mean[None, ...])
|
||||
new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0)
|
||||
with tf.control_dependencies([new_mean, new_var_sum]):
|
||||
update = self._mean.assign(new_mean), self._var_sum.assign(new_var_sum)
|
||||
with tf.control_dependencies(update):
|
||||
if value.shape.ndims == 1:
|
||||
value = tf.reduce_mean(value)
|
||||
return self._summary('value', tf.reduce_mean(value))
|
||||
|
||||
def reset(self):
|
||||
"""Reset the estimates of mean and variance.
|
||||
|
||||
Resets the full state of this class.
|
||||
|
||||
Returns:
|
||||
Operation.
|
||||
"""
|
||||
with tf.name_scope(self._name + '/reset'):
|
||||
return tf.group(
|
||||
self._count.assign(0),
|
||||
self._mean.assign(tf.zeros_like(self._mean)),
|
||||
self._var_sum.assign(tf.zeros_like(self._var_sum)))
|
||||
|
||||
def summary(self):
|
||||
"""Summary string of mean and standard deviation.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
with tf.name_scope(self._name + '/summary'):
|
||||
mean_summary = tf.cond(
|
||||
self._count > 0, lambda: self._summary('mean', self._mean), str)
|
||||
std_summary = tf.cond(
|
||||
self._count > 1, lambda: self._summary('stddev', self._std()), str)
|
||||
return tf.summary.merge([mean_summary, std_summary])
|
||||
|
||||
def _std(self):
|
||||
"""Computes the current estimate of the standard deviation.
|
||||
|
||||
Note that the standard deviation is not defined until at least two samples
|
||||
were seen.
|
||||
|
||||
Returns:
|
||||
Tensor of current variance.
|
||||
"""
|
||||
variance = tf.cond(
|
||||
self._count > 1,
|
||||
lambda: self._var_sum / tf.cast(self._count - 1, tf.float32),
|
||||
lambda: tf.ones_like(self._var_sum) * float('nan'))
|
||||
# The epsilon corrects for small negative variance values caused by
|
||||
# the algorithm. It was empirically chosen to work with all environments
|
||||
# tested.
|
||||
return tf.sqrt(variance + 1e-4)
|
||||
|
||||
def _summary(self, name, tensor):
|
||||
"""Create a scalar or histogram summary matching the rank of the tensor.
|
||||
|
||||
Args:
|
||||
name: Name for the summary.
|
||||
tensor: Tensor to summarize.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
if tensor.shape.ndims == 0:
|
||||
return tf.summary.scalar(name, tensor)
|
||||
else:
|
||||
return tf.summary.histogram(name, tensor)
|
@@ -0,0 +1,222 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Utilities for the PPO algorithm."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
import math
|
||||
import re
|
||||
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.client import device_lib
|
||||
|
||||
|
||||
def create_nested_vars(tensors):
|
||||
"""Create variables matching a nested tuple of tensors.
|
||||
|
||||
Args:
|
||||
tensors: Nested tuple of list of tensors.
|
||||
|
||||
Returns:
|
||||
Nested tuple or list of variables.
|
||||
"""
|
||||
if isinstance(tensors, (tuple, list)):
|
||||
return type(tensors)(create_nested_vars(tensor) for tensor in tensors)
|
||||
return tf.Variable(tensors, False)
|
||||
|
||||
|
||||
def reinit_nested_vars(variables, indices=None):
|
||||
"""Reset all variables in a nested tuple to zeros.
|
||||
|
||||
Args:
|
||||
variables: Nested tuple or list of variables.
|
||||
indices: Indices along the first dimension to reset, defaults to all.
|
||||
|
||||
Returns:
|
||||
Operation.
|
||||
"""
|
||||
if isinstance(variables, (tuple, list)):
|
||||
return tf.group(*[
|
||||
reinit_nested_vars(variable, indices) for variable in variables])
|
||||
if indices is None:
|
||||
return variables.assign(tf.zeros_like(variables))
|
||||
else:
|
||||
zeros = tf.zeros([tf.shape(indices)[0]] + variables.shape[1:].as_list())
|
||||
return tf.scatter_update(variables, indices, zeros)
|
||||
|
||||
|
||||
def assign_nested_vars(variables, tensors):
|
||||
"""Assign tensors to matching nested tuple of variables.
|
||||
|
||||
Args:
|
||||
variables: Nested tuple or list of variables to update.
|
||||
tensors: Nested tuple or list of tensors to assign.
|
||||
|
||||
Returns:
|
||||
Operation.
|
||||
"""
|
||||
if isinstance(variables, (tuple, list)):
|
||||
return tf.group(*[
|
||||
assign_nested_vars(variable, tensor)
|
||||
for variable, tensor in zip(variables, tensors)])
|
||||
return variables.assign(tensors)
|
||||
|
||||
|
||||
def discounted_return(reward, length, discount):
|
||||
"""Discounted Monte-Carlo returns."""
|
||||
timestep = tf.range(reward.shape[1].value)
|
||||
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
|
||||
return_ = tf.reverse(tf.transpose(tf.scan(
|
||||
lambda agg, cur: cur + discount * agg,
|
||||
tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
|
||||
tf.zeros_like(reward[:, -1]), 1, False), [1, 0]), [1])
|
||||
return tf.check_numerics(tf.stop_gradient(return_), 'return')
|
||||
|
||||
|
||||
def fixed_step_return(reward, value, length, discount, window):
|
||||
"""N-step discounted return."""
|
||||
timestep = tf.range(reward.shape[1].value)
|
||||
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
|
||||
return_ = tf.zeros_like(reward)
|
||||
for _ in range(window):
|
||||
return_ += reward
|
||||
reward = discount * tf.concat(
|
||||
[reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
|
||||
return_ += discount ** window * tf.concat(
|
||||
[value[:, window:], tf.zeros_like(value[:, -window:])], 1)
|
||||
return tf.check_numerics(tf.stop_gradient(mask * return_), 'return')
|
||||
|
||||
|
||||
def lambda_return(reward, value, length, discount, lambda_):
|
||||
"""TD-lambda returns."""
|
||||
timestep = tf.range(reward.shape[1].value)
|
||||
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
|
||||
sequence = mask * reward + discount * value * (1 - lambda_)
|
||||
discount = mask * discount * lambda_
|
||||
sequence = tf.stack([sequence, discount], 2)
|
||||
return_ = tf.reverse(tf.transpose(tf.scan(
|
||||
lambda agg, cur: cur[0] + cur[1] * agg,
|
||||
tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]),
|
||||
tf.zeros_like(value[:, -1]), 1, False), [1, 0]), [1])
|
||||
return tf.check_numerics(tf.stop_gradient(return_), 'return')
|
||||
|
||||
|
||||
def lambda_advantage(reward, value, length, discount):
|
||||
"""Generalized Advantage Estimation."""
|
||||
timestep = tf.range(reward.shape[1].value)
|
||||
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
|
||||
next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
|
||||
delta = reward + discount * next_value - value
|
||||
advantage = tf.reverse(tf.transpose(tf.scan(
|
||||
lambda agg, cur: cur + discount * agg,
|
||||
tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]),
|
||||
tf.zeros_like(delta[:, -1]), 1, False), [1, 0]), [1])
|
||||
return tf.check_numerics(tf.stop_gradient(advantage), 'advantage')
|
||||
|
||||
|
||||
def diag_normal_kl(mean0, logstd0, mean1, logstd1):
|
||||
"""Epirical KL divergence of two normals with diagonal covariance."""
|
||||
logstd0_2, logstd1_2 = 2 * logstd0, 2 * logstd1
|
||||
return 0.5 * (
|
||||
tf.reduce_sum(tf.exp(logstd0_2 - logstd1_2), -1) +
|
||||
tf.reduce_sum((mean1 - mean0) ** 2 / tf.exp(logstd1_2), -1) +
|
||||
tf.reduce_sum(logstd1_2, -1) - tf.reduce_sum(logstd0_2, -1) -
|
||||
mean0.shape[-1].value)
|
||||
|
||||
|
||||
def diag_normal_logpdf(mean, logstd, loc):
|
||||
"""Log density of a normal with diagonal covariance."""
|
||||
constant = -0.5 * math.log(2 * math.pi) - logstd
|
||||
value = -0.5 * ((loc - mean) / tf.exp(logstd)) ** 2
|
||||
return tf.reduce_sum(constant + value, -1)
|
||||
|
||||
|
||||
def diag_normal_entropy(mean, logstd):
|
||||
"""Empirical entropy of a normal with diagonal covariance."""
|
||||
constant = mean.shape[-1].value * math.log(2 * math.pi * math.e)
|
||||
return (constant + tf.reduce_sum(2 * logstd, 1)) / 2
|
||||
|
||||
|
||||
def available_gpus():
|
||||
"""List of GPU device names detected by TensorFlow."""
|
||||
local_device_protos = device_lib.list_local_devices()
|
||||
return [x.name for x in local_device_protos if x.device_type == 'GPU']
|
||||
|
||||
|
||||
def gradient_summaries(grad_vars, groups=None, scope='gradients'):
|
||||
"""Create histogram summaries of the gradient.
|
||||
|
||||
Summaries can be grouped via regexes matching variables names.
|
||||
|
||||
Args:
|
||||
grad_vars: List of (gradient, variable) tuples as returned by optimizers.
|
||||
groups: Mapping of name to regex for grouping summaries.
|
||||
scope: Name scope for this operation.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
groups = groups or {r'all': r'.*'}
|
||||
grouped = collections.defaultdict(list)
|
||||
for grad, var in grad_vars:
|
||||
if grad is None:
|
||||
continue
|
||||
for name, pattern in groups.items():
|
||||
if re.match(pattern, var.name):
|
||||
name = re.sub(pattern, name, var.name)
|
||||
grouped[name].append(grad)
|
||||
for name in groups:
|
||||
if name not in grouped:
|
||||
tf.logging.warn("No variables matching '{}' group.".format(name))
|
||||
summaries = []
|
||||
for name, grads in grouped.items():
|
||||
grads = [tf.reshape(grad, [-1]) for grad in grads]
|
||||
grads = tf.concat(grads, 0)
|
||||
summaries.append(tf.summary.histogram(scope + '/' + name, grads))
|
||||
return tf.summary.merge(summaries)
|
||||
|
||||
|
||||
def variable_summaries(vars_, groups=None, scope='weights'):
|
||||
"""Create histogram summaries for the provided variables.
|
||||
|
||||
Summaries can be grouped via regexes matching variables names.
|
||||
|
||||
Args:
|
||||
vars_: List of variables to summarize.
|
||||
groups: Mapping of name to regex for grouping summaries.
|
||||
scope: Name scope for this operation.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
groups = groups or {r'all': r'.*'}
|
||||
grouped = collections.defaultdict(list)
|
||||
for var in vars_:
|
||||
for name, pattern in groups.items():
|
||||
if re.match(pattern, var.name):
|
||||
name = re.sub(pattern, name, var.name)
|
||||
grouped[name].append(var)
|
||||
for name in groups:
|
||||
if name not in grouped:
|
||||
tf.logging.warn("No variables matching '{}' group.".format(name))
|
||||
summaries = []
|
||||
for name, vars_ in grouped.items():
|
||||
vars_ = [tf.reshape(var, [-1]) for var in vars_]
|
||||
vars_ = tf.concat(vars_, 0)
|
||||
summaries.append(tf.summary.histogram(scope + '/' + name, vars_))
|
||||
return tf.summary.merge(summaries)
|
@@ -0,0 +1,23 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Executable scripts for reinforcement learning."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from . import train
from . import utility
from . import visualize
@@ -0,0 +1,128 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Example configurations using the PPO algorithm."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
# pylint: disable=unused-variable
|
||||
|
||||
from pybullet_envs.minitaur.agents import ppo
|
||||
from pybullet_envs.minitaur.agents.scripts import networks
|
||||
|
||||
|
||||
def default():
|
||||
"""Default configuration for PPO."""
|
||||
# General
|
||||
algorithm = ppo.PPOAlgorithm
|
||||
num_agents = 10
|
||||
eval_episodes = 25
|
||||
use_gpu = False
|
||||
# Network
|
||||
network = networks.ForwardGaussianPolicy
|
||||
weight_summaries = dict(
|
||||
all=r'.*',
|
||||
policy=r'.*/policy/.*',
|
||||
value=r'.*/value/.*')
|
||||
policy_layers = 200, 100
|
||||
value_layers = 200, 100
|
||||
init_mean_factor = 0.05
|
||||
init_logstd = -1
|
||||
# Optimization
|
||||
update_every = 25
|
||||
policy_optimizer = 'AdamOptimizer'
|
||||
value_optimizer = 'AdamOptimizer'
|
||||
update_epochs_policy = 50
|
||||
update_epochs_value = 50
|
||||
policy_lr = 1e-4
|
||||
value_lr = 3e-4
|
||||
# Losses
|
||||
discount = 0.985
|
||||
kl_target = 1e-2
|
||||
kl_cutoff_factor = 2
|
||||
kl_cutoff_coef = 1000
|
||||
kl_init_penalty = 1
|
||||
return locals()
|
||||
|
||||
|
||||
def pendulum():
|
||||
"""Configuration for the pendulum classic control task."""
|
||||
locals().update(default())
|
||||
# Environment
|
||||
env = 'Pendulum-v0'
|
||||
max_length = 200
|
||||
steps = 1e6 # 1M
|
||||
return locals()
|
||||
|
||||
|
||||
def cheetah():
|
||||
"""Configuration for MuJoCo's half cheetah task."""
|
||||
locals().update(default())
|
||||
# Environment
|
||||
env = 'HalfCheetah-v1'
|
||||
max_length = 1000
|
||||
steps = 1e7 # 10M
|
||||
return locals()
|
||||
|
||||
|
||||
def walker():
|
||||
"""Configuration for MuJoCo's walker task."""
|
||||
locals().update(default())
|
||||
# Environment
|
||||
env = 'Walker2d-v1'
|
||||
max_length = 1000
|
||||
steps = 1e7 # 10M
|
||||
return locals()
|
||||
|
||||
|
||||
def reacher():
|
||||
"""Configuration for MuJoCo's reacher task."""
|
||||
locals().update(default())
|
||||
# Environment
|
||||
env = 'Reacher-v1'
|
||||
max_length = 1000
|
||||
steps = 1e7 # 10M
|
||||
return locals()
|
||||
|
||||
|
||||
def hopper():
|
||||
"""Configuration for MuJoCo's hopper task."""
|
||||
locals().update(default())
|
||||
# Environment
|
||||
env = 'Hopper-v1'
|
||||
max_length = 1000
|
||||
steps = 2e7 # 20M
|
||||
return locals()
|
||||
|
||||
|
||||
def ant():
|
||||
"""Configuration for MuJoCo's ant task."""
|
||||
locals().update(default())
|
||||
# Environment
|
||||
env = 'Ant-v1'
|
||||
max_length = 1000
|
||||
steps = 5e7 # 50M
|
||||
return locals()
|
||||
|
||||
|
||||
def humanoid():
|
||||
"""Configuration for MuJoCo's humanoid task."""
|
||||
locals().update(default())
|
||||
# Environment
|
||||
env = 'Humanoid-v1'
|
||||
max_length = 1000
|
||||
steps = 5e7 # 50M
|
||||
return locals()
|
@@ -0,0 +1,167 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Networks for the PPO algorithm defined as recurrent cells."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
_MEAN_WEIGHTS_INITIALIZER = tf.contrib.layers.variance_scaling_initializer(
|
||||
factor=0.1)
|
||||
_LOGSTD_INITIALIZER = tf.random_normal_initializer(-1, 1e-10)
|
||||
|
||||
class LinearGaussianPolicy(tf.contrib.rnn.RNNCell):
|
||||
"""Indepent linear network with a tanh at the end for policy and feedforward network for the value.
|
||||
|
||||
The policy network outputs the mean action and the log standard deviation
|
||||
is learned as an independent parameter vector.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
policy_layers,
|
||||
value_layers,
|
||||
action_size,
|
||||
mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
|
||||
logstd_initializer=_LOGSTD_INITIALIZER):
|
||||
self._policy_layers = policy_layers
|
||||
self._value_layers = value_layers
|
||||
self._action_size = action_size
|
||||
self._mean_weights_initializer = mean_weights_initializer
|
||||
self._logstd_initializer = logstd_initializer
|
||||
|
||||
@property
|
||||
def state_size(self):
|
||||
unused_state_size = 1
|
||||
return unused_state_size
|
||||
|
||||
@property
|
||||
def output_size(self):
|
||||
return (self._action_size, self._action_size, tf.TensorShape([]))
|
||||
|
||||
def __call__(self, observation, state):
|
||||
with tf.variable_scope('policy'):
|
||||
x = tf.contrib.layers.flatten(observation)
|
||||
mean = tf.contrib.layers.fully_connected(
|
||||
x,
|
||||
self._action_size,
|
||||
tf.tanh,
|
||||
weights_initializer=self._mean_weights_initializer)
|
||||
logstd = tf.get_variable('logstd', mean.shape[1:], tf.float32,
|
||||
self._logstd_initializer)
|
||||
logstd = tf.tile(logstd[None, ...],
|
||||
[tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
|
||||
with tf.variable_scope('value'):
|
||||
x = tf.contrib.layers.flatten(observation)
|
||||
for size in self._value_layers:
|
||||
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
|
||||
value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
|
||||
return (mean, logstd, value), state
|
||||
|
||||
|
||||
class ForwardGaussianPolicy(tf.contrib.rnn.RNNCell):
|
||||
"""Independent feed forward networks for policy and value.
|
||||
|
||||
The policy network outputs the mean action and the log standard deviation
|
||||
is learned as an independent parameter vector.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, policy_layers, value_layers, action_size,
|
||||
mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
|
||||
logstd_initializer=_LOGSTD_INITIALIZER):
|
||||
self._policy_layers = policy_layers
|
||||
self._value_layers = value_layers
|
||||
self._action_size = action_size
|
||||
self._mean_weights_initializer = mean_weights_initializer
|
||||
self._logstd_initializer = logstd_initializer
|
||||
|
||||
@property
|
||||
def state_size(self):
|
||||
unused_state_size = 1
|
||||
return unused_state_size
|
||||
|
||||
@property
|
||||
def output_size(self):
|
||||
return (self._action_size, self._action_size, tf.TensorShape([]))
|
||||
|
||||
def __call__(self, observation, state):
|
||||
with tf.variable_scope('policy'):
|
||||
x = tf.contrib.layers.flatten(observation)
|
||||
for size in self._policy_layers:
|
||||
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
|
||||
mean = tf.contrib.layers.fully_connected(
|
||||
x, self._action_size, tf.tanh,
|
||||
weights_initializer=self._mean_weights_initializer)
|
||||
logstd = tf.get_variable(
|
||||
'logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
|
||||
logstd = tf.tile(
|
||||
logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
|
||||
with tf.variable_scope('value'):
|
||||
x = tf.contrib.layers.flatten(observation)
|
||||
for size in self._value_layers:
|
||||
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
|
||||
value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
|
||||
return (mean, logstd, value), state
|
||||
|
||||
|
||||
class RecurrentGaussianPolicy(tf.contrib.rnn.RNNCell):
|
||||
"""Independent recurrent policy and feed forward value networks.
|
||||
|
||||
The policy network outputs the mean action and the log standard deviation
|
||||
is learned as an independent parameter vector. The last policy layer is recurrent
|
||||
and uses a GRU cell.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, policy_layers, value_layers, action_size,
|
||||
mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
|
||||
logstd_initializer=_LOGSTD_INITIALIZER):
|
||||
self._policy_layers = policy_layers
|
||||
self._value_layers = value_layers
|
||||
self._action_size = action_size
|
||||
self._mean_weights_initializer = mean_weights_initializer
|
||||
self._logstd_initializer = logstd_initializer
|
||||
self._cell = tf.contrib.rnn.GRUBlockCell(100)
|
||||
|
||||
@property
|
||||
def state_size(self):
|
||||
return self._cell.state_size
|
||||
|
||||
@property
|
||||
def output_size(self):
|
||||
return (self._action_size, self._action_size, tf.TensorShape([]))
|
||||
|
||||
def __call__(self, observation, state):
|
||||
with tf.variable_scope('policy'):
|
||||
x = tf.contrib.layers.flatten(observation)
|
||||
for size in self._policy_layers[:-1]:
|
||||
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
|
||||
x, state = self._cell(x, state)
|
||||
mean = tf.contrib.layers.fully_connected(
|
||||
x, self._action_size, tf.tanh,
|
||||
weights_initializer=self._mean_weights_initializer)
|
||||
logstd = tf.get_variable(
|
||||
'logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
|
||||
logstd = tf.tile(
|
||||
logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
|
||||
with tf.variable_scope('value'):
|
||||
x = tf.contrib.layers.flatten(observation)
|
||||
for size in self._value_layers:
|
||||
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
|
||||
value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
|
||||
return (mean, logstd, value), state
|
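# A minimal sketch of how the (mean, logstd, value) outputs of the cells above
# can be turned into a diagonal Gaussian action distribution, assuming
# TensorFlow 1.x with tf.contrib available (as the cells themselves require).
# The helper name is illustrative and not part of this commit.
import tensorflow as tf

def gaussian_action_distribution(mean, logstd):
  # mean: [batch, action_size] tanh-squashed means from the policy head.
  # logstd: [batch, action_size] state-independent log standard deviations.
  return tf.contrib.distributions.MultivariateNormalDiag(
      loc=mean, scale_diag=tf.exp(logstd))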
@ -0,0 +1,165 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
r"""Script to train a batch reinforcement learning algorithm.
|
||||
|
||||
Command line:
|
||||
|
||||
python3 -m pybullet_envs.minitaur.agents.scripts.train --logdir=/path/to/logdir --config=pendulum
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import datetime
|
||||
import functools
|
||||
import os
|
||||
|
||||
import gym
|
||||
import tensorflow as tf
|
||||
|
||||
from pybullet_envs.minitaur.agents import tools
|
||||
from pybullet_envs.minitaur.agents.scripts import configs
|
||||
from pybullet_envs.minitaur.agents.scripts import utility
|
||||
|
||||
|
||||
def _create_environment(config):
|
||||
"""Constructor for an instance of the environment.
|
||||
|
||||
Args:
|
||||
config: Object providing configurations via attributes.
|
||||
|
||||
Returns:
|
||||
Wrapped OpenAI Gym environment.
|
||||
"""
|
||||
if isinstance(config.env, str):
|
||||
env = gym.make(config.env)
|
||||
else:
|
||||
env = config.env()
|
||||
if config.max_length:
|
||||
env = tools.wrappers.LimitDuration(env, config.max_length)
|
||||
env = tools.wrappers.RangeNormalize(env)
|
||||
env = tools.wrappers.ClipAction(env)
|
||||
env = tools.wrappers.ConvertTo32Bit(env)
|
||||
return env
|
||||
|
||||
|
||||
def _define_loop(graph, logdir, train_steps, eval_steps):
|
||||
"""Create and configure a training loop with training and evaluation phases.
|
||||
|
||||
Args:
|
||||
graph: Object providing graph elements via attributes.
|
||||
logdir: Log directory for storing checkpoints and summaries.
|
||||
train_steps: Number of training steps per epoch.
|
||||
eval_steps: Number of evaluation steps per epoch.
|
||||
|
||||
Returns:
|
||||
Loop object.
|
||||
"""
|
||||
loop = tools.Loop(
|
||||
logdir, graph.step, graph.should_log, graph.do_report,
|
||||
graph.force_reset)
|
||||
loop.add_phase(
|
||||
'train', graph.done, graph.score, graph.summary, train_steps,
|
||||
report_every=None,
|
||||
log_every=train_steps // 2,
|
||||
checkpoint_every=None,
|
||||
feed={graph.is_training: True})
|
||||
loop.add_phase(
|
||||
'eval', graph.done, graph.score, graph.summary, eval_steps,
|
||||
report_every=eval_steps,
|
||||
log_every=eval_steps // 2,
|
||||
checkpoint_every=10 * eval_steps,
|
||||
feed={graph.is_training: False})
|
||||
return loop
|
||||
|
||||
|
||||
def train(config, env_processes):
|
||||
"""Training and evaluation entry point yielding scores.
|
||||
|
||||
Resolves some configuration attributes, creates environments, graph, and
|
||||
training loop. By default, assigns all operations to the CPU.
|
||||
|
||||
Args:
|
||||
config: Object providing configurations via attributes.
|
||||
env_processes: Whether to step environments in separate processes.
|
||||
|
||||
Yields:
|
||||
Evaluation scores.
|
||||
"""
|
||||
tf.reset_default_graph()
|
||||
with config.unlocked:
|
||||
config.network = functools.partial(
|
||||
utility.define_network, config.network, config)
|
||||
config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
|
||||
config.value_optimizer = getattr(tf.train, config.value_optimizer)
|
||||
if config.update_every % config.num_agents:
|
||||
tf.logging.warn('Number of agents should divide episodes per update.')
|
||||
with tf.device('/cpu:0'):
|
||||
batch_env = utility.define_batch_env(
|
||||
lambda: _create_environment(config),
|
||||
config.num_agents, env_processes)
|
||||
graph = utility.define_simulation_graph(
|
||||
batch_env, config.algorithm, config)
|
||||
loop = _define_loop(
|
||||
graph, config.logdir,
|
||||
config.update_every * config.max_length,
|
||||
config.eval_episodes * config.max_length)
|
||||
total_steps = int(
|
||||
config.steps / config.update_every *
|
||||
(config.update_every + config.eval_episodes))
|
||||
# Exclude episode related variables since the Python state of environments is
|
||||
# not checkpointed and thus new episodes start after resuming.
|
||||
saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
|
||||
sess_config = tf.ConfigProto(allow_soft_placement=True)
|
||||
sess_config.gpu_options.allow_growth = True
|
||||
with tf.Session(config=sess_config) as sess:
|
||||
utility.initialize_variables(sess, saver, config.logdir)
|
||||
for score in loop.run(sess, saver, total_steps):
|
||||
yield score
|
||||
batch_env.close()
|
||||
|
||||
|
||||
def main(_):
|
||||
"""Create or load configuration and launch the trainer."""
|
||||
utility.set_up_logging()
|
||||
if not FLAGS.config:
|
||||
raise KeyError('You must specify a configuration.')
|
||||
logdir = FLAGS.logdir and os.path.expanduser(os.path.join(
|
||||
FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config)))
|
||||
try:
|
||||
config = utility.load_config(logdir)
|
||||
except IOError:
|
||||
config = tools.AttrDict(getattr(configs, FLAGS.config)())
|
||||
config = utility.save_config(config, logdir)
|
||||
for score in train(config, FLAGS.env_processes):
|
||||
tf.logging.info('Score {}.'.format(score))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
FLAGS = tf.app.flags.FLAGS
|
||||
tf.app.flags.DEFINE_string(
|
||||
'logdir', None,
|
||||
'Base directory to store logs.')
|
||||
tf.app.flags.DEFINE_string(
|
||||
'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
|
||||
'Sub directory to store logs.')
|
||||
tf.app.flags.DEFINE_string(
|
||||
'config', None,
|
||||
'Configuration to execute.')
|
||||
tf.app.flags.DEFINE_boolean(
|
||||
'env_processes', True,
|
||||
'Step environments in separate processes to circumvent the GIL.')
|
||||
tf.app.run()
|
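# A minimal sketch of driving train() programmatically instead of through the
# command-line flags, mirroring the PPO smoke tests below; assumes gym provides
# the configured environment and the packages from this commit are importable.
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs
from pybullet_envs.minitaur.agents.scripts import train

config = tools.AttrDict(configs.pendulum())
with config.unlocked:
  config.steps = 1000  # illustrative short run; the default is 1e6
for score in train.train(config, env_processes=False):
  print('Evaluation score:', score)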
@ -0,0 +1,110 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Tests for the PPO algorithm usage example."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import functools
|
||||
import itertools
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from pybullet_envs.minitaur.agents import ppo
|
||||
from pybullet_envs.minitaur.agents import tools
|
||||
from pybullet_envs.minitaur.agents.scripts import configs
|
||||
from pybullet_envs.minitaur.agents.scripts import networks
|
||||
from pybullet_envs.minitaur.agents.scripts import train
|
||||
|
||||
|
||||
FLAGS = tf.app.flags.FLAGS
|
||||
|
||||
|
||||
class PPOTest(tf.test.TestCase):
|
||||
|
||||
def test_no_crash_cheetah(self):
|
||||
nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
|
||||
for network in nets:
|
||||
config = self._define_config()
|
||||
with config.unlocked:
|
||||
config.env = 'HalfCheetah-v1'
|
||||
config.max_length = 200
|
||||
config.steps = 1000
|
||||
config.network = network
|
||||
for score in train.train(config, env_processes=True):
|
||||
float(score)
|
||||
|
||||
def test_no_crash_ant(self):
|
||||
nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
|
||||
for network in nets:
|
||||
config = self._define_config()
|
||||
with config.unlocked:
|
||||
config.env = 'Ant-v1'
|
||||
config.max_length = 200
|
||||
config.steps = 1000
|
||||
config.network = network
|
||||
for score in train.train(config, env_processes=True):
|
||||
float(score)
|
||||
|
||||
def test_no_crash_observation_shape(self):
|
||||
nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
|
||||
observ_shapes = (1,), (2, 3), (2, 3, 4)
|
||||
for network, observ_shape in itertools.product(nets, observ_shapes):
|
||||
config = self._define_config()
|
||||
with config.unlocked:
|
||||
config.env = functools.partial(
|
||||
tools.MockEnvironment, observ_shape, action_shape=(3,),
|
||||
min_duration=15, max_duration=15)
|
||||
config.max_length = 20
|
||||
config.steps = 100
|
||||
config.network = network
|
||||
for score in train.train(config, env_processes=False):
|
||||
float(score)
|
||||
|
||||
def test_no_crash_variable_duration(self):
|
||||
config = self._define_config()
|
||||
with config.unlocked:
|
||||
config.env = functools.partial(
|
||||
tools.MockEnvironment, observ_shape=(2, 3), action_shape=(3,),
|
||||
min_duration=5, max_duration=25)
|
||||
config.max_length = 25
|
||||
config.steps = 200
|
||||
config.network = networks.RecurrentGaussianPolicy
|
||||
for score in train.train(config, env_processes=False):
|
||||
float(score)
|
||||
|
||||
def _define_config(self):
|
||||
# Start from the example configuration.
|
||||
locals().update(configs.default())
|
||||
# pylint: disable=unused-variable
|
||||
# General
|
||||
algorithm = ppo.PPOAlgorithm
|
||||
num_agents = 2
|
||||
update_every = 4
|
||||
use_gpu = False
|
||||
# Network
|
||||
policy_layers = 20, 10
|
||||
value_layers = 20, 10
|
||||
# Optimization
|
||||
update_epochs_policy = 2
|
||||
update_epochs_value = 2
|
||||
# pylint: enable=unused-variable
|
||||
return tools.AttrDict(locals())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
FLAGS.config = 'unused'
|
||||
tf.test.main()
|
@ -0,0 +1,213 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Utilities for using reinforcement learning algorithms."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
import ruamel.yaml as yaml
|
||||
import tensorflow as tf
|
||||
|
||||
from pybullet_envs.minitaur.agents import tools
|
||||
|
||||
|
||||
def define_simulation_graph(batch_env, algo_cls, config):
|
||||
"""Define the algortihm and environment interaction.
|
||||
|
||||
Args:
|
||||
batch_env: In-graph environments object.
|
||||
algo_cls: Constructor of a batch algorithm.
|
||||
config: Configuration object for the algorithm.
|
||||
|
||||
Returns:
|
||||
Object providing graph elements via attributes.
|
||||
"""
|
||||
# pylint: disable=unused-variable
|
||||
step = tf.Variable(0, False, dtype=tf.int32, name='global_step')
|
||||
is_training = tf.placeholder(tf.bool, name='is_training')
|
||||
should_log = tf.placeholder(tf.bool, name='should_log')
|
||||
do_report = tf.placeholder(tf.bool, name='do_report')
|
||||
force_reset = tf.placeholder(tf.bool, name='force_reset')
|
||||
algo = algo_cls(batch_env, step, is_training, should_log, config)
|
||||
done, score, summary = tools.simulate(
|
||||
batch_env, algo, should_log, force_reset)
|
||||
message = 'Graph contains {} trainable variables.'
|
||||
tf.logging.info(message.format(tools.count_weights()))
|
||||
# pylint: enable=unused-variable
|
||||
return tools.AttrDict(locals())
|
||||
|
||||
|
||||
def define_batch_env(constructor, num_agents, env_processes):
|
||||
"""Create environments and apply all desired wrappers.
|
||||
|
||||
Args:
|
||||
constructor: Constructor of an OpenAI gym environment.
|
||||
num_agents: Number of environments to combine in the batch.
|
||||
env_processes: Whether to step environment in external processes.
|
||||
|
||||
Returns:
|
||||
In-graph environments object.
|
||||
"""
|
||||
with tf.variable_scope('environments'):
|
||||
if env_processes:
|
||||
envs = [
|
||||
tools.wrappers.ExternalProcess(constructor)
|
||||
for _ in range(num_agents)]
|
||||
else:
|
||||
envs = [constructor() for _ in range(num_agents)]
|
||||
batch_env = tools.BatchEnv(envs, blocking=not env_processes)
|
||||
batch_env = tools.InGraphBatchEnv(batch_env)
|
||||
return batch_env
|
||||
|
||||
|
||||
def define_saver(exclude=None):
|
||||
"""Create a saver for the variables we want to checkpoint.
|
||||
|
||||
Args:
|
||||
exclude: List of regexes to match variable names to exclude.
|
||||
|
||||
Returns:
|
||||
Saver object.
|
||||
"""
|
||||
variables = []
|
||||
exclude = exclude or []
|
||||
exclude = [re.compile(regex) for regex in exclude]
|
||||
for variable in tf.global_variables():
|
||||
if any(regex.match(variable.name) for regex in exclude):
|
||||
continue
|
||||
variables.append(variable)
|
||||
saver = tf.train.Saver(variables, keep_checkpoint_every_n_hours=5)
|
||||
return saver
|
||||
|
||||
|
||||
def define_network(constructor, config, action_size):
|
||||
"""Constructor for the recurrent cell for the algorithm.
|
||||
|
||||
Args:
|
||||
constructor: Callable returning the network as RNNCell.
|
||||
config: Object providing configurations via attributes.
|
||||
action_size: Integer indicating the amount of action dimensions.
|
||||
|
||||
Returns:
|
||||
Created recurrent cell object.
|
||||
"""
|
||||
mean_weights_initializer = (
|
||||
tf.contrib.layers.variance_scaling_initializer(
|
||||
factor=config.init_mean_factor))
|
||||
logstd_initializer = tf.random_normal_initializer(
|
||||
config.init_logstd, 1e-10)
|
||||
network = constructor(
|
||||
config.policy_layers, config.value_layers, action_size,
|
||||
mean_weights_initializer=mean_weights_initializer,
|
||||
logstd_initializer=logstd_initializer)
|
||||
return network
|
||||
|
||||
|
||||
def initialize_variables(sess, saver, logdir, checkpoint=None, resume=None):
|
||||
"""Initialize or restore variables from a checkpoint if available.
|
||||
|
||||
Args:
|
||||
sess: Session to initialize variables in.
|
||||
saver: Saver to restore variables.
|
||||
logdir: Directory to search for checkpoints.
|
||||
checkpoint: Specify what checkpoint name to use; defaults to most recent.
|
||||
resume: Whether to expect recovering a checkpoint or starting a new run.
|
||||
|
||||
Raises:
|
||||
ValueError: If resume expected but no log directory specified.
|
||||
RuntimeError: If no resume expected but a checkpoint was found.
|
||||
"""
|
||||
sess.run(tf.group(
|
||||
tf.local_variables_initializer(),
|
||||
tf.global_variables_initializer()))
|
||||
if resume and not (logdir or checkpoint):
|
||||
raise ValueError('Need to specify logdir to resume a checkpoint.')
|
||||
if logdir:
|
||||
state = tf.train.get_checkpoint_state(logdir)
|
||||
if checkpoint:
|
||||
checkpoint = os.path.join(logdir, checkpoint)
|
||||
if not checkpoint and state and state.model_checkpoint_path:
|
||||
checkpoint = state.model_checkpoint_path
|
||||
if checkpoint and resume is False:
|
||||
message = 'Found unexpected checkpoint when starting a new run.'
|
||||
raise RuntimeError(message)
|
||||
if checkpoint:
|
||||
saver.restore(sess, checkpoint)
|
||||
|
||||
|
||||
def save_config(config, logdir=None):
|
||||
"""Save a new configuration by name.
|
||||
|
||||
If a logging directory is specified, it will be created and the configuration
|
||||
will be stored there. Otherwise, a log message will be printed.
|
||||
|
||||
Args:
|
||||
config: Configuration object.
|
||||
logdir: Location for writing summaries and checkpoints if specified.
|
||||
|
||||
Returns:
|
||||
Configuration object.
|
||||
"""
|
||||
if logdir:
|
||||
with config.unlocked:
|
||||
config.logdir = logdir
|
||||
message = 'Start a new run and write summaries and checkpoints to {}.'
|
||||
tf.logging.info(message.format(config.logdir))
|
||||
tf.gfile.MakeDirs(config.logdir)
|
||||
config_path = os.path.join(config.logdir, 'config.yaml')
|
||||
with tf.gfile.FastGFile(config_path, 'w') as file_:
|
||||
yaml.dump(config, file_, default_flow_style=False)
|
||||
else:
|
||||
message = (
|
||||
'Start a new run without storing summaries and checkpoints since no '
|
||||
'logging directory was specified.')
|
||||
tf.logging.info(message)
|
||||
return config
|
||||
|
||||
|
||||
def load_config(logdir):
|
||||
"""Load a configuration from the log directory.
|
||||
|
||||
Args:
|
||||
logdir: The logging directory containing the configuration file.
|
||||
|
||||
Raises:
|
||||
IOError: The logging directory does not contain a configuration file.
|
||||
|
||||
Returns:
|
||||
Configuration object.
|
||||
"""
|
||||
config_path = logdir and os.path.join(logdir, 'config.yaml')
|
||||
if not config_path or not tf.gfile.Exists(config_path):
|
||||
message = (
|
||||
'Cannot resume an existing run since the logging directory does not '
|
||||
'contain a configuration file.')
|
||||
raise IOError(message)
|
||||
with tf.gfile.FastGFile(config_path, 'r') as file_:
|
||||
config = yaml.load(file_)
|
||||
message = 'Resume run and write summaries and checkpoints to {}.'
|
||||
tf.logging.info(message.format(config.logdir))
|
||||
return config
|
||||
|
||||
|
||||
def set_up_logging():
|
||||
"""Configure the TensorFlow logger."""
|
||||
tf.logging.set_verbosity(tf.logging.INFO)
|
||||
logging.getLogger('tensorflow').propagate = False
|
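# A minimal sketch of the save_config()/load_config() round trip used by
# train.py, assuming the packages from this commit are importable; the log
# directory path is only a placeholder.
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs
from pybullet_envs.minitaur.agents.scripts import utility

logdir = '/tmp/ppo_demo_run'  # hypothetical location
config = tools.AttrDict(configs.pendulum())
config = utility.save_config(config, logdir)  # writes <logdir>/config.yaml
restored = utility.load_config(logdir)        # reads it back
assert restored.env == config.env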
@ -0,0 +1,157 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
r"""Script to render videos of the Proximal Policy Gradient algorithm.
|
||||
|
||||
Command line:
|
||||
|
||||
python3 -m pybullet_envs.minitaur.agents.scripts.visualize \
|
||||
--logdir=/path/to/logdir/<time>-<config> --outdir=/path/to/outdir/
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import functools
|
||||
import os
|
||||
|
||||
import gym
|
||||
import tensorflow as tf
|
||||
|
||||
from pybullet_envs.minitaur.agents import tools
|
||||
from pybullet_envs.minitaur.agents.scripts import utility
|
||||
|
||||
|
||||
def _create_environment(config, outdir):
|
||||
"""Constructor for an instance of the environment.
|
||||
|
||||
Args:
|
||||
config: Object providing configurations via attributes.
|
||||
outdir: Directory to store videos in.
|
||||
|
||||
Returns:
|
||||
Wrapped OpenAI Gym environment.
|
||||
"""
|
||||
if isinstance(config.env, str):
|
||||
env = gym.make(config.env)
|
||||
else:
|
||||
env = config.env()
|
||||
# Ensure that the environment has the specification attribute set as expected
|
||||
# by the monitor wrapper.
|
||||
if not hasattr(env, 'spec'):
|
||||
setattr(env, 'spec', getattr(env, 'spec', None))
|
||||
if config.max_length:
|
||||
env = tools.wrappers.LimitDuration(env, config.max_length)
|
||||
# env = gym.wrappers.Monitor(
|
||||
# env, outdir, lambda unused_episode_number: True)
|
||||
env = tools.wrappers.RangeNormalize(env)
|
||||
env = tools.wrappers.ClipAction(env)
|
||||
env = tools.wrappers.ConvertTo32Bit(env)
|
||||
return env
|
||||
|
||||
|
||||
def _define_loop(graph, eval_steps):
|
||||
"""Create and configure an evaluation loop.
|
||||
|
||||
Args:
|
||||
graph: Object providing graph elements via attributes.
|
||||
eval_steps: Number of evaluation steps per epoch.
|
||||
|
||||
Returns:
|
||||
Loop object.
|
||||
"""
|
||||
loop = tools.Loop(
|
||||
None, graph.step, graph.should_log, graph.do_report, graph.force_reset)
|
||||
loop.add_phase(
|
||||
'eval', graph.done, graph.score, graph.summary, eval_steps,
|
||||
report_every=eval_steps,
|
||||
log_every=None,
|
||||
checkpoint_every=None,
|
||||
feed={graph.is_training: False})
|
||||
return loop
|
||||
|
||||
|
||||
def visualize(
|
||||
logdir, outdir, num_agents, num_episodes, checkpoint=None,
|
||||
env_processes=True):
|
||||
"""Recover checkpoint and render videos from it.
|
||||
|
||||
Args:
|
||||
logdir: Logging directory of the trained algorithm.
|
||||
outdir: Directory to store rendered videos in.
|
||||
num_agents: Number of environments to simulate in parallel.
|
||||
num_episodes: Total number of episodes to simulate.
|
||||
checkpoint: Checkpoint name to load; defaults to most recent.
|
||||
env_processes: Whether to step environments in separate processes.
|
||||
"""
|
||||
config = utility.load_config(logdir)
|
||||
with config.unlocked:
|
||||
config.network = functools.partial(
|
||||
utility.define_network, config.network, config)
|
||||
config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
|
||||
config.value_optimizer = getattr(tf.train, config.value_optimizer)
|
||||
with tf.device('/cpu:0'):
|
||||
batch_env = utility.define_batch_env(
|
||||
lambda: _create_environment(config, outdir),
|
||||
num_agents, env_processes)
|
||||
graph = utility.define_simulation_graph(
|
||||
batch_env, config.algorithm, config)
|
||||
total_steps = num_episodes * config.max_length
|
||||
loop = _define_loop(graph, total_steps)
|
||||
saver = utility.define_saver(
|
||||
exclude=(r'.*_temporary/.*', r'global_step'))
|
||||
sess_config = tf.ConfigProto(allow_soft_placement=True)
|
||||
sess_config.gpu_options.allow_growth = True
|
||||
with tf.Session(config=sess_config) as sess:
|
||||
utility.initialize_variables(
|
||||
sess, saver, config.logdir, checkpoint, resume=True)
|
||||
for unused_score in loop.run(sess, saver, total_steps):
|
||||
pass
|
||||
batch_env.close()
|
||||
|
||||
|
||||
def main(_):
|
||||
"""Load a trained algorithm and render videos."""
|
||||
utility.set_up_logging()
|
||||
if not FLAGS.logdir or not FLAGS.outdir:
|
||||
raise KeyError('You must specify logging and output directories.')
|
||||
FLAGS.logdir = os.path.expanduser(FLAGS.logdir)
|
||||
FLAGS.outdir = os.path.expanduser(FLAGS.outdir)
|
||||
visualize(
|
||||
FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents, FLAGS.num_episodes,
|
||||
FLAGS.checkpoint, FLAGS.env_processes)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
FLAGS = tf.app.flags.FLAGS
|
||||
tf.app.flags.DEFINE_string(
|
||||
'logdir', None,
|
||||
'Directory to the checkpoint of a training run.')
|
||||
tf.app.flags.DEFINE_string(
|
||||
'outdir', None,
|
||||
'Local directory for storing the monitoring outdir.')
|
||||
tf.app.flags.DEFINE_string(
|
||||
'checkpoint', None,
|
||||
'Checkpoint name to load; defaults to most recent.')
|
||||
tf.app.flags.DEFINE_integer(
|
||||
'num_agents', 1,
|
||||
'How many environments to step in parallel.')
|
||||
tf.app.flags.DEFINE_integer(
|
||||
'num_episodes', 5,
|
||||
'Minimum number of episodes to render.')
|
||||
tf.app.flags.DEFINE_boolean(
|
||||
'env_processes', True,
|
||||
'Step environments in separate processes to circumvent the GIL.')
|
||||
tf.app.run()
|
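# A minimal sketch of calling visualize() directly instead of through the
# flags; both directory paths are placeholders for a finished training run
# produced by the train script above.
from pybullet_envs.minitaur.agents.scripts import visualize

visualize.visualize(
    logdir='/tmp/ppo_demo_run/20170101T000000-pendulum',  # hypothetical run
    outdir='/tmp/ppo_demo_videos',
    num_agents=1,
    num_episodes=5,
    env_processes=False)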
@ -0,0 +1,30 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Tools for reinforcement learning."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from .attr_dict import AttrDict
|
||||
from .batch_env import BatchEnv
|
||||
from .count_weights import count_weights
|
||||
from .in_graph_batch_env import InGraphBatchEnv
|
||||
from .in_graph_env import InGraphEnv
|
||||
from .loop import Loop
|
||||
from .mock_algorithm import MockAlgorithm
|
||||
from .mock_environment import MockEnvironment
|
||||
from .simulate import simulate
|
||||
from .streaming_mean import StreamingMean
|
@ -0,0 +1,54 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Wrap a dictionary to access keys as attributes."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import contextlib
|
||||
|
||||
|
||||
class AttrDict(dict):
|
||||
"""Wrap a dictionary to access keys as attributes."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(AttrDict, self).__init__(*args, **kwargs)
|
||||
super(AttrDict, self).__setattr__('_mutable', False)
|
||||
|
||||
def __getattr__(self, key):
|
||||
# Do not provide None for unimplemented magic attributes.
|
||||
if key.startswith('__'):
|
||||
raise AttributeError
|
||||
return self.get(key, None)
|
||||
|
||||
def __setattr__(self, key, value):
|
||||
if not self._mutable:
|
||||
message = "Cannot set attribute '{}'.".format(key)
|
||||
message += " Use 'with obj.unlocked:' scope to set attributes."
|
||||
raise RuntimeError(message)
|
||||
if key.startswith('__'):
|
||||
raise AttributeError("Cannot set magic attribute '{}'".format(key))
|
||||
self[key] = value
|
||||
|
||||
@property
|
||||
@contextlib.contextmanager
|
||||
def unlocked(self):
|
||||
super(AttrDict, self).__setattr__('_mutable', True)
|
||||
yield
|
||||
super(AttrDict, self).__setattr__('_mutable', False)
|
||||
|
||||
def copy(self):
|
||||
return type(self)(super(AttrDict, self).copy())
|
@ -0,0 +1,71 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Tests for the attribute dictionary."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from pybullet_envs.minitaur.agents.tools import attr_dict
|
||||
|
||||
|
||||
class AttrDictTest(tf.test.TestCase):
|
||||
|
||||
def test_construct_from_dict(self):
|
||||
initial = dict(foo=13, bar=42)
|
||||
obj = attr_dict.AttrDict(initial)
|
||||
self.assertEqual(13, obj.foo)
|
||||
self.assertEqual(42, obj.bar)
|
||||
|
||||
def test_construct_from_kwargs(self):
|
||||
obj = attr_dict.AttrDict(foo=13, bar=42)
|
||||
self.assertEqual(13, obj.foo)
|
||||
self.assertEqual(42, obj.bar)
|
||||
|
||||
def test_has_attribute(self):
|
||||
obj = attr_dict.AttrDict(foo=13)
|
||||
self.assertTrue('foo' in obj)
|
||||
self.assertFalse('bar' in obj)
|
||||
|
||||
def test_access_default(self):
|
||||
obj = attr_dict.AttrDict()
|
||||
self.assertEqual(None, obj.foo)
|
||||
|
||||
def test_access_magic(self):
|
||||
obj = attr_dict.AttrDict()
|
||||
with self.assertRaises(AttributeError):
|
||||
obj.__getstate__ # pylint: disable=pointless-statement
|
||||
|
||||
def test_immutable_create(self):
|
||||
obj = attr_dict.AttrDict()
|
||||
with self.assertRaises(RuntimeError):
|
||||
obj.foo = 42
|
||||
|
||||
def test_immutable_modify(self):
|
||||
obj = attr_dict.AttrDict(foo=13)
|
||||
with self.assertRaises(RuntimeError):
|
||||
obj.foo = 42
|
||||
|
||||
def test_immutable_unlocked(self):
|
||||
obj = attr_dict.AttrDict()
|
||||
with obj.unlocked:
|
||||
obj.foo = 42
|
||||
self.assertEqual(42, obj.foo)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
tf.test.main()
|
@ -0,0 +1,125 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Combine multiple environments to step them in batch."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
class BatchEnv(object):
|
||||
"""Combine multiple environments to step them in batch."""
|
||||
|
||||
def __init__(self, envs, blocking):
|
||||
"""Combine multiple environments to step them in batch.
|
||||
|
||||
To step environments in parallel, environments must support a
|
||||
`blocking=False` argument to their step and reset functions that makes them
|
||||
return callables instead to receive the result at a later time.
|
||||
|
||||
Args:
|
||||
envs: List of environments.
|
||||
blocking: Step environments one after another rather than in parallel.
|
||||
|
||||
Raises:
|
||||
ValueError: Environments have different observation or action spaces.
|
||||
"""
|
||||
self._envs = envs
|
||||
self._blocking = blocking
|
||||
observ_space = self._envs[0].observation_space
|
||||
if not all(env.observation_space == observ_space for env in self._envs):
|
||||
raise ValueError('All environments must use the same observation space.')
|
||||
action_space = self._envs[0].action_space
|
||||
if not all(env.action_space == action_space for env in self._envs):
|
||||
raise ValueError('All environments must use the same action space.')
|
||||
|
||||
def __len__(self):
|
||||
"""Number of combined environments."""
|
||||
return len(self._envs)
|
||||
|
||||
def __getitem__(self, index):
|
||||
"""Access an underlying environment by index."""
|
||||
return self._envs[index]
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""Forward unimplemented attributes to one of the original environments.
|
||||
|
||||
Args:
|
||||
name: Attribute that was accessed.
|
||||
|
||||
Returns:
|
||||
Value behind the attribute name in one of the wrapped environments.
|
||||
"""
|
||||
return getattr(self._envs[0], name)
|
||||
|
||||
def step(self, action):
|
||||
"""Forward a batch of actions to the wrapped environments.
|
||||
|
||||
Args:
|
||||
action: Batched action to apply to the environment.
|
||||
|
||||
Raises:
|
||||
ValueError: Invalid actions.
|
||||
|
||||
Returns:
|
||||
Batch of observations, rewards, and done flags.
|
||||
"""
|
||||
actions = action
|
||||
for index, (env, action) in enumerate(zip(self._envs, actions)):
|
||||
if not env.action_space.contains(action):
|
||||
message = 'Invalid action at index {}: {}'
|
||||
raise ValueError(message.format(index, action))
|
||||
if self._blocking:
|
||||
transitions = [
|
||||
env.step(action)
|
||||
for env, action in zip(self._envs, actions)]
|
||||
else:
|
||||
transitions = [
|
||||
env.step(action, blocking=False)
|
||||
for env, action in zip(self._envs, actions)]
|
||||
transitions = [transition() for transition in transitions]
|
||||
observs, rewards, dones, infos = zip(*transitions)
|
||||
observ = np.stack(observs)
|
||||
reward = np.stack(rewards)
|
||||
done = np.stack(dones)
|
||||
info = tuple(infos)
|
||||
return observ, reward, done, info
|
||||
|
||||
def reset(self, indices=None):
|
||||
"""Reset the environment and convert the resulting observation.
|
||||
|
||||
Args:
|
||||
indices: The batch indices of environments to reset; defaults to all.
|
||||
|
||||
Returns:
|
||||
Batch of observations.
|
||||
"""
|
||||
if indices is None:
|
||||
indices = np.arange(len(self._envs))
|
||||
if self._blocking:
|
||||
observs = [self._envs[index].reset() for index in indices]
|
||||
else:
|
||||
observs = [self._envs[index].reset(blocking=False) for index in indices]
|
||||
observs = [observ() for observ in observs]
|
||||
observ = np.stack(observs)
|
||||
return observ
|
||||
|
||||
def close(self):
|
||||
"""Send close messages to the external process and join them."""
|
||||
for env in self._envs:
|
||||
if hasattr(env, 'close'):
|
||||
env.close()
|
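# A minimal sketch of combining two gym environments and stepping them with a
# batch of sampled actions, assuming gym provides Pendulum-v0; blocking mode is
# used because plain gym environments have no blocking= keyword.
import gym
import numpy as np
from pybullet_envs.minitaur.agents import tools

envs = [gym.make('Pendulum-v0') for _ in range(2)]
batch_env = tools.BatchEnv(envs, blocking=True)
observ = batch_env.reset()  # stacked observations, one row per environment
actions = np.stack([env.action_space.sample() for env in envs])
observ, reward, done, info = batch_env.step(actions)
batch_env.close()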
@ -0,0 +1,48 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Count learnable parameters."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def count_weights(scope=None, exclude=None, graph=None):
|
||||
"""Count learnable parameters.
|
||||
|
||||
Args:
|
||||
scope: Restrict the count to a variable scope.
|
||||
exclude: Regex to match variable names to exclude.
|
||||
graph: Operate on a graph other than the current default graph.
|
||||
|
||||
Returns:
|
||||
Number of learnable parameters as integer.
|
||||
"""
|
||||
if scope:
|
||||
scope = scope if scope.endswith('/') else scope + '/'
|
||||
graph = graph or tf.get_default_graph()
|
||||
vars_ = graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
|
||||
if scope:
|
||||
vars_ = [var for var in vars_ if var.name.startswith(scope)]
|
||||
if exclude:
|
||||
exclude = re.compile(exclude)
|
||||
vars_ = [var for var in vars_ if not exclude.match(var.name)]
|
||||
shapes = [var.get_shape().as_list() for var in vars_]
|
||||
return int(sum(np.prod(shape) for shape in shapes))
|
@ -0,0 +1,98 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Tests for the weight counting utility."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from pybullet_envs.minitaur.agents.tools import count_weights
|
||||
|
||||
|
||||
class CountWeightsTest(tf.test.TestCase):
|
||||
|
||||
def test_count_trainable(self):
|
||||
tf.Variable(tf.zeros((5, 3)), trainable=True)
|
||||
tf.Variable(tf.zeros((1, 1)), trainable=True)
|
||||
tf.Variable(tf.zeros((5,)), trainable=True)
|
||||
self.assertEqual(15 + 1 + 5, count_weights())
|
||||
|
||||
def test_ignore_non_trainable(self):
|
||||
tf.Variable(tf.zeros((5, 3)), trainable=False)
|
||||
tf.Variable(tf.zeros((1, 1)), trainable=False)
|
||||
tf.Variable(tf.zeros((5,)), trainable=False)
|
||||
self.assertEqual(0, count_weights())
|
||||
|
||||
def test_trainable_and_non_trainable(self):
|
||||
tf.Variable(tf.zeros((5, 3)), trainable=True)
|
||||
tf.Variable(tf.zeros((8, 2)), trainable=False)
|
||||
tf.Variable(tf.zeros((1, 1)), trainable=True)
|
||||
tf.Variable(tf.zeros((5,)), trainable=True)
|
||||
tf.Variable(tf.zeros((3, 1)), trainable=False)
|
||||
self.assertEqual(15 + 1 + 5, count_weights())
|
||||
|
||||
def test_include_scopes(self):
|
||||
tf.Variable(tf.zeros((3, 2)), trainable=True)
|
||||
with tf.variable_scope('foo'):
|
||||
tf.Variable(tf.zeros((5, 2)), trainable=True)
|
||||
self.assertEqual(6 + 10, count_weights())
|
||||
|
||||
def test_restrict_scope(self):
|
||||
tf.Variable(tf.zeros((3, 2)), trainable=True)
|
||||
with tf.variable_scope('foo'):
|
||||
tf.Variable(tf.zeros((5, 2)), trainable=True)
|
||||
with tf.variable_scope('bar'):
|
||||
tf.Variable(tf.zeros((1, 2)), trainable=True)
|
||||
self.assertEqual(10 + 2, count_weights('foo'))
|
||||
|
||||
def test_restrict_nested_scope(self):
|
||||
tf.Variable(tf.zeros((3, 2)), trainable=True)
|
||||
with tf.variable_scope('foo'):
|
||||
tf.Variable(tf.zeros((5, 2)), trainable=True)
|
||||
with tf.variable_scope('bar'):
|
||||
tf.Variable(tf.zeros((1, 2)), trainable=True)
|
||||
self.assertEqual(2, count_weights('foo/bar'))
|
||||
|
||||
def test_restrict_invalid_scope(self):
|
||||
tf.Variable(tf.zeros((3, 2)), trainable=True)
|
||||
with tf.variable_scope('foo'):
|
||||
tf.Variable(tf.zeros((5, 2)), trainable=True)
|
||||
with tf.variable_scope('bar'):
|
||||
tf.Variable(tf.zeros((1, 2)), trainable=True)
|
||||
self.assertEqual(0, count_weights('bar'))
|
||||
|
||||
def test_exclude_by_regex(self):
|
||||
tf.Variable(tf.zeros((3, 2)), trainable=True)
|
||||
with tf.variable_scope('foo'):
|
||||
tf.Variable(tf.zeros((5, 2)), trainable=True)
|
||||
with tf.variable_scope('bar'):
|
||||
tf.Variable(tf.zeros((1, 2)), trainable=True)
|
||||
self.assertEqual(0, count_weights(exclude=r'.*'))
|
||||
self.assertEqual(6, count_weights(exclude=r'(^|/)foo/.*'))
|
||||
self.assertEqual(16, count_weights(exclude=r'.*/bar/.*'))
|
||||
|
||||
def test_non_default_graph(self):
|
||||
graph = tf.Graph()
|
||||
with graph.as_default():
|
||||
tf.Variable(tf.zeros((5, 3)), trainable=True)
|
||||
tf.Variable(tf.zeros((8, 2)), trainable=False)
|
||||
self.assertNotEqual(graph, tf.get_default_graph())
|
||||
self.assertEqual(15, count_weights(graph=graph))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
tf.test.main()
|
@ -0,0 +1,178 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Batch of environments inside the TensorFlow graph."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import gym
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class InGraphBatchEnv(object):
|
||||
"""Batch of environments inside the TensorFlow graph.
|
||||
|
||||
The batch of environments will be stepped and reset inside of the graph using
|
||||
a tf.py_func(). The current batch of observations, actions, rewards, and done
|
||||
flags are held in corresponding variables.
|
||||
"""
|
||||
|
||||
def __init__(self, batch_env):
|
||||
"""Batch of environments inside the TensorFlow graph.
|
||||
|
||||
Args:
|
||||
batch_env: Batch environment.
|
||||
"""
|
||||
self._batch_env = batch_env
|
||||
observ_shape = self._parse_shape(self._batch_env.observation_space)
|
||||
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
|
||||
action_shape = self._parse_shape(self._batch_env.action_space)
|
||||
action_dtype = self._parse_dtype(self._batch_env.action_space)
|
||||
with tf.variable_scope('env_temporary'):
|
||||
self._observ = tf.Variable(
|
||||
tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype),
|
||||
name='observ', trainable=False)
|
||||
self._action = tf.Variable(
|
||||
tf.zeros((len(self._batch_env),) + action_shape, action_dtype),
|
||||
name='action', trainable=False)
|
||||
self._reward = tf.Variable(
|
||||
tf.zeros((len(self._batch_env),), tf.float32),
|
||||
name='reward', trainable=False)
|
||||
self._done = tf.Variable(
|
||||
tf.cast(tf.ones((len(self._batch_env),)), tf.bool),
|
||||
name='done', trainable=False)
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""Forward unimplemented attributes to one of the original environments.
|
||||
|
||||
Args:
|
||||
name: Attribute that was accessed.
|
||||
|
||||
Returns:
|
||||
Value behind the attribute name in one of the original environments.
|
||||
"""
|
||||
return getattr(self._batch_env, name)
|
||||
|
||||
def __len__(self):
|
||||
"""Number of combined environments."""
|
||||
return len(self._batch_env)
|
||||
|
||||
def __getitem__(self, index):
|
||||
"""Access an underlying environment by index."""
|
||||
return self._batch_env[index]
|
||||
|
||||
def simulate(self, action):
|
||||
"""Step the batch of environments.
|
||||
|
||||
The results of the step can be accessed from the variables defined below.
|
||||
|
||||
Args:
|
||||
action: Tensor holding the batch of actions to apply.
|
||||
|
||||
Returns:
|
||||
Operation.
|
||||
"""
|
||||
with tf.name_scope('environment/simulate'):
|
||||
if action.dtype in (tf.float16, tf.float32, tf.float64):
|
||||
action = tf.check_numerics(action, 'action')
|
||||
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
|
||||
observ, reward, done = tf.py_func(
|
||||
lambda a: self._batch_env.step(a)[:3], [action],
|
||||
[observ_dtype, tf.float32, tf.bool], name='step')
|
||||
observ = tf.check_numerics(observ, 'observ')
|
||||
reward = tf.check_numerics(reward, 'reward')
|
||||
return tf.group(
|
||||
self._observ.assign(observ),
|
||||
self._action.assign(action),
|
||||
self._reward.assign(reward),
|
||||
self._done.assign(done))
|
||||
|
||||
def reset(self, indices=None):
|
||||
"""Reset the batch of environments.
|
||||
|
||||
Args:
|
||||
indices: The batch indices of the environments to reset; defaults to all.
|
||||
|
||||
Returns:
|
||||
Batch tensor of the new observations.
|
||||
"""
|
||||
if indices is None:
|
||||
indices = tf.range(len(self._batch_env))
|
||||
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
|
||||
observ = tf.py_func(
|
||||
self._batch_env.reset, [indices], observ_dtype, name='reset')
|
||||
observ = tf.check_numerics(observ, 'observ')
|
||||
reward = tf.zeros_like(indices, tf.float32)
|
||||
done = tf.zeros_like(indices, tf.bool)
|
||||
with tf.control_dependencies([
|
||||
tf.scatter_update(self._observ, indices, observ),
|
||||
tf.scatter_update(self._reward, indices, reward),
|
||||
tf.scatter_update(self._done, indices, done)]):
|
||||
return tf.identity(observ)
|
||||
|
||||
@property
|
||||
def observ(self):
|
||||
"""Access the variable holding the current observation."""
|
||||
return self._observ
|
||||
|
||||
@property
|
||||
def action(self):
|
||||
"""Access the variable holding the last recieved action."""
|
||||
return self._action
|
||||
|
||||
@property
|
||||
def reward(self):
|
||||
"""Access the variable holding the current reward."""
|
||||
return self._reward
|
||||
|
||||
@property
|
||||
def done(self):
|
||||
"""Access the variable indicating whether the episode is done."""
|
||||
return self._done
|
||||
|
||||
def close(self):
|
||||
"""Send close messages to the external process and join them."""
|
||||
self._batch_env.close()
|
||||
|
||||
def _parse_shape(self, space):
|
||||
"""Get a tensor shape from a OpenAI Gym space.
|
||||
|
||||
Args:
|
||||
space: Gym space.
|
||||
|
||||
Returns:
|
||||
Shape tuple.
|
||||
"""
|
||||
if isinstance(space, gym.spaces.Discrete):
|
||||
return ()
|
||||
if isinstance(space, gym.spaces.Box):
|
||||
return space.shape
|
||||
raise NotImplementedError()
|
||||
|
||||
def _parse_dtype(self, space):
|
||||
"""Get a tensor dtype from a OpenAI Gym space.
|
||||
|
||||
Args:
|
||||
space: Gym space.
|
||||
|
||||
Returns:
|
||||
TensorFlow data type.
|
||||
"""
|
||||
if isinstance(space, gym.spaces.Discrete):
|
||||
return tf.int32
|
||||
if isinstance(space, gym.spaces.Box):
|
||||
return tf.float32
|
||||
raise NotImplementedError()
|
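# A minimal sketch of stepping an in-graph batch of environments inside a
# TF 1.x session, assuming gym provides Pendulum-v0. The ConvertTo32Bit wrapper
# (applied the same way in train.py) keeps the numpy outputs compatible with
# the float32/bool dtypes declared for tf.py_func above.
import gym
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools

envs = [tools.wrappers.ConvertTo32Bit(gym.make('Pendulum-v0')) for _ in range(2)]
in_graph = tools.InGraphBatchEnv(tools.BatchEnv(envs, blocking=True))
action = tf.random_uniform((len(in_graph), 1), -1.0, 1.0)  # Pendulum: 1-D action
step_op = in_graph.simulate(action)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(in_graph.reset())
  sess.run(step_op)
  print(sess.run([in_graph.observ, in_graph.reward, in_graph.done]))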
@ -0,0 +1,162 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Put an OpenAI Gym environment into the TensorFlow graph."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import gym
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class InGraphEnv(object):
|
||||
"""Put an OpenAI Gym environment into the TensorFlow graph.
|
||||
|
||||
The environment will be stepped and reset inside of the graph using
|
||||
tf.py_func(). The current observation, action, reward, and done flag are held
|
||||
in corresponding variables.
|
||||
"""
|
||||
|
||||
def __init__(self, env):
|
||||
"""Put an OpenAI Gym environment into the TensorFlow graph.
|
||||
|
||||
Args:
|
||||
env: OpenAI Gym environment.
|
||||
"""
|
||||
self._env = env
|
||||
observ_shape = self._parse_shape(self._env.observation_space)
|
||||
observ_dtype = self._parse_dtype(self._env.observation_space)
|
||||
action_shape = self._parse_shape(self._env.action_space)
|
||||
action_dtype = self._parse_dtype(self._env.action_space)
|
||||
with tf.name_scope('environment'):
|
||||
self._observ = tf.Variable(
|
||||
tf.zeros(observ_shape, observ_dtype), name='observ', trainable=False)
|
||||
self._action = tf.Variable(
|
||||
tf.zeros(action_shape, action_dtype), name='action', trainable=False)
|
||||
self._reward = tf.Variable(
|
||||
0.0, dtype=tf.float32, name='reward', trainable=False)
|
||||
self._done = tf.Variable(
|
||||
True, dtype=tf.bool, name='done', trainable=False)
|
||||
self._step = tf.Variable(
|
||||
0, dtype=tf.int32, name='step', trainable=False)
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""Forward unimplemented attributes to the original environment.
|
||||
|
||||
Args:
|
||||
name: Attribute that was accessed.
|
||||
|
||||
Returns:
|
||||
Value behind the attribute name in the wrapped environment.
|
||||
"""
|
||||
return getattr(self._env, name)
|
||||
|
||||
def simulate(self, action):
|
||||
"""Step the environment.
|
||||
|
||||
The result of the step can be accessed from the variables defined below.
|
||||
|
||||
Args:
|
||||
action: Tensor holding the action to apply.
|
||||
|
||||
Returns:
|
||||
Operation.
|
||||
"""
|
||||
with tf.name_scope('environment/simulate'):
|
||||
if action.dtype in (tf.float16, tf.float32, tf.float64):
|
||||
action = tf.check_numerics(action, 'action')
|
||||
observ_dtype = self._parse_dtype(self._env.observation_space)
|
||||
observ, reward, done = tf.py_func(
|
||||
lambda a: self._env.step(a)[:3], [action],
|
||||
[observ_dtype, tf.float32, tf.bool], name='step')
|
||||
observ = tf.check_numerics(observ, 'observ')
|
||||
reward = tf.check_numerics(reward, 'reward')
|
||||
return tf.group(
|
||||
self._observ.assign(observ),
|
||||
self._action.assign(action),
|
||||
self._reward.assign(reward),
|
||||
self._done.assign(done),
|
||||
self._step.assign_add(1))
|
||||
|
||||
def reset(self):
|
||||
"""Reset the environment.
|
||||
|
||||
Returns:
|
||||
Tensor of the current observation.
|
||||
"""
|
||||
observ_dtype = self._parse_dtype(self._env.observation_space)
|
||||
observ = tf.py_func(self._env.reset, [], observ_dtype, name='reset')
|
||||
observ = tf.check_numerics(observ, 'observ')
|
||||
with tf.control_dependencies([
|
||||
self._observ.assign(observ),
|
||||
self._reward.assign(0),
|
||||
self._done.assign(False)]):
|
||||
return tf.identity(observ)
|
||||
|
||||
@property
|
||||
def observ(self):
|
||||
"""Access the variable holding the current observation."""
|
||||
return self._observ
|
||||
|
||||
@property
|
||||
def action(self):
|
||||
"""Access the variable holding the last recieved action."""
|
||||
return self._action
|
||||
|
||||
@property
|
||||
def reward(self):
|
||||
"""Access the variable holding the current reward."""
|
||||
return self._reward
|
||||
|
||||
@property
|
||||
def done(self):
|
||||
"""Access the variable indicating whether the episode is done."""
|
||||
return self._done
|
||||
|
||||
@property
|
||||
def step(self):
|
||||
"""Access the variable containg total steps of this environment."""
|
||||
return self._step
|
||||
|
||||
def _parse_shape(self, space):
|
||||
"""Get a tensor shape from a OpenAI Gym space.
|
||||
|
||||
Args:
|
||||
space: Gym space.
|
||||
|
||||
Returns:
|
||||
Shape tuple.
|
||||
"""
|
||||
if isinstance(space, gym.spaces.Discrete):
|
||||
return ()
|
||||
if isinstance(space, gym.spaces.Box):
|
||||
return space.shape
|
||||
raise NotImplementedError()
|
||||
|
||||
def _parse_dtype(self, space):
|
||||
"""Get a tensor dtype from a OpenAI Gym space.
|
||||
|
||||
Args:
|
||||
space: Gym space.
|
||||
|
||||
Returns:
|
||||
TensorFlow data type.
|
||||
"""
|
||||
if isinstance(space, gym.spaces.Discrete):
|
||||
return tf.int32
|
||||
if isinstance(space, gym.spaces.Box):
|
||||
return tf.float32
|
||||
raise NotImplementedError()
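# --- Hypothetical usage sketch (not part of this commit). ---
# Drives InGraphEnv from a session. The Gym environment id and the step count
# are illustrative placeholders, and the wrappers import path assumes the
# module layout used elsewhere in this change
# (pybullet_envs.minitaur.agents.tools.wrappers).
import gym
import tensorflow as tf

from pybullet_envs.minitaur.agents.tools import wrappers

env = InGraphEnv(wrappers.ConvertTo32Bit(gym.make('Pendulum-v0')))
action = tf.random_uniform(env.action_space.shape, -1.0, 1.0)
step_op = env.simulate(action)
reset_op = env.reset()
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(reset_op)
  for _ in range(100):
    sess.run(step_op)
  print(sess.run([env.reward, env.step]))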
|
@ -0,0 +1,233 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Execute operations in a loop and coordinate logging and checkpoints."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
import os
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from pybullet_envs.minitaur.agents.tools import streaming_mean
|
||||
|
||||
|
||||
_Phase = collections.namedtuple(
|
||||
'Phase',
|
||||
'name, writer, op, batch, steps, feed, report_every, log_every,'
|
||||
'checkpoint_every')
|
||||
|
||||
|
||||
class Loop(object):
|
||||
"""Execute operations in a loop and coordinate logging and checkpoints.
|
||||
|
||||
Supports multiple phases that define their own operations to run, and
|
||||
intervals for reporting scores, logging summaries, and storing checkpoints.
|
||||
All class state is stored in-graph to properly recover from checkpoints.
|
||||
"""
|
||||
|
||||
def __init__(self, logdir, step=None, log=None, report=None, reset=None):
|
||||
"""Execute operations in a loop and coordinate logging and checkpoints.
|
||||
|
||||
The step, log, report, and reset arguments will be created if not
|
||||
provided. Reset is used to indicate switching to a new phase, so that the
|
||||
model can start a new computation in case its computation is split over
|
||||
multiple training steps.
|
||||
|
||||
Args:
|
||||
logdir: Will contain checkpoints and summaries for each phase.
|
||||
step: Variable of the global step (optional).
|
||||
log: Tensor indicating to the model to compute summary tensors.
|
||||
report: Tensor indicating to the loop to report the current mean score.
|
||||
reset: Tensor indicating to the model to start a new computation.
|
||||
"""
|
||||
self._logdir = logdir
|
||||
self._step = (
|
||||
tf.Variable(0, False, name='global_step') if step is None else step)
|
||||
self._log = tf.placeholder(tf.bool) if log is None else log
|
||||
self._report = tf.placeholder(tf.bool) if report is None else report
|
||||
self._reset = tf.placeholder(tf.bool) if reset is None else reset
|
||||
self._phases = []
|
||||
|
||||
def add_phase(
|
||||
self, name, done, score, summary, steps,
|
||||
report_every=None, log_every=None, checkpoint_every=None, feed=None):
|
||||
"""Add a phase to the loop protocol.
|
||||
|
||||
If the model breaks long computation into multiple steps, the done tensor
|
||||
indicates whether the current score should be added to the mean counter.
|
||||
For example, in reinforcement learning we only have a valid score at the
|
||||
end of the episode.
|
||||
|
||||
Score and done tensors can either be scalars or vectors, to support
|
||||
single and batched computations.
|
||||
|
||||
Args:
|
||||
name: Name for the phase, used for the summary writer.
|
||||
done: Tensor indicating whether current score can be used.
|
||||
score: Tensor holding the current, possibly intermediate, score.
|
||||
summary: Tensor holding summary string to write if not an empty string.
|
||||
steps: Duration of the phase in steps.
|
||||
report_every: Yield mean score every this number of steps.
|
||||
log_every: Request summaries via `log` tensor every this number of steps.
|
||||
checkpoint_every: Write checkpoint every this number of steps.
|
||||
feed: Additional feed dictionary for the session run call.
|
||||
|
||||
Raises:
|
||||
ValueError: Unknown rank for done or score tensors.
|
||||
"""
|
||||
done = tf.convert_to_tensor(done, tf.bool)
|
||||
score = tf.convert_to_tensor(score, tf.float32)
|
||||
summary = tf.convert_to_tensor(summary, tf.string)
|
||||
feed = feed or {}
|
||||
if done.shape.ndims is None or score.shape.ndims is None:
|
||||
raise ValueError("Rank of 'done' and 'score' tensors must be known.")
|
||||
writer = self._logdir and tf.summary.FileWriter(
|
||||
os.path.join(self._logdir, name), tf.get_default_graph(),
|
||||
flush_secs=60)
|
||||
op = self._define_step(done, score, summary)
|
||||
batch = 1 if score.shape.ndims == 0 else score.shape[0].value
|
||||
self._phases.append(_Phase(
|
||||
name, writer, op, batch, int(steps), feed, report_every,
|
||||
log_every, checkpoint_every))
|
||||
|
||||
def run(self, sess, saver, max_step=None):
|
||||
"""Run the loop schedule for a specified number of steps.
|
||||
|
||||
Call the operation of the current phase until the global step reaches the
|
||||
specified maximum step. Phases are repeated over and over in the order they
|
||||
were added.
|
||||
|
||||
Args:
|
||||
sess: Session to use to run the phase operation.
|
||||
saver: Saver used for checkpointing.
|
||||
max_step: Run the operations until the step reaches this limit.
|
||||
|
||||
Yields:
|
||||
Reported mean scores.
|
||||
"""
|
||||
global_step = sess.run(self._step)
|
||||
steps_made = 1
|
||||
while True:
|
||||
if max_step and global_step >= max_step:
|
||||
break
|
||||
phase, epoch, steps_in = self._find_current_phase(global_step)
|
||||
phase_step = epoch * phase.steps + steps_in
|
||||
if steps_in % phase.steps < steps_made:
|
||||
message = '\n' + ('-' * 50) + '\n'
|
||||
message += 'Phase {} (phase step {}, global step {}).'
|
||||
tf.logging.info(message.format(phase.name, phase_step, global_step))
|
||||
# Populate bookkeeping tensors.
|
||||
phase.feed[self._reset] = (steps_in < steps_made)
|
||||
phase.feed[self._log] = (
|
||||
phase.writer and
|
||||
self._is_every_steps(phase_step, phase.batch, phase.log_every))
|
||||
phase.feed[self._report] = (
|
||||
self._is_every_steps(phase_step, phase.batch, phase.report_every))
|
||||
summary, mean_score, global_step, steps_made = sess.run(
|
||||
phase.op, phase.feed)
|
||||
if self._is_every_steps(phase_step, phase.batch, phase.checkpoint_every):
|
||||
self._store_checkpoint(sess, saver, global_step)
|
||||
if self._is_every_steps(phase_step, phase.batch, phase.report_every):
|
||||
yield mean_score
|
||||
if summary and phase.writer:
|
||||
# We want smaller phases to catch up at the beginning of each epoch so
|
||||
# that their graphs are aligned.
|
||||
longest_phase = max(phase.steps for phase in self._phases)
|
||||
summary_step = epoch * longest_phase + steps_in
|
||||
phase.writer.add_summary(summary, summary_step)
|
||||
|
||||
def _is_every_steps(self, phase_step, batch, every):
|
||||
"""Determine whether a periodic event should happen at this step.
|
||||
|
||||
Args:
|
||||
phase_step: The incrementing step.
|
||||
batch: The number of steps progressed at once.
|
||||
every: The interval of the period.
|
||||
|
||||
Returns:
|
||||
Boolean of whether the event should happen.
|
||||
"""
|
||||
if not every:
|
||||
return False
|
||||
covered_steps = range(phase_step, phase_step + batch)
|
||||
return any((step + 1) % every == 0 for step in covered_steps)
|
||||
|
||||
def _find_current_phase(self, global_step):
|
||||
"""Determine the current phase based on the global step.
|
||||
|
||||
This ensures continuing the correct phase after restoring checkpoints.
|
||||
|
||||
Args:
|
||||
global_step: The global number of steps performed across all phases.
|
||||
|
||||
Returns:
|
||||
Tuple of phase object, epoch number, and phase steps within the epoch.
|
||||
"""
|
||||
epoch_size = sum(phase.steps for phase in self._phases)
|
||||
epoch = int(global_step // epoch_size)
|
||||
steps_in = global_step % epoch_size
|
||||
for phase in self._phases:
|
||||
if steps_in < phase.steps:
|
||||
return phase, epoch, steps_in
|
||||
steps_in -= phase.steps
|
||||
|
||||
def _define_step(self, done, score, summary):
|
||||
"""Combine operations of a phase.
|
||||
|
||||
Keeps track of the mean score and when to report it.
|
||||
|
||||
Args:
|
||||
done: Tensor indicating whether current score can be used.
|
||||
score: Tensor holding the current, possibly intermediate, score.
|
||||
summary: Tensor holding summary string to write if not an empty string.
|
||||
|
||||
Returns:
|
||||
Tuple of summary tensor, mean score, and new global step. The mean score
|
||||
is zero for non-reporting steps.
|
||||
"""
|
||||
if done.shape.ndims == 0:
|
||||
done = done[None]
|
||||
if score.shape.ndims == 0:
|
||||
score = score[None]
|
||||
score_mean = streaming_mean.StreamingMean((), tf.float32)
|
||||
with tf.control_dependencies([done, score, summary]):
|
||||
done_score = tf.gather(score, tf.where(done)[:, 0])
|
||||
submit_score = tf.cond(
|
||||
tf.reduce_any(done), lambda: score_mean.submit(done_score), tf.no_op)
|
||||
with tf.control_dependencies([submit_score]):
|
||||
mean_score = tf.cond(self._report, score_mean.clear, float)
|
||||
steps_made = tf.shape(score)[0]
|
||||
next_step = self._step.assign_add(steps_made)
|
||||
with tf.control_dependencies([mean_score, next_step]):
|
||||
return tf.identity(summary), mean_score, next_step, steps_made
|
||||
|
||||
def _store_checkpoint(self, sess, saver, global_step):
|
||||
"""Store a checkpoint if a log directory was provided to the constructor.
|
||||
|
||||
The directory will be created if needed.
|
||||
|
||||
Args:
|
||||
sess: Session containing variables to store.
|
||||
saver: Saver used for checkpointing.
|
||||
global_step: Step number of the checkpoint name.
|
||||
"""
|
||||
if not self._logdir or not saver:
|
||||
return
|
||||
tf.gfile.MakeDirs(self._logdir)
|
||||
filename = os.path.join(self._logdir, 'model.ckpt')
|
||||
saver.save(sess, filename, global_step)
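# --- Hypothetical usage sketch (not part of this commit). ---
# A minimal single-phase loop, along the lines of the tests that follow; the
# constant done/score/summary tensors stand in for real training operations.
import tensorflow as tf

loop = Loop(logdir=None)
loop.add_phase(
    'train', done=tf.constant(True), score=tf.constant(1.0),
    summary=tf.constant(''), steps=100, report_every=50)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for mean_score in loop.run(sess, saver=None, max_step=200):
    print(mean_score)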
|
@ -0,0 +1,111 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Tests for the training loop."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from pybullet_envs.minitaur.agents import tools
|
||||
|
||||
|
||||
class LoopTest(tf.test.TestCase):
|
||||
|
||||
def test_report_every_step(self):
|
||||
step = tf.Variable(0, False, dtype=tf.int32, name='step')
|
||||
loop = tools.Loop(None, step)
|
||||
loop.add_phase(
|
||||
'phase_1', done=True, score=0, summary='', steps=1, report_every=3)
|
||||
# Step: 0 1 2 3 4 5 6 7 8
|
||||
# Report: x x x
|
||||
with self.test_session() as sess:
|
||||
sess.run(tf.global_variables_initializer())
|
||||
scores = loop.run(sess, saver=None, max_step=9)
|
||||
next(scores)
|
||||
self.assertEqual(3, sess.run(step))
|
||||
next(scores)
|
||||
self.assertEqual(6, sess.run(step))
|
||||
next(scores)
|
||||
self.assertEqual(9, sess.run(step))
|
||||
|
||||
def test_phases_feed(self):
|
||||
score = tf.placeholder(tf.float32, [])
|
||||
loop = tools.Loop(None)
|
||||
loop.add_phase(
|
||||
'phase_1', done=True, score=score, summary='', steps=1, report_every=1,
|
||||
log_every=None, checkpoint_every=None, feed={score: 1})
|
||||
loop.add_phase(
|
||||
'phase_2', done=True, score=score, summary='', steps=3, report_every=1,
|
||||
log_every=None, checkpoint_every=None, feed={score: 2})
|
||||
loop.add_phase(
|
||||
'phase_3', done=True, score=score, summary='', steps=2, report_every=1,
|
||||
log_every=None, checkpoint_every=None, feed={score: 3})
|
||||
with self.test_session() as sess:
|
||||
sess.run(tf.global_variables_initializer())
|
||||
scores = list(loop.run(sess, saver=None, max_step=15))
|
||||
self.assertAllEqual([1, 2, 2, 2, 3, 3, 1, 2, 2, 2, 3, 3, 1, 2, 2], scores)
|
||||
|
||||
def test_average_score_over_phases(self):
|
||||
loop = tools.Loop(None)
|
||||
loop.add_phase(
|
||||
'phase_1', done=True, score=1, summary='', steps=1, report_every=2)
|
||||
loop.add_phase(
|
||||
'phase_2', done=True, score=2, summary='', steps=2, report_every=5)
|
||||
# Score: 1 2 2 1 2 2 1 2 2 1 2 2 1 2 2 1 2
|
||||
# Report 1: x x x
|
||||
# Report 2: x x
|
||||
with self.test_session() as sess:
|
||||
sess.run(tf.global_variables_initializer())
|
||||
scores = list(loop.run(sess, saver=None, max_step=17))
|
||||
self.assertAllEqual([1, 2, 1, 2, 1], scores)
|
||||
|
||||
def test_not_done(self):
|
||||
step = tf.Variable(0, False, dtype=tf.int32, name='step')
|
||||
done = tf.equal((step + 1) % 2, 0)
|
||||
score = tf.cast(step, tf.float32)
|
||||
loop = tools.Loop(None, step)
|
||||
loop.add_phase(
|
||||
'phase_1', done, score, summary='', steps=1, report_every=3)
|
||||
# Score: 0 1 2 3 4 5 6 7 8
|
||||
# Done: x x x x
|
||||
# Report: x x x
|
||||
with self.test_session() as sess:
|
||||
sess.run(tf.global_variables_initializer())
|
||||
scores = list(loop.run(sess, saver=None, max_step=9))
|
||||
self.assertAllEqual([1, 4, 7], scores)
|
||||
|
||||
def test_not_done_batch(self):
|
||||
step = tf.Variable(0, False, dtype=tf.int32, name='step')
|
||||
done = tf.equal([step % 3, step % 4], 0)
|
||||
score = tf.cast([step, step ** 2], tf.float32)
|
||||
loop = tools.Loop(None, step)
|
||||
loop.add_phase(
|
||||
'phase_1', done, score, summary='', steps=1, report_every=8)
|
||||
# Step: 0 2 4 6
|
||||
# Score 1: 0 2 4 6
|
||||
# Done 1: x x
|
||||
# Score 2: 0 4 16 32
|
||||
# Done 2: x x
|
||||
with self.test_session() as sess:
|
||||
sess.run(tf.global_variables_initializer())
|
||||
scores = list(loop.run(sess, saver=None, max_step=8))
|
||||
self.assertEqual(8, sess.run(step))
|
||||
self.assertAllEqual([(0 + 0 + 16 + 6) / 4], scores)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
tf.test.main()
|
@ -0,0 +1,49 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Mock algorithm for testing reinforcement learning code."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class MockAlgorithm(object):
|
||||
"""Produce random actions and empty summaries."""
|
||||
|
||||
def __init__(self, envs):
|
||||
"""Produce random actions and empty summaries.
|
||||
|
||||
Args:
|
||||
envs: List of in-graph environments.
|
||||
"""
|
||||
self._envs = envs
|
||||
|
||||
def begin_episode(self, unused_agent_indices):
|
||||
return tf.constant('')
|
||||
|
||||
def perform(self, unused_observ):
|
||||
shape = (len(self._envs),) + self._envs[0].action_space.shape
|
||||
low = self._envs[0].action_space.low
|
||||
high = self._envs[0].action_space.high
|
||||
action = tf.random_uniform(shape) * (high - low) + low
|
||||
return action, tf.constant('')
|
||||
|
||||
def experience(self, *unused_transition):
|
||||
return tf.constant('')
|
||||
|
||||
def end_episode(self, unused_agent_indices):
|
||||
return tf.constant('')
|
@ -0,0 +1,86 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Mock environment for testing reinforcement learning code."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import gym
|
||||
import gym.spaces
|
||||
import numpy as np
|
||||
|
||||
|
||||
class MockEnvironment(object):
|
||||
"""Generate random agent input and keep track of statistics."""
|
||||
|
||||
def __init__(self, observ_shape, action_shape, min_duration, max_duration):
|
||||
"""Generate random agent input and keep track of statistics.
|
||||
|
||||
Args:
|
||||
observ_shape: Shape for the random observations.
|
||||
action_shape: Shape for the action space.
|
||||
min_duration: Minimum number of steps per episode.
|
||||
max_duration: Maximum number of steps per episode.
|
||||
|
||||
Attributes:
|
||||
steps: List of actual simulated lengths for all episodes.
|
||||
durations: List of decided lengths for all episodes.
|
||||
"""
|
||||
self._observ_shape = observ_shape
|
||||
self._action_shape = action_shape
|
||||
self._min_duration = min_duration
|
||||
self._max_duration = max_duration
|
||||
self._random = np.random.RandomState(0)
|
||||
self.steps = []
|
||||
self.durations = []
|
||||
|
||||
@property
|
||||
def observation_space(self):
|
||||
low = np.zeros(self._observ_shape)
|
||||
high = np.ones(self._observ_shape)
|
||||
return gym.spaces.Box(low, high)
|
||||
|
||||
@property
|
||||
def action_space(self):
|
||||
low = np.zeros(self._action_shape)
|
||||
high = np.ones(self._action_shape)
|
||||
return gym.spaces.Box(low, high)
|
||||
|
||||
@property
|
||||
def unwrapped(self):
|
||||
return self
|
||||
|
||||
def step(self, action):
|
||||
assert self.action_space.contains(action)
|
||||
assert self.steps[-1] < self.durations[-1]
|
||||
self.steps[-1] += 1
|
||||
observ = self._current_observation()
|
||||
reward = self._current_reward()
|
||||
done = self.steps[-1] >= self.durations[-1]
|
||||
info = {}
|
||||
return observ, reward, done, info
|
||||
|
||||
def reset(self):
|
||||
duration = self._random.randint(self._min_duration, self._max_duration + 1)
|
||||
self.steps.append(0)
|
||||
self.durations.append(duration)
|
||||
return self._current_observation()
|
||||
|
||||
def _current_observation(self):
|
||||
return self._random.uniform(0, 1, self._observ_shape)
|
||||
|
||||
def _current_reward(self):
|
||||
return self._random.uniform(-1, 1)
|
@ -0,0 +1,145 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""In-graph simulation step of a vecrotized algorithm with environments."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from pybullet_envs.minitaur.agents.tools import streaming_mean
|
||||
|
||||
|
||||
def simulate(batch_env, algo, log=True, reset=False):
|
||||
"""Simulation step of a vecrotized algorithm with in-graph environments.
|
||||
|
||||
Integrates the operations implemented by the algorithm and the environments
|
||||
into a combined operation.
|
||||
|
||||
Args:
|
||||
batch_env: In-graph batch environment.
|
||||
algo: Algorithm instance implementing required operations.
|
||||
log: Tensor indicating whether to compute and return summaries.
|
||||
reset: Tensor causing all environments to reset.
|
||||
|
||||
Returns:
|
||||
Tuple of tensors containing done flags for the current episodes, possibly
|
||||
intermediate scores for the episodes, and a summary tensor.
|
||||
"""
|
||||
|
||||
def _define_begin_episode(agent_indices):
|
||||
"""Reset environments, intermediate scores and durations for new episodes.
|
||||
|
||||
Args:
|
||||
agent_indices: Tensor containing batch indices starting an episode.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
assert agent_indices.shape.ndims == 1
|
||||
zero_scores = tf.zeros_like(agent_indices, tf.float32)
|
||||
zero_durations = tf.zeros_like(agent_indices)
|
||||
reset_ops = [
|
||||
batch_env.reset(agent_indices),
|
||||
tf.scatter_update(score, agent_indices, zero_scores),
|
||||
tf.scatter_update(length, agent_indices, zero_durations)]
|
||||
with tf.control_dependencies(reset_ops):
|
||||
return algo.begin_episode(agent_indices)
|
||||
|
||||
def _define_step():
|
||||
"""Request actions from the algorithm and apply them to the environments.
|
||||
|
||||
Increments the lengths of all episodes and increases their scores by the
|
||||
current reward. After stepping the environments, provides the full
|
||||
transition tuple to the algorithm.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
prevob = batch_env.observ + 0 # Ensure a copy of the variable value.
|
||||
action, step_summary = algo.perform(prevob)
|
||||
action.set_shape(batch_env.action.shape)
|
||||
with tf.control_dependencies([batch_env.simulate(action)]):
|
||||
add_score = score.assign_add(batch_env.reward)
|
||||
inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
|
||||
with tf.control_dependencies([add_score, inc_length]):
|
||||
experience_summary = algo.experience(
|
||||
prevob, batch_env.action, batch_env.reward, batch_env.done,
|
||||
batch_env.observ)
|
||||
return tf.summary.merge([step_summary, experience_summary])
|
||||
|
||||
def _define_end_episode(agent_indices):
|
||||
"""Notify the algorithm of ending episodes.
|
||||
|
||||
Also updates the mean score and length counters used for summaries.
|
||||
|
||||
Args:
|
||||
agent_indices: Tensor holding batch indices that end their episodes.
|
||||
|
||||
Returns:
|
||||
Summary tensor.
|
||||
"""
|
||||
assert agent_indices.shape.ndims == 1
|
||||
submit_score = mean_score.submit(tf.gather(score, agent_indices))
|
||||
submit_length = mean_length.submit(
|
||||
tf.cast(tf.gather(length, agent_indices), tf.float32))
|
||||
with tf.control_dependencies([submit_score, submit_length]):
|
||||
return algo.end_episode(agent_indices)
|
||||
|
||||
def _define_summaries():
|
||||
"""Reset the average score and duration, and return them as summary.
|
||||
|
||||
Returns:
|
||||
Summary string.
|
||||
"""
|
||||
score_summary = tf.cond(
|
||||
tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
|
||||
lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
|
||||
length_summary = tf.cond(
|
||||
tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
|
||||
lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
|
||||
return tf.summary.merge([score_summary, length_summary])
|
||||
|
||||
with tf.name_scope('simulate'):
|
||||
log = tf.convert_to_tensor(log)
|
||||
reset = tf.convert_to_tensor(reset)
|
||||
with tf.variable_scope('simulate_temporary'):
|
||||
score = tf.Variable(
|
||||
tf.zeros(len(batch_env), dtype=tf.float32), False, name='score')
|
||||
length = tf.Variable(
|
||||
tf.zeros(len(batch_env), dtype=tf.int32), False, name='length')
|
||||
mean_score = streaming_mean.StreamingMean((), tf.float32)
|
||||
mean_length = streaming_mean.StreamingMean((), tf.float32)
|
||||
agent_indices = tf.cond(
|
||||
reset,
|
||||
lambda: tf.range(len(batch_env)),
|
||||
lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
|
||||
begin_episode = tf.cond(
|
||||
tf.cast(tf.shape(agent_indices)[0], tf.bool),
|
||||
lambda: _define_begin_episode(agent_indices), str)
|
||||
with tf.control_dependencies([begin_episode]):
|
||||
step = _define_step()
|
||||
with tf.control_dependencies([step]):
|
||||
agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
|
||||
end_episode = tf.cond(
|
||||
tf.cast(tf.shape(agent_indices)[0], tf.bool),
|
||||
lambda: _define_end_episode(agent_indices), str)
|
||||
with tf.control_dependencies([end_episode]):
|
||||
summary = tf.summary.merge([
|
||||
_define_summaries(), begin_episode, step, end_episode])
|
||||
with tf.control_dependencies([summary]):
|
||||
done, score = tf.identity(batch_env.done), tf.identity(score)
|
||||
return done, score, summary
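# --- Hypothetical usage sketch (not part of this commit). ---
# Builds the simulate() graph on top of a batch of mock environments, mirroring
# the tests below. BatchEnv, InGraphBatchEnv, MockEnvironment, MockAlgorithm
# and the wrappers module are assumed to come from the tools package added in
# this change.
import tensorflow as tf

envs = [wrappers.ConvertTo32Bit(
            MockEnvironment(observ_shape=(2, 3), action_shape=(3,),
                            min_duration=5, max_duration=5))
        for _ in range(4)]
batch_env = InGraphBatchEnv(BatchEnv(envs, blocking=True))
algo = MockAlgorithm(batch_env)
done, score, summary = simulate(batch_env, algo, log=False, reset=False)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for _ in range(10):
    print(sess.run([done, score]))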
|
@ -0,0 +1,98 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Tests for the simulation operation."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from pybullet_envs.minitaur.agents import tools
|
||||
|
||||
|
||||
class SimulateTest(tf.test.TestCase):
|
||||
|
||||
def test_done_automatic(self):
|
||||
batch_env = self._create_test_batch_env((1, 2, 3, 4))
|
||||
algo = tools.MockAlgorithm(batch_env)
|
||||
done, _, _ = tools.simulate(batch_env, algo, log=False, reset=False)
|
||||
with self.test_session() as sess:
|
||||
sess.run(tf.global_variables_initializer())
|
||||
self.assertAllEqual([True, False, False, False], sess.run(done))
|
||||
self.assertAllEqual([True, True, False, False], sess.run(done))
|
||||
self.assertAllEqual([True, False, True, False], sess.run(done))
|
||||
self.assertAllEqual([True, True, False, True], sess.run(done))
|
||||
|
||||
def test_done_forced(self):
|
||||
reset = tf.placeholder_with_default(False, ())
|
||||
batch_env = self._create_test_batch_env((2, 4))
|
||||
algo = tools.MockAlgorithm(batch_env)
|
||||
done, _, _ = tools.simulate(batch_env, algo, False, reset)
|
||||
with self.test_session() as sess:
|
||||
sess.run(tf.global_variables_initializer())
|
||||
self.assertAllEqual([False, False], sess.run(done))
|
||||
self.assertAllEqual([False, False], sess.run(done, {reset: True}))
|
||||
self.assertAllEqual([True, False], sess.run(done))
|
||||
self.assertAllEqual([False, False], sess.run(done, {reset: True}))
|
||||
self.assertAllEqual([True, False], sess.run(done))
|
||||
self.assertAllEqual([False, False], sess.run(done))
|
||||
self.assertAllEqual([True, True], sess.run(done))
|
||||
|
||||
def test_reset_automatic(self):
|
||||
batch_env = self._create_test_batch_env((1, 2, 3, 4))
|
||||
algo = tools.MockAlgorithm(batch_env)
|
||||
done, _, _ = tools.simulate(batch_env, algo, log=False, reset=False)
|
||||
with self.test_session() as sess:
|
||||
sess.run(tf.global_variables_initializer())
|
||||
for _ in range(10):
|
||||
sess.run(done)
|
||||
self.assertAllEqual([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], batch_env[0].steps)
|
||||
self.assertAllEqual([2, 2, 2, 2, 2], batch_env[1].steps)
|
||||
self.assertAllEqual([3, 3, 3, 1], batch_env[2].steps)
|
||||
self.assertAllEqual([4, 4, 2], batch_env[3].steps)
|
||||
|
||||
def test_reset_forced(self):
|
||||
reset = tf.placeholder_with_default(False, ())
|
||||
batch_env = self._create_test_batch_env((2, 4))
|
||||
algo = tools.MockAlgorithm(batch_env)
|
||||
done, _, _ = tools.simulate(batch_env, algo, False, reset)
|
||||
with self.test_session() as sess:
|
||||
sess.run(tf.global_variables_initializer())
|
||||
sess.run(done)
|
||||
sess.run(done, {reset: True})
|
||||
sess.run(done)
|
||||
sess.run(done, {reset: True})
|
||||
sess.run(done)
|
||||
sess.run(done)
|
||||
sess.run(done)
|
||||
self.assertAllEqual([1, 2, 2, 2], batch_env[0].steps)
|
||||
self.assertAllEqual([1, 2, 4], batch_env[1].steps)
|
||||
|
||||
def _create_test_batch_env(self, durations):
|
||||
envs = []
|
||||
for duration in durations:
|
||||
env = tools.MockEnvironment(
|
||||
observ_shape=(2, 3), action_shape=(3,),
|
||||
min_duration=duration, max_duration=duration)
|
||||
env = tools.wrappers.ConvertTo32Bit(env)
|
||||
envs.append(env)
|
||||
batch_env = tools.BatchEnv(envs, blocking=True)
|
||||
batch_env = tools.InGraphBatchEnv(batch_env)
|
||||
return batch_env
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
tf.test.main()
|
@ -0,0 +1,67 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Compute a streaming estimation of the mean of submitted tensors."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class StreamingMean(object):
|
||||
"""Compute a streaming estimation of the mean of submitted tensors."""
|
||||
|
||||
def __init__(self, shape, dtype):
|
||||
"""Specify the shape and dtype of the mean to be estimated.
|
||||
|
||||
Note that the float mean of zero submitted elements is NaN, while computing
|
||||
the integer mean of zero elements raises a division by zero error.
|
||||
|
||||
Args:
|
||||
shape: Shape of the mean to compute.
|
||||
dtype: Data type of the mean to compute.
|
||||
"""
|
||||
self._dtype = dtype
|
||||
self._sum = tf.Variable(lambda: tf.zeros(shape, dtype), False)
|
||||
self._count = tf.Variable(lambda: 0, trainable=False)
|
||||
|
||||
@property
|
||||
def value(self):
|
||||
"""The current value of the mean."""
|
||||
return self._sum / tf.cast(self._count, self._dtype)
|
||||
|
||||
@property
|
||||
def count(self):
|
||||
"""The number of submitted samples."""
|
||||
return self._count
|
||||
|
||||
def submit(self, value):
|
||||
"""Submit a single or batch tensor to refine the streaming mean."""
|
||||
# Add a batch dimension if necessary.
|
||||
if value.shape.ndims == self._sum.shape.ndims:
|
||||
value = value[None, ...]
|
||||
return tf.group(
|
||||
self._sum.assign_add(tf.reduce_sum(value, 0)),
|
||||
self._count.assign_add(tf.shape(value)[0]))
|
||||
|
||||
def clear(self):
|
||||
"""Return the mean estimate and reset the streaming statistics."""
|
||||
value = self._sum / tf.cast(self._count, self._dtype)
|
||||
with tf.control_dependencies([value]):
|
||||
reset_value = self._sum.assign(tf.zeros_like(self._sum))
|
||||
reset_count = self._count.assign(0)
|
||||
with tf.control_dependencies([reset_value, reset_count]):
|
||||
return tf.identity(value)
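# --- Hypothetical usage sketch (not part of this commit). ---
# Submits a batch of three values and reads the running mean back.
import tensorflow as tf

mean = StreamingMean((), tf.float32)
submit_op = mean.submit(tf.constant([1.0, 2.0, 3.0]))
read_op = mean.clear()  # returns the mean and resets the statistics
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(submit_op)
  print(sess.run(read_op))  # 2.0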
|
@ -0,0 +1,552 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Wrappers for OpenAI Gym environments."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import atexit
|
||||
import functools
|
||||
import multiprocessing
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
import gym
|
||||
import gym.spaces
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class AutoReset(object):
|
||||
"""Automatically reset environment when the episode is done."""
|
||||
|
||||
def __init__(self, env):
|
||||
self._env = env
|
||||
self._done = True
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._env, name)
|
||||
|
||||
def step(self, action):
|
||||
if self._done:
|
||||
observ, reward, done, info = self._env.reset(), 0.0, False, {}
|
||||
else:
|
||||
observ, reward, done, info = self._env.step(action)
|
||||
self._done = done
|
||||
return observ, reward, done, info
|
||||
|
||||
def reset(self):
|
||||
self._done = False
|
||||
return self._env.reset()
|
||||
|
||||
|
||||
class ActionRepeat(object):
|
||||
"""Repeat the agent action multiple steps."""
|
||||
|
||||
def __init__(self, env, amount):
|
||||
self._env = env
|
||||
self._amount = amount
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._env, name)
|
||||
|
||||
def step(self, action):
|
||||
done = False
|
||||
total_reward = 0
|
||||
current_step = 0
|
||||
while current_step < self._amount and not done:
|
||||
observ, reward, done, info = self._env.step(action)
|
||||
total_reward += reward
|
||||
current_step += 1
|
||||
return observ, total_reward, done, info
|
||||
|
||||
|
||||
class RandomStart(object):
|
||||
"""Perform random number of random actions at the start of the episode."""
|
||||
|
||||
def __init__(self, env, max_steps):
|
||||
self._env = env
|
||||
self._max_steps = max_steps
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._env, name)
|
||||
|
||||
def reset(self):
|
||||
observ = self._env.reset()
|
||||
random_steps = np.random.randint(0, self._max_steps)
|
||||
for _ in range(random_steps):
|
||||
action = self._env.action_space.sample()
|
||||
observ, unused_reward, done, unused_info = self._env.step(action)
|
||||
if done:
|
||||
tf.logging.warning('Episode ended during random start.')
|
||||
return self.reset()
|
||||
return observ
|
||||
|
||||
|
||||
class FrameHistory(object):
|
||||
"""Augment the observation with past observations."""
|
||||
|
||||
def __init__(self, env, past_indices, flatten):
|
||||
"""Augment the observation with past observations.
|
||||
|
||||
Implemented as a Numpy ring buffer holding the necessary past observations.
|
||||
|
||||
Args:
|
||||
env: OpenAI Gym environment to wrap.
|
||||
past_indices: List of non-negative integers indicating the time offsets
|
||||
from the current time step of observations to include.
|
||||
flatten: Concatenate the past observations rather than stacking them.
|
||||
|
||||
Raises:
|
||||
KeyError: The current observation is not included in the indices.
|
||||
"""
|
||||
if 0 not in past_indices:
|
||||
raise KeyError('Past indices should include 0 for the current frame.')
|
||||
self._env = env
|
||||
self._past_indices = past_indices
|
||||
self._step = 0
|
||||
self._buffer = None
|
||||
self._capacity = max(past_indices)
|
||||
self._flatten = flatten
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._env, name)
|
||||
|
||||
@property
|
||||
def observation_space(self):
|
||||
low = self._env.observation_space.low
|
||||
high = self._env.observation_space.high
|
||||
low = np.repeat(low[None, ...], len(self._past_indices), 0)
|
||||
high = np.repeat(high[None, ...], len(self._past_indices), 0)
|
||||
if self._flatten:
|
||||
low = np.reshape(low, (-1,) + low.shape[2:])
|
||||
high = np.reshape(high, (-1,) + high.shape[2:])
|
||||
return gym.spaces.Box(low, high)
|
||||
|
||||
def step(self, action):
|
||||
observ, reward, done, info = self._env.step(action)
|
||||
self._step += 1
|
||||
self._buffer[self._step % self._capacity] = observ
|
||||
observ = self._select_frames()
|
||||
return observ, reward, done, info
|
||||
|
||||
def reset(self):
|
||||
observ = self._env.reset()
|
||||
self._buffer = np.repeat(observ[None, ...], self._capacity, 0)
|
||||
self._step = 0
|
||||
return self._select_frames()
|
||||
|
||||
def _select_frames(self):
|
||||
indices = [
|
||||
(self._step - index) % self._capacity for index in self._past_indices]
|
||||
observ = self._buffer[indices]
|
||||
if self._flatten:
|
||||
observ = np.reshape(observ, (-1,) + observ.shape[2:])
|
||||
return observ
|
||||
|
||||
|
||||
class FrameDelta(object):
|
||||
"""Convert the observation to a difference from the previous observation."""
|
||||
|
||||
def __init__(self, env):
|
||||
self._env = env
|
||||
self._last = None
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._env, name)
|
||||
|
||||
@property
|
||||
def observation_space(self):
|
||||
low = self._env.observation_space.low
|
||||
high = self._env.observation_space.high
|
||||
low, high = low - high, high - low
|
||||
return gym.spaces.Box(low, high)
|
||||
|
||||
def step(self, action):
|
||||
observ, reward, done, info = self._env.step(action)
|
||||
delta = observ - self._last
|
||||
self._last = observ
|
||||
return delta, reward, done, info
|
||||
|
||||
def reset(self):
|
||||
observ = self._env.reset()
|
||||
self._last = observ
|
||||
return observ
|
||||
|
||||
|
||||
class RangeNormalize(object):
|
||||
"""Normalize the specialized observation and action ranges to [-1, 1]."""
|
||||
|
||||
def __init__(self, env, observ=None, action=None):
|
||||
self._env = env
|
||||
self._should_normalize_observ = (
|
||||
observ is not False and self._is_finite(self._env.observation_space))
|
||||
if observ is True and not self._should_normalize_observ:
|
||||
raise ValueError('Cannot normalize infinite observation range.')
|
||||
if observ is None and not self._should_normalize_observ:
|
||||
tf.logging.info('Not normalizing infinite observation range.')
|
||||
self._should_normalize_action = (
|
||||
action is not False and self._is_finite(self._env.action_space))
|
||||
if action is True and not self._should_normalize_action:
|
||||
raise ValueError('Cannot normalize infinite action range.')
|
||||
if action is None and not self._should_normalize_action:
|
||||
tf.logging.info('Not normalizing infinite action range.')
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._env, name)
|
||||
|
||||
@property
|
||||
def observation_space(self):
|
||||
space = self._env.observation_space
|
||||
if not self._should_normalize_observ:
|
||||
return space
|
||||
return gym.spaces.Box(-np.ones(space.shape), np.ones(space.shape))
|
||||
|
||||
@property
|
||||
def action_space(self):
|
||||
space = self._env.action_space
|
||||
if not self._should_normalize_action:
|
||||
return space
|
||||
return gym.spaces.Box(-np.ones(space.shape), np.ones(space.shape))
|
||||
|
||||
def step(self, action):
|
||||
if self._should_normalize_action:
|
||||
action = self._denormalize_action(action)
|
||||
observ, reward, done, info = self._env.step(action)
|
||||
if self._should_normalize_observ:
|
||||
observ = self._normalize_observ(observ)
|
||||
return observ, reward, done, info
|
||||
|
||||
def reset(self):
|
||||
observ = self._env.reset()
|
||||
if self._should_normalize_observ:
|
||||
observ = self._normalize_observ(observ)
|
||||
return observ
|
||||
|
||||
def _denormalize_action(self, action):
|
||||
min_ = self._env.action_space.low
|
||||
max_ = self._env.action_space.high
|
||||
action = (action + 1) / 2 * (max_ - min_) + min_
|
||||
return action
|
||||
|
||||
def _normalize_observ(self, observ):
|
||||
min_ = self._env.observation_space.low
|
||||
max_ = self._env.observation_space.high
|
||||
observ = 2 * (observ - min_) / (max_ - min_) - 1
|
||||
return observ
|
||||
|
||||
def _is_finite(self, space):
|
||||
return np.isfinite(space.low).all() and np.isfinite(space.high).all()
|
||||
|
||||
|
||||
class ClipAction(object):
|
||||
"""Clip out of range actions to the action space of the environment."""
|
||||
|
||||
def __init__(self, env):
|
||||
self._env = env
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._env, name)
|
||||
|
||||
@property
|
||||
def action_space(self):
|
||||
shape = self._env.action_space.shape
|
||||
return gym.spaces.Box(-np.inf * np.ones(shape), np.inf * np.ones(shape))
|
||||
|
||||
def step(self, action):
|
||||
action_space = self._env.action_space
|
||||
action = np.clip(action, action_space.low, action_space.high)
|
||||
return self._env.step(action)
|
||||
|
||||
|
||||
class LimitDuration(object):
|
||||
"""End episodes after specified number of steps."""
|
||||
|
||||
def __init__(self, env, duration):
|
||||
self._env = env
|
||||
self._duration = duration
|
||||
self._step = None
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._env, name)
|
||||
|
||||
def step(self, action):
|
||||
if self._step is None:
|
||||
raise RuntimeError('Must reset environment.')
|
||||
observ, reward, done, info = self._env.step(action)
|
||||
self._step += 1
|
||||
if self._step >= self._duration:
|
||||
done = True
|
||||
self._step = None
|
||||
return observ, reward, done, info
|
||||
|
||||
def reset(self):
|
||||
self._step = 0
|
||||
return self._env.reset()
|
||||
|
||||
|
||||
class ExternalProcess(object):
|
||||
"""Step environment in a separate process for lock free paralellism."""
|
||||
|
||||
# Message types for communication via the pipe.
|
||||
_ACTION = 1
|
||||
_RESET = 2
|
||||
_CLOSE = 3
|
||||
_ATTRIBUTE = 4
|
||||
_TRANSITION = 5
|
||||
_OBSERV = 6
|
||||
_EXCEPTION = 7
|
||||
_VALUE = 8
|
||||
|
||||
def __init__(self, constructor):
|
||||
"""Step environment in a separate process for lock free paralellism.
|
||||
|
||||
The environment will be created in the external process by calling the
|
||||
specified callable. This can be an environment class, or a function
|
||||
creating the environment and potentially wrapping it. The returned
|
||||
environment should not access global variables.
|
||||
|
||||
Args:
|
||||
constructor: Callable that creates and returns an OpenAI gym environment.
|
||||
|
||||
Attributes:
|
||||
observation_space: The cached observation space of the environment.
|
||||
action_space: The cached action space of the environment.
|
||||
"""
|
||||
self._conn, conn = multiprocessing.Pipe()
|
||||
self._process = multiprocessing.Process(
|
||||
target=self._worker, args=(constructor, conn))
|
||||
atexit.register(self.close)
|
||||
self._process.start()
|
||||
self._observ_space = None
|
||||
self._action_space = None
|
||||
|
||||
@property
|
||||
def observation_space(self):
|
||||
if not self._observ_space:
|
||||
self._observ_space = self.__getattr__('observation_space')
|
||||
return self._observ_space
|
||||
|
||||
@property
|
||||
def action_space(self):
|
||||
if not self._action_space:
|
||||
self._action_space = self.__getattr__('action_space')
|
||||
return self._action_space
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""Request an attribute from the environment.
|
||||
|
||||
Note that this involves communication with the external process, so it can
|
||||
be slow.
|
||||
|
||||
Args:
|
||||
name: Attribute to access.
|
||||
|
||||
Returns:
|
||||
Value of the attribute.
|
||||
"""
|
||||
self._conn.send((self._ATTRIBUTE, name))
|
||||
return self._receive(self._VALUE)
|
||||
|
||||
def step(self, action, blocking=True):
|
||||
"""Step the environment.
|
||||
|
||||
Args:
|
||||
action: The action to apply to the environment.
|
||||
blocking: Whether to wait for the result.
|
||||
|
||||
Returns:
|
||||
Transition tuple when blocking, otherwise callable that returns the
|
||||
transition tuple.
|
||||
"""
|
||||
self._conn.send((self._ACTION, action))
|
||||
if blocking:
|
||||
return self._receive(self._TRANSITION)
|
||||
else:
|
||||
return functools.partial(self._receive, self._TRANSITION)
|
||||
|
||||
def reset(self, blocking=True):
|
||||
"""Reset the environment.
|
||||
|
||||
Args:
|
||||
blocking: Whether to wait for the result.
|
||||
|
||||
Returns:
|
||||
New observation when blocking, otherwise callable that returns the new
|
||||
observation.
|
||||
"""
|
||||
self._conn.send((self._RESET, None))
|
||||
if blocking:
|
||||
return self._receive(self._OBSERV)
|
||||
else:
|
||||
return functools.partial(self._receive, self._OBSERV)
|
||||
|
||||
def close(self):
|
||||
"""Send a close message to the external process and join it."""
|
||||
try:
|
||||
self._conn.send((self._CLOSE, None))
|
||||
self._conn.close()
|
||||
except IOError:
|
||||
# The connection was already closed.
|
||||
pass
|
||||
self._process.join()
|
||||
|
||||
def _receive(self, expected_message):
|
||||
"""Wait for a message from the worker process and return its payload.
|
||||
|
||||
Args:
|
||||
expected_message: Type of the expected message.
|
||||
|
||||
Raises:
|
||||
Exception: An exception was raised inside the worker process.
|
||||
KeyError: The received message is not of the expected type.
|
||||
|
||||
Returns:
|
||||
Payload object of the message.
|
||||
"""
|
||||
message, payload = self._conn.recv()
|
||||
# Re-raise exceptions in the main process.
|
||||
if message == self._EXCEPTION:
|
||||
stacktrace = payload
|
||||
raise Exception(stacktrace)
|
||||
if message == expected_message:
|
||||
return payload
|
||||
raise KeyError('Received message of unexpected type {}'.format(message))
|
||||
|
||||
def _worker(self, constructor, conn):
|
||||
"""The process waits for actions and sends back environment results.
|
||||
|
||||
Args:
|
||||
constructor: Constructor for the OpenAI Gym environment.
|
||||
conn: Connection for communication to the main process.
|
||||
"""
|
||||
try:
|
||||
env = constructor()
|
||||
while True:
|
||||
try:
|
||||
# Only block for short times to have keyboard exceptions be raised.
|
||||
if not conn.poll(0.1):
|
||||
continue
|
||||
message, payload = conn.recv()
|
||||
except (EOFError, KeyboardInterrupt):
|
||||
break
|
||||
if message == self._ACTION:
|
||||
action = payload
|
||||
conn.send((self._TRANSITION, env.step(action)))
|
||||
continue
|
||||
if message == self._RESET:
|
||||
assert payload is None
|
||||
conn.send((self._OBSERV, env.reset()))
|
||||
continue
|
||||
if message == self._ATTRIBUTE:
|
||||
name = payload
|
||||
conn.send((self._VALUE, getattr(env, name)))
|
||||
continue
|
||||
if message == self._CLOSE:
|
||||
assert payload is None
|
||||
break
|
||||
raise KeyError('Received message of unknown type {}'.format(message))
|
||||
except Exception: # pylint: disable=broad-except
|
||||
stacktrace = ''.join(traceback.format_exception(*sys.exc_info()))
|
||||
conn.send((self._EXCEPTION, stacktrace))
|
||||
tf.logging.error('Error in environment process: {}'.format(stacktrace))
|
||||
conn.close()
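# --- Hypothetical usage sketch (not part of this commit). ---
# Runs a mock environment in a separate process, mirroring the tests further
# below; MockEnvironment is assumed to come from the tools package added in
# this change.
import functools

constructor = functools.partial(
    MockEnvironment, observ_shape=(2, 3), action_shape=(2,),
    min_duration=5, max_duration=5)
env = ExternalProcess(constructor)
observ = env.reset()
observ, reward, done, info = env.step(env.action_space.sample())
env.close()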
|
||||
|
||||
|
||||
class ConvertTo32Bit(object):
|
||||
"""Convert data types of an OpenAI Gym environment to 32 bit."""
|
||||
|
||||
def __init__(self, env):
|
||||
"""Convert data types of an OpenAI Gym environment to 32 bit.
|
||||
|
||||
Args:
|
||||
env: OpenAI Gym environment.
|
||||
"""
|
||||
self._env = env
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""Forward unimplemented attributes to the original environment.
|
||||
|
||||
Args:
|
||||
name: Attribute that was accessed.
|
||||
|
||||
Returns:
|
||||
Value behind the attribute name in the wrapped environment.
|
||||
"""
|
||||
return getattr(self._env, name)
|
||||
|
||||
def step(self, action):
|
||||
"""Forward action to the wrapped environment.
|
||||
|
||||
Args:
|
||||
action: Action to apply to the environment.
|
||||
|
||||
Raises:
|
||||
ValueError: Invalid action.
|
||||
|
||||
Returns:
|
||||
Converted observation, converted reward, done flag, and info object.
|
||||
"""
|
||||
observ, reward, done, info = self._env.step(action)
|
||||
observ = self._convert_observ(observ)
|
||||
reward = self._convert_reward(reward)
|
||||
return observ, reward, done, info
|
||||
|
||||
def reset(self):
|
||||
"""Reset the environment and convert the resulting observation.
|
||||
|
||||
Returns:
|
||||
Converted observation.
|
||||
"""
|
||||
observ = self._env.reset()
|
||||
observ = self._convert_observ(observ)
|
||||
return observ
|
||||
|
||||
def _convert_observ(self, observ):
|
||||
"""Convert the observation to 32 bits.
|
||||
|
||||
Args:
|
||||
observ: Numpy observation.
|
||||
|
||||
Raises:
|
||||
ValueError: Observation contains infinite values.
|
||||
|
||||
Returns:
|
||||
Numpy observation with 32-bit data type.
|
||||
"""
|
||||
if not np.isfinite(observ).all():
|
||||
raise ValueError('Infinite observation encountered.')
|
||||
if observ.dtype == np.float64:
|
||||
return observ.astype(np.float32)
|
||||
if observ.dtype == np.int64:
|
||||
return observ.astype(np.int32)
|
||||
return observ
|
||||
|
||||
def _convert_reward(self, reward):
|
||||
"""Convert the reward to 32 bits.
|
||||
|
||||
Args:
|
||||
reward: Numpy reward.
|
||||
|
||||
Raises:
|
||||
ValueError: Rewards contain infinite values.
|
||||
|
||||
Returns:
|
||||
Numpy reward with 32-bit data type.
|
||||
"""
|
||||
if not np.isfinite(reward).all():
|
||||
raise ValueError('Infinite reward encountered.')
|
||||
return np.array(reward, dtype=np.float32)
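# --- Hypothetical composition sketch (not part of this commit). ---
# Chains several of the wrappers above around a standard Gym task; the
# environment id and the duration limit are illustrative placeholders.
import gym
import numpy as np

env = gym.make('Pendulum-v0')
env = RangeNormalize(env)      # scale observations and actions to [-1, 1]
env = ClipAction(env)          # clip out-of-range actions before stepping
env = LimitDuration(env, 200)  # force the episode to end after 200 steps
env = ConvertTo32Bit(env)      # cast observations and rewards to 32 bit
observ = env.reset()
action = np.zeros(env.action_space.shape, np.float32)
observ, reward, done, info = env.step(action)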
|
@ -0,0 +1,90 @@
|
||||
# Copyright 2017 The TensorFlow Agents Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Tests for environment wrappers."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import functools
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from pybullet_envs.minitaur.agents import tools
|
||||
|
||||
|
||||
class ExternalProcessTest(tf.test.TestCase):
|
||||
|
||||
def test_close_no_hang_after_init(self):
|
||||
constructor = functools.partial(
|
||||
tools.MockEnvironment,
|
||||
observ_shape=(2, 3), action_shape=(2,),
|
||||
min_duration=2, max_duration=2)
|
||||
env = tools.wrappers.ExternalProcess(constructor)
|
||||
env.close()
|
||||
|
||||
def test_close_no_hang_after_step(self):
|
||||
constructor = functools.partial(
|
||||
tools.MockEnvironment,
|
||||
observ_shape=(2, 3), action_shape=(2,),
|
||||
min_duration=5, max_duration=5)
|
||||
env = tools.wrappers.ExternalProcess(constructor)
|
||||
env.reset()
|
||||
env.step(env.action_space.sample())
|
||||
env.step(env.action_space.sample())
|
||||
env.close()
|
||||
|
||||
def test_reraise_exception_in_init(self):
|
||||
constructor = MockEnvironmentCrashInInit
|
||||
env = tools.wrappers.ExternalProcess(constructor)
|
||||
with self.assertRaises(Exception):
|
||||
env.step(env.action_space.sample())
|
||||
|
||||
def test_reraise_exception_in_step(self):
|
||||
constructor = functools.partial(
|
||||
MockEnvironmentCrashInStep, crash_at_step=3)
|
||||
env = tools.wrappers.ExternalProcess(constructor)
|
||||
env.reset()
|
||||
env.step(env.action_space.sample())
|
||||
env.step(env.action_space.sample())
|
||||
with self.assertRaises(Exception):
|
||||
env.step(env.action_space.sample())
|
||||
|
||||
|
||||
class MockEnvironmentCrashInInit(object):
|
||||
"""Raise an error when instantiated."""
|
||||
|
||||
def __init__(self, *unused_args, **unused_kwargs):
|
||||
raise RuntimeError()
|
||||
|
||||
|
||||
class MockEnvironmentCrashInStep(tools.MockEnvironment):
|
||||
"""Raise an error after specified number of steps in an episode."""
|
||||
|
||||
def __init__(self, crash_at_step):
|
||||
super(MockEnvironmentCrashInStep, self).__init__(
|
||||
observ_shape=(2, 3), action_shape=(2,),
|
||||
min_duration=crash_at_step + 1, max_duration=crash_at_step + 1)
|
||||
self._crash_at_step = crash_at_step
|
||||
|
||||
def step(self, *args, **kwargs):
|
||||
transition = super(MockEnvironmentCrashInStep, self).step(*args, **kwargs)
|
||||
if self.steps[-1] == self._crash_at_step:
|
||||
raise RuntimeError()
|
||||
return transition
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
tf.test.main()
|
11
examples/pybullet/gym/pybullet_envs/minitaur/envs/README
Normal file
@ -0,0 +1,11 @@
|
||||
This folder contains a number of simulated Minitaur environments implemented using pybullet.
|
||||
The following two environments are used in the RSS paper "Sim-to-Real: Learning Agile Locomotion For Quadruped Robots":
|
||||
1) Galloping example: minitaur_reactive_env.py
|
||||
python minitaur_reactive_env_example.py runs a pre-trained PPO agent that performs a galloping gait.
|
||||
|
||||
2) Trotting example: minitaur_trotting_env.py
|
||||
python minitaur_trotting_env_example.py runs a pre-trained PPO agent that performs a trotting gait.
|
||||
|
||||
The rest are experimental environments.
|
||||
|
||||
|
@ -216,7 +216,7 @@ class MinitaurFourLegStandEnv(minitaur_gym_env.MinitaurGymEnv):
    self._env_step_counter += 1
    done = self._termination()
    obs = self._get_true_observation()
    reward = self._reward(action, obs)
    reward = self._reward()
    if self._log_path is not None:
      minitaur_logging.update_episode_proto(self._episode_proto, self.minitaur,
                                            action, self._env_step_counter)
@ -272,7 +272,7 @@ class MinitaurFourLegStandEnv(minitaur_gym_env.MinitaurGymEnv):
        np.asarray([0, 0, 1]), np.asarray(local_up))
    return local_global_up_dot_product < 0.85 or height < 0.15

  def _reward(self, action, obs):
  def _reward(self):
    roll, pitch, _ = self.minitaur.GetBaseRollPitchYaw()
    return 1.0 / (0.001 + math.fabs(roll) + math.fabs(pitch))
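The signature change above also drops the action argument, so the MinitaurFourLegStandEnv reward now depends only on the base orientation. A standalone sketch of the same formula (not part of the diff):

# Sketch of the reward shown above: staying level (roll = pitch = 0) maximizes
# the reward, and the 0.001 term caps it at 1000.
import math

def upright_reward(roll, pitch):
  return 1.0 / (0.001 + math.fabs(roll) + math.fabs(pitch))

print(upright_reward(0.0, 0.0))   # 1000.0
print(upright_reward(0.1, 0.05))  # roughly 6.6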
@ -1,8 +1,4 @@
r"""An example to use simple_ppo_agent.

blaze run -c opt \
//robotics/reinforcement_learning/minitaur/envs:minitaur_reactive_env_example
"""
r"""Running a pre-trained ppo agent on minitaur_reactive_env."""

from __future__ import absolute_import
from __future__ import division
@ -11,13 +7,13 @@ from __future__ import print_function
import os
import time
import tensorflow as tf
from agents.scripts import utility
from pybullet_envs.minitaur.agents.scripts import utility
import pybullet_data
import simple_ppo_agent

flags = tf.app.flags
FLAGS = tf.app.flags.FLAGS
LOG_DIR = (
    "testdata/minitaur_reactive_env_test")
LOG_DIR = os.path.join(pybullet_data.getDataPath(), "policies/ppo/minitaur_reactive_env")
CHECKPOINT = "model.ckpt-14000000"


@ -43,7 +39,6 @@ def main(argv):
    while True:
      action = agent.get_action([observation])
      observation, reward, done, _ = env.step(action[0])
      # This sleep is to prevent serial communication error on the real robot.
      time.sleep(0.002)
      sum_reward += reward
      if done:
@ -1,41 +1,51 @@
"""An example to run the minitaur environment of trotting gait.
r"""Running a pre-trained ppo agent on minitaur_trotting_env."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

"""
import time
import os
import numpy as np
import time
import tensorflow as tf
from pybullet_envs.minitaur.envs import minitaur_gym_env
from pybullet_envs.minitaur.envs import minitaur_trotting_env
from pybullet_envs.minitaur.agents.scripts import utility
import pybullet_data
import simple_ppo_agent

#FLAGS = tf.flags.FLAGS
#tf.flags.DEFINE_string("log_path", None, "The directory to write the log file.")
flags = tf.app.flags
FLAGS = tf.app.flags.FLAGS
LOG_DIR = os.path.join(pybullet_data.getDataPath(), "policies/ppo/minitaur_trotting_env")
CHECKPOINT = "model.ckpt-14000000"


def main(unused_argv):
  environment = minitaur_trotting_env.MinitaurTrottingEnv(
      urdf_version=minitaur_gym_env.RAINBOW_DASH_V0_URDF_VERSION,
      use_signal_in_observation=False,
      use_angle_in_observation=False,
      render=True,
      log_path=os.getcwd())
def main(argv):
  del argv  # Unused.
  config = utility.load_config(LOG_DIR)
  policy_layers = config.policy_layers
  value_layers = config.value_layers
  env = config.env(render=True)
  network = config.network

  np.random.seed(100)
  sum_reward = 0
  environment.reset()
  with tf.Session() as sess:
    agent = simple_ppo_agent.SimplePPOPolicy(
        sess,
        env,
        network,
        policy_layers=policy_layers,
        value_layers=value_layers,
        checkpoint=os.path.join(LOG_DIR, CHECKPOINT))

  steps = 5000
  for _ in range(steps):
    # Sleep to prevent serial buffer overflow on microcontroller.
    time.sleep(0.002)
    action = [0] * 8
    _, reward, done, _ = environment.step(action)
    sum_reward += reward
    if done:
      break
  tf.logging.info("reward: {}".format(sum_reward))
    sum_reward = 0
    observation = env.reset()
    while True:
      action = agent.get_action([observation])
      observation, reward, done, _ = env.step(action[0])
      time.sleep(0.002)
      sum_reward += reward
      if done:
        break
    tf.logging.info("reward: %s", sum_reward)


if __name__ == "__main__":
  tf.logging.set_verbosity(tf.logging.INFO)
  tf.app.run()
  tf.app.run(main)
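Both example scripts changed in this commit share the same inference loop; only the policy directory differs. A consolidated sketch (an assumption, not a file in this commit) using the same TF1-era APIs as the examples above:

# Sketch (assumption): generic runner for the bundled pre-trained policies,
# following the reactive and trotting example scripts in this diff.
import os
import time
import tensorflow as tf
import pybullet_data
from pybullet_envs.minitaur.agents.scripts import utility
import simple_ppo_agent  # resolved from the envs directory, as in the examples


def run_pretrained(policy_subdir, checkpoint="model.ckpt-14000000"):
  log_dir = os.path.join(pybullet_data.getDataPath(), policy_subdir)
  config = utility.load_config(log_dir)
  env = config.env(render=True)
  with tf.Session() as sess:
    agent = simple_ppo_agent.SimplePPOPolicy(
        sess,
        env,
        config.network,
        policy_layers=config.policy_layers,
        value_layers=config.value_layers,
        checkpoint=os.path.join(log_dir, checkpoint))
    sum_reward = 0
    observation = env.reset()
    while True:
      action = agent.get_action([observation])
      observation, reward, done, _ = env.step(action[0])
      time.sleep(0.002)  # same pacing as the examples (real-robot serial I/O)
      sum_reward += reward
      if done:
        break
    tf.logging.info("reward: %s", sum_reward)


if __name__ == "__main__":
  tf.logging.set_verbosity(tf.logging.INFO)
  run_pretrained("policies/ppo/minitaur_trotting_env")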