Make sure that the pre-trained galloping and trotting policies work for the minitaur_reactive_env and minitaur_trotting_env environments.

This commit is contained in:
Jie 2018-04-24 21:48:27 -07:00
parent 982453abc6
commit a375a349ce
43 changed files with 4447 additions and 42 deletions
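As a quick sanity check that the two environments referenced by the pre-trained policies still construct and step, a minimal smoke test could look like the sketch below. It is illustrative and not part of this commit; it assumes both environments follow the standard Gym reset/step interface and accept a render keyword argument, as suggested by the checked-in configs.

# Illustrative smoke test (not part of this commit).
from pybullet_envs.minitaur.envs.minitaur_reactive_env import MinitaurReactiveEnv
from pybullet_envs.minitaur.envs.minitaur_trotting_env import MinitaurTrottingEnv

for env_class in (MinitaurReactiveEnv, MinitaurTrottingEnv):
  env = env_class(render=False)  # render=False mirrors the checked-in configs.
  env.reset()
  for _ in range(10):
    _, reward, done, _ = env.step(env.action_space.sample())
    if done:
      break
  env.close()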

View File

@ -0,0 +1,54 @@
!!python/object/new:pybullet_envs.minitaur.agents.tools.attr_dict.AttrDict
dictitems:
algorithm: !!python/name:pybullet_envs.minitaur.agents.ppo.algorithm.PPOAlgorithm ''
discount: 0.9868209124499899
env: !!python/object/apply:functools.partial
args:
- &id001 !!python/name:pybullet_envs.minitaur.envs.minitaur_reactive_env.MinitaurReactiveEnv ''
state: !!python/tuple
- *id001
- !!python/tuple []
- accurate_motor_model_enabled: true
control_latency: 0.02
energy_weight: 0.005
env_randomizer: null
motor_kd: 0.015
num_steps_to_log: 1000
pd_latency: 0.003
remove_default_joint_damping: true
render: false
urdf_version: rainbow_dash_v0
- null
eval_episodes: 25
init_logstd: -1.1579536194508315
init_mean_factor: 0.3084392491563408
kl_cutoff_coef: 1000
kl_cutoff_factor: 2
kl_init_penalty: 1
kl_target: 0.01
logdir: /cns/ij-d/home/jietan/experiment/minitaur_vizier_study_ppo/minreact_nonexp_nr_02_186515603_186518344/333
max_length: 1000
network: !!python/name:pybullet_envs.minitaur.agents.scripts.networks.ForwardGaussianPolicy ''
network_config: {}
num_agents: 25
policy_layers: !!python/tuple
- 114
- 45
policy_lr: 0.00023516695218031146
policy_optimizer: AdamOptimizer
steps: 7000000.0
update_epochs_policy: 25
update_epochs_value: 25
update_every: 25
use_gpu: false
value_layers: !!python/tuple
- 170
- 78
value_lr: 0.00031014032715987193
value_optimizer: AdamOptimizer
weight_summaries:
all: .*
policy: .*/policy/.*
value: .*/value/.*
state:
_mutable: false
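The config above is a PyYAML dump of an AttrDict that uses python-object tags, so restoring it requires a loader able to construct Python objects. A minimal sketch, assuming PyYAML 5.x, a trusted file, and an illustrative path; the scripts' own utility module presumably wraps something similar:

import yaml

# The !!python tags import the referenced pybullet_envs modules on load,
# so this must only ever be used on trusted config files.
with open('minitaur_reactive_config.yaml') as config_file:
  config = yaml.load(config_file, Loader=yaml.UnsafeLoader)
# AttrDict exposes the entries as attributes.
print(config.policy_layers, config.value_layers, config.max_length)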

View File

@ -0,0 +1,51 @@
!!python/object/new:pybullet_envs.minitaur.agents.tools.attr_dict.AttrDict
dictitems:
algorithm: !!python/name:pybullet_envs.minitaur.agents.ppo.algorithm.PPOAlgorithm ''
discount: 0.9899764168788918
env: !!python/object/apply:functools.partial
args:
- &id001 !!python/name:pybullet_envs.minitaur.envs.minitaur_trotting_env.MinitaurTrottingEnv ''
state: !!python/tuple
- *id001
- !!python/tuple []
- env_randomizer: null
motor_kd: 0.015
num_steps_to_log: 1000
pd_latency: 0.003
remove_default_joint_damping: true
render: false
urdf_version: rainbow_dash_v0
- null
eval_episodes: 25
init_logstd: -0.6325707791047228
init_mean_factor: 0.6508531688665261
kl_cutoff_coef: 1000
kl_cutoff_factor: 2
kl_init_penalty: 1
kl_target: 0.01
logdir: /cns/ij-d/home/jietan/experiment/minitaur_vizier_study_ppo/mintrot_nonexp_nr_01_186515603_186518344/373
max_length: 1000
network: !!python/name:pybullet_envs.minitaur.agents.scripts.networks.ForwardGaussianPolicy ''
network_config: {}
num_agents: 25
policy_layers: !!python/tuple
- 133
- 100
policy_lr: 0.00048104185841752015
policy_optimizer: AdamOptimizer
steps: 7000000.0
update_epochs_policy: 25
update_epochs_value: 25
update_every: 25
use_gpu: false
value_layers: !!python/tuple
- 64
- 57
value_lr: 0.0012786382882055453
value_optimizer: AdamOptimizer
weight_summaries:
all: .*
policy: .*/policy/.*
value: .*/value/.*
state:
_mutable: false

View File

@ -1 +1,3 @@
import gym

View File

@ -0,0 +1,21 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Proximal Policy Optimization algorithm."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from .algorithm import PPOAlgorithm

View File

@ -0,0 +1,558 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Proximal Policy Optimization algorithm.
Based on John Schulman's implementation in Python and Theano:
https://github.com/joschu/modular_rl/blob/master/modular_rl/ppo.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import tensorflow as tf
from . import memory
from . import normalize
from . import utility
_NetworkOutput = collections.namedtuple(
'NetworkOutput', 'policy, mean, logstd, value, state')
class PPOAlgorithm(object):
"""A vectorized implementation of the PPO algorithm by John Schulman."""
def __init__(self, batch_env, step, is_training, should_log, config):
"""Create an instance of the PPO algorithm.
Args:
batch_env: In-graph batch environment.
step: Integer tensor holding the current training step.
is_training: Boolean tensor for whether the algorithm should train.
should_log: Boolean tensor for whether summaries should be returned.
config: Object containing the agent configuration as attributes.
"""
self._batch_env = batch_env
self._step = step
self._is_training = is_training
self._should_log = should_log
self._config = config
self._observ_filter = normalize.StreamingNormalize(
self._batch_env.observ[0], center=True, scale=True, clip=5,
name='normalize_observ')
self._reward_filter = normalize.StreamingNormalize(
self._batch_env.reward[0], center=False, scale=True, clip=10,
name='normalize_reward')
# Memory stores tuple of observ, action, mean, logstd, reward.
template = (
self._batch_env.observ[0], self._batch_env.action[0],
self._batch_env.action[0], self._batch_env.action[0],
self._batch_env.reward[0])
self._memory = memory.EpisodeMemory(
template, config.update_every, config.max_length, 'memory')
self._memory_index = tf.Variable(0, False)
use_gpu = self._config.use_gpu and utility.available_gpus()
with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
# Create network variables for later calls to reuse.
self._network(
tf.zeros_like(self._batch_env.observ)[:, None],
tf.ones(len(self._batch_env)), reuse=None)
cell = self._config.network(self._batch_env.action.shape[1].value)
with tf.variable_scope('ppo_temporary'):
self._episodes = memory.EpisodeMemory(
template, len(batch_env), config.max_length, 'episodes')
self._last_state = utility.create_nested_vars(
cell.zero_state(len(batch_env), tf.float32))
self._last_action = tf.Variable(
tf.zeros_like(self._batch_env.action), False, name='last_action')
self._last_mean = tf.Variable(
tf.zeros_like(self._batch_env.action), False, name='last_mean')
self._last_logstd = tf.Variable(
tf.zeros_like(self._batch_env.action), False, name='last_logstd')
self._penalty = tf.Variable(
self._config.kl_init_penalty, False, dtype=tf.float32)
self._policy_optimizer = self._config.policy_optimizer(
self._config.policy_lr, name='policy_optimizer')
self._value_optimizer = self._config.value_optimizer(
self._config.value_lr, name='value_optimizer')
def begin_episode(self, agent_indices):
"""Reset the recurrent states and stored episode.
Args:
agent_indices: 1D tensor of batch indices for agents starting an episode.
Returns:
Summary tensor.
"""
with tf.name_scope('begin_episode/'):
reset_state = utility.reinit_nested_vars(self._last_state, agent_indices)
reset_buffer = self._episodes.clear(agent_indices)
with tf.control_dependencies([reset_state, reset_buffer]):
return tf.constant('')
def perform(self, observ):
"""Compute batch of actions and a summary for a batch of observation.
Args:
observ: Tensor of a batch of observations for all agents.
Returns:
Tuple of action batch tensor and summary tensor.
"""
with tf.name_scope('perform/'):
observ = self._observ_filter.transform(observ)
network = self._network(
observ[:, None], tf.ones(observ.shape[0]), self._last_state)
action = tf.cond(
self._is_training, network.policy.sample, lambda: network.mean)
logprob = network.policy.log_prob(action)[:, 0]
# pylint: disable=g-long-lambda
summary = tf.cond(self._should_log, lambda: tf.summary.merge([
tf.summary.histogram('mean', network.mean[:, 0]),
tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
tf.summary.histogram('action', action[:, 0]),
tf.summary.histogram('logprob', logprob)]), str)
# Remember current policy to append to memory in the experience callback.
with tf.control_dependencies([
utility.assign_nested_vars(self._last_state, network.state),
self._last_action.assign(action[:, 0]),
self._last_mean.assign(network.mean[:, 0]),
self._last_logstd.assign(network.logstd[:, 0])]):
return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
def experience(self, observ, action, reward, unused_done, unused_nextob):
"""Process the transition tuple of the current step.
When training, add the current transition tuple to the memory and update
the streaming statistics for observations and rewards. A summary string is
returned if requested at this step.
Args:
observ: Batch tensor of observations.
action: Batch tensor of actions.
reward: Batch tensor of rewards.
unused_done: Batch tensor of done flags.
unused_nextob: Batch tensor of successor observations.
Returns:
Summary tensor.
"""
with tf.name_scope('experience/'):
return tf.cond(
self._is_training,
lambda: self._define_experience(observ, action, reward), str)
def _define_experience(self, observ, action, reward):
"""Implement the branch of experience() entered during training."""
update_filters = tf.summary.merge([
self._observ_filter.update(observ),
self._reward_filter.update(reward)])
with tf.control_dependencies([update_filters]):
if self._config.train_on_agent_action:
# NOTE: Doesn't seem to change much.
action = self._last_action
batch = observ, action, self._last_mean, self._last_logstd, reward
append = self._episodes.append(batch, tf.range(len(self._batch_env)))
with tf.control_dependencies([append]):
norm_observ = self._observ_filter.transform(observ)
norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))
# pylint: disable=g-long-lambda
summary = tf.cond(self._should_log, lambda: tf.summary.merge([
update_filters,
self._observ_filter.summary(),
self._reward_filter.summary(),
tf.summary.scalar('memory_size', self._memory_index),
tf.summary.histogram('normalized_observ', norm_observ),
tf.summary.histogram('action', self._last_action),
tf.summary.scalar('normalized_reward', norm_reward)]), str)
return summary
def end_episode(self, agent_indices):
"""Add episodes to the memory and perform update steps if memory is full.
During training, add the collected episodes of the batch indices that
finished their episode to the memory. If the memory is full, train on it,
and then clear the memory. A summary string is returned if requested at
this step.
Args:
agent_indices: 1D tensor of batch indices for agents ending an episode.
Returns:
Summary tensor.
"""
with tf.name_scope('end_episode/'):
return tf.cond(
self._is_training,
lambda: self._define_end_episode(agent_indices), str)
def _define_end_episode(self, agent_indices):
"""Implement the branch of end_episode() entered during training."""
episodes, length = self._episodes.data(agent_indices)
space_left = self._config.update_every - self._memory_index
use_episodes = tf.range(tf.minimum(
tf.shape(agent_indices)[0], space_left))
episodes = [tf.gather(elem, use_episodes) for elem in episodes]
append = self._memory.replace(
episodes, tf.gather(length, use_episodes),
use_episodes + self._memory_index)
with tf.control_dependencies([append]):
inc_index = self._memory_index.assign_add(tf.shape(use_episodes)[0])
with tf.control_dependencies([inc_index]):
memory_full = self._memory_index >= self._config.update_every
return tf.cond(memory_full, self._training, str)
def _training(self):
"""Perform multiple training iterations of both policy and value baseline.
Trains on the episodes collected in the memory and resets the memory
afterwards. Always returns a summary string.
Returns:
Summary tensor.
"""
with tf.name_scope('training'):
assert_full = tf.assert_equal(
self._memory_index, self._config.update_every)
with tf.control_dependencies([assert_full]):
data = self._memory.data()
(observ, action, old_mean, old_logstd, reward), length = data
with tf.control_dependencies([tf.assert_greater(length, 0)]):
length = tf.identity(length)
observ = self._observ_filter.transform(observ)
reward = self._reward_filter.transform(reward)
policy_summary = self._update_policy(
observ, action, old_mean, old_logstd, reward, length)
with tf.control_dependencies([policy_summary]):
value_summary = self._update_value(observ, reward, length)
with tf.control_dependencies([value_summary]):
penalty_summary = self._adjust_penalty(
observ, old_mean, old_logstd, length)
with tf.control_dependencies([penalty_summary]):
clear_memory = tf.group(
self._memory.clear(), self._memory_index.assign(0))
with tf.control_dependencies([clear_memory]):
weight_summary = utility.variable_summaries(
tf.trainable_variables(), self._config.weight_summaries)
return tf.summary.merge([
policy_summary, value_summary, penalty_summary, weight_summary])
def _update_value(self, observ, reward, length):
"""Perform multiple update steps of the value baseline.
Only the summary of a single iteration can be returned, so we choose the one
from halfway through the iterations.
Args:
observ: Sequences of observations.
reward: Sequences of reward.
length: Batch of sequence lengths.
Returns:
Summary tensor.
"""
with tf.name_scope('update_value'):
loss, summary = tf.scan(
lambda _1, _2: self._update_value_step(observ, reward, length),
tf.range(self._config.update_epochs_value),
[0., ''], parallel_iterations=1)
print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'value loss: ')
with tf.control_dependencies([loss, print_loss]):
return summary[self._config.update_epochs_value // 2]
def _update_value_step(self, observ, reward, length):
"""Compute the current value loss and perform a gradient update step.
Args:
observ: Sequences of observations.
reward: Sequences of reward.
length: Batch of sequence lengths.
Returns:
Tuple of loss tensor and summary tensor.
"""
loss, summary = self._value_loss(observ, reward, length)
gradients, variables = (
zip(*self._value_optimizer.compute_gradients(loss)))
optimize = self._value_optimizer.apply_gradients(
zip(gradients, variables))
summary = tf.summary.merge([
summary,
tf.summary.scalar('gradient_norm', tf.global_norm(gradients)),
utility.gradient_summaries(
zip(gradients, variables), dict(value=r'.*'))])
with tf.control_dependencies([optimize]):
return [tf.identity(loss), tf.identity(summary)]
def _value_loss(self, observ, reward, length):
"""Compute the loss function for the value baseline.
The value loss is the difference between empirical and approximated returns
over the collected episodes. Returns the loss tensor and a summary string.
Args:
observ: Sequences of observations.
reward: Sequences of reward.
length: Batch of sequence lengths.
Returns:
Tuple of loss tensor and summary tensor.
"""
with tf.name_scope('value_loss'):
value = self._network(observ, length).value
return_ = utility.discounted_return(
reward, length, self._config.discount)
advantage = return_ - value
value_loss = 0.5 * self._mask(advantage ** 2, length)
summary = tf.summary.merge([
tf.summary.histogram('value_loss', value_loss),
tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))])
value_loss = tf.reduce_mean(value_loss)
return tf.check_numerics(value_loss, 'value_loss'), summary
def _update_policy(
self, observ, action, old_mean, old_logstd, reward, length):
"""Perform multiple update steps of the policy.
The advantage is computed once at the beginning and shared across
iterations. Only the summary of a single iteration can be returned, so we
choose the one from halfway through the iterations.
Args:
observ: Sequences of observations.
action: Sequences of actions.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
reward: Sequences of rewards.
length: Batch of sequence lengths.
Returns:
Summary tensor.
"""
with tf.name_scope('update_policy'):
return_ = utility.discounted_return(
reward, length, self._config.discount)
value = self._network(observ, length).value
if self._config.gae_lambda:
advantage = utility.lambda_return(
reward, value, length, self._config.discount,
self._config.gae_lambda)
else:
advantage = return_ - value
mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
advantage = tf.Print(
advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)],
'return and value: ')
advantage = tf.Print(
advantage, [tf.reduce_mean(advantage)],
'normalized advantage: ')
# pylint: disable=g-long-lambda
loss, summary = tf.scan(
lambda _1, _2: self._update_policy_step(
observ, action, old_mean, old_logstd, advantage, length),
tf.range(self._config.update_epochs_policy),
[0., ''], parallel_iterations=1)
print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'policy loss: ')
with tf.control_dependencies([loss, print_loss]):
return summary[self._config.update_epochs_policy // 2]
def _update_policy_step(
self, observ, action, old_mean, old_logstd, advantage, length):
"""Compute the current policy loss and perform a gradient update step.
Args:
observ: Sequences of observations.
action: Sequences of actions.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
advantage: Sequences of advantages.
length: Batch of sequence lengths.
Returns:
Tuple of loss tensor and summary tensor.
"""
network = self._network(observ, length)
loss, summary = self._policy_loss(
network.mean, network.logstd, old_mean, old_logstd, action,
advantage, length)
gradients, variables = (
zip(*self._policy_optimizer.compute_gradients(loss)))
optimize = self._policy_optimizer.apply_gradients(
zip(gradients, variables))
summary = tf.summary.merge([
summary,
tf.summary.scalar('gradient_norm', tf.global_norm(gradients)),
utility.gradient_summaries(
zip(gradients, variables), dict(policy=r'.*'))])
with tf.control_dependencies([optimize]):
return [tf.identity(loss), tf.identity(summary)]
def _policy_loss(
self, mean, logstd, old_mean, old_logstd, action, advantage, length):
"""Compute the policy loss composed of multiple components.
1. The policy gradient loss is importance sampled from the data-collecting
policy at the beginning of the training phase.
2. The second term is a KL penalty between the policy at the beginning of the
training phase and the current policy.
3. Additionally, if this KL divergence already exceeds twice the target
amount, a strong quadratic penalty discourages further divergence.
Args:
mean: Sequences of action means of the current policy.
logstd: Sequences of action log stddevs of the current policy.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
action: Sequences of actions.
advantage: Sequences of advantages.
length: Batch of sequence lengths.
Returns:
Tuple of loss tensor and summary tensor.
"""
with tf.name_scope('policy_loss'):
entropy = utility.diag_normal_entropy(mean, logstd)
kl = tf.reduce_mean(self._mask(utility.diag_normal_kl(
old_mean, old_logstd, mean, logstd), length), 1)
policy_gradient = tf.exp(
utility.diag_normal_logpdf(mean, logstd, action) -
utility.diag_normal_logpdf(old_mean, old_logstd, action))
surrogate_loss = -tf.reduce_mean(self._mask(
policy_gradient * tf.stop_gradient(advantage), length), 1)
kl_penalty = self._penalty * kl
cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
cutoff_count = tf.reduce_sum(
tf.cast(kl > cutoff_threshold, tf.int32))
with tf.control_dependencies([tf.cond(
cutoff_count > 0,
lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int)]):
kl_cutoff = (
self._config.kl_cutoff_coef *
tf.cast(kl > cutoff_threshold, tf.float32) *
(kl - cutoff_threshold) ** 2)
policy_loss = surrogate_loss + kl_penalty + kl_cutoff
summary = tf.summary.merge([
tf.summary.histogram('entropy', entropy),
tf.summary.histogram('kl', kl),
tf.summary.histogram('surrogate_loss', surrogate_loss),
tf.summary.histogram('kl_penalty', kl_penalty),
tf.summary.histogram('kl_cutoff', kl_cutoff),
tf.summary.histogram('kl_penalty_combined', kl_penalty + kl_cutoff),
tf.summary.histogram('policy_loss', policy_loss),
tf.summary.scalar('avg_surr_loss', tf.reduce_mean(surrogate_loss)),
tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))])
policy_loss = tf.reduce_mean(policy_loss, 0)
return tf.check_numerics(policy_loss, 'policy_loss'), summary
def _adjust_penalty(self, observ, old_mean, old_logstd, length):
"""Adjust the KL policy between the behavioral and current policy.
Compute how much the policy actually changed during the multiple
update steps. Adjust the penalty strength for the next training phase if we
overshot or undershot the target divergence too much.
Args:
observ: Sequences of observations.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
length: Batch of sequence lengths.
Returns:
Summary tensor.
"""
with tf.name_scope('adjust_penalty'):
network = self._network(observ, length)
assert_change = tf.assert_equal(
tf.reduce_all(tf.equal(network.mean, old_mean)), False,
message='policy should change')
print_penalty = tf.Print(0, [self._penalty], 'current penalty: ')
with tf.control_dependencies([assert_change, print_penalty]):
kl_change = tf.reduce_mean(self._mask(utility.diag_normal_kl(
old_mean, old_logstd, network.mean, network.logstd), length))
kl_change = tf.Print(kl_change, [kl_change], 'kl change: ')
maybe_increase = tf.cond(
kl_change > 1.3 * self._config.kl_target,
# pylint: disable=g-long-lambda
lambda: tf.Print(self._penalty.assign(
self._penalty * 1.5), [0], 'increase penalty '),
float)
maybe_decrease = tf.cond(
kl_change < 0.7 * self._config.kl_target,
# pylint: disable=g-long-lambda
lambda: tf.Print(self._penalty.assign(
self._penalty / 1.5), [0], 'decrease penalty '),
float)
with tf.control_dependencies([maybe_increase, maybe_decrease]):
return tf.summary.merge([
tf.summary.scalar('kl_change', kl_change),
tf.summary.scalar('penalty', self._penalty)])
def _mask(self, tensor, length):
"""Set padding elements of a batch of sequences to zero.
Useful to then safely sum along the time dimension.
Args:
tensor: Tensor of sequences.
length: Batch of sequence lengths.
Returns:
Masked sequences.
"""
with tf.name_scope('mask'):
range_ = tf.range(tensor.shape[1].value)
mask = tf.cast(range_[None, :] < length[:, None], tf.float32)
masked = tensor * mask
return tf.check_numerics(masked, 'masked')
def _network(self, observ, length=None, state=None, reuse=True):
"""Compute the network output for a batched sequence of observations.
Optionally, the initial state can be specified. The weights should be
reused for all calls, except for the first one. Output is a named tuple
containing the policy as a TensorFlow distribution, the policy mean and log
standard deviation, the approximated state value, and the new recurrent
state.
Args:
observ: Sequences of observations.
length: Batch of sequence lengths.
state: Batch of initial recurrent states.
reuse: Python boolean whether to reuse previous variables.
Returns:
NetworkOutput tuple.
"""
with tf.variable_scope('network', reuse=reuse):
observ = tf.convert_to_tensor(observ)
use_gpu = self._config.use_gpu and utility.available_gpus()
with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
observ = tf.check_numerics(observ, 'observ')
cell = self._config.network(self._batch_env.action.shape[1].value)
(mean, logstd, value), state = tf.nn.dynamic_rnn(
cell, observ, length, state, tf.float32, swap_memory=True)
mean = tf.check_numerics(mean, 'mean')
logstd = tf.check_numerics(logstd, 'logstd')
value = tf.check_numerics(value, 'value')
policy = tf.contrib.distributions.MultivariateNormalDiag(
mean, tf.exp(logstd))
return _NetworkOutput(policy, mean, logstd, value, state)
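The adaptive KL penalty in _adjust_penalty() above follows a simple rule: grow the coefficient when the measured KL change overshoots the target, shrink it when it undershoots. A plain-Python sketch of that rule (illustrative only, not part of the commit):

def adjust_penalty(penalty, kl_change, kl_target):
  """Mirror of the 1.5x increase/decrease rule used by _adjust_penalty()."""
  if kl_change > 1.3 * kl_target:
    return penalty * 1.5  # Policy moved too far; penalize divergence more.
  if kl_change < 0.7 * kl_target:
    return penalty / 1.5  # Policy barely moved; relax the penalty.
  return penalty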

View File

@ -0,0 +1,152 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Memory that stores episodes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class EpisodeMemory(object):
"""Memory that stores episodes."""
def __init__(self, template, capacity, max_length, scope):
"""Create a memory that stores episodes.
Each transition tuple consists of quantities specified by the template.
These quantities would typically be observations, actions, rewards, and
done indicators.
Args:
template: List of tensors to derive shapes and dtypes of each transition.
capacity: Number of episodes, or rows, held by the memory.
max_length: Allocated sequence length for the episodes.
scope: Variable scope to use for internal variables.
"""
self._capacity = capacity
self._max_length = max_length
with tf.variable_scope(scope) as scope:
self._scope = scope
self._length = tf.Variable(tf.zeros(capacity, tf.int32), False)
self._buffers = [
tf.Variable(tf.zeros(
[capacity, max_length] + elem.shape.as_list(),
elem.dtype), False)
for elem in template]
def length(self, rows=None):
"""Tensor holding the current length of episodes.
Args:
rows: Episodes to select length from, defaults to all.
Returns:
Batch tensor of sequence lengths.
"""
rows = tf.range(self._capacity) if rows is None else rows
return tf.gather(self._length, rows)
def append(self, transitions, rows=None):
"""Append a batch of transitions to rows of the memory.
Args:
transitions: Tuple of transition quantities with batch dimension.
rows: Episodes to append to, defaults to all.
Returns:
Operation.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
assert_capacity = tf.assert_less(
rows, self._capacity,
message='capacity exceeded')
with tf.control_dependencies([assert_capacity]):
assert_max_length = tf.assert_less(
tf.gather(self._length, rows), self._max_length,
message='max length exceeded')
append_ops = []
with tf.control_dependencies([assert_max_length]):
for buffer_, elements in zip(self._buffers, transitions):
timestep = tf.gather(self._length, rows)
indices = tf.stack([rows, timestep], 1)
append_ops.append(tf.scatter_nd_update(buffer_, indices, elements))
with tf.control_dependencies(append_ops):
episode_mask = tf.reduce_sum(tf.one_hot(
rows, self._capacity, dtype=tf.int32), 0)
return self._length.assign_add(episode_mask)
def replace(self, episodes, length, rows=None):
"""Replace full episodes.
Args:
episodes: Tuple of transition quantities with batch and time dimensions.
length: Batch of sequence lengths.
rows: Episodes to replace, defaults to all.
Returns:
Operation.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
assert_capacity = tf.assert_less(
rows, self._capacity, message='capacity exceeded')
with tf.control_dependencies([assert_capacity]):
assert_max_length = tf.assert_less_equal(
length, self._max_length, message='max length exceeded')
replace_ops = []
with tf.control_dependencies([assert_max_length]):
for buffer_, elements in zip(self._buffers, episodes):
replace_op = tf.scatter_update(buffer_, rows, elements)
replace_ops.append(replace_op)
with tf.control_dependencies(replace_ops):
return tf.scatter_update(self._length, rows, length)
def data(self, rows=None):
"""Access a batch of episodes from the memory.
Padding elements after the length of each episode are unspecified and might
contain old data.
Args:
rows: Episodes to select, defaults to all.
Returns:
Tuple containing a tuple of transition quantities with batch and time
dimensions, and a batch of sequence lengths.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
episode = [tf.gather(buffer_, rows) for buffer_ in self._buffers]
length = tf.gather(self._length, rows)
return episode, length
def clear(self, rows=None):
"""Reset episodes in the memory.
Internally, this only sets their lengths to zero. The memory entries will
be overridden by future calls to append() or replace().
Args:
rows: Episodes to clear, defaults to all.
Returns:
Operation.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
return tf.scatter_update(self._length, rows, tf.zeros_like(rows))
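A NumPy analogue of the append() semantics above, showing that each transition is written at position [row, current_length[row]] and the per-row length is then incremented (illustrative only; shapes are hypothetical):

import numpy as np

capacity, max_length, observ_size = 4, 10, 3
buffer_ = np.zeros((capacity, max_length, observ_size))
length = np.zeros(capacity, dtype=np.int32)

def append(observations, rows):
  buffer_[rows, length[rows]] = observations  # Write at the current episode step.
  length[rows] += 1                           # Then advance those episodes.

append(np.ones((2, observ_size)), rows=np.array([0, 2]))
assert list(length) == [1, 0, 1, 0]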

View File

@ -0,0 +1,168 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize tensors based on streaming estimates of mean and variance."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class StreamingNormalize(object):
"""Normalize tensors based on streaming estimates of mean and variance."""
def __init__(
self, template, center=True, scale=True, clip=10, name='normalize'):
"""Normalize tensors based on streaming estimates of mean and variance.
Centering the value, scaling it by the standard deviation, and clipping
outlier values are optional.
Args:
template: Example tensor providing shape and dtype of the value to track.
center: Python boolean indicating whether to subtract mean from values.
scale: Python boolean indicating whether to scale values by stddev.
clip: Symmetric bound for clipping normalized values; falsy disables clipping.
name: Parent scope of operations provided by this class.
"""
self._center = center
self._scale = scale
self._clip = clip
self._name = name
with tf.name_scope(name):
self._count = tf.Variable(0, False)
self._mean = tf.Variable(tf.zeros_like(template), False)
self._var_sum = tf.Variable(tf.zeros_like(template), False)
def transform(self, value):
"""Normalize a single or batch tensor.
Applies the activated transformations in the constructor using current
estimates of mean and variance.
Args:
value: Batch or single value tensor.
Returns:
Normalized batch or single value tensor.
"""
with tf.name_scope(self._name + '/transform'):
no_batch_dim = value.shape.ndims == self._mean.shape.ndims
if no_batch_dim:
# Add a batch dimension if necessary.
value = value[None, ...]
if self._center:
value -= self._mean[None, ...]
if self._scale:
# We cannot scale before seeing at least two samples.
value /= tf.cond(
self._count > 1, lambda: self._std() + 1e-8,
lambda: tf.ones_like(self._var_sum))[None]
if self._clip:
value = tf.clip_by_value(value, -self._clip, self._clip)
# Remove batch dimension if necessary.
if no_batch_dim:
value = value[0]
return tf.check_numerics(value, 'value')
def update(self, value):
"""Update the mean and variance estimates.
Args:
value: Batch or single value tensor.
Returns:
Summary tensor.
"""
with tf.name_scope(self._name + '/update'):
if value.shape.ndims == self._mean.shape.ndims:
# Add a batch dimension if necessary.
value = value[None, ...]
count = tf.shape(value)[0]
with tf.control_dependencies([self._count.assign_add(count)]):
step = tf.cast(self._count, tf.float32)
mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0)
new_mean = self._mean + mean_delta / step
new_mean = tf.cond(self._count > 1, lambda: new_mean, lambda: value[0])
var_delta = (
value - self._mean[None, ...]) * (value - new_mean[None, ...])
new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0)
with tf.control_dependencies([new_mean, new_var_sum]):
update = self._mean.assign(new_mean), self._var_sum.assign(new_var_sum)
with tf.control_dependencies(update):
if value.shape.ndims == 1:
value = tf.reduce_mean(value)
return self._summary('value', tf.reduce_mean(value))
def reset(self):
"""Reset the estimates of mean and variance.
Resets the full state of this class.
Returns:
Operation.
"""
with tf.name_scope(self._name + '/reset'):
return tf.group(
self._count.assign(0),
self._mean.assign(tf.zeros_like(self._mean)),
self._var_sum.assign(tf.zeros_like(self._var_sum)))
def summary(self):
"""Summary string of mean and standard deviation.
Returns:
Summary tensor.
"""
with tf.name_scope(self._name + '/summary'):
mean_summary = tf.cond(
self._count > 0, lambda: self._summary('mean', self._mean), str)
std_summary = tf.cond(
self._count > 1, lambda: self._summary('stddev', self._std()), str)
return tf.summary.merge([mean_summary, std_summary])
def _std(self):
"""Computes the current estimate of the standard deviation.
Note that the standard deviation is not defined until at least two samples
have been seen.
Returns:
Tensor of the current standard deviation.
"""
variance = tf.cond(
self._count > 1,
lambda: self._var_sum / tf.cast(self._count - 1, tf.float32),
lambda: tf.ones_like(self._var_sum) * float('nan'))
# The epsilon corrects for small negative variance values caused by
# the algorithm. It was empirically chosen to work with all environments
# tested.
return tf.sqrt(variance + 1e-4)
def _summary(self, name, tensor):
"""Create a scalar or histogram summary matching the rank of the tensor.
Args:
name: Name for the summary.
tensor: Tensor to summarize.
Returns:
Summary tensor.
"""
if tensor.shape.ndims == 0:
return tf.summary.scalar(name, tensor)
else:
return tf.summary.histogram(name, tensor)
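The streaming estimate in update() above is a batched Welford-style recurrence. A small NumPy sketch of the same arithmetic for scalar values (illustrative only):

import numpy as np

count, mean, var_sum = 0, 0.0, 0.0
for batch in (np.array([1.0, 2.0]), np.array([3.0])):
  count += batch.size
  new_mean = mean + np.sum(batch - mean) / count
  var_sum += np.sum((batch - mean) * (batch - new_mean))
  mean = new_mean
std = np.sqrt(var_sum / (count - 1))  # Only defined once count > 1.
print(mean, std)  # 2.0 1.0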

View File

@ -0,0 +1,222 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for the PPO algorithm."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import re
import tensorflow as tf
from tensorflow.python.client import device_lib
def create_nested_vars(tensors):
"""Create variables matching a nested tuple of tensors.
Args:
tensors: Nested tuple of list of tensors.
Returns:
Nested tuple or list of variables.
"""
if isinstance(tensors, (tuple, list)):
return type(tensors)(create_nested_vars(tensor) for tensor in tensors)
return tf.Variable(tensors, False)
def reinit_nested_vars(variables, indices=None):
"""Reset all variables in a nested tuple to zeros.
Args:
variables: Nested tuple or list of variables.
indices: Indices along the first dimension to reset, defaults to all.
Returns:
Operation.
"""
if isinstance(variables, (tuple, list)):
return tf.group(*[
reinit_nested_vars(variable, indices) for variable in variables])
if indices is None:
return variables.assign(tf.zeros_like(variables))
else:
zeros = tf.zeros([tf.shape(indices)[0]] + variables.shape[1:].as_list())
return tf.scatter_update(variables, indices, zeros)
def assign_nested_vars(variables, tensors):
"""Assign tensors to matching nested tuple of variables.
Args:
variables: Nested tuple or list of variables to update.
tensors: Nested tuple or list of tensors to assign.
Returns:
Operation.
"""
if isinstance(variables, (tuple, list)):
return tf.group(*[
assign_nested_vars(variable, tensor)
for variable, tensor in zip(variables, tensors)])
return variables.assign(tensors)
def discounted_return(reward, length, discount):
"""Discounted Monte-Carlo returns."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
return_ = tf.reverse(tf.transpose(tf.scan(
lambda agg, cur: cur + discount * agg,
tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
tf.zeros_like(reward[:, -1]), 1, False), [1, 0]), [1])
return tf.check_numerics(tf.stop_gradient(return_), 'return')
def fixed_step_return(reward, value, length, discount, window):
"""N-step discounted return."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
return_ = tf.zeros_like(reward)
for _ in range(window):
return_ += reward
reward = discount * tf.concat(
[reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
return_ += discount ** window * tf.concat(
[value[:, window:], tf.zeros_like(value[:, -window:])], 1)
return tf.check_numerics(tf.stop_gradient(mask * return_), 'return')
def lambda_return(reward, value, length, discount, lambda_):
"""TD-lambda returns."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
sequence = mask * reward + discount * value * (1 - lambda_)
discount = mask * discount * lambda_
sequence = tf.stack([sequence, discount], 2)
return_ = tf.reverse(tf.transpose(tf.scan(
lambda agg, cur: cur[0] + cur[1] * agg,
tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]),
tf.zeros_like(value[:, -1]), 1, False), [1, 0]), [1])
return tf.check_numerics(tf.stop_gradient(return_), 'return')
def lambda_advantage(reward, value, length, discount):
"""Generalized Advantage Estimation."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
delta = reward + discount * next_value - value
advantage = tf.reverse(tf.transpose(tf.scan(
lambda agg, cur: cur + discount * agg,
tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]),
tf.zeros_like(delta[:, -1]), 1, False), [1, 0]), [1])
return tf.check_numerics(tf.stop_gradient(advantage), 'advantage')
def diag_normal_kl(mean0, logstd0, mean1, logstd1):
"""Epirical KL divergence of two normals with diagonal covariance."""
logstd0_2, logstd1_2 = 2 * logstd0, 2 * logstd1
return 0.5 * (
tf.reduce_sum(tf.exp(logstd0_2 - logstd1_2), -1) +
tf.reduce_sum((mean1 - mean0) ** 2 / tf.exp(logstd1_2), -1) +
tf.reduce_sum(logstd1_2, -1) - tf.reduce_sum(logstd0_2, -1) -
mean0.shape[-1].value)
def diag_normal_logpdf(mean, logstd, loc):
"""Log density of a normal with diagonal covariance."""
constant = -0.5 * math.log(2 * math.pi) - logstd
value = -0.5 * ((loc - mean) / tf.exp(logstd)) ** 2
return tf.reduce_sum(constant + value, -1)
def diag_normal_entropy(mean, logstd):
"""Empirical entropy of a normal with diagonal covariance."""
constant = mean.shape[-1].value * math.log(2 * math.pi * math.e)
return (constant + tf.reduce_sum(2 * logstd, 1)) / 2
def available_gpus():
"""List of GPU device names detected by TensorFlow."""
local_device_protos = device_lib.list_local_devices()
return [x.name for x in local_device_protos if x.device_type == 'GPU']
def gradient_summaries(grad_vars, groups=None, scope='gradients'):
"""Create histogram summaries of the gradient.
Summaries can be grouped via regexes matching variable names.
Args:
grad_vars: List of (gradient, variable) tuples as returned by optimizers.
groups: Mapping of name to regex for grouping summaries.
scope: Name scope for this operation.
Returns:
Summary tensor.
"""
groups = groups or {r'all': r'.*'}
grouped = collections.defaultdict(list)
for grad, var in grad_vars:
if grad is None:
continue
for name, pattern in groups.items():
if re.match(pattern, var.name):
name = re.sub(pattern, name, var.name)
grouped[name].append(grad)
for name in groups:
if name not in grouped:
tf.logging.warn("No variables matching '{}' group.".format(name))
summaries = []
for name, grads in grouped.items():
grads = [tf.reshape(grad, [-1]) for grad in grads]
grads = tf.concat(grads, 0)
summaries.append(tf.summary.histogram(scope + '/' + name, grads))
return tf.summary.merge(summaries)
def variable_summaries(vars_, groups=None, scope='weights'):
"""Create histogram summaries for the provided variables.
Summaries can be grouped via regexes matching variable names.
Args:
vars_: List of variables to summarize.
groups: Mapping of name to regex for grouping summaries.
scope: Name scope for this operation.
Returns:
Summary tensor.
"""
groups = groups or {r'all': r'.*'}
grouped = collections.defaultdict(list)
for var in vars_:
for name, pattern in groups.items():
if re.match(pattern, var.name):
name = re.sub(pattern, name, var.name)
grouped[name].append(var)
for name in groups:
if name not in grouped:
tf.logging.warn("No variables matching '{}' group.".format(name))
summaries = []
for name, vars_ in grouped.items():
vars_ = [tf.reshape(var, [-1]) for var in vars_]
vars_ = tf.concat(vars_, 0)
summaries.append(tf.summary.histogram(scope + '/' + name, vars_))
return tf.summary.merge(summaries)
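discounted_return() above is a reversed scan of reward[t] + discount * accumulator over the time dimension. For a single episode, the same computation in NumPy looks like this (illustrative only):

import numpy as np

def discounted_return(reward, discount):
  return_ = np.zeros_like(reward)
  accumulator = 0.0
  for t in reversed(range(len(reward))):
    accumulator = reward[t] + discount * accumulator
    return_[t] = accumulator
  return return_

print(discounted_return(np.array([1.0, 1.0, 1.0]), discount=0.9))  # [2.71 1.9 1.]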

View File

@ -0,0 +1,23 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Executable scripts for reinforcement learning."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from . import train
from . import utility
from . import visualize

View File

@ -0,0 +1,128 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Example configurations using the PPO algorithm."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# pylint: disable=unused-variable
from pybullet_envs.minitaur.agents import ppo
from pybullet_envs.minitaur.agents.scripts import networks
def default():
"""Default configuration for PPO."""
# General
algorithm = ppo.PPOAlgorithm
num_agents = 10
eval_episodes = 25
use_gpu = False
# Network
network = networks.ForwardGaussianPolicy
weight_summaries = dict(
all=r'.*',
policy=r'.*/policy/.*',
value=r'.*/value/.*')
policy_layers = 200, 100
value_layers = 200, 100
init_mean_factor = 0.05
init_logstd = -1
# Optimization
update_every = 25
policy_optimizer = 'AdamOptimizer'
value_optimizer = 'AdamOptimizer'
update_epochs_policy = 50
update_epochs_value = 50
policy_lr = 1e-4
value_lr = 3e-4
# Losses
discount = 0.985
kl_target = 1e-2
kl_cutoff_factor = 2
kl_cutoff_coef = 1000
kl_init_penalty = 1
return locals()
def pendulum():
"""Configuration for the pendulum classic control task."""
locals().update(default())
# Environment
env = 'Pendulum-v0'
max_length = 200
steps = 1e6 # 1M
return locals()
def cheetah():
"""Configuration for MuJoCo's half cheetah task."""
locals().update(default())
# Environment
env = 'HalfCheetah-v1'
max_length = 1000
steps = 1e7 # 10M
return locals()
def walker():
"""Configuration for MuJoCo's walker task."""
locals().update(default())
# Environment
env = 'Walker2d-v1'
max_length = 1000
steps = 1e7 # 10M
return locals()
def reacher():
"""Configuration for MuJoCo's reacher task."""
locals().update(default())
# Environment
env = 'Reacher-v1'
max_length = 1000
steps = 1e7 # 10M
return locals()
def hopper():
"""Configuration for MuJoCo's hopper task."""
locals().update(default())
# Environment
env = 'Hopper-v1'
max_length = 1000
steps = 2e7 # 20M
return locals()
def ant():
"""Configuration for MuJoCo's ant task."""
locals().update(default())
# Environment
env = 'Ant-v1'
max_length = 1000
steps = 5e7 # 50M
return locals()
def humanoid():
"""Configuration for MuJoCo's humanoid task."""
locals().update(default())
# Environment
env = 'Humanoid-v1'
max_length = 1000
steps = 5e7 # 50M
return locals()
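Each configuration function returns its locals(), which train.py wraps in an AttrDict. A hypothetical configuration for the minitaur reactive environment could follow the same pattern; the name minitaur_reactive and its settings are illustrative, loosely mirroring the checked-in config.yaml:

import functools
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts.configs import default
from pybullet_envs.minitaur.envs import minitaur_reactive_env

def minitaur_reactive():
  """Hypothetical configuration for the minitaur reactive environment."""
  locals().update(default())
  # Environment
  env = functools.partial(
      minitaur_reactive_env.MinitaurReactiveEnv, render=False)
  max_length = 1000
  steps = 7e6  # 7M
  return locals()

config = tools.AttrDict(minitaur_reactive())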

View File

@ -0,0 +1,167 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Networks for the PPO algorithm defined as recurrent cells."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
_MEAN_WEIGHTS_INITIALIZER = tf.contrib.layers.variance_scaling_initializer(
factor=0.1)
_LOGSTD_INITIALIZER = tf.random_normal_initializer(-1, 1e-10)
class LinearGaussianPolicy(tf.contrib.rnn.RNNCell):
"""Indepent linear network with a tanh at the end for policy and feedforward network for the value.
The policy network outputs the mean action and the log standard deviation
is learned as indepent parameter vector.
"""
def __init__(self,
policy_layers,
value_layers,
action_size,
mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
logstd_initializer=_LOGSTD_INITIALIZER):
self._policy_layers = policy_layers
self._value_layers = value_layers
self._action_size = action_size
self._mean_weights_initializer = mean_weights_initializer
self._logstd_initializer = logstd_initializer
@property
def state_size(self):
unused_state_size = 1
return unused_state_size
@property
def output_size(self):
return (self._action_size, self._action_size, tf.TensorShape([]))
def __call__(self, observation, state):
with tf.variable_scope('policy'):
x = tf.contrib.layers.flatten(observation)
mean = tf.contrib.layers.fully_connected(
x,
self._action_size,
tf.tanh,
weights_initializer=self._mean_weights_initializer)
logstd = tf.get_variable('logstd', mean.shape[1:], tf.float32,
self._logstd_initializer)
logstd = tf.tile(logstd[None, ...],
[tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
with tf.variable_scope('value'):
x = tf.contrib.layers.flatten(observation)
for size in self._value_layers:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
return (mean, logstd, value), state
class ForwardGaussianPolicy(tf.contrib.rnn.RNNCell):
"""Independent feed forward networks for policy and value.
The policy network outputs the mean action and the log standard deviation
is learned as independent parameter vector.
"""
def __init__(
self, policy_layers, value_layers, action_size,
mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
logstd_initializer=_LOGSTD_INITIALIZER):
self._policy_layers = policy_layers
self._value_layers = value_layers
self._action_size = action_size
self._mean_weights_initializer = mean_weights_initializer
self._logstd_initializer = logstd_initializer
@property
def state_size(self):
unused_state_size = 1
return unused_state_size
@property
def output_size(self):
return (self._action_size, self._action_size, tf.TensorShape([]))
def __call__(self, observation, state):
with tf.variable_scope('policy'):
x = tf.contrib.layers.flatten(observation)
for size in self._policy_layers:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
mean = tf.contrib.layers.fully_connected(
x, self._action_size, tf.tanh,
weights_initializer=self._mean_weights_initializer)
logstd = tf.get_variable(
'logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
logstd = tf.tile(
logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
with tf.variable_scope('value'):
x = tf.contrib.layers.flatten(observation)
for size in self._value_layers:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
return (mean, logstd, value), state
class RecurrentGaussianPolicy(tf.contrib.rnn.RNNCell):
"""Independent recurrent policy and feed forward value networks.
The policy network outputs the mean action and the log standard deviation
is learned as independent parameter vector. The last policy layer is recurrent
and uses a GRU cell.
"""
def __init__(
self, policy_layers, value_layers, action_size,
mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
logstd_initializer=_LOGSTD_INITIALIZER):
self._policy_layers = policy_layers
self._value_layers = value_layers
self._action_size = action_size
self._mean_weights_initializer = mean_weights_initializer
self._logstd_initializer = logstd_initializer
self._cell = tf.contrib.rnn.GRUBlockCell(100)
@property
def state_size(self):
return self._cell.state_size
@property
def output_size(self):
return (self._action_size, self._action_size, tf.TensorShape([]))
def __call__(self, observation, state):
with tf.variable_scope('policy'):
x = tf.contrib.layers.flatten(observation)
for size in self._policy_layers[:-1]:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
x, state = self._cell(x, state)
mean = tf.contrib.layers.fully_connected(
x, self._action_size, tf.tanh,
weights_initializer=self._mean_weights_initializer)
logstd = tf.get_variable(
'logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
logstd = tf.tile(
logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
with tf.variable_scope('value'):
x = tf.contrib.layers.flatten(observation)
for size in self._value_layers:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
return (mean, logstd, value), state
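These cells are consumed by PPOAlgorithm._network() through tf.nn.dynamic_rnn. A minimal wiring sketch (TF1/contrib era, matching the code above; it assumes ForwardGaussianPolicy from this module is in scope, and the action and observation sizes are illustrative):

import tensorflow as tf

cell = ForwardGaussianPolicy(
    policy_layers=(200, 100), value_layers=(200, 100), action_size=8)
observ = tf.zeros([2, 5, 31])  # 2 sequences of length 5 with 31-dim observations.
(mean, logstd, value), state = tf.nn.dynamic_rnn(
    cell, observ, dtype=tf.float32)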

View File

@ -0,0 +1,165 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Script to train a batch reinforcement learning algorithm.
Command line:
python3 -m agents.scripts.train --logdir=/path/to/logdir --config=pendulum
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import datetime
import functools
import os
import gym
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs
from pybullet_envs.minitaur.agents.scripts import utility
def _create_environment(config):
"""Constructor for an instance of the environment.
Args:
config: Object providing configurations via attributes.
Returns:
Wrapped OpenAI Gym environment.
"""
if isinstance(config.env, str):
env = gym.make(config.env)
else:
env = config.env()
if config.max_length:
env = tools.wrappers.LimitDuration(env, config.max_length)
env = tools.wrappers.RangeNormalize(env)
env = tools.wrappers.ClipAction(env)
env = tools.wrappers.ConvertTo32Bit(env)
return env
def _define_loop(graph, logdir, train_steps, eval_steps):
"""Create and configure a training loop with training and evaluation phases.
Args:
graph: Object providing graph elements via attributes.
logdir: Log directory for storing checkpoints and summaries.
train_steps: Number of training steps per epoch.
eval_steps: Number of evaluation steps per epoch.
Returns:
Loop object.
"""
loop = tools.Loop(
logdir, graph.step, graph.should_log, graph.do_report,
graph.force_reset)
loop.add_phase(
'train', graph.done, graph.score, graph.summary, train_steps,
report_every=None,
log_every=train_steps // 2,
checkpoint_every=None,
feed={graph.is_training: True})
loop.add_phase(
'eval', graph.done, graph.score, graph.summary, eval_steps,
report_every=eval_steps,
log_every=eval_steps // 2,
checkpoint_every=10 * eval_steps,
feed={graph.is_training: False})
return loop
def train(config, env_processes):
"""Training and evaluation entry point yielding scores.
Resolves some configuration attributes, creates environments, graph, and
training loop. By default, assigns all operations to the CPU.
Args:
config: Object providing configurations via attributes.
env_processes: Whether to step environments in separate processes.
Yields:
Evaluation scores.
"""
tf.reset_default_graph()
with config.unlocked:
config.network = functools.partial(
utility.define_network, config.network, config)
config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
config.value_optimizer = getattr(tf.train, config.value_optimizer)
if config.update_every % config.num_agents:
tf.logging.warn('Number of agents should divide episodes per update.')
with tf.device('/cpu:0'):
batch_env = utility.define_batch_env(
lambda: _create_environment(config),
config.num_agents, env_processes)
graph = utility.define_simulation_graph(
batch_env, config.algorithm, config)
loop = _define_loop(
graph, config.logdir,
config.update_every * config.max_length,
config.eval_episodes * config.max_length)
total_steps = int(
config.steps / config.update_every *
(config.update_every + config.eval_episodes))
# Exclude episode related variables since the Python state of environments is
# not checkpointed and thus new episodes start after resuming.
saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
sess_config = tf.ConfigProto(allow_soft_placement=True)
sess_config.gpu_options.allow_growth = True
with tf.Session(config=sess_config) as sess:
utility.initialize_variables(sess, saver, config.logdir)
for score in loop.run(sess, saver, total_steps):
yield score
batch_env.close()
def main(_):
"""Create or load configuration and launch the trainer."""
utility.set_up_logging()
if not FLAGS.config:
raise KeyError('You must specify a configuration.')
logdir = FLAGS.logdir and os.path.expanduser(os.path.join(
FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config)))
try:
config = utility.load_config(logdir)
except IOError:
config = tools.AttrDict(getattr(configs, FLAGS.config)())
config = utility.save_config(config, logdir)
for score in train(config, FLAGS.env_processes):
tf.logging.info('Score {}.'.format(score))
if __name__ == '__main__':
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string(
'logdir', None,
'Base directory to store logs.')
tf.app.flags.DEFINE_string(
'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
'Sub directory to store logs.')
tf.app.flags.DEFINE_string(
'config', None,
'Configuration to execute.')
tf.app.flags.DEFINE_boolean(
'env_processes', True,
'Step environments in separate processes to circumvent the GIL.')
tf.app.run()
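Assuming the flag set defined above, training could be launched along the lines of the following illustrative invocation, which mirrors the docstring at the top of the file with the full pybullet_envs module path:

python3 -m pybullet_envs.minitaur.agents.scripts.train --config=pendulum --logdir=/tmp/minitaur_ppo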

View File

@ -0,0 +1,110 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the PPO algorithm usage example."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import itertools
import tensorflow as tf
from pybullet_envs.minitaur.agents import ppo
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs
from pybullet_envs.minitaur.agents.scripts import networks
from pybullet_envs.minitaur.agents.scripts import train
FLAGS = tf.app.flags.FLAGS
class PPOTest(tf.test.TestCase):
def test_no_crash_cheetah(self):
nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
for network in nets:
config = self._define_config()
with config.unlocked:
config.env = 'HalfCheetah-v1'
config.max_length = 200
config.steps = 1000
config.network = network
for score in train.train(config, env_processes=True):
float(score)
def test_no_crash_ant(self):
nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
for network in nets:
config = self._define_config()
with config.unlocked:
config.env = 'Ant-v1'
config.max_length = 200
config.steps = 1000
config.network = network
for score in train.train(config, env_processes=True):
float(score)
def test_no_crash_observation_shape(self):
nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
observ_shapes = (1,), (2, 3), (2, 3, 4)
for network, observ_shape in itertools.product(nets, observ_shapes):
config = self._define_config()
with config.unlocked:
config.env = functools.partial(
tools.MockEnvironment, observ_shape, action_shape=(3,),
min_duration=15, max_duration=15)
config.max_length = 20
config.steps = 100
config.network = network
for score in train.train(config, env_processes=False):
float(score)
def test_no_crash_variable_duration(self):
config = self._define_config()
with config.unlocked:
config.env = functools.partial(
tools.MockEnvironment, observ_shape=(2, 3), action_shape=(3,),
min_duration=5, max_duration=25)
config.max_length = 25
config.steps = 200
config.network = networks.RecurrentGaussianPolicy
for score in train.train(config, env_processes=False):
float(score)
def _define_config(self):
# Start from the example configuration.
locals().update(configs.default())
# pylint: disable=unused-variable
# General
algorithm = ppo.PPOAlgorithm
num_agents = 2
update_every = 4
use_gpu = False
# Network
policy_layers = 20, 10
value_layers = 20, 10
# Optimization
update_epochs_policy = 2
update_epochs_value = 2
# pylint: enable=unused-variable
return tools.AttrDict(locals())
if __name__ == '__main__':
FLAGS.config = 'unused'
tf.test.main()

View File

@ -0,0 +1,213 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for using reinforcement learning algorithms."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import os
import re
import ruamel.yaml as yaml
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
def define_simulation_graph(batch_env, algo_cls, config):
"""Define the algortihm and environment interaction.
Args:
batch_env: In-graph environments object.
algo_cls: Constructor of a batch algorithm.
config: Configuration object for the algorithm.
Returns:
Object providing graph elements via attributes.
"""
# pylint: disable=unused-variable
step = tf.Variable(0, False, dtype=tf.int32, name='global_step')
is_training = tf.placeholder(tf.bool, name='is_training')
should_log = tf.placeholder(tf.bool, name='should_log')
do_report = tf.placeholder(tf.bool, name='do_report')
force_reset = tf.placeholder(tf.bool, name='force_reset')
algo = algo_cls(batch_env, step, is_training, should_log, config)
done, score, summary = tools.simulate(
batch_env, algo, should_log, force_reset)
message = 'Graph contains {} trainable variables.'
tf.logging.info(message.format(tools.count_weights()))
# pylint: enable=unused-variable
return tools.AttrDict(locals())
def define_batch_env(constructor, num_agents, env_processes):
"""Create environments and apply all desired wrappers.
Args:
constructor: Constructor of an OpenAI gym environment.
num_agents: Number of environments to combine in the batch.
env_processes: Whether to step environments in external processes.
Returns:
In-graph environments object.
"""
with tf.variable_scope('environments'):
if env_processes:
envs = [
tools.wrappers.ExternalProcess(constructor)
for _ in range(num_agents)]
else:
envs = [constructor() for _ in range(num_agents)]
batch_env = tools.BatchEnv(envs, blocking=not env_processes)
batch_env = tools.InGraphBatchEnv(batch_env)
return batch_env
def define_saver(exclude=None):
"""Create a saver for the variables we want to checkpoint.
Args:
exclude: List of regexes to match variable names to exclude.
Returns:
Saver object.
"""
variables = []
exclude = exclude or []
exclude = [re.compile(regex) for regex in exclude]
for variable in tf.global_variables():
if any(regex.match(variable.name) for regex in exclude):
continue
variables.append(variable)
saver = tf.train.Saver(variables, keep_checkpoint_every_n_hours=5)
return saver
def define_network(constructor, config, action_size):
"""Constructor for the recurrent cell for the algorithm.
Args:
constructor: Callable returning the network as RNNCell.
config: Object providing configurations via attributes.
action_size: Integer indicating the amount of action dimensions.
Returns:
Created recurrent cell object.
"""
mean_weights_initializer = (
tf.contrib.layers.variance_scaling_initializer(
factor=config.init_mean_factor))
logstd_initializer = tf.random_normal_initializer(
config.init_logstd, 1e-10)
network = constructor(
config.policy_layers, config.value_layers, action_size,
mean_weights_initializer=mean_weights_initializer,
logstd_initializer=logstd_initializer)
return network
def initialize_variables(sess, saver, logdir, checkpoint=None, resume=None):
"""Initialize or restore variables from a checkpoint if available.
Args:
sess: Session to initialize variables in.
saver: Saver to restore variables.
logdir: Directory to search for checkpoints.
checkpoint: Specify what checkpoint name to use; defaults to most recent.
resume: Whether to expect recovering a checkpoint or starting a new run.
Raises:
ValueError: If resume expected but no log directory specified.
RuntimeError: If no resume expected but a checkpoint was found.
"""
sess.run(tf.group(
tf.local_variables_initializer(),
tf.global_variables_initializer()))
if resume and not (logdir or checkpoint):
raise ValueError('Need to specify logdir to resume a checkpoint.')
if logdir:
state = tf.train.get_checkpoint_state(logdir)
if checkpoint:
checkpoint = os.path.join(logdir, checkpoint)
if not checkpoint and state and state.model_checkpoint_path:
checkpoint = state.model_checkpoint_path
if checkpoint and resume is False:
message = 'Found unexpected checkpoint when starting a new run.'
raise RuntimeError(message)
if checkpoint:
saver.restore(sess, checkpoint)
def save_config(config, logdir=None):
"""Save a new configuration by name.
If a logging directory is specified, it will be created and the configuration
will be stored there. Otherwise, a log message will be printed.
Args:
config: Configuration object.
logdir: Location for writing summaries and checkpoints if specified.
Returns:
Configuration object.
"""
if logdir:
with config.unlocked:
config.logdir = logdir
message = 'Start a new run and write summaries and checkpoints to {}.'
tf.logging.info(message.format(config.logdir))
tf.gfile.MakeDirs(config.logdir)
config_path = os.path.join(config.logdir, 'config.yaml')
with tf.gfile.FastGFile(config_path, 'w') as file_:
yaml.dump(config, file_, default_flow_style=False)
else:
message = (
'Start a new run without storing summaries and checkpoints since no '
'logging directory was specified.')
tf.logging.info(message)
return config
def load_config(logdir):
"""Load a configuration from the log directory.
Args:
logdir: The logging directory containing the configuration file.
Raises:
IOError: The logging directory does not contain a configuration file.
Returns:
Configuration object.
"""
config_path = logdir and os.path.join(logdir, 'config.yaml')
if not config_path or not tf.gfile.Exists(config_path):
message = (
'Cannot resume an existing run since the logging directory does not '
'contain a configuration file.')
raise IOError(message)
with tf.gfile.FastGFile(config_path, 'r') as file_:
config = yaml.load(file_)
message = 'Resume run and write summaries and checkpoints to {}.'
tf.logging.info(message.format(config.logdir))
return config
def set_up_logging():
"""Configure the TensorFlow logger."""
tf.logging.set_verbosity(tf.logging.INFO)
logging.getLogger('tensorflow').propagate = False
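
A short usage sketch of the checkpoint helpers above; the log directory is a placeholder and the single variable stands in for a real model graph.

import tensorflow as tf

global_step = tf.Variable(0, False, dtype=tf.int32, name='global_step')
saver = define_saver(exclude=(r'.*_temporary/.*',))
with tf.Session() as sess:
  # Restores the most recent checkpoint under logdir if one exists; otherwise
  # all variables are freshly initialized.
  initialize_variables(sess, saver, logdir='/tmp/agents_run', resume=None)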

View File

@ -0,0 +1,157 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Script to render videos of the Proximal Policy Gradient algorithm.
Command line:
python3 -m agents.scripts.visualize \
--logdir=/path/to/logdir/<time>-<config> --outdir=/path/to/outdir/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import os
import gym
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import utility
def _create_environment(config, outdir):
"""Constructor for an instance of the environment.
Args:
config: Object providing configurations via attributes.
outdir: Directory to store videos in.
Returns:
Wrapped OpenAI Gym environment.
"""
if isinstance(config.env, str):
env = gym.make(config.env)
else:
env = config.env()
# Ensure that the environment has the specification attribute set as expected
# by the monitor wrapper.
if not hasattr(env, 'spec'):
setattr(env, 'spec', getattr(env, 'spec', None))
if config.max_length:
env = tools.wrappers.LimitDuration(env, config.max_length)
# env = gym.wrappers.Monitor(
# env, outdir, lambda unused_episode_number: True)
env = tools.wrappers.RangeNormalize(env)
env = tools.wrappers.ClipAction(env)
env = tools.wrappers.ConvertTo32Bit(env)
return env
def _define_loop(graph, eval_steps):
"""Create and configure an evaluation loop.
Args:
graph: Object providing graph elements via attributes.
eval_steps: Number of evaluation steps per epoch.
Returns:
Loop object.
"""
loop = tools.Loop(
None, graph.step, graph.should_log, graph.do_report, graph.force_reset)
loop.add_phase(
'eval', graph.done, graph.score, graph.summary, eval_steps,
report_every=eval_steps,
log_every=None,
checkpoint_every=None,
feed={graph.is_training: False})
return loop
def visualize(
logdir, outdir, num_agents, num_episodes, checkpoint=None,
env_processes=True):
"""Recover checkpoint and render videos from it.
Args:
logdir: Logging directory of the trained algorithm.
outdir: Directory to store rendered videos in.
num_agents: Number of environments to simulate in parallel.
num_episodes: Total number of episodes to simulate.
checkpoint: Checkpoint name to load; defaults to most recent.
env_processes: Whether to step environments in separate processes.
"""
config = utility.load_config(logdir)
with config.unlocked:
config.network = functools.partial(
utility.define_network, config.network, config)
config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
config.value_optimizer = getattr(tf.train, config.value_optimizer)
with tf.device('/cpu:0'):
batch_env = utility.define_batch_env(
lambda: _create_environment(config, outdir),
num_agents, env_processes)
graph = utility.define_simulation_graph(
batch_env, config.algorithm, config)
total_steps = num_episodes * config.max_length
loop = _define_loop(graph, total_steps)
saver = utility.define_saver(
exclude=(r'.*_temporary/.*', r'global_step'))
sess_config = tf.ConfigProto(allow_soft_placement=True)
sess_config.gpu_options.allow_growth = True
with tf.Session(config=sess_config) as sess:
utility.initialize_variables(
sess, saver, config.logdir, checkpoint, resume=True)
for unused_score in loop.run(sess, saver, total_steps):
pass
batch_env.close()
def main(_):
"""Load a trained algorithm and render videos."""
utility.set_up_logging()
if not FLAGS.logdir or not FLAGS.outdir:
raise KeyError('You must specify logging and output directories.')
FLAGS.logdir = os.path.expanduser(FLAGS.logdir)
FLAGS.outdir = os.path.expanduser(FLAGS.outdir)
visualize(
FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents, FLAGS.num_episodes,
FLAGS.checkpoint, FLAGS.env_processes)
if __name__ == '__main__':
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string(
'logdir', None,
'Directory to the checkpoint of a training run.')
tf.app.flags.DEFINE_string(
'outdir', None,
'Local directory for storing the monitoring outdir.')
tf.app.flags.DEFINE_string(
'checkpoint', None,
'Checkpoint name to load; defaults to most recent.')
tf.app.flags.DEFINE_integer(
'num_agents', 1,
'How many environments to step in parallel.')
tf.app.flags.DEFINE_integer(
'num_episodes', 5,
'Minimum number of episodes to render.')
tf.app.flags.DEFINE_boolean(
'env_processes', True,
'Step environments in separate processes to circumvent the GIL.')
tf.app.run()
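
Equivalently, visualize() can be called directly from Python, which mirrors the command line shown in the module docstring; both paths below are placeholders for an existing training run and an output directory.

visualize(
    logdir='/tmp/agents_run/20180424T000000-pendulum',
    outdir='/tmp/agents_run/videos',
    num_agents=1,
    num_episodes=5,
    env_processes=False)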

View File

@ -0,0 +1,30 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tools for reinforcement learning."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from .attr_dict import AttrDict
from .batch_env import BatchEnv
from .count_weights import count_weights
from .in_graph_batch_env import InGraphBatchEnv
from .in_graph_env import InGraphEnv
from .loop import Loop
from .mock_algorithm import MockAlgorithm
from .mock_environment import MockEnvironment
from .simulate import simulate
from .streaming_mean import StreamingMean

View File

@ -0,0 +1,54 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wrap a dictionary to access keys as attributes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import contextlib
class AttrDict(dict):
"""Wrap a dictionary to access keys as attributes."""
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
super(AttrDict, self).__setattr__('_mutable', False)
def __getattr__(self, key):
# Do not provide None for unimplemented magic attributes.
if key.startswith('__'):
raise AttributeError
return self.get(key, None)
def __setattr__(self, key, value):
if not self._mutable:
message = "Cannot set attribute '{}'.".format(key)
message += " Use 'with obj.unlocked:' scope to set attributes."
raise RuntimeError(message)
if key.startswith('__'):
raise AttributeError("Cannot set magic attribute '{}'".format(key))
self[key] = value
@property
@contextlib.contextmanager
def unlocked(self):
super(AttrDict, self).__setattr__('_mutable', True)
yield
super(AttrDict, self).__setattr__('_mutable', False)
def copy(self):
return type(self)(super(AttrDict, self).copy())

View File

@ -0,0 +1,71 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the attribute dictionary."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from pybullet_envs.minitaur.agents.tools import attr_dict
class AttrDictTest(tf.test.TestCase):
def test_construct_from_dict(self):
initial = dict(foo=13, bar=42)
obj = attr_dict.AttrDict(initial)
self.assertEqual(13, obj.foo)
self.assertEqual(42, obj.bar)
def test_construct_from_kwargs(self):
obj = attr_dict.AttrDict(foo=13, bar=42)
self.assertEqual(13, obj.foo)
self.assertEqual(42, obj.bar)
def test_has_attribute(self):
obj = attr_dict.AttrDict(foo=13)
self.assertTrue('foo' in obj)
self.assertFalse('bar' in obj)
def test_access_default(self):
obj = attr_dict.AttrDict()
self.assertEqual(None, obj.foo)
def test_access_magic(self):
obj = attr_dict.AttrDict()
with self.assertRaises(AttributeError):
obj.__getstate__ # pylint: disable=pointless-statement
def test_immutable_create(self):
obj = attr_dict.AttrDict()
with self.assertRaises(RuntimeError):
obj.foo = 42
def test_immutable_modify(self):
obj = attr_dict.AttrDict(foo=13)
with self.assertRaises(RuntimeError):
obj.foo = 42
def test_immutable_unlocked(self):
obj = attr_dict.AttrDict()
with obj.unlocked:
obj.foo = 42
self.assertEqual(42, obj.foo)
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,125 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Combine multiple environments to step them in batch."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
class BatchEnv(object):
"""Combine multiple environments to step them in batch."""
def __init__(self, envs, blocking):
"""Combine multiple environments to step them in batch.
To step environments in parallel, environments must support a
`blocking=False` argument to their step and reset functions that makes them
return callables instead to receive the result at a later time.
Args:
envs: List of environments.
blocking: Step environments after another rather than in parallel.
Raises:
ValueError: Environments have different observation or action spaces.
"""
self._envs = envs
self._blocking = blocking
observ_space = self._envs[0].observation_space
if not all(env.observation_space == observ_space for env in self._envs):
raise ValueError('All environments must use the same observation space.')
action_space = self._envs[0].action_space
if not all(env.action_space == action_space for env in self._envs):
raise ValueError('All environments must use the same action space.')
def __len__(self):
"""Number of combined environments."""
return len(self._envs)
def __getitem__(self, index):
"""Access an underlying environment by index."""
return self._envs[index]
def __getattr__(self, name):
"""Forward unimplemented attributes to one of the original environments.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in one of the wrapped environments.
"""
return getattr(self._envs[0], name)
def step(self, action):
"""Forward a batch of actions to the wrapped environments.
Args:
action: Batched action to apply to the environment.
Raises:
ValueError: Invalid actions.
Returns:
Batch of observations, rewards, and done flags.
"""
actions = action
for index, (env, action) in enumerate(zip(self._envs, actions)):
if not env.action_space.contains(action):
message = 'Invalid action at index {}: {}'
raise ValueError(message.format(index, action))
if self._blocking:
transitions = [
env.step(action)
for env, action in zip(self._envs, actions)]
else:
transitions = [
env.step(action, blocking=False)
for env, action in zip(self._envs, actions)]
transitions = [transition() for transition in transitions]
observs, rewards, dones, infos = zip(*transitions)
observ = np.stack(observs)
reward = np.stack(rewards)
done = np.stack(dones)
info = tuple(infos)
return observ, reward, done, info
def reset(self, indices=None):
"""Reset the environment and convert the resulting observation.
Args:
indices: The batch indices of environments to reset; defaults to all.
Returns:
Batch of observations.
"""
if indices is None:
indices = np.arange(len(self._envs))
if self._blocking:
observs = [self._envs[index].reset() for index in indices]
else:
observs = [self._envs[index].reset(blocking=False) for index in indices]
observs = [observ() for observ in observs]
observ = np.stack(observs)
return observ
def close(self):
"""Send close messages to the external process and join them."""
for env in self._envs:
if hasattr(env, 'close'):
env.close()
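
A quick usage sketch that pairs BatchEnv with the MockEnvironment defined later in this package; the observation and action shapes and the episode durations are arbitrary test values.

import numpy as np
from pybullet_envs.minitaur.agents.tools import batch_env
from pybullet_envs.minitaur.agents.tools import mock_environment

envs = [mock_environment.MockEnvironment(
    observ_shape=(2,), action_shape=(3,), min_duration=5, max_duration=5)
    for _ in range(2)]
batched = batch_env.BatchEnv(envs, blocking=True)
observ = batched.reset()
# Sample one valid action per environment and step them together.
actions = np.stack([env.action_space.sample() for env in batched])
observ, reward, done, info = batched.step(actions)
batched.close()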

View File

@ -0,0 +1,48 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Count learnable parameters."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import numpy as np
import tensorflow as tf
def count_weights(scope=None, exclude=None, graph=None):
"""Count learnable parameters.
Args:
scope: Restrict the count to a variable scope.
exclude: Regex to match variable names to exclude.
graph: Operate on a graph other than the current default graph.
Returns:
Number of learnable parameters as integer.
"""
if scope:
scope = scope if scope.endswith('/') else scope + '/'
graph = graph or tf.get_default_graph()
vars_ = graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
if scope:
vars_ = [var for var in vars_ if var.name.startswith(scope)]
if exclude:
exclude = re.compile(exclude)
vars_ = [var for var in vars_ if not exclude.match(var.name)]
shapes = [var.get_shape().as_list() for var in vars_]
return int(sum(np.prod(shape) for shape in shapes))

View File

@ -0,0 +1,98 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the weight counting utility."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from pybullet_envs.minitaur.agents.tools import count_weights
class CountWeightsTest(tf.test.TestCase):
def test_count_trainable(self):
tf.Variable(tf.zeros((5, 3)), trainable=True)
tf.Variable(tf.zeros((1, 1)), trainable=True)
tf.Variable(tf.zeros((5,)), trainable=True)
self.assertEqual(15 + 1 + 5, count_weights())
def test_ignore_non_trainable(self):
tf.Variable(tf.zeros((5, 3)), trainable=False)
tf.Variable(tf.zeros((1, 1)), trainable=False)
tf.Variable(tf.zeros((5,)), trainable=False)
self.assertEqual(0, count_weights())
def test_trainable_and_non_trainable(self):
tf.Variable(tf.zeros((5, 3)), trainable=True)
tf.Variable(tf.zeros((8, 2)), trainable=False)
tf.Variable(tf.zeros((1, 1)), trainable=True)
tf.Variable(tf.zeros((5,)), trainable=True)
tf.Variable(tf.zeros((3, 1)), trainable=False)
self.assertEqual(15 + 1 + 5, count_weights())
def test_include_scopes(self):
tf.Variable(tf.zeros((3, 2)), trainable=True)
with tf.variable_scope('foo'):
tf.Variable(tf.zeros((5, 2)), trainable=True)
self.assertEqual(6 + 10, count_weights())
def test_restrict_scope(self):
tf.Variable(tf.zeros((3, 2)), trainable=True)
with tf.variable_scope('foo'):
tf.Variable(tf.zeros((5, 2)), trainable=True)
with tf.variable_scope('bar'):
tf.Variable(tf.zeros((1, 2)), trainable=True)
self.assertEqual(10 + 2, count_weights('foo'))
def test_restrict_nested_scope(self):
tf.Variable(tf.zeros((3, 2)), trainable=True)
with tf.variable_scope('foo'):
tf.Variable(tf.zeros((5, 2)), trainable=True)
with tf.variable_scope('bar'):
tf.Variable(tf.zeros((1, 2)), trainable=True)
self.assertEqual(2, count_weights('foo/bar'))
def test_restrict_invalid_scope(self):
tf.Variable(tf.zeros((3, 2)), trainable=True)
with tf.variable_scope('foo'):
tf.Variable(tf.zeros((5, 2)), trainable=True)
with tf.variable_scope('bar'):
tf.Variable(tf.zeros((1, 2)), trainable=True)
self.assertEqual(0, count_weights('bar'))
def test_exclude_by_regex(self):
tf.Variable(tf.zeros((3, 2)), trainable=True)
with tf.variable_scope('foo'):
tf.Variable(tf.zeros((5, 2)), trainable=True)
with tf.variable_scope('bar'):
tf.Variable(tf.zeros((1, 2)), trainable=True)
self.assertEqual(0, count_weights(exclude=r'.*'))
self.assertEqual(6, count_weights(exclude=r'(^|/)foo/.*'))
self.assertEqual(16, count_weights(exclude=r'.*/bar/.*'))
def test_non_default_graph(self):
graph = tf.Graph()
with graph.as_default():
tf.Variable(tf.zeros((5, 3)), trainable=True)
tf.Variable(tf.zeros((8, 2)), trainable=False)
self.assertNotEqual(graph, tf.get_default_graph)
self.assertEqual(15, count_weights(graph=graph))
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,178 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Batch of environments inside the TensorFlow graph."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import tensorflow as tf
class InGraphBatchEnv(object):
"""Batch of environments inside the TensorFlow graph.
The batch of environments will be stepped and reset inside of the graph using
a tf.py_func(). The current batch of observations, actions, rewards, and done
flags are held in corresponding variables.
"""
def __init__(self, batch_env):
"""Batch of environments inside the TensorFlow graph.
Args:
batch_env: Batch environment.
"""
self._batch_env = batch_env
observ_shape = self._parse_shape(self._batch_env.observation_space)
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
action_shape = self._parse_shape(self._batch_env.action_space)
action_dtype = self._parse_dtype(self._batch_env.action_space)
with tf.variable_scope('env_temporary'):
self._observ = tf.Variable(
tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype),
name='observ', trainable=False)
self._action = tf.Variable(
tf.zeros((len(self._batch_env),) + action_shape, action_dtype),
name='action', trainable=False)
self._reward = tf.Variable(
tf.zeros((len(self._batch_env),), tf.float32),
name='reward', trainable=False)
self._done = tf.Variable(
tf.cast(tf.ones((len(self._batch_env),)), tf.bool),
name='done', trainable=False)
def __getattr__(self, name):
"""Forward unimplemented attributes to one of the original environments.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in one of the original environments.
"""
return getattr(self._batch_env, name)
def __len__(self):
"""Number of combined environments."""
return len(self._batch_env)
def __getitem__(self, index):
"""Access an underlying environment by index."""
return self._batch_env[index]
def simulate(self, action):
"""Step the batch of environments.
The results of the step can be accessed from the variables defined below.
Args:
action: Tensor holding the batch of actions to apply.
Returns:
Operation.
"""
with tf.name_scope('environment/simulate'):
if action.dtype in (tf.float16, tf.float32, tf.float64):
action = tf.check_numerics(action, 'action')
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
observ, reward, done = tf.py_func(
lambda a: self._batch_env.step(a)[:3], [action],
[observ_dtype, tf.float32, tf.bool], name='step')
observ = tf.check_numerics(observ, 'observ')
reward = tf.check_numerics(reward, 'reward')
return tf.group(
self._observ.assign(observ),
self._action.assign(action),
self._reward.assign(reward),
self._done.assign(done))
def reset(self, indices=None):
"""Reset the batch of environments.
Args:
indices: The batch indices of the environments to reset; defaults to all.
Returns:
Batch tensor of the new observations.
"""
if indices is None:
indices = tf.range(len(self._batch_env))
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
observ = tf.py_func(
self._batch_env.reset, [indices], observ_dtype, name='reset')
observ = tf.check_numerics(observ, 'observ')
reward = tf.zeros_like(indices, tf.float32)
done = tf.zeros_like(indices, tf.bool)
with tf.control_dependencies([
tf.scatter_update(self._observ, indices, observ),
tf.scatter_update(self._reward, indices, reward),
tf.scatter_update(self._done, indices, done)]):
return tf.identity(observ)
@property
def observ(self):
"""Access the variable holding the current observation."""
return self._observ
@property
def action(self):
"""Access the variable holding the last recieved action."""
return self._action
@property
def reward(self):
"""Access the variable holding the current reward."""
return self._reward
@property
def done(self):
"""Access the variable indicating whether the episode is done."""
return self._done
def close(self):
"""Send close messages to the external process and join them."""
self._batch_env.close()
def _parse_shape(self, space):
"""Get a tensor shape from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
Shape tuple.
"""
if isinstance(space, gym.spaces.Discrete):
return ()
if isinstance(space, gym.spaces.Box):
return space.shape
raise NotImplementedError()
def _parse_dtype(self, space):
"""Get a tensor dtype from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
TensorFlow data type.
"""
if isinstance(space, gym.spaces.Discrete):
return tf.int32
if isinstance(space, gym.spaces.Box):
return tf.float32
raise NotImplementedError()

View File

@ -0,0 +1,162 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Put an OpenAI Gym environment into the TensorFlow graph."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import tensorflow as tf
class InGraphEnv(object):
"""Put an OpenAI Gym environment into the TensorFlow graph.
The environment will be stepped and reset inside of the graph using
tf.py_func(). The current observation, action, reward, and done flag are held
in corresponding variables.
"""
def __init__(self, env):
"""Put an OpenAI Gym environment into the TensorFlow graph.
Args:
env: OpenAI Gym environment.
"""
self._env = env
observ_shape = self._parse_shape(self._env.observation_space)
observ_dtype = self._parse_dtype(self._env.observation_space)
action_shape = self._parse_shape(self._env.action_space)
action_dtype = self._parse_dtype(self._env.action_space)
with tf.name_scope('environment'):
self._observ = tf.Variable(
tf.zeros(observ_shape, observ_dtype), name='observ', trainable=False)
self._action = tf.Variable(
tf.zeros(action_shape, action_dtype), name='action', trainable=False)
self._reward = tf.Variable(
0.0, dtype=tf.float32, name='reward', trainable=False)
self._done = tf.Variable(
True, dtype=tf.bool, name='done', trainable=False)
self._step = tf.Variable(
0, dtype=tf.int32, name='step', trainable=False)
def __getattr__(self, name):
"""Forward unimplemented attributes to the original environment.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in the wrapped environment.
"""
return getattr(self._env, name)
def simulate(self, action):
"""Step the environment.
The result of the step can be accessed from the variables defined below.
Args:
action: Tensor holding the action to apply.
Returns:
Operation.
"""
with tf.name_scope('environment/simulate'):
if action.dtype in (tf.float16, tf.float32, tf.float64):
action = tf.check_numerics(action, 'action')
observ_dtype = self._parse_dtype(self._env.observation_space)
observ, reward, done = tf.py_func(
lambda a: self._env.step(a)[:3], [action],
[observ_dtype, tf.float32, tf.bool], name='step')
observ = tf.check_numerics(observ, 'observ')
reward = tf.check_numerics(reward, 'reward')
return tf.group(
self._observ.assign(observ),
self._action.assign(action),
self._reward.assign(reward),
self._done.assign(done),
self._step.assign_add(1))
def reset(self):
"""Reset the environment.
Returns:
Tensor of the current observation.
"""
observ_dtype = self._parse_dtype(self._env.observation_space)
observ = tf.py_func(self._env.reset, [], observ_dtype, name='reset')
observ = tf.check_numerics(observ, 'observ')
with tf.control_dependencies([
self._observ.assign(observ),
self._reward.assign(0),
self._done.assign(False)]):
return tf.identity(observ)
@property
def observ(self):
"""Access the variable holding the current observation."""
return self._observ
@property
def action(self):
"""Access the variable holding the last recieved action."""
return self._action
@property
def reward(self):
"""Access the variable holding the current reward."""
return self._reward
@property
def done(self):
"""Access the variable indicating whether the episode is done."""
return self._done
@property
def step(self):
"""Access the variable containg total steps of this environment."""
return self._step
def _parse_shape(self, space):
"""Get a tensor shape from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
Shape tuple.
"""
if isinstance(space, gym.spaces.Discrete):
return ()
if isinstance(space, gym.spaces.Box):
return space.shape
raise NotImplementedError()
def _parse_dtype(self, space):
"""Get a tensor dtype from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
TensorFlow data type.
"""
if isinstance(space, gym.spaces.Discrete):
return tf.int32
if isinstance(space, gym.spaces.Box):
return tf.float32
raise NotImplementedError()

View File

@ -0,0 +1,233 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Execute operations in a loop and coordinate logging and checkpoints."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import tensorflow as tf
from pybullet_envs.minitaur.agents.tools import streaming_mean
_Phase = collections.namedtuple(
'Phase',
'name, writer, op, batch, steps, feed, report_every, log_every,'
'checkpoint_every')
class Loop(object):
"""Execute operations in a loop and coordinate logging and checkpoints.
Supports multiple phases, that define their own operations to run, and
intervals for reporting scores, logging summaries, and storing checkpoints.
All class state is stored in-graph to properly recover from checkpoints.
"""
def __init__(self, logdir, step=None, log=None, report=None, reset=None):
"""Execute operations in a loop and coordinate logging and checkpoints.
The step, log, report, and reset arguments will get created if not
provided. Reset is used to indicate switching to a new phase, so that the
model can start a new computation in case its computation is split over
multiple training steps.
Args:
logdir: Will contain checkpoints and summaries for each phase.
step: Variable of the global step (optional).
log: Tensor indicating to the model to compute summary tensors.
report: Tensor indicating to the loop to report the current mean score.
reset: Tensor indicating to the model to start a new computation.
"""
self._logdir = logdir
self._step = (
tf.Variable(0, False, name='global_step') if step is None else step)
self._log = tf.placeholder(tf.bool) if log is None else log
self._report = tf.placeholder(tf.bool) if report is None else report
self._reset = tf.placeholder(tf.bool) if reset is None else reset
self._phases = []
def add_phase(
self, name, done, score, summary, steps,
report_every=None, log_every=None, checkpoint_every=None, feed=None):
"""Add a phase to the loop protocol.
If the model breaks long computation into multiple steps, the done tensor
indicates whether the current score should be added to the mean counter.
For example, in reinforcement learning we only have a valid score at the
end of the episode.
Score and done tensors can either be scalars or vectors, to support
single and batched computations.
Args:
name: Name for the phase, used for the summary writer.
done: Tensor indicating whether current score can be used.
score: Tensor holding the current, possibly intermediate, score.
summary: Tensor holding summary string to write if not an empty string.
steps: Duration of the phase in steps.
report_every: Yield mean score every this number of steps.
log_every: Request summaries via `log` tensor every this number of steps.
checkpoint_every: Write checkpoint every this number of steps.
feed: Additional feed dictionary for the session run call.
Raises:
ValueError: Unknown rank for done or score tensors.
"""
done = tf.convert_to_tensor(done, tf.bool)
score = tf.convert_to_tensor(score, tf.float32)
summary = tf.convert_to_tensor(summary, tf.string)
feed = feed or {}
if done.shape.ndims is None or score.shape.ndims is None:
raise ValueError("Rank of 'done' and 'score' tensors must be known.")
writer = self._logdir and tf.summary.FileWriter(
os.path.join(self._logdir, name), tf.get_default_graph(),
flush_secs=60)
op = self._define_step(done, score, summary)
batch = 1 if score.shape.ndims == 0 else score.shape[0].value
self._phases.append(_Phase(
name, writer, op, batch, int(steps), feed, report_every,
log_every, checkpoint_every))
def run(self, sess, saver, max_step=None):
"""Run the loop schedule for a specified number of steps.
Call the operation of the current phase until the global step reaches the
specified maximum step. Phases are repeated over and over in the order they
were added.
Args:
sess: Session to use to run the phase operation.
saver: Saver used for checkpointing.
max_step: Run the operations until the step reaches this limit.
Yields:
Reported mean scores.
"""
global_step = sess.run(self._step)
steps_made = 1
while True:
if max_step and global_step >= max_step:
break
phase, epoch, steps_in = self._find_current_phase(global_step)
phase_step = epoch * phase.steps + steps_in
if steps_in % phase.steps < steps_made:
message = '\n' + ('-' * 50) + '\n'
message += 'Phase {} (phase step {}, global step {}).'
tf.logging.info(message.format(phase.name, phase_step, global_step))
# Populate book keeping tensors.
phase.feed[self._reset] = (steps_in < steps_made)
phase.feed[self._log] = (
phase.writer and
self._is_every_steps(phase_step, phase.batch, phase.log_every))
phase.feed[self._report] = (
self._is_every_steps(phase_step, phase.batch, phase.report_every))
summary, mean_score, global_step, steps_made = sess.run(
phase.op, phase.feed)
if self._is_every_steps(phase_step, phase.batch, phase.checkpoint_every):
self._store_checkpoint(sess, saver, global_step)
if self._is_every_steps(phase_step, phase.batch, phase.report_every):
yield mean_score
if summary and phase.writer:
# We want smaller phases to catch up at the beginning of each epoch so
# that their graphs are aligned.
longest_phase = max(phase.steps for phase in self._phases)
summary_step = epoch * longest_phase + steps_in
phase.writer.add_summary(summary, summary_step)
def _is_every_steps(self, phase_step, batch, every):
"""Determine whether a periodic event should happen at this step.
Args:
phase_step: The incrementing step.
batch: The number of steps progressed at once.
every: The interval of the period.
Returns:
Boolean of whether the event should happen.
"""
if not every:
return False
covered_steps = range(phase_step, phase_step + batch)
return any((step + 1) % every == 0 for step in covered_steps)
def _find_current_phase(self, global_step):
"""Determine the current phase based on the global step.
This ensures continuing the correct phase after restoring checkpoints.
Args:
global_step: The global number of steps performed across all phases.
Returns:
Tuple of phase object, epoch number, and phase steps within the epoch.
"""
epoch_size = sum(phase.steps for phase in self._phases)
epoch = int(global_step // epoch_size)
steps_in = global_step % epoch_size
for phase in self._phases:
if steps_in < phase.steps:
return phase, epoch, steps_in
steps_in -= phase.steps
def _define_step(self, done, score, summary):
"""Combine operations of a phase.
Keeps track of the mean score and when to report it.
Args:
done: Tensor indicating whether current score can be used.
score: Tensor holding the current, possibly intermediate, score.
summary: Tensor holding summary string to write if not an empty string.
Returns:
Tuple of summary tensor, mean score, and new global step. The mean score
is zero for non reporting steps.
"""
if done.shape.ndims == 0:
done = done[None]
if score.shape.ndims == 0:
score = score[None]
score_mean = streaming_mean.StreamingMean((), tf.float32)
with tf.control_dependencies([done, score, summary]):
done_score = tf.gather(score, tf.where(done)[:, 0])
submit_score = tf.cond(
tf.reduce_any(done), lambda: score_mean.submit(done_score), tf.no_op)
with tf.control_dependencies([submit_score]):
mean_score = tf.cond(self._report, score_mean.clear, float)
steps_made = tf.shape(score)[0]
next_step = self._step.assign_add(steps_made)
with tf.control_dependencies([mean_score, next_step]):
return tf.identity(summary), mean_score, next_step, steps_made
def _store_checkpoint(self, sess, saver, global_step):
"""Store a checkpoint if a log directory was provided to the constructor.
The directory will be created if needed.
Args:
sess: Session containing variables to store.
saver: Saver used for checkpointing.
global_step: Step number of the checkpoint name.
"""
if not self._logdir or not saver:
return
tf.gfile.MakeDirs(self._logdir)
filename = os.path.join(self._logdir, 'model.ckpt')
saver.save(sess, filename, global_step)

View File

@ -0,0 +1,111 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the training loop."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
class LoopTest(tf.test.TestCase):
def test_report_every_step(self):
step = tf.Variable(0, False, dtype=tf.int32, name='step')
loop = tools.Loop(None, step)
loop.add_phase(
'phase_1', done=True, score=0, summary='', steps=1, report_every=3)
# Step: 0 1 2 3 4 5 6 7 8
# Report: x x x
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
scores = loop.run(sess, saver=None, max_step=9)
next(scores)
self.assertEqual(3, sess.run(step))
next(scores)
self.assertEqual(6, sess.run(step))
next(scores)
self.assertEqual(9, sess.run(step))
def test_phases_feed(self):
score = tf.placeholder(tf.float32, [])
loop = tools.Loop(None)
loop.add_phase(
'phase_1', done=True, score=score, summary='', steps=1, report_every=1,
log_every=None, checkpoint_every=None, feed={score: 1})
loop.add_phase(
'phase_2', done=True, score=score, summary='', steps=3, report_every=1,
log_every=None, checkpoint_every=None, feed={score: 2})
loop.add_phase(
'phase_3', done=True, score=score, summary='', steps=2, report_every=1,
log_every=None, checkpoint_every=None, feed={score: 3})
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
scores = list(loop.run(sess, saver=None, max_step=15))
self.assertAllEqual([1, 2, 2, 2, 3, 3, 1, 2, 2, 2, 3, 3, 1, 2, 2], scores)
def test_average_score_over_phases(self):
loop = tools.Loop(None)
loop.add_phase(
'phase_1', done=True, score=1, summary='', steps=1, report_every=2)
loop.add_phase(
'phase_2', done=True, score=2, summary='', steps=2, report_every=5)
# Score: 1 2 2 1 2 2 1 2 2 1 2 2 1 2 2 1 2
# Report 1: x x x
# Report 2: x x
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
scores = list(loop.run(sess, saver=None, max_step=17))
self.assertAllEqual([1, 2, 1, 2, 1], scores)
def test_not_done(self):
step = tf.Variable(0, False, dtype=tf.int32, name='step')
done = tf.equal((step + 1) % 2, 0)
score = tf.cast(step, tf.float32)
loop = tools.Loop(None, step)
loop.add_phase(
'phase_1', done, score, summary='', steps=1, report_every=3)
# Score: 0 1 2 3 4 5 6 7 8
# Done: x x x x
# Report: x x x
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
scores = list(loop.run(sess, saver=None, max_step=9))
self.assertAllEqual([1, 4, 7], scores)
def test_not_done_batch(self):
step = tf.Variable(0, False, dtype=tf.int32, name='step')
done = tf.equal([step % 3, step % 4], 0)
score = tf.cast([step, step ** 2], tf.float32)
loop = tools.Loop(None, step)
loop.add_phase(
'phase_1', done, score, summary='', steps=1, report_every=8)
# Step: 0 2 4 6
# Score 1: 0 2 4 6
# Done 1: x x
# Score 2: 0 4 16 32
# Done 2: x x
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
scores = list(loop.run(sess, saver=None, max_step=8))
self.assertEqual(8, sess.run(step))
self.assertAllEqual([(0 + 0 + 16 + 6) / 4], scores)
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,49 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mock algorithm for testing reinforcement learning code."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class MockAlgorithm(object):
"""Produce random actions and empty summaries."""
def __init__(self, envs):
"""Produce random actions and empty summaries.
Args:
envs: List of in-graph environments.
"""
self._envs = envs
def begin_episode(self, unused_agent_indices):
return tf.constant('')
def perform(self, unused_observ):
shape = (len(self._envs),) + self._envs[0].action_space.shape
low = self._envs[0].action_space.low
high = self._envs[0].action_space.high
action = tf.random_uniform(shape) * (high - low) + low
return action, tf.constant('')
def experience(self, *unused_transition):
return tf.constant('')
def end_episode(self, unused_agent_indices):
return tf.constant('')

View File

@ -0,0 +1,86 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mock environment for testing reinforcement learning code."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import gym.spaces
import numpy as np
class MockEnvironment(object):
"""Generate random agent input and keep track of statistics."""
def __init__(self, observ_shape, action_shape, min_duration, max_duration):
"""Generate random agent input and keep track of statistics.
Args:
observ_shape: Shape for the random observations.
action_shape: Shape for the action space.
min_duration: Minimum number of steps per episode.
max_duration: Maximum number of steps per episode.
Attributes:
steps: List of actual simulated lengths for all episodes.
durations: List of decided lengths for all episodes.
"""
self._observ_shape = observ_shape
self._action_shape = action_shape
self._min_duration = min_duration
self._max_duration = max_duration
self._random = np.random.RandomState(0)
self.steps = []
self.durations = []
@property
def observation_space(self):
low = np.zeros(self._observ_shape)
high = np.ones(self._observ_shape)
return gym.spaces.Box(low, high)
@property
def action_space(self):
low = np.zeros(self._action_shape)
high = np.ones(self._action_shape)
return gym.spaces.Box(low, high)
@property
def unwrapped(self):
return self
def step(self, action):
assert self.action_space.contains(action)
assert self.steps[-1] < self.durations[-1]
self.steps[-1] += 1
observ = self._current_observation()
reward = self._current_reward()
done = self.steps[-1] >= self.durations[-1]
info = {}
return observ, reward, done, info
def reset(self):
duration = self._random.randint(self._min_duration, self._max_duration + 1)
self.steps.append(0)
self.durations.append(duration)
return self._current_observation()
def _current_observation(self):
return self._random.uniform(0, 1, self._observ_shape)
def _current_reward(self):
return self._random.uniform(-1, 1)
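
As a quick sanity check, the mock environment can be rolled out for a single episode; the shapes and durations below are arbitrary.

env = MockEnvironment(observ_shape=(2,), action_shape=(3,),
                      min_duration=3, max_duration=5)
observ = env.reset()
done = False
while not done:
  observ, reward, done, _ = env.step(env.action_space.sample())
print(env.steps, env.durations)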

View File

@ -0,0 +1,145 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""In-graph simulation step of a vecrotized algorithm with environments."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from pybullet_envs.minitaur.agents.tools import streaming_mean
def simulate(batch_env, algo, log=True, reset=False):
"""Simulation step of a vecrotized algorithm with in-graph environments.
Integrates the operations implemented by the algorithm and the environments
into a combined operation.
Args:
batch_env: In-graph batch environment.
algo: Algorithm instance implementing required operations.
log: Tensor indicating whether to compute and return summaries.
reset: Tensor causing all environments to reset.
Returns:
Tuple of tensors containing done flags for the current episodes, possibly
intermediate scores for the episodes, and a summary tensor.
"""
def _define_begin_episode(agent_indices):
"""Reset environments, intermediate scores and durations for new episodes.
Args:
agent_indices: Tensor containing batch indices starting an episode.
Returns:
Summary tensor.
"""
assert agent_indices.shape.ndims == 1
zero_scores = tf.zeros_like(agent_indices, tf.float32)
zero_durations = tf.zeros_like(agent_indices)
reset_ops = [
batch_env.reset(agent_indices),
tf.scatter_update(score, agent_indices, zero_scores),
tf.scatter_update(length, agent_indices, zero_durations)]
with tf.control_dependencies(reset_ops):
return algo.begin_episode(agent_indices)
def _define_step():
"""Request actions from the algorithm and apply them to the environments.
Increments the lengths of all episodes and increases their scores by the
current reward. After stepping the environments, provides the full
transition tuple to the algorithm.
Returns:
Summary tensor.
"""
prevob = batch_env.observ + 0 # Ensure a copy of the variable value.
action, step_summary = algo.perform(prevob)
action.set_shape(batch_env.action.shape)
with tf.control_dependencies([batch_env.simulate(action)]):
add_score = score.assign_add(batch_env.reward)
inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
with tf.control_dependencies([add_score, inc_length]):
experience_summary = algo.experience(
prevob, batch_env.action, batch_env.reward, batch_env.done,
batch_env.observ)
return tf.summary.merge([step_summary, experience_summary])
def _define_end_episode(agent_indices):
"""Notify the algorithm of ending episodes.
Also updates the mean score and length counters used for summaries.
Args:
agent_indices: Tensor holding batch indices that end their episodes.
Returns:
Summary tensor.
"""
assert agent_indices.shape.ndims == 1
submit_score = mean_score.submit(tf.gather(score, agent_indices))
submit_length = mean_length.submit(
tf.cast(tf.gather(length, agent_indices), tf.float32))
with tf.control_dependencies([submit_score, submit_length]):
return algo.end_episode(agent_indices)
def _define_summaries():
"""Reset the average score and duration, and return them as summary.
Returns:
Summary string.
"""
score_summary = tf.cond(
tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
length_summary = tf.cond(
tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
return tf.summary.merge([score_summary, length_summary])
with tf.name_scope('simulate'):
log = tf.convert_to_tensor(log)
reset = tf.convert_to_tensor(reset)
with tf.variable_scope('simulate_temporary'):
score = tf.Variable(
tf.zeros(len(batch_env), dtype=tf.float32), False, name='score')
length = tf.Variable(
tf.zeros(len(batch_env), dtype=tf.int32), False, name='length')
mean_score = streaming_mean.StreamingMean((), tf.float32)
mean_length = streaming_mean.StreamingMean((), tf.float32)
agent_indices = tf.cond(
reset,
lambda: tf.range(len(batch_env)),
lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
begin_episode = tf.cond(
tf.cast(tf.shape(agent_indices)[0], tf.bool),
lambda: _define_begin_episode(agent_indices), str)
with tf.control_dependencies([begin_episode]):
step = _define_step()
with tf.control_dependencies([step]):
agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
end_episode = tf.cond(
tf.cast(tf.shape(agent_indices)[0], tf.bool),
lambda: _define_end_episode(agent_indices), str)
with tf.control_dependencies([end_episode]):
summary = tf.summary.merge([
_define_summaries(), begin_episode, step, end_episode])
with tf.control_dependencies([summary]):
done, score = tf.identity(batch_env.done), tf.identity(score)
return done, score, summary
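For reference, here is a minimal sketch (editorial addition, not part of this commit) of driving the simulate() op from a TF1 session. It reuses the mock utilities exercised by the test below; the import path pybullet_envs.minitaur.agents.tools and the batch size of four are assumptions.
# Hypothetical driver for simulate(); mirrors the setup in the simulation test.
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools

# Four mock environments with a fixed episode length of 5, wrapped to 32-bit types.
envs = [tools.wrappers.ConvertTo32Bit(
    tools.MockEnvironment(observ_shape=(2, 3), action_shape=(3,),
                          min_duration=5, max_duration=5))
        for _ in range(4)]
batch_env = tools.InGraphBatchEnv(tools.BatchEnv(envs, blocking=True))
algo = tools.MockAlgorithm(batch_env)
done, score, summary = tools.simulate(batch_env, algo, log=False, reset=False)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for _ in range(10):
    sess.run(done)  # Each call performs one in-graph step of all environments.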

View File

@ -0,0 +1,98 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the simulation operation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
class SimulateTest(tf.test.TestCase):
def test_done_automatic(self):
batch_env = self._create_test_batch_env((1, 2, 3, 4))
algo = tools.MockAlgorithm(batch_env)
done, _, _ = tools.simulate(batch_env, algo, log=False, reset=False)
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
self.assertAllEqual([True, False, False, False], sess.run(done))
self.assertAllEqual([True, True, False, False], sess.run(done))
self.assertAllEqual([True, False, True, False], sess.run(done))
self.assertAllEqual([True, True, False, True], sess.run(done))
def test_done_forced(self):
reset = tf.placeholder_with_default(False, ())
batch_env = self._create_test_batch_env((2, 4))
algo = tools.MockAlgorithm(batch_env)
done, _, _ = tools.simulate(batch_env, algo, False, reset)
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
self.assertAllEqual([False, False], sess.run(done))
self.assertAllEqual([False, False], sess.run(done, {reset: True}))
self.assertAllEqual([True, False], sess.run(done))
self.assertAllEqual([False, False], sess.run(done, {reset: True}))
self.assertAllEqual([True, False], sess.run(done))
self.assertAllEqual([False, False], sess.run(done))
self.assertAllEqual([True, True], sess.run(done))
def test_reset_automatic(self):
batch_env = self._create_test_batch_env((1, 2, 3, 4))
algo = tools.MockAlgorithm(batch_env)
done, _, _ = tools.simulate(batch_env, algo, log=False, reset=False)
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
for _ in range(10):
sess.run(done)
self.assertAllEqual([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], batch_env[0].steps)
self.assertAllEqual([2, 2, 2, 2, 2], batch_env[1].steps)
self.assertAllEqual([3, 3, 3, 1], batch_env[2].steps)
self.assertAllEqual([4, 4, 2], batch_env[3].steps)
def test_reset_forced(self):
reset = tf.placeholder_with_default(False, ())
batch_env = self._create_test_batch_env((2, 4))
algo = tools.MockAlgorithm(batch_env)
done, _, _ = tools.simulate(batch_env, algo, False, reset)
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
sess.run(done)
sess.run(done, {reset: True})
sess.run(done)
sess.run(done, {reset: True})
sess.run(done)
sess.run(done)
sess.run(done)
self.assertAllEqual([1, 2, 2, 2], batch_env[0].steps)
self.assertAllEqual([1, 2, 4], batch_env[1].steps)
def _create_test_batch_env(self, durations):
envs = []
for duration in durations:
env = tools.MockEnvironment(
observ_shape=(2, 3), action_shape=(3,),
min_duration=duration, max_duration=duration)
env = tools.wrappers.ConvertTo32Bit(env)
envs.append(env)
batch_env = tools.BatchEnv(envs, blocking=True)
batch_env = tools.InGraphBatchEnv(batch_env)
return batch_env
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,67 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compute a streaming estimation of the mean of submitted tensors."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class StreamingMean(object):
"""Compute a streaming estimation of the mean of submitted tensors."""
def __init__(self, shape, dtype):
"""Specify the shape and dtype of the mean to be estimated.
Note that a float mean of zero submitted elements is NaN, while computing
the integer mean of zero elements raises a division by zero error.
Args:
shape: Shape of the mean to compute.
dtype: Data type of the mean to compute.
"""
self._dtype = dtype
self._sum = tf.Variable(lambda: tf.zeros(shape, dtype), False)
self._count = tf.Variable(lambda: 0, trainable=False)
@property
def value(self):
"""The current value of the mean."""
return self._sum / tf.cast(self._count, self._dtype)
@property
def count(self):
"""The number of submitted samples."""
return self._count
def submit(self, value):
"""Submit a single or batch tensor to refine the streaming mean."""
# Add a batch dimension if necessary.
if value.shape.ndims == self._sum.shape.ndims:
value = value[None, ...]
return tf.group(
self._sum.assign_add(tf.reduce_sum(value, 0)),
self._count.assign_add(tf.shape(value)[0]))
def clear(self):
"""Return the mean estimate and reset the streaming statistics."""
value = self._sum / tf.cast(self._count, self._dtype)
with tf.control_dependencies([value]):
reset_value = self._sum.assign(tf.zeros_like(self._sum))
reset_count = self._count.assign(0)
with tf.control_dependencies([reset_value, reset_count]):
return tf.identity(value)
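A short usage sketch (editorial addition): submit batches of values and read the running mean, which clear() also resets. The placeholder-based feeding below is only one way to provide values.
# Illustrative TF1 usage of StreamingMean; variable names are arbitrary.
import tensorflow as tf
from pybullet_envs.minitaur.agents.tools import streaming_mean

mean = streaming_mean.StreamingMean((), tf.float32)
values = tf.placeholder(tf.float32, [None])
submit = mean.submit(values)
read_and_reset = mean.clear()

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(submit, {values: [1.0, 2.0, 3.0]})
  sess.run(submit, {values: [4.0]})
  print(sess.run(read_and_reset))  # 2.5; the statistics are reset afterwards.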

View File

@ -0,0 +1,552 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wrappers for OpenAI Gym environments."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import atexit
import functools
import multiprocessing
import sys
import traceback
import gym
import gym.spaces
import numpy as np
import tensorflow as tf
class AutoReset(object):
"""Automatically reset environment when the episode is done."""
def __init__(self, env):
self._env = env
self._done = True
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
if self._done:
observ, reward, done, info = self._env.reset(), 0.0, False, {}
else:
observ, reward, done, info = self._env.step(action)
self._done = done
return observ, reward, done, info
def reset(self):
self._done = False
return self._env.reset()
class ActionRepeat(object):
"""Repeat the agent action multiple steps."""
def __init__(self, env, amount):
self._env = env
self._amount = amount
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
done = False
total_reward = 0
current_step = 0
while current_step < self._amount and not done:
observ, reward, done, info = self._env.step(action)
total_reward += reward
current_step += 1
return observ, total_reward, done, info
class RandomStart(object):
"""Perform random number of random actions at the start of the episode."""
def __init__(self, env, max_steps):
self._env = env
self._max_steps = max_steps
def __getattr__(self, name):
return getattr(self._env, name)
def reset(self):
observ = self._env.reset()
random_steps = np.random.randint(0, self._max_steps)
for _ in range(random_steps):
action = self._env.action_space.sample()
observ, unused_reward, done, unused_info = self._env.step(action)
if done:
tf.logging.warning('Episode ended during random start.')
return self.reset()
return observ
class FrameHistory(object):
"""Augment the observation with past observations."""
def __init__(self, env, past_indices, flatten):
"""Augment the observation with past observations.
Implemented as a Numpy ring buffer holding the necessary past observations.
Args:
env: OpenAI Gym environment to wrap.
past_indices: List of non-negative integers indicating the time offsets
from the current time step of observations to include.
flatten: Concatenate the past observations rather than stacking them.
Raises:
KeyError: The current observation is not included in the indices.
"""
if 0 not in past_indices:
raise KeyError('Past indices should include 0 for the current frame.')
self._env = env
self._past_indices = past_indices
self._step = 0
self._buffer = None
self._capacity = max(past_indices)
self._flatten = flatten
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
low = self._env.observation_space.low
high = self._env.observation_space.high
low = np.repeat(low[None, ...], len(self._past_indices), 0)
high = np.repeat(high[None, ...], len(self._past_indices), 0)
if self._flatten:
low = np.reshape(low, (-1,) + low.shape[2:])
high = np.reshape(high, (-1,) + high.shape[2:])
return gym.spaces.Box(low, high)
def step(self, action):
observ, reward, done, info = self._env.step(action)
self._step += 1
self._buffer[self._step % self._capacity] = observ
observ = self._select_frames()
return observ, reward, done, info
def reset(self):
observ = self._env.reset()
self._buffer = np.repeat(observ[None, ...], self._capacity, 0)
self._step = 0
return self._select_frames()
def _select_frames(self):
indices = [
(self._step - index) % self._capacity for index in self._past_indices]
observ = self._buffer[indices]
if self._flatten:
observ = np.reshape(observ, (-1,) + observ.shape[2:])
return observ
class FrameDelta(object):
"""Convert the observation to a difference from the previous observation."""
def __init__(self, env):
self._env = env
self._last = None
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
low = self._env.observation_space.low
high = self._env.observation_space.high
low, high = low - high, high - low
return gym.spaces.Box(low, high)
def step(self, action):
observ, reward, done, info = self._env.step(action)
delta = observ - self._last
self._last = observ
return delta, reward, done, info
def reset(self):
observ = self._env.reset()
self._last = observ
return observ
class RangeNormalize(object):
"""Normalize the specialized observation and action ranges to [-1, 1]."""
def __init__(self, env, observ=None, action=None):
self._env = env
self._should_normalize_observ = (
observ is not False and self._is_finite(self._env.observation_space))
if observ is True and not self._should_normalize_observ:
raise ValueError('Cannot normalize infinite observation range.')
if observ is None and not self._should_normalize_observ:
tf.logging.info('Not normalizing infinite observation range.')
self._should_normalize_action = (
action is not False and self._is_finite(self._env.action_space))
if action is True and not self._should_normalize_action:
raise ValueError('Cannot normalize infinite action range.')
if action is None and not self._should_normalize_action:
tf.logging.info('Not normalizing infinite action range.')
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
space = self._env.observation_space
if not self._should_normalize_observ:
return space
return gym.spaces.Box(-np.ones(space.shape), np.ones(space.shape))
@property
def action_space(self):
space = self._env.action_space
if not self._should_normalize_action:
return space
return gym.spaces.Box(-np.ones(space.shape), np.ones(space.shape))
def step(self, action):
if self._should_normalize_action:
action = self._denormalize_action(action)
observ, reward, done, info = self._env.step(action)
if self._should_normalize_observ:
observ = self._normalize_observ(observ)
return observ, reward, done, info
def reset(self):
observ = self._env.reset()
if self._should_normalize_observ:
observ = self._normalize_observ(observ)
return observ
def _denormalize_action(self, action):
min_ = self._env.action_space.low
max_ = self._env.action_space.high
action = (action + 1) / 2 * (max_ - min_) + min_
return action
def _normalize_observ(self, observ):
min_ = self._env.observation_space.low
max_ = self._env.observation_space.high
observ = 2 * (observ - min_) / (max_ - min_) - 1
return observ
def _is_finite(self, space):
return np.isfinite(space.low).all() and np.isfinite(space.high).all()
class ClipAction(object):
"""Clip out of range actions to the action space of the environment."""
def __init__(self, env):
self._env = env
def __getattr__(self, name):
return getattr(self._env, name)
@property
def action_space(self):
shape = self._env.action_space.shape
return gym.spaces.Box(-np.inf * np.ones(shape), np.inf * np.ones(shape))
def step(self, action):
action_space = self._env.action_space
action = np.clip(action, action_space.low, action_space.high)
return self._env.step(action)
class LimitDuration(object):
"""End episodes after specified number of steps."""
def __init__(self, env, duration):
self._env = env
self._duration = duration
self._step = None
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
if self._step is None:
raise RuntimeError('Must reset environment.')
observ, reward, done, info = self._env.step(action)
self._step += 1
if self._step >= self._duration:
done = True
self._step = None
return observ, reward, done, info
def reset(self):
self._step = 0
return self._env.reset()
class ExternalProcess(object):
"""Step environment in a separate process for lock free paralellism."""
# Message types for communication via the pipe.
_ACTION = 1
_RESET = 2
_CLOSE = 3
_ATTRIBUTE = 4
_TRANSITION = 5
_OBSERV = 6
_EXCEPTION = 7
_VALUE = 8
def __init__(self, constructor):
"""Step environment in a separate process for lock free paralellism.
The environment will be created in the external process by calling the
specified callable. This can be an environment class, or a function
creating the environment and potentially wrapping it. The returned
environment should not access global variables.
Args:
constructor: Callable that creates and returns an OpenAI gym environment.
Attributes:
observation_space: The cached observation space of the environment.
action_space: The cached action space of the environment.
"""
self._conn, conn = multiprocessing.Pipe()
self._process = multiprocessing.Process(
target=self._worker, args=(constructor, conn))
atexit.register(self.close)
self._process.start()
self._observ_space = None
self._action_space = None
@property
def observation_space(self):
if not self._observ_space:
self._observ_space = self.__getattr__('observation_space')
return self._observ_space
@property
def action_space(self):
if not self._action_space:
self._action_space = self.__getattr__('action_space')
return self._action_space
def __getattr__(self, name):
"""Request an attribute from the environment.
Note that this involves communication with the external process, so it can
be slow.
Args:
name: Attribute to access.
Returns:
Value of the attribute.
"""
self._conn.send((self._ATTRIBUTE, name))
return self._receive(self._VALUE)
def step(self, action, blocking=True):
"""Step the environment.
Args:
action: The action to apply to the environment.
blocking: Whether to wait for the result.
Returns:
Transition tuple when blocking, otherwise callable that returns the
transition tuple.
"""
self._conn.send((self._ACTION, action))
if blocking:
return self._receive(self._TRANSITION)
else:
return functools.partial(self._receive, self._TRANSITION)
def reset(self, blocking=True):
"""Reset the environment.
Args:
blocking: Whether to wait for the result.
Returns:
New observation when blocking, otherwise callable that returns the new
observation.
"""
self._conn.send((self._RESET, None))
if blocking:
return self._receive(self._OBSERV)
else:
return functools.partial(self._receive, self._OBSERV)
def close(self):
"""Send a close message to the external process and join it."""
try:
self._conn.send((self._CLOSE, None))
self._conn.close()
except IOError:
# The connection was already closed.
pass
self._process.join()
def _receive(self, expected_message):
"""Wait for a message from the worker process and return its payload.
Args:
expected_message: Type of the expected message.
Raises:
Exception: An exception was raised inside the worker process.
KeyError: The received message is not of the expected type.
Returns:
Payload object of the message.
"""
message, payload = self._conn.recv()
# Re-raise exceptions in the main process.
if message == self._EXCEPTION:
stacktrace = payload
raise Exception(stacktrace)
if message == expected_message:
return payload
raise KeyError('Received message of unexpected type {}'.format(message))
def _worker(self, constructor, conn):
"""The process waits for actions and sends back environment results.
Args:
constructor: Constructor for the OpenAI Gym environment.
conn: Connection for communication to the main process.
"""
try:
env = constructor()
while True:
try:
# Only block for short times to have keyboard exceptions be raised.
if not conn.poll(0.1):
continue
message, payload = conn.recv()
except (EOFError, KeyboardInterrupt):
break
if message == self._ACTION:
action = payload
conn.send((self._TRANSITION, env.step(action)))
continue
if message == self._RESET:
assert payload is None
conn.send((self._OBSERV, env.reset()))
continue
if message == self._ATTRIBUTE:
name = payload
conn.send((self._VALUE, getattr(env, name)))
continue
if message == self._CLOSE:
assert payload is None
break
raise KeyError('Received message of unknown type {}'.format(message))
except Exception: # pylint: disable=broad-except
stacktrace = ''.join(traceback.format_exception(*sys.exc_info()))
conn.send((self._EXCEPTION, stacktrace))
tf.logging.error('Error in environment process: {}'.format(stacktrace))
conn.close()
class ConvertTo32Bit(object):
"""Convert data types of an OpenAI Gym environment to 32 bit."""
def __init__(self, env):
"""Convert data types of an OpenAI Gym environment to 32 bit.
Args:
env: OpenAI Gym environment.
"""
self._env = env
def __getattr__(self, name):
"""Forward unimplemented attributes to the original environment.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in the wrapped environment.
"""
return getattr(self._env, name)
def step(self, action):
"""Forward action to the wrapped environment.
Args:
action: Action to apply to the environment.
Raises:
ValueError: Invalid action.
Returns:
Converted observation, converted reward, done flag, and info object.
"""
observ, reward, done, info = self._env.step(action)
observ = self._convert_observ(observ)
reward = self._convert_reward(reward)
return observ, reward, done, info
def reset(self):
"""Reset the environment and convert the resulting observation.
Returns:
Converted observation.
"""
observ = self._env.reset()
observ = self._convert_observ(observ)
return observ
def _convert_observ(self, observ):
"""Convert the observation to 32 bits.
Args:
observ: Numpy observation.
Raises:
ValueError: Observation contains infinite values.
Returns:
Numpy observation with 32-bit data type.
"""
if not np.isfinite(observ).all():
raise ValueError('Infinite observation encountered.')
if observ.dtype == np.float64:
return observ.astype(np.float32)
if observ.dtype == np.int64:
return observ.astype(np.int32)
return observ
def _convert_reward(self, reward):
"""Convert the reward to 32 bits.
Args:
reward: Numpy reward.
Raises:
ValueError: Rewards contain infinite values.
Returns:
Numpy reward with 32-bit data type.
"""
if not np.isfinite(reward).all():
raise ValueError('Infinite reward encountered.')
return np.array(reward, dtype=np.float32)
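The wrappers compose by simple nesting. A hypothetical composition follows (editorial addition; 'Pendulum-v0' is only an example environment id, and the import path is assumed):
# Sketch: stack several of the wrappers above around a standard Gym env.
import gym
import numpy as np
from pybullet_envs.minitaur.agents.tools import wrappers

env = gym.make('Pendulum-v0')
env = wrappers.LimitDuration(env, duration=200)  # End episodes after 200 steps.
env = wrappers.ActionRepeat(env, amount=2)       # Apply each action twice.
env = wrappers.ClipAction(env)                   # Clip out-of-range actions.
env = wrappers.ConvertTo32Bit(env)               # Cast observations and rewards.

observ = env.reset()
action = np.zeros(env.action_space.shape, dtype=np.float32)
observ, reward, done, info = env.step(action)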

View File

@ -0,0 +1,90 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for environment wrappers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
class ExternalProcessTest(tf.test.TestCase):
def test_close_no_hang_after_init(self):
constructor = functools.partial(
tools.MockEnvironment,
observ_shape=(2, 3), action_shape=(2,),
min_duration=2, max_duration=2)
env = tools.wrappers.ExternalProcess(constructor)
env.close()
def test_close_no_hang_after_step(self):
constructor = functools.partial(
tools.MockEnvironment,
observ_shape=(2, 3), action_shape=(2,),
min_duration=5, max_duration=5)
env = tools.wrappers.ExternalProcess(constructor)
env.reset()
env.step(env.action_space.sample())
env.step(env.action_space.sample())
env.close()
def test_reraise_exception_in_init(self):
constructor = MockEnvironmentCrashInInit
env = tools.wrappers.ExternalProcess(constructor)
with self.assertRaises(Exception):
env.step(env.action_space.sample())
def test_reraise_exception_in_step(self):
constructor = functools.partial(
MockEnvironmentCrashInStep, crash_at_step=3)
env = tools.wrappers.ExternalProcess(constructor)
env.reset()
env.step(env.action_space.sample())
env.step(env.action_space.sample())
with self.assertRaises(Exception):
env.step(env.action_space.sample())
class MockEnvironmentCrashInInit(object):
"""Raise an error when instantiated."""
def __init__(self, *unused_args, **unused_kwargs):
raise RuntimeError()
class MockEnvironmentCrashInStep(tools.MockEnvironment):
"""Raise an error after specified number of steps in an episode."""
def __init__(self, crash_at_step):
super(MockEnvironmentCrashInStep, self).__init__(
observ_shape=(2, 3), action_shape=(2,),
min_duration=crash_at_step + 1, max_duration=crash_at_step + 1)
self._crash_at_step = crash_at_step
def step(self, *args, **kwargs):
transition = super(MockEnvironmentCrashInStep, self).step(*args, **kwargs)
if self.steps[-1] == self._crash_at_step:
raise RuntimeError()
return transition
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,11 @@
This folder contains a number of simulated Minitaur environments implemented using pybullet.
The following two environments are used in the RSS paper "Sim-to-Real: Learning Agile Locomotion For Quadruped Robots":
1) Galloping example: minitaur_reactive_env.py
python minitaur_reactive_env_example.py runs a pre-trained PPO agent that performs a galloping gait.
2) Trotting example: minitaur_trotting_env.py
python minitaur_trotting_env_example.py runs a pre-trained PPO agent that performs a trotting gait.
The rest are experimental environments.
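For a quick smoke test without the pre-trained policies, either environment can also be instantiated and stepped directly. A minimal sketch (editorial addition; keyword arguments other than render are left at their defaults):
# Sketch: step the galloping environment with zero actions.
import numpy as np
from pybullet_envs.minitaur.envs import minitaur_reactive_env

env = minitaur_reactive_env.MinitaurReactiveEnv(render=True)
observation = env.reset()
while True:
  observation, reward, done, _ = env.step(np.zeros(env.action_space.shape))
  if done:
    break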

View File

@ -216,7 +216,7 @@ class MinitaurFourLegStandEnv(minitaur_gym_env.MinitaurGymEnv):
self._env_step_counter += 1
done = self._termination()
obs = self._get_true_observation()
reward = self._reward(action, obs)
reward = self._reward()
if self._log_path is not None:
minitaur_logging.update_episode_proto(self._episode_proto, self.minitaur,
action, self._env_step_counter)
@ -272,7 +272,7 @@ class MinitaurFourLegStandEnv(minitaur_gym_env.MinitaurGymEnv):
np.asarray([0, 0, 1]), np.asarray(local_up))
return local_global_up_dot_product < 0.85 or height < 0.15
def _reward(self, action, obs):
def _reward(self):
roll, pitch, _ = self.minitaur.GetBaseRollPitchYaw()
return 1.0 / (0.001 + math.fabs(roll) + math.fabs(pitch))

View File

@ -1,8 +1,4 @@
r"""An example to use simple_ppo_agent.
blaze run -c opt \
//robotics/reinforcement_learning/minitaur/envs:minitaur_reactive_env_example
"""
r"""Running a pre-trained ppo agent on minitaur_reactive_env."""
from __future__ import absolute_import
from __future__ import division
@ -11,13 +7,13 @@ from __future__ import print_function
import os
import time
import tensorflow as tf
from agents.scripts import utility
from pybullet_envs.minitaur.agents.scripts import utility
import pybullet_data
import simple_ppo_agent
flags = tf.app.flags
FLAGS = tf.app.flags.FLAGS
LOG_DIR = (
"testdata/minitaur_reactive_env_test")
LOG_DIR = os.path.join(pybullet_data.getDataPath(), "policies/ppo/minitaur_reactive_env")
CHECKPOINT = "model.ckpt-14000000"
@ -43,7 +39,6 @@ def main(argv):
while True:
action = agent.get_action([observation])
observation, reward, done, _ = env.step(action[0])
# This sleep is to prevent serial communication error on the real robot.
time.sleep(0.002)
sum_reward += reward
if done:

View File

@ -1,41 +1,51 @@
"""An example to run the minitaur environment of trotting gait.
r"""Running a pre-trained ppo agent on minitaur_trotting_env."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""
import time
import os
import numpy as np
import time
import tensorflow as tf
from pybullet_envs.minitaur.envs import minitaur_gym_env
from pybullet_envs.minitaur.envs import minitaur_trotting_env
from pybullet_envs.minitaur.agents.scripts import utility
import pybullet_data
import simple_ppo_agent
#FLAGS = tf.flags.FLAGS
#tf.flags.DEFINE_string("log_path", None, "The directory to write the log file.")
flags = tf.app.flags
FLAGS = tf.app.flags.FLAGS
LOG_DIR = os.path.join(pybullet_data.getDataPath(), "policies/ppo/minitaur_trotting_env")
CHECKPOINT = "model.ckpt-14000000"
def main(unused_argv):
environment = minitaur_trotting_env.MinitaurTrottingEnv(
urdf_version=minitaur_gym_env.RAINBOW_DASH_V0_URDF_VERSION,
use_signal_in_observation=False,
use_angle_in_observation=False,
render=True,
log_path=os.getcwd())
def main(argv):
del argv # Unused.
config = utility.load_config(LOG_DIR)
policy_layers = config.policy_layers
value_layers = config.value_layers
env = config.env(render=True)
network = config.network
np.random.seed(100)
sum_reward = 0
environment.reset()
with tf.Session() as sess:
agent = simple_ppo_agent.SimplePPOPolicy(
sess,
env,
network,
policy_layers=policy_layers,
value_layers=value_layers,
checkpoint=os.path.join(LOG_DIR, CHECKPOINT))
steps = 5000
for _ in range(steps):
# Sleep to prevent serial buffer overflow on microcontroller.
time.sleep(0.002)
action = [0] * 8
_, reward, done, _ = environment.step(action)
sum_reward += reward
if done:
break
tf.logging.info("reward: {}".format(sum_reward))
sum_reward = 0
observation = env.reset()
while True:
action = agent.get_action([observation])
observation, reward, done, _ = env.step(action[0])
time.sleep(0.002)
sum_reward += reward
if done:
break
tf.logging.info("reward: %s", sum_reward)
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run()
tf.app.run(main)