Make sure that the pre-trained galloping and trotting policies work for the minitaur_reactive_env and minitaur_trotting_env environments.

This commit is contained in:
Jie 2018-04-24 21:48:27 -07:00
parent 982453abc6
commit a375a349ce
43 changed files with 4447 additions and 42 deletions
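As a quick sanity check that the two environments referenced by the pre-trained policies still construct and step, a minimal smoke test could look like the sketch below. It is illustrative and not part of this commit; it assumes both environments follow the standard Gym reset/step interface and accept a render keyword argument, as suggested by the checked-in configs.

# Illustrative smoke test (not part of this commit).
from pybullet_envs.minitaur.envs.minitaur_reactive_env import MinitaurReactiveEnv
from pybullet_envs.minitaur.envs.minitaur_trotting_env import MinitaurTrottingEnv

for env_class in (MinitaurReactiveEnv, MinitaurTrottingEnv):
  env = env_class(render=False)  # render=False mirrors the checked-in configs.
  env.reset()
  for _ in range(10):
    _, reward, done, _ = env.step(env.action_space.sample())
    if done:
      break
  env.close()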

View File

@ -0,0 +1,54 @@
!!python/object/new:pybullet_envs.minitaur.agents.tools.attr_dict.AttrDict
dictitems:
algorithm: !!python/name:pybullet_envs.minitaur.agents.ppo.algorithm.PPOAlgorithm ''
discount: 0.9868209124499899
env: !!python/object/apply:functools.partial
args:
- &id001 !!python/name:pybullet_envs.minitaur.envs.minitaur_reactive_env.MinitaurReactiveEnv ''
state: !!python/tuple
- *id001
- !!python/tuple []
- accurate_motor_model_enabled: true
control_latency: 0.02
energy_weight: 0.005
env_randomizer: null
motor_kd: 0.015
num_steps_to_log: 1000
pd_latency: 0.003
remove_default_joint_damping: true
render: false
urdf_version: rainbow_dash_v0
- null
eval_episodes: 25
init_logstd: -1.1579536194508315
init_mean_factor: 0.3084392491563408
kl_cutoff_coef: 1000
kl_cutoff_factor: 2
kl_init_penalty: 1
kl_target: 0.01
logdir: /cns/ij-d/home/jietan/experiment/minitaur_vizier_study_ppo/minreact_nonexp_nr_02_186515603_186518344/333
max_length: 1000
network: !!python/name:pybullet_envs.minitaur.agents.scripts.networks.ForwardGaussianPolicy ''
network_config: {}
num_agents: 25
policy_layers: !!python/tuple
- 114
- 45
policy_lr: 0.00023516695218031146
policy_optimizer: AdamOptimizer
steps: 7000000.0
update_epochs_policy: 25
update_epochs_value: 25
update_every: 25
use_gpu: false
value_layers: !!python/tuple
- 170
- 78
value_lr: 0.00031014032715987193
value_optimizer: AdamOptimizer
weight_summaries:
all: .*
policy: .*/policy/.*
value: .*/value/.*
state:
_mutable: false
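The config above is a PyYAML dump of an AttrDict that uses python-object tags, so restoring it requires a loader able to construct Python objects. A minimal sketch, assuming PyYAML 5.x, a trusted file, and an illustrative path; the scripts' own utility module presumably wraps something similar:

import yaml

# The !!python tags import the referenced pybullet_envs modules on load,
# so this must only ever be used on trusted config files.
with open('minitaur_reactive_config.yaml') as config_file:
  config = yaml.load(config_file, Loader=yaml.UnsafeLoader)
# AttrDict exposes the entries as attributes.
print(config.policy_layers, config.value_layers, config.max_length)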

View File

@ -0,0 +1,51 @@
!!python/object/new:pybullet_envs.minitaur.agents.tools.attr_dict.AttrDict
dictitems:
algorithm: !!python/name:pybullet_envs.minitaur.agents.ppo.algorithm.PPOAlgorithm ''
discount: 0.9899764168788918
env: !!python/object/apply:functools.partial
args:
- &id001 !!python/name:pybullet_envs.minitaur.envs.minitaur_trotting_env.MinitaurTrottingEnv ''
state: !!python/tuple
- *id001
- !!python/tuple []
- env_randomizer: null
motor_kd: 0.015
num_steps_to_log: 1000
pd_latency: 0.003
remove_default_joint_damping: true
render: false
urdf_version: rainbow_dash_v0
- null
eval_episodes: 25
init_logstd: -0.6325707791047228
init_mean_factor: 0.6508531688665261
kl_cutoff_coef: 1000
kl_cutoff_factor: 2
kl_init_penalty: 1
kl_target: 0.01
logdir: /cns/ij-d/home/jietan/experiment/minitaur_vizier_study_ppo/mintrot_nonexp_nr_01_186515603_186518344/373
max_length: 1000
network: !!python/name:pybullet_envs.minitaur.agents.scripts.networks.ForwardGaussianPolicy ''
network_config: {}
num_agents: 25
policy_layers: !!python/tuple
- 133
- 100
policy_lr: 0.00048104185841752015
policy_optimizer: AdamOptimizer
steps: 7000000.0
update_epochs_policy: 25
update_epochs_value: 25
update_every: 25
use_gpu: false
value_layers: !!python/tuple
- 64
- 57
value_lr: 0.0012786382882055453
value_optimizer: AdamOptimizer
weight_summaries:
all: .*
policy: .*/policy/.*
value: .*/value/.*
state:
_mutable: false

View File

@ -1 +1,3 @@
import gym

View File

@ -0,0 +1,21 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Proximal Policy Optimization algorithm."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from .algorithm import PPOAlgorithm

View File

@ -0,0 +1,558 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Proximal Policy Optimization algorithm.
Based on John Schulman's implementation in Python and Theano:
https://github.com/joschu/modular_rl/blob/master/modular_rl/ppo.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import tensorflow as tf
from . import memory
from . import normalize
from . import utility
_NetworkOutput = collections.namedtuple(
'NetworkOutput', 'policy, mean, logstd, value, state')
class PPOAlgorithm(object):
"""A vectorized implementation of the PPO algorithm by John Schulman."""
def __init__(self, batch_env, step, is_training, should_log, config):
"""Create an instance of the PPO algorithm.
Args:
batch_env: In-graph batch environment.
step: Integer tensor holding the current training step.
is_training: Boolean tensor for whether the algorithm should train.
should_log: Boolean tensor for whether summaries should be returned.
config: Object containing the agent configuration as attributes.
"""
self._batch_env = batch_env
self._step = step
self._is_training = is_training
self._should_log = should_log
self._config = config
self._observ_filter = normalize.StreamingNormalize(
self._batch_env.observ[0], center=True, scale=True, clip=5,
name='normalize_observ')
self._reward_filter = normalize.StreamingNormalize(
self._batch_env.reward[0], center=False, scale=True, clip=10,
name='normalize_reward')
# Memory stores tuple of observ, action, mean, logstd, reward.
template = (
self._batch_env.observ[0], self._batch_env.action[0],
self._batch_env.action[0], self._batch_env.action[0],
self._batch_env.reward[0])
self._memory = memory.EpisodeMemory(
template, config.update_every, config.max_length, 'memory')
self._memory_index = tf.Variable(0, False)
use_gpu = self._config.use_gpu and utility.available_gpus()
with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
# Create network variables for later calls to reuse.
self._network(
tf.zeros_like(self._batch_env.observ)[:, None],
tf.ones(len(self._batch_env)), reuse=None)
cell = self._config.network(self._batch_env.action.shape[1].value)
with tf.variable_scope('ppo_temporary'):
self._episodes = memory.EpisodeMemory(
template, len(batch_env), config.max_length, 'episodes')
self._last_state = utility.create_nested_vars(
cell.zero_state(len(batch_env), tf.float32))
self._last_action = tf.Variable(
tf.zeros_like(self._batch_env.action), False, name='last_action')
self._last_mean = tf.Variable(
tf.zeros_like(self._batch_env.action), False, name='last_mean')
self._last_logstd = tf.Variable(
tf.zeros_like(self._batch_env.action), False, name='last_logstd')
self._penalty = tf.Variable(
self._config.kl_init_penalty, False, dtype=tf.float32)
self._policy_optimizer = self._config.policy_optimizer(
self._config.policy_lr, name='policy_optimizer')
self._value_optimizer = self._config.value_optimizer(
self._config.value_lr, name='value_optimizer')
def begin_episode(self, agent_indices):
"""Reset the recurrent states and stored episode.
Args:
agent_indices: 1D tensor of batch indices for agents starting an episode.
Returns:
Summary tensor.
"""
with tf.name_scope('begin_episode/'):
reset_state = utility.reinit_nested_vars(self._last_state, agent_indices)
reset_buffer = self._episodes.clear(agent_indices)
with tf.control_dependencies([reset_state, reset_buffer]):
return tf.constant('')
def perform(self, observ):
"""Compute batch of actions and a summary for a batch of observation.
Args:
observ: Tensor of a batch of observations for all agents.
Returns:
Tuple of action batch tensor and summary tensor.
"""
with tf.name_scope('perform/'):
observ = self._observ_filter.transform(observ)
network = self._network(
observ[:, None], tf.ones(observ.shape[0]), self._last_state)
action = tf.cond(
self._is_training, network.policy.sample, lambda: network.mean)
logprob = network.policy.log_prob(action)[:, 0]
# pylint: disable=g-long-lambda
summary = tf.cond(self._should_log, lambda: tf.summary.merge([
tf.summary.histogram('mean', network.mean[:, 0]),
tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
tf.summary.histogram('action', action[:, 0]),
tf.summary.histogram('logprob', logprob)]), str)
# Remember current policy to append to memory in the experience callback.
with tf.control_dependencies([
utility.assign_nested_vars(self._last_state, network.state),
self._last_action.assign(action[:, 0]),
self._last_mean.assign(network.mean[:, 0]),
self._last_logstd.assign(network.logstd[:, 0])]):
return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
def experience(self, observ, action, reward, unused_done, unused_nextob):
"""Process the transition tuple of the current step.
When training, add the current transition tuple to the memory and update
the streaming statistics for observations and rewards. A summary string is
returned if requested at this step.
Args:
observ: Batch tensor of observations.
action: Batch tensor of actions.
reward: Batch tensor of rewards.
unused_done: Batch tensor of done flags.
unused_nextob: Batch tensor of successor observations.
Returns:
Summary tensor.
"""
with tf.name_scope('experience/'):
return tf.cond(
self._is_training,
lambda: self._define_experience(observ, action, reward), str)
def _define_experience(self, observ, action, reward):
"""Implement the branch of experience() entered during training."""
update_filters = tf.summary.merge([
self._observ_filter.update(observ),
self._reward_filter.update(reward)])
with tf.control_dependencies([update_filters]):
if self._config.train_on_agent_action:
# NOTE: Doesn't seem to change much.
action = self._last_action
batch = observ, action, self._last_mean, self._last_logstd, reward
append = self._episodes.append(batch, tf.range(len(self._batch_env)))
with tf.control_dependencies([append]):
norm_observ = self._observ_filter.transform(observ)
norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))
# pylint: disable=g-long-lambda
summary = tf.cond(self._should_log, lambda: tf.summary.merge([
update_filters,
self._observ_filter.summary(),
self._reward_filter.summary(),
tf.summary.scalar('memory_size', self._memory_index),
tf.summary.histogram('normalized_observ', norm_observ),
tf.summary.histogram('action', self._last_action),
tf.summary.scalar('normalized_reward', norm_reward)]), str)
return summary
def end_episode(self, agent_indices):
"""Add episodes to the memory and perform update steps if memory is full.
During training, add the collected episodes of the batch indices that
finished their episode to the memory. If the memory is full, train on it,
and then clear the memory. A summary string is returned if requested at
this step.
Args:
agent_indices: 1D tensor of batch indices for agents ending an episode.
Returns:
Summary tensor.
"""
with tf.name_scope('end_episode/'):
return tf.cond(
self._is_training,
lambda: self._define_end_episode(agent_indices), str)
def _define_end_episode(self, agent_indices):
"""Implement the branch of end_episode() entered during training."""
episodes, length = self._episodes.data(agent_indices)
space_left = self._config.update_every - self._memory_index
use_episodes = tf.range(tf.minimum(
tf.shape(agent_indices)[0], space_left))
episodes = [tf.gather(elem, use_episodes) for elem in episodes]
append = self._memory.replace(
episodes, tf.gather(length, use_episodes),
use_episodes + self._memory_index)
with tf.control_dependencies([append]):
inc_index = self._memory_index.assign_add(tf.shape(use_episodes)[0])
with tf.control_dependencies([inc_index]):
memory_full = self._memory_index >= self._config.update_every
return tf.cond(memory_full, self._training, str)
def _training(self):
"""Perform multiple training iterations of both policy and value baseline.
Trains on the episodes collected in the memory and resets the memory
afterwards. Always returns a summary string.
Returns:
Summary tensor.
"""
with tf.name_scope('training'):
assert_full = tf.assert_equal(
self._memory_index, self._config.update_every)
with tf.control_dependencies([assert_full]):
data = self._memory.data()
(observ, action, old_mean, old_logstd, reward), length = data
with tf.control_dependencies([tf.assert_greater(length, 0)]):
length = tf.identity(length)
observ = self._observ_filter.transform(observ)
reward = self._reward_filter.transform(reward)
policy_summary = self._update_policy(
observ, action, old_mean, old_logstd, reward, length)
with tf.control_dependencies([policy_summary]):
value_summary = self._update_value(observ, reward, length)
with tf.control_dependencies([value_summary]):
penalty_summary = self._adjust_penalty(
observ, old_mean, old_logstd, length)
with tf.control_dependencies([penalty_summary]):
clear_memory = tf.group(
self._memory.clear(), self._memory_index.assign(0))
with tf.control_dependencies([clear_memory]):
weight_summary = utility.variable_summaries(
tf.trainable_variables(), self._config.weight_summaries)
return tf.summary.merge([
policy_summary, value_summary, penalty_summary, weight_summary])
def _update_value(self, observ, reward, length):
"""Perform multiple update steps of the value baseline.
Only the summary of a single iteration can be returned, so we choose the one
from halfway through the iterations.
Args:
observ: Sequences of observations.
reward: Sequences of reward.
length: Batch of sequence lengths.
Returns:
Summary tensor.
"""
with tf.name_scope('update_value'):
loss, summary = tf.scan(
lambda _1, _2: self._update_value_step(observ, reward, length),
tf.range(self._config.update_epochs_value),
[0., ''], parallel_iterations=1)
print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'value loss: ')
with tf.control_dependencies([loss, print_loss]):
return summary[self._config.update_epochs_value // 2]
def _update_value_step(self, observ, reward, length):
"""Compute the current value loss and perform a gradient update step.
Args:
observ: Sequences of observations.
reward: Sequences of reward.
length: Batch of sequence lengths.
Returns:
Tuple of loss tensor and summary tensor.
"""
loss, summary = self._value_loss(observ, reward, length)
gradients, variables = (
zip(*self._value_optimizer.compute_gradients(loss)))
optimize = self._value_optimizer.apply_gradients(
zip(gradients, variables))
summary = tf.summary.merge([
summary,
tf.summary.scalar('gradient_norm', tf.global_norm(gradients)),
utility.gradient_summaries(
zip(gradients, variables), dict(value=r'.*'))])
with tf.control_dependencies([optimize]):
return [tf.identity(loss), tf.identity(summary)]
def _value_loss(self, observ, reward, length):
"""Compute the loss function for the value baseline.
The value loss is the difference between empirical and approximated returns
over the collected episodes. Returns the loss tensor and a summary string.
Args:
observ: Sequences of observations.
reward: Sequences of reward.
length: Batch of sequence lengths.
Returns:
Tuple of loss tensor and summary tensor.
"""
with tf.name_scope('value_loss'):
value = self._network(observ, length).value
return_ = utility.discounted_return(
reward, length, self._config.discount)
advantage = return_ - value
value_loss = 0.5 * self._mask(advantage ** 2, length)
summary = tf.summary.merge([
tf.summary.histogram('value_loss', value_loss),
tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))])
value_loss = tf.reduce_mean(value_loss)
return tf.check_numerics(value_loss, 'value_loss'), summary
def _update_policy(
self, observ, action, old_mean, old_logstd, reward, length):
"""Perform multiple update steps of the policy.
The advantage is computed once at the beginning and shared across
iterations. Only the summary of a single iteration can be returned, so we
choose the one from halfway through the iterations.
Args:
observ: Sequences of observations.
action: Sequences of actions.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
reward: Sequences of rewards.
length: Batch of sequence lengths.
Returns:
Summary tensor.
"""
with tf.name_scope('update_policy'):
return_ = utility.discounted_return(
reward, length, self._config.discount)
value = self._network(observ, length).value
if self._config.gae_lambda:
advantage = utility.lambda_return(
reward, value, length, self._config.discount,
self._config.gae_lambda)
else:
advantage = return_ - value
mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
advantage = tf.Print(
advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)],
'return and value: ')
advantage = tf.Print(
advantage, [tf.reduce_mean(advantage)],
'normalized advantage: ')
# pylint: disable=g-long-lambda
loss, summary = tf.scan(
lambda _1, _2: self._update_policy_step(
observ, action, old_mean, old_logstd, advantage, length),
tf.range(self._config.update_epochs_policy),
[0., ''], parallel_iterations=1)
print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'policy loss: ')
with tf.control_dependencies([loss, print_loss]):
return summary[self._config.update_epochs_policy // 2]
def _update_policy_step(
self, observ, action, old_mean, old_logstd, advantage, length):
"""Compute the current policy loss and perform a gradient update step.
Args:
observ: Sequences of observations.
action: Sequences of actions.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
advantage: Sequences of advantages.
length: Batch of sequence lengths.
Returns:
Tuple of loss tensor and summary tensor.
"""
network = self._network(observ, length)
loss, summary = self._policy_loss(
network.mean, network.logstd, old_mean, old_logstd, action,
advantage, length)
gradients, variables = (
zip(*self._policy_optimizer.compute_gradients(loss)))
optimize = self._policy_optimizer.apply_gradients(
zip(gradients, variables))
summary = tf.summary.merge([
summary,
tf.summary.scalar('gradient_norm', tf.global_norm(gradients)),
utility.gradient_summaries(
zip(gradients, variables), dict(policy=r'.*'))])
with tf.control_dependencies([optimize]):
return [tf.identity(loss), tf.identity(summary)]
def _policy_loss(
self, mean, logstd, old_mean, old_logstd, action, advantage, length):
"""Compute the policy loss composed of multiple components.
1. The policy gradient loss is importance sampled from the data-collecting
policy at the beginning of the training phase.
2. The second term is a KL penalty between the policy at the beginning of the
training phase and the current policy.
3. Additionally, if this KL divergence already exceeds twice the target
amount, a strong quadratic penalty discourages further divergence.
Args:
mean: Sequences of action means of the current policy.
logstd: Sequences of action log stddevs of the current policy.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
action: Sequences of actions.
advantage: Sequences of advantages.
length: Batch of sequence lengths.
Returns:
Tuple of loss tensor and summary tensor.
"""
with tf.name_scope('policy_loss'):
entropy = utility.diag_normal_entropy(mean, logstd)
kl = tf.reduce_mean(self._mask(utility.diag_normal_kl(
old_mean, old_logstd, mean, logstd), length), 1)
policy_gradient = tf.exp(
utility.diag_normal_logpdf(mean, logstd, action) -
utility.diag_normal_logpdf(old_mean, old_logstd, action))
surrogate_loss = -tf.reduce_mean(self._mask(
policy_gradient * tf.stop_gradient(advantage), length), 1)
kl_penalty = self._penalty * kl
cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
cutoff_count = tf.reduce_sum(
tf.cast(kl > cutoff_threshold, tf.int32))
with tf.control_dependencies([tf.cond(
cutoff_count > 0,
lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int)]):
kl_cutoff = (
self._config.kl_cutoff_coef *
tf.cast(kl > cutoff_threshold, tf.float32) *
(kl - cutoff_threshold) ** 2)
policy_loss = surrogate_loss + kl_penalty + kl_cutoff
summary = tf.summary.merge([
tf.summary.histogram('entropy', entropy),
tf.summary.histogram('kl', kl),
tf.summary.histogram('surrogate_loss', surrogate_loss),
tf.summary.histogram('kl_penalty', kl_penalty),
tf.summary.histogram('kl_cutoff', kl_cutoff),
tf.summary.histogram('kl_penalty_combined', kl_penalty + kl_cutoff),
tf.summary.histogram('policy_loss', policy_loss),
tf.summary.scalar('avg_surr_loss', tf.reduce_mean(surrogate_loss)),
tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))])
policy_loss = tf.reduce_mean(policy_loss, 0)
return tf.check_numerics(policy_loss, 'policy_loss'), summary
def _adjust_penalty(self, observ, old_mean, old_logstd, length):
"""Adjust the KL policy between the behavioral and current policy.
Compute how much the policy actually changed during the multiple
update steps. Adjust the penalty strength for the next training phase if we
overshot or undershot the target divergence too much.
Args:
observ: Sequences of observations.
old_mean: Sequences of action means of the behavioral policy.
old_logstd: Sequences of action log stddevs of the behavioral policy.
length: Batch of sequence lengths.
Returns:
Summary tensor.
"""
with tf.name_scope('adjust_penalty'):
network = self._network(observ, length)
assert_change = tf.assert_equal(
tf.reduce_all(tf.equal(network.mean, old_mean)), False,
message='policy should change')
print_penalty = tf.Print(0, [self._penalty], 'current penalty: ')
with tf.control_dependencies([assert_change, print_penalty]):
kl_change = tf.reduce_mean(self._mask(utility.diag_normal_kl(
old_mean, old_logstd, network.mean, network.logstd), length))
kl_change = tf.Print(kl_change, [kl_change], 'kl change: ')
maybe_increase = tf.cond(
kl_change > 1.3 * self._config.kl_target,
# pylint: disable=g-long-lambda
lambda: tf.Print(self._penalty.assign(
self._penalty * 1.5), [0], 'increase penalty '),
float)
maybe_decrease = tf.cond(
kl_change < 0.7 * self._config.kl_target,
# pylint: disable=g-long-lambda
lambda: tf.Print(self._penalty.assign(
self._penalty / 1.5), [0], 'decrease penalty '),
float)
with tf.control_dependencies([maybe_increase, maybe_decrease]):
return tf.summary.merge([
tf.summary.scalar('kl_change', kl_change),
tf.summary.scalar('penalty', self._penalty)])
def _mask(self, tensor, length):
"""Set padding elements of a batch of sequences to zero.
Useful to then safely sum along the time dimension.
Args:
tensor: Tensor of sequences.
length: Batch of sequence lengths.
Returns:
Masked sequences.
"""
with tf.name_scope('mask'):
range_ = tf.range(tensor.shape[1].value)
mask = tf.cast(range_[None, :] < length[:, None], tf.float32)
masked = tensor * mask
return tf.check_numerics(masked, 'masked')
def _network(self, observ, length=None, state=None, reuse=True):
"""Compute the network output for a batched sequence of observations.
Optionally, the initial state can be specified. The weights should be
reused for all calls, except for the first one. Output is a named tuple
containing the policy as a TensorFlow distribution, the policy mean and log
standard deviation, the approximated state value, and the new recurrent
state.
Args:
observ: Sequences of observations.
length: Batch of sequence lengths.
state: Batch of initial recurrent states.
reuse: Python boolean whether to reuse previous variables.
Returns:
NetworkOutput tuple.
"""
with tf.variable_scope('network', reuse=reuse):
observ = tf.convert_to_tensor(observ)
use_gpu = self._config.use_gpu and utility.available_gpus()
with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
observ = tf.check_numerics(observ, 'observ')
cell = self._config.network(self._batch_env.action.shape[1].value)
(mean, logstd, value), state = tf.nn.dynamic_rnn(
cell, observ, length, state, tf.float32, swap_memory=True)
mean = tf.check_numerics(mean, 'mean')
logstd = tf.check_numerics(logstd, 'logstd')
value = tf.check_numerics(value, 'value')
policy = tf.contrib.distributions.MultivariateNormalDiag(
mean, tf.exp(logstd))
return _NetworkOutput(policy, mean, logstd, value, state)
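The adaptive KL penalty in _adjust_penalty() above follows a simple rule: grow the coefficient when the measured KL change overshoots the target, shrink it when it undershoots. A plain-Python sketch of that rule (illustrative only, not part of the commit):

def adjust_penalty(penalty, kl_change, kl_target):
  """Mirror of the 1.5x increase/decrease rule used by _adjust_penalty()."""
  if kl_change > 1.3 * kl_target:
    return penalty * 1.5  # Policy moved too far; penalize divergence more.
  if kl_change < 0.7 * kl_target:
    return penalty / 1.5  # Policy barely moved; relax the penalty.
  return penalty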

View File

@ -0,0 +1,152 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Memory that stores episodes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class EpisodeMemory(object):
"""Memory that stores episodes."""
def __init__(self, template, capacity, max_length, scope):
"""Create a memory that stores episodes.
Each transition tuple consists of quantities specified by the template.
These quantities would typically be observations, actions, rewards, and
done indicators.
Args:
template: List of tensors to derive shapes and dtypes of each transition.
capacity: Number of episodes, or rows, held by the memory.
max_length: Allocated sequence length for the episodes.
scope: Variable scope to use for internal variables.
"""
self._capacity = capacity
self._max_length = max_length
with tf.variable_scope(scope) as scope:
self._scope = scope
self._length = tf.Variable(tf.zeros(capacity, tf.int32), False)
self._buffers = [
tf.Variable(tf.zeros(
[capacity, max_length] + elem.shape.as_list(),
elem.dtype), False)
for elem in template]
def length(self, rows=None):
"""Tensor holding the current length of episodes.
Args:
rows: Episodes to select length from, defaults to all.
Returns:
Batch tensor of sequence lengths.
"""
rows = tf.range(self._capacity) if rows is None else rows
return tf.gather(self._length, rows)
def append(self, transitions, rows=None):
"""Append a batch of transitions to rows of the memory.
Args:
transitions: Tuple of transition quantities with batch dimension.
rows: Episodes to append to, defaults to all.
Returns:
Operation.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
assert_capacity = tf.assert_less(
rows, self._capacity,
message='capacity exceeded')
with tf.control_dependencies([assert_capacity]):
assert_max_length = tf.assert_less(
tf.gather(self._length, rows), self._max_length,
message='max length exceeded')
append_ops = []
with tf.control_dependencies([assert_max_length]):
for buffer_, elements in zip(self._buffers, transitions):
timestep = tf.gather(self._length, rows)
indices = tf.stack([rows, timestep], 1)
append_ops.append(tf.scatter_nd_update(buffer_, indices, elements))
with tf.control_dependencies(append_ops):
episode_mask = tf.reduce_sum(tf.one_hot(
rows, self._capacity, dtype=tf.int32), 0)
return self._length.assign_add(episode_mask)
def replace(self, episodes, length, rows=None):
"""Replace full episodes.
Args:
episodes: Tuple of transition quantities with batch and time dimensions.
length: Batch of sequence lengths.
rows: Episodes to replace, defaults to all.
Returns:
Operation.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
assert_capacity = tf.assert_less(
rows, self._capacity, message='capacity exceeded')
with tf.control_dependencies([assert_capacity]):
assert_max_length = tf.assert_less_equal(
length, self._max_length, message='max length exceeded')
replace_ops = []
with tf.control_dependencies([assert_max_length]):
for buffer_, elements in zip(self._buffers, episodes):
replace_op = tf.scatter_update(buffer_, rows, elements)
replace_ops.append(replace_op)
with tf.control_dependencies(replace_ops):
return tf.scatter_update(self._length, rows, length)
def data(self, rows=None):
"""Access a batch of episodes from the memory.
Padding elements after the length of each episode are unspecified and might
contain old data.
Args:
rows: Episodes to select, defaults to all.
Returns:
Tuple containing a tuple of transition quantities with batch and time
dimensions, and a batch of sequence lengths.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
episode = [tf.gather(buffer_, rows) for buffer_ in self._buffers]
length = tf.gather(self._length, rows)
return episode, length
def clear(self, rows=None):
"""Reset episodes in the memory.
Internally, this only sets their lengths to zero. The memory entries will
be overridden by future calls to append() or replace().
Args:
rows: Episodes to clear, defaults to all.
Returns:
Operation.
"""
rows = tf.range(self._capacity) if rows is None else rows
assert rows.shape.ndims == 1
return tf.scatter_update(self._length, rows, tf.zeros_like(rows))
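A NumPy analogue of the append() semantics above, showing that each transition is written at position [row, current_length[row]] and the per-row length is then incremented (illustrative only; shapes are hypothetical):

import numpy as np

capacity, max_length, observ_size = 4, 10, 3
buffer_ = np.zeros((capacity, max_length, observ_size))
length = np.zeros(capacity, dtype=np.int32)

def append(observations, rows):
  buffer_[rows, length[rows]] = observations  # Write at the current episode step.
  length[rows] += 1                           # Then advance those episodes.

append(np.ones((2, observ_size)), rows=np.array([0, 2]))
assert list(length) == [1, 0, 1, 0]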

View File

@ -0,0 +1,168 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize tensors based on streaming estimates of mean and variance."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class StreamingNormalize(object):
"""Normalize tensors based on streaming estimates of mean and variance."""
def __init__(
self, template, center=True, scale=True, clip=10, name='normalize'):
"""Normalize tensors based on streaming estimates of mean and variance.
Centering the value, scaling it by the standard deviation, and clipping
outlier values are optional.
Args:
template: Example tensor providing shape and dtype of the value to track.
center: Python boolean indicating whether to subtract mean from values.
scale: Python boolean indicating whether to scale values by stddev.
clip: Symmetric bound for clipping normalized values; falsy disables clipping.
name: Parent scope of operations provided by this class.
"""
self._center = center
self._scale = scale
self._clip = clip
self._name = name
with tf.name_scope(name):
self._count = tf.Variable(0, False)
self._mean = tf.Variable(tf.zeros_like(template), False)
self._var_sum = tf.Variable(tf.zeros_like(template), False)
def transform(self, value):
"""Normalize a single or batch tensor.
Applies the activated transformations in the constructor using current
estimates of mean and variance.
Args:
value: Batch or single value tensor.
Returns:
Normalized batch or single value tensor.
"""
with tf.name_scope(self._name + '/transform'):
no_batch_dim = value.shape.ndims == self._mean.shape.ndims
if no_batch_dim:
# Add a batch dimension if necessary.
value = value[None, ...]
if self._center:
value -= self._mean[None, ...]
if self._scale:
# We cannot scale before seeing at least two samples.
value /= tf.cond(
self._count > 1, lambda: self._std() + 1e-8,
lambda: tf.ones_like(self._var_sum))[None]
if self._clip:
value = tf.clip_by_value(value, -self._clip, self._clip)
# Remove batch dimension if necessary.
if no_batch_dim:
value = value[0]
return tf.check_numerics(value, 'value')
def update(self, value):
"""Update the mean and variance estimates.
Args:
value: Batch or single value tensor.
Returns:
Summary tensor.
"""
with tf.name_scope(self._name + '/update'):
if value.shape.ndims == self._mean.shape.ndims:
# Add a batch dimension if necessary.
value = value[None, ...]
count = tf.shape(value)[0]
with tf.control_dependencies([self._count.assign_add(count)]):
step = tf.cast(self._count, tf.float32)
mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0)
new_mean = self._mean + mean_delta / step
new_mean = tf.cond(self._count > 1, lambda: new_mean, lambda: value[0])
var_delta = (
value - self._mean[None, ...]) * (value - new_mean[None, ...])
new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0)
with tf.control_dependencies([new_mean, new_var_sum]):
update = self._mean.assign(new_mean), self._var_sum.assign(new_var_sum)
with tf.control_dependencies(update):
if value.shape.ndims == 1:
value = tf.reduce_mean(value)
return self._summary('value', tf.reduce_mean(value))
def reset(self):
"""Reset the estimates of mean and variance.
Resets the full state of this class.
Returns:
Operation.
"""
with tf.name_scope(self._name + '/reset'):
return tf.group(
self._count.assign(0),
self._mean.assign(tf.zeros_like(self._mean)),
self._var_sum.assign(tf.zeros_like(self._var_sum)))
def summary(self):
"""Summary string of mean and standard deviation.
Returns:
Summary tensor.
"""
with tf.name_scope(self._name + '/summary'):
mean_summary = tf.cond(
self._count > 0, lambda: self._summary('mean', self._mean), str)
std_summary = tf.cond(
self._count > 1, lambda: self._summary('stddev', self._std()), str)
return tf.summary.merge([mean_summary, std_summary])
def _std(self):
"""Computes the current estimate of the standard deviation.
Note that the standard deviation is not defined until at least two samples
have been seen.
Returns:
Tensor of the current standard deviation.
"""
variance = tf.cond(
self._count > 1,
lambda: self._var_sum / tf.cast(self._count - 1, tf.float32),
lambda: tf.ones_like(self._var_sum) * float('nan'))
# The epsilon corrects for small negative variance values caused by
# the algorithm. It was empirically chosen to work with all environments
# tested.
return tf.sqrt(variance + 1e-4)
def _summary(self, name, tensor):
"""Create a scalar or histogram summary matching the rank of the tensor.
Args:
name: Name for the summary.
tensor: Tensor to summarize.
Returns:
Summary tensor.
"""
if tensor.shape.ndims == 0:
return tf.summary.scalar(name, tensor)
else:
return tf.summary.histogram(name, tensor)
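The streaming estimate in update() above is a batched Welford-style recurrence. A small NumPy sketch of the same arithmetic for scalar values (illustrative only):

import numpy as np

count, mean, var_sum = 0, 0.0, 0.0
for batch in (np.array([1.0, 2.0]), np.array([3.0])):
  count += batch.size
  new_mean = mean + np.sum(batch - mean) / count
  var_sum += np.sum((batch - mean) * (batch - new_mean))
  mean = new_mean
std = np.sqrt(var_sum / (count - 1))  # Only defined once count > 1.
print(mean, std)  # 2.0 1.0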

View File

@ -0,0 +1,222 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for the PPO algorithm."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import re
import tensorflow as tf
from tensorflow.python.client import device_lib
def create_nested_vars(tensors):
"""Create variables matching a nested tuple of tensors.
Args:
tensors: Nested tuple of list of tensors.
Returns:
Nested tuple or list of variables.
"""
if isinstance(tensors, (tuple, list)):
return type(tensors)(create_nested_vars(tensor) for tensor in tensors)
return tf.Variable(tensors, False)
def reinit_nested_vars(variables, indices=None):
"""Reset all variables in a nested tuple to zeros.
Args:
variables: Nested tuple or list of variables.
indices: Indices along the first dimension to reset, defaults to all.
Returns:
Operation.
"""
if isinstance(variables, (tuple, list)):
return tf.group(*[
reinit_nested_vars(variable, indices) for variable in variables])
if indices is None:
return variables.assign(tf.zeros_like(variables))
else:
zeros = tf.zeros([tf.shape(indices)[0]] + variables.shape[1:].as_list())
return tf.scatter_update(variables, indices, zeros)
def assign_nested_vars(variables, tensors):
"""Assign tensors to matching nested tuple of variables.
Args:
variables: Nested tuple or list of variables to update.
tensors: Nested tuple or list of tensors to assign.
Returns:
Operation.
"""
if isinstance(variables, (tuple, list)):
return tf.group(*[
assign_nested_vars(variable, tensor)
for variable, tensor in zip(variables, tensors)])
return variables.assign(tensors)
def discounted_return(reward, length, discount):
"""Discounted Monte-Carlo returns."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
return_ = tf.reverse(tf.transpose(tf.scan(
lambda agg, cur: cur + discount * agg,
tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
tf.zeros_like(reward[:, -1]), 1, False), [1, 0]), [1])
return tf.check_numerics(tf.stop_gradient(return_), 'return')
def fixed_step_return(reward, value, length, discount, window):
"""N-step discounted return."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
return_ = tf.zeros_like(reward)
for _ in range(window):
return_ += reward
reward = discount * tf.concat(
[reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
return_ += discount ** window * tf.concat(
[value[:, window:], tf.zeros_like(value[:, -window:])], 1)
return tf.check_numerics(tf.stop_gradient(mask * return_), 'return')
def lambda_return(reward, value, length, discount, lambda_):
"""TD-lambda returns."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
sequence = mask * reward + discount * value * (1 - lambda_)
discount = mask * discount * lambda_
sequence = tf.stack([sequence, discount], 2)
return_ = tf.reverse(tf.transpose(tf.scan(
lambda agg, cur: cur[0] + cur[1] * agg,
tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]),
tf.zeros_like(value[:, -1]), 1, False), [1, 0]), [1])
return tf.check_numerics(tf.stop_gradient(return_), 'return')
def lambda_advantage(reward, value, length, discount):
"""Generalized Advantage Estimation."""
timestep = tf.range(reward.shape[1].value)
mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
delta = reward + discount * next_value - value
advantage = tf.reverse(tf.transpose(tf.scan(
lambda agg, cur: cur + discount * agg,
tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]),
tf.zeros_like(delta[:, -1]), 1, False), [1, 0]), [1])
return tf.check_numerics(tf.stop_gradient(advantage), 'advantage')
def diag_normal_kl(mean0, logstd0, mean1, logstd1):
"""Epirical KL divergence of two normals with diagonal covariance."""
logstd0_2, logstd1_2 = 2 * logstd0, 2 * logstd1
return 0.5 * (
tf.reduce_sum(tf.exp(logstd0_2 - logstd1_2), -1) +
tf.reduce_sum((mean1 - mean0) ** 2 / tf.exp(logstd1_2), -1) +
tf.reduce_sum(logstd1_2, -1) - tf.reduce_sum(logstd0_2, -1) -
mean0.shape[-1].value)
def diag_normal_logpdf(mean, logstd, loc):
"""Log density of a normal with diagonal covariance."""
constant = -0.5 * math.log(2 * math.pi) - logstd
value = -0.5 * ((loc - mean) / tf.exp(logstd)) ** 2
return tf.reduce_sum(constant + value, -1)
def diag_normal_entropy(mean, logstd):
"""Empirical entropy of a normal with diagonal covariance."""
constant = mean.shape[-1].value * math.log(2 * math.pi * math.e)
return (constant + tf.reduce_sum(2 * logstd, 1)) / 2
def available_gpus():
"""List of GPU device names detected by TensorFlow."""
local_device_protos = device_lib.list_local_devices()
return [x.name for x in local_device_protos if x.device_type == 'GPU']
def gradient_summaries(grad_vars, groups=None, scope='gradients'):
"""Create histogram summaries of the gradient.
Summaries can be grouped via regexes matching variable names.
Args:
grad_vars: List of (gradient, variable) tuples as returned by optimizers.
groups: Mapping of name to regex for grouping summaries.
scope: Name scope for this operation.
Returns:
Summary tensor.
"""
groups = groups or {r'all': r'.*'}
grouped = collections.defaultdict(list)
for grad, var in grad_vars:
if grad is None:
continue
for name, pattern in groups.items():
if re.match(pattern, var.name):
name = re.sub(pattern, name, var.name)
grouped[name].append(grad)
for name in groups:
if name not in grouped:
tf.logging.warn("No variables matching '{}' group.".format(name))
summaries = []
for name, grads in grouped.items():
grads = [tf.reshape(grad, [-1]) for grad in grads]
grads = tf.concat(grads, 0)
summaries.append(tf.summary.histogram(scope + '/' + name, grads))
return tf.summary.merge(summaries)
def variable_summaries(vars_, groups=None, scope='weights'):
"""Create histogram summaries for the provided variables.
Summaries can be grouped via regexes matching variable names.
Args:
vars_: List of variables to summarize.
groups: Mapping of name to regex for grouping summaries.
scope: Name scope for this operation.
Returns:
Summary tensor.
"""
groups = groups or {r'all': r'.*'}
grouped = collections.defaultdict(list)
for var in vars_:
for name, pattern in groups.items():
if re.match(pattern, var.name):
name = re.sub(pattern, name, var.name)
grouped[name].append(var)
for name in groups:
if name not in grouped:
tf.logging.warn("No variables matching '{}' group.".format(name))
summaries = []
for name, vars_ in grouped.items():
vars_ = [tf.reshape(var, [-1]) for var in vars_]
vars_ = tf.concat(vars_, 0)
summaries.append(tf.summary.histogram(scope + '/' + name, vars_))
return tf.summary.merge(summaries)
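discounted_return() above is a reversed scan of reward[t] + discount * accumulator over the time dimension. For a single episode, the same computation in NumPy looks like this (illustrative only):

import numpy as np

def discounted_return(reward, discount):
  return_ = np.zeros_like(reward)
  accumulator = 0.0
  for t in reversed(range(len(reward))):
    accumulator = reward[t] + discount * accumulator
    return_[t] = accumulator
  return return_

print(discounted_return(np.array([1.0, 1.0, 1.0]), discount=0.9))  # [2.71 1.9 1.]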

View File

@ -0,0 +1,23 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Executable scripts for reinforcement learning."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from . import train
from . import utility
from . import visualize

View File

@ -0,0 +1,128 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Example configurations using the PPO algorithm."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# pylint: disable=unused-variable
from pybullet_envs.minitaur.agents import ppo
from pybullet_envs.minitaur.agents.scripts import networks
def default():
"""Default configuration for PPO."""
# General
algorithm = ppo.PPOAlgorithm
num_agents = 10
eval_episodes = 25
use_gpu = False
# Network
network = networks.ForwardGaussianPolicy
weight_summaries = dict(
all=r'.*',
policy=r'.*/policy/.*',
value=r'.*/value/.*')
policy_layers = 200, 100
value_layers = 200, 100
init_mean_factor = 0.05
init_logstd = -1
# Optimization
update_every = 25
policy_optimizer = 'AdamOptimizer'
value_optimizer = 'AdamOptimizer'
update_epochs_policy = 50
update_epochs_value = 50
policy_lr = 1e-4
value_lr = 3e-4
# Losses
discount = 0.985
kl_target = 1e-2
kl_cutoff_factor = 2
kl_cutoff_coef = 1000
kl_init_penalty = 1
return locals()
def pendulum():
"""Configuration for the pendulum classic control task."""
locals().update(default())
# Environment
env = 'Pendulum-v0'
max_length = 200
steps = 1e6 # 1M
return locals()
def cheetah():
"""Configuration for MuJoCo's half cheetah task."""
locals().update(default())
# Environment
env = 'HalfCheetah-v1'
max_length = 1000
steps = 1e7 # 10M
return locals()
def walker():
"""Configuration for MuJoCo's walker task."""
locals().update(default())
# Environment
env = 'Walker2d-v1'
max_length = 1000
steps = 1e7 # 10M
return locals()
def reacher():
"""Configuration for MuJoCo's reacher task."""
locals().update(default())
# Environment
env = 'Reacher-v1'
max_length = 1000
steps = 1e7 # 10M
return locals()
def hopper():
"""Configuration for MuJoCo's hopper task."""
locals().update(default())
# Environment
env = 'Hopper-v1'
max_length = 1000
steps = 2e7 # 20M
return locals()
def ant():
"""Configuration for MuJoCo's ant task."""
locals().update(default())
# Environment
env = 'Ant-v1'
max_length = 1000
steps = 5e7 # 50M
return locals()
def humanoid():
"""Configuration for MuJoCo's humanoid task."""
locals().update(default())
# Environment
env = 'Humanoid-v1'
max_length = 1000
steps = 5e7 # 50M
return locals()
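Each configuration function returns its locals(), which train.py wraps in an AttrDict. A hypothetical configuration for the minitaur reactive environment could follow the same pattern; the name minitaur_reactive and its settings are illustrative, loosely mirroring the checked-in config.yaml:

import functools
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts.configs import default
from pybullet_envs.minitaur.envs import minitaur_reactive_env

def minitaur_reactive():
  """Hypothetical configuration for the minitaur reactive environment."""
  locals().update(default())
  # Environment
  env = functools.partial(
      minitaur_reactive_env.MinitaurReactiveEnv, render=False)
  max_length = 1000
  steps = 7e6  # 7M
  return locals()

config = tools.AttrDict(minitaur_reactive())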

View File

@ -0,0 +1,167 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Networks for the PPO algorithm defined as recurrent cells."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
_MEAN_WEIGHTS_INITIALIZER = tf.contrib.layers.variance_scaling_initializer(
factor=0.1)
_LOGSTD_INITIALIZER = tf.random_normal_initializer(-1, 1e-10)
class LinearGaussianPolicy(tf.contrib.rnn.RNNCell):
"""Indepent linear network with a tanh at the end for policy and feedforward network for the value.
The policy network outputs the mean action and the log standard deviation
is learned as indepent parameter vector.
"""
def __init__(self,
policy_layers,
value_layers,
action_size,
mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
logstd_initializer=_LOGSTD_INITIALIZER):
self._policy_layers = policy_layers
self._value_layers = value_layers
self._action_size = action_size
self._mean_weights_initializer = mean_weights_initializer
self._logstd_initializer = logstd_initializer
@property
def state_size(self):
unused_state_size = 1
return unused_state_size
@property
def output_size(self):
return (self._action_size, self._action_size, tf.TensorShape([]))
def __call__(self, observation, state):
with tf.variable_scope('policy'):
x = tf.contrib.layers.flatten(observation)
mean = tf.contrib.layers.fully_connected(
x,
self._action_size,
tf.tanh,
weights_initializer=self._mean_weights_initializer)
logstd = tf.get_variable('logstd', mean.shape[1:], tf.float32,
self._logstd_initializer)
logstd = tf.tile(logstd[None, ...],
[tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
with tf.variable_scope('value'):
x = tf.contrib.layers.flatten(observation)
for size in self._value_layers:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
return (mean, logstd, value), state
class ForwardGaussianPolicy(tf.contrib.rnn.RNNCell):
"""Independent feed forward networks for policy and value.
The policy network outputs the mean action and the log standard deviation
is learned as independent parameter vector.
"""
def __init__(
self, policy_layers, value_layers, action_size,
mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
logstd_initializer=_LOGSTD_INITIALIZER):
self._policy_layers = policy_layers
self._value_layers = value_layers
self._action_size = action_size
self._mean_weights_initializer = mean_weights_initializer
self._logstd_initializer = logstd_initializer
@property
def state_size(self):
unused_state_size = 1
return unused_state_size
@property
def output_size(self):
return (self._action_size, self._action_size, tf.TensorShape([]))
def __call__(self, observation, state):
with tf.variable_scope('policy'):
x = tf.contrib.layers.flatten(observation)
for size in self._policy_layers:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
mean = tf.contrib.layers.fully_connected(
x, self._action_size, tf.tanh,
weights_initializer=self._mean_weights_initializer)
logstd = tf.get_variable(
'logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
logstd = tf.tile(
logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
with tf.variable_scope('value'):
x = tf.contrib.layers.flatten(observation)
for size in self._value_layers:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
return (mean, logstd, value), state
class RecurrentGaussianPolicy(tf.contrib.rnn.RNNCell):
"""Independent recurrent policy and feed forward value networks.
The policy network outputs the mean action and the log standard deviation
is learned as independent parameter vector. The last policy layer is recurrent
and uses a GRU cell.
"""
def __init__(
self, policy_layers, value_layers, action_size,
mean_weights_initializer=_MEAN_WEIGHTS_INITIALIZER,
logstd_initializer=_LOGSTD_INITIALIZER):
self._policy_layers = policy_layers
self._value_layers = value_layers
self._action_size = action_size
self._mean_weights_initializer = mean_weights_initializer
self._logstd_initializer = logstd_initializer
self._cell = tf.contrib.rnn.GRUBlockCell(100)
@property
def state_size(self):
return self._cell.state_size
@property
def output_size(self):
return (self._action_size, self._action_size, tf.TensorShape([]))
def __call__(self, observation, state):
with tf.variable_scope('policy'):
x = tf.contrib.layers.flatten(observation)
for size in self._policy_layers[:-1]:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
x, state = self._cell(x, state)
mean = tf.contrib.layers.fully_connected(
x, self._action_size, tf.tanh,
weights_initializer=self._mean_weights_initializer)
logstd = tf.get_variable(
'logstd', mean.shape[1:], tf.float32, self._logstd_initializer)
logstd = tf.tile(
logstd[None, ...], [tf.shape(mean)[0]] + [1] * logstd.shape.ndims)
with tf.variable_scope('value'):
x = tf.contrib.layers.flatten(observation)
for size in self._value_layers:
x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu)
value = tf.contrib.layers.fully_connected(x, 1, None)[:, 0]
return (mean, logstd, value), state
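These cells are consumed by PPOAlgorithm._network() through tf.nn.dynamic_rnn. A minimal wiring sketch (TF1/contrib era, matching the code above; it assumes ForwardGaussianPolicy from this module is in scope, and the action and observation sizes are illustrative):

import tensorflow as tf

cell = ForwardGaussianPolicy(
    policy_layers=(200, 100), value_layers=(200, 100), action_size=8)
observ = tf.zeros([2, 5, 31])  # 2 sequences of length 5 with 31-dim observations.
(mean, logstd, value), state = tf.nn.dynamic_rnn(
    cell, observ, dtype=tf.float32)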

View File

@ -0,0 +1,165 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Script to train a batch reinforcement learning algorithm.
Command line:
python3 -m agents.scripts.train --logdir=/path/to/logdir --config=pendulum
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import datetime
import functools
import os
import gym
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs
from pybullet_envs.minitaur.agents.scripts import utility
def _create_environment(config):
"""Constructor for an instance of the environment.
Args:
config: Object providing configurations via attributes.
Returns:
Wrapped OpenAI Gym environment.
"""
if isinstance(config.env, str):
env = gym.make(config.env)
else:
env = config.env()
if config.max_length:
env = tools.wrappers.LimitDuration(env, config.max_length)
env = tools.wrappers.RangeNormalize(env)
env = tools.wrappers.ClipAction(env)
env = tools.wrappers.ConvertTo32Bit(env)
return env
def _define_loop(graph, logdir, train_steps, eval_steps):
"""Create and configure a training loop with training and evaluation phases.
Args:
graph: Object providing graph elements via attributes.
logdir: Log directory for storing checkpoints and summaries.
train_steps: Number of training steps per epoch.
eval_steps: Number of evaluation steps per epoch.
Returns:
Loop object.
"""
loop = tools.Loop(
logdir, graph.step, graph.should_log, graph.do_report,
graph.force_reset)
loop.add_phase(
'train', graph.done, graph.score, graph.summary, train_steps,
report_every=None,
log_every=train_steps // 2,
checkpoint_every=None,
feed={graph.is_training: True})
loop.add_phase(
'eval', graph.done, graph.score, graph.summary, eval_steps,
report_every=eval_steps,
log_every=eval_steps // 2,
checkpoint_every=10 * eval_steps,
feed={graph.is_training: False})
return loop
def train(config, env_processes):
"""Training and evaluation entry point yielding scores.
Resolves some configuration attributes, creates environments, graph, and
training loop. By default, assigns all operations to the CPU.
Args:
config: Object providing configurations via attributes.
env_processes: Whether to step environments in separate processes.
Yields:
Evaluation scores.
"""
tf.reset_default_graph()
with config.unlocked:
config.network = functools.partial(
utility.define_network, config.network, config)
config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
config.value_optimizer = getattr(tf.train, config.value_optimizer)
if config.update_every % config.num_agents:
tf.logging.warn('Number of agents should divide episodes per update.')
with tf.device('/cpu:0'):
batch_env = utility.define_batch_env(
lambda: _create_environment(config),
config.num_agents, env_processes)
graph = utility.define_simulation_graph(
batch_env, config.algorithm, config)
loop = _define_loop(
graph, config.logdir,
config.update_every * config.max_length,
config.eval_episodes * config.max_length)
total_steps = int(
config.steps / config.update_every *
(config.update_every + config.eval_episodes))
# Exclude episode related variables since the Python state of environments is
# not checkpointed and thus new episodes start after resuming.
saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
sess_config = tf.ConfigProto(allow_soft_placement=True)
sess_config.gpu_options.allow_growth = True
with tf.Session(config=sess_config) as sess:
utility.initialize_variables(sess, saver, config.logdir)
for score in loop.run(sess, saver, total_steps):
yield score
batch_env.close()
def main(_):
"""Create or load configuration and launch the trainer."""
utility.set_up_logging()
if not FLAGS.config:
raise KeyError('You must specify a configuration.')
logdir = FLAGS.logdir and os.path.expanduser(os.path.join(
FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config)))
try:
config = utility.load_config(logdir)
except IOError:
config = tools.AttrDict(getattr(configs, FLAGS.config)())
config = utility.save_config(config, logdir)
for score in train(config, FLAGS.env_processes):
tf.logging.info('Score {}.'.format(score))
if __name__ == '__main__':
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string(
'logdir', None,
'Base directory to store logs.')
tf.app.flags.DEFINE_string(
'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
'Sub directory to store logs.')
tf.app.flags.DEFINE_string(
'config', None,
'Configuration to execute.')
tf.app.flags.DEFINE_boolean(
'env_processes', True,
'Step environments in separate processes to circumvent the GIL.')
tf.app.run()
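Assuming the flag set defined above, training could be launched along the lines of the following illustrative invocation, which mirrors the docstring at the top of the file with the full pybullet_envs module path:

python3 -m pybullet_envs.minitaur.agents.scripts.train --config=pendulum --logdir=/tmp/minitaur_ppo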

View File

@ -0,0 +1,110 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the PPO algorithm usage example."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import itertools
import tensorflow as tf
from pybullet_envs.minitaur.agents import ppo
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import configs
from pybullet_envs.minitaur.agents.scripts import networks
from pybullet_envs.minitaur.agents.scripts import train
FLAGS = tf.app.flags.FLAGS
class PPOTest(tf.test.TestCase):
def test_no_crash_cheetah(self):
nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
for network in nets:
config = self._define_config()
with config.unlocked:
config.env = 'HalfCheetah-v1'
config.max_length = 200
config.steps = 1000
config.network = network
for score in train.train(config, env_processes=True):
float(score)
def test_no_crash_ant(self):
nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
for network in nets:
config = self._define_config()
with config.unlocked:
config.env = 'Ant-v1'
config.max_length = 200
config.steps = 1000
config.network = network
for score in train.train(config, env_processes=True):
float(score)
def test_no_crash_observation_shape(self):
nets = networks.ForwardGaussianPolicy, networks.RecurrentGaussianPolicy
observ_shapes = (1,), (2, 3), (2, 3, 4)
for network, observ_shape in itertools.product(nets, observ_shapes):
config = self._define_config()
with config.unlocked:
config.env = functools.partial(
tools.MockEnvironment, observ_shape, action_shape=(3,),
min_duration=15, max_duration=15)
config.max_length = 20
config.steps = 100
config.network = network
for score in train.train(config, env_processes=False):
float(score)
def test_no_crash_variable_duration(self):
config = self._define_config()
with config.unlocked:
config.env = functools.partial(
tools.MockEnvironment, observ_shape=(2, 3), action_shape=(3,),
min_duration=5, max_duration=25)
config.max_length = 25
config.steps = 200
config.network = networks.RecurrentGaussianPolicy
for score in train.train(config, env_processes=False):
float(score)
def _define_config(self):
# Start from the example configuration.
locals().update(configs.default())
# pylint: disable=unused-variable
# General
algorithm = ppo.PPOAlgorithm
num_agents = 2
update_every = 4
use_gpu = False
# Network
policy_layers = 20, 10
value_layers = 20, 10
# Optimization
update_epochs_policy = 2
update_epochs_value = 2
# pylint: enable=unused-variable
return tools.AttrDict(locals())
if __name__ == '__main__':
FLAGS.config = 'unused'
tf.test.main()

View File

@ -0,0 +1,213 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for using reinforcement learning algorithms."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import os
import re
import ruamel.yaml as yaml
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
def define_simulation_graph(batch_env, algo_cls, config):
"""Define the algortihm and environment interaction.
Args:
batch_env: In-graph environments object.
algo_cls: Constructor of a batch algorithm.
config: Configuration object for the algorithm.
Returns:
Object providing graph elements via attributes.
"""
# pylint: disable=unused-variable
step = tf.Variable(0, False, dtype=tf.int32, name='global_step')
is_training = tf.placeholder(tf.bool, name='is_training')
should_log = tf.placeholder(tf.bool, name='should_log')
do_report = tf.placeholder(tf.bool, name='do_report')
force_reset = tf.placeholder(tf.bool, name='force_reset')
algo = algo_cls(batch_env, step, is_training, should_log, config)
done, score, summary = tools.simulate(
batch_env, algo, should_log, force_reset)
message = 'Graph contains {} trainable variables.'
tf.logging.info(message.format(tools.count_weights()))
# pylint: enable=unused-variable
return tools.AttrDict(locals())
def define_batch_env(constructor, num_agents, env_processes):
"""Create environments and apply all desired wrappers.
Args:
constructor: Constructor of an OpenAI gym environment.
num_agents: Number of environments to combine in the batch.
env_processes: Whether to step environments in external processes.
Returns:
In-graph environments object.
"""
with tf.variable_scope('environments'):
if env_processes:
envs = [
tools.wrappers.ExternalProcess(constructor)
for _ in range(num_agents)]
else:
envs = [constructor() for _ in range(num_agents)]
batch_env = tools.BatchEnv(envs, blocking=not env_processes)
batch_env = tools.InGraphBatchEnv(batch_env)
return batch_env
def define_saver(exclude=None):
"""Create a saver for the variables we want to checkpoint.
Args:
exclude: List of regexes to match variable names to exclude.
Returns:
Saver object.
"""
variables = []
exclude = exclude or []
exclude = [re.compile(regex) for regex in exclude]
for variable in tf.global_variables():
if any(regex.match(variable.name) for regex in exclude):
continue
variables.append(variable)
saver = tf.train.Saver(variables, keep_checkpoint_every_n_hours=5)
return saver
def define_network(constructor, config, action_size):
"""Constructor for the recurrent cell for the algorithm.
Args:
constructor: Callable returning the network as RNNCell.
config: Object providing configurations via attributes.
action_size: Integer indicating the amount of action dimensions.
Returns:
Created recurrent cell object.
"""
mean_weights_initializer = (
tf.contrib.layers.variance_scaling_initializer(
factor=config.init_mean_factor))
logstd_initializer = tf.random_normal_initializer(
config.init_logstd, 1e-10)
network = constructor(
config.policy_layers, config.value_layers, action_size,
mean_weights_initializer=mean_weights_initializer,
logstd_initializer=logstd_initializer)
return network
def initialize_variables(sess, saver, logdir, checkpoint=None, resume=None):
"""Initialize or restore variables from a checkpoint if available.
Args:
sess: Session to initialize variables in.
saver: Saver to restore variables.
logdir: Directory to search for checkpoints.
checkpoint: Specify what checkpoint name to use; defaults to most recent.
resume: Whether to expect recovering a checkpoint or starting a new run.
Raises:
ValueError: If resume expected but no log directory specified.
RuntimeError: If no resume expected but a checkpoint was found.
"""
sess.run(tf.group(
tf.local_variables_initializer(),
tf.global_variables_initializer()))
if resume and not (logdir or checkpoint):
raise ValueError('Need to specify logdir to resume a checkpoint.')
if logdir:
state = tf.train.get_checkpoint_state(logdir)
if checkpoint:
checkpoint = os.path.join(logdir, checkpoint)
if not checkpoint and state and state.model_checkpoint_path:
checkpoint = state.model_checkpoint_path
if checkpoint and resume is False:
message = 'Found unexpected checkpoint when starting a new run.'
raise RuntimeError(message)
if checkpoint:
saver.restore(sess, checkpoint)
def save_config(config, logdir=None):
"""Save a new configuration by name.
If a logging directory is specified, it will be created and the configuration
will be stored there. Otherwise, a log message will be printed.
Args:
config: Configuration object.
logdir: Location for writing summaries and checkpoints if specified.
Returns:
Configuration object.
"""
if logdir:
with config.unlocked:
config.logdir = logdir
message = 'Start a new run and write summaries and checkpoints to {}.'
tf.logging.info(message.format(config.logdir))
tf.gfile.MakeDirs(config.logdir)
config_path = os.path.join(config.logdir, 'config.yaml')
with tf.gfile.FastGFile(config_path, 'w') as file_:
yaml.dump(config, file_, default_flow_style=False)
else:
message = (
'Start a new run without storing summaries and checkpoints since no '
'logging directory was specified.')
tf.logging.info(message)
return config
def load_config(logdir):
"""Load a configuration from the log directory.
Args:
logdir: The logging directory containing the configuration file.
Raises:
IOError: The logging directory does not contain a configuration file.
Returns:
Configuration object.
"""
config_path = logdir and os.path.join(logdir, 'config.yaml')
if not config_path or not tf.gfile.Exists(config_path):
message = (
'Cannot resume an existing run since the logging directory does not '
'contain a configuration file.')
raise IOError(message)
with tf.gfile.FastGFile(config_path, 'r') as file_:
config = yaml.load(file_)
message = 'Resume run and write summaries and checkpoints to {}.'
tf.logging.info(message.format(config.logdir))
return config
def set_up_logging():
"""Configure the TensorFlow logger."""
tf.logging.set_verbosity(tf.logging.INFO)
logging.getLogger('tensorflow').propagate = False
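
A short usage sketch of the checkpoint helpers above; the log directory is a placeholder and the single variable stands in for a real model graph.

import tensorflow as tf

global_step = tf.Variable(0, False, dtype=tf.int32, name='global_step')
saver = define_saver(exclude=(r'.*_temporary/.*',))
with tf.Session() as sess:
  # Restores the most recent checkpoint under logdir if one exists; otherwise
  # all variables are freshly initialized.
  initialize_variables(sess, saver, logdir='/tmp/agents_run', resume=None)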

View File

@ -0,0 +1,157 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Script to render videos of the Proximal Policy Gradient algorithm.
Command line:
python3 -m agents.scripts.visualize \
--logdir=/path/to/logdir/<time>-<config> --outdir=/path/to/outdir/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import os
import gym
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
from pybullet_envs.minitaur.agents.scripts import utility
def _create_environment(config, outdir):
"""Constructor for an instance of the environment.
Args:
config: Object providing configurations via attributes.
outdir: Directory to store videos in.
Returns:
Wrapped OpenAI Gym environment.
"""
if isinstance(config.env, str):
env = gym.make(config.env)
else:
env = config.env()
# Ensure that the environment has the specification attribute set as expected
# by the monitor wrapper.
if not hasattr(env, 'spec'):
setattr(env, 'spec', getattr(env, 'spec', None))
if config.max_length:
env = tools.wrappers.LimitDuration(env, config.max_length)
# env = gym.wrappers.Monitor(
# env, outdir, lambda unused_episode_number: True)
env = tools.wrappers.RangeNormalize(env)
env = tools.wrappers.ClipAction(env)
env = tools.wrappers.ConvertTo32Bit(env)
return env
def _define_loop(graph, eval_steps):
"""Create and configure an evaluation loop.
Args:
graph: Object providing graph elements via attributes.
eval_steps: Number of evaluation steps per epoch.
Returns:
Loop object.
"""
loop = tools.Loop(
None, graph.step, graph.should_log, graph.do_report, graph.force_reset)
loop.add_phase(
'eval', graph.done, graph.score, graph.summary, eval_steps,
report_every=eval_steps,
log_every=None,
checkpoint_every=None,
feed={graph.is_training: False})
return loop
def visualize(
logdir, outdir, num_agents, num_episodes, checkpoint=None,
env_processes=True):
"""Recover checkpoint and render videos from it.
Args:
logdir: Logging directory of the trained algorithm.
outdir: Directory to store rendered videos in.
num_agents: Number of environments to simulate in parallel.
num_episodes: Total number of episodes to simulate.
checkpoint: Checkpoint name to load; defaults to most recent.
env_processes: Whether to step environments in separate processes.
"""
config = utility.load_config(logdir)
with config.unlocked:
config.network = functools.partial(
utility.define_network, config.network, config)
config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
config.value_optimizer = getattr(tf.train, config.value_optimizer)
with tf.device('/cpu:0'):
batch_env = utility.define_batch_env(
lambda: _create_environment(config, outdir),
num_agents, env_processes)
graph = utility.define_simulation_graph(
batch_env, config.algorithm, config)
total_steps = num_episodes * config.max_length
loop = _define_loop(graph, total_steps)
saver = utility.define_saver(
exclude=(r'.*_temporary/.*', r'global_step'))
sess_config = tf.ConfigProto(allow_soft_placement=True)
sess_config.gpu_options.allow_growth = True
with tf.Session(config=sess_config) as sess:
utility.initialize_variables(
sess, saver, config.logdir, checkpoint, resume=True)
for unused_score in loop.run(sess, saver, total_steps):
pass
batch_env.close()
def main(_):
"""Load a trained algorithm and render videos."""
utility.set_up_logging()
if not FLAGS.logdir or not FLAGS.outdir:
raise KeyError('You must specify logging and output directories.')
FLAGS.logdir = os.path.expanduser(FLAGS.logdir)
FLAGS.outdir = os.path.expanduser(FLAGS.outdir)
visualize(
FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents, FLAGS.num_episodes,
FLAGS.checkpoint, FLAGS.env_processes)
if __name__ == '__main__':
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string(
'logdir', None,
'Directory to the checkpoint of a training run.')
tf.app.flags.DEFINE_string(
'outdir', None,
'Local directory for storing the monitoring outdir.')
tf.app.flags.DEFINE_string(
'checkpoint', None,
'Checkpoint name to load; defaults to most recent.')
tf.app.flags.DEFINE_integer(
'num_agents', 1,
'How many environments to step in parallel.')
tf.app.flags.DEFINE_integer(
'num_episodes', 5,
'Minimum number of episodes to render.')
tf.app.flags.DEFINE_boolean(
'env_processes', True,
'Step environments in separate processes to circumvent the GIL.')
tf.app.run()
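
Equivalently, visualize() can be called directly from Python, which mirrors the command line shown in the module docstring; both paths below are placeholders for an existing training run and an output directory.

visualize(
    logdir='/tmp/agents_run/20180424T000000-pendulum',
    outdir='/tmp/agents_run/videos',
    num_agents=1,
    num_episodes=5,
    env_processes=False)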

View File

@ -0,0 +1,30 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tools for reinforcement learning."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from .attr_dict import AttrDict
from .batch_env import BatchEnv
from .count_weights import count_weights
from .in_graph_batch_env import InGraphBatchEnv
from .in_graph_env import InGraphEnv
from .loop import Loop
from .mock_algorithm import MockAlgorithm
from .mock_environment import MockEnvironment
from .simulate import simulate
from .streaming_mean import StreamingMean

View File

@ -0,0 +1,54 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wrap a dictionary to access keys as attributes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import contextlib
class AttrDict(dict):
"""Wrap a dictionary to access keys as attributes."""
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
super(AttrDict, self).__setattr__('_mutable', False)
def __getattr__(self, key):
# Do not provide None for unimplemented magic attributes.
if key.startswith('__'):
raise AttributeError
return self.get(key, None)
def __setattr__(self, key, value):
if not self._mutable:
message = "Cannot set attribute '{}'.".format(key)
message += " Use 'with obj.unlocked:' scope to set attributes."
raise RuntimeError(message)
if key.startswith('__'):
raise AttributeError("Cannot set magic attribute '{}'".format(key))
self[key] = value
@property
@contextlib.contextmanager
def unlocked(self):
super(AttrDict, self).__setattr__('_mutable', True)
yield
super(AttrDict, self).__setattr__('_mutable', False)
def copy(self):
return type(self)(super(AttrDict, self).copy())

View File

@ -0,0 +1,71 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the attribute dictionary."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from pybullet_envs.minitaur.agents.tools import attr_dict
class AttrDictTest(tf.test.TestCase):
def test_construct_from_dict(self):
initial = dict(foo=13, bar=42)
obj = attr_dict.AttrDict(initial)
self.assertEqual(13, obj.foo)
self.assertEqual(42, obj.bar)
def test_construct_from_kwargs(self):
obj = attr_dict.AttrDict(foo=13, bar=42)
self.assertEqual(13, obj.foo)
self.assertEqual(42, obj.bar)
def test_has_attribute(self):
obj = attr_dict.AttrDict(foo=13)
self.assertTrue('foo' in obj)
self.assertFalse('bar' in obj)
def test_access_default(self):
obj = attr_dict.AttrDict()
self.assertEqual(None, obj.foo)
def test_access_magic(self):
obj = attr_dict.AttrDict()
with self.assertRaises(AttributeError):
obj.__getstate__ # pylint: disable=pointless-statement
def test_immutable_create(self):
obj = attr_dict.AttrDict()
with self.assertRaises(RuntimeError):
obj.foo = 42
def test_immutable_modify(self):
obj = attr_dict.AttrDict(foo=13)
with self.assertRaises(RuntimeError):
obj.foo = 42
def test_immutable_unlocked(self):
obj = attr_dict.AttrDict()
with obj.unlocked:
obj.foo = 42
self.assertEqual(42, obj.foo)
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,125 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Combine multiple environments to step them in batch."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
class BatchEnv(object):
"""Combine multiple environments to step them in batch."""
def __init__(self, envs, blocking):
"""Combine multiple environments to step them in batch.
To step environments in parallel, environments must support a
`blocking=False` argument to their step and reset functions that makes them
return callables instead to receive the result at a later time.
Args:
envs: List of environments.
blocking: Step environments after another rather than in parallel.
Raises:
ValueError: Environments have different observation or action spaces.
"""
self._envs = envs
self._blocking = blocking
observ_space = self._envs[0].observation_space
if not all(env.observation_space == observ_space for env in self._envs):
raise ValueError('All environments must use the same observation space.')
action_space = self._envs[0].action_space
if not all(env.action_space == action_space for env in self._envs):
raise ValueError('All environments must use the same action space.')
def __len__(self):
"""Number of combined environments."""
return len(self._envs)
def __getitem__(self, index):
"""Access an underlying environment by index."""
return self._envs[index]
def __getattr__(self, name):
"""Forward unimplemented attributes to one of the original environments.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in one of the wrapped environments.
"""
return getattr(self._envs[0], name)
def step(self, action):
"""Forward a batch of actions to the wrapped environments.
Args:
action: Batched action to apply to the environment.
Raises:
ValueError: Invalid actions.
Returns:
Batch of observations, rewards, and done flags.
"""
actions = action
for index, (env, action) in enumerate(zip(self._envs, actions)):
if not env.action_space.contains(action):
message = 'Invalid action at index {}: {}'
raise ValueError(message.format(index, action))
if self._blocking:
transitions = [
env.step(action)
for env, action in zip(self._envs, actions)]
else:
transitions = [
env.step(action, blocking=False)
for env, action in zip(self._envs, actions)]
transitions = [transition() for transition in transitions]
observs, rewards, dones, infos = zip(*transitions)
observ = np.stack(observs)
reward = np.stack(rewards)
done = np.stack(dones)
info = tuple(infos)
return observ, reward, done, info
def reset(self, indices=None):
"""Reset the environment and convert the resulting observation.
Args:
indices: The batch indices of environments to reset; defaults to all.
Returns:
Batch of observations.
"""
if indices is None:
indices = np.arange(len(self._envs))
if self._blocking:
observs = [self._envs[index].reset() for index in indices]
else:
observs = [self._envs[index].reset(blocking=False) for index in indices]
observs = [observ() for observ in observs]
observ = np.stack(observs)
return observ
def close(self):
"""Send close messages to the external process and join them."""
for env in self._envs:
if hasattr(env, 'close'):
env.close()
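
A quick usage sketch that pairs BatchEnv with the MockEnvironment defined later in this package; the observation and action shapes and the episode durations are arbitrary test values.

import numpy as np
from pybullet_envs.minitaur.agents.tools import batch_env
from pybullet_envs.minitaur.agents.tools import mock_environment

envs = [mock_environment.MockEnvironment(
    observ_shape=(2,), action_shape=(3,), min_duration=5, max_duration=5)
    for _ in range(2)]
batched = batch_env.BatchEnv(envs, blocking=True)
observ = batched.reset()
# Sample one valid action per environment and step them together.
actions = np.stack([env.action_space.sample() for env in batched])
observ, reward, done, info = batched.step(actions)
batched.close()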

View File

@ -0,0 +1,48 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Count learnable parameters."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import numpy as np
import tensorflow as tf
def count_weights(scope=None, exclude=None, graph=None):
"""Count learnable parameters.
Args:
scope: Restrict the count to a variable scope.
exclude: Regex to match variable names to exclude.
graph: Operate on a graph other than the current default graph.
Returns:
Number of learnable parameters as integer.
"""
if scope:
scope = scope if scope.endswith('/') else scope + '/'
graph = graph or tf.get_default_graph()
vars_ = graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
if scope:
vars_ = [var for var in vars_ if var.name.startswith(scope)]
if exclude:
exclude = re.compile(exclude)
vars_ = [var for var in vars_ if not exclude.match(var.name)]
shapes = [var.get_shape().as_list() for var in vars_]
return int(sum(np.prod(shape) for shape in shapes))

View File

@ -0,0 +1,98 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the weight counting utility."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from pybullet_envs.minitaur.agents.tools import count_weights
class CountWeightsTest(tf.test.TestCase):
def test_count_trainable(self):
tf.Variable(tf.zeros((5, 3)), trainable=True)
tf.Variable(tf.zeros((1, 1)), trainable=True)
tf.Variable(tf.zeros((5,)), trainable=True)
self.assertEqual(15 + 1 + 5, count_weights())
def test_ignore_non_trainable(self):
tf.Variable(tf.zeros((5, 3)), trainable=False)
tf.Variable(tf.zeros((1, 1)), trainable=False)
tf.Variable(tf.zeros((5,)), trainable=False)
self.assertEqual(0, count_weights())
def test_trainable_and_non_trainable(self):
tf.Variable(tf.zeros((5, 3)), trainable=True)
tf.Variable(tf.zeros((8, 2)), trainable=False)
tf.Variable(tf.zeros((1, 1)), trainable=True)
tf.Variable(tf.zeros((5,)), trainable=True)
tf.Variable(tf.zeros((3, 1)), trainable=False)
self.assertEqual(15 + 1 + 5, count_weights())
def test_include_scopes(self):
tf.Variable(tf.zeros((3, 2)), trainable=True)
with tf.variable_scope('foo'):
tf.Variable(tf.zeros((5, 2)), trainable=True)
self.assertEqual(6 + 10, count_weights())
def test_restrict_scope(self):
tf.Variable(tf.zeros((3, 2)), trainable=True)
with tf.variable_scope('foo'):
tf.Variable(tf.zeros((5, 2)), trainable=True)
with tf.variable_scope('bar'):
tf.Variable(tf.zeros((1, 2)), trainable=True)
self.assertEqual(10 + 2, count_weights('foo'))
def test_restrict_nested_scope(self):
tf.Variable(tf.zeros((3, 2)), trainable=True)
with tf.variable_scope('foo'):
tf.Variable(tf.zeros((5, 2)), trainable=True)
with tf.variable_scope('bar'):
tf.Variable(tf.zeros((1, 2)), trainable=True)
self.assertEqual(2, count_weights('foo/bar'))
def test_restrict_invalid_scope(self):
tf.Variable(tf.zeros((3, 2)), trainable=True)
with tf.variable_scope('foo'):
tf.Variable(tf.zeros((5, 2)), trainable=True)
with tf.variable_scope('bar'):
tf.Variable(tf.zeros((1, 2)), trainable=True)
self.assertEqual(0, count_weights('bar'))
def test_exclude_by_regex(self):
tf.Variable(tf.zeros((3, 2)), trainable=True)
with tf.variable_scope('foo'):
tf.Variable(tf.zeros((5, 2)), trainable=True)
with tf.variable_scope('bar'):
tf.Variable(tf.zeros((1, 2)), trainable=True)
self.assertEqual(0, count_weights(exclude=r'.*'))
self.assertEqual(6, count_weights(exclude=r'(^|/)foo/.*'))
self.assertEqual(16, count_weights(exclude=r'.*/bar/.*'))
def test_non_default_graph(self):
graph = tf.Graph()
with graph.as_default():
tf.Variable(tf.zeros((5, 3)), trainable=True)
tf.Variable(tf.zeros((8, 2)), trainable=False)
self.assertNotEqual(graph, tf.get_default_graph)
self.assertEqual(15, count_weights(graph=graph))
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,178 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Batch of environments inside the TensorFlow graph."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import tensorflow as tf
class InGraphBatchEnv(object):
"""Batch of environments inside the TensorFlow graph.
The batch of environments will be stepped and reset inside of the graph using
a tf.py_func(). The current batch of observations, actions, rewards, and done
flags are held in corresponding variables.
"""
def __init__(self, batch_env):
"""Batch of environments inside the TensorFlow graph.
Args:
batch_env: Batch environment.
"""
self._batch_env = batch_env
observ_shape = self._parse_shape(self._batch_env.observation_space)
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
action_shape = self._parse_shape(self._batch_env.action_space)
action_dtype = self._parse_dtype(self._batch_env.action_space)
with tf.variable_scope('env_temporary'):
self._observ = tf.Variable(
tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype),
name='observ', trainable=False)
self._action = tf.Variable(
tf.zeros((len(self._batch_env),) + action_shape, action_dtype),
name='action', trainable=False)
self._reward = tf.Variable(
tf.zeros((len(self._batch_env),), tf.float32),
name='reward', trainable=False)
self._done = tf.Variable(
tf.cast(tf.ones((len(self._batch_env),)), tf.bool),
name='done', trainable=False)
def __getattr__(self, name):
"""Forward unimplemented attributes to one of the original environments.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in one of the original environments.
"""
return getattr(self._batch_env, name)
def __len__(self):
"""Number of combined environments."""
return len(self._batch_env)
def __getitem__(self, index):
"""Access an underlying environment by index."""
return self._batch_env[index]
def simulate(self, action):
"""Step the batch of environments.
The results of the step can be accessed from the variables defined below.
Args:
action: Tensor holding the batch of actions to apply.
Returns:
Operation.
"""
with tf.name_scope('environment/simulate'):
if action.dtype in (tf.float16, tf.float32, tf.float64):
action = tf.check_numerics(action, 'action')
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
observ, reward, done = tf.py_func(
lambda a: self._batch_env.step(a)[:3], [action],
[observ_dtype, tf.float32, tf.bool], name='step')
observ = tf.check_numerics(observ, 'observ')
reward = tf.check_numerics(reward, 'reward')
return tf.group(
self._observ.assign(observ),
self._action.assign(action),
self._reward.assign(reward),
self._done.assign(done))
def reset(self, indices=None):
"""Reset the batch of environments.
Args:
indices: The batch indices of the environments to reset; defaults to all.
Returns:
Batch tensor of the new observations.
"""
if indices is None:
indices = tf.range(len(self._batch_env))
observ_dtype = self._parse_dtype(self._batch_env.observation_space)
observ = tf.py_func(
self._batch_env.reset, [indices], observ_dtype, name='reset')
observ = tf.check_numerics(observ, 'observ')
reward = tf.zeros_like(indices, tf.float32)
done = tf.zeros_like(indices, tf.bool)
with tf.control_dependencies([
tf.scatter_update(self._observ, indices, observ),
tf.scatter_update(self._reward, indices, reward),
tf.scatter_update(self._done, indices, done)]):
return tf.identity(observ)
@property
def observ(self):
"""Access the variable holding the current observation."""
return self._observ
@property
def action(self):
"""Access the variable holding the last recieved action."""
return self._action
@property
def reward(self):
"""Access the variable holding the current reward."""
return self._reward
@property
def done(self):
"""Access the variable indicating whether the episode is done."""
return self._done
def close(self):
"""Send close messages to the external process and join them."""
self._batch_env.close()
def _parse_shape(self, space):
"""Get a tensor shape from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
Shape tuple.
"""
if isinstance(space, gym.spaces.Discrete):
return ()
if isinstance(space, gym.spaces.Box):
return space.shape
raise NotImplementedError()
def _parse_dtype(self, space):
"""Get a tensor dtype from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
TensorFlow data type.
"""
if isinstance(space, gym.spaces.Discrete):
return tf.int32
if isinstance(space, gym.spaces.Box):
return tf.float32
raise NotImplementedError()

View File

@ -0,0 +1,162 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Put an OpenAI Gym environment into the TensorFlow graph."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import tensorflow as tf
class InGraphEnv(object):
"""Put an OpenAI Gym environment into the TensorFlow graph.
The environment will be stepped and reset inside of the graph using
tf.py_func(). The current observation, action, reward, and done flag are held
in corresponding variables.
"""
def __init__(self, env):
"""Put an OpenAI Gym environment into the TensorFlow graph.
Args:
env: OpenAI Gym environment.
"""
self._env = env
observ_shape = self._parse_shape(self._env.observation_space)
observ_dtype = self._parse_dtype(self._env.observation_space)
action_shape = self._parse_shape(self._env.action_space)
action_dtype = self._parse_dtype(self._env.action_space)
with tf.name_scope('environment'):
self._observ = tf.Variable(
tf.zeros(observ_shape, observ_dtype), name='observ', trainable=False)
self._action = tf.Variable(
tf.zeros(action_shape, action_dtype), name='action', trainable=False)
self._reward = tf.Variable(
0.0, dtype=tf.float32, name='reward', trainable=False)
self._done = tf.Variable(
True, dtype=tf.bool, name='done', trainable=False)
self._step = tf.Variable(
0, dtype=tf.int32, name='step', trainable=False)
def __getattr__(self, name):
"""Forward unimplemented attributes to the original environment.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in the wrapped environment.
"""
return getattr(self._env, name)
def simulate(self, action):
"""Step the environment.
The result of the step can be accessed from the variables defined below.
Args:
action: Tensor holding the action to apply.
Returns:
Operation.
"""
with tf.name_scope('environment/simulate'):
if action.dtype in (tf.float16, tf.float32, tf.float64):
action = tf.check_numerics(action, 'action')
observ_dtype = self._parse_dtype(self._env.observation_space)
observ, reward, done = tf.py_func(
lambda a: self._env.step(a)[:3], [action],
[observ_dtype, tf.float32, tf.bool], name='step')
observ = tf.check_numerics(observ, 'observ')
reward = tf.check_numerics(reward, 'reward')
return tf.group(
self._observ.assign(observ),
self._action.assign(action),
self._reward.assign(reward),
self._done.assign(done),
self._step.assign_add(1))
def reset(self):
"""Reset the environment.
Returns:
Tensor of the current observation.
"""
observ_dtype = self._parse_dtype(self._env.observation_space)
observ = tf.py_func(self._env.reset, [], observ_dtype, name='reset')
observ = tf.check_numerics(observ, 'observ')
with tf.control_dependencies([
self._observ.assign(observ),
self._reward.assign(0),
self._done.assign(False)]):
return tf.identity(observ)
@property
def observ(self):
"""Access the variable holding the current observation."""
return self._observ
@property
def action(self):
"""Access the variable holding the last recieved action."""
return self._action
@property
def reward(self):
"""Access the variable holding the current reward."""
return self._reward
@property
def done(self):
"""Access the variable indicating whether the episode is done."""
return self._done
@property
def step(self):
"""Access the variable containg total steps of this environment."""
return self._step
def _parse_shape(self, space):
"""Get a tensor shape from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
Shape tuple.
"""
if isinstance(space, gym.spaces.Discrete):
return ()
if isinstance(space, gym.spaces.Box):
return space.shape
raise NotImplementedError()
def _parse_dtype(self, space):
"""Get a tensor dtype from a OpenAI Gym space.
Args:
space: Gym space.
Returns:
TensorFlow data type.
"""
if isinstance(space, gym.spaces.Discrete):
return tf.int32
if isinstance(space, gym.spaces.Box):
return tf.float32
raise NotImplementedError()

View File

@ -0,0 +1,233 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Execute operations in a loop and coordinate logging and checkpoints."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import tensorflow as tf
from pybullet_envs.minitaur.agents.tools import streaming_mean
_Phase = collections.namedtuple(
'Phase',
'name, writer, op, batch, steps, feed, report_every, log_every,'
'checkpoint_every')
class Loop(object):
"""Execute operations in a loop and coordinate logging and checkpoints.
Supports multiple phases, that define their own operations to run, and
intervals for reporting scores, logging summaries, and storing checkpoints.
All class state is stored in-graph to properly recover from checkpoints.
"""
def __init__(self, logdir, step=None, log=None, report=None, reset=None):
"""Execute operations in a loop and coordinate logging and checkpoints.
The step, log, report, and reset arguments will get created if not
provided. Reset is used to indicate switching to a new phase, so that the
model can start a new computation in case its computation is split over
multiple training steps.
Args:
logdir: Will contain checkpoints and summaries for each phase.
step: Variable of the global step (optional).
log: Tensor indicating to the model to compute summary tensors.
report: Tensor indicating to the loop to report the current mean score.
reset: Tensor indicating to the model to start a new computation.
"""
self._logdir = logdir
self._step = (
tf.Variable(0, False, name='global_step') if step is None else step)
self._log = tf.placeholder(tf.bool) if log is None else log
self._report = tf.placeholder(tf.bool) if report is None else report
self._reset = tf.placeholder(tf.bool) if reset is None else reset
self._phases = []
def add_phase(
self, name, done, score, summary, steps,
report_every=None, log_every=None, checkpoint_every=None, feed=None):
"""Add a phase to the loop protocol.
If the model breaks long computation into multiple steps, the done tensor
indicates whether the current score should be added to the mean counter.
For example, in reinforcement learning we only have a valid score at the
end of the episode.
Score and done tensors can either be scalars or vectors, to support
single and batched computations.
Args:
name: Name for the phase, used for the summary writer.
done: Tensor indicating whether current score can be used.
score: Tensor holding the current, possibly intermediate, score.
summary: Tensor holding summary string to write if not an empty string.
steps: Duration of the phase in steps.
report_every: Yield mean score every this number of steps.
log_every: Request summaries via `log` tensor every this number of steps.
checkpoint_every: Write checkpoint every this number of steps.
feed: Additional feed dictionary for the session run call.
Raises:
ValueError: Unknown rank for done or score tensors.
"""
done = tf.convert_to_tensor(done, tf.bool)
score = tf.convert_to_tensor(score, tf.float32)
summary = tf.convert_to_tensor(summary, tf.string)
feed = feed or {}
if done.shape.ndims is None or score.shape.ndims is None:
raise ValueError("Rank of 'done' and 'score' tensors must be known.")
writer = self._logdir and tf.summary.FileWriter(
os.path.join(self._logdir, name), tf.get_default_graph(),
flush_secs=60)
op = self._define_step(done, score, summary)
batch = 1 if score.shape.ndims == 0 else score.shape[0].value
self._phases.append(_Phase(
name, writer, op, batch, int(steps), feed, report_every,
log_every, checkpoint_every))
def run(self, sess, saver, max_step=None):
"""Run the loop schedule for a specified number of steps.
Call the operation of the current phase until the global step reaches the
specified maximum step. Phases are repeated over and over in the order they
were added.
Args:
sess: Session to use to run the phase operation.
saver: Saver used for checkpointing.
max_step: Run the operations until the step reaches this limit.
Yields:
Reported mean scores.
"""
global_step = sess.run(self._step)
steps_made = 1
while True:
if max_step and global_step >= max_step:
break
phase, epoch, steps_in = self._find_current_phase(global_step)
phase_step = epoch * phase.steps + steps_in
if steps_in % phase.steps < steps_made:
message = '\n' + ('-' * 50) + '\n'
message += 'Phase {} (phase step {}, global step {}).'
tf.logging.info(message.format(phase.name, phase_step, global_step))
# Populate book keeping tensors.
phase.feed[self._reset] = (steps_in < steps_made)
phase.feed[self._log] = (
phase.writer and
self._is_every_steps(phase_step, phase.batch, phase.log_every))
phase.feed[self._report] = (
self._is_every_steps(phase_step, phase.batch, phase.report_every))
summary, mean_score, global_step, steps_made = sess.run(
phase.op, phase.feed)
if self._is_every_steps(phase_step, phase.batch, phase.checkpoint_every):
self._store_checkpoint(sess, saver, global_step)
if self._is_every_steps(phase_step, phase.batch, phase.report_every):
yield mean_score
if summary and phase.writer:
# We want smaller phases to catch up at the beginning of each epoch so
# that their graphs are aligned.
longest_phase = max(phase.steps for phase in self._phases)
summary_step = epoch * longest_phase + steps_in
phase.writer.add_summary(summary, summary_step)
def _is_every_steps(self, phase_step, batch, every):
"""Determine whether a periodic event should happen at this step.
Args:
phase_step: The incrementing step.
batch: The number of steps progressed at once.
every: The interval of the period.
Returns:
Boolean of whether the event should happen.
"""
if not every:
return False
covered_steps = range(phase_step, phase_step + batch)
return any((step + 1) % every == 0 for step in covered_steps)
def _find_current_phase(self, global_step):
"""Determine the current phase based on the global step.
This ensures continuing the correct phase after restoring checkpoints.
Args:
global_step: The global number of steps performed across all phases.
Returns:
Tuple of phase object, epoch number, and phase steps within the epoch.
"""
epoch_size = sum(phase.steps for phase in self._phases)
epoch = int(global_step // epoch_size)
steps_in = global_step % epoch_size
for phase in self._phases:
if steps_in < phase.steps:
return phase, epoch, steps_in
steps_in -= phase.steps
def _define_step(self, done, score, summary):
"""Combine operations of a phase.
Keeps track of the mean score and when to report it.
Args:
done: Tensor indicating whether current score can be used.
score: Tensor holding the current, possibly intermediate, score.
summary: Tensor holding summary string to write if not an empty string.
Returns:
Tuple of summary tensor, mean score, and new global step. The mean score
is zero for non reporting steps.
"""
if done.shape.ndims == 0:
done = done[None]
if score.shape.ndims == 0:
score = score[None]
score_mean = streaming_mean.StreamingMean((), tf.float32)
with tf.control_dependencies([done, score, summary]):
done_score = tf.gather(score, tf.where(done)[:, 0])
submit_score = tf.cond(
tf.reduce_any(done), lambda: score_mean.submit(done_score), tf.no_op)
with tf.control_dependencies([submit_score]):
mean_score = tf.cond(self._report, score_mean.clear, float)
steps_made = tf.shape(score)[0]
next_step = self._step.assign_add(steps_made)
with tf.control_dependencies([mean_score, next_step]):
return tf.identity(summary), mean_score, next_step, steps_made
def _store_checkpoint(self, sess, saver, global_step):
"""Store a checkpoint if a log directory was provided to the constructor.
The directory will be created if needed.
Args:
sess: Session containing variables to store.
saver: Saver used for checkpointing.
global_step: Step number of the checkpoint name.
"""
if not self._logdir or not saver:
return
tf.gfile.MakeDirs(self._logdir)
filename = os.path.join(self._logdir, 'model.ckpt')
saver.save(sess, filename, global_step)

View File

@ -0,0 +1,111 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the training loop."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
class LoopTest(tf.test.TestCase):
def test_report_every_step(self):
step = tf.Variable(0, False, dtype=tf.int32, name='step')
loop = tools.Loop(None, step)
loop.add_phase(
'phase_1', done=True, score=0, summary='', steps=1, report_every=3)
# Step: 0 1 2 3 4 5 6 7 8
# Report: x x x
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
scores = loop.run(sess, saver=None, max_step=9)
next(scores)
self.assertEqual(3, sess.run(step))
next(scores)
self.assertEqual(6, sess.run(step))
next(scores)
self.assertEqual(9, sess.run(step))
def test_phases_feed(self):
score = tf.placeholder(tf.float32, [])
loop = tools.Loop(None)
loop.add_phase(
'phase_1', done=True, score=score, summary='', steps=1, report_every=1,
log_every=None, checkpoint_every=None, feed={score: 1})
loop.add_phase(
'phase_2', done=True, score=score, summary='', steps=3, report_every=1,
log_every=None, checkpoint_every=None, feed={score: 2})
loop.add_phase(
'phase_3', done=True, score=score, summary='', steps=2, report_every=1,
log_every=None, checkpoint_every=None, feed={score: 3})
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
scores = list(loop.run(sess, saver=None, max_step=15))
self.assertAllEqual([1, 2, 2, 2, 3, 3, 1, 2, 2, 2, 3, 3, 1, 2, 2], scores)
def test_average_score_over_phases(self):
loop = tools.Loop(None)
loop.add_phase(
'phase_1', done=True, score=1, summary='', steps=1, report_every=2)
loop.add_phase(
'phase_2', done=True, score=2, summary='', steps=2, report_every=5)
# Score: 1 2 2 1 2 2 1 2 2 1 2 2 1 2 2 1 2
# Report 1: x x x
# Report 2: x x
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
scores = list(loop.run(sess, saver=None, max_step=17))
self.assertAllEqual([1, 2, 1, 2, 1], scores)
def test_not_done(self):
step = tf.Variable(0, False, dtype=tf.int32, name='step')
done = tf.equal((step + 1) % 2, 0)
score = tf.cast(step, tf.float32)
loop = tools.Loop(None, step)
loop.add_phase(
'phase_1', done, score, summary='', steps=1, report_every=3)
# Score: 0 1 2 3 4 5 6 7 8
# Done: x x x x
# Report: x x x
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
scores = list(loop.run(sess, saver=None, max_step=9))
self.assertAllEqual([1, 4, 7], scores)
def test_not_done_batch(self):
step = tf.Variable(0, False, dtype=tf.int32, name='step')
done = tf.equal([step % 3, step % 4], 0)
score = tf.cast([step, step ** 2], tf.float32)
loop = tools.Loop(None, step)
loop.add_phase(
'phase_1', done, score, summary='', steps=1, report_every=8)
# Step: 0 2 4 6
# Score 1: 0 2 4 6
# Done 1: x x
# Score 2: 0 4 16 32
# Done 2: x x
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
scores = list(loop.run(sess, saver=None, max_step=8))
self.assertEqual(8, sess.run(step))
self.assertAllEqual([(0 + 0 + 16 + 6) / 4], scores)
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,49 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mock algorithm for testing reinforcement learning code."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class MockAlgorithm(object):
"""Produce random actions and empty summaries."""
def __init__(self, envs):
"""Produce random actions and empty summaries.
Args:
envs: List of in-graph environments.
"""
self._envs = envs
def begin_episode(self, unused_agent_indices):
return tf.constant('')
def perform(self, unused_observ):
shape = (len(self._envs),) + self._envs[0].action_space.shape
low = self._envs[0].action_space.low
high = self._envs[0].action_space.high
action = tf.random_uniform(shape) * (high - low) + low
return action, tf.constant('')
def experience(self, *unused_transition):
return tf.constant('')
def end_episode(self, unused_agent_indices):
return tf.constant('')

View File

@ -0,0 +1,86 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mock environment for testing reinforcement learning code."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import gym.spaces
import numpy as np
class MockEnvironment(object):
"""Generate random agent input and keep track of statistics."""
def __init__(self, observ_shape, action_shape, min_duration, max_duration):
"""Generate random agent input and keep track of statistics.
Args:
observ_shape: Shape for the random observations.
action_shape: Shape for the action space.
min_duration: Minimum number of steps per episode.
max_duration: Maximum number of steps per episode.
Attributes:
steps: List of actual simulated lengths for all episodes.
durations: List of decided lengths for all episodes.
"""
self._observ_shape = observ_shape
self._action_shape = action_shape
self._min_duration = min_duration
self._max_duration = max_duration
self._random = np.random.RandomState(0)
self.steps = []
self.durations = []
@property
def observation_space(self):
low = np.zeros(self._observ_shape)
high = np.ones(self._observ_shape)
return gym.spaces.Box(low, high)
@property
def action_space(self):
low = np.zeros(self._action_shape)
high = np.ones(self._action_shape)
return gym.spaces.Box(low, high)
@property
def unwrapped(self):
return self
def step(self, action):
assert self.action_space.contains(action)
assert self.steps[-1] < self.durations[-1]
self.steps[-1] += 1
observ = self._current_observation()
reward = self._current_reward()
done = self.steps[-1] >= self.durations[-1]
info = {}
return observ, reward, done, info
def reset(self):
duration = self._random.randint(self._min_duration, self._max_duration + 1)
self.steps.append(0)
self.durations.append(duration)
return self._current_observation()
def _current_observation(self):
return self._random.uniform(0, 1, self._observ_shape)
def _current_reward(self):
return self._random.uniform(-1, 1)
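
As a quick sanity check, the mock environment can be rolled out for a single episode; the shapes and durations below are arbitrary.

env = MockEnvironment(observ_shape=(2,), action_shape=(3,),
                      min_duration=3, max_duration=5)
observ = env.reset()
done = False
while not done:
  observ, reward, done, _ = env.step(env.action_space.sample())
print(env.steps, env.durations)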

View File

@ -0,0 +1,145 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""In-graph simulation step of a vecrotized algorithm with environments."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from pybullet_envs.minitaur.agents.tools import streaming_mean
def simulate(batch_env, algo, log=True, reset=False):
"""Simulation step of a vecrotized algorithm with in-graph environments.
Integrates the operations implemented by the algorithm and the environments
into a combined operation.
Args:
batch_env: In-graph batch environment.
algo: Algorithm instance implementing required operations.
log: Tensor indicating whether to compute and return summaries.
reset: Tensor causing all environments to reset.
Returns:
Tuple of tensors containing done flags for the current episodes, possibly
intermediate scores for the episodes, and a summary tensor.
"""
def _define_begin_episode(agent_indices):
"""Reset environments, intermediate scores and durations for new episodes.
Args:
agent_indices: Tensor containing batch indices starting an episode.
Returns:
Summary tensor.
"""
assert agent_indices.shape.ndims == 1
zero_scores = tf.zeros_like(agent_indices, tf.float32)
zero_durations = tf.zeros_like(agent_indices)
reset_ops = [
batch_env.reset(agent_indices),
tf.scatter_update(score, agent_indices, zero_scores),
tf.scatter_update(length, agent_indices, zero_durations)]
with tf.control_dependencies(reset_ops):
return algo.begin_episode(agent_indices)
def _define_step():
"""Request actions from the algorithm and apply them to the environments.
Increments the lengths of all episodes and increases their scores by the
current reward. After stepping the environments, provides the full
transition tuple to the algorithm.
Returns:
Summary tensor.
"""
prevob = batch_env.observ + 0 # Ensure a copy of the variable value.
action, step_summary = algo.perform(prevob)
action.set_shape(batch_env.action.shape)
with tf.control_dependencies([batch_env.simulate(action)]):
add_score = score.assign_add(batch_env.reward)
inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
with tf.control_dependencies([add_score, inc_length]):
experience_summary = algo.experience(
prevob, batch_env.action, batch_env.reward, batch_env.done,
batch_env.observ)
return tf.summary.merge([step_summary, experience_summary])
def _define_end_episode(agent_indices):
"""Notify the algorithm of ending episodes.
Also updates the mean score and length counters used for summaries.
Args:
agent_indices: Tensor holding batch indices that end their episodes.
Returns:
Summary tensor.
"""
assert agent_indices.shape.ndims == 1
submit_score = mean_score.submit(tf.gather(score, agent_indices))
submit_length = mean_length.submit(
tf.cast(tf.gather(length, agent_indices), tf.float32))
with tf.control_dependencies([submit_score, submit_length]):
return algo.end_episode(agent_indices)
def _define_summaries():
"""Reset the average score and duration, and return them as summary.
Returns:
Summary string.
"""
score_summary = tf.cond(
tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
length_summary = tf.cond(
tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
return tf.summary.merge([score_summary, length_summary])
with tf.name_scope('simulate'):
log = tf.convert_to_tensor(log)
reset = tf.convert_to_tensor(reset)
with tf.variable_scope('simulate_temporary'):
score = tf.Variable(
tf.zeros(len(batch_env), dtype=tf.float32), False, name='score')
length = tf.Variable(
tf.zeros(len(batch_env), dtype=tf.int32), False, name='length')
mean_score = streaming_mean.StreamingMean((), tf.float32)
mean_length = streaming_mean.StreamingMean((), tf.float32)
agent_indices = tf.cond(
reset,
lambda: tf.range(len(batch_env)),
lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
begin_episode = tf.cond(
tf.cast(tf.shape(agent_indices)[0], tf.bool),
lambda: _define_begin_episode(agent_indices), str)
with tf.control_dependencies([begin_episode]):
step = _define_step()
with tf.control_dependencies([step]):
agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
end_episode = tf.cond(
tf.cast(tf.shape(agent_indices)[0], tf.bool),
lambda: _define_end_episode(agent_indices), str)
with tf.control_dependencies([end_episode]):
summary = tf.summary.merge([
_define_summaries(), begin_episode, step, end_episode])
with tf.control_dependencies([summary]):
done, score = tf.identity(batch_env.done), tf.identity(score)
return done, score, summary
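For reference, here is a minimal sketch (editorial addition, not part of this commit) of driving the simulate() op from a TF1 session. It reuses the mock utilities exercised by the test below; the import path pybullet_envs.minitaur.agents.tools and the batch size of four are assumptions.
# Hypothetical driver for simulate(); mirrors the setup in the simulation test.
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools

# Four mock environments with a fixed episode length of 5, wrapped to 32-bit types.
envs = [tools.wrappers.ConvertTo32Bit(
    tools.MockEnvironment(observ_shape=(2, 3), action_shape=(3,),
                          min_duration=5, max_duration=5))
        for _ in range(4)]
batch_env = tools.InGraphBatchEnv(tools.BatchEnv(envs, blocking=True))
algo = tools.MockAlgorithm(batch_env)
done, score, summary = tools.simulate(batch_env, algo, log=False, reset=False)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for _ in range(10):
    sess.run(done)  # Each call performs one in-graph step of all environments.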

View File

@ -0,0 +1,98 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the simulation operation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
class SimulateTest(tf.test.TestCase):
def test_done_automatic(self):
batch_env = self._create_test_batch_env((1, 2, 3, 4))
algo = tools.MockAlgorithm(batch_env)
done, _, _ = tools.simulate(batch_env, algo, log=False, reset=False)
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
self.assertAllEqual([True, False, False, False], sess.run(done))
self.assertAllEqual([True, True, False, False], sess.run(done))
self.assertAllEqual([True, False, True, False], sess.run(done))
self.assertAllEqual([True, True, False, True], sess.run(done))
def test_done_forced(self):
reset = tf.placeholder_with_default(False, ())
batch_env = self._create_test_batch_env((2, 4))
algo = tools.MockAlgorithm(batch_env)
done, _, _ = tools.simulate(batch_env, algo, False, reset)
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
self.assertAllEqual([False, False], sess.run(done))
self.assertAllEqual([False, False], sess.run(done, {reset: True}))
self.assertAllEqual([True, False], sess.run(done))
self.assertAllEqual([False, False], sess.run(done, {reset: True}))
self.assertAllEqual([True, False], sess.run(done))
self.assertAllEqual([False, False], sess.run(done))
self.assertAllEqual([True, True], sess.run(done))
def test_reset_automatic(self):
batch_env = self._create_test_batch_env((1, 2, 3, 4))
algo = tools.MockAlgorithm(batch_env)
done, _, _ = tools.simulate(batch_env, algo, log=False, reset=False)
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
for _ in range(10):
sess.run(done)
self.assertAllEqual([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], batch_env[0].steps)
self.assertAllEqual([2, 2, 2, 2, 2], batch_env[1].steps)
self.assertAllEqual([3, 3, 3, 1], batch_env[2].steps)
self.assertAllEqual([4, 4, 2], batch_env[3].steps)
def test_reset_forced(self):
reset = tf.placeholder_with_default(False, ())
batch_env = self._create_test_batch_env((2, 4))
algo = tools.MockAlgorithm(batch_env)
done, _, _ = tools.simulate(batch_env, algo, False, reset)
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
sess.run(done)
sess.run(done, {reset: True})
sess.run(done)
sess.run(done, {reset: True})
sess.run(done)
sess.run(done)
sess.run(done)
self.assertAllEqual([1, 2, 2, 2], batch_env[0].steps)
self.assertAllEqual([1, 2, 4], batch_env[1].steps)
def _create_test_batch_env(self, durations):
envs = []
for duration in durations:
env = tools.MockEnvironment(
observ_shape=(2, 3), action_shape=(3,),
min_duration=duration, max_duration=duration)
env = tools.wrappers.ConvertTo32Bit(env)
envs.append(env)
batch_env = tools.BatchEnv(envs, blocking=True)
batch_env = tools.InGraphBatchEnv(batch_env)
return batch_env
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,67 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compute a streaming estimation of the mean of submitted tensors."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class StreamingMean(object):
"""Compute a streaming estimation of the mean of submitted tensors."""
def __init__(self, shape, dtype):
"""Specify the shape and dtype of the mean to be estimated.
Note that a float mean of zero submitted elements is NaN, while computing
the integer mean of zero elements raises a division by zero error.
Args:
shape: Shape of the mean to compute.
dtype: Data type of the mean to compute.
"""
self._dtype = dtype
self._sum = tf.Variable(lambda: tf.zeros(shape, dtype), False)
self._count = tf.Variable(lambda: 0, trainable=False)
@property
def value(self):
"""The current value of the mean."""
return self._sum / tf.cast(self._count, self._dtype)
@property
def count(self):
"""The number of submitted samples."""
return self._count
def submit(self, value):
"""Submit a single or batch tensor to refine the streaming mean."""
# Add a batch dimension if necessary.
if value.shape.ndims == self._sum.shape.ndims:
value = value[None, ...]
return tf.group(
self._sum.assign_add(tf.reduce_sum(value, 0)),
self._count.assign_add(tf.shape(value)[0]))
def clear(self):
"""Return the mean estimate and reset the streaming statistics."""
value = self._sum / tf.cast(self._count, self._dtype)
with tf.control_dependencies([value]):
reset_value = self._sum.assign(tf.zeros_like(self._sum))
reset_count = self._count.assign(0)
with tf.control_dependencies([reset_value, reset_count]):
return tf.identity(value)
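A short usage sketch (editorial addition): submit batches of values and read the running mean, which clear() also resets. The placeholder-based feeding below is only one way to provide values.
# Illustrative TF1 usage of StreamingMean; variable names are arbitrary.
import tensorflow as tf
from pybullet_envs.minitaur.agents.tools import streaming_mean

mean = streaming_mean.StreamingMean((), tf.float32)
values = tf.placeholder(tf.float32, [None])
submit = mean.submit(values)
read_and_reset = mean.clear()

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(submit, {values: [1.0, 2.0, 3.0]})
  sess.run(submit, {values: [4.0]})
  print(sess.run(read_and_reset))  # 2.5; the statistics are reset afterwards.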

View File

@ -0,0 +1,552 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wrappers for OpenAI Gym environments."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import atexit
import functools
import multiprocessing
import sys
import traceback
import gym
import gym.spaces
import numpy as np
import tensorflow as tf
class AutoReset(object):
"""Automatically reset environment when the episode is done."""
def __init__(self, env):
self._env = env
self._done = True
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
if self._done:
observ, reward, done, info = self._env.reset(), 0.0, False, {}
else:
observ, reward, done, info = self._env.step(action)
self._done = done
return observ, reward, done, info
def reset(self):
self._done = False
return self._env.reset()
class ActionRepeat(object):
"""Repeat the agent action multiple steps."""
def __init__(self, env, amount):
self._env = env
self._amount = amount
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
done = False
total_reward = 0
current_step = 0
while current_step < self._amount and not done:
observ, reward, done, info = self._env.step(action)
total_reward += reward
current_step += 1
return observ, total_reward, done, info
class RandomStart(object):
"""Perform random number of random actions at the start of the episode."""
def __init__(self, env, max_steps):
self._env = env
self._max_steps = max_steps
def __getattr__(self, name):
return getattr(self._env, name)
def reset(self):
observ = self._env.reset()
random_steps = np.random.randint(0, self._max_steps)
for _ in range(random_steps):
action = self._env.action_space.sample()
observ, unused_reward, done, unused_info = self._env.step(action)
if done:
tf.logging.warning('Episode ended during random start.')
return self.reset()
return observ
class FrameHistory(object):
"""Augment the observation with past observations."""
def __init__(self, env, past_indices, flatten):
"""Augment the observation with past observations.
Implemented as a Numpy ring buffer holding the necessary past observations.
Args:
env: OpenAI Gym environment to wrap.
past_indices: List of non-negative integers indicating the time offsets
from the current time step of observations to include.
flatten: Concatenate the past observations rather than stacking them.
Raises:
KeyError: The current observation is not included in the indices.
"""
if 0 not in past_indices:
raise KeyError('Past indices should include 0 for the current frame.')
self._env = env
self._past_indices = past_indices
self._step = 0
self._buffer = None
self._capacity = max(past_indices)
self._flatten = flatten
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
low = self._env.observation_space.low
high = self._env.observation_space.high
low = np.repeat(low[None, ...], len(self._past_indices), 0)
high = np.repeat(high[None, ...], len(self._past_indices), 0)
if self._flatten:
low = np.reshape(low, (-1,) + low.shape[2:])
high = np.reshape(high, (-1,) + high.shape[2:])
return gym.spaces.Box(low, high)
def step(self, action):
observ, reward, done, info = self._env.step(action)
self._step += 1
self._buffer[self._step % self._capacity] = observ
observ = self._select_frames()
return observ, reward, done, info
def reset(self):
observ = self._env.reset()
self._buffer = np.repeat(observ[None, ...], self._capacity, 0)
self._step = 0
return self._select_frames()
def _select_frames(self):
indices = [
(self._step - index) % self._capacity for index in self._past_indices]
observ = self._buffer[indices]
if self._flatten:
observ = np.reshape(observ, (-1,) + observ.shape[2:])
return observ
class FrameDelta(object):
"""Convert the observation to a difference from the previous observation."""
def __init__(self, env):
self._env = env
self._last = None
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
low = self._env.observation_space.low
high = self._env.observation_space.high
low, high = low - high, high - low
return gym.spaces.Box(low, high)
def step(self, action):
observ, reward, done, info = self._env.step(action)
delta = observ - self._last
self._last = observ
return delta, reward, done, info
def reset(self):
observ = self._env.reset()
self._last = observ
return observ
class RangeNormalize(object):
"""Normalize the specialized observation and action ranges to [-1, 1]."""
def __init__(self, env, observ=None, action=None):
self._env = env
self._should_normalize_observ = (
observ is not False and self._is_finite(self._env.observation_space))
if observ is True and not self._should_normalize_observ:
raise ValueError('Cannot normalize infinite observation range.')
if observ is None and not self._should_normalize_observ:
tf.logging.info('Not normalizing infinite observation range.')
self._should_normalize_action = (
action is not False and self._is_finite(self._env.action_space))
if action is True and not self._should_normalize_action:
raise ValueError('Cannot normalize infinite action range.')
if action is None and not self._should_normalize_action:
tf.logging.info('Not normalizing infinite action range.')
def __getattr__(self, name):
return getattr(self._env, name)
@property
def observation_space(self):
space = self._env.observation_space
if not self._should_normalize_observ:
return space
return gym.spaces.Box(-np.ones(space.shape), np.ones(space.shape))
@property
def action_space(self):
space = self._env.action_space
if not self._should_normalize_action:
return space
return gym.spaces.Box(-np.ones(space.shape), np.ones(space.shape))
def step(self, action):
if self._should_normalize_action:
action = self._denormalize_action(action)
observ, reward, done, info = self._env.step(action)
if self._should_normalize_observ:
observ = self._normalize_observ(observ)
return observ, reward, done, info
def reset(self):
observ = self._env.reset()
if self._should_normalize_observ:
observ = self._normalize_observ(observ)
return observ
def _denormalize_action(self, action):
min_ = self._env.action_space.low
max_ = self._env.action_space.high
action = (action + 1) / 2 * (max_ - min_) + min_
return action
def _normalize_observ(self, observ):
min_ = self._env.observation_space.low
max_ = self._env.observation_space.high
observ = 2 * (observ - min_) / (max_ - min_) - 1
return observ
def _is_finite(self, space):
return np.isfinite(space.low).all() and np.isfinite(space.high).all()
class ClipAction(object):
"""Clip out of range actions to the action space of the environment."""
def __init__(self, env):
self._env = env
def __getattr__(self, name):
return getattr(self._env, name)
@property
def action_space(self):
shape = self._env.action_space.shape
return gym.spaces.Box(-np.inf * np.ones(shape), np.inf * np.ones(shape))
def step(self, action):
action_space = self._env.action_space
action = np.clip(action, action_space.low, action_space.high)
return self._env.step(action)
class LimitDuration(object):
"""End episodes after specified number of steps."""
def __init__(self, env, duration):
self._env = env
self._duration = duration
self._step = None
def __getattr__(self, name):
return getattr(self._env, name)
def step(self, action):
if self._step is None:
raise RuntimeError('Must reset environment.')
observ, reward, done, info = self._env.step(action)
self._step += 1
if self._step >= self._duration:
done = True
self._step = None
return observ, reward, done, info
def reset(self):
self._step = 0
return self._env.reset()
class ExternalProcess(object):
"""Step environment in a separate process for lock free paralellism."""
# Message types for communication via the pipe.
_ACTION = 1
_RESET = 2
_CLOSE = 3
_ATTRIBUTE = 4
_TRANSITION = 5
_OBSERV = 6
_EXCEPTION = 7
_VALUE = 8
def __init__(self, constructor):
"""Step environment in a separate process for lock free paralellism.
The environment will be created in the external process by calling the
specified callable. This can be an environment class, or a function
creating the environment and potentially wrapping it. The returned
environment should not access global variables.
Args:
constructor: Callable that creates and returns an OpenAI gym environment.
Attributes:
observation_space: The cached observation space of the environment.
action_space: The cached action space of the environment.
"""
self._conn, conn = multiprocessing.Pipe()
self._process = multiprocessing.Process(
target=self._worker, args=(constructor, conn))
atexit.register(self.close)
self._process.start()
self._observ_space = None
self._action_space = None
@property
def observation_space(self):
if not self._observ_space:
self._observ_space = self.__getattr__('observation_space')
return self._observ_space
@property
def action_space(self):
if not self._action_space:
self._action_space = self.__getattr__('action_space')
return self._action_space
def __getattr__(self, name):
"""Request an attribute from the environment.
Note that this involves communication with the external process, so it can
be slow.
Args:
name: Attribute to access.
Returns:
Value of the attribute.
"""
self._conn.send((self._ATTRIBUTE, name))
return self._receive(self._VALUE)
def step(self, action, blocking=True):
"""Step the environment.
Args:
action: The action to apply to the environment.
blocking: Whether to wait for the result.
Returns:
Transition tuple when blocking, otherwise callable that returns the
transition tuple.
"""
self._conn.send((self._ACTION, action))
if blocking:
return self._receive(self._TRANSITION)
else:
return functools.partial(self._receive, self._TRANSITION)
def reset(self, blocking=True):
"""Reset the environment.
Args:
blocking: Whether to wait for the result.
Returns:
New observation when blocking, otherwise callable that returns the new
observation.
"""
self._conn.send((self._RESET, None))
if blocking:
return self._receive(self._OBSERV)
else:
return functools.partial(self._receive, self._OBSERV)
def close(self):
"""Send a close message to the external process and join it."""
try:
self._conn.send((self._CLOSE, None))
self._conn.close()
except IOError:
# The connection was already closed.
pass
self._process.join()
def _receive(self, expected_message):
"""Wait for a message from the worker process and return its payload.
Args:
expected_message: Type of the expected message.
Raises:
Exception: An exception was raised inside the worker process.
KeyError: The received message is not of the expected type.
Returns:
Payload object of the message.
"""
message, payload = self._conn.recv()
# Re-raise exceptions in the main process.
if message == self._EXCEPTION:
stacktrace = payload
raise Exception(stacktrace)
if message == expected_message:
return payload
raise KeyError('Received message of unexpected type {}'.format(message))
def _worker(self, constructor, conn):
"""The process waits for actions and sends back environment results.
Args:
constructor: Constructor for the OpenAI Gym environment.
conn: Connection for communication to the main process.
"""
try:
env = constructor()
while True:
try:
# Only block for short times to have keyboard exceptions be raised.
if not conn.poll(0.1):
continue
message, payload = conn.recv()
except (EOFError, KeyboardInterrupt):
break
if message == self._ACTION:
action = payload
conn.send((self._TRANSITION, env.step(action)))
continue
if message == self._RESET:
assert payload is None
conn.send((self._OBSERV, env.reset()))
continue
if message == self._ATTRIBUTE:
name = payload
conn.send((self._VALUE, getattr(env, name)))
continue
if message == self._CLOSE:
assert payload is None
break
raise KeyError('Received message of unknown type {}'.format(message))
except Exception: # pylint: disable=broad-except
stacktrace = ''.join(traceback.format_exception(*sys.exc_info()))
conn.send((self._EXCEPTION, stacktrace))
tf.logging.error('Error in environment process: {}'.format(stacktrace))
conn.close()
class ConvertTo32Bit(object):
"""Convert data types of an OpenAI Gym environment to 32 bit."""
def __init__(self, env):
"""Convert data types of an OpenAI Gym environment to 32 bit.
Args:
env: OpenAI Gym environment.
"""
self._env = env
def __getattr__(self, name):
"""Forward unimplemented attributes to the original environment.
Args:
name: Attribute that was accessed.
Returns:
Value behind the attribute name in the wrapped environment.
"""
return getattr(self._env, name)
def step(self, action):
"""Forward action to the wrapped environment.
Args:
action: Action to apply to the environment.
Raises:
ValueError: Invalid action.
Returns:
Converted observation, converted reward, done flag, and info object.
"""
observ, reward, done, info = self._env.step(action)
observ = self._convert_observ(observ)
reward = self._convert_reward(reward)
return observ, reward, done, info
def reset(self):
"""Reset the environment and convert the resulting observation.
Returns:
Converted observation.
"""
observ = self._env.reset()
observ = self._convert_observ(observ)
return observ
def _convert_observ(self, observ):
"""Convert the observation to 32 bits.
Args:
observ: Numpy observation.
Raises:
ValueError: Observation contains infinite values.
Returns:
Numpy observation with 32-bit data type.
"""
if not np.isfinite(observ).all():
raise ValueError('Infinite observation encountered.')
if observ.dtype == np.float64:
return observ.astype(np.float32)
if observ.dtype == np.int64:
return observ.astype(np.int32)
return observ
def _convert_reward(self, reward):
"""Convert the reward to 32 bits.
Args:
reward: Numpy reward.
Raises:
ValueError: Rewards contain infinite values.
Returns:
Numpy reward with 32-bit data type.
"""
if not np.isfinite(reward).all():
raise ValueError('Infinite reward encountered.')
return np.array(reward, dtype=np.float32)
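The wrappers compose by simple nesting. A hypothetical composition follows (editorial addition; 'Pendulum-v0' is only an example environment id, and the import path is assumed):
# Sketch: stack several of the wrappers above around a standard Gym env.
import gym
import numpy as np
from pybullet_envs.minitaur.agents.tools import wrappers

env = gym.make('Pendulum-v0')
env = wrappers.LimitDuration(env, duration=200)  # End episodes after 200 steps.
env = wrappers.ActionRepeat(env, amount=2)       # Apply each action twice.
env = wrappers.ClipAction(env)                   # Clip out-of-range actions.
env = wrappers.ConvertTo32Bit(env)               # Cast observations and rewards.

observ = env.reset()
action = np.zeros(env.action_space.shape, dtype=np.float32)
observ, reward, done, info = env.step(action)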

View File

@ -0,0 +1,90 @@
# Copyright 2017 The TensorFlow Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for environment wrappers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
from pybullet_envs.minitaur.agents import tools
class ExternalProcessTest(tf.test.TestCase):
def test_close_no_hang_after_init(self):
constructor = functools.partial(
tools.MockEnvironment,
observ_shape=(2, 3), action_shape=(2,),
min_duration=2, max_duration=2)
env = tools.wrappers.ExternalProcess(constructor)
env.close()
def test_close_no_hang_after_step(self):
constructor = functools.partial(
tools.MockEnvironment,
observ_shape=(2, 3), action_shape=(2,),
min_duration=5, max_duration=5)
env = tools.wrappers.ExternalProcess(constructor)
env.reset()
env.step(env.action_space.sample())
env.step(env.action_space.sample())
env.close()
def test_reraise_exception_in_init(self):
constructor = MockEnvironmentCrashInInit
env = tools.wrappers.ExternalProcess(constructor)
with self.assertRaises(Exception):
env.step(env.action_space.sample())
def test_reraise_exception_in_step(self):
constructor = functools.partial(
MockEnvironmentCrashInStep, crash_at_step=3)
env = tools.wrappers.ExternalProcess(constructor)
env.reset()
env.step(env.action_space.sample())
env.step(env.action_space.sample())
with self.assertRaises(Exception):
env.step(env.action_space.sample())
class MockEnvironmentCrashInInit(object):
"""Raise an error when instantiated."""
def __init__(self, *unused_args, **unused_kwargs):
raise RuntimeError()
class MockEnvironmentCrashInStep(tools.MockEnvironment):
"""Raise an error after specified number of steps in an episode."""
def __init__(self, crash_at_step):
super(MockEnvironmentCrashInStep, self).__init__(
observ_shape=(2, 3), action_shape=(2,),
min_duration=crash_at_step + 1, max_duration=crash_at_step + 1)
self._crash_at_step = crash_at_step
def step(self, *args, **kwargs):
transition = super(MockEnvironmentCrashInStep, self).step(*args, **kwargs)
if self.steps[-1] == self._crash_at_step:
raise RuntimeError()
return transition
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,11 @@
This folder contains a number of simulated Minitaur environments implemented using pybullet.
The following two environments are used in the RSS paper "Sim-to-Real: Learning Agile Locomotion For Quadruped Robots":
1) Galloping example: minitaur_reactive_env.py
python minitaur_reactive_env_example.py runs a pre-trained PPO agent that performs a galloping gait.
2) Trotting example: minitaur_trotting_env.py
python minitaur_trotting_env_example.py runs a pre-trained PPO agent that performs a trotting gait.
The rest are experimental environments.
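For a quick smoke test without the pre-trained policies, either environment can also be instantiated and stepped directly. A minimal sketch (editorial addition; keyword arguments other than render are left at their defaults):
# Sketch: step the galloping environment with zero actions.
import numpy as np
from pybullet_envs.minitaur.envs import minitaur_reactive_env

env = minitaur_reactive_env.MinitaurReactiveEnv(render=True)
observation = env.reset()
while True:
  observation, reward, done, _ = env.step(np.zeros(env.action_space.shape))
  if done:
    break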

View File

@ -216,7 +216,7 @@ class MinitaurFourLegStandEnv(minitaur_gym_env.MinitaurGymEnv):
self._env_step_counter += 1
done = self._termination()
obs = self._get_true_observation()
reward = self._reward(action, obs)
reward = self._reward()
if self._log_path is not None:
minitaur_logging.update_episode_proto(self._episode_proto, self.minitaur,
action, self._env_step_counter)
@ -272,7 +272,7 @@ class MinitaurFourLegStandEnv(minitaur_gym_env.MinitaurGymEnv):
np.asarray([0, 0, 1]), np.asarray(local_up))
return local_global_up_dot_product < 0.85 or height < 0.15
def _reward(self, action, obs):
def _reward(self):
roll, pitch, _ = self.minitaur.GetBaseRollPitchYaw()
return 1.0 / (0.001 + math.fabs(roll) + math.fabs(pitch))

View File

@ -1,8 +1,4 @@
r"""An example to use simple_ppo_agent.
blaze run -c opt \
//robotics/reinforcement_learning/minitaur/envs:minitaur_reactive_env_example
"""
r"""Running a pre-trained ppo agent on minitaur_reactive_env."""
from __future__ import absolute_import
from __future__ import division
@ -11,13 +7,13 @@ from __future__ import print_function
import os
import time
import tensorflow as tf
from agents.scripts import utility
from pybullet_envs.minitaur.agents.scripts import utility
import pybullet_data
import simple_ppo_agent
flags = tf.app.flags
FLAGS = tf.app.flags.FLAGS
LOG_DIR = (
"testdata/minitaur_reactive_env_test")
LOG_DIR = os.path.join(pybullet_data.getDataPath(), "policies/ppo/minitaur_reactive_env")
CHECKPOINT = "model.ckpt-14000000"
@ -43,7 +39,6 @@ def main(argv):
while True:
action = agent.get_action([observation])
observation, reward, done, _ = env.step(action[0])
# This sleep is to prevent serial communication error on the real robot.
time.sleep(0.002)
sum_reward += reward
if done:

View File

@ -1,41 +1,51 @@
"""An example to run the minitaur environment of trotting gait.
r"""Running a pre-trained ppo agent on minitaur_trotting_env."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""
import time
import os
import numpy as np
import time
import tensorflow as tf
from pybullet_envs.minitaur.envs import minitaur_gym_env
from pybullet_envs.minitaur.envs import minitaur_trotting_env
from pybullet_envs.minitaur.agents.scripts import utility
import pybullet_data
import simple_ppo_agent
#FLAGS = tf.flags.FLAGS
#tf.flags.DEFINE_string("log_path", None, "The directory to write the log file.")
flags = tf.app.flags
FLAGS = tf.app.flags.FLAGS
LOG_DIR = os.path.join(pybullet_data.getDataPath(), "policies/ppo/minitaur_trotting_env")
CHECKPOINT = "model.ckpt-14000000"
def main(unused_argv):
environment = minitaur_trotting_env.MinitaurTrottingEnv(
urdf_version=minitaur_gym_env.RAINBOW_DASH_V0_URDF_VERSION,
use_signal_in_observation=False,
use_angle_in_observation=False,
render=True,
log_path=os.getcwd())
def main(argv):
del argv # Unused.
config = utility.load_config(LOG_DIR)
policy_layers = config.policy_layers
value_layers = config.value_layers
env = config.env(render=True)
network = config.network
np.random.seed(100)
sum_reward = 0
environment.reset()
with tf.Session() as sess:
agent = simple_ppo_agent.SimplePPOPolicy(
sess,
env,
network,
policy_layers=policy_layers,
value_layers=value_layers,
checkpoint=os.path.join(LOG_DIR, CHECKPOINT))
steps = 5000
for _ in range(steps):
# Sleep to prevent serial buffer overflow on microcontroller.
time.sleep(0.002)
action = [0] * 8
_, reward, done, _ = environment.step(action)
sum_reward += reward
if done:
break
tf.logging.info("reward: {}".format(sum_reward))
sum_reward = 0
observation = env.reset()
while True:
action = agent.get_action([observation])
observation, reward, done, _ = env.step(action[0])
time.sleep(0.002)
sum_reward += reward
if done:
break
tf.logging.info("reward: %s", sum_reward)
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run()
tf.app.run(main)