Enable 2 changes to composer environment resets:

DeepMind · copybara-github · commit c2959e6b94c4 · 2023-11-28T02:25:30.000-08:00
- Allow skipping recompilation of the mjcf model every episode (`recompile_mjcf_every_episode=False`). This saves a lot of time between resets when no changes are made to the mjcf model.
- Allow a fixed initial state for every episode (`fixed_initial_state=True`). This makes episodes repeatable if desired.

PiperOrigin-RevId: 585901093
Change-Id: I9d0b29dc1aba80113b1437ff3fff3f06862923ef
diff --git a/dm_control/composer/environment.py b/dm_control/composer/environment.py
@@ -292,19 +292,26 @@ def control_timestep(self):
 class Environment(_CommonEnvironment, dm_env.Environment):
   """Reinforcement learning environment for Composer tasks."""
 
-  def __init__(self, task, time_limit=float('inf'), random_state=None,
-               n_sub_steps=None,
-               raise_exception_on_physics_error=True,
-               strip_singleton_obs_buffer_dim=False,
-               max_reset_attempts=1,
-               delayed_observation_padding=ObservationPadding.ZERO,
-               legacy_step: bool = True):
+  def __init__(
+      self,
+      task,
+      time_limit=float('inf'),
+      random_state=None,
+      n_sub_steps=None,
+      raise_exception_on_physics_error=True,
+      strip_singleton_obs_buffer_dim=False,
+      max_reset_attempts=1,
+      recompile_mjcf_every_episode=True,
+      fixed_initial_state=False,
+      delayed_observation_padding=ObservationPadding.ZERO,
+      legacy_step: bool = True,
+  ):
     """Initializes an instance of `Environment`.
 
     Args:
       task: Instance of `composer.base.Task`.
-      time_limit: (optional) A float, the time limit in seconds beyond which
-        an episode is forced to terminate.
+      time_limit: (optional) A float, the time limit in seconds beyond which an
+        episode is forced to terminate.
       random_state: (optional) an int seed or `np.random.RandomState` instance.
       n_sub_steps: (DEPRECATED) An integer, number of physics steps to take per
         agent control step. New code should instead override the
@@ -313,15 +320,22 @@ def __init__(self, task, time_limit=float('inf'), random_state=None,
         `PhysicsError` should be raised as an exception. If `False`, physics
         errors will result in the current episode being terminated with a
         warning logged, and a new episode started.
-      strip_singleton_obs_buffer_dim: (optional) A boolean, if `True`,
-        the array shape of observations with `buffer_size == 1` will not have a
-        leading buffer dimension.
+      strip_singleton_obs_buffer_dim: (optional) A boolean, if `True`, the array
+        shape of observations with `buffer_size == 1` will not have a leading
+        buffer dimension.
       max_reset_attempts: (optional) Maximum number of times to try resetting
-        the environment. If an `EpisodeInitializationError` is raised
-        during this process, an environment reset is reattempted up to this
-        number of times. If this count is exceeded then the most recent
-        exception will be allowed to propagate. Defaults to 1, i.e. no failure
-        is allowed.
+        the environment. If an `EpisodeInitializationError` is raised during
+        this process, an environment reset is reattempted up to this number of
+        times. If this count is exceeded then the most recent exception will be
+        allowed to propagate. Defaults to 1, i.e. no failure is allowed.
+      recompile_mjcf_every_episode: (optional) If True, recompiles the mjcf
+        model at every reset. If False, the `initialize_episode_mjcf` and
+        `after_compile` steps are skipped after the first reset, which allows a
+        speedup if no changes are made to the model.
+      fixed_initial_state: (optional) If True, the starting state of every
+        episode will be the same, meaning an identical sequence of actions will
+        lead to an identical final state. If False, the starting state is
+        randomized at every episode.
       delayed_observation_padding: (optional) An `ObservationPadding` enum value
         specifying the padding behavior of the initial buffers for delayed
         observables. If `ZERO` then the buffer is initially filled with zeroes.
@@ -340,6 +354,10 @@ def __init__(self, task, time_limit=float('inf'), random_state=None,
         delayed_observation_padding=delayed_observation_padding,
         legacy_step=legacy_step)
     self._max_reset_attempts = max_reset_attempts
+    self._recompile_mjcf_every_episode = recompile_mjcf_every_episode
+    self._mjcf_never_compiled = True
+    self._fixed_initial_state = fixed_initial_state
+    self._fixed_random_state = self._random_state.get_state()
     self._reset_next_step = True
 
   def reset(self):
@@ -355,8 +373,15 @@ def reset(self):
           raise
 
   def _reset_attempt(self):
-    self._hooks.initialize_episode_mjcf(self._random_state)
-    self._recompile_physics_and_update_observables()
+    if self._recompile_mjcf_every_episode or self._mjcf_never_compiled:
+      if self._fixed_initial_state:
+        self._random_state.set_state(self._fixed_random_state)
+      self._hooks.initialize_episode_mjcf(self._random_state)
+      self._recompile_physics_and_update_observables()
+      self._mjcf_never_compiled = False
+
+    if self._fixed_initial_state:
+      self._random_state.set_state(self._fixed_random_state)
     with self._physics.reset_context():
       self._hooks.initialize_episode(self._physics_proxy, self._random_state)
     self._observation_updater.reset(self._physics_proxy, self._random_state)
diff --git a/dm_control/composer/environment_test.py b/dm_control/composer/environment_test.py
@@ -53,6 +53,25 @@ def initialize_episode(self, physics, random_state):
       raise composer.EpisodeInitializationError()
 
 
+class DummyTaskWithRandomObservation(composer.NullTask):
+
+  def __init__(self):
+    null_entity = composer.ModelWrapperEntity(mjcf.RootElement())
+    super().__init__(null_entity)
+
+    self._observation = [0.0] * 1000
+
+  def initialize_episode(self, physics, random_state):
+    del physics
+    self._observation = random_state.randint(1000, size=1000)
+
+  @property
+  def task_observables(self):
+    random_int = observable.Generic(lambda physics: self._observation)
+    random_int.enabled = True
+    return {'random_int': random_int}
+
+
 class EnvironmentTest(parameterized.TestCase):
 
   def test_failed_resets(self):
@@ -96,5 +115,48 @@ def test_can_provide_observation(self):
       self.assertLen(obs, 1)
       np.testing.assert_array_equal(obs['time'], env.physics.time())
 
+  def test_dont_compile_mjcf_between_episodes(self):
+    class AfterCompileHook(object):
+
+      def __init__(self):
+        self.after_compile_call_count = 0
+
+      def __call__(self, physics, random_state):
+        del physics, random_state
+        self.after_compile_call_count += 1
+
+    after_compile_hook = AfterCompileHook()
+    task = DummyTask()
+    env = composer.Environment(task, recompile_mjcf_every_episode=False)
+    env.add_extra_hook('after_compile', after_compile_hook)
+    env.reset()
+    self.assertEqual(after_compile_hook.after_compile_call_count, 1)
+    for _ in range(4):
+      env.reset()
+      env.step([])
+
+    # Check the hook was not called again on subsequent resets.
+    self.assertEqual(after_compile_hook.after_compile_call_count, 1)
+
+  def test_fixed_initial_state(self):
+    task = DummyTaskWithRandomObservation()
+    fixed_env = composer.Environment(task, fixed_initial_state=True)
+    non_fixed_env = composer.Environment(task, fixed_initial_state=False)
+    fixed_obs = fixed_env.reset().observation['random_int']
+    non_fixed_obs = non_fixed_env.reset().observation['random_int']
+    for _ in range(3):
+      np.testing.assert_array_equal(
+          fixed_env.reset().observation['random_int'], fixed_obs
+      )
+      self.assertTrue(
+          np.any(
+              np.not_equal(
+                  np.asarray(non_fixed_obs),
+                  np.asarray(non_fixed_env.reset().observation['random_int']),
+              )
+          )
+      )
+
+
 if __name__ == '__main__':
   absltest.main()