Add Wasserstein Policy Optimization (WPO, http://arxiv.org/pdf/2505.00663) to Acme

Acme Contributor · copybara-github · commit ed4e57a794ff · 2025-05-03T02:56:08.000-07:00
PiperOrigin-RevId: 754303872
Change-Id: I1e360c066abc5a122289b8ff87bb3794674dc61f
diff --git a/acme/jax/losses/__init__.py b/acme/jax/losses/__init__.py
@@ -18,3 +18,6 @@
 from acme.jax.losses.mpo import MPO
 from acme.jax.losses.mpo import MPOParams
 from acme.jax.losses.mpo import MPOStats
+from acme.jax.losses.wpo import WPO
+from acme.jax.losses.wpo import WPOParams
+from acme.jax.losses.wpo import WPOStats
diff --git a/examples/baselines/rl_continuous/run_wpo.py b/examples/baselines/rl_continuous/run_wpo.py
@@ -0,0 +1,89 @@
+# Copyright 2018 DeepMind Technologies Limited. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Example running WPO on continuous control tasks."""
+
+from absl import flags
+from acme import specs
+from acme.agents.jax import wpo
+from acme.agents.jax.wpo import types as wpo_types
+import helpers
+from absl import app
+from acme.jax import experiments
+from acme.utils import lp_utils
+import launchpad as lp
+
+RUN_DISTRIBUTED = flags.DEFINE_bool(
+    'run_distributed', True, 'Should an agent be executed in a distributed '
+    'way. If False, will run single-threaded.')
+ENV_NAME = flags.DEFINE_string(
+    'env_name', 'gym:HalfCheetah-v2',
+    'What environment to run on, in the format {gym|control}:{task}, '
+    'where "control" refers to the DM control suite. DM Control tasks are '
+    'further split into {domain_name}:{task_name}.')
+SEED = flags.DEFINE_integer('seed', 0, 'Random seed.')
+NUM_STEPS = flags.DEFINE_integer(
+    'num_steps', 1_000_000,
+    'Number of environment steps to run the experiment for.')
+EVAL_EVERY = flags.DEFINE_integer(
+    'eval_every', 50_000,
+    'How often (in actor environment steps) to run evaluation episodes.')
+EVAL_EPISODES = flags.DEFINE_integer(
+    'evaluation_episodes', 10,
+    'Number of evaluation episodes to run periodically.')
+
+
+def build_experiment_config():
+  """Builds WPO experiment config which can be executed in different ways."""
+  suite, task = ENV_NAME.value.split(':', 1)
+
+  def network_factory(spec: specs.EnvironmentSpec) -> wpo.WPONetworks:
+    return wpo.make_control_networks(
+        spec,
+        policy_layer_sizes=(256, 256, 256),
+        critic_layer_sizes=(256, 256, 256),
+        policy_init_scale=0.5)
+
+  # Configure and construct the agent builder.
+  config = wpo.WPOConfig(
+      policy_loss_config=wpo_types.GaussianPolicyLossConfig(epsilon_mean=0.01),
+      samples_per_insert=64,
+      learning_rate=3e-4,
+      experience_type=wpo_types.FromTransitions(n_step=5),
+      dual_learning_rate=0.0)  # Turn off dual learning.
+  agent_builder = wpo.WPOBuilder(config, sgd_steps_per_learner_step=1)
+
+  return experiments.ExperimentConfig(
+      builder=agent_builder,
+      environment_factory=lambda _: helpers.make_environment(suite, task),
+      network_factory=network_factory,
+      seed=SEED.value,
+      max_num_actor_steps=NUM_STEPS.value)
+
+
+def main(_):
+  config = build_experiment_config()
+  if RUN_DISTRIBUTED.value:
+    program = experiments.make_distributed_experiment(
+        experiment=config, num_actors=4)
+    lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
+  else:
+    experiments.run_experiment(
+        experiment=config,
+        eval_every=EVAL_EVERY.value,
+        num_eval_episodes=EVAL_EPISODES.value)
+
+
+if __name__ == '__main__':
+  app.run(main)