
Commit a0cebd6

take a step towards the proposed follow-up research in the paper, incorporating the world model embed when choosing actions during real env rollouts
1 parent 14c04ca commit a0cebd6

File tree

4 files changed: +105 -9 lines changed

improving_transformers_world_model/agent.py

Lines changed: 66 additions & 1 deletion
@@ -112,6 +112,30 @@ def calc_target_and_gae(
 
     return returns, gae
 
+# FiLM for conditioning policy network on world model embed - suggested for follow up research in the paper
+
+class FiLM(Module):
+    def __init__(
+        self,
+        dim,
+        dim_out
+    ):
+        super().__init__()
+        self.to_gamma = nn.Linear(dim, dim_out, bias = False)
+        self.to_beta = nn.Linear(dim, dim_out, bias = False)
+
+        nn.init.zeros_(self.to_gamma.weight)
+        nn.init.zeros_(self.to_beta.weight)
+
+    def forward(
+        self,
+        x: Float['... d'],
+        cond: Float['... d']
+    ):
+        gamma, beta = self.to_gamma(cond), self.to_beta(cond)
+
+        return x * (gamma + 1.) + beta
+
 # symbol extractor
 # detailed in section C.3

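A minimal standalone sketch of the FiLM layer added above, using plain nn.Module and toy dimensions rather than the repo's Module / Float annotations. Since to_gamma and to_beta are zero-initialized, the layer starts as an identity over x, so conditioning on the world model embed is learned gradually without perturbing the policy at initialization.

    # minimal FiLM sketch - mirrors the class added in this commit
    import torch
    from torch import nn

    class FiLM(nn.Module):
        def __init__(self, dim, dim_out):
            super().__init__()
            self.to_gamma = nn.Linear(dim, dim_out, bias = False)
            self.to_beta = nn.Linear(dim, dim_out, bias = False)

            # zero init -> gamma = 0 and beta = 0, so x * (0 + 1) + 0 == x at the start
            nn.init.zeros_(self.to_gamma.weight)
            nn.init.zeros_(self.to_beta.weight)

        def forward(self, x, cond):
            gamma, beta = self.to_gamma(cond), self.to_beta(cond)
            return x * (gamma + 1.) + beta

    film = FiLM(16, 32)            # condition dim 16 -> feature dim 32
    x = torch.randn(2, 32)         # policy features
    cond = torch.randn(2, 16)      # world model embed

    assert torch.allclose(film(x, cond), x)   # identity at initialization
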
@@ -267,6 +291,7 @@ def __init__(
         dim,
         *,
         num_actions,
+        dim_world_model_embed = None,
         num_layers = 3,
         expansion_factor = 2.,
     ):
@@ -292,9 +317,17 @@ def __init__(
 
         self.to_actions_pred = nn.Linear(dim, num_actions)
 
+        # able to condition on world model embed when predicting action - using classic film
+
+        self.can_cond_on_world_model = exists(dim_world_model_embed)
+
+        if self.can_cond_on_world_model:
+            self.world_model_film = FiLM(dim_world_model_embed, dim)
+
     def forward(
         self,
         state: Float['b c h w'],
+        world_model_embed: Float['b d'] | None = None,
         sample_action = False
     ) -> (
         Float['b'] |
@@ -303,6 +336,11 @@ def forward(
 
         embed = self.proj_in(state)
 
+        if exists(world_model_embed):
+            assert exists(self.world_model_film), f'`dim_world_model_embed` must be set on `Actor` to utilize world model for prediction'
+
+            embed = self.world_model_film(embed, world_model_embed)
+
         for layer in self.layers:
             embed = layer(embed) + embed

@@ -636,6 +674,7 @@ def interact_with_env(
         self,
         env,
         memories: Memories | None = None,
+        world_model: WorldModel | None = None,
         max_steps = float('inf')
 
     ) -> MemoriesWithNextState:
@@ -662,13 +701,39 @@
         last_done = dones[0, -1]
         time_step = states.shape[2] + 1
 
+        # maybe conditioning actor with learned world model embed
+
+        if exists(world_model):
+            world_model_cache = None
+
         while time_step < max_steps and not last_done:
 
+            world_model_embed = None
+
+            if exists(world_model):
+                with torch.no_grad():
+                    world_model.eval()
+
+                    world_model_embed, world_model_cache = world_model(
+                        state_or_token_ids = states[:, :, -1:],
+                        actions = actions[:, -1:],
+                        rewards = rewards[:, -1:],
+                        cache = world_model_cache,
+                        remove_cache_len_from_time = False,
+                        return_embed = True,
+                        return_cache = True,
+                        return_loss = False
+                    )
+
+                world_model_embed = rearrange(world_model_embed, '1 1 d -> 1 d')
+
+            # impala + actor - todo: cleanup the noisy tensor (un)squeezing ops
+
             next_state = rearrange(next_state, 'c h w -> 1 c h w')
 
             actor_critic_input, rnn_hidden = self.impala(next_state)
 
-            action, action_log_prob = self.actor(actor_critic_input, sample_action = True)
+            action, action_log_prob = self.actor(actor_critic_input, world_model_embed = world_model_embed, sample_action = True)
 
             next_state, next_reward, next_done = to_device_decorator(env)(action)

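To make the new rollout flow concrete, the toy below sketches the same pattern in isolation. TinyWorldModel and TinyActor are hypothetical stand-ins (a GRU hidden state plays the role of the transformer cache), not the repo's WorldModel / Actor, and the FiLM sketch from earlier is assumed to be in scope. Each real environment step runs the frozen world model under torch.no_grad() on only the latest step, carries its cache forward, and conditions action selection on the resulting embed via FiLM.

    # hypothetical stand-ins illustrating rollout-time conditioning on a world model embed
    import torch
    from torch import nn

    class TinyWorldModel(nn.Module):
        def __init__(self, dim_in, dim_embed):
            super().__init__()
            self.rnn = nn.GRU(dim_in, dim_embed, batch_first = True)

        def forward(self, step, cache = None):
            # step: (batch, 1, dim_in) - only the latest timestep, as in interact_with_env above
            embed, next_cache = self.rnn(step, cache)
            return embed[:, -1], next_cache

    class TinyActor(nn.Module):
        def __init__(self, dim_state, dim_embed, num_actions):
            super().__init__()
            self.proj_in = nn.Linear(dim_state, 64)
            self.film = FiLM(dim_embed, 64)          # FiLM from the sketch earlier
            self.to_logits = nn.Linear(64, num_actions)

        def forward(self, state, world_model_embed = None):
            hidden = self.proj_in(state)
            if world_model_embed is not None:
                hidden = self.film(hidden, world_model_embed)
            return self.to_logits(hidden)

    world_model = TinyWorldModel(dim_in = 8, dim_embed = 16).eval()
    actor = TinyActor(dim_state = 8, dim_embed = 16, num_actions = 5)

    state, cache = torch.randn(1, 8), None

    for _ in range(5):
        with torch.no_grad():                        # world model stays frozen during real rollouts
            embed, cache = world_model(state.unsqueeze(1), cache)

        logits = actor(state, world_model_embed = embed)
        action = torch.distributions.Categorical(logits = logits).sample()

        state = torch.randn(1, 8)                    # stand-in for env.step(action)
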
improving_transformers_world_model/world_model.py

Lines changed: 21 additions & 4 deletions
@@ -770,6 +770,7 @@ def forward(
         detach_cache = False,
         return_loss = True,
         return_loss_breakdown = False,
+        return_embed = False,
         freeze_tokenizer = True
     ):
         batch = state_or_token_ids.shape[0]
@@ -784,12 +785,18 @@ def forward(
             token_ids = state_or_token_ids
 
         if return_loss:
+            assert token_ids.shape[1] > 1
+
             token_ids, state_labels = token_ids[:, :-1], token_ids[:, 1:]
 
-            is_terminal_labels = is_terminal[:, 1:]
+            if exists(is_terminal):
+                is_terminal_labels = is_terminal[:, 1:]
 
-        actions, last_action = actions[:, :-1], actions[:, -1:]
-        rewards, last_reward = rewards[:, :-1], rewards[:, -1:]
+        if exists(actions):
+            actions, last_action = actions[:, :-1], actions[:, -1:]
+
+        if exists(rewards):
+            rewards, last_reward = rewards[:, :-1], rewards[:, -1:]
 
         # either use own learned token embeddings
         # or project the codes (which are just the nearest neighbor memorized patch) and project
@@ -819,7 +826,9 @@ def forward(
             actions = actions.masked_fill(no_actions, 0)
             action_embeds = self.action_embed(actions)
 
-            action_embeds = einx.where('b t n, b t n d, -> b t n d', ~no_actions, action_embeds, 0.)
+            if not is_empty(action_embeds):
+                action_embeds = einx.where('b t n, b t n d, -> b t n d', ~no_actions, action_embeds, 0.)
+
             action_embeds = reduce(action_embeds, 'b t n d -> b t d', 'sum')
 
             action_embed_sos = repeat(self.action_embed_sos, 'd -> b 1 d', b = batch)
@@ -863,6 +872,14 @@ def inverse_time(t):
 
         embeds_with_time = reduce(embeds_with_space, 'b ... d -> b 1 d', 'mean')
 
+        # maybe return embed
+
+        if return_embed:
+            if not return_cache:
+                return embeds_with_time
+
+            return embeds_with_time, next_cache
+
         # reward and terminal
 
         reward_logits = None

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "improving-transformers-world-model"
-version = "0.0.57"
+version = "0.0.58"
 description = "Improving Transformers World Model for RL"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }

tests/test_agent.py

Lines changed: 17 additions & 3 deletions
@@ -12,8 +12,10 @@
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 @pytest.mark.parametrize('critic_use_regression', (False, True))
+@pytest.mark.parametrize('actor_use_world_model_embed', (False, True))
 def test_agent(
-    critic_use_regression
+    critic_use_regression,
+    actor_use_world_model_embed
 ):
 
     # world model
@@ -53,6 +55,7 @@ def test_agent(
         actor = dict(
             dim = 32,
             num_actions = 5,
+            dim_world_model_embed = 32 if actor_use_world_model_embed else None
         ),
         critic = dict(
             dim = 64,
@@ -62,9 +65,17 @@ def test_agent(
 
     env = Env((3, 63, 63))
 
-    dream_memories = agent(world_model, state[0, :, 0], max_steps = 5)
+    dream_memories = agent(
+        world_model,
+        state[0, :, 0],
+        max_steps = 5
+    )
 
-    real_memories = agent.interact_with_env(env, max_steps = 5)
+    real_memories = agent.interact_with_env(
+        env,
+        world_model = world_model if actor_use_world_model_embed else None,
+        max_steps = 5
+    )
 
     agent.learn([dream_memories, real_memories])

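The new parametrization exercises both the unconditioned and the world-model-conditioned rollout paths. The updated test can be run directly by node id, e.g.:

    pytest tests/test_agent.py::test_agent
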
@@ -93,6 +104,9 @@ def world_model_burn_in():
     actions = torch.randint(0, 5, (2, 20, 1))
     is_terminal = torch.randint(0, 2, (2, 20)).bool()
 
+    loss = world_model(state, actions = actions, rewards = rewards, is_terminal = is_terminal)
+    loss.backward()
+
     _, burn_in_cache = world_model(
         state,
         actions = actions,
