Commit 6ea3f9b

chosenone authored and committed
feature(yzj): polish ctde2-(8,3,5)
1 parent c323a44 commit 6ea3f9b

17 files changed: +941 -332 lines

lzero/mcts/buffer/game_buffer_efficientzero.py

Lines changed: 18 additions & 15 deletions
@@ -102,15 +102,15 @@ def _prepare_reward_value_context(
             - reward_value_context (:obj:`list`): value_obs_list, value_mask, pos_in_game_segment_list, rewards_list, game_segment_lens,
               td_steps_list, action_mask_segment, to_play_segment
         """
-        # zero_obs = game_segment_list[0].zero_obs()
-        # zero_obs = np.array([{'agent_state': np.zeros((3, 18), dtype=np.float32),
-        #                       'global_state': np.zeros((84,), dtype=np.float32),
-        #                       'agent_alone_state': np.zeros((3, 14), dtype=np.float32),
-        #                       'agent_alone_padding_state': np.zeros((3, 18), dtype=np.float32),}])
-        zero_obs = np.array([{'agent_state': np.zeros((1, 6), dtype=np.float32),
+        zero_obs = game_segment_list[0].zero_obs()
+        # zero_obs = np.array([{'agent_state': np.zeros((18,), dtype=np.float32),
+        #                       'global_state': np.zeros((48,), dtype=np.float32),
+        #                       'agent_alone_state': np.zeros((14,), dtype=np.float32),
+        #                       'agent_alone_padding_state': np.zeros((18,), dtype=np.float32),}])
+        zero_obs = np.array([{'agent_state': np.zeros((6,), dtype=np.float32),
                               'global_state': np.zeros((14, ), dtype=np.float32),
-                              'agent_alone_state': np.zeros((1, 12), dtype=np.float32),
-                              'agent_alone_padding_state': np.zeros((1, 12), dtype=np.float32),}])
+                              'agent_alone_state': np.zeros((12,), dtype=np.float32),
+                              'agent_alone_padding_state': np.zeros((12,), dtype=np.float32),}])
         value_obs_list = []
         # the value is valid or not (out of trajectory)
         value_mask = []
@@ -221,13 +221,16 @@ def _compute_target_reward_value(self, reward_value_context: List[Any], model: A
             # EfficientZero related core code
             # ==============================================================
             # if not in training, obtain the scalars of the value/reward
-            [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
-                [
-                    m_output.latent_state,
-                    inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
-                    m_output.policy_logits
-                ]
-            )
+            # [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
+            #     [
+            #         m_output.latent_state,
+            #         inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+            #         m_output.policy_logits
+            #     ]
+            # )
+            m_output.latent_state = (to_detach_cpu_numpy(m_output.latent_state[0]), to_detach_cpu_numpy(m_output.latent_state[1]))
+            m_output.value = to_detach_cpu_numpy(inverse_scalar_transform(m_output.value, self._cfg.model.support_scale))
+            m_output.policy_logits = to_detach_cpu_numpy(m_output.policy_logits)
             m_output.reward_hidden_state = (
                 m_output.reward_hidden_state[0].detach().cpu().numpy(),
                 m_output.reward_hidden_state[1].detach().cpu().numpy()
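
Note on the change above: because the CTDE model now returns the latent state as an (agent, global) pair, the reanalyze path no longer pushes `m_output.latent_state` through a single `to_detach_cpu_numpy([...])` call and instead detaches each member separately. A minimal, self-contained sketch of that element-wise handling (the `detach_to_numpy` / `detach_latent_state` helpers below are illustrative stand-ins, not LightZero API):

```python
from typing import Tuple, Union

import numpy as np
import torch


def detach_to_numpy(t: torch.Tensor) -> np.ndarray:
    # Stand-in for LightZero's ``to_detach_cpu_numpy``: detach, move to CPU, convert to numpy.
    return t.detach().cpu().numpy()


def detach_latent_state(
    latent_state: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    # Single-representation models carry one tensor; the CTDE variant carries an
    # (agent_latent_state, global_latent_state) pair, which must be detached element-wise.
    if isinstance(latent_state, tuple):
        return tuple(detach_to_numpy(x) for x in latent_state)
    return detach_to_numpy(latent_state)


if __name__ == "__main__":
    agent_latent = torch.randn(4, 6)    # (batch, agent latent dim)
    global_latent = torch.randn(4, 14)  # (batch, global latent dim)
    out = detach_latent_state((agent_latent, global_latent))
    print(type(out[0]), out[0].shape, out[1].shape)
```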

lzero/mcts/buffer/game_buffer_muzero.py

Lines changed: 17 additions & 15 deletions
@@ -201,14 +201,14 @@ def _prepare_reward_value_context(
               td_steps_list, action_mask_segment, to_play_segment
         """
         zero_obs = game_segment_list[0].zero_obs()
-        # zero_obs = np.array([{'agent_state': np.zeros((3, 18), dtype=np.float32),
-        #                       'global_state': np.zeros((84,), dtype=np.float32),
-        #                       'agent_alone_state': np.zeros((3, 14), dtype=np.float32),
-        #                       'agent_alone_padding_state': np.zeros((3, 18), dtype=np.float32),}])
-        zero_obs = np.array([{'agent_state': np.zeros((1, 6), dtype=np.float32),
+        zero_obs = np.array([{'agent_state': np.zeros((18,), dtype=np.float32),
+                              'global_state': np.zeros((48,), dtype=np.float32),
+                              'agent_alone_state': np.zeros((14,), dtype=np.float32),
+                              'agent_alone_padding_state': np.zeros((18,), dtype=np.float32),}])
+        zero_obs = np.array([{'agent_state': np.zeros((6,), dtype=np.float32),
                               'global_state': np.zeros((14, ), dtype=np.float32),
-                              'agent_alone_state': np.zeros((1, 12), dtype=np.float32),
-                              'agent_alone_padding_state': np.zeros((1, 12), dtype=np.float32),}])
+                              'agent_alone_state': np.zeros((12,), dtype=np.float32),
+                              'agent_alone_padding_state': np.zeros((12,), dtype=np.float32),}])
         value_obs_list = []
         # the value is valid or not (out of game_segment)
         value_mask = []
@@ -400,14 +400,16 @@ def _compute_target_reward_value(self, reward_value_context: List[Any], model: A
 
         if not model.training:
             # if not in training, obtain the scalars of the value/reward
-            [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
-                [
-                    m_output.latent_state,
-                    inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
-                    m_output.policy_logits
-                ]
-            )
-
+            # [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
+            #     [
+            #         m_output.latent_state,
+            #         inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+            #         m_output.policy_logits
+            #     ]
+            # )
+            m_output.latent_state = (to_detach_cpu_numpy(m_output.latent_state[0]), to_detach_cpu_numpy(m_output.latent_state[1]))
+            m_output.value = to_detach_cpu_numpy(inverse_scalar_transform(m_output.value, self._cfg.model.support_scale))
+            m_output.policy_logits = to_detach_cpu_numpy(m_output.policy_logits)
             network_output.append(m_output)
 
             # concat the output slices after model inference
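
The `zero_obs` padding constructed here mirrors the environment's dict observation; the shapes `(6,)`, `(14,)`, `(12,)`, `(12,)` are hard-coded for the (8, 3, 5) setting this commit targets. A small sketch of how such a padding observation could be built from a shape spec instead of literals (the `make_zero_obs` helper and `ctde_obs_shapes` dict are hypothetical, not part of the commit):

```python
from typing import Dict, Tuple

import numpy as np


def make_zero_obs(obs_shapes: Dict[str, Tuple[int, ...]]) -> np.ndarray:
    # Build one padding observation matching the dict-style multi-agent obs,
    # wrapped in a length-1 object array as the game buffer expects.
    return np.array([{k: np.zeros(shape, dtype=np.float32) for k, shape in obs_shapes.items()}])


# Shapes matching the values hard-coded in this commit (illustrative spec, not LightZero API).
ctde_obs_shapes = {
    'agent_state': (6,),
    'global_state': (14,),
    'agent_alone_state': (12,),
    'agent_alone_padding_state': (12,),
}

zero_obs = make_zero_obs(ctde_obs_shapes)
print(zero_obs[0]['agent_state'].shape, zero_obs[0]['global_state'].shape)
```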

lzero/mcts/tree_search/mcts_ctree.py

Lines changed: 32 additions & 14 deletions
@@ -96,7 +96,9 @@ def search(
         pb_c_base, pb_c_init, discount_factor = self._cfg.pb_c_base, self._cfg.pb_c_init, self._cfg.discount_factor
 
         # the data storage of latent states: storing the latent state of all the nodes in one search.
-        latent_state_batch_in_search_path = [latent_state_roots]
+        agent_latent_state_roots, global_latent_state_roots = latent_state_roots
+        agent_latent_state_batch_in_search_path = [agent_latent_state_roots]
+        global_latent_state_batch_in_search_path = [global_latent_state_roots]
         # the data storage of value prefix hidden states in LSTM
         reward_hidden_state_c_batch = [reward_hidden_state_roots[0]]
         reward_hidden_state_h_batch = [reward_hidden_state_roots[1]]
@@ -108,7 +110,8 @@ def search(
         for simulation_index in range(self._cfg.num_simulations):
             # In each simulation, we expanded a new node, so in one search, we have ``num_simulations`` num of nodes at most.
 
-            latent_states = []
+            agent_latent_states = []
+            global_latent_states = []
             hidden_states_c_reward = []
             hidden_states_h_reward = []
 
@@ -132,11 +135,13 @@ def search(
 
             # obtain the latent state for leaf node
             for ix, iy in zip(latent_state_index_in_search_path, latent_state_index_in_batch):
-                latent_states.append(latent_state_batch_in_search_path[ix][iy])
+                agent_latent_states.append(agent_latent_state_batch_in_search_path[ix][iy])
+                global_latent_states.append(global_latent_state_batch_in_search_path[ix][iy])
                 hidden_states_c_reward.append(reward_hidden_state_c_batch[ix][0][iy])
                 hidden_states_h_reward.append(reward_hidden_state_h_batch[ix][0][iy])
 
-            latent_states = torch.from_numpy(np.asarray(latent_states)).to(self._cfg.device).float()
+            agent_latent_states = torch.from_numpy(np.asarray(agent_latent_states)).to(self._cfg.device).float()
+            global_latent_states = torch.from_numpy(np.asarray(global_latent_states)).to(self._cfg.device).float()
             hidden_states_c_reward = torch.from_numpy(np.asarray(hidden_states_c_reward)).to(self._cfg.device
                                                                                              ).unsqueeze(0)
             hidden_states_h_reward = torch.from_numpy(np.asarray(hidden_states_h_reward)).to(self._cfg.device
@@ -151,10 +156,12 @@ def search(
             At the end of the simulation, the statistics along the trajectory are updated.
             """
             network_output = model.recurrent_inference(
-                latent_states, (hidden_states_c_reward, hidden_states_h_reward), last_actions
+                (agent_latent_states, global_latent_states), (hidden_states_c_reward, hidden_states_h_reward), last_actions
             )
+            network_output_agent_latent_state, network_output_global_latent_state = network_output.latent_state
 
-            network_output.latent_state = to_detach_cpu_numpy(network_output.latent_state)
+            network_output_agent_latent_state = to_detach_cpu_numpy(network_output_agent_latent_state)
+            network_output_global_latent_state = to_detach_cpu_numpy(network_output_global_latent_state)
             network_output.policy_logits = to_detach_cpu_numpy(network_output.policy_logits)
             network_output.value = to_detach_cpu_numpy(self.inverse_scalar_transform_handle(network_output.value))
             network_output.value_prefix = to_detach_cpu_numpy(self.inverse_scalar_transform_handle(network_output.value_prefix))
@@ -164,7 +171,8 @@ def search(
                 network_output.reward_hidden_state[1].detach().cpu().numpy()
             )
 
-            latent_state_batch_in_search_path.append(network_output.latent_state)
+            agent_latent_state_batch_in_search_path.append(network_output_agent_latent_state)
+            global_latent_state_batch_in_search_path.append(network_output_global_latent_state)
             # tolist() is to be compatible with cpp datatype.
             value_prefix_batch = network_output.value_prefix.reshape(-1).tolist()
             value_batch = network_output.value.reshape(-1).tolist()
@@ -273,7 +281,9 @@ def search(
         batch_size = roots.num
         pb_c_base, pb_c_init, discount_factor = self._cfg.pb_c_base, self._cfg.pb_c_init, self._cfg.discount_factor
         # the data storage of latent states: storing the latent state of all the nodes in the search.
-        latent_state_batch_in_search_path = [latent_state_roots]
+        agent_latent_state_roots, global_latent_state_roots = latent_state_roots
+        agent_latent_state_batch_in_search_path = [agent_latent_state_roots]
+        global_latent_state_batch_in_search_path = [global_latent_state_roots]
 
         # minimax value storage
         min_max_stats_lst = tree_muzero.MinMaxStatsList(batch_size)
@@ -282,7 +292,8 @@ def search(
         for simulation_index in range(self._cfg.num_simulations):
             # In each simulation, we expanded a new node, so in one search, we have ``num_simulations`` num of nodes at most.
 
-            latent_states = []
+            agent_latent_states = []
+            global_latent_states = []
 
             # prepare a result wrapper to transport results between python and c++ parts
             results = tree_muzero.ResultsWrapper(num=batch_size)
@@ -302,9 +313,11 @@ def search(
 
             # obtain the latent state for leaf node
            for ix, iy in zip(latent_state_index_in_search_path, latent_state_index_in_batch):
-                latent_states.append(latent_state_batch_in_search_path[ix][iy])
+                agent_latent_states.append(agent_latent_state_batch_in_search_path[ix][iy])
+                global_latent_states.append(global_latent_state_batch_in_search_path[ix][iy])
 
-            latent_states = torch.from_numpy(np.asarray(latent_states)).to(self._cfg.device).float()
+            agent_latent_states = torch.from_numpy(np.asarray(agent_latent_states)).to(self._cfg.device).float()
+            global_latent_states = torch.from_numpy(np.asarray(global_latent_states)).to(self._cfg.device).float()
             # .long() is only for discrete action
             last_actions = torch.from_numpy(np.asarray(last_actions)).to(self._cfg.device).long()
             """
@@ -314,14 +327,19 @@ def search(
             MCTS stage 3: Backup
             At the end of the simulation, the statistics along the trajectory are updated.
             """
-            network_output = model.recurrent_inference(latent_states, last_actions)
+            network_output = model.recurrent_inference((agent_latent_states, global_latent_states), last_actions)
 
-            network_output.latent_state = to_detach_cpu_numpy(network_output.latent_state)
+            network_output_agent_latent_state, network_output_global_latent_state = network_output.latent_state
+
+            # network_output.latent_state = to_detach_cpu_numpy(network_output.latent_state)
+            network_output_agent_latent_state = to_detach_cpu_numpy(network_output_agent_latent_state)
+            network_output_global_latent_state = to_detach_cpu_numpy(network_output_global_latent_state)
             network_output.policy_logits = to_detach_cpu_numpy(network_output.policy_logits)
             network_output.value = to_detach_cpu_numpy(self.inverse_scalar_transform_handle(network_output.value))
             network_output.reward = to_detach_cpu_numpy(self.inverse_scalar_transform_handle(network_output.reward))
 
-            latent_state_batch_in_search_path.append(network_output.latent_state)
+            agent_latent_state_batch_in_search_path.append(network_output_agent_latent_state)
+            global_latent_state_batch_in_search_path.append(network_output_global_latent_state)
             # tolist() is to be compatible with cpp datatype.
             reward_batch = network_output.reward.reshape(-1).tolist()
             value_batch = network_output.value.reshape(-1).tolist()
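
The tree-search change duplicates the latent-state cache along the search path: one list for agent latents, one for global latents, both indexed by the same (simulation step, batch position) pair when gathering leaf nodes. A standalone sketch of that gather step under these assumptions (`gather_leaf_latents` and its argument names are illustrative, not the search code itself):

```python
from typing import List, Tuple

import numpy as np
import torch


def gather_leaf_latents(
    agent_cache: List[np.ndarray],
    global_cache: List[np.ndarray],
    path_indices: List[int],
    batch_indices: List[int],
    device: str = "cpu",
) -> Tuple[torch.Tensor, torch.Tensor]:
    # For each leaf, (ix, iy) addresses the simulation step that produced its parent
    # latent (ix) and the position inside that batch (iy); the agent and global caches
    # are indexed in lockstep so the pair stays aligned.
    agent_latents = [agent_cache[ix][iy] for ix, iy in zip(path_indices, batch_indices)]
    global_latents = [global_cache[ix][iy] for ix, iy in zip(path_indices, batch_indices)]
    agent_batch = torch.from_numpy(np.asarray(agent_latents)).to(device).float()
    global_batch = torch.from_numpy(np.asarray(global_latents)).to(device).float()
    return agent_batch, global_batch


if __name__ == "__main__":
    # Two cached simulation steps, batch of 3 roots, latent dims 6 (agent) and 14 (global).
    agent_cache = [np.random.randn(3, 6).astype(np.float32) for _ in range(2)]
    global_cache = [np.random.randn(3, 14).astype(np.float32) for _ in range(2)]
    a, g = gather_leaf_latents(agent_cache, global_cache, [0, 1, 1], [0, 2, 1])
    print(a.shape, g.shape)  # torch.Size([3, 6]) torch.Size([3, 14])
```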

lzero/model/efficientzero_model_mlp.py

Lines changed: 31 additions & 9 deletions
@@ -128,7 +128,17 @@ def __init__(
                 res_connection_in_dynamics=self.res_connection_in_dynamics,
             )
         else:
-            self.dynamics_network = state_dynamics
+            self.dynamics_network = state_dynamics(
+                action_encoding_dim=self.action_encoding_dim,
+                num_channels=latent_state_dim + self.action_encoding_dim,
+                common_layer_num=2,
+                lstm_hidden_size=lstm_hidden_size,
+                fc_reward_layers=fc_reward_layers,
+                output_support_size=self.reward_support_size,
+                last_linear_layer_init_zero=self.last_linear_layer_init_zero,
+                norm_type=norm_type,
+                res_connection_in_dynamics=self.res_connection_in_dynamics,
+            )
 
         if state_prediction == None:
             self.prediction_network = PredictionNetworkMLP(
@@ -141,7 +151,16 @@ def __init__(
                 norm_type=norm_type
             )
         else:
-            self.prediction_network = state_prediction
+            self.prediction_network = state_prediction(
+                action_space_size=action_space_size,
+                num_channels=latent_state_dim,
+                fc_value_layers=fc_value_layers,
+                fc_policy_layers=fc_policy_layers,
+                output_support_size=self.value_support_size,
+                last_linear_layer_init_zero=self.last_linear_layer_init_zero,
+                norm_type=norm_type
+
+            )
 
         if self.self_supervised_learning_loss:
             # self_supervised_learning_loss related network proposed in EfficientZero
@@ -186,7 +205,7 @@ def initial_inference(self, obs: torch.Tensor) -> EZNetworkOutput:
         """
         batch_size = get_shape0(obs)
         latent_state = self._representation(obs)
-        device = latent_state.device
+        device = latent_state[0].device
         policy_logits, value = self._prediction(latent_state)
         # zero initialization for reward hidden states
         # (hn, cn), each element shape is (layer_num=1, batch_size, lstm_hidden_size)
@@ -307,19 +326,22 @@ def _dynamics(self, latent_state: torch.Tensor, reward_hidden_state: Tuple,
         # e.g., torch.Size([8]) -> torch.Size([8, 1])
         action_encoding = action_encoding.unsqueeze(-1)
 
-        action_encoding = action_encoding.to(latent_state.device).float()
+        agent_latent_state, global_latent_state = latent_state
+        action_encoding = action_encoding.to(agent_latent_state.device).float()
         # state_action_encoding shape: (batch_size, latent_state[1] + action_dim]) or
         # (batch_size, latent_state[1] + action_space_size]) depending on the discrete_action_encoding_type.
-        state_action_encoding = torch.cat((latent_state, action_encoding), dim=1)
+        agent_state_action_encoding = torch.cat((agent_latent_state, action_encoding), dim=1)
+        global_state_action_encoding = torch.cat((agent_latent_state, global_latent_state, action_encoding), dim=1)
 
         # NOTE: the key difference with MuZero
-        next_latent_state, next_reward_hidden_state, value_prefix = self.dynamics_network(
-            state_action_encoding, reward_hidden_state
+        (next_agent_latent_state, next_global_latent_state), next_reward_hidden_state, value_prefix = self.dynamics_network(
+            (agent_state_action_encoding, global_state_action_encoding), reward_hidden_state
         )
 
         if self.state_norm:
-            next_latent_state = renormalize(next_latent_state)
-        return next_latent_state, next_reward_hidden_state, value_prefix
+            next_agent_latent_state = renormalize(next_agent_latent_state)
+            next_global_latent_state = renormalize(next_global_latent_state)
+        return (next_agent_latent_state, next_global_latent_state), next_reward_hidden_state, value_prefix
 
     def project(self, latent_state: torch.Tensor, with_grad=True):
         """
