Definition of foundational fusion mechanisms: DynamicFusionModule and ModalityGating

The aforementioned fusion blocks apply dynamic attention, weighted combinations and/or gating mechanisms for feature learning
+
+ In summary, this enables dynamic feature learning through attention-based weighting and modality-specific gating
"""

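As context for the changes below, the attention-based weighting boils down to a per-position score that is normalised over the sequence dimension. A minimal sketch (with illustrative tensor shapes, not taken from the diff) of the normalisation used in compute_modality_weights further down:

```python
import torch

# Illustrative shapes: [batch, seq, 1] scores, e.g. as produced by weight_network
scores = torch.rand(2, 5, 1)

# Rescale so the scores sum to 1 over the sequence dimension (epsilon avoids division by zero),
# mirroring: weights / (weights.sum(dim=1, keepdim=True) + 1e-9)
weights = scores / (scores.sum(dim=1, keepdim=True) + 1e-9)

assert torch.allclose(weights.sum(dim=1), torch.ones(2, 1), atol=1e-4)
```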
@@ -31,6 +33,11 @@ def forward(


class DynamicFusionModule(AbstractFusionBlock):
+
+    """Dynamic multimodal fusion through cross-attention and weighted combination."""
+
+    # feature_dims maps each modality to its input dimension; hidden_dim is the
+    # common embedding dimension and num_heads the number of attention heads.
    def __init__(
        self,
        feature_dims: Dict[str, int],
@@ -40,12 +47,10 @@ def __init__(
        fusion_method: str = "weighted_sum",
        use_residual: bool = True
    ):
-        nn.Module.__init__(self)
+        super().__init__()

-        if hidden_dim <= 0:
-            raise ValueError("hidden_dim must be positive")
-        if num_heads <= 0:
-            raise ValueError("num_heads must be positive")
+        if hidden_dim <= 0 or num_heads <= 0:
+            raise ValueError("hidden_dim and num_heads must be positive")

        self.feature_dims = feature_dims
        self.hidden_dim = hidden_dim
@@ -55,7 +60,7 @@ def __init__(
        if fusion_method not in ["weighted_sum", "concat"]:
            raise ValueError(f"Invalid fusion method: {fusion_method}")

-        # Projections
+        # Modality-specific projections
        self.projections = nn.ModuleDict({
            name: nn.Sequential(
                nn.Linear(dim, hidden_dim),
@@ -67,14 +72,14 @@ def __init__(
            if dim > 0
        })

-        # Attention
+        # Cross-modal attention
        self.cross_attention = MultiheadAttention(
            embed_dim=hidden_dim,
            num_heads=num_heads,
            dropout=dropout
        )

-        # Weight network
+        # Weight computation network
        self.weight_network = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
@@ -96,8 +101,10 @@ def __init__(
        self.layer_norm = nn.LayerNorm(hidden_dim)

    def _validate_features(self, features: Dict[str, torch.Tensor]) -> None:
+        """Validates input feature dimensions and sequence lengths."""
+
        if not features:
-            raise ValueError("Empty features dictionary")
+            raise ValueError("Empty features dict")

        seq_length = None
        for name, feat in features.items():
@@ -107,32 +114,26 @@ def _validate_features(self, features: Dict[str, torch.Tensor]) -> None:
            if seq_length is None:
                seq_length = feat.size(1)
            elif feat.size(1) != seq_length:
-                raise ValueError("All modalities must have the same sequence length")
+                raise ValueError("All modalities must have same sequence length")

    def compute_modality_weights(
        self,
        features: torch.Tensor,
        modality_mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
-        """Compute weights for each feature.
-
-        Args:
-            features: [batch_size, seq_len, hidden_dim] tensor
-            modality_mask: Optional attention mask
-
-        Returns:
-            [batch_size, seq_len, 1] tensor of weights
-        """
-        # Compute weights for each feature
-        flat_features = features.reshape(-1, features.size(-1))  # [B*S, H]
-        weights = self.weight_network(flat_features)  # [B*S, 1]
-        weights = weights.reshape(features.size(0), features.size(1), 1)  # [B, S, 1]
+
+        """Compute attention weights for each feature."""
+
+        batch_size, seq_len = features.size(0), features.size(1)
+        flat_features = features.reshape(-1, features.size(-1))
+        weights = self.weight_network(flat_features)
+        weights = weights.reshape(batch_size, seq_len, 1)

        if modality_mask is not None:
-            modality_mask = modality_mask.unsqueeze(-1)  # [B, S, 1]
-            weights = weights.masked_fill(~modality_mask, 0.0)
+            weights = weights.reshape(batch_size, -1, 1)[:, :modality_mask.size(1), :]
+            weights = weights.masked_fill(~modality_mask.unsqueeze(-1), 0.0)

-        # Normalize weights
+        # Normalise the weights
        weights = weights / (weights.sum(dim=1, keepdim=True) + 1e-9)
        return weights

@@ -141,15 +142,9 @@ def forward(
        features: Dict[str, torch.Tensor],
        modality_mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
-        """Forward pass
-
-        Args:
-            features: Dict of [batch_size, seq_len, feature_dim] tensors
-            modality_mask: Optional attention mask
-
-        Returns:
-            [batch_size, hidden_dim] tensor if seq_len=1, else [batch_size, seq_len, hidden_dim]
-        """
+
+        """Forward pass for dynamic fusion."""
+
        self._validate_features(features)

        batch_size = next(iter(features.values())).size(0)
@@ -167,44 +162,64 @@ def forward(
        if not projected_features:
            raise ValueError("No valid features after projection")

-        # Stack and apply attention
-        feature_stack = torch.stack(projected_features, dim=2)  # [B, S, M, H]
+        # Stack features
+        feature_stack = torch.stack(projected_features, dim=1)

-        # Cross attention
-        attended_features = self.cross_attention(
-            feature_stack, feature_stack, feature_stack
-        )  # [B, S, M, H]
-
-        # Average across modalities first
-        attended_avg = attended_features.mean(dim=2)  # [B, S, H]
+        # Apply cross-attention: each modality attends to the others
+        attended_features = []
+        for i in range(feature_stack.size(1)):
+            query = feature_stack[:, i]
+            key_value = feature_stack[:, [j for j in range(feature_stack.size(1)) if j != i]]
+            if key_value.size(1) > 0:
+                attended = self.cross_attention(query, key_value.reshape(-1, seq_len, self.hidden_dim),
+                                                key_value.reshape(-1, seq_len, self.hidden_dim))
+                attended_features.append(attended)
+            else:
+                attended_features.append(query)
+
+        # Average across modalities
+        attended_features = torch.stack(attended_features, dim=1)
+        attended_avg = attended_features.mean(dim=1)

-        # Compute weights on averaged features
-        weights = self.compute_modality_weights(attended_avg, modality_mask)  # [B, S, 1]
+        # Mask the attended features when a modality mask is provided
+        if modality_mask is not None:
+            # Create a binary mask matching the sequence length
+            seq_mask = torch.zeros((batch_size, seq_len), device=attended_avg.device).bool()
+            seq_mask[:, :modality_mask.size(1)] = modality_mask
+
+            # Compute weights on masked features
+            weights = self.compute_modality_weights(attended_avg, seq_mask)
+            weights = weights.unsqueeze(1).expand(-1, attended_features.size(1), -1, 1)
+        else:
+            weights = self.compute_modality_weights(attended_avg)
+            weights = weights.unsqueeze(1).expand(-1, attended_features.size(1), -1, 1)

-        # Apply weights
-        weighted_features = attended_features * weights.unsqueeze(2)  # [B, S, M, H]
+        # Apply the weights
+        weighted_features = attended_features * weights

        if self.fusion_method == "weighted_sum":
-            # Sum across modalities
-            fused = weighted_features.sum(dim=2)  # [B, S, H]
+            fused = weighted_features.sum(dim=1)
        else:
-            # Concatenate modalities
-            concat = weighted_features.reshape(batch_size, seq_len, -1)  # [B, S, M*H]
-            fused = self.output_projection(concat)  # [B, S, H]
+            concat = weighted_features.reshape(batch_size, seq_len, -1)
+            fused = self.output_projection(concat)

-        # Apply residual if needed
+        # Apply the residual connection
        if self.use_residual:
-            residual = feature_stack.mean(dim=2)  # [B, S, H]
+            residual = feature_stack.mean(dim=1)
            fused = self.layer_norm(fused + residual)

-        # Remove sequence dimension if length is 1
-        if seq_len == 1:
-            fused = fused.squeeze(1)
+        # Collapse the sequence dimension for the output (mean pooling)
+        fused = fused.mean(dim=1)

        return fused


+
+
class ModalityGating(AbstractFusionBlock):
+    """Modality-specific gating mechanism."""
+
+    # feature_dims maps each modality to its input dimension; hidden_dim sizes the gate networks
    def __init__(
        self,
        feature_dims: Dict[str, int],
@@ -219,7 +234,7 @@ def __init__(
        self.feature_dims = feature_dims
        self.hidden_dim = hidden_dim

-        # Create gate networks for each modality
+        # Define gate networks for each modality
        self.gate_networks = nn.ModuleDict({
            name: nn.Sequential(
                nn.Linear(dim, hidden_dim),
@@ -233,9 +248,10 @@ def __init__(
        })

    def _validate_features(self, features: Dict[str, torch.Tensor]) -> None:
+        """Validation helper for input feature dict."""

        if not features:
-            raise ValueError("Empty features dictionary")
+            raise ValueError("Empty features dict")
        for name, feat in features.items():
            if feat is None:
                raise ValueError(f"None tensor for modality: {name}")
@@ -246,25 +262,21 @@ def forward(
        features: Dict[str, torch.Tensor]
    ) -> Dict[str, torch.Tensor]:

-        self._validate_features(features)
+        """Apply modality-specific gating."""

+        self._validate_features(features)
        gated_features = {}

        for name, feat in features.items():
            if feat is not None and name in self.gate_networks:
-                # Handle 3D tensors (batch_size, sequence_length, feature_dim)
                batch_size, seq_len, feat_dim = feat.shape
-
-                # Reshape to (batch_size * seq_len, feature_dim)
-                flat_feat = feat.reshape(-1, feat_dim)
-
-                # Compute gates
-                gate = self.gate_networks[name](flat_feat)
-
-                # Reshape gates back to match input
+
+                # Compute a gate value for each position and reshape to match the input
+                flat_feat = feat.reshape(-1, feat_dim)
+                gate = self.gate_networks[name](flat_feat)
                gate = gate.reshape(batch_size, seq_len, 1)

-                # Apply gating
+                # Apply the gate
                gated_features[name] = feat * gate

        return gated_features
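For reference, a minimal usage sketch of DynamicFusionModule as it stands after this change. The import location, modality names, and dimensions are illustrative assumptions, not part of the diff; after the change the output is mean-pooled over the sequence dimension.

```python
import torch
# from <package>.fusion import DynamicFusionModule  # assumed import location

fusion = DynamicFusionModule(
    feature_dims={"audio": 128, "video": 256},  # assumed modality names and sizes
    hidden_dim=64,
    num_heads=4,          # should divide hidden_dim evenly
    dropout=0.1,
    fusion_method="weighted_sum",
    use_residual=True,
)

batch_size, seq_len = 8, 10
features = {
    "audio": torch.randn(batch_size, seq_len, 128),
    "video": torch.randn(batch_size, seq_len, 256),
}

fused = fusion(features)  # expected shape: [batch_size, hidden_dim]
```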
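Similarly, a hedged sketch of ModalityGating, which returns a dict of gated tensors with the same shapes as its inputs; any constructor arguments beyond feature_dims and hidden_dim are left at their defaults here, and the names and sizes are again illustrative.

```python
import torch
# from <package>.fusion import ModalityGating  # assumed import location

gating = ModalityGating(
    feature_dims={"audio": 128, "video": 256},  # assumed modality names and sizes
    hidden_dim=64,
)

features = {
    "audio": torch.randn(8, 10, 128),
    "video": torch.randn(8, 10, 256),
}

gated = gating(features)
assert gated["audio"].shape == features["audio"].shape  # gating preserves input shapes
```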